sanscript 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,94 @@
1
+ # frozen_string_literal: true
2
+ require "ice_nine"
3
+
4
module Sanscript
  # Refinements that add `deep_dup` / `deep_freeze` (and `String#w_split`)
  # without monkey-patching core classes globally. Activate them with
  # `using Sanscript::Refinements`.
  module Refinements
    refine Object do
      # Generic fallback: a plain `dup`, or the receiver itself when `dup`
      # raises TypeError.
      #
      # @return a copy of the receiver, or the receiver
      def deep_dup
        dup
      rescue TypeError
        self
      end

      # Recursively freeze the receiver by delegating to IceNine.
      #
      # @return whatever IceNine.deep_freeze returns
      def deep_freeze
        IceNine.deep_freeze(self)
      end
    end

    # nil, false, true, symbols and numbers are returned as-is.
    refine NilClass do
      def deep_dup
        self
      end
    end

    refine FalseClass do
      def deep_dup
        self
      end
    end

    refine TrueClass do
      def deep_dup
        self
      end
    end

    refine Symbol do
      def deep_dup
        self
      end
    end

    refine Numeric do
      def deep_dup
        self
      end
    end

    # Necessary to re-override Numeric
    require "bigdecimal"
    refine BigDecimal do
      def deep_dup
        dup
      end
    end

    refine String do
      # Split on every single whitespace character. Consecutive or leading
      # whitespace therefore yields empty strings in the result.
      #
      # @return [Array<String>]
      def w_split
        split(/\s/)
      end
    end

    refine Array do
      # @return [Array] a new array whose elements are deep copies
      def deep_dup
        map { |value| value.deep_dup } # rubocop:disable Style/SymbolProc
      end
    end

    refine Hash do
      # Copy the hash, deep-copying every value and every non-String key.
      # String keys are kept as they are.
      #
      # @return [Hash]
      def deep_dup
        copy = dup
        each_pair do |key, value|
          if key.is_a?(::String)
            copy[key] = value.deep_dup
          else
            # Replace the original key object with its own deep copy.
            copy.delete(key)
            copy[key.deep_dup] = value.deep_dup
          end
        end
        copy
      end
    end

    # Set is not guaranteed to be loaded on every Ruby version.
    require "set"
    refine Set do
      # Build a new set of the same class holding deep copies of the
      # members. String members are reused as-is.
      #
      # @return [Set]
      def deep_dup
        members = map do |val|
          val.is_a?(::String) ? val : val.deep_dup
        end
        # `self.class[members]` would create a set whose single member is
        # the array itself; `new` takes the array as the list of members.
        self.class.new(members)
      end
    end
  end
end
@@ -0,0 +1,343 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "sanscript/refinements"
4
+ require "sanscript/transliterate/schemes"
5
+ #
6
+ # Sanscript
7
+ #
8
+ # Sanscript is a Sanskrit transliteration library. Currently, it supports
9
+ # other Indian languages only incidentally.
10
+ #
11
+ # Released under the MIT and GPL Licenses.
12
+ #
13
+ module Sanscript
14
+ using Refinements
15
+ module Transliterate
16
# Read-only singleton accessors for the module-level registries.
class << self
  attr_reader :defaults, :schemes, :roman_schemes, :all_alternates
end

# Option values used when a caller does not supply them.
@defaults = { skip_sgml: false, syncope: false }

# Memoized maps produced by make_map.
@cache = {}
26
+
27
+ module_function
28
+
29
+ #
30
+ # Return a list of available schemes.
31
+ #
32
+ # @return array of scheme identifiers
33
+ #
34
+ def scheme_names
35
+ @schemes.keys.sort!
36
+ end
37
+
38
+ #
39
+ # Check whether the given scheme encodes romanized Sanskrit.
40
+ #
41
+ # @param name the scheme name
42
+ # @return boolean
43
+ #
44
+ def roman_scheme?(name)
45
+ @roman_schemes.include?(name.to_sym)
46
+ end
47
+
48
+ #
49
+ # Add a Brahmic scheme to Sanscript.
50
+ #
51
+ # Schemes are of two types: "Brahmic" and "roman". Brahmic consonants
52
+ # have an inherent vowel sound, but roman consonants do not. This is the
53
+ # main difference between these two types of scheme.
54
+ #
55
+ # A scheme definition is a Hash that maps a group name to a
56
+ # list of characters. For illustration, see the "devanagari" scheme in
57
+ # "sanscript/transliterate/schemes".
58
+ #
59
+ # You can use whatever group names you like, but for the best results,
60
+ # you should use the same group names that Sanscript does.
61
+ #
62
+ # @param name the scheme name
63
+ # @param scheme the scheme data itself. This should be constructed as
64
+ # described above.
65
+ #
66
+ def add_brahmic_scheme(name, scheme)
67
+ @schemes[name.to_sym] = scheme.deep_dup.deep_freeze
68
+ end
69
+
70
+ #
71
+ # Add a roman scheme to Sanscript.
72
+ #
73
+ # See the comments on Sanscript.add_brahmic_scheme. The "vowel_marks" field
74
+ # can be omitted.
75
+ #
76
+ # @param name the scheme name
77
+ # @param scheme the scheme data itself
78
+ #
79
+ def add_roman_scheme(name, scheme)
80
+ name = name.to_sym
81
+ scheme = scheme.deep_dup
82
+ scheme[:vowel_marks] = scheme[:vowels][1..-1] unless scheme.key?(:vowel_marks)
83
+ @schemes[name] = scheme.deep_freeze
84
+ @roman_schemes.add(name)
85
+ end
86
+
87
+ #
88
+ # Create a deep copy of an object, for certain kinds of objects.
89
+ #
90
+ # @param scheme the scheme to copy
91
+ # @return the copy
92
+ #
93
+
94
+ # Set up various schemes
95
begin
  # Kolkata starts as a copy of IAST and gets a vowel list that adds
  # "ē" and "ō".
  kolkata = @schemes[:kolkata] = @schemes[:iast].deep_dup
  kolkata[:vowels] = %w[a ā i ī u ū ṛ ṝ ḷ ḹ e ē ai o ō au]

  # These schemes are already present in @schemes. Adding them again
  # through add_roman_scheme records each name in @roman_schemes and
  # gives each scheme a :vowel_marks group.
  %i[iast itrans hk kolkata slp1 velthuis wx].each do |name|
    add_roman_scheme(name, @schemes[name])
  end

  # ITRANS variant, which supports Dravidian short 'e' and 'o'.
  # The vocalic vowels use the same spellings as the base ITRANS scheme
  # ("RRi RRI LLi LLI"), so every entry is distinct and the shared
  # alternates table, which is keyed by those spellings, still applies.
  itrans_dravidian = @schemes[:itrans].deep_dup
  itrans_dravidian[:vowels] = %w[a A i I u U RRi RRI LLi LLI e E ai o O au]
  itrans_dravidian[:vowel_marks] = itrans_dravidian[:vowels][1..-1]
  @all_alternates[:itrans_dravidian] = @all_alternates[:itrans]
  add_roman_scheme(:itrans_dravidian, itrans_dravidian)

  # Ensure every scheme and every alternates table is deep-frozen.
  @schemes.each_value { |scheme| scheme.deep_freeze }
  @all_alternates.each_value { |alternates| alternates.deep_freeze }
end
119
+
120
+ #
121
+ # Transliterate from one script to another.
122
+ #
123
+ # @param data the string to transliterate
124
+ # @param from the source script
125
+ # @param to the destination script
126
+ # @param options transliteration options
127
+ # @return the finished string
128
+ #
129
+ def transliterate(data, from, to, options = {})
130
+ from = from.to_sym
131
+ to = to.to_sym
132
+ raise "Scheme not known ':#{from}'" unless @schemes.key?(from)
133
+ raise "Scheme not known ':#{to}'" unless @schemes.key?(to)
134
+
135
+ data = data.to_str.dup
136
+ options = @defaults.merge(options)
137
+ map = make_map(from, to)
138
+
139
+ data.gsub!(/(<.*?>)/, "##\\1##") if options[:skip_sgml]
140
+
141
+ # Easy way out for "{\m+}", "\", and ".h".
142
+ if from == :itrans
143
+ data.gsub!(/\{\\m\+\}/, ".h.N")
144
+ data.gsub!(/\.h/, "")
145
+ data.gsub!(/\\([^'`_]|$)/, "##\\1##")
146
+ end
147
+
148
+ if map[:from_roman?]
149
+ transliterate_roman(data, map, options)
150
+ else
151
+ transliterate_brahmic(data, map)
152
+ end
153
+ end
154
+
155
+ class << self
156
+ private
157
+
158
+ #
159
+ # Create a map from every character in `from` to its partner in `to`.
160
+ # Also, store any "marks" that `from` might have.
161
+ #
162
+ # @param from input scheme
163
+ # @param to output scheme
164
+ #
165
+ def make_map(from, to)
166
+ @cache[:"#{from}_#{to}"] ||= begin
167
+ alternates = @all_alternates[from] || {}
168
+ consonants = {}
169
+ from_scheme = @schemes[from]
170
+ letters = {}
171
+ token_lengths = []
172
+ marks = {}
173
+ to_scheme = @schemes[to]
174
+
175
+ from_scheme.each do |group, from_group|
176
+ to_group = to_scheme[group]
177
+ next if to_group.nil?
178
+
179
+ from_group.each_with_index do |f, i|
180
+ t = to_group[i]
181
+ alts = alternates[f] || []
182
+ token_lengths.push(f.length)
183
+ token_lengths.concat(alts.map(&:length))
184
+
185
+ if group == :vowel_marks || group == :virama
186
+ marks[f] = t
187
+ alts.each { |alt| marks[alt] = t }
188
+ else
189
+ letters[f] = t
190
+ alts.each { |alt| letters[alt] = t }
191
+
192
+ if group == :consonants || group == :other
193
+ consonants[f] = t
194
+ alts.each { |alt| consonants[alt] = t }
195
+ end
196
+ end
197
+ end
198
+ end
199
+
200
+ {
201
+ consonants: consonants,
202
+ from_roman?: roman_scheme?(from),
203
+ letters: letters,
204
+ marks: marks,
205
+ max_token_length: token_lengths.max,
206
+ to_roman?: roman_scheme?(to),
207
+ virama: to_scheme[:virama].first,
208
+ }.deep_freeze
209
+ end
210
+ end
211
+
212
+ #
213
+ # Transliterate from a romanized script.
214
+ #
215
+ # @param data the string to transliterate
216
+ # @param map map data generated by make_map
217
+ # @param options transliteration options
218
+ # @return the finished string
219
+ #
220
+ def transliterate_roman(data, map, options = {})
221
+ options = @defaults.merge(options)
222
+ data = data.to_str.dup
223
+ buf = []
224
+ token_buffer = String.new
225
+ had_consonant = false
226
+ transliteration_enabled = true
227
+
228
+ until data.empty? && token_buffer.empty?
229
+ token_buffer << data.slice!(0, map[:max_token_length] - token_buffer.length)
230
+
231
+ # Match all token substrings to our map.
232
+ (0...map[:max_token_length]).each do |j|
233
+ token = token_buffer[0, map[:max_token_length] - j]
234
+
235
+ if token == "##"
236
+ transliteration_enabled = !transliteration_enabled
237
+ token_buffer.slice!(0, 2)
238
+ break
239
+ end
240
+ temp_letter = map[:letters][token]
241
+ if !temp_letter.nil? && transliteration_enabled
242
+ if map[:to_roman?]
243
+ buf << temp_letter
244
+ else
245
+ # Handle the implicit vowel. Ignore 'a' and force
246
+ # vowels to appear as marks if we've just seen a
247
+ # consonant.
248
+ if had_consonant
249
+ temp_mark = map[:marks][token]
250
+ if !temp_mark.nil?
251
+ buf << temp_mark
252
+ elsif token != "a"
253
+ buf << map[:virama] << temp_letter
254
+ end
255
+ else
256
+ buf << temp_letter
257
+ end
258
+ had_consonant = map[:consonants].key?(token)
259
+ end
260
+ token_buffer.slice!(0, map[:max_token_length] - j)
261
+ break
262
+ elsif j == map[:max_token_length] - 1
263
+ if had_consonant
264
+ had_consonant = false
265
+ buf << map[:virama] unless options[:syncope]
266
+ end
267
+ buf << token
268
+ token_buffer.slice!(0, 1)
269
+ # 'break' is redundant here, "j == ..." is true only on
270
+ # the last iteration.
271
+ end
272
+ end
273
+ end
274
+ buf << map[:virama] if had_consonant && !options[:syncope]
275
+ buf.join("")
276
+ end
277
+
278
+ #
279
+ # Transliterate from a Brahmic script.
280
+ #
281
+ # @param data the string to transliterate
282
+ # @param map map data generated by make_map
283
+ # @return the finished string
284
+ #
285
+ def transliterate_brahmic(data, map)
286
+ data = data.to_str.dup
287
+ buf = []
288
+ dangling_hash = false
289
+ had_roman_consonant = false
290
+ transliteration_enabled = true
291
+
292
+ until data.empty?
293
+ l = data.slice!(0, 1)
294
+ # Toggle transliteration state
295
+ if l == "#"
296
+ if dangling_hash
297
+ transliteration_enabled = !transliteration_enabled
298
+ dangling_hash = false
299
+ else
300
+ dangling_hash = true
301
+ end
302
+ if had_roman_consonant
303
+ buf << "a"
304
+ had_roman_consonant = false
305
+ end
306
+ next
307
+ elsif !transliteration_enabled
308
+ buf << l
309
+ next
310
+ end
311
+
312
+ temp = map[:marks][l]
313
+ if !temp.nil?
314
+ buf << temp
315
+ had_roman_consonant = false
316
+ else
317
+ if dangling_hash
318
+ buf << "#"
319
+ dangling_hash = false
320
+ end
321
+ if had_roman_consonant
322
+ buf << "a"
323
+ had_roman_consonant = false
324
+ end
325
+
326
+ # Push transliterated letter if possible. Otherwise, push
327
+ # the letter itself.
328
+ temp = map[:letters][l]
329
+ if !temp.nil?
330
+ buf << temp
331
+ had_roman_consonant = map[:to_roman?] && map[:consonants].key?(l)
332
+ else
333
+ buf << l
334
+ end
335
+ end
336
+ end
337
+
338
+ buf << "a" if had_roman_consonant
339
+ buf.join("")
340
+ end
341
+ end
342
+ end
343
+ end
@@ -0,0 +1,312 @@
1
+ # frozen_string_literal: true
2
+ require "sanscript/refinements"
3
+
4
+ module Sanscript
5
+ using Refinements
6
+ module Transliterate
7
+ # Schemes
8
+ # =======
9
+ # Schemes are of two kinds: "Brahmic" and "roman." "Brahmic" schemes
10
+ # describe abugida scripts found in India. "Roman" schemes describe
11
+ # manufactured alphabets that are meant to describe or encode Brahmi
12
+ # scripts. Abugidas and alphabets are processed by separate algorithms
13
+ # because of the unique difficulties involved with each.
14
+ #
15
+ # Brahmic consonants are stated without a virama. Roman consonants are
16
+ # stated without the vowel 'a'.
17
+ #
18
+ # (Since "abugida" is not a well-known term, Sanscript uses "Brahmic"
19
+ # and "roman" for clarity.)
20
+ #
21
+ @schemes = {
22
+
23
+ # Bengali
24
+ # -------
25
+ # 'va' and 'ba' are both rendered as ব.
26
+ #
27
+ bengali: {
28
+ vowels: "অ আ ই ঈ উ ঊ ঋ ৠ ঌ ৡ এ ঐ ও ঔ".w_split,
29
+ vowel_marks: "া ি ী ু ূ ৃ ৄ ৢ ৣ ে ৈ ো ৌ".w_split,
30
+ other_marks: "ং ঃ ঁ".w_split,
31
+ virama: ["্"],
32
+ consonants: "ক খ গ ঘ ঙ চ ছ জ ঝ ঞ ট ঠ ড ঢ ণ ত থ দ ধ ন প ফ ব ভ ম য র ল ব শ ষ স হ ळ ক্ষ জ্ঞ".w_split,
33
+ symbols: "০ ১ ২ ৩ ৪ ৫ ৬ ৭ ৮ ৯ ॐ ঽ । ॥".w_split,
34
+ other: " ড ঢ য ".w_split,
35
+ },
36
+
37
+ # Devanagari
38
+ # ----------
39
+ # The most comprehensive and unambiguous Brahmic script listed.
40
+ #
41
+ devanagari: {
42
+ # "Independent" forms of the vowels. These are used whenever the
43
+ # vowel does not immediately follow a consonant.
44
+ vowels: "अ आ इ ई उ ऊ ऋ ॠ ऌ ॡ ऎ ए ऐ ऒ ओ औ".w_split,
45
+
46
+ # "Dependent" forms of the vowels. These are used whenever the
47
+ # vowel immediately follows a consonant. If a letter is not
48
+ # listed in `vowels`, it should not be listed here.
49
+ vowel_marks: "ा ि ी ु ू ृ ॄ ॢ ॣ ॆ े ै ॊ ो ौ".w_split,
50
+
51
+ # Miscellaneous marks, all of which are used in Sanskrit.
52
+ other_marks: "ं ः ँ".w_split,
53
+
54
+ # In syllabic scripts like Devanagari, consonants have an inherent
55
+ # vowel that must be suppressed explicitly. We do so by putting a
56
+ # virama after the consonant.
57
+ virama: ["्"],
58
+
59
+ # Various Sanskrit consonants and consonant clusters. Every token
60
+ # here has an explicit vowel. Thus "क" is "ka" instead of "k".
61
+ consonants: "क ख ग घ ङ च छ ज झ ञ ट ठ ड ढ ण त थ द ध न प फ ब भ म य र ल व श ष स ह ळ क्ष ज्ञ".w_split,
62
+
63
+ # Numbers and punctuation
64
+ symbols: "० १ २ ३ ४ ५ ६ ७ ८ ९ ॐ ऽ । ॥".w_split,
65
+
66
+ # Zero-width joiner. This is used to separate a consonant cluster
67
+ # and avoid a complex ligature.
68
+ zwj: ["\u200D"],
69
+
70
+ # Dummy consonant. This is used in ITRANS to prevert certain types
71
+ # of parser ambiguity. Thus "barau" -> बरौ but "bara_u" -> बरउ.
72
+ skip: [""],
73
+
74
+ # Vedic accent. Udatta and anudatta.
75
+ accent: %W[\u0951 \u0952],
76
+
77
+ # Accent combined with anusvara and visarga. For compatibility
78
+ # with ITRANS, which allows the reverse of these four.
79
+ combo_accent: "ः॑ ः॒ ं॑ ं॒".w_split,
80
+
81
+ candra: ["ॅ"],
82
+
83
+ # Non-Sanskrit consonants
84
+ other: "क़ ख़ ग़ ज़ ड़ ढ़ फ़ य़ ऱ".w_split,
85
+ },
86
+
87
+ # Gujarati
88
+ # --------
89
+ # Sanskrit-complete.
90
+ #
91
+ gujarati: {
92
+ vowels: "અ આ ઇ ઈ ઉ ઊ ઋ ૠ ઌ ૡ એ ઐ ઓ ઔ".w_split,
93
+ vowel_marks: "ા િ ી ુ ૂ ૃ ૄ ૢ ૣ ે ૈ ો ૌ".w_split,
94
+ other_marks: "ં ઃ ઁ".w_split,
95
+ virama: ["્"],
96
+ consonants: "ક ખ ગ ઘ ઙ ચ છ જ ઝ ઞ ટ ઠ ડ ઢ ણ ત થ દ ધ ન પ ફ બ ભ મ ય ર લ વ શ ષ સ હ ળ ક્ષ જ્ઞ".w_split,
97
+ symbols: "૦ ૧ ૨ ૩ ૪ ૫ ૬ ૭ ૮ ૯ ૐ ઽ ૤ ૥".w_split,
98
+ candra: ["ૅ"],
99
+ },
100
+
101
+ # Gurmukhi
102
+ # --------
103
+ # Missing R/RR/lR/lRR
104
+ #
105
+ gurmukhi: {
106
+ vowels: "ਅ ਆ ਇ ਈ ਉ ਊ ਏ ਐ ਓ ਔ".w_split,
107
+ vowel_marks: "ਾ ਿ ੀ ੁ ੂ ੇ ੈ ੋ ੌ".w_split,
108
+ other_marks: "ਂ ਃ ਁ".w_split,
109
+ virama: ["੍"],
110
+ consonants: "ਕ ਖ ਗ ਘ ਙ ਚ ਛ ਜ ਝ ਞ ਟ ਠ ਡ ਢ ਣ ਤ ਥ ਦ ਧ ਨ ਪ ਫ ਬ ਭ ਮ ਯ ਰ ਲ ਵ ਸ਼ ਸ਼ ਸ ਹ ਲ਼ ਕ੍ਸ਼ ਜ੍ਞ".w_split,
111
+ symbols: "੦ ੧ ੨ ੩ ੪ ੫ ੬ ੭ ੮ ੯ ॐ ऽ । ॥".w_split,
112
+ other: " ਖ ਗ ਜ ਡ ਫ ".w_split,
113
+ },
114
+
115
+ # Kannada
116
+ # -------
117
+ # Sanskrit-complete.
118
+ #
119
+ kannada: {
120
+ vowels: "ಅ ಆ ಇ ಈ ಉ ಊ ಋ ೠ ಌ ೡ ಎ ಏ ಐ ಒ ಓ ಔ".w_split,
121
+ vowel_marks: "ಾ ಿ ೀ ು ೂ ೃ ೄ ೢ ೣ ೆ ೇ ೈ ೊ ೋ ೌ".w_split,
122
+ other_marks: "ಂ ಃ ँ".w_split,
123
+ virama: ["್"],
124
+ consonants: "ಕ ಖ ಗ ಘ ಙ ಚ ಛ ಜ ಝ ಞ ಟ ಠ ಡ ಢ ಣ ತ ಥ ದ ಧ ನ ಪ ಫ ಬ ಭ ಮ ಯ ರ ಲ ವ ಶ ಷ ಸ ಹ ಳ ಕ್ಷ ಜ್ಞ".w_split,
125
+ symbols: "೦ ೧ ೨ ೩ ೪ ೫ ೬ ೭ ೮ ೯ ಓಂ ಽ । ॥".w_split,
126
+ other: " ಫ ಱ".w_split,
127
+ },
128
+
129
+ # Malayalam
130
+ # ---------
131
+ # Sanskrit-complete.
132
+ #
133
+ malayalam: {
134
+ vowels: "അ ആ ഇ ഈ ഉ ഊ ഋ ൠ ഌ ൡ എ ഏ ഐ ഒ ഓ ഔ".w_split,
135
+ vowel_marks: "ാ ി ീ ു ൂ ൃ ൄ ൢ ൣ െ േ ൈ ൊ ോ ൌ".w_split,
136
+ other_marks: "ം ഃ ँ".w_split,
137
+ virama: ["്"],
138
+ consonants: "ക ഖ ഗ ഘ ങ ച ഛ ജ ഝ ഞ ട ഠ ഡ ഢ ണ ത ഥ ദ ധ ന പ ഫ ബ ഭ മ യ ര ല വ ശ ഷ സ ഹ ള ക്ഷ ജ്ഞ".w_split,
139
+ symbols: "൦ ൧ ൨ ൩ ൪ ൫ ൬ ൭ ൮ ൯ ഓം ഽ । ॥".w_split,
140
+ other: " റ".w_split,
141
+ },
142
+
143
+ # Oriya
144
+ # -----
145
+ # Sanskrit-complete.
146
+ #
147
+ oriya: {
148
+ vowels: "ଅ ଆ ଇ ଈ ଉ ଊ ଋ ୠ ଌ ୡ ଏ ଐ ଓ ଔ".w_split,
149
+ vowel_marks: "ା ି ୀ ୁ ୂ ୃ ୄ ୢ ୣ େ ୈ ୋ ୌ".w_split,
150
+ other_marks: "ଂ ଃ ଁ".w_split,
151
+ virama: ["୍"],
152
+ consonants: "କ ଖ ଗ ଘ ଙ ଚ ଛ ଜ ଝ ଞ ଟ ଠ ଡ ଢ ଣ ତ ଥ ଦ ଧ ନ ପ ଫ ବ ଭ ମ ଯ ର ଲ ଵ ଶ ଷ ସ ହ ଳ କ୍ଷ ଜ୍ଞ".w_split,
153
+ symbols: "୦ ୧ ୨ ୩ ୪ ୫ ୬ ୭ ୮ ୯ ଓଂ ଽ । ॥".w_split,
154
+ other: " ଡ ଢ ଯ ".w_split,
155
+ },
156
+
157
+ # Tamil
158
+ # -----
159
+ # Missing R/RR/lR/lRR vowel marks and voice/aspiration distinctions.
160
+ # The most incomplete of the Sanskrit schemes here.
161
+ #
162
+ tamil: {
163
+ vowels: "அ ஆ இ ஈ உ ஊ எ ஏ ஐ ஒ ஓ ஔ".w_split,
164
+ vowel_marks: "ா ி ீ ு ூ ெ ே ை ொ ோ ௌ".w_split,
165
+ other_marks: "ஂ ஃ ".w_split,
166
+ virama: ["்"],
167
+ consonants: "க க க க ங ச ச ஜ ச ஞ ட ட ட ட ண த த த த ந ப ப ப ப ம ய ர ல வ ஶ ஷ ஸ ஹ ள க்ஷ ஜ்ஞ".w_split,
168
+ symbols: "௦ ௧ ௨ ௩ ௪ ௫ ௬ ௭ ௮ ௯ ௐ ऽ । ॥".w_split,
169
+ other: " ற".w_split,
170
+ },
171
+
172
+ # Telugu
173
+ # ------
174
+ # Sanskrit-complete.
175
+ #
176
+ telugu: {
177
+ vowels: "అ ఆ ఇ ఈ ఉ ఊ ఋ ౠ ఌ ౡ ఎ ఏ ఐ ఒ ఓ ఔ".w_split,
178
+ vowel_marks: "ా ి ీ ు ూ ృ ౄ ౢ ౣ ె ే ై ొ ో ౌ".w_split,
179
+ other_marks: "ం ః ఁ".w_split,
180
+ virama: ["్"],
181
+ consonants: "క ఖ గ ఘ ఙ చ ఛ జ ఝ ఞ ట ఠ డ ఢ ణ త థ ద ధ న ప ఫ బ భ మ య ర ల వ శ ష స హ ళ క్ష జ్ఞ".w_split,
182
+ symbols: "౦ ౧ ౨ ౩ ౪ ౫ ౬ ౭ ౮ ౯ ఓం ఽ । ॥".w_split,
183
+ other: " ఱ".w_split,
184
+ },
185
+
186
+ # International Alphabet of Sanskrit Transliteration
187
+ # --------------------------------------------------
188
+ # The most "professional" Sanskrit romanization scheme.
189
+ #
190
+ iast: {
191
+ vowels: "a ā i ī u ū ṛ ṝ ḷ ḹ e ai o au".w_split,
192
+ other_marks: ["ṃ", "ḥ", "~"],
193
+ virama: [""],
194
+ consonants: "k kh g gh ṅ c ch j jh ñ ṭ ṭh ḍ ḍh ṇ t th d dh n p ph b bh m y r l v ś ṣ s h ḻ kṣ jñ".w_split,
195
+ symbols: "0 1 2 3 4 5 6 7 8 9 oṃ ' । ॥".w_split,
196
+ },
197
+
198
+ # ITRANS
199
+ # ------
200
+ # One of the first romanization schemes -- and one of the most
201
+ # complicated. For alternate forms, see the "@all_alternates" table
202
+ # below.
203
+ # *
204
+ # '_' is a "null" letter, which allows adjacent vowels.
205
+ #
206
+ itrans: {
207
+ vowels: "a A i I u U RRi RRI LLi LLI e ai o au".w_split,
208
+ other_marks: ["M", "H", ".N"],
209
+ virama: [""],
210
+ consonants: "k kh g gh ~N ch Ch j jh ~n T Th D Dh N t th d dh n p ph b bh m y r l v sh Sh s h L kSh j~n".w_split,
211
+ symbols: "0 1 2 3 4 5 6 7 8 9 OM .a | ||".w_split,
212
+ candra: [".c"],
213
+ zwj: ["{}"],
214
+ skip: ["_"],
215
+ accent: ["\\'", "\\_"],
216
+ combo_accent: "\\'H \\_H \\'M \\_M".w_split,
217
+ other: "q K G z .D .Dh f Y R".w_split,
218
+ },
219
+
220
+ # Harvard-Kyoto
221
+ # -------------
222
+ # A simple 1:1 mapping.
223
+ #
224
+ hk: {
225
+ vowels: "a A i I u U R RR lR lRR e ai o au".w_split,
226
+ other_marks: "M H ~".w_split,
227
+ virama: [""],
228
+ consonants: "k kh g gh G c ch j jh J T Th D Dh N t th d dh n p ph b bh m y r l v z S s h L kS jJ".w_split,
229
+ symbols: "0 1 2 3 4 5 6 7 8 9 OM ' | ||".w_split,
230
+ },
231
+
232
+ # National Library at Kolkata
233
+ # ---------------------------
234
+ # Apart from using "ē" and "ō" instead of "e" and "o", this scheme is
235
+ # identical to IAST. ṝ, ḷ, and ḹ are not part of the scheme proper.
236
+ # *
237
+ # This is defined further below.
238
+ #
239
+
240
+ # Sanskrit Library Phonetic Basic
241
+ # -------------------------------
242
+ # With one ASCII letter per phoneme, this is the tersest transliteration
243
+ # scheme in use today and is especially suited to computer processing.
244
+ #
245
+ slp1: {
246
+ vowels: "a A i I u U f F x X e E o O".w_split,
247
+ other_marks: "M H ~".w_split,
248
+ virama: [""],
249
+ consonants: "k K g G N c C j J Y w W q Q R t T d D n p P b B m y r l v S z s h L kz jY".w_split,
250
+ symbols: "0 1 2 3 4 5 6 7 8 9 oM ' . ..".w_split,
251
+ },
252
+
253
+ # Velthuis
254
+ # --------
255
+ # A case-insensitive Sanskrit encoding.
256
+ #
257
+ velthuis: {
258
+ vowels: "a aa i ii u uu .r .rr .li .ll e ai o au".w_split,
259
+ other_marks: ".m .h ".w_split,
260
+ virama: [""],
261
+ consonants: 'k kh g gh "n c ch j jh ~n .t .th .d .d .n t th d dh n p ph b bh m y r l v ~s .s s h L k.s j~n'.w_split,
262
+ symbols: "0 1 2 3 4 5 6 7 8 9 o.m ' | ||".w_split,
263
+ },
264
+
265
+ # WX
266
+ # --
267
+ # As terse as SLP1.
268
+ #
269
+ wx: {
270
+ vowels: "a A i I u U q Q L e E o O".w_split,
271
+ other_marks: "M H z".w_split,
272
+ virama: [""],
273
+ consonants: "k K g G f c C j J F t T d D N w W x X n p P b B m y r l v S R s h kR jF".w_split,
274
+ symbols: "0 1 2 3 4 5 6 7 8 9 oM ' | ||".w_split,
275
+ },
276
+ }
277
+
278
+ # Set of names of schemes
279
+ @roman_schemes = Set.new
280
+
281
+ # Map of alternate encodings.
282
+ @all_alternates = {
283
+ itrans: {
284
+ "A" => ["aa"],
285
+ "I" => %w[ii ee],
286
+ "U" => %w[uu oo],
287
+ "RRi" => ["R^i"],
288
+ "RRI" => ["R^I"],
289
+ "LLi" => ["L^i"],
290
+ "LLI" => ["L^I"],
291
+ "M" => [".m", ".n"],
292
+ "~N" => ["N^"],
293
+ "ch" => ["c"],
294
+ "Ch" => %w[C chh],
295
+ "~n" => ["JN"],
296
+ "v" => ["w"],
297
+ "Sh" => %w[S shh],
298
+ "kSh" => %w[kS x],
299
+ "j~n" => %w[GY dny],
300
+ "OM" => ["AUM"],
301
+ "\\_" => ["\\`"],
302
+ "\\_H" => ["\\`H"],
303
+ "\\'M" => ["\\'.m", "\\'.n"],
304
+ "\\_M" => "\\_.m \\_.n \\`M \\`.m \\`.n".w_split,
305
+ ".a" => ["~"],
306
+ "|" => ["."],
307
+ "||" => [".."],
308
+ "z" => ["J"],
309
+ },
310
+ }
311
+ end
312
+ end