sanscript 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,94 @@
1
+ # frozen_string_literal: true
2
+ require "ice_nine"
3
+
4
module Sanscript
  # Refinements used internally by Sanscript: recursive copying (`deep_dup`),
  # recursive freezing (`deep_freeze`, delegated to IceNine) and a
  # whitespace splitter for scheme definitions (`String#w_split`).
  module Refinements
    refine Object do
      # Generic fallback: a shallow `dup`, or the receiver itself when the
      # object refuses to be duplicated (`dup` raises TypeError).
      def deep_dup
        dup
      rescue TypeError
        self
      end

      # Recursively freezes the receiver via IceNine and returns the result
      # of `IceNine.deep_freeze`.
      def deep_freeze
        IceNine.deep_freeze(self)
      end
    end

    # The following values are returned as-is instead of being copied.
    refine NilClass do
      def deep_dup
        self
      end
    end

    refine FalseClass do
      def deep_dup
        self
      end
    end

    refine TrueClass do
      def deep_dup
        self
      end
    end

    refine Symbol do
      def deep_dup
        self
      end
    end

    refine Numeric do
      def deep_dup
        self
      end
    end

    # Necessary to re-override Numeric
    require "bigdecimal"
    refine BigDecimal do
      def deep_dup
        dup
      end
    end

    refine String do
      # Splits on every single whitespace character. Unlike `split(" ")`,
      # a leading or doubled space yields an empty-string element.
      def w_split
        split(/\s/)
      end
    end

    refine Array do
      # Returns a new Array whose elements are deep copies of the originals.
      def deep_dup
        map { |value| value.deep_dup } # rubocop:disable Style/SymbolProc
      end
    end

    refine Hash do
      # Returns a copy of the hash with deep-copied values. String keys are
      # reused as-is; any other key is removed from the copy and re-inserted
      # as a deep copy (which moves that entry to the end of the copy).
      def deep_dup
        hash = dup
        each_pair do |key, value|
          if ::String === key # rubocop:disable Style/CaseEquality
            hash[key] = value.deep_dup
          else
            hash.delete(key)
            hash[key.deep_dup] = value.deep_dup
          end
        end
        hash
      end
    end

    # Make sure Set is loaded before refining it.
    require "set"
    refine Set do
      # Returns a new set of the same class whose members are deep copies of
      # the originals; String members are reused as-is.
      #
      # The members are handed to `new` as an enumerable, so the copy has the
      # same number of members as the receiver (`self.class[array]` would
      # instead build a one-element set containing the array).
      def deep_dup
        members = map do |val|
          next val if ::String === val # rubocop:disable Style/CaseEquality
          val.deep_dup
        end
        self.class.new(members)
      end
    end
  end
end
@@ -0,0 +1,343 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "sanscript/refinements"
4
+ require "sanscript/transliterate/schemes"
5
+ #
6
+ # Sanscript
7
+ #
8
+ # Sanscript is a Sanskrit transliteration library. Currently, it supports
9
+ # other Indian languages only incidentally.
10
+ #
11
+ # Released under the MIT and GPL Licenses.
12
+ #
13
+ module Sanscript
14
+ using Refinements
15
+ module Transliterate
16
+ class << self
17
+ attr_reader :defaults, :schemes, :roman_schemes, :all_alternates
18
+ end
19
+
20
+ @defaults = {
21
+ skip_sgml: false,
22
+ syncope: false,
23
+ }
24
+
25
+ @cache = {}
26
+
27
+ module_function
28
+
29
+ #
30
+ # Return a list of available schemes.
31
+ #
32
+ # @return array of scheme identifiers
33
+ #
34
# Lists the identifiers of every registered scheme in sorted order.
#
# @return a freshly built, sorted array of the keys of `@schemes`
def scheme_names
  names = @schemes.keys
  names.sort
end
37
+
38
+ #
39
+ # Check whether the given scheme encodes romanized Sanskrit.
40
+ #
41
+ # @param name the scheme name
42
+ # @return boolean
43
+ #
44
# Tells whether `name` is registered in `@roman_schemes`.
#
# @param name the scheme name (anything responding to `to_sym`)
# @return boolean
def roman_scheme?(name)
  key = name.to_sym
  @roman_schemes.include?(key)
end
47
+
48
+ #
49
+ # Add a Brahmic scheme to Sanscript.
50
+ #
51
+ # Schemes are of two types: "Brahmic" and "roman". Brahmic consonants
52
+ # have an inherent vowel sound, but roman consonants do not. This is the
53
+ # main difference between these two types of scheme.
54
+ #
55
+ # A scheme definition is an object ("{}") that maps a group name to a
56
+ # list of characters. For illustration, see the "devanagari" scheme in
56
+ # sanscript/transliterate/schemes.rb.
58
+ #
59
+ # You can use whatever group names you like, but for the best results,
60
+ # you should use the same group names that Sanscript does.
61
+ #
62
+ # @param name the scheme name
63
+ # @param scheme the scheme data itself. This should be constructed as
64
+ # described above.
65
+ #
66
# Registers (or replaces) a Brahmic scheme under `name`.
#
# The scheme is deep-copied and deep-frozen before being stored, so later
# changes to the caller's object do not affect the registry.
#
# Because a scheme of the same name may already have been used, the name is
# dropped from the roman set and all memoized maps are discarded; otherwise
# `make_map` would keep serving maps built from the previous definition.
#
# @param name the scheme name (anything responding to `to_sym`)
# @param scheme the scheme data itself
# @return the frozen copy stored in the registry
def add_brahmic_scheme(name, scheme)
  name = name.to_sym
  frozen_scheme = scheme.deep_dup.deep_freeze
  @roman_schemes.delete(name)
  @cache.clear
  @schemes[name] = frozen_scheme
end
69
+
70
+ #
71
+ # Add a roman scheme to Sanscript.
72
+ #
73
+ # See the comments on Sanscript.add_brahmic_scheme. The "vowel_marks" field
74
+ # can be omitted.
75
+ #
76
+ # @param name the scheme name
77
+ # @param scheme the scheme data itself
78
+ #
79
# Registers (or replaces) a roman scheme under `name`.
#
# The scheme is deep-copied first. When it has no `:vowel_marks` entry, one
# is derived from `:vowels` by dropping the first vowel. The copy is then
# deep-frozen, stored, and the name is recorded in `@roman_schemes`.
#
# All memoized maps are discarded so that a re-registered scheme is not
# shadowed by maps built from its previous definition.
#
# @param name the scheme name (anything responding to `to_sym`)
# @param scheme the scheme data itself
# @return the result of `@roman_schemes.add(name)`
def add_roman_scheme(name, scheme)
  name = name.to_sym
  scheme = scheme.deep_dup
  scheme[:vowel_marks] = scheme[:vowels][1..-1] unless scheme.key?(:vowel_marks)
  @schemes[name] = scheme.deep_freeze
  @cache.clear
  @roman_schemes.add(name)
end
86
+
87
+ #
88
+ # Create a deep copy of an object, for certain kinds of objects.
89
+ #
90
+ # @param scheme the scheme to copy
91
+ # @return the copy
92
+ #
93
+
94
+ # Set up various schemes
95
begin
  # Set up roman schemes. Kolkata starts as a copy of IAST and gets its own
  # vowel list, which adds "ē" and "ō".
  kolkata = @schemes[:kolkata] = @schemes[:iast].deep_dup
  roman_names = %i[iast itrans hk kolkata slp1 velthuis wx]
  kolkata[:vowels] = %w[a ā i ī u ū ṛ ṝ ḷ ḹ e ē ai o ō au]

  # These schemes already belong to `@schemes`. But by adding them again
  # with `add_roman_scheme`, we automatically build up `@roman_schemes`
  # and define a `vowel_marks` field for each one.
  roman_names.each do |name|
    add_roman_scheme(name, @schemes[name])
  end

  # ITRANS variant, which supports Dravidian short 'e' and 'o'.
  # The two vocalic-l vowels need distinct tokens ("LLi" / "LLI"); with a
  # duplicated "LLi" the second entry overwrote the first in the letter map.
  # NOTE(review): "Ri" differs from the "RRi" used by the plain ITRANS
  # scheme and its alternates table -- confirm whether that is intended.
  itrans_dravidian = @schemes[:itrans].deep_dup
  itrans_dravidian[:vowels] = %w[a A i I u U Ri RRI LLi LLI e E ai o O au]
  # The copy already carries vowel marks derived from the shorter ITRANS
  # vowel list, so they are rebuilt from the extended list here.
  itrans_dravidian[:vowel_marks] = itrans_dravidian[:vowels][1..-1]
  @all_alternates[:itrans_dravidian] = @all_alternates[:itrans]
  add_roman_scheme(:itrans_dravidian, itrans_dravidian)

  # ensure deep freeze on all existing schemes and alternates
  @schemes.each { |_, scheme| scheme.deep_freeze }
  @all_alternates.each { |_, scheme| scheme.deep_freeze }
end
119
+
120
+ #
121
+ # Transliterate from one script to another.
122
+ #
123
+ # @param data the string to transliterate
124
+ # @param from the source script
125
+ # @param to the destination script
126
+ # @param options transliteration options
127
+ # @return the finished string
128
+ #
129
# Converts `data` from the `from` scheme into the `to` scheme.
#
# The input is copied before any in-place substitution, so the caller's
# string is left untouched. Raises a RuntimeError naming the scheme when
# either scheme is not registered.
#
# @param data the string to transliterate (must respond to `to_str`)
# @param from the source scheme name
# @param to the destination scheme name
# @param options overrides merged on top of `@defaults`
# @return the finished string
def transliterate(data, from, to, options = {})
  from = from.to_sym
  to = to.to_sym
  [from, to].each do |scheme_name|
    raise "Scheme not known ':#{scheme_name}'" unless @schemes.key?(scheme_name)
  end

  text = data.to_str.dup
  settings = @defaults.merge(options)
  map = make_map(from, to)

  # Wrap SGML-style tags in "##" so the transliterators pass them through.
  text.gsub!(/(<.*?>)/, "##\\1##") if settings[:skip_sgml]

  # Easy way out for "{\m+}", "\", and ".h".
  if from == :itrans
    text.gsub!(/\{\\m\+\}/, ".h.N")
    text.gsub!(/\.h/, "")
    text.gsub!(/\\([^'`_]|$)/, "##\\1##")
  end

  return transliterate_roman(text, map, settings) if map[:from_roman?]

  transliterate_brahmic(text, map)
end
154
+
155
+ class << self
156
+ private
157
+
158
+ #
159
+ # Create a map from every character in `from` to its partner in `to`.
160
+ # Also, store any "marks" that `from` might have.
161
+ #
162
+ # @param from input scheme
163
+ # @param to output scheme
164
+ #
165
# Builds (and memoizes) the lookup tables used to go from scheme `from` to
# scheme `to`.
#
# For every group present in both schemes, each source token -- and each of
# its alternates from `@all_alternates` -- is mapped to the destination
# token at the same index. `:vowel_marks` and `:virama` go into `:marks`;
# every other group goes into `:letters`, and `:consonants` / `:other`
# additionally into `:consonants`.
#
# The result is memoized under the `[from, to]` pair. An array key is used
# because joining the two names with "_" is ambiguous once scheme names
# themselves contain underscores.
#
# @param from input scheme name (Symbol)
# @param to output scheme name (Symbol)
# @return a deep-frozen Hash with the keys :consonants, :from_roman?,
#   :letters, :marks, :max_token_length, :to_roman? and :virama
def make_map(from, to)
  @cache[[from, to]] ||= begin
    alternates = @all_alternates[from] || {}
    from_scheme = @schemes[from]
    to_scheme = @schemes[to]
    consonants = {}
    letters = {}
    marks = {}
    token_lengths = []

    from_scheme.each do |group, from_group|
      to_group = to_scheme[group]
      next if to_group.nil?

      is_mark_group = group == :vowel_marks || group == :virama
      is_consonant_group = group == :consonants || group == :other

      from_group.each_with_index do |f, i|
        t = to_group[i]
        alts = alternates[f] || []
        # Every spelling of the token (canonical or alternate) maps to `t`.
        spellings = [f, *alts]
        token_lengths.concat(spellings.map(&:length))

        if is_mark_group
          spellings.each { |spelling| marks[spelling] = t }
        else
          spellings.each { |spelling| letters[spelling] = t }
          spellings.each { |spelling| consonants[spelling] = t } if is_consonant_group
        end
      end
    end

    {
      consonants: consonants,
      from_roman?: roman_scheme?(from),
      letters: letters,
      marks: marks,
      max_token_length: token_lengths.max,
      to_roman?: roman_scheme?(to),
      virama: to_scheme[:virama].first,
    }.deep_freeze
  end
end
211
+
212
+ #
213
+ # Transliterate from a romanized script.
214
+ #
215
+ # @param data the string to transliterate
216
+ # @param map map data generated from make_map
217
+ # @param options transliteration options
218
+ # @return the finished string
219
+ #
220
# Transliterates text written in a roman scheme.
#
# The input is consumed through a sliding window of at most
# `map[:max_token_length]` characters; at each step the longest prefix of
# the window found in `map[:letters]` wins. A "##" at the front of the
# window toggles transliteration on and off.
#
# @param data the string to transliterate (must respond to `to_str`)
# @param map map data generated from make_map
# @param options overrides merged on top of `@defaults`
# @return the finished string
def transliterate_roman(data, map, options = {})
  settings = @defaults.merge(options)
  remaining = data.to_str.dup
  out = []
  window = String.new
  after_consonant = false
  enabled = true

  until remaining.empty? && window.empty?
    max_len = map[:max_token_length]
    # Top the window back up to the maximum token length.
    window << remaining.slice!(0, max_len - window.length)

    # Try prefixes of the window, longest first.
    max_len.downto(1) do |len|
      token = window[0, len]

      if token == "##"
        enabled = !enabled
        window.slice!(0, 2)
        break
      end

      letter = map[:letters][token]
      if enabled && !letter.nil?
        if map[:to_roman?]
          out << letter
        else
          # Handle the implicit vowel. Ignore 'a' and force vowels to
          # appear as marks if we've just seen a consonant.
          if after_consonant
            mark = map[:marks][token]
            if mark.nil?
              out << map[:virama] << letter unless token == "a"
            else
              out << mark
            end
          else
            out << letter
          end
          after_consonant = map[:consonants].key?(token)
        end
        window.slice!(0, len)
        break
      elsif len == 1
        # Nothing matched, even at the shortest length: emit the character
        # unchanged, closing a pending consonant first.
        if after_consonant
          after_consonant = false
          out << map[:virama] unless settings[:syncope]
        end
        out << token
        window.slice!(0, 1)
      end
    end
  end

  out << map[:virama] if after_consonant && !settings[:syncope]
  out.join("")
end
277
+
278
+ #
279
+ # Transliterate from a Brahmic script.
280
+ #
281
+ # @param data the string to transliterate
282
+ # @param map map data generated from make_map
283
+ # @return the finished string
284
+ #
285
# Transliterates text written in a Brahmic scheme, one character at a time.
#
# A pair of consecutive "#" characters toggles transliteration on and off.
# When the destination is roman, an "a" is emitted after a consonant unless
# the next character is a mark.
#
# @param data the string to transliterate (must respond to `to_str`)
# @param map map data generated from make_map
# @return the finished string
def transliterate_brahmic(data, map)
  out = []
  pending_hash = false
  pending_a = false
  enabled = true

  data.to_str.each_char do |char|
    # Toggle transliteration state
    if char == "#"
      enabled = !enabled if pending_hash
      pending_hash = !pending_hash
      if pending_a
        out << "a"
        pending_a = false
      end
      next
    end

    unless enabled
      out << char
      next
    end

    mark = map[:marks][char]
    unless mark.nil?
      out << mark
      pending_a = false
      next
    end

    # A lone "#" that was not part of a "##" toggle is emitted literally.
    if pending_hash
      out << "#"
      pending_hash = false
    end
    if pending_a
      out << "a"
      pending_a = false
    end

    # Push transliterated letter if possible. Otherwise, push the letter
    # itself.
    letter = map[:letters][char]
    if letter.nil?
      out << char
    else
      out << letter
      pending_a = map[:to_roman?] && map[:consonants].key?(char)
    end
  end

  out << "a" if pending_a
  out.join("")
end
341
+ end
342
+ end
343
+ end
@@ -0,0 +1,312 @@
1
+ # frozen_string_literal: true
2
+ require "sanscript/refinements"
3
+
4
+ module Sanscript
5
+ using Refinements
6
+ module Transliterate
7
+ # Schemes
8
+ # =======
9
+ # Schemes are of two kinds: "Brahmic" and "roman." "Brahmic" schemes
10
+ # describe abugida scripts found in India. "Roman" schemes describe
11
+ # manufactured alphabets that are meant to describe or encode Brahmi
12
+ # scripts. Abugidas and alphabets are processed by separate algorithms
13
+ # because of the unique difficulties involved with each.
14
+ #
15
+ # Brahmic consonants are stated without a virama. Roman consonants are
16
+ # stated without the vowel 'a'.
17
+ #
18
+ # (Since "abugida" is not a well-known term, Sanscript uses "Brahmic"
19
+ # and "roman" for clarity.)
20
+ #
21
+ @schemes = {
22
+
23
+ # Bengali
24
+ # -------
25
+ # 'va' and 'ba' are both rendered as ব.
26
+ #
27
+ bengali: {
28
+ vowels: "অ আ ই ঈ উ ঊ ঋ ৠ ঌ ৡ এ ঐ ও ঔ".w_split,
29
+ vowel_marks: "া ি ী ু ূ ৃ ৄ ৢ ৣ ে ৈ ো ৌ".w_split,
30
+ other_marks: "ং ঃ ঁ".w_split,
31
+ virama: ["্"],
32
+ consonants: "ক খ গ ঘ ঙ চ ছ জ ঝ ঞ ট ঠ ড ঢ ণ ত থ দ ধ ন প ফ ব ভ ম য র ল ব শ ষ স হ ळ ক্ষ জ্ঞ".w_split,
33
+ symbols: "০ ১ ২ ৩ ৪ ৫ ৬ ৭ ৮ ৯ ॐ ঽ । ॥".w_split,
34
+ other: " ড ঢ য ".w_split,
35
+ },
36
+
37
+ # Devanagari
38
+ # ----------
39
+ # The most comprehensive and unambiguous Brahmic script listed.
40
+ #
41
+ devanagari: {
42
+ # "Independent" forms of the vowels. These are used whenever the
43
+ # vowel does not immediately follow a consonant.
44
+ vowels: "अ आ इ ई उ ऊ ऋ ॠ ऌ ॡ ऎ ए ऐ ऒ ओ औ".w_split,
45
+
46
+ # "Dependent" forms of the vowels. These are used whenever the
47
+ # vowel immediately follows a consonant. If a letter is not
48
+ # listed in `vowels`, it should not be listed here.
49
+ vowel_marks: "ा ि ी ु ू ृ ॄ ॢ ॣ ॆ े ै ॊ ो ौ".w_split,
50
+
51
+ # Miscellaneous marks, all of which are used in Sanskrit.
52
+ other_marks: "ं ः ँ".w_split,
53
+
54
+ # In syllabic scripts like Devanagari, consonants have an inherent
55
+ # vowel that must be suppressed explicitly. We do so by putting a
56
+ # virama after the consonant.
57
+ virama: ["्"],
58
+
59
+ # Various Sanskrit consonants and consonant clusters. Every token
60
+ # here has an explicit vowel. Thus "क" is "ka" instead of "k".
61
+ consonants: "क ख ग घ ङ च छ ज झ ञ ट ठ ड ढ ण त थ द ध न प फ ब भ म य र ल व श ष स ह ळ क्ष ज्ञ".w_split,
62
+
63
+ # Numbers and punctuation
64
+ symbols: "० १ २ ३ ४ ५ ६ ७ ८ ९ ॐ ऽ । ॥".w_split,
65
+
66
+ # Zero-width joiner. This is used to separate a consonant cluster
67
+ # and avoid a complex ligature.
68
+ zwj: ["\u200D"],
69
+
70
+ # Dummy consonant. This is used in ITRANS to prevent certain types
71
+ # of parser ambiguity. Thus "barau" -> बरौ but "bara_u" -> बरउ.
72
+ skip: [""],
73
+
74
+ # Vedic accent. Udatta and anudatta.
75
+ accent: %W[\u0951 \u0952],
76
+
77
+ # Accent combined with anusvara and visarga. For compatibility
78
+ # with ITRANS, which allows the reverse of these four.
79
+ combo_accent: "ः॑ ः॒ ं॑ ं॒".w_split,
80
+
81
+ candra: ["ॅ"],
82
+
83
+ # Non-Sanskrit consonants
84
+ other: "क़ ख़ ग़ ज़ ड़ ढ़ फ़ य़ ऱ".w_split,
85
+ },
86
+
87
+ # Gujarati
88
+ # --------
89
+ # Sanskrit-complete.
90
+ #
91
+ gujarati: {
92
+ vowels: "અ આ ઇ ઈ ઉ ઊ ઋ ૠ ઌ ૡ એ ઐ ઓ ઔ".w_split,
93
+ vowel_marks: "ા િ ી ુ ૂ ૃ ૄ ૢ ૣ ે ૈ ો ૌ".w_split,
94
+ other_marks: "ં ઃ ઁ".w_split,
95
+ virama: ["્"],
96
+ consonants: "ક ખ ગ ઘ ઙ ચ છ જ ઝ ઞ ટ ઠ ડ ઢ ણ ત થ દ ધ ન પ ફ બ ભ મ ય ર લ વ શ ષ સ હ ળ ક્ષ જ્ઞ".w_split,
97
+ symbols: "૦ ૧ ૨ ૩ ૪ ૫ ૬ ૭ ૮ ૯ ૐ ઽ ૤ ૥".w_split,
98
+ candra: ["ૅ"],
99
+ },
100
+
101
+ # Gurmukhi
102
+ # --------
103
+ # Missing R/RR/lR/lRR
104
+ #
105
+ gurmukhi: {
106
+ vowels: "ਅ ਆ ਇ ਈ ਉ ਊ ਏ ਐ ਓ ਔ".w_split,
107
+ vowel_marks: "ਾ ਿ ੀ ੁ ੂ ੇ ੈ ੋ ੌ".w_split,
108
+ other_marks: "ਂ ਃ ਁ".w_split,
109
+ virama: ["੍"],
110
+ consonants: "ਕ ਖ ਗ ਘ ਙ ਚ ਛ ਜ ਝ ਞ ਟ ਠ ਡ ਢ ਣ ਤ ਥ ਦ ਧ ਨ ਪ ਫ ਬ ਭ ਮ ਯ ਰ ਲ ਵ ਸ਼ ਸ਼ ਸ ਹ ਲ਼ ਕ੍ਸ਼ ਜ੍ਞ".w_split,
111
+ symbols: "੦ ੧ ੨ ੩ ੪ ੫ ੬ ੭ ੮ ੯ ॐ ऽ । ॥".w_split,
112
+ other: " ਖ ਗ ਜ ਡ ਫ ".w_split,
113
+ },
114
+
115
+ # Kannada
116
+ # -------
117
+ # Sanskrit-complete.
118
+ #
119
+ kannada: {
120
+ vowels: "ಅ ಆ ಇ ಈ ಉ ಊ ಋ ೠ ಌ ೡ ಎ ಏ ಐ ಒ ಓ ಔ".w_split,
121
+ vowel_marks: "ಾ ಿ ೀ ು ೂ ೃ ೄ ೢ ೣ ೆ ೇ ೈ ೊ ೋ ೌ".w_split,
122
+ other_marks: "ಂ ಃ ँ".w_split,
123
+ virama: ["್"],
124
+ consonants: "ಕ ಖ ಗ ಘ ಙ ಚ ಛ ಜ ಝ ಞ ಟ ಠ ಡ ಢ ಣ ತ ಥ ದ ಧ ನ ಪ ಫ ಬ ಭ ಮ ಯ ರ ಲ ವ ಶ ಷ ಸ ಹ ಳ ಕ್ಷ ಜ್ಞ".w_split,
125
+ symbols: "೦ ೧ ೨ ೩ ೪ ೫ ೬ ೭ ೮ ೯ ಓಂ ಽ । ॥".w_split,
126
+ other: " ಫ ಱ".w_split,
127
+ },
128
+
129
+ # Malayalam
130
+ # ---------
131
+ # Sanskrit-complete.
132
+ #
133
+ malayalam: {
134
+ vowels: "അ ആ ഇ ഈ ഉ ഊ ഋ ൠ ഌ ൡ എ ഏ ഐ ഒ ഓ ഔ".w_split,
135
+ vowel_marks: "ാ ി ീ ു ൂ ൃ ൄ ൢ ൣ െ േ ൈ ൊ ോ ൌ".w_split,
136
+ other_marks: "ം ഃ ँ".w_split,
137
+ virama: ["്"],
138
+ consonants: "ക ഖ ഗ ഘ ങ ച ഛ ജ ഝ ഞ ട ഠ ഡ ഢ ണ ത ഥ ദ ധ ന പ ഫ ബ ഭ മ യ ര ല വ ശ ഷ സ ഹ ള ക്ഷ ജ്ഞ".w_split,
139
+ symbols: "൦ ൧ ൨ ൩ ൪ ൫ ൬ ൭ ൮ ൯ ഓം ഽ । ॥".w_split,
140
+ other: " റ".w_split,
141
+ },
142
+
143
+ # Oriya
144
+ # -----
145
+ # Sanskrit-complete.
146
+ #
147
+ oriya: {
148
+ vowels: "ଅ ଆ ଇ ଈ ଉ ଊ ଋ ୠ ଌ ୡ ଏ ଐ ଓ ଔ".w_split,
149
+ vowel_marks: "ା ି ୀ ୁ ୂ ୃ ୄ ୢ ୣ େ ୈ ୋ ୌ".w_split,
150
+ other_marks: "ଂ ଃ ଁ".w_split,
151
+ virama: ["୍"],
152
+ consonants: "କ ଖ ଗ ଘ ଙ ଚ ଛ ଜ ଝ ଞ ଟ ଠ ଡ ଢ ଣ ତ ଥ ଦ ଧ ନ ପ ଫ ବ ଭ ମ ଯ ର ଲ ଵ ଶ ଷ ସ ହ ଳ କ୍ଷ ଜ୍ଞ".w_split,
153
+ symbols: "୦ ୧ ୨ ୩ ୪ ୫ ୬ ୭ ୮ ୯ ଓଂ ଽ । ॥".w_split,
154
+ other: " ଡ ଢ ଯ ".w_split,
155
+ },
156
+
157
+ # Tamil
158
+ # -----
159
+ # Missing R/RR/lR/lRR vowel marks and voice/aspiration distinctions.
160
+ # The most incomplete of the Sanskrit schemes here.
161
+ #
162
+ tamil: {
163
+ vowels: "அ ஆ இ ஈ உ ஊ எ ஏ ஐ ஒ ஓ ஔ".w_split,
164
+ vowel_marks: "ா ி ீ ு ூ ெ ே ை ொ ோ ௌ".w_split,
165
+ other_marks: "ஂ ஃ ".w_split,
166
+ virama: ["்"],
167
+ consonants: "க க க க ங ச ச ஜ ச ஞ ட ட ட ட ண த த த த ந ப ப ப ப ம ய ர ல வ ஶ ஷ ஸ ஹ ள க்ஷ ஜ்ஞ".w_split,
168
+ symbols: "௦ ௧ ௨ ௩ ௪ ௫ ௬ ௭ ௮ ௯ ௐ ऽ । ॥".w_split,
169
+ other: " ற".w_split,
170
+ },
171
+
172
+ # Telugu
173
+ # ------
174
+ # Sanskrit-complete.
175
+ #
176
+ telugu: {
177
+ vowels: "అ ఆ ఇ ఈ ఉ ఊ ఋ ౠ ఌ ౡ ఎ ఏ ఐ ఒ ఓ ఔ".w_split,
178
+ vowel_marks: "ా ి ీ ు ూ ృ ౄ ౢ ౣ ె ే ై ొ ో ౌ".w_split,
179
+ other_marks: "ం ః ఁ".w_split,
180
+ virama: ["్"],
181
+ consonants: "క ఖ గ ఘ ఙ చ ఛ జ ఝ ఞ ట ఠ డ ఢ ణ త థ ద ధ న ప ఫ బ భ మ య ర ల వ శ ష స హ ళ క్ష జ్ఞ".w_split,
182
+ symbols: "౦ ౧ ౨ ౩ ౪ ౫ ౬ ౭ ౮ ౯ ఓం ఽ । ॥".w_split,
183
+ other: " ఱ".w_split,
184
+ },
185
+
186
+ # International Alphabet of Sanskrit Transliteration
187
+ # --------------------------------------------------
188
+ # The most "professional" Sanskrit romanization scheme.
189
+ #
190
+ iast: {
191
+ vowels: "a ā i ī u ū ṛ ṝ ḷ ḹ e ai o au".w_split,
192
+ other_marks: ["ṃ", "ḥ", "~"],
193
+ virama: [""],
194
+ consonants: "k kh g gh ṅ c ch j jh ñ ṭ ṭh ḍ ḍh ṇ t th d dh n p ph b bh m y r l v ś ṣ s h ḻ kṣ jñ".w_split,
195
+ symbols: "0 1 2 3 4 5 6 7 8 9 oṃ ' । ॥".w_split,
196
+ },
197
+
198
+ # ITRANS
199
+ # ------
200
+ # One of the first romanization schemes -- and one of the most
201
+ # complicated. For alternate forms, see the `@all_alternates` map
202
+ # below.
203
+ # *
204
+ # '_' is a "null" letter, which allows adjacent vowels.
205
+ #
206
+ itrans: {
207
+ vowels: "a A i I u U RRi RRI LLi LLI e ai o au".w_split,
208
+ other_marks: ["M", "H", ".N"],
209
+ virama: [""],
210
+ consonants: "k kh g gh ~N ch Ch j jh ~n T Th D Dh N t th d dh n p ph b bh m y r l v sh Sh s h L kSh j~n".w_split,
211
+ symbols: "0 1 2 3 4 5 6 7 8 9 OM .a | ||".w_split,
212
+ candra: [".c"],
213
+ zwj: ["{}"],
214
+ skip: ["_"],
215
+ accent: ["\\'", "\\_"],
216
+ combo_accent: "\\'H \\_H \\'M \\_M".w_split,
217
+ other: "q K G z .D .Dh f Y R".w_split,
218
+ },
219
+
220
+ # Harvard-Kyoto
221
+ # -------------
222
+ # A simple 1:1 mapping.
223
+ #
224
+ hk: {
225
+ vowels: "a A i I u U R RR lR lRR e ai o au".w_split,
226
+ other_marks: "M H ~".w_split,
227
+ virama: [""],
228
+ consonants: "k kh g gh G c ch j jh J T Th D Dh N t th d dh n p ph b bh m y r l v z S s h L kS jJ".w_split,
229
+ symbols: "0 1 2 3 4 5 6 7 8 9 OM ' | ||".w_split,
230
+ },
231
+
232
+ # National Library at Kolkata
233
+ # ---------------------------
234
+ # Apart from using "ē" and "ō" instead of "e" and "o", this scheme is
235
+ # identical to IAST. ṝ, ḷ, and ḹ are not part of the scheme proper.
236
+ # *
237
+ # This is defined further below.
238
+ #
239
+
240
+ # Sanskrit Library Phonetic Basic
241
+ # -------------------------------
242
+ # With one ASCII letter per phoneme, this is the tersest transliteration
243
+ # scheme in use today and is especially suited to computer processing.
244
+ #
245
+ slp1: {
246
+ vowels: "a A i I u U f F x X e E o O".w_split,
247
+ other_marks: "M H ~".w_split,
248
+ virama: [""],
249
+ consonants: "k K g G N c C j J Y w W q Q R t T d D n p P b B m y r l v S z s h L kz jY".w_split,
250
+ symbols: "0 1 2 3 4 5 6 7 8 9 oM ' . ..".w_split,
251
+ },
252
+
253
+ # Velthuis
254
+ # --------
255
+ # A case-insensitive Sanskrit encoding.
256
+ #
257
+ velthuis: {
258
+ vowels: "a aa i ii u uu .r .rr .li .ll e ai o au".w_split,
259
+ other_marks: ".m .h ".w_split,
260
+ virama: [""],
261
+ consonants: 'k kh g gh "n c ch j jh ~n .t .th .d .d .n t th d dh n p ph b bh m y r l v ~s .s s h L k.s j~n'.w_split,
262
+ symbols: "0 1 2 3 4 5 6 7 8 9 o.m ' | ||".w_split,
263
+ },
264
+
265
+ # WX
266
+ # --
267
+ # As terse as SLP1.
268
+ #
269
+ wx: {
270
+ vowels: "a A i I u U q Q L e E o O".w_split,
271
+ other_marks: "M H z".w_split,
272
+ virama: [""],
273
+ consonants: "k K g G f c C j J F t T d D N w W x X n p P b B m y r l v S R s h kR jF".w_split,
274
+ symbols: "0 1 2 3 4 5 6 7 8 9 oM ' | ||".w_split,
275
+ },
276
+ }
277
+
278
+ # Set of names of schemes
279
+ @roman_schemes = Set.new
280
+
281
+ # Map of alternate encodings.
282
+ @all_alternates = {
283
+ itrans: {
284
+ "A" => ["aa"],
285
+ "I" => %w[ii ee],
286
+ "U" => %w[uu oo],
287
+ "RRi" => ["R^i"],
288
+ "RRI" => ["R^I"],
289
+ "LLi" => ["L^i"],
290
+ "LLI" => ["L^I"],
291
+ "M" => [".m", ".n"],
292
+ "~N" => ["N^"],
293
+ "ch" => ["c"],
294
+ "Ch" => %w[C chh],
295
+ "~n" => ["JN"],
296
+ "v" => ["w"],
297
+ "Sh" => %w[S shh],
298
+ "kSh" => %w[kS x],
299
+ "j~n" => %w[GY dny],
300
+ "OM" => ["AUM"],
301
+ "\\_" => ["\\`"],
302
+ "\\_H" => ["\\`H"],
303
+ "\\'M" => ["\\'.m", "\\'.n"],
304
+ "\\_M" => "\\_.m \\_.n \\`M \\`.m \\`.n".w_split,
305
+ ".a" => ["~"],
306
+ "|" => ["."],
307
+ "||" => [".."],
308
+ "z" => ["J"],
309
+ },
310
+ }
311
+ end
312
+ end