sanscript 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.codeclimate.yml +9 -0
- data/.gitignore +9 -0
- data/.rspec +2 -0
- data/.rubocop.yml +97 -0
- data/.travis.yml +9 -0
- data/CODE_OF_CONDUCT.md +49 -0
- data/Gemfile +5 -0
- data/LICENSE.txt +22 -0
- data/README.md +43 -0
- data/Rakefile +7 -0
- data/bin/console +12 -0
- data/bin/setup +8 -0
- data/lib/sanscript.rb +29 -0
- data/lib/sanscript/benchmark.rb +53 -0
- data/lib/sanscript/detect.rb +77 -0
- data/lib/sanscript/refinements.rb +94 -0
- data/lib/sanscript/transliterate.rb +343 -0
- data/lib/sanscript/transliterate/schemes.rb +312 -0
- data/lib/sanscript/version.rb +4 -0
- data/sanscript.gemspec +29 -0
- metadata +148 -0
@@ -0,0 +1,94 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
require "ice_nine"
|
3
|
+
|
4
|
+
module Sanscript
  # Refinements that add deep-copy / deep-freeze helpers (and one String
  # helper) to core classes. They are only visible in files that activate
  # them with `using Sanscript::Refinements`.
  module Refinements
    refine Object do
      # Fallback copy for objects without a more specific refinement below:
      # a shallow `dup`, or the receiver itself when duplication raises
      # TypeError.
      def deep_dup
        dup
      rescue TypeError
        self
      end

      # Recursively freeze the receiver by delegating to IceNine.
      #
      # @return whatever IceNine.deep_freeze returns for the receiver
      def deep_freeze
        IceNine.deep_freeze(self)
      end
    end

    # nil, false, true, symbols and numbers are returned as-is instead of
    # being duplicated.
    refine NilClass do
      def deep_dup
        self
      end
    end

    refine FalseClass do
      def deep_dup
        self
      end
    end

    refine TrueClass do
      def deep_dup
        self
      end
    end

    refine Symbol do
      def deep_dup
        self
      end
    end

    refine Numeric do
      def deep_dup
        self
      end
    end

    # Necessary to re-override Numeric: BigDecimal is a Numeric, but it is
    # duplicated with `dup` rather than returned as-is.
    require "bigdecimal"
    refine BigDecimal do
      def deep_dup
        dup
      end
    end

    refine String do
      # Split the string on single whitespace characters (`/\s/`).
      #
      # @return [Array<String>] the pieces between whitespace characters
      def w_split
        split(/\s/)
      end
    end

    refine Array do
      # @return [Array] a new array whose elements are deep copies
      def deep_dup
        # An explicit block (not `&:deep_dup`) is kept on purpose, as in the
        # original code, so the refined `deep_dup` is what gets called.
        map { |value| value.deep_dup } # rubocop:disable Style/SymbolProc
      end
    end

    refine Hash do
      # Copy the hash, deep-copying every value. String keys are reused;
      # any other key is replaced by its own deep copy.
      #
      # @return [Hash] the copy
      def deep_dup
        hash = dup
        each_pair do |key, value|
          if ::String === key # rubocop:disable Style/CaseEquality
            hash[key] = value.deep_dup
          else
            hash.delete(key)
            hash[key.deep_dup] = value.deep_dup
          end
        end
        hash
      end
    end

    # Make sure Set is loaded before it is refined.
    require "set"
    refine Set do
      # Build a new set of the same class whose members are deep copies of
      # the receiver's members. String members are reused, not copied.
      #
      # @return [Set] the copy
      def deep_dup
        members = map do |val|
          next val if ::String === val # rubocop:disable Style/CaseEquality
          val.deep_dup
        end
        # `Set.new(enum)` adds each member. `Set[array]` would instead build
        # a one-element set that contains the array itself.
        self.class.new(members)
      end
    end
  end
end
|
@@ -0,0 +1,343 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "sanscript/refinements"
|
4
|
+
require "sanscript/transliterate/schemes"
|
5
|
+
#
|
6
|
+
# Sanscript
|
7
|
+
#
|
8
|
+
# Sanscript is a Sanskrit transliteration library. Currently, it supports
|
9
|
+
# other Indian languages only incidentally.
|
10
|
+
#
|
11
|
+
# Released under the MIT and GPL Licenses.
|
12
|
+
#
|
13
|
+
module Sanscript
|
14
|
+
using Refinements
|
15
|
+
module Transliterate
|
16
|
+
# Read-only accessors on the module itself for its configuration and
# scheme tables.
class << self
  attr_reader :defaults
  attr_reader :schemes
  attr_reader :roman_schemes
  attr_reader :all_alternates
end

# Option values merged underneath whatever the caller passes to
# `transliterate`.
@defaults = { skip_sgml: false, syncope: false }

# Memoized character maps built by `make_map`.
@cache = {}
|
26
|
+
|
27
|
+
module_function
|
28
|
+
|
29
|
+
#
|
30
|
+
# List the identifiers of every registered scheme, in sorted order.
#
# @return a new, sorted array of the keys of the scheme table
#
def scheme_names
  names = @schemes.keys
  names.sort
end
|
37
|
+
|
38
|
+
#
|
39
|
+
# Tell whether a scheme has been registered as a roman scheme.
#
# @param name the scheme name (converted with `to_sym` before the lookup)
# @return boolean
#
def roman_scheme?(name)
  key = name.to_sym
  @roman_schemes.include?(key)
end
|
47
|
+
|
48
|
+
#
|
49
|
+
# Add a Brahmic scheme to Sanscript.
#
# Schemes are of two types: "Brahmic" and "roman". Brahmic consonants
# have an inherent vowel sound, but roman consonants do not. This is the
# main difference between these two types of scheme.
#
# A scheme definition is a Hash that maps a group name to a list of
# characters. For illustration, see the `devanagari` entry of the scheme
# table in sanscript/transliterate/schemes.
#
# You can use whatever group names you like, but for the best results,
# you should use the same group names that Sanscript does.
#
# The scheme is stored as a deep-frozen deep copy, so the caller's object
# is neither retained nor modified. Registering a scheme also discards all
# memoized character maps, so replacing an existing scheme takes effect on
# the next `transliterate` call.
#
# @param name the scheme name (converted with `to_sym`)
# @param scheme the scheme data itself. This should be constructed as
#               described above.
# @return the frozen copy that was stored
#
def add_brahmic_scheme(name, scheme)
  # Maps are memoized per scheme pair; without this a re-registered scheme
  # would keep being served from a stale map.
  @cache.clear
  @schemes[name.to_sym] = scheme.deep_dup.deep_freeze
end
|
69
|
+
|
70
|
+
#
|
71
|
+
# Add a roman scheme to Sanscript.
#
# The scheme definition has the same shape as for `add_brahmic_scheme`: a
# Hash that maps a group name to a list of tokens. The `:vowel_marks` group
# can be omitted; it then defaults to every entry of `:vowels` except the
# first.
#
# The scheme is stored as a deep-frozen deep copy and its name is recorded
# in the set of roman schemes. Registering a scheme also discards all
# memoized character maps, so replacing an existing scheme takes effect on
# the next `transliterate` call.
#
# @param name the scheme name (converted with `to_sym`)
# @param scheme the scheme data itself
# @return the set of roman scheme names
#
def add_roman_scheme(name, scheme)
  name = name.to_sym
  scheme = scheme.deep_dup
  scheme[:vowel_marks] = scheme[:vowels][1..-1] unless scheme.key?(:vowel_marks)
  # Maps are memoized per scheme pair; without this a re-registered scheme
  # would keep being served from a stale map.
  @cache.clear
  @schemes[name] = scheme.deep_freeze
  @roman_schemes.add(name)
end
|
86
|
+
|
87
|
+
#
|
88
|
+
# NOTE: no copy helper is defined here. Deep copies of scheme data are made
# with the `deep_dup` refinement activated above via `using Refinements`.
|
92
|
+
#
|
93
|
+
|
94
|
+
# Register the built-in roman schemes and their derived variants at load
# time, then freeze every scheme and alternate table.
begin
  # Kolkata starts as a deep copy of IAST and only replaces the vowel list,
  # which adds "ē" and "ō".
  kolkata = @schemes[:kolkata] = @schemes[:iast].deep_dup
  kolkata[:vowels] = %w[a ā i ī u ū ṛ ṝ ḷ ḹ e ē ai o ō au]

  # These schemes already belong to the scheme table. Adding them again
  # with `add_roman_scheme` builds up `roman_schemes` and defines a
  # `vowel_marks` group for each one.
  roman_names = %i[iast itrans hk kolkata slp1 velthuis wx]
  roman_names.each do |name|
    add_roman_scheme(name, @schemes[name])
  end

  # ITRANS variant, which supports Dravidian short 'e' and 'o'. The vowel
  # tokens are the ITRANS ones with "E" and "O" inserted, so each token is
  # unique and the ITRANS alternates (keyed by "RRi", "LLI", ...) apply.
  itrans_dravidian = @schemes[:itrans].deep_dup
  itrans_dravidian[:vowels] = %w[a A i I u U RRi RRI LLi LLI e E ai o O au]
  itrans_dravidian[:vowel_marks] = itrans_dravidian[:vowels][1..-1]
  @all_alternates[:itrans_dravidian] = @all_alternates[:itrans]
  add_roman_scheme(:itrans_dravidian, itrans_dravidian)

  # Ensure deep freeze on all existing schemes and alternates. Explicit
  # blocks are used so the refined `deep_freeze` is what gets called.
  @schemes.each_value { |scheme| scheme.deep_freeze }
  @all_alternates.each_value { |alternates| alternates.deep_freeze }
end
|
119
|
+
|
120
|
+
#
# Convert a string from one registered scheme to another.
#
# @param data the string to transliterate (anything responding to `to_str`;
#             the input itself is not modified)
# @param from the source scheme name
# @param to the destination scheme name
# @param options transliteration options, merged over the module defaults:
#                `:skip_sgml` wraps `<...>` tags in "##" markers so they are
#                copied through untransliterated; `:syncope` suppresses the
#                virama after a consonant with no following vowel when the
#                source scheme is roman
# @return the finished string
# @raise RuntimeError if either scheme name is not registered
#
def transliterate(data, from, to, options = {})
  from = from.to_sym
  to = to.to_sym
  [from, to].each do |scheme_name|
    raise "Scheme not known ':#{scheme_name}'" unless @schemes.key?(scheme_name)
  end

  text = data.to_str.dup
  options = @defaults.merge(options)
  map = make_map(from, to)

  text.gsub!(/(<.*?>)/, "##\\1##") if options[:skip_sgml]

  # Easy way out for "{\m+}", "\", and ".h".
  if from == :itrans
    text.gsub!(/\{\\m\+\}/, ".h.N")
    text.gsub!(/\.h/, "")
    text.gsub!(/\\([^'`_]|$)/, "##\\1##")
  end

  return transliterate_roman(text, map, options) if map[:from_roman?]

  transliterate_brahmic(text, map)
end
|
154
|
+
|
155
|
+
class << self
|
156
|
+
private
|
157
|
+
|
158
|
+
#
|
159
|
+
# Create a map from every token in `from` to its partner in `to`, including
# the alternates registered for `from`. Also store any "marks" that `from`
# might have. The result is deep-frozen and memoized per scheme pair.
#
# The returned Hash has these keys:
#   :letters          tokens of every group except :vowel_marks / :virama
#   :marks            tokens of the :vowel_marks and :virama groups
#   :consonants       tokens of the :consonants and :other groups
#   :max_token_length length of the longest source token
#   :virama           first entry of the target scheme's :virama group
#   :from_roman?      whether `from` is a roman scheme
#   :to_roman?        whether `to` is a roman scheme
#
# @param from input scheme
# @param to output scheme
# @return the frozen map
#
def make_map(from, to)
  # Keyed by the pair itself: a joined name such as :"a_b_c" is ambiguous
  # between (:a_b, :c) and (:a, :b_c).
  @cache[[from, to]] ||= begin
    alternates = @all_alternates[from] || {}
    from_scheme = @schemes[from]
    to_scheme = @schemes[to]
    consonants = {}
    letters = {}
    marks = {}
    token_lengths = []

    from_scheme.each do |group, from_group|
      to_group = to_scheme[group]
      # Groups the target scheme does not define are left unmapped.
      next if to_group.nil?

      from_group.each_with_index do |f, i|
        t = to_group[i]
        # The canonical token first, then its alternates; all map to `t`.
        tokens = [f, *(alternates[f] || [])]
        token_lengths.concat(tokens.map(&:length))

        if group == :vowel_marks || group == :virama
          tokens.each { |token| marks[token] = t }
        else
          tokens.each { |token| letters[token] = t }
          if group == :consonants || group == :other
            tokens.each { |token| consonants[token] = t }
          end
        end
      end
    end

    {
      consonants: consonants,
      from_roman?: roman_scheme?(from),
      letters: letters,
      marks: marks,
      max_token_length: token_lengths.max,
      to_roman?: roman_scheme?(to),
      virama: to_scheme[:virama].first,
    }.deep_freeze
  end
end
|
211
|
+
|
212
|
+
#
|
213
|
+
# Transliterate text written in a roman scheme.
#
# The input is consumed through a window of at most `map[:max_token_length]`
# characters. At each step the longest prefix of the window found in
# `map[:letters]` is translated; if no prefix matches, one character is
# copied through unchanged. A "##" token toggles transliteration on and off
# and is itself dropped from the output.
#
# @param data the string to transliterate
# @param map map data generated from `make_map`
# @param options transliteration options; `:syncope` suppresses the virama
#                after a consonant that is not followed by a vowel
# @return the finished string
#
def transliterate_roman(data, map, options = {})
  options = @defaults.merge(options)
  remaining = data.to_str.dup
  max_len = map[:max_token_length]
  letters = map[:letters]
  output = []
  window = String.new
  after_consonant = false
  enabled = true

  until remaining.empty? && window.empty?
    # Top the window back up to the maximum token length.
    window << remaining.slice!(0, max_len - window.length)

    # Try the longest candidate first, down to a single character.
    max_len.downto(1) do |len|
      token = window[0, len]

      if token == "##"
        enabled = !enabled
        window.slice!(0, 2)
        break
      end

      mapped = letters[token]
      if enabled && !mapped.nil?
        if map[:to_roman?]
          output << mapped
        else
          # Handle the implicit vowel. Ignore 'a' and force vowels to
          # appear as marks if we've just seen a consonant.
          if after_consonant
            mark = map[:marks][token]
            if !mark.nil?
              output << mark
            elsif token != "a"
              output << map[:virama] << mapped
            end
          else
            output << mapped
          end
          after_consonant = map[:consonants].key?(token)
        end
        window.slice!(0, len)
        break
      elsif len == 1
        # Nothing matched, even at one character: close any open
        # consonant, then pass the character through untouched.
        if after_consonant
          after_consonant = false
          output << map[:virama] unless options[:syncope]
        end
        output << token
        window.slice!(0, 1)
      end
    end
  end

  output << map[:virama] if after_consonant && !options[:syncope]
  output.join("")
end
|
277
|
+
|
278
|
+
#
|
279
|
+
# Transliterate text written in a Brahmic scheme, one character at a time.
#
# Two consecutive "#" characters toggle transliteration on and off and are
# dropped from the output. When the target is a roman scheme, an "a" is
# emitted after a consonant unless a mark from `map[:marks]` follows it.
#
# @param data the string to transliterate
# @param map map data generated from `make_map`
# @return the finished string
#
def transliterate_brahmic(data, map)
  output = []
  pending_hash = false
  open_consonant = false
  enabled = true

  data.to_str.each_char do |char|
    # Toggle transliteration state
    if char == "#"
      if pending_hash
        enabled = !enabled
        pending_hash = false
      else
        pending_hash = true
      end
      if open_consonant
        output << "a"
        open_consonant = false
      end
      next
    elsif !enabled
      output << char
      next
    end

    mark = map[:marks][char]
    unless mark.nil?
      # A mark replaces the consonant's implicit vowel.
      output << mark
      open_consonant = false
      next
    end

    # A lone "#" that was not part of a "##" toggle is emitted literally.
    if pending_hash
      output << "#"
      pending_hash = false
    end
    if open_consonant
      output << "a"
      open_consonant = false
    end

    # Push transliterated letter if possible. Otherwise, push the
    # character itself.
    letter = map[:letters][char]
    if letter.nil?
      output << char
    else
      output << letter
      open_consonant = map[:to_roman?] && map[:consonants].key?(char)
    end
  end

  output << "a" if open_consonant
  output.join("")
end
|
341
|
+
end
|
342
|
+
end
|
343
|
+
end
|
@@ -0,0 +1,312 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
require "sanscript/refinements"
|
3
|
+
|
4
|
+
module Sanscript
|
5
|
+
using Refinements
|
6
|
+
module Transliterate
|
7
|
+
# Schemes
|
8
|
+
# =======
|
9
|
+
# Schemes are of two kinds: "Brahmic" and "roman." "Brahmic" schemes
|
10
|
+
# describe abugida scripts found in India. "Roman" schemes describe
|
11
|
+
# manufactured alphabets that are meant to describe or encode Brahmi
|
12
|
+
# scripts. Abugidas and alphabets are processed by separate algorithms
|
13
|
+
# because of the unique difficulties involved with each.
|
14
|
+
#
|
15
|
+
# Brahmic consonants are stated without a virama. Roman consonants are
|
16
|
+
# stated without the vowel 'a'.
|
17
|
+
#
|
18
|
+
# (Since "abugida" is not a well-known term, Sanscript uses "Brahmic"
|
19
|
+
# and "roman" for clarity.)
|
20
|
+
#
|
21
|
+
@schemes = {
|
22
|
+
|
23
|
+
# Bengali
|
24
|
+
# -------
|
25
|
+
# 'va' and 'ba' are both rendered as ব.
|
26
|
+
#
|
27
|
+
bengali: {
|
28
|
+
vowels: "অ আ ই ঈ উ ঊ ঋ ৠ ঌ ৡ এ ঐ ও ঔ".w_split,
|
29
|
+
vowel_marks: "া ি ী ু ূ ৃ ৄ ৢ ৣ ে ৈ ো ৌ".w_split,
|
30
|
+
other_marks: "ং ঃ ঁ".w_split,
|
31
|
+
virama: ["্"],
|
32
|
+
consonants: "ক খ গ ঘ ঙ চ ছ জ ঝ ঞ ট ঠ ড ঢ ণ ত থ দ ধ ন প ফ ব ভ ম য র ল ব শ ষ স হ ळ ক্ষ জ্ঞ".w_split,
|
33
|
+
symbols: "০ ১ ২ ৩ ৪ ৫ ৬ ৭ ৮ ৯ ॐ ঽ । ॥".w_split,
|
34
|
+
other: " ড ঢ য ".w_split,
|
35
|
+
},
|
36
|
+
|
37
|
+
# Devanagari
|
38
|
+
# ----------
|
39
|
+
# The most comprehensive and unambiguous Brahmic script listed.
|
40
|
+
#
|
41
|
+
devanagari: {
|
42
|
+
# "Independent" forms of the vowels. These are used whenever the
|
43
|
+
# vowel does not immediately follow a consonant.
|
44
|
+
vowels: "अ आ इ ई उ ऊ ऋ ॠ ऌ ॡ ऎ ए ऐ ऒ ओ औ".w_split,
|
45
|
+
|
46
|
+
# "Dependent" forms of the vowels. These are used whenever the
|
47
|
+
# vowel immediately follows a consonant. If a letter is not
|
48
|
+
# listed in `vowels`, it should not be listed here.
|
49
|
+
vowel_marks: "ा ि ी ु ू ृ ॄ ॢ ॣ ॆ े ै ॊ ो ौ".w_split,
|
50
|
+
|
51
|
+
# Miscellaneous marks, all of which are used in Sanskrit.
|
52
|
+
other_marks: "ं ः ँ".w_split,
|
53
|
+
|
54
|
+
# In syllabic scripts like Devanagari, consonants have an inherent
|
55
|
+
# vowel that must be suppressed explicitly. We do so by putting a
|
56
|
+
# virama after the consonant.
|
57
|
+
virama: ["्"],
|
58
|
+
|
59
|
+
# Various Sanskrit consonants and consonant clusters. Every token
|
60
|
+
# here has an explicit vowel. Thus "क" is "ka" instead of "k".
|
61
|
+
consonants: "क ख ग घ ङ च छ ज झ ञ ट ठ ड ढ ण त थ द ध न प फ ब भ म य र ल व श ष स ह ळ क्ष ज्ञ".w_split,
|
62
|
+
|
63
|
+
# Numbers and punctuation
|
64
|
+
symbols: "० १ २ ३ ४ ५ ६ ७ ८ ९ ॐ ऽ । ॥".w_split,
|
65
|
+
|
66
|
+
# Zero-width joiner. This is used to separate a consonant cluster
|
67
|
+
# and avoid a complex ligature.
|
68
|
+
zwj: ["\u200D"],
|
69
|
+
|
70
|
+
# Dummy consonant. This is used in ITRANS to prevent certain types
|
71
|
+
# of parser ambiguity. Thus "barau" -> बरौ but "bara_u" -> बरउ.
|
72
|
+
skip: [""],
|
73
|
+
|
74
|
+
# Vedic accent. Udatta and anudatta.
|
75
|
+
accent: %W[\u0951 \u0952],
|
76
|
+
|
77
|
+
# Accent combined with anusvara and visarga. For compatibility
|
78
|
+
# with ITRANS, which allows the reverse of these four.
|
79
|
+
combo_accent: "ः॑ ः॒ ं॑ ं॒".w_split,
|
80
|
+
|
81
|
+
candra: ["ॅ"],
|
82
|
+
|
83
|
+
# Non-Sanskrit consonants
|
84
|
+
other: "क़ ख़ ग़ ज़ ड़ ढ़ फ़ य़ ऱ".w_split,
|
85
|
+
},
|
86
|
+
|
87
|
+
# Gujarati
|
88
|
+
# --------
|
89
|
+
# Sanskrit-complete.
|
90
|
+
#
|
91
|
+
gujarati: {
|
92
|
+
vowels: "અ આ ઇ ઈ ઉ ઊ ઋ ૠ ઌ ૡ એ ઐ ઓ ઔ".w_split,
|
93
|
+
vowel_marks: "ા િ ી ુ ૂ ૃ ૄ ૢ ૣ ે ૈ ો ૌ".w_split,
|
94
|
+
other_marks: "ં ઃ ઁ".w_split,
|
95
|
+
virama: ["્"],
|
96
|
+
consonants: "ક ખ ગ ઘ ઙ ચ છ જ ઝ ઞ ટ ઠ ડ ઢ ણ ત થ દ ધ ન પ ફ બ ભ મ ય ર લ વ શ ષ સ હ ળ ક્ષ જ્ઞ".w_split,
|
97
|
+
symbols: "૦ ૧ ૨ ૩ ૪ ૫ ૬ ૭ ૮ ૯ ૐ ઽ ".w_split,
|
98
|
+
candra: ["ૅ"],
|
99
|
+
},
|
100
|
+
|
101
|
+
# Gurmukhi
|
102
|
+
# --------
|
103
|
+
# Missing R/RR/lR/lRR
|
104
|
+
#
|
105
|
+
gurmukhi: {
|
106
|
+
vowels: "ਅ ਆ ਇ ਈ ਉ ਊ ਏ ਐ ਓ ਔ".w_split,
|
107
|
+
vowel_marks: "ਾ ਿ ੀ ੁ ੂ ੇ ੈ ੋ ੌ".w_split,
|
108
|
+
other_marks: "ਂ ਃ ਁ".w_split,
|
109
|
+
virama: ["੍"],
|
110
|
+
consonants: "ਕ ਖ ਗ ਘ ਙ ਚ ਛ ਜ ਝ ਞ ਟ ਠ ਡ ਢ ਣ ਤ ਥ ਦ ਧ ਨ ਪ ਫ ਬ ਭ ਮ ਯ ਰ ਲ ਵ ਸ਼ ਸ਼ ਸ ਹ ਲ਼ ਕ੍ਸ਼ ਜ੍ਞ".w_split,
|
111
|
+
symbols: "੦ ੧ ੨ ੩ ੪ ੫ ੬ ੭ ੮ ੯ ॐ ऽ । ॥".w_split,
|
112
|
+
other: " ਖ ਗ ਜ ਡ ਫ ".w_split,
|
113
|
+
},
|
114
|
+
|
115
|
+
# Kannada
|
116
|
+
# -------
|
117
|
+
# Sanskrit-complete.
|
118
|
+
#
|
119
|
+
kannada: {
|
120
|
+
vowels: "ಅ ಆ ಇ ಈ ಉ ಊ ಋ ೠ ಌ ೡ ಎ ಏ ಐ ಒ ಓ ಔ".w_split,
|
121
|
+
vowel_marks: "ಾ ಿ ೀ ು ೂ ೃ ೄ ೢ ೣ ೆ ೇ ೈ ೊ ೋ ೌ".w_split,
|
122
|
+
other_marks: "ಂ ಃ ँ".w_split,
|
123
|
+
virama: ["್"],
|
124
|
+
consonants: "ಕ ಖ ಗ ಘ ಙ ಚ ಛ ಜ ಝ ಞ ಟ ಠ ಡ ಢ ಣ ತ ಥ ದ ಧ ನ ಪ ಫ ಬ ಭ ಮ ಯ ರ ಲ ವ ಶ ಷ ಸ ಹ ಳ ಕ್ಷ ಜ್ಞ".w_split,
|
125
|
+
symbols: "೦ ೧ ೨ ೩ ೪ ೫ ೬ ೭ ೮ ೯ ಓಂ ಽ । ॥".w_split,
|
126
|
+
other: " ಫ ಱ".w_split,
|
127
|
+
},
|
128
|
+
|
129
|
+
# Malayalam
|
130
|
+
# ---------
|
131
|
+
# Sanskrit-complete.
|
132
|
+
#
|
133
|
+
malayalam: {
|
134
|
+
vowels: "അ ആ ഇ ഈ ഉ ഊ ഋ ൠ ഌ ൡ എ ഏ ഐ ഒ ഓ ഔ".w_split,
|
135
|
+
vowel_marks: "ാ ി ീ ു ൂ ൃ ൄ ൢ ൣ െ േ ൈ ൊ ോ ൌ".w_split,
|
136
|
+
other_marks: "ം ഃ ँ".w_split,
|
137
|
+
virama: ["്"],
|
138
|
+
consonants: "ക ഖ ഗ ഘ ങ ച ഛ ജ ഝ ഞ ട ഠ ഡ ഢ ണ ത ഥ ദ ധ ന പ ഫ ബ ഭ മ യ ര ല വ ശ ഷ സ ഹ ള ക്ഷ ജ്ഞ".w_split,
|
139
|
+
symbols: "൦ ൧ ൨ ൩ ൪ ൫ ൬ ൭ ൮ ൯ ഓം ഽ । ॥".w_split,
|
140
|
+
other: " റ".w_split,
|
141
|
+
},
|
142
|
+
|
143
|
+
# Oriya
|
144
|
+
# -----
|
145
|
+
# Sanskrit-complete.
|
146
|
+
#
|
147
|
+
oriya: {
|
148
|
+
vowels: "ଅ ଆ ଇ ଈ ଉ ଊ ଋ ୠ ଌ ୡ ଏ ଐ ଓ ଔ".w_split,
|
149
|
+
vowel_marks: "ା ି ୀ ୁ ୂ ୃ ୄ ୢ ୣ େ ୈ ୋ ୌ".w_split,
|
150
|
+
other_marks: "ଂ ଃ ଁ".w_split,
|
151
|
+
virama: ["୍"],
|
152
|
+
consonants: "କ ଖ ଗ ଘ ଙ ଚ ଛ ଜ ଝ ଞ ଟ ଠ ଡ ଢ ଣ ତ ଥ ଦ ଧ ନ ପ ଫ ବ ଭ ମ ଯ ର ଲ ଵ ଶ ଷ ସ ହ ଳ କ୍ଷ ଜ୍ଞ".w_split,
|
153
|
+
symbols: "୦ ୧ ୨ ୩ ୪ ୫ ୬ ୭ ୮ ୯ ଓଂ ଽ । ॥".w_split,
|
154
|
+
other: " ଡ ଢ ଯ ".w_split,
|
155
|
+
},
|
156
|
+
|
157
|
+
# Tamil
|
158
|
+
# -----
|
159
|
+
# Missing R/RR/lR/lRR vowel marks and voice/aspiration distinctions.
|
160
|
+
# The most incomplete of the Sanskrit schemes here.
|
161
|
+
#
|
162
|
+
tamil: {
|
163
|
+
vowels: "அ ஆ இ ஈ உ ஊ எ ஏ ஐ ஒ ஓ ஔ".w_split,
|
164
|
+
vowel_marks: "ா ி ீ ு ூ ெ ே ை ொ ோ ௌ".w_split,
|
165
|
+
other_marks: "ஂ ஃ ".w_split,
|
166
|
+
virama: ["்"],
|
167
|
+
consonants: "க க க க ங ச ச ஜ ச ஞ ட ட ட ட ண த த த த ந ப ப ப ப ம ய ர ல வ ஶ ஷ ஸ ஹ ள க்ஷ ஜ்ஞ".w_split,
|
168
|
+
symbols: "௦ ௧ ௨ ௩ ௪ ௫ ௬ ௭ ௮ ௯ ௐ ऽ । ॥".w_split,
|
169
|
+
other: " ற".w_split,
|
170
|
+
},
|
171
|
+
|
172
|
+
# Telugu
|
173
|
+
# ------
|
174
|
+
# Sanskrit-complete.
|
175
|
+
#
|
176
|
+
telugu: {
|
177
|
+
vowels: "అ ఆ ఇ ఈ ఉ ఊ ఋ ౠ ఌ ౡ ఎ ఏ ఐ ఒ ఓ ఔ".w_split,
|
178
|
+
vowel_marks: "ా ి ీ ు ూ ృ ౄ ౢ ౣ ె ే ై ొ ో ౌ".w_split,
|
179
|
+
other_marks: "ం ః ఁ".w_split,
|
180
|
+
virama: ["్"],
|
181
|
+
consonants: "క ఖ గ ఘ ఙ చ ఛ జ ఝ ఞ ట ఠ డ ఢ ణ త థ ద ధ న ప ఫ బ భ మ య ర ల వ శ ష స హ ళ క్ష జ్ఞ".w_split,
|
182
|
+
symbols: "౦ ౧ ౨ ౩ ౪ ౫ ౬ ౭ ౮ ౯ ఓం ఽ । ॥".w_split,
|
183
|
+
other: " ఱ".w_split,
|
184
|
+
},
|
185
|
+
|
186
|
+
# International Alphabet of Sanskrit Transliteration
|
187
|
+
# --------------------------------------------------
|
188
|
+
# The most "professional" Sanskrit romanization scheme.
|
189
|
+
#
|
190
|
+
iast: {
|
191
|
+
vowels: "a ā i ī u ū ṛ ṝ ḷ ḹ e ai o au".w_split,
|
192
|
+
other_marks: ["ṃ", "ḥ", "~"],
|
193
|
+
virama: [""],
|
194
|
+
consonants: "k kh g gh ṅ c ch j jh ñ ṭ ṭh ḍ ḍh ṇ t th d dh n p ph b bh m y r l v ś ṣ s h ḻ kṣ jñ".w_split,
|
195
|
+
symbols: "0 1 2 3 4 5 6 7 8 9 oṃ ' । ॥".w_split,
|
196
|
+
},
|
197
|
+
|
198
|
+
# ITRANS
|
199
|
+
# ------
|
200
|
+
# One of the first romanization schemes -- and one of the most
|
201
|
+
# complicated. For alternate forms, see the "@all_alternates" variable
|
202
|
+
# below.
|
203
|
+
# *
|
204
|
+
# '_' is a "null" letter, which allows adjacent vowels.
|
205
|
+
#
|
206
|
+
itrans: {
|
207
|
+
vowels: "a A i I u U RRi RRI LLi LLI e ai o au".w_split,
|
208
|
+
other_marks: ["M", "H", ".N"],
|
209
|
+
virama: [""],
|
210
|
+
consonants: "k kh g gh ~N ch Ch j jh ~n T Th D Dh N t th d dh n p ph b bh m y r l v sh Sh s h L kSh j~n".w_split,
|
211
|
+
symbols: "0 1 2 3 4 5 6 7 8 9 OM .a | ||".w_split,
|
212
|
+
candra: [".c"],
|
213
|
+
zwj: ["{}"],
|
214
|
+
skip: ["_"],
|
215
|
+
accent: ["\\'", "\\_"],
|
216
|
+
combo_accent: "\\'H \\_H \\'M \\_M".w_split,
|
217
|
+
other: "q K G z .D .Dh f Y R".w_split,
|
218
|
+
},
|
219
|
+
|
220
|
+
# Harvard-Kyoto
|
221
|
+
# -------------
|
222
|
+
# A simple 1:1 mapping.
|
223
|
+
#
|
224
|
+
hk: {
|
225
|
+
vowels: "a A i I u U R RR lR lRR e ai o au".w_split,
|
226
|
+
other_marks: "M H ~".w_split,
|
227
|
+
virama: [""],
|
228
|
+
consonants: "k kh g gh G c ch j jh J T Th D Dh N t th d dh n p ph b bh m y r l v z S s h L kS jJ".w_split,
|
229
|
+
symbols: "0 1 2 3 4 5 6 7 8 9 OM ' | ||".w_split,
|
230
|
+
},
|
231
|
+
|
232
|
+
# National Library at Kolkata
|
233
|
+
# ---------------------------
|
234
|
+
# Apart from using "ē" and "ō" instead of "e" and "o", this scheme is
|
235
|
+
# identical to IAST. ṝ, ḷ, and ḹ are not part of the scheme proper.
|
236
|
+
# *
|
237
|
+
# This is defined further below.
|
238
|
+
#
|
239
|
+
|
240
|
+
# Sanskrit Library Phonetic Basic
|
241
|
+
# -------------------------------
|
242
|
+
# With one ASCII letter per phoneme, this is the tersest transliteration
|
243
|
+
# scheme in use today and is especially suited to computer processing.
|
244
|
+
#
|
245
|
+
slp1: {
|
246
|
+
vowels: "a A i I u U f F x X e E o O".w_split,
|
247
|
+
other_marks: "M H ~".w_split,
|
248
|
+
virama: [""],
|
249
|
+
consonants: "k K g G N c C j J Y w W q Q R t T d D n p P b B m y r l v S z s h L kz jY".w_split,
|
250
|
+
symbols: "0 1 2 3 4 5 6 7 8 9 oM ' . ..".w_split,
|
251
|
+
},
|
252
|
+
|
253
|
+
# Velthuis
|
254
|
+
# --------
|
255
|
+
# A case-insensitive Sanskrit encoding.
|
256
|
+
#
|
257
|
+
velthuis: {
|
258
|
+
vowels: "a aa i ii u uu .r .rr .li .ll e ai o au".w_split,
|
259
|
+
other_marks: ".m .h ".w_split,
|
260
|
+
virama: [""],
|
261
|
+
consonants: 'k kh g gh "n c ch j jh ~n .t .th .d .d .n t th d dh n p ph b bh m y r l v ~s .s s h L k.s j~n'.w_split,
|
262
|
+
symbols: "0 1 2 3 4 5 6 7 8 9 o.m ' | ||".w_split,
|
263
|
+
},
|
264
|
+
|
265
|
+
# WX
|
266
|
+
# --
|
267
|
+
# As terse as SLP1.
|
268
|
+
#
|
269
|
+
wx: {
|
270
|
+
vowels: "a A i I u U q Q L e E o O".w_split,
|
271
|
+
other_marks: "M H z".w_split,
|
272
|
+
virama: [""],
|
273
|
+
consonants: "k K g G f c C j J F t T d D N w W x X n p P b B m y r l v S R s h kR jF".w_split,
|
274
|
+
symbols: "0 1 2 3 4 5 6 7 8 9 oM ' | ||".w_split,
|
275
|
+
},
|
276
|
+
}
|
277
|
+
|
278
|
+
# Set of names of the schemes registered as roman. It starts empty and is
# filled by `add_roman_scheme`.
@roman_schemes = Set.new

# Map of alternate encodings, keyed by scheme name. Within a scheme, each
# key is a token as listed in that scheme's definition and the value is the
# list of alternate spellings accepted for it. `make_map` maps every
# alternate to the same target as its primary token.
@all_alternates = {
  itrans: {
    "A" => ["aa"],
    "I" => %w[ii ee],
    "U" => %w[uu oo],
    "RRi" => ["R^i"],
    "RRI" => ["R^I"],
    "LLi" => ["L^i"],
    "LLI" => ["L^I"],
    "M" => [".m", ".n"],
    "~N" => ["N^"],
    "ch" => ["c"],
    "Ch" => %w[C chh],
    "~n" => ["JN"],
    "v" => ["w"],
    "Sh" => %w[S shh],
    "kSh" => %w[kS x],
    "j~n" => %w[GY dny],
    "OM" => ["AUM"],
    # Accent tokens: the backtick spelling is an alternate for the
    # underscore spelling.
    "\\_" => ["\\`"],
    "\\_H" => ["\\`H"],
    "\\'M" => ["\\'.m", "\\'.n"],
    "\\_M" => "\\_.m \\_.n \\`M \\`.m \\`.n".w_split,
    ".a" => ["~"],
    "|" => ["."],
    "||" => [".."],
    "z" => ["J"],
  },
}
|
311
|
+
end
|
312
|
+
end
|