interscript 0.1.2 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (120) hide show
  1. checksums.yaml +4 -4
  2. data/README.adoc +246 -14
  3. data/bin/interscript +38 -17
  4. data/bin/setup +8 -0
  5. data/lib/g2pwrapper.py +34 -0
  6. data/lib/interscript.rb +140 -16
  7. data/lib/interscript/command.rb +27 -0
  8. data/lib/interscript/mapping.rb +125 -0
  9. data/lib/interscript/version.rb +1 -1
  10. data/lib/model-7 +0 -0
  11. data/lib/tha-pt-b-7 +0 -0
  12. data/maps/acadsin-zho-Hani-Latn-2002.yaml +38912 -0
  13. data/maps/alalc-bel-cyrl-latn-1997.yaml +125 -0
  14. data/maps/alalc-ben-Beng-Latn-2017.yaml +130 -0
  15. data/maps/alalc-bul-Cyrl-Latn-1997.yaml +94 -0
  16. data/maps/alalc-ell-Grek-Latn-1997.yaml +625 -0
  17. data/maps/alalc-ell-Grek-Latn-2010.yaml +628 -0
  18. data/maps/alalc-kat-Geok-Latn-1997.yaml +112 -0
  19. data/maps/alalc-kat-Geor-Latn-1997.yaml +146 -0
  20. data/maps/alalc-kor-Hang-Latn-1997.yaml +94 -0
  21. data/maps/alalc-mkd-Cyrl-Latn-2013.yaml +103 -0
  22. data/maps/alalc-mkd-cyrl-latn-1997.yaml +114 -0
  23. data/maps/alalc-srp-Cyrl-Latn-1997.yaml +114 -0
  24. data/maps/alalc-srp-cyrl-latn-2013.yaml +135 -0
  25. data/maps/alalc-ukr-Cyrl-Latn-1997.yaml +141 -0
  26. data/maps/alalc-ukr-Cyrl-Latn-2011.yaml +16 -0
  27. data/maps/apcbg-bul-Cyrl-Latn-1995.yaml +283 -0
  28. data/maps/{bas-rus-Cyrl-Latn-bss.yaml → bas-rus-Cyrl-Latn-2017-bss.yaml} +57 -31
  29. data/maps/{bas-rus-Cyrl-Latn-oss.yaml → bas-rus-Cyrl-Latn-2017-oss.yaml} +54 -34
  30. data/maps/bgn-jpn-Hrkt-Latn-1962.yaml +294 -0
  31. data/maps/bgn-kor-Hang-Latn-1943.yaml +31 -0
  32. data/maps/bgn-kor-Kore-Latn-1943.yaml +31 -0
  33. data/maps/bgna-bul-Cyrl-Latn-2006.yaml +208 -0
  34. data/maps/bgna-bul-Cyrl-Latn-2009.yaml +208 -0
  35. data/maps/bgnpcgn-arm-Armn-Latn-1981.yaml +1 -2
  36. data/maps/bgnpcgn-aze-Cyrl-Latn-1993.yaml +104 -0
  37. data/maps/bgnpcgn-bel-cyrl-latn-1979.yaml +285 -0
  38. data/maps/bgnpcgn-bul-Cyrl-Latn-1952.yaml +115 -0
  39. data/maps/bgnpcgn-bul-Cyrl-Latn-2013.yaml +10 -64
  40. data/maps/bgnpcgn-chn-Hans-Latn-1979.yaml +7456 -0
  41. data/maps/bgnpcgn-ell-Grek-Latn-1962.yaml +702 -0
  42. data/maps/bgnpcgn-ell-Grek-Latn-1996.yaml +20 -0
  43. data/maps/bgnpcgn-jpn-Hrkt-Latn-1976.yaml +257 -0
  44. data/maps/bgnpcgn-kat-Geor-Latn-1981.yaml +127 -0
  45. data/maps/bgnpcgn-kat-Geor-Latn-2009.yaml +43 -0
  46. data/maps/bgnpcgn-kor-Hang-Latn-kn-1945.yaml +253 -0
  47. data/maps/bgnpcgn-kor-Hang-Latn-rok-2011.yaml +48 -0
  48. data/maps/bgnpcgn-kor-Kore-Latn-rok-2011.yaml +48 -0
  49. data/maps/bgnpcgn-mkd-Cyrl-Latn-1981.yaml +159 -0
  50. data/maps/bgnpcgn-mkd-Cyrl-Latn-2013.yaml +190 -0
  51. data/maps/bgnpcgn-rus-Cyrl-Latn-1947.yaml +145 -64
  52. data/maps/bgnpcgn-srp-Cyrl-Latn-2005.yaml +166 -0
  53. data/maps/bgnpcgn-ukr-Cyrl-Latn-1965.yaml +75 -2
  54. data/maps/bgnpcgn-ukr-Cyrl-Latn-2019.yaml +208 -0
  55. data/maps/by-bel-Cyrl-Latn-1998.yaml +168 -0
  56. data/maps/by-bel-Cyrl-Latn-2007.yaml +115 -0
  57. data/maps/elot-ell-Grek-Latn-743-1982-tl.yaml +685 -0
  58. data/maps/elot-ell-Grek-Latn-743-1982-ts.yaml +681 -0
  59. data/maps/elot-ell-Grek-Latn-743-2001-tl.yaml +20 -0
  60. data/maps/elot-ell-Grek-Latn-743-2001-ts.yaml +32 -0
  61. data/maps/ggg-kat-Geor-Latn-2002.yaml +89 -0
  62. data/maps/gki-bel-cyrl-latn-1992.yaml +33 -0
  63. data/maps/gki-bel-cyrl-latn-2000.yaml +201 -0
  64. data/maps/gost-rus-cyrl-latn-16876-71-1983.yaml +186 -0
  65. data/maps/hk-yue-Hani-Latn-1888.yaml +38497 -0
  66. data/maps/icao-bel-Cyrl-Latn-9303.yaml +108 -92
  67. data/maps/icao-bul-Cyrl-Latn-9303.yaml +1 -2
  68. data/maps/icao-heb-Hebr-Latn-9303.yaml +118 -124
  69. data/maps/icao-mkd-Cyrl-Latn-9303.yaml +1 -2
  70. data/maps/icao-per-Arab-Latn-9303.yaml +5 -6
  71. data/maps/icao-rus-Cyrl-Latn-9303.yaml +1 -2
  72. data/maps/icao-srp-Cyrl-Latn-9303.yaml +1 -2
  73. data/maps/icao-ukr-Cyrl-Latn-9303.yaml +1 -2
  74. data/maps/iso-ell-Grek-Latn-843-1997-t1.yaml +610 -0
  75. data/maps/iso-ell-Grek-Latn-843-1997-t2.yaml +41 -0
  76. data/maps/iso-jpn-Hrkt-Latn-3602-1989.yaml +62 -0
  77. data/maps/{iso-rus-Cyrl-Latn-iso9.yaml → iso-rus-Cyrl-Latn-9-1995.yaml} +2 -3
  78. data/maps/iso-tha-Thai-Latn-11940-1998.yaml +109 -0
  79. data/maps/kp-kor-Hang-Latn-2002.yaml +901 -0
  80. data/maps/lshk-yue-Hani-Latn-jyutping-1993.yaml +44820 -0
  81. data/maps/mext-jpn-Hrkt-Latn-1954.yaml +411 -0
  82. data/maps/moct-kor-Hang-Latn-2000.yaml +803 -0
  83. data/maps/mofa-jpn-Hrkt-Latn-1989.yaml +541 -0
  84. data/maps/nil-kor-Hang-Hang-jamo.yaml +11193 -0
  85. data/maps/odni-kat-Geor-Latn-2015.yaml +88 -0
  86. data/maps/odni-ukr-Cyrl-Latn-2015.yaml +157 -0
  87. data/maps/royin-tha-Thai-Latn-1939-generic.yaml +90 -0
  88. data/maps/royin-tha-Thai-Latn-1968.yaml +179 -0
  89. data/maps/royin-tha-Thai-Latn-1999-chained.yaml +180 -0
  90. data/maps/royin-tha-Thai-Latn-1999.yaml +76 -0
  91. data/maps/{cn-chn-Hans-Latn-pinyin.yaml → sac-zho-Hans-Latn-1979.yaml} +6 -7
  92. data/maps/stategeocadastre-ukr-Cyrl-Latn-1993.yaml +222 -0
  93. data/maps/ua-ukr-Cyrl-Latn-1996.yaml +193 -0
  94. data/maps/un-bel-Cyrl-Latn-2007.yaml +114 -0
  95. data/maps/un-ben-Beng-Latn-2016.yaml +534 -0
  96. data/maps/un-ell-Grek-Latn-1987-tl.yaml +32 -0
  97. data/maps/un-ell-Grek-Latn-1987-ts.yaml +20 -0
  98. data/maps/un-ell-Grek-Latn-phonetic-1987.yaml +780 -0
  99. data/maps/un-mon-Mong-Latn-2013.yaml +19 -6
  100. data/maps/un-rus-Cyrl-Latn-1987.yaml +166 -0
  101. data/maps/un-ukr-cyrl-latn-1998.yaml +30 -0
  102. data/maps/var-jpn-Hrkt-Latn-hepburn-1886.yaml +406 -0
  103. data/maps/var-jpn-Hrkt-Latn-hepburn-1954.yaml +386 -0
  104. data/maps/var-kor-Hang-Latn-mr-1939.yaml +1054 -0
  105. data/maps/var-kor-Kore-Hang-2013.yaml +59754 -0
  106. data/maps/var-kor-Kore-Latn-mr-1939.yaml +37 -0
  107. data/maps/var-tha-Thai-Thai-phonemic.yaml +59 -0
  108. data/maps/var-tha-Thai-Zsym-ipa.yaml +301 -0
  109. data/maps/var-zho-Hani-Latn-1979.yaml +38908 -0
  110. data/spec/interscript/mapping_spec.rb +42 -0
  111. data/spec/interscript_spec.rb +20 -5
  112. data/spec/spec_helper.rb +3 -1
  113. metadata +149 -24
  114. data/maps/bgnpcgn-chn-Hans-Latn-pinyin.yaml +0 -7503
  115. data/maps/historic-jpn-Hrkt-Latn-hepburn.yaml +0 -336
  116. data/maps/icao-gre-Grek-Latn-9303.yaml +0 -101
  117. data/maps/mext-jpn-Hrkt-Latn-hepburn.yaml +0 -330
  118. data/maps/mext-jpn-Hrkt-Latn-kunrei.yaml +0 -308
  119. data/maps/un-jpn-Hrkt-Latn-hepburn.yaml +0 -313
  120. data/maps/un-jpn-Hrkt-Latn-kunrei.yaml +0 -354
@@ -0,0 +1,193 @@
1
+ ---
2
+ authority_id: ua
3
+ id: 1996
4
+ language: ukr
5
+ source_script: Cyrl
6
+ destination_script: Latn
7
+ name: Government of Ukraine Ukrainian System (1996)
8
+ url: http://transliteration.eki.ee/pdf/Ukrainian.pdf
9
+ creation_date: 1996
10
+ description: Romanization table for Ukrainian. The current national system of romanization.
11
+
12
+ notes:
13
+ - gh is used in the romanization of зг zgh.
14
+ - In initial position є -> ye, ї -> yi, й -> y, ю -> yu, я -> ya.
15
+
16
+ tests:
17
+ - source: Алушта
18
+ expected: Alushta
19
+ - source: Борщагівка
20
+ expected: Borschahivka
21
+ - source: Вишгород
22
+ expected: Vyshhorod
23
+ - source: Гадяч
24
+ expected: Hadiach
25
+ - source: Згорани
26
+ expected: Zghorany
27
+ - source: Ґалаґан
28
+ expected: Galagan
29
+ - source: Дон
30
+ expected: Don
31
+ - source: Рівне
32
+ expected: Rivne
33
+ - source: Єнакієве
34
+ expected: Yenakiieve
35
+ - source: Наєнко
36
+ expected: Naienko
37
+ - source: Житомир
38
+ expected: Zhytomyr
39
+ - source: Запоріжжя
40
+ expected: Zaporizhzhia
41
+ - source: Закарпаття
42
+ expected: Zakarpattia
43
+ - source: Медвин
44
+ expected: Medvyn
45
+ - source: Іршава
46
+ expected: Irshava
47
+ - source: Їжакевич
48
+ expected: Yizhakevych
49
+ - source: Кадіївка
50
+ expected: Kadiivka
51
+ - source: Йосипівка
52
+ expected: Yosypivka
53
+ - source: Київ
54
+ expected: Kyiv
55
+ - source: Лебедин
56
+ expected: Lebedyn
57
+ - source: Миколаїв
58
+ expected: Mykolaiv
59
+ - source: Ніжин
60
+ expected: Nizhyn
61
+ - source: Одеса
62
+ expected: Odesa
63
+ - source: Полтава
64
+ expected: Poltava
65
+ - source: Ромни
66
+ expected: Romny
67
+ - source: Суми
68
+ expected: Sumy
69
+ - source: Тетерів
70
+ expected: Teteriv
71
+ - source: Ужгород
72
+ expected: Uzhhorod
73
+ - source: Фастів
74
+ expected: Fastiv
75
+ - source: Харків
76
+ expected: Kharkiv
77
+ - source: Біла Церква
78
+ expected: Bila Tserkva
79
+ - source: Чернівці
80
+ expected: Chernivtsi
81
+ - source: Шостка
82
+ expected: Shostka
83
+ - source: Гоща
84
+ expected: Hoscha
85
+ - source: Русь
86
+ expected: Rus’
87
+ - source: Юрій
88
+ expected: Yurii
89
+ - source: Крюківка
90
+ expected: Kriukivka
91
+ - source: Яготин
92
+ expected: Yahotyn
93
+ - source: Ічня
94
+ expected: Ichnia
95
+ - source: Знам’янка
96
+ expected: Znam”ianka
97
+
98
+ map:
99
+ rules:
100
+ - pattern: (?<=[Зз])\u0413 # Г after З or з
101
+ result: Gh
102
+ - pattern: (?<=[Зз])\u0433 # г after З or з
103
+ result: gh
104
+ - pattern: (?<!\b\u2019)\b\u0404 # Є in initial position -> Ye
105
+ result: Ye
106
+ - pattern: (?<!\b\u2019)\b\u0454 # є in initial position -> ye
107
+ result: ye
108
+ - pattern: (?<!\b\u2019)\b\u0407 # Ї in initial position -> Yi
109
+ result: Yi
110
+ - pattern: (?<!\b\u2019)\b\u0457 # ї in initial position -> yi
111
+ result: yi
112
+ - pattern: (?<!\b\u2019)\b\u0419 # Й in initial position -> Y
113
+ result: "Y"
114
+ - pattern: (?<!\b\u2019)\b\u0439 # й in initial position -> y
115
+ result: "y"
116
+ - pattern: (?<!\b\u2019)\b\u042e # Ю in initial position -> Yu
117
+ result: Yu
118
+ - pattern: (?<!\b\u2019)\b\u044e # ю in initial position -> yu
119
+ result: yu
120
+ - pattern: (?<!\b\u2019)\b\u042f # Я in initial position -> Ya
121
+ result: Ya
122
+ - pattern: (?<!\b\u2019)\b\u044f # я in initial position -> ya
123
+ result: ya
124
+ - pattern: \b\u2019\b # ’ in the middle of a word -> ”
125
+ result: "\u201d"
126
+
127
+ characters:
128
+ "\u0410": "A" # А
129
+ "\u0411": "B" # Б
130
+ "\u0412": "V" # В
131
+ "\u0413": "H" # Г
132
+ "\u0490": "G" # Ґ
133
+ "\u0414": "D" # Д
134
+ "\u0415": "E" # Е
135
+ "\u0404": "Ie" # Є
136
+ "\u0416": "Zh" # Ж
137
+ "\u0417": "Z" # З
138
+ "\u0418": "Y" # И
139
+ "\u0406": "I" # І
140
+ "\u0407": "I" # Ї
141
+ "\u0419": "I" # Й
142
+ "\u041a": "K" # К
143
+ "\u041b": "L" # Л
144
+ "\u041c": "M" # М
145
+ "\u041d": "N" # Н
146
+ "\u041e": "O" # О
147
+ "\u041f": "P" # П
148
+ "\u0420": "R" # Р
149
+ "\u0421": "S" # С
150
+ "\u0422": "T" # Т
151
+ "\u0423": "U" # У
152
+ "\u0424": "F" # Ф
153
+ "\u0425": "Kh" # Х
154
+ "\u0426": "Ts" # Ц
155
+ "\u0427": "Ch" # Ч
156
+ "\u0428": "Sh" # Ш
157
+ "\u0429": "Sch" # Щ
158
+ "\u042e": "Iu" # Ю
159
+ "\u042f": "Ia" # Я
160
+ "\u042c": "\u2019" # Ь -> ’
161
+ "\u0430": "a" # а
162
+ "\u0431": "b" # б
163
+ "\u0432": "v" # в
164
+ "\u0433": "h" # г
165
+ "\u0491": "g" # ґ
166
+ "\u0434": "d" # д
167
+ "\u0435": "e" # е
168
+ "\u0454": "ie" # є
169
+ "\u0436": "zh" # ж
170
+ "\u0437": "z" # з
171
+ "\u0438": "y" # и
172
+ "\u0456": "i" # і
173
+ "\u0457": "i" # ї
174
+ "\u0439": "i" # й
175
+ "\u043a": "k" # к
176
+ "\u043b": "l" # л
177
+ "\u043c": "m" # м
178
+ "\u043d": "n" # н
179
+ "\u043e": "o" # о
180
+ "\u043f": "p" # п
181
+ "\u0440": "r" # р
182
+ "\u0441": "s" # с
183
+ "\u0442": "t" # т
184
+ "\u0443": "u" # у
185
+ "\u0444": "f" # ф
186
+ "\u0445": "kh" # х
187
+ "\u0446": "ts" # ц
188
+ "\u0447": "ch" # ч
189
+ "\u0448": "sh" # ш
190
+ "\u0449": "sch" # щ
191
+ "\u044e": "iu" # ю
192
+ "\u044f": "ia" # я
193
+ "\u044c": "\u2019" # ь -> ’
@@ -0,0 +1,114 @@
1
+ ---
2
+ authority_id: un
3
+ id: 2007
4
+ language: bel
5
+ source_script: Cyrl
6
+ destination_script: Latn
7
+ name: National System of Geographic Names Transmission into Roman Alphabet in Belarus
8
+ url: https://unstats.un.org/unsd/geoinfo/UNGEGN/docs/9th-uncsgn-docs/crp/9th_UNCSGN_e-conf-98-crp-21.pdf
9
+ creation_date: 2007
10
+ description: |
11
+ RESOLUTION OF THE STATE COMMITTEE
12
+ ON PROPERTY OF THE REPUBLIC OF BELARUS June 11, 2007 No. 38
13
+
14
+ 8/16668 (06/18/2007) On amendments and additions to the Instructions
15
+ for the transliteration of geographical names of the
16
+ Republic of Belarus in letters of the Latin alphabet
17
+
18
+ Based on the Regulation on the State Property Committee of the Republic of Belarus,
19
+ approved by the Decree of the Council of Ministers of the Republic of Belarus dated July 29, 2006
20
+ No. 958 "Issues of the State Committee on Property of the Republic of Belarus"
21
+ tests: # the same as "by-bel-Cyrl-Latn-2007"
22
+ - source: Аршанскi
23
+ expected: Aršanski
24
+ - source: Бешанковічы
25
+ expected: Biešankovičy
26
+ - source: Віцебск
27
+ expected: Viciebsk
28
+ - source: Гомель
29
+ expected: Homieĺ
30
+ - source: Гаўя
31
+ expected: Haŭja
32
+ - source: Добруш
33
+ expected: Dobruš
34
+ - source: Ельск
35
+ expected: Jeĺsk
36
+ - source: Бабаедава
37
+ expected: Babajedava
38
+ - source: Венцавічы
39
+ expected: Viencavičy
40
+ - source: Ёды
41
+ expected: Jody
42
+ - source: Вераб'ёвічы
43
+ expected: Vierabjovičy
44
+ - source: Мёры
45
+ expected: Miory
46
+ - source: Зэльва
47
+ expected: Zeĺva
48
+ - source: Iванава
49
+ expected: Ivanava
50
+ - source: Iўе
51
+ expected: Iŭje
52
+ - source: Лагойск
53
+ expected: Lahojsk
54
+ - source: Круглае
55
+ expected: Kruhlaje
56
+ - source: Лошыца
57
+ expected: Lošyca
58
+ - source: Любань
59
+ expected: Liubań
60
+ - source: Магілёў
61
+ expected: Mahilioŭ
62
+ - source: Нясвіж
63
+ expected: Niasviž
64
+ - source: Орша
65
+ expected: Orša
66
+ - source: Паставы
67
+ expected: Pastavy
68
+ - source: Рагачоў
69
+ expected: Rahačoŭ
70
+ - source: Смаргонь
71
+ expected: Smarhoń
72
+ - source: Талачын
73
+ expected: Talačyn
74
+ - source: Узда
75
+ expected: Uzda
76
+ - source: Шаркаўшчына
77
+ expected: Šarkaŭščyna
78
+ - source: Фаніпаль
79
+ expected: Fanipaĺ
80
+ - source: Хоцімск
81
+ expected: Chocimsk
82
+ - source: Цёмны Лес
83
+ expected: Ciomny Lies
84
+ - source: Чавусы
85
+ expected: Čavusy
86
+ - source: Шумілiна
87
+ expected: Šumilina
88
+ - source: Чыгірынка
89
+ expected: Čyhirynka
90
+ - source: Чэрвень
91
+ expected: Červień
92
+ - source: Друць
93
+ expected: Druć
94
+ - source: Чачэрск
95
+ expected: Čačersk
96
+ - source: Юхнаўка
97
+ expected: Juchnaŭka
98
+ - source: Гаюціна
99
+ expected: Hajucina
100
+ - source: Цюрлi
101
+ expected: Ciurli
102
+ - source: Любонічы
103
+ expected: Liuboničy
104
+ - source: Ямнае
105
+ expected: Jamnaje
106
+ - source: Баяры
107
+ expected: Bajary
108
+ - source: Валяр'яны
109
+ expected: Valiarjany
110
+ - source: Вязынка
111
+ expected: Viazynka
112
+
113
+ map:
114
+ inherit: "by-bel-Cyrl-Latn-2007"
@@ -0,0 +1,534 @@
1
+ ---
2
+ authority_id: un
3
+ id: 2016
4
+ language: ben
5
+ source_script: Beng
6
+ destination_script: Latn
7
+ name: Bengali Romanization, Version 4.0
8
+ url: http://www.eki.ee/wgrs/rom1_bn.htm
9
+ creation_date: 2016
10
+ description: |
11
+ The United Nations recommended system was approved in 1972 (II/11)
12
+ and amended in 1977 (III/12), based on a report prepared by D. N.
13
+ Sharma. The tables and their corrections were published in volume II of
14
+ the conference reports [1, 2].
15
+
16
+ There is no evidence of the use of the system either in Bangladesh,
17
+ in India or in international cartographic products. The resolution
18
+ IV/17 (1982) recommended association, inter alia, with Bangladesh, in
19
+ carrying out further studies on the system.
20
+
21
+ Bengali (Bānglā) uses an alphasyllabic script whereby each character
22
+ represents a syllable rather than one sound. Vowels and diphthongs are
23
+ marked in two ways: as independent characters (used syllable-initially)
24
+ and in an abbreviated form, to denote vowels after consonants. The
25
+ romanization table is unambiguous but the user would have to recognize
26
+ many ligatures not given in the original table. The system is mostly
27
+ reversible but there exist some ambiguities in the romanization of
28
+ vowels (independent vs. abbreviated characters) and consonants
29
+ (ligatures vs. character sequences).
30
+
31
+ Other systems of romanization
32
+
33
+ For differences between the UN system and the ISO transliteration
34
+ standard ISO 15919: 2001 see the section on the romanization of Hindi.
35
+
36
+ References
37
+
38
+ Second United Nations Conference on the Standardization of
39
+ Geographical Names. London, 10–31 May 1972. Vol. II. Technical papers.
40
+ United Nations. New York 1974, pp. 139–140.
41
+
42
+ Third United Nations Conference on the Standardization of
43
+ Geographical Names. Athens, 17 August – 7 September 1977. Vol. II,
44
+ Technical papers, pp. 393 etc.
45
+
46
+ notes:
47
+ - |
48
+ In the romanization system below character variations and the table of ligatures have been added.
49
+
50
+ I. Independent vowel characters
51
+
52
+ 1 অ a
53
+ 2 আ ā
54
+ 3 ই i
55
+ 4 ঈ ī
56
+ 5 উ u
57
+ 6 ঊ ū
58
+ 7 ঋ ṛ
59
+ 8 এ e
60
+ 9 ঐ ai
61
+ 10 ও o
62
+ 11 ঔ au
63
+
64
+
65
+
66
+
67
+ - Where two Roman equivalents are given, the second (in brackets) is
68
+ used for recording the pronunciation of place-names while the first
69
+ form is for general use.
70
+ - In the table only word-initial character variants are shown.
71
+ Depending on the position in the word many variants of the characters
72
+ are used as well as some ligatures. These features are not covered here.
73
+ - For technical reasons the characters of the Mongolian script are
74
+ turned 90˚ anti-clockwise.
75
+
76
+ tests:
77
+ - source: "র্ক"
78
+ expected: "rka"
79
+ - source: "গ্র"
80
+ expected: "gra"
81
+ - source: "ত্য"
82
+ expected: "tya"
83
+
84
+ - source: |
85
+ আমার সোনার বাংলা, আমি তোমায় ভালোবাসি।
86
+ চিরদিন তোমার আকাশ, তোমার বাতাস, আমার প্রাণে বাজায় বাঁশি॥
87
+ ও মা, ফাগুনে তোর আমের বনে ঘ্রাণে পাগল করে, মরি হায়, হায় রে—
88
+ ও মা, অঘ্রাণে তোর ভরা ক্ষেতে আমি কী দেখেছি মধুর হাসি॥
89
+
90
+ কী শোভা, কী ছায়া গো, কী স্নেহ, কী মায়া গো—
91
+ কী আঁচল বিছায়েছ বটের মূলে, নদীর কূলে কূলে।
92
+ মা, তোর মুখের বাণী আমার কানে লাগে সুধার মতো,
93
+ মরি হায়, হায় রে—
94
+ মা, তোর বদনখানি মলিন হলে, ও মা, আমি নয়নজলে ভাসি॥
95
+
96
+ # Note: There are still a couple of improvements we can make in the
97
+ # transliteration system, but for now this could work
98
+ #
99
+ # But please revisit this - especially the use case of `য়`, it's adding
100
+ # some mixed characters in the text.
101
+ #
102
+ expected: |
103
+ āmaāra saonaāra baāṁlaā, āmai taomaāj̱aA় bhaālaobaāsai।
104
+ chairadaina taomaāra ākaāsha, taomaāra baātaāsa, āmaāra praāṇae baājaāj̱aA় baām̐shai॥
105
+ o maā, phaāgaunae taora āmaera banae ghraāṇae paāgala karae, marai haāj̱aA়, haāj̱aA় rae—
106
+ o maā, aghraāṇae taora bharaā kṣhaetae āmai kaī daekhaechhai madhaura haāsai॥
107
+
108
+ kaī shaobhaā, kaī chhaāj̱aA়ā gao, kaī snaeha, kaī maāj̱aA়ā gao—
109
+ kaī ām̐chala baichhaāj̱aA়echha baṭaera maūlae, nadaīra kaūlae kaūlae।
110
+ maā, taora maukhaera baāṇaī āmaāra kaānae laāgae saudhaāra matao,
111
+ marai haāj̱aA়, haāj̱aA় rae—
112
+ maā, taora badanakhaānai malaina halae, o maā, āmai naj̱aA়najalae bhaāsai॥
113
+
114
+ map:
115
+ characters:
116
+
117
+ # I. Independent vowel characters
118
+
119
+ 'অ': 'a' # 1
120
+ 'আ': 'ā' # 2
121
+ 'ই': 'i' # 3
122
+ 'ঈ': 'ī' # 4
123
+ 'উ': 'u' # 5
124
+ 'ঊ': 'ū' # 6
125
+ 'ঋ': 'ṛ' # 7
126
+ 'এ': 'e' # 8
127
+ 'ঐ': 'ai' # 9
128
+ 'ও': 'o' # 10
129
+ 'ঔ': 'au' # 11
130
+
131
+ # II. Abbreviated vowel characters (ক stands for any consonant character)
132
+
133
+ # 'ক': 'a' # 1
134
+ '\u09be': 'ā' # 2 কা
135
+ '\u09bf': 'i' # 3 কি
136
+ '\u09c0': 'ī' # 4 কী
137
+ '\u09c1': 'u' # 5 কু Exceptions: গু gu; রু ru; শু shu; হু hu; ন্তু ntu; স্তু stu.
138
+ '\u09c2': 'ū' # 6 কূ Exception: রূ rū.
139
+ '\u09c3': 'ṛ' # 7 কৃ Exception: হৃ hṛ.
140
+ '\u09c7': 'e' # 8 কে
141
+ '\u09c8': 'ai' # 9 কৈ
142
+ '\u09cb': 'o' # 10 কো
143
+ '\u09cc': 'au' # 11 কৌ
144
+
145
+ # II 5 Exceptions
146
+ 'গু': 'gu'
147
+ 'রু': 'ru'
148
+ 'শু': 'shu'
149
+ 'হু': 'hu'
150
+ 'ন্তু': 'ntu'
151
+ 'স্তু': 'stu'
152
+ # II 6 Exceptions
153
+ 'রূ': 'rū'
154
+ # II 7 Exceptions
155
+ 'হৃ': 'hṛ'
156
+
157
+ # III. Other symbols (ক stands for any consonant character)
158
+
159
+ '\u0982': 'ṁ' # 1 কং
160
+ '\u0981': 'm̐' # 2 কঁ
161
+ '\u0983': 'ḥ' # 3 কঃ
162
+ '\u09cd\u200c': '' # 4 ক্‌ Pronunciation without a vowel; special form: ৎ t.
163
+
164
+ # III 4 special form
165
+ 'ৎ': 't'
166
+
167
+ # IV. Consonant characters
168
+
169
+ 'ক': 'ka' # 1
170
+ 'খ': 'kha' # 2
171
+ 'গ': 'ga' # 3
172
+ 'ঘ': 'gha' # 4
173
+ 'ঙ': 'ṅa' # 5
174
+ 'চ': 'cha' # 6
175
+ 'ছ': 'chha' # 7
176
+ 'জ': 'ja' # 8
177
+ 'ঝ': 'jha' # 9
178
+ 'ঞ': 'ña' # 10
179
+ 'ট': 'ṭa' # 11
180
+ 'ঠ': 'ṭha' # 12
181
+ 'ড': 'ḍa' # 13 A Dotted variants of the characters: ড় ṙa; ঢ় ṙha; য় ya.
182
+ 'ঢ': 'ḍha' # 14 A Dotted variants of the characters: ড় ṙa; ঢ় ṙha; য় ya.
183
+ 'ণ': 'ṇa' # 15
184
+ 'ত': 'ta' # 16
185
+ 'থ': 'tha' # 17
186
+ 'দ': 'da' # 18
187
+ 'ধ': 'dha' # 19
188
+ 'ন': 'na' # 20
189
+ 'প': 'pa' # 21
190
+ 'ফ': 'pha' # 22
191
+ 'ব': 'ba' # 23
192
+ 'ভ': 'bha' # 24
193
+ 'ম': 'ma' # 25
194
+ 'য': 'j̱aA' # 26
195
+ 'র': 'ra' # 27
196
+ 'ল': 'la' # 28
197
+ 'শ': 'sha' # 29
198
+ 'ষ': 'ṣha' # 30
199
+ 'স': 'sa' # 31
200
+ 'হ': 'ha' # 32
201
+
202
+ # IV 13, 14
203
+ 'ড়': 'ṙa'
204
+ 'ঢ়': 'ṙha'
205
+ 'য়': 'ya'
206
+
207
+
208
+ # V. Ligatures
209
+ # Adscript forms of some consonants
210
+ #
211
+ # We already implemented one to one mapping for most commonly used
212
+ # combined letters - (Zuktabarna), so we can ignore these custom rules
213
+ # for now.
214
+ #
215
+ # 'র্‍': 'r-:'
216
+ # '‍্র': '-r:'
217
+ # '‍্য': '-y:'
218
+
219
+
220
+ # Other ligatures (the list is not complete)
221
+
222
+ 'ক্ক': 'kka'
223
+ 'ক্ট': 'kṭa'
224
+ 'ক্ত': 'kta'
225
+ 'ক্ন': 'kna'
226
+ 'ক্ম': 'kma'
227
+ 'ক্র': 'kra'
228
+ 'ক্ল': 'kla'
229
+ 'ক্ব': 'kva'
230
+ 'ক্ষ': 'kṣha'
231
+ 'ক্ষ্ন': 'kṣhna'
232
+ 'ক্ষ্ম': 'kṣhma'
233
+ 'ক্ষ্ব': 'kṣhva'
234
+
235
+ 'ক্স': 'ksa'
236
+ 'গ্গ': 'gga'
237
+ 'গ্দ': 'gda'
238
+ 'গ্ধ': 'gdha'
239
+ 'গ্ন': 'gna'
240
+ 'গ্ম': 'gma'
241
+ 'গ্র': 'gra'
242
+ 'গ্ল': 'gla'
243
+ 'ঘ্র': 'ghra'
244
+ 'ঙ্ক': 'ṅka'
245
+ 'ঙ্গ': 'ṅga'
246
+ 'চ্চ': 'chcha'
247
+
248
+ 'চ্ছ': 'chchha'
249
+ 'চ্ছ্ব': 'chchhva'
250
+ 'চ্ঞ': 'chña'
251
+ 'জ্জ': 'jja'
252
+ 'জ্জ্ব': 'jjva'
253
+ 'জ্ঝ': 'jjha'
254
+ 'জ্ঞ': 'jña'
255
+ 'জ্ব': 'jva'
256
+ 'ঞ্চ': 'ñcha'
257
+ 'ঞ্ছ': 'ñchha'
258
+ 'ঞ্জ': 'ñja'
259
+ 'ঞ্ঝ': 'ñjha'
260
+
261
+ 'ট্ট': 'ṭṭa'
262
+ 'ড্ড': 'ḍḍa'
263
+ 'ণ্ট': 'ṇṭa'
264
+ 'ণ্ঠ': 'ṇṭha'
265
+ 'ণ্ড': 'ṇḍa'
266
+ 'ত্ত': 'tta'
267
+ 'ত্ত্ব': 'ttva'
268
+ 'ত্থ': 'ttha'
269
+ 'ত্ন': 'tna'
270
+ 'ত্ম': 'tma'
271
+ 'ত্র': 'tra'
272
+ 'ত্ল': 'tla'
273
+
274
+ 'ত্ব': 'tva'
275
+ 'দ্দ': 'dda'
276
+ 'দ্দ্ব': 'ddva'
277
+ 'দ্ধ': 'ddha'
278
+ 'দ্ধ্ব': 'ddhva'
279
+ 'দ্ন': 'dna'
280
+ 'দ্ব': 'dva'
281
+ 'দ্ভ': 'dbha'
282
+ 'দ্ম': 'dma'
283
+ 'দ্র': 'dra'
284
+ 'দ্ল': 'dla'
285
+ 'ধ্র': 'dhra'
286
+
287
+ 'ন্ঠ': 'nṭha'
288
+ 'ন্ড': 'nḍa'
289
+ 'ন্ক': 'nka'
290
+ 'ন্ত': 'nta'
291
+ 'ন্ত্র': 'ntra'
292
+ 'ন্থ': 'ntha'
293
+ 'ন্দ': 'nda'
294
+ 'ন্দ্র': 'ndra'
295
+ 'ন্ধ': 'ndha'
296
+ 'ন্ন': 'nna'
297
+ 'ন্ম': 'nma'
298
+ 'ন্ব': 'nva'
299
+
300
+ 'প্ন': 'pna'
301
+ 'প্ত': 'pta'
302
+ 'প্প': 'ppa'
303
+ 'প্র': 'pra'
304
+ 'প্ল': 'pla'
305
+ 'ফ্র': 'phra'
306
+ 'ব্জ': 'bja'
307
+ 'ব্দ': 'bda'
308
+ 'ব্ধ': 'bdha'
309
+ 'ব্ব': 'bba'
310
+ 'ব্র': 'bra'
311
+ 'ভ্র': 'bhra'
312
+ 'ম্প': 'mpa'
313
+ 'ম্ব': 'mba'
314
+ 'ম্ভ': 'mbha'
315
+ 'ম্ভ্র': 'mbhra'
316
+ 'ম্ম': 'mma'
317
+ 'ম্র': 'mra'
318
+ 'ম্ল': 'mla'
319
+ 'ল্ক': 'lka'
320
+ 'ল্ট': 'lṭa'
321
+ 'ল্ড': 'lḍa'
322
+ 'ল্ম': 'lma'
323
+ 'ল্ল': 'lla'
324
+
325
+ 'শ্চ': 'shcha'
326
+ 'শ্ছ': 'shchha'
327
+ 'শ্ত': 'shta'
328
+ 'শ্ন': 'shna'
329
+ 'শ্ম': 'shma'
330
+ 'শ্র': 'shra'
331
+ 'শ্ল': 'shla'
332
+ 'শ্ব': 'shva'
333
+ 'ষ্ক': 'ṣhka'
334
+ 'ষ্ট': 'ṣhṭa'
335
+ 'ষ্ট্র': 'ṣhṭra'
336
+ 'ষ্ঠ': 'ṣhṭha'
337
+
338
+ 'ষ্ঞ': 'ṣhña'
339
+ 'ষ্প': 'ṣhpa'
340
+ 'ষ্ফ': 'ṣhpha'
341
+ 'স্ক': 'ska'
342
+ 'স্ক্র': 'skra'
343
+ 'স্খ': 'skha'
344
+ 'স্ত': 'sta'
345
+ 'স্ন': 'sna'
346
+ 'স্ম': 'sma'
347
+ 'স্র': 'sra'
348
+ 'স্ব': 'sva'
349
+ 'হ্ন': 'hna'
350
+
351
+ 'হ্ম': 'hma'
352
+ 'হ্র': 'hra'
353
+ 'হ্ল': 'hla'
354
+
355
+ # Zuktabarna - combined letters
356
+ #
357
+ # The following is not an official list, but it has been
358
+ # collected and verified from some reliable source.
359
+ # Source: https://www.somewhereinblog.net/blog/trivuzblog/28849694
360
+ #
361
+ 'ক্ট্র': 'kṭra'
362
+ 'ক্ত্র': 'ktra'
363
+ 'ক্য': 'kya'
364
+ 'ক্ষ্ণ': 'kṣṇa'
365
+ 'ক্ষ্ম': 'kṣma'
366
+ 'খ্য': 'khaj̱a'
367
+ 'খ্র': 'khra'
368
+ 'গ্ন': 'gna'
369
+ 'গ্‌ণ': 'gṇa'
370
+ 'গ্ধ্য': 'gdhya'
371
+ 'গ্ধ্র': 'gdhra'
372
+ 'গ্ন্য': 'gnya'
373
+ 'গ্ব': 'gva'
374
+ 'গ্য': 'gya'
375
+ 'গ্র্য': 'grya'
376
+ 'ঘ্ন': 'ghna'
377
+ 'ঘ্য': 'ghya'
378
+ 'ঙ্‌ক্ত': 'ṅkata'
379
+ 'ঙ্ক্য': 'ṅkaya'
380
+ 'ঙ্ক্ষ': 'ṅkṣa'
381
+ 'ঙ্খ': 'ṅkha'
382
+ 'ঙ্গ্য': 'ṅgaya'
383
+ 'ঙ্ঘ': 'ṅgha'
384
+ 'ঙ্ঘ্য': 'ṅghya'
385
+ 'ঙ্ঘ্র': 'ṅghra'
386
+ 'ঙ্ম': 'ṅma'
387
+ 'চ্ছ্র': 'cchra'
388
+ 'চ্ব': 'cva'
389
+ 'চ্য': 'cya'
390
+ 'জ্য': 'jya'
391
+ 'জ্র': 'jra'
392
+ 'ট্ব': 'ṭva'
393
+ 'ট্ম': 'ṭma'
394
+ 'ট্য': 'ṭya'
395
+ 'ট্র': 'ṭra'
396
+ 'ড্ব': 'ḍva'
397
+ 'ড্য': 'ḍya'
398
+ 'ড্র': 'ḍra'
399
+ 'ড়্গ': 'ḍga'
400
+ 'ঢ্য': 'ḍhya'
401
+ 'ঢ্র': 'ḍhra'
402
+ 'ণ্ঠ্য': 'ṇṭhya'
403
+ 'ণ্ড্য': 'ṇḍya'
404
+ 'ণ্ড্র': 'ṇḍra'
405
+ 'ণ্ঢ': 'ṇḍha'
406
+ 'ণ্ণ': 'ṇṇa'
407
+ 'ণ্ব': 'ṇva'
408
+ 'ণ্ম': 'ṇma'
409
+ 'ণ্য': 'ṇya'
410
+ 'ৎক': 'tka'
411
+ 'ত্ত্য': 'ttya'
412
+ 'ত্ম্য': 'tmya'
413
+ 'ত্য': 'tya'
414
+ 'ত্র্য': 'trya'
415
+ 'ৎল': 'tla'
416
+ 'ৎস': 'tsa'
417
+ 'থ্ব': 'thva'
418
+ 'থ্য': 'thya'
419
+ 'থ্র': 'thra'
420
+ 'দ্গ': 'dga'
421
+ 'দ্ঘ': 'dgha'
422
+ 'দ্ভ্র': 'dbhra'
423
+ 'দ্য': 'dya'
424
+ 'দ্র্য': 'draya'
425
+ 'ধ্ন': 'dhna'
426
+ 'ধ্ব': 'dhva'
427
+ 'ধ্ম': 'dhma'
428
+ 'ধ্য': 'dya'
429
+ 'ন্ট': 'nṭa'
430
+ 'ন্ট্র': 'nṭra'
431
+ 'ন্ড্র': 'nḍra'
432
+ 'ন্ত্ব': 'ntva'
433
+ 'ন্ত্য': 'ntaya'
434
+ 'ন্ত্র্য': 'ntraya'
435
+ 'ন্থ্র': 'nthra'
436
+ 'ন্দ্য': 'ndya'
437
+ 'ন্দ্ব': 'ndva'
438
+ 'ন্ধ্য': 'ndhya'
439
+ 'ন্ধ্র': 'ndhra'
440
+ 'ন্য': 'nya'
441
+ 'প্ট': 'pṭa'
442
+ 'প্য': 'pya'
443
+ 'প্র্য': 'praya'
444
+ 'প্স': 'psa'
445
+ 'ফ্ল': 'phla'
446
+ 'ব্য': 'bya'
447
+ 'ব্ল': 'bla'
448
+ 'ভ্ব': 'bhva'
449
+ 'ভ্য': 'bhya'
450
+ 'ম্ন': 'mna'
451
+ 'ম্প্র': 'mpra'
452
+ 'ম্ফ': 'mpha'
453
+ 'ম্ব্র': 'mvra'
454
+ 'ম্য': 'mya'
455
+ 'য্য': 'j̱aya'
456
+ 'র্ক': 'rka'
457
+ 'র্ক্য': 'rkya'
458
+ 'র্গ্য': 'rgya'
459
+ 'র্ঘ্য': 'rghya'
460
+ 'র্চ্য': 'rchya'
461
+ 'র্জ্য': 'rjya'
462
+ 'র্ণ্য': 'rṇya'
463
+ 'র্ত্য': 'rtya'
464
+ 'র্থ্য': 'rthya'
465
+ 'র্ব্য': 'rvya'
466
+ 'র্ম্য': 'rmya'
467
+ 'র্শ্য': 'rshya'
468
+ 'র্ষ্য': 'rṣhya'
469
+ 'র্হ্য': 'rhya'
470
+ 'র্খ': 'rkha'
471
+ 'র্গ': 'rga'
472
+ 'র্গ্র': 'rgra'
473
+ 'র্ঘ': 'rgha'
474
+ 'র্চ': 'rcha'
475
+ 'র্ছ': 'rchha'
476
+ 'র্জ': 'rja'
477
+ 'র্ঝ': 'rjha'
478
+ 'র্ট': 'rṭa'
479
+ 'র্ড': 'rḍa'
480
+ 'র্ণ': 'rṇa'
481
+ 'র্ত': 'rta'
482
+ 'র্ত্র': 'rtra'
483
+ 'র্থ': 'rtha'
484
+ 'র্দ': 'rda'
485
+ 'র্দ্ব': 'rdva'
486
+ 'র্দ্র': 'rdra'
487
+ 'র্ধ': 'rdha'
488
+ 'র্ধ্ব': 'rdhba'
489
+ 'র্ন': 'rna'
490
+ 'র্প': 'rpa'
491
+ 'র্ফ': 'rpha'
492
+ 'র্ভ': 'rbha'
493
+ 'র্ম': 'rma'
494
+ 'র্য': 'rya'
495
+ 'র্ল': 'rla'
496
+ 'র্শ': 'rsha'
497
+ 'র্শ্ব': 'rshba'
498
+ 'র্ষ': 'rṣha'
499
+ 'র্স': 'rsa'
500
+ 'র্হ': 'rha'
501
+ 'র্ঢ্য': 'rḍhya'
502
+ 'ল্ক্য': 'lkaya'
503
+ 'ল্গ': 'lga'
504
+ 'ল্প': 'lpa'
505
+ 'ল্‌ফ': 'lpha'
506
+ 'ল্ফ': 'lpha'
507
+ 'ল্ব': 'lba'
508
+ 'ল্‌ভ': 'lbha'
509
+ 'ল্য': 'lya'
510
+ 'শ্য': 'sya'
511
+ 'ষ্ক্র': 'ṣkra'
512
+ 'ষ্ট্য': 'ṣṭya'
513
+ 'ষ্ঠ্য': 'ṣṭhya'
514
+ 'ষ্ণ': 'ṣṇa'
515
+ 'ষ্প্র': 'ṣpra'
516
+ 'ষ্ব': 'ṣva'
517
+ 'ষ্ম': 'ṣma'
518
+ 'ষ্য': 'ṣya'
519
+ 'স্ট': 'sṭa'
520
+ 'স্ট্র': 'sṭra'
521
+ 'স্ত্ব': 'stva'
522
+ 'স্ত্য': 'stṣya'
523
+ 'স্ত্র': 'stra'
524
+ 'স্থ': 'stha'
525
+ 'স্থ্য': 'sthya'
526
+ 'স্প': 'spa'
527
+ 'স্প্র': 'spra'
528
+ 'স্প্‌ল': 'spala'
529
+ 'স্ফ': 'spha'
530
+ 'স্য': 'sya'
531
+ 'স্ল': 'sla'
532
+ 'হ্ণ': 'hṇa'
533
+ 'হ্ব': 'hva'
534
+ 'হ্য': 'hya'