interscript 0.1.7 → 0.1.9

Sign up to get free protection for your applications and to get access to all the features.
Files changed (82) hide show
  1. checksums.yaml +4 -4
  2. data/README.adoc +1 -3
  3. data/aliases.json +1 -0
  4. data/lib/interscript.rb +8 -3
  5. data/lib/interscript/fs.rb +27 -0
  6. data/lib/interscript/mapping.rb +3 -1
  7. data/lib/interscript/opal.rb +142 -3
  8. data/lib/interscript/opal/entrypoint.rb +8 -0
  9. data/lib/interscript/opal/exports.rb +11 -0
  10. data/lib/interscript/opal/maps.js.erb +2 -4
  11. data/lib/interscript/version.rb +1 -1
  12. data/maps/alalc-ara-Arab-Latn-1997.yaml +5 -5
  13. data/maps/alalc-asm-Deva-Latn-1997.yaml +104 -10
  14. data/maps/alalc-asm-Deva-Latn-2012.yaml +18 -3
  15. data/maps/alalc-aze-Arab-Latn-1997.yaml +376 -0
  16. data/maps/alalc-ben-Beng-Latn-1997.yaml +291 -0
  17. data/maps/alalc-div-Thaa-Latn-1997.yaml +211 -0
  18. data/maps/alalc-hin-Deva-Latn-1997.yaml +102 -10
  19. data/maps/alalc-hin-Deva-Latn-2011.yaml +19 -1
  20. data/maps/alalc-kan-Kana-Latn-1997.yaml +274 -0
  21. data/maps/alalc-kan-Kana-Latn-2011.yaml +63 -0
  22. data/maps/alalc-ori-Orya-Latn-1997.yaml +284 -0
  23. data/maps/alalc-ori-Orya-Latn-2011.yaml +67 -0
  24. data/maps/alalc-pra-Deva-Latn-2012.yaml +2 -2
  25. data/maps/alalc-san-Deva-Latn-2012.yaml +78 -9
  26. data/maps/alalc-tel-Telu-Latn-1997.yaml +284 -0
  27. data/maps/alalc-tel-Telu-Latn-2011.yaml +64 -0
  28. data/maps/az-aze-Cyrl-Latn-1939.yaml +105 -0
  29. data/maps/az-aze-Cyrl-Latn-1958.yaml +45 -0
  30. data/maps/bgnpcgn-ara-Arab-Latn-1956.yaml +3 -1
  31. data/maps/bgnpcgn-aze-Cyrl-Latn-1993.yaml +111 -104
  32. data/maps/bgnpcgn-bal-Arab-Latn-2008.yaml +329 -0
  33. data/maps/bgnpcgn-bul-Cyrl-Latn-1952.yaml +1 -1
  34. data/maps/bgnpcgn-div-Thaa-Latn-1988.yaml +75 -0
  35. data/maps/bgnpcgn-far-Latn-Latn-1964.yaml +28 -0
  36. data/maps/bgnpcgn-isl-Latn-Latn-1964.yaml +37 -0
  37. data/maps/bgnpcgn-kaz-Cyrl-Latn-1979.yaml +247 -0
  38. data/maps/bgnpcgn-kir-Cyrl-Latn-1979.yaml +218 -0
  39. data/maps/bgnpcgn-kur-Arab-Latn-2007.yaml +249 -0
  40. data/maps/bgnpcgn-per-Arab-Latn-1958.yaml +2 -0
  41. data/maps/bgnpcgn-prs-Arab-Latn-2007.yaml +87 -53
  42. data/maps/bgnpcgn-pus-Arab-Latn-1968.yaml +377 -0
  43. data/maps/bgnpcgn-srp-Cyrl-Latn-1962.yaml +73 -0
  44. data/maps/bgnpcgn-urd-Arab-Latn-2007.yaml +459 -0
  45. data/maps/{bis-knd-Knda-Latn-13194-1991.yaml → bis-kan-Kana-Latn-13194-1991.yaml} +2 -2
  46. data/maps/bis-ori-Orya-Latn-13194-1991.yaml +17 -2
  47. data/maps/iso-ara-Arab-Latn-233-1984.yaml +1 -1
  48. data/maps/{iso-kan-Knda-Latn-15919-2001.yaml → iso-kan-Kana-Latn-15919-2001.yaml} +1 -1
  49. data/maps/{mns-mon-Cyrl-Latn-5217-2012.yaml → masm-mon-Cyrl-Latn-5217-2012.yaml} +2 -2
  50. data/maps/{mns-mon-Latn-Cyrl-5217-2012.yaml → masm-mon-Latn-Cyrl-5217-2012.yaml} +1 -1
  51. data/maps/mv-div-Thaa-Latn-1987.yaml +200 -0
  52. data/maps/odni-ara-Arab-Latn-2004.yaml +137 -0
  53. data/maps/odni-ara-Arab-Latn-2015.yaml +20 -130
  54. data/maps/odni-bul-Cyrl-Latn-2005.yaml +90 -0
  55. data/maps/odni-fas-Arab-Latn-2004.yaml +276 -0
  56. data/maps/odni-hin-Deva-Latn-2004.yaml +182 -0
  57. data/maps/odni-mkd-Cyrl-Latn-2005.yaml +21 -0
  58. data/maps/odni-prs-Arab-Latn-2004.yaml +123 -0
  59. data/maps/{odni-per-Arab-Latn-2015.yaml → odni-prs-Arab-Latn-2015.yaml} +0 -0
  60. data/maps/odni-srp-Cyrl-Latn-2005.yaml +36 -0
  61. data/maps/odni-tuk-Cyrl-Latn-2015.yaml +170 -0
  62. data/maps/odni-ukr-Cyrl-Latn-2015.yaml +4 -0
  63. data/maps/un-ara-Arab-Latn-2017.yaml +1 -1
  64. data/maps/un-asm-Beng-Latn-1972.yaml +223 -0
  65. data/maps/un-guj-Gujr-Latn-1972.yaml +229 -0
  66. data/maps/un-hin-Deva-Latn-2016.yaml +104 -10
  67. data/maps/un-kan-Kana-Latn-2016.yaml +254 -0
  68. data/maps/un-mal-Mlym-Latn-1972.yaml +251 -0
  69. data/maps/un-mar-Deva-Latn-2016.yaml +24 -13
  70. data/maps/un-nep-Deva-Latn-1972.yaml +40 -121
  71. data/maps/un-ori-Orya-Latn-1972.yaml +247 -0
  72. data/maps/un-pan-Guru-Latn-1972.yaml +402 -0
  73. data/maps/un-prs-Arab-Latn-1967.yaml +236 -0
  74. data/maps/un-tam-Taml-Latn-1972.yaml +194 -0
  75. data/maps/un-tel-Telu-Latn-1972.yaml +270 -0
  76. data/maps/un-urd-Arab-Latn-1972.yaml +405 -0
  77. data/maps/var-amh-Ethi-Latn-eae-2003.yaml +466 -0
  78. data/maps/var-gez-Ethi-Latn-eae-2003.yaml +76 -0
  79. data/spec/interscript/filenames_spec.rb +6 -369
  80. data/spec/interscript_spec.rb +10 -2
  81. metadata +50 -7
  82. data/lib/interscript/opal/map_translate.rb +0 -7
@@ -31,10 +31,25 @@ notes:
31
31
 
32
32
  tests:
33
33
  - source: "ৰাজ্যিক স্বাস্থ্য মন্ত্ৰী পীয়ুষ হাজৰিকাৰ বিৰুদ্ধে দাখিল কৰা হৈছে এজাহাৰ।"
34
- expected: "raājaẏaika sabaāsathaẏa manataraī paīyausha haājaraikaāra bairaudadhae daākhaila karaā haaichae ejaāhaāra."
34
+ expected: "rājẏika sbāsthẏa mantrī pīyusha hājarikāra biruddhe dākhila karā haiche ejāhāra."
35
35
  - source: "কোৰোনা মহামাৰীৰ এই সময়ত সভাখনত হাজাৰ হাজাৰ লোকে মাস্ক পৰিধান নকৰাৰ লগতে সামাজিক দূৰত্ব নমনাৰ অভিযোগ উত্থাপন কৰা হৈছে"
36
- expected: "kaoraonaā mahaāmaāraīra ei samayata sabhaākhanata haājaāra haājaāra laokae maāsaka paraidhaāna nakaraāra lagatae saāmaājaika daūrataba namanaāra abhaiẏaoga utathaāpana karaā haaichae"
37
-
36
+ expected: "koronā mahāmārīra ei samayata sabhākhanata hājāra hājāra loke māska paridhāna nakarāra lagate sāmājika dūratba namanāra abhiẏoga utthāpana karā haiche"
37
+ - source: "হাওৰাঘাটৰ গ্ৰামীণ বিকাশ বেংক হিতাধিকাৰীৰ পৰা উৎকোচ লৈ গ্ৰেপ্তাৰ বিজেপি কৰ্মী যীচু কেম্পাই"
38
+ expected: "hāorāghāṭara grāmīṇa bikāśa beṃka hitādhikārīra parā uṭkoca lai greptāra bijepi karmī ẏīcu kempāi"
39
+ - source: "জ্যেষ্ঠ সাংবাদিক পৰাগ ভূঞাৰ মৃত্যুক লৈ তদন্ত আৰম্ভ চিআইডিৰ"
40
+ expected: "jẏeshṭha sāṃbādika parāga bhūñāra mṛtẏuka lai tadanta ārambha ciāiḍira"
41
+ - source: "সাংবাদিক পৰাগ ভূঞাৰ মৃত্যুৰ উচিত তদন্তৰ দাবীত নলবাৰীত অৱস্থান ধৰ্মঘট"
42
+ expected: "sāṃbādika parāga bhūñāra mṛtẏura ucita tadantara dābīta nalabārīta awasthāna dharmaghaṭa"
43
+ - source: "দৰঙৰ বিভিন্ন অঞ্চলত মানসিক ৰোগৰ সজাগতামূলক বাটৰ নাট প্ৰদৰ্শন"
44
+ expected: "daraṅara bibhinna añcalata mānasika rogara sajāgatāmūlaka bāṭara nāṭa pradarśana"
45
+ - source: "অযোধ্যাত দীপাৱলীঃ ৫.৮৬ লাখ মাটি চাকি জ্বলাই গঢ়িলে গিনিজ ৱ’ৰ্ল্ড ৰেকৰ্ড"
46
+ expected: "aẏodhẏāta dīpāwalīḥ 5.86 lākha māṭi cāki jbalāi gaḍha়ile ginija wa’rlḍa rekarḍa"
47
+ - source: "ৰাজ্যত আকৌ ২৩৩ জন কোভিড পজিটিভ, সুস্থ হৈছে ৬৪২ জন"
48
+ expected: "rājẏata ākau 233 jana kobhiḍa pajiṭibha, sustha haiche 642 jana"
49
+ - source: "এতিয়ালৈকে ৰাজ্যত এই ভাইৰাছত আক্ৰান্ত লোকৰ সংখ্যা ২১০০৬৮জনলৈ পাইছে বৃদ্ধি।"
50
+ expected: "etiyālaike rājẏata ei bhāirāchata ākrānta lokara saṃkhẏā 210068janalai pāiche bṛddhi."
51
+ - source: "এতিয়ালৈকে ৰাজ্যত কোৰোনাত আক্ৰান্ত হৈ ৯৫৮জন লোক হেৰুৱাইছে প্ৰাণ।"
52
+ expected: "etiyālaike rājẏata koronāta ākrānta hai 958jana loka heruwāiche prāṇa."
38
53
  map:
39
54
 
40
55
  inherit: "alalc-asm-Deva-Latn-1997"
@@ -0,0 +1,376 @@
1
+ ---
2
+ authority_id: alalc
3
+ id: 1997
4
+ language: iso-639-2:aze
5
+ source_script: Arab
6
+ destination_script: Latn
7
+ name: ALA-LC Romanization Table -- azerbaij (1997)
8
+ alias:
9
+ ogc11122:
10
+ code: aze_Arab2Latn_ALA_1997
11
+ description: Arabic ALA-Library of Congress 1997 System
12
+ url: http://catdir.loc.gov/catdir/cpso/romanization/azerbaij.pdf
13
+ creation_date: 1997
14
+ description: |
15
+ ALA-LC Romanization table for Azerbaijani in Arabic script
16
+
17
+ notes:
18
+ - 1. As seen in the examples above, vowel harmony, which is
19
+ found in Modern Turkish, applies to Azerbaijani as well.
20
+
21
+ - 2. The letter ى in final position may represent the long
22
+ vowel romanized á, in addition to the
23
+ vowels romanized i and ī. This occurs in Arabic names, such
24
+ as Mustafá
25
+ al-Musanná
26
+
27
+ - 3. Vowel points are used sparingly in Azerbaijani
28
+ publications. For romanization, they must be
29
+ supplied from a dictionary.
30
+
31
+ - Rule 1 ء hamza
32
+ (a) When initial, ء is not represented in romanization
33
+ üzdah أوزدة
34
+
35
+ (b) When medial or final in words of Perso-Arabic origin, ء is
36
+ romanized as ’ (alif), except when it accompanies the
37
+ phonetic sound e (as in men), in which case it is romanized
38
+ by e.
39
+ mas’alah مَسئَلة
40
+ gecah كئجة
41
+ necah نئجة
42
+
43
+ - Rule 2 ˜ (maddah)
44
+
45
+ (a) Initial آ is romanized ā.
46
+
47
+ ādām آدام
48
+
49
+ (b) Medial آ, when it represents the phonetic combination ’ā, is so romanized.
50
+
51
+ Ḥeydar’ābād حيدَرآبَاد
52
+
53
+ - Rule 3 ّ
54
+ (shaddah or tashdid) is represented by doubling the letter or digraph concerned.
55
+ sāqqāl سَاقَّال
56
+
57
+ Note the exceptional case where ّ is written over و and ي to represent
58
+ the combination of long vowel plus consonants.
59
+ madanīyat مَدَنِيَّت
60
+
61
+ - |
62
+ Rule 4 Tanvīn (written form ٌ, ً (ًا), or ٍ ) which occurs chiefly in Arabic words,
63
+ is romanized un, in, an, and an, respectively.
64
+
65
+ (a) When it occurs in indefinite nouns derived from defective roots.
66
+
67
+ qāḍin قاضٍ
68
+ ma‘nan معنىً
69
+
70
+ (b) When it indicates the adverbial use of a noun or adjective.
71
+
72
+ ṭab‘an طبعًا
73
+ faj’atan فجأةً
74
+ al-Mushtarik waḍ‘an المشترك وضعاً
75
+ wa-al-muftariq ṣuq‘an والمفترق صقعاً
76
+
77
+ - ة in a word in the construct state is romanized t. See rule 7(b).
78
+
79
+ - The consonant letter ة at the end of Arabic words in the
80
+ genitive construction (izāfah) is romanized by t.
81
+
82
+ takmilat al-axbār تَكمِلَة الأخبَار
83
+
84
+ # Grammatical Structure as It Affects Romanization
85
+ - Rule 6 izāfah. When two Persian words are used in an Azerbaijani
86
+ context in a relationship known as izāfah, the first word (
87
+ the muzāf) is followed by an additional letter or syllable
88
+ in romanization. This is added according to the following
89
+ rules
90
+ (a) When the muzāf bears no special mark of izāfah, it is
91
+ followed by -i.
92
+ Sazman-i tabligāti-Islām سازمان تبليغات اسلامي
93
+
94
+ (b) When the muzāf is marked by the addition of ء, it is followed by -'i.
95
+ Nābigah-'i dahr نابغة دَهر
96
+
97
+ (c) When the muzāf is marked by the addition of ى, it is followed by -yi.
98
+ darya-yi nur دَريَاي نُور
99
+
100
+ (d) izāfah is represented in romanization of personal names only when
101
+ implied in the Persian script.
102
+ Mucīr-i Beylaqānī مَجير بيلقاني
103
+ Maktabī-i Şīrāzī مكتبي شيرازي
104
+
105
+ # Affixes and Compounds
106
+ - Rule 7 Affixes.
107
+ (a) When the affix and the word with which it is connected grammatically are
108
+ written separately in Azerbaijani, the two are separated in romanization
109
+ by a single prime(').
110
+
111
+ (b) The Arabic article al is separated by a hyphen, in romanization,
112
+ from the word to which it is prefixed.
113
+
114
+ - Rule 8 Compounds.
115
+ When the elements of a compound (except a compound personal name)
116
+ are written separately in Azerbaijani, they are separated in
117
+ romanization by a single prime(').
118
+ # Orthography of Azerbaijani in Romanization
119
+
120
+ - Rule 9 Capitalization
121
+
122
+ (a) Rules for the capitalization of English are followed, except that
123
+ the Arabic article al, is lower cased in all positions.
124
+
125
+ (b) Diacritics are used with both upper and lower case letters in romanization.
126
+
127
+ - Rule 10 Foreign words.
128
+ Foreign words in an Azerbaijani context, including Persian and Arabic words,
129
+ are romanized according to the rules for Azerbaijani. For short vowels not
130
+ indicated in the script, the Azerbaijani vowels nearest the original
131
+ pronunciation of the word are supplied in romanization.
132
+
133
+ tests:
134
+ - source: بَرَكَت
135
+ expected: Barakat
136
+
137
+ - source: سَاحِل
138
+ expected: Sāḥil
139
+
140
+ - source: بَادِمجَان
141
+ expected: Bādimcān
142
+
143
+ - source: قُدرَت
144
+ expected: Qudrat
145
+
146
+ - source: بُوغَا
147
+ expected: Būğā
148
+
149
+ - source: آرَام
150
+ expected: Ārām
151
+
152
+ - source: اِئنلِي
153
+ expected: Enlī
154
+
155
+ - source: دَلِيل
156
+ expected: Dalīl
157
+
158
+ - source: قَارَانلِيق
159
+ expected: Qārānlīq
160
+
161
+ - source: اِيش
162
+ expected: Īş
163
+
164
+ - source: اِيشِيق
165
+ expected: Īşīq
166
+
167
+ - source: اُون
168
+ expected: 'On'
169
+
170
+ - source: ُاون
171
+ expected: Ūn
172
+
173
+ - source: ُاؤن
174
+ expected: Ön
175
+
176
+ # - source: ُأوزُوم
177
+ # expected: üzūm
178
+
179
+ - source: اَيْوَان
180
+ expected: Eyvān
181
+
182
+ - source: اَوحَدِي
183
+ expected: Awḥadī
184
+
185
+ - source: َاوَّل
186
+ expected: Avval
187
+
188
+ - source: طَهي
189
+ expected: Ṭahy
190
+
191
+ # From Rule 1 - part a
192
+
193
+ - source: ُأوزدَة
194
+ expected: Üzdah
195
+
196
+ # From Rule 1 - part b
197
+
198
+ - source: مَسئَلَة
199
+ expected: Mas’alah
200
+
201
+ - source: گِئجَة
202
+ expected: Gecah
203
+
204
+ - source: نِئچَة
205
+ expected: Neçah
206
+
207
+ # From Rule 2 - part a
208
+ - source: آدَام
209
+ expected: Ādām
210
+
211
+ # From Rule 2 - part b
212
+ - source: حَيْدَرآبَاد
213
+ expected: Ḥeydar’ābād
214
+
215
+ # From Rule 3
216
+ - source: سَاقَّال
217
+ expected: Sāqqāl
218
+
219
+ - source: مَدَنِيَّت
220
+ expected: Madanīyat
221
+
222
+ # From Rule 5
223
+
224
+ - source: تَكمِلَة الأَخبَار
225
+ expected: Takmilat al-Axbār
226
+
227
+
228
+ map:
229
+ postrules:
230
+ - pattern: (?<=\b)(?<!\b[‘|’|'])[\u0061-\uFFFF]
231
+ result: "upcase"
232
+
233
+ - pattern : '\bAl' # الت
234
+ result: 'al'
235
+
236
+ characters:
237
+
238
+ '\u0628\u0651': 'bb' # ب
239
+ '\u067E\u0651': 'pp' # پ
240
+ '\u062A\u0651': 'tt' # ت
241
+ '\u062b\u0651': 's̱s̱' # ث
242
+ '\u062C\u0651': 'cc' # ج
243
+ '\u0686\u0651': 'çç' # چ
244
+ '\u062d\u0651': 'ḥḥ' # ح
245
+ '\u062E\u0651': 'xx' # خ
246
+ '\u062F\u0651': 'dd' # د
247
+ '\u0630\u0651': 'ẕẕ' # ذ
248
+ '\u0631\u0651': 'rr' # ر
249
+ '\u0632\u0651': 'zz' # ز
250
+ '\u0698\u0651': 'jj' # ژ
251
+ '\u0633\u0651': 'ss' # س
252
+ '\u0634\u0651': 'şş' # ش
253
+ '\u0635\u0651': 'ṣṣ' # ص
254
+ '\u0636\u0651': 'z̤z̤' # ض
255
+ '\u0637\u0651': 'ṭṭ' # ط
256
+ '\u0638\u0651': 'ẓẓ' # ظ
257
+ '\u0639\u0651': '‘‘' # ع
258
+ '\u063A\u0651': 'ğğ' # غ
259
+ '\u0641\u0651': 'ff' # ف
260
+ '\u0642\u0651': 'qq' # ق
261
+ '\u06A9\u0651': 'kk' # ك
262
+ '\u0643\u0651': 'kk' # ك
263
+ '\u06AF\u0651': 'gg' # گ
264
+ '\u0644\u0651': 'll' # ل
265
+ '\u0645\u0651': 'mm' # م
266
+ '\u0646\u0651': 'nn' # ن
267
+ '\u0648\u0651': 'vv' # و
268
+ '\u0647\u0651': 'hh' # ه
269
+ '\u064A\u0651': 'yy' # ي
270
+
271
+ '\u060c': ',' # ،
272
+
273
+ '\u0627\u0644\u0644\u0651\u064e\u0647': "Allāh"
274
+
275
+ '\b\u0627\u0644' : 'al-' # ال
276
+
277
+ '\u0628': 'b' # ب
278
+ '\u067E': 'p' # پ
279
+ '\u062A': 't' # ت
280
+ '\u062b': 's̱' # ث
281
+ '\u062C': 'c' # ج
282
+ '\u0686': 'ç' # چ
283
+ '\u062d': 'ḥ' # ح
284
+ '\u062E': 'x' # خ
285
+ '\u062F': 'd' # د
286
+ '\u0630': 'ẕ' # ذ
287
+ '\u0631': 'r' # ر
288
+ '\u0632': 'z' # ز
289
+ '\u0698': 'j' # ژ
290
+ '\u0633': 's' # س
291
+ '\u0634': 'ş' # ش
292
+ '\u0635': 'ṣ' # ص
293
+ '\u0636': 'z̤' # ض
294
+ '\u0637': 'ṭ' # ط
295
+ '\u0638': 'ẓ' # ظ
296
+ '\u0639': '‘' # ع
297
+ '\u063A': 'ğ' # غ
298
+ '\u0641': 'f' # ف
299
+ '\u0642': 'q' # ق
300
+ '\u06A9': 'k' # ك
301
+ '\u0643': 'k' # ك
302
+ '\u06AF': 'g' # گ
303
+ '\u0644': 'l' # ل
304
+ '\u0645': 'm' # م
305
+ '\u0646': 'n' # ن
306
+ '\u0648': 'v' # و
307
+ '\u0647': 'h' # ه
308
+ '\u064A': 'y' # ي
309
+
310
+ # Vowels and Diphthongs
311
+ '\u064e': 'a'
312
+ '\u0650': 'i'
313
+ '\u064f': 'u'
314
+ '\u064f\u0648' : 'ū' # ـُو damma followed by و
315
+ '\u064e\u0627' : 'ā' # ـَا fatha followed by ا
316
+ '\u0622' : 'ā' # آ
317
+ '\u0650[\u0621|\u0623|\u0624|\u0626]' : 'e' # ـِأ kasra followed by hamza
318
+ '\u0650\u064a' : 'ī' # ـِي kasra followed by ي
319
+ '\u0650\u0627\u064a' : 'ī' # ـِي kasra followed by ي
320
+ '[\u064f]?\u0627\u064f\u0648' : 'o' # ـُاُو
321
+ '\u064f\u0627\u0648' : 'ū' # ـُاو
322
+ '\u064f\u0627\u0624' : 'ö' # ـُاؤ
323
+ '\u064f\u0623\u0648' : 'ü' # ـُأو
324
+ '\u064e\u064a\u0652' : 'ey' # ـَيْ
325
+ '\u064e\u0648\u0652?' : 'aw' # ـَوْ
326
+ '\b\u0627' : '' # ا
327
+ '\b\u064e\u0627': 'a' # ـَا
328
+ '\b\u0650\u0627': 'i' # ـِا
329
+
330
+ # hamzah
331
+ '\b\u0623' : '' # أ
332
+ '\u0623' : '’' # أ
333
+ '\u0624': '’' # ؤ
334
+ '\u0626' : "’" # ئ
335
+ '\b\u0622' : 'ā' # آ
336
+ '(?<!\b\u0627\u0644)(?<!\b)\u0622(?![\b|\u0621])' : '’ā' # آ in middle, not final, or initial, or after ال
337
+ '\u064e\u0622' : 'ā' # ـَآ fatha followed by ا
338
+ '\u0622' : '' # آ
339
+
340
+ # Rule 3 - shadda
341
+ '\u0650\u064a\u0651' : 'īy' # ـِيَّ
342
+ '\u064f\u0648\u0651' : 'ūw' # ـَوّ damma followed by و with shadda
343
+ '\u0650\u064a\u0651\b' : 'ī' # ـِيَّ
344
+ '\u064e\u0648\u0651' : 'aww' # ـَوّ fatha followed by و with shadda
345
+ '\u064e\u064a\u0651' : 'ayy' # ـَيّ fatha followed by ي with shadda
346
+
347
+ # Rule 4 - tanvin
348
+ '\u064c': 'un' # ٌ
349
+ '\u064b': 'an' # ً
350
+ '\u064d': 'in' # ٍ
351
+ # tanween should be on the letter preceding the end in case of ا, ى
352
+ # however, it's common that people mistake that, so we're handling both orders
353
+ '\u064b\u0649': 'an' # ً
354
+ '\u064b\u0627': 'an' # ً
355
+ '\u0649\u064b': 'an' # ً
356
+ '\u0627\u064b': 'an' # ً
357
+
358
+ # Rule 5 ta' marboota
359
+ '\u0629' : 't' # ة in the middle of the sentence
360
+ '\u0629$' : 'h'
361
+ '(?<=\b\u0627\u0644[\u0600-\u06ff]{2})\u0629' : 'h'
362
+ '(?<=\b\u0627\u0644[\u0600-\u06ff]{3})\u0629' : 'h'
363
+ '(?<=\b\u0627\u0644[\u0600-\u06ff]{4})\u0629' : 'h'
364
+ '(?<=\b\u0627\u0644[\u0600-\u06ff]{5})\u0629' : 'h'
365
+ '(?<=\b\u0627\u0644[\u0600-\u06ff]{6})\u0629' : 'h'
366
+ '(?<=\b\u0627\u0644[\u0600-\u06ff]{7})\u0629' : 'h'
367
+ '(?<=\b\u0627\u0644[\u0600-\u06ff]{8})\u0629' : 'h'
368
+ '(?<=\b\u0627\u0644[\u0600-\u06ff]{9})\u0629' : 'h'
369
+ '(?<=\b\u0627\u0644[\u0600-\u06ff]{10})\u0629' : 'h'
370
+ '(?<=\b\u0627\u0644[\u0600-\u06ff]{11})\u0629' : 'h'
371
+ '(?<=\b\u0627\u0644[\u0600-\u06ff]{12})\u0629' : 'h'
372
+ '(?<=\b\u0627\u0644[\u0600-\u06ff]{13})\u0629' : 'h'
373
+
374
+
375
+ '\u0650\u064a(?=\u064e|\u064f)' : 'iy' # ـِي kasra followed by ي
376
+ '\u064e\u0649' : 'á' # ـَى fatha followed by ى which is ا not ي
@@ -0,0 +1,291 @@
1
+ ---
2
+ authority_id: alalc
3
+ id: 1997
4
+ language: iso-639-2:ben
5
+ source_script: Beng
6
+ destination_script: Latn
7
+ name: Bengali Romanization, 1997
8
+ alias:
9
+ ogc11122:
10
+ code: ben_Beng2Latn_ALA_1997
11
+ description: Bengali ALA-Library of Congress 1997 System
12
+ url: http://catdir.loc.gov/catdir/cpso/romanization/bengali.pdf
13
+ creation_date: 1997
14
+ description: |
15
+ ALA-LC Romanization table for Bengali
16
+
17
+ notes:
18
+
19
+ - Only the vowel forms that appear at the beginning of a syllable are
20
+ listed; the forms used for vowels following a consonant can be found in
21
+ grammars; no distinction between the two is made in transliteration.
22
+
23
+ - |
24
+ The vowel a is implicit after all consonants and consonant clusters
25
+ and is supplied in transliteration, with the following exceptions:
26
+
27
+ a) when another vowel is indicated by its appropriate sign; and
28
+ b) when the absence of any vowel is indicated by the subscript symbol ( ্ )
29
+ called hasanta or birāma.
30
+
31
+ - ব is used both as a labial and as a semivowel. When it occurs as the
32
+ second or subsequent consonant of a consonant cluster, it is
33
+ transliterated va. When ব is doubled, it is transliterated bba.
34
+
35
+ - Candrabindu before guttural, palatal, cerebral, and dental occlusives
36
+ is transliterated n̐. Before labials, sibilants, semivowels, the
37
+ aspirate, vowels, and in final position it is transliterated m̐.
38
+
39
+ - When doubled, abagraha is transliterated by two apostrophes ( ’’ ).
40
+
41
+ tests:
42
+ - source: |
43
+ আমার সোনার বাংলা, আমি তোমায় ভালোবাসি।
44
+ চিরদিন তোমার আকাশ, তোমার বাতাস, আমার প্রাণে বাজায় বাঁশি॥
45
+ ও মা, ফাগুনে তোর আমের বনে ঘ্রাণে পাগল করে, মরি হায়, হায় রে—
46
+ ও মা, অঘ্রাণে তোর ভরা ক্ষেতে আমি কী দেখেছি মধুর হাসি॥
47
+
48
+ কী শোভা, কী ছায়া গো, কী স্নেহ, কী মায়া গো—
49
+ কী আঁচল বিছায়েছ বটের মূলে, নদীর কূলে কূলে।
50
+ মা, তোর মুখের বাণী আমার কানে লাগে সুধার মতো,
51
+ মরি হায়, হায় রে—
52
+ মা, তোর বদনখানি মলিন হলে, ও মা, আমি নয়নজলে ভাসি॥
53
+
54
+ expected: |
55
+ āmāra sonāra bāṃlā, āmi tomāẏa bhālobāsi।
56
+ ciradina tomāra ākāśa, tomāra bātāsa, āmāra prāṇe bājāẏa bān̐śi॥
57
+ o mā, phāgune tora āmera bane ghrāṇe pāgala kare, mari hāẏa, hāẏa re—
58
+ o mā, aghrāṇe tora bharā kshete āmi kī dekhechi madhura hāsi॥
59
+
60
+ kī śobhā, kī chāyaṛā go, kī sneha, kī māyaṛā go—
61
+ kī ām̐cala bichāyaṛecha baṭera mūle, nadīra kūle kūle।
62
+ mā, tora mukhera bāṇī āmāra kāne lāge sudhāra mato,
63
+ mari hāẏa, hāẏa re—
64
+ mā, tora badanakhāni malina hale, o mā, āmi naẏanajale bhāsi॥
65
+ - source: "ট্রাম্প-বাইডেন মহারণ: জয় দাবি দুজনেরই"
66
+ expected: "ṭrāmpa-bāiḍena mahāraṇa: jaẏa dābi dujanerai"
67
+ - source: "রিপাবলিকান গভর্নর ডগ ডসি বলেছেন, ফলাফল নিয়ে এখনই কথা বলার সময় আসেনি।"
68
+ expected: "ripābalikāna gabharnara ḍaga ḍasi balechena, phalāphala niẏe ekhanai kathā balāra samaẏa āseni।"
69
+ - source: "অনেক আগে থেকেই ট্রাম্প ফ্লোরিডায় জিতে গেছেন বলে গণমাধ্যমগুলোতে তুলে ধরা হচ্ছে"
70
+ expected: "aneka āge thekei ṭrāmpa phloriḍāẏa jite gechena bale gaṇamādhyamagulote tule dharā hacche"
71
+ - source: "করোনায় আরও ২১ মৃত্যু, নতুন শনাক্ত ১৫১৭"
72
+ expected: "karonāẏa ārao 21 mṛtyu, natuna śanākta 1517"
73
+ - source: "শালিক পাখিকে পোষ মানানোর মতো কঠিন কাজ করা কিশোর রোহানের বাড়ি কুষ্টিয়া শহরের পিটিআই সড়কে।"
74
+ expected: "śālika pākhike posha mānānora mato kaṭhina kāja karā kiśora rohānera bāṛi kushṭiẏā śaharera piṭiāi saṛake।"
75
+ - source: "সুইং স্টেটের সর্বশেষ অবস্থা দেখে মনে হচ্ছে, দুজনের ভাগ্য দুলছে পেন্ডুলামে।"
76
+ expected: "suiṃ sṭeṭera sarbaśesha abasthā dekhe mane hacche, dujanera bhāgya dulache penḍulāme।"
77
+ - source: "২০১৬ সালের নির্বাচনে বহিরাগত হিসেবেই ডোনাল্ড ট্রাম্পের রাজনীতিতে আগমন"
78
+ expected: "2016 sālera nirbācane bahirāgata hisebei ḍonālḍa ṭrāmpera rājanītite āgamana"
79
+ - source: "কই সঙ্গে রাজনীতির পাদপ্রদীপ থেকে সম্পূর্ণ বাইরে থাকা তাঁর পরিবারও চলে আসে রাজনীতির আলোচনায়"
80
+ expected: "kai saṅge rājanītira pādapradīpa theke sampūrṇa bāire thākā tān̐ra paribārao cale āse rājanītira ālocanāẏa"
81
+ - source: "নির্বাচনী প্রচারের সময় প্রেসিডেন্ট ডোনাল্ড ট্রাম্পের পরিবারের সদস্যরা মাঠে নেমেছেন"
82
+ expected: "nirbācanī pracārera samaẏa presiḍenṭa ḍonālḍa ṭrāmpera paribārera sadasyarā māṭhe nemechena"
83
+ - source: "তাঁরা সমর্থকদের উদ্দেশ্যে বলেছেন, এ নির্বাচন শুধু প্রেসিডেন্ট ডোনাল্ড ট্রাম্পের প্রতি নয়"
84
+ expected: "tān̐rā samarthakadera uddeśye balechena, e nirbācana śudhu presiḍenṭa ḍonālḍa ṭrāmpera prati naẏa"
85
+ - source: "স্মার্টফোন কিনতে ৮ হাজার করে ঋণ পাবেন ৪১৫০১ শিক্ষার্থী"
86
+ expected: "smārṭaphona kinate 8 hājāra kare ṛṇa pābena 41501 śikshārthī"
87
+ - source: "বার্সা সমর্থকদের নিয়ে উদ্‌যাপনের কথা গিলতে হলো ভিদালকে"
88
+ expected: "bārsā samarthakadera niẏe ud‌yāpanera kathā gilate halo bhidālake"
89
+
90
+ map:
91
+
92
+ rules:
93
+ #rule II
94
+ - pattern: ([ক]=?)(?=[\u09be\u09bf\u09c0\u09c1\u09c2\u09c3\u09c7\u09c8\u09cb\u09cc\u09cd])
95
+ result: 'k'
96
+ - pattern: ([খ]=?)(?=[\u09be\u09bf\u09c0\u09c1\u09c2\u09c3\u09c7\u09c8\u09cb\u09cc\u09cd])
97
+ result: 'kh'
98
+ - pattern: ([গ]=?)(?=[\u09be\u09bf\u09c0\u09c1\u09c2\u09c3\u09c7\u09c8\u09cb\u09cc\u09cd])
99
+ result: 'g'
100
+ - pattern: ([ঘ]=?)(?=[\u09be\u09bf\u09c0\u09c1\u09c2\u09c3\u09c7\u09c8\u09cb\u09cc\u09cd])
101
+ result: 'gh'
102
+ - pattern: ([ঙ]=?)(?=[\u09be\u09bf\u09c0\u09c1\u09c2\u09c3\u09c7\u09c8\u09cb\u09cc\u09cd])
103
+ result: 'ṅ'
104
+ - pattern: ([চ]=?)(?=[\u09be\u09bf\u09c0\u09c1\u09c2\u09c3\u09c7\u09c8\u09cb\u09cc\u09cd])
105
+ result: 'c'
106
+ - pattern: ([ছ]=?)(?=[\u09be\u09bf\u09c0\u09c1\u09c2\u09c3\u09c7\u09c8\u09cb\u09cc\u09cd])
107
+ result: 'ch'
108
+ - pattern: ([জ]=?)(?=[\u09be\u09bf\u09c0\u09c1\u09c2\u09c3\u09c7\u09c8\u09cb\u09cc\u09cd])
109
+ result: 'j'
110
+ - pattern: ([ঝ]=?)(?=[\u09be\u09bf\u09c0\u09c1\u09c2\u09c3\u09c7\u09c8\u09cb\u09cc\u09cd])
111
+ result: 'jh'
112
+ - pattern: ([ঞ]=?)(?=[\u09be\u09bf\u09c0\u09c1\u09c2\u09c3\u09c7\u09c8\u09cb\u09cc\u09cd])
113
+ result: 'ñ'
114
+ - pattern: ([ট]=?)(?=[\u09be\u09bf\u09c0\u09c1\u09c2\u09c3\u09c7\u09c8\u09cb\u09cc\u09cd])
115
+ result: 'ṭ'
116
+ - pattern: ([ঠ]=?)(?=[\u09be\u09bf\u09c0\u09c1\u09c2\u09c3\u09c7\u09c8\u09cb\u09cc\u09cd])
117
+ result: 'ṭh'
118
+ - pattern: ([ড]=?)(?=[\u09be\u09bf\u09c0\u09c1\u09c2\u09c3\u09c7\u09c8\u09cb\u09cc\u09cd])
119
+ result: 'ḍ'
120
+ - pattern: ([ড়]=?)(?=[\u09be\u09bf\u09c0\u09c1\u09c2\u09c3\u09c7\u09c8\u09cb\u09cc\u09cd])
121
+ result: 'ṛ'
122
+ - pattern: ([ড়]=?)(?=[\u09be\u09bf\u09c0\u09c1\u09c2\u09c3\u09c7\u09c8\u09cb\u09cc\u09cd])
123
+ result: 'ṛ'
124
+ - pattern: ([ঢ]=?)(?=[\u09be\u09bf\u09c0\u09c1\u09c2\u09c3\u09c7\u09c8\u09cb\u09cc\u09cd])
125
+ result: 'ḍh'
126
+ - pattern: ([ঢ়]=?)(?=[\u09be\u09bf\u09c0\u09c1\u09c2\u09c3\u09c7\u09c8\u09cb\u09cc\u09cd])
127
+ result: 'ṛh'
128
+ - pattern: ([ণ]=?)(?=[\u09be\u09bf\u09c0\u09c1\u09c2\u09c3\u09c7\u09c8\u09cb\u09cc\u09cd])
129
+ result: 'ṇ'
130
+ - pattern: ([ত]=?)(?=[\u09be\u09bf\u09c0\u09c1\u09c2\u09c3\u09c7\u09c8\u09cb\u09cc\u09cd])
131
+ result: 't'
132
+ - pattern: ([ৎ]=?)(?=[\u09be\u09bf\u09c0\u09c1\u09c2\u09c3\u09c7\u09c8\u09cb\u09cc\u09cd])
133
+ result: 'ṯ'
134
+ - pattern: ([থ]=?)(?=[\u09be\u09bf\u09c0\u09c1\u09c2\u09c3\u09c7\u09c8\u09cb\u09cc\u09cd])
135
+ result: 'th'
136
+ - pattern: ([দ]=?)(?=[\u09be\u09bf\u09c0\u09c1\u09c2\u09c3\u09c7\u09c8\u09cb\u09cc\u09cd])
137
+ result: 'd'
138
+ - pattern: ([ধ]=?)(?=[\u09be\u09bf\u09c0\u09c1\u09c2\u09c3\u09c7\u09c8\u09cb\u09cc\u09cd])
139
+ result: 'dh'
140
+ - pattern: ([ন]=?)(?=[\u09be\u09bf\u09c0\u09c1\u09c2\u09c3\u09c7\u09c8\u09cb\u09cc\u09cd])
141
+ result: 'n'
142
+ - pattern: ([প]=?)(?=[\u09be\u09bf\u09c0\u09c1\u09c2\u09c3\u09c7\u09c8\u09cb\u09cc\u09cd])
143
+ result: 'p'
144
+ - pattern: ([ফ]=?)(?=[\u09be\u09bf\u09c0\u09c1\u09c2\u09c3\u09c7\u09c8\u09cb\u09cc\u09cd])
145
+ result: 'ph'
146
+ - pattern: ([ব]=?)(?=[\u09be\u09bf\u09c0\u09c1\u09c2\u09c3\u09c7\u09c8\u09cb\u09cc\u09cd])
147
+ result: 'b'
148
+ - pattern: ([ভ]=?)(?=[\u09be\u09bf\u09c0\u09c1\u09c2\u09c3\u09c7\u09c8\u09cb\u09cc\u09cd])
149
+ result: 'bh'
150
+ - pattern: ([ম]=?)(?=[\u09be\u09bf\u09c0\u09c1\u09c2\u09c3\u09c7\u09c8\u09cb\u09cc\u09cd])
151
+ result: 'm'
152
+ - pattern: ([য]=?)(?=[\u09be\u09bf\u09c0\u09c1\u09c2\u09c3\u09c7\u09c8\u09cb\u09cc\u09cd])
153
+ result: 'y'
154
+ - pattern: ([য়]=?)(?=[\u09be\u09bf\u09c0\u09c1\u09c2\u09c3\u09c7\u09c8\u09cb\u09cc\u09cd])
155
+ result: 'ẏ'
156
+ - pattern: ([য়]=?)(?=[\u09be\u09bf\u09c0\u09c1\u09c2\u09c3\u09c7\u09c8\u09cb\u09cc\u09cd])
157
+ result: 'ẏ'
158
+ - pattern: ([র]=?)(?=[\u09be\u09bf\u09c0\u09c1\u09c2\u09c3\u09c7\u09c8\u09cb\u09cc\u09cd])
159
+ result: 'r'
160
+ - pattern: ([ল]=?)(?=[\u09be\u09bf\u09c0\u09c1\u09c2\u09c3\u09c7\u09c8\u09cb\u09cc\u09cd])
161
+ result: 'l'
162
+ - pattern: ([শ]=?)(?=[\u09be\u09bf\u09c0\u09c1\u09c2\u09c3\u09c7\u09c8\u09cb\u09cc\u09cd])
163
+ result: 'ś'
164
+ - pattern: ([ষ]=?)(?=[\u09be\u09bf\u09c0\u09c1\u09c2\u09c3\u09c7\u09c8\u09cb\u09cc\u09cd])
165
+ result: 'sh'
166
+ - pattern: ([স]=?)(?=[\u09be\u09bf\u09c0\u09c1\u09c2\u09c3\u09c7\u09c8\u09cb\u09cc\u09cd])
167
+ result: 's'
168
+ - pattern: ([হ]=?)(?=[\u09be\u09bf\u09c0\u09c1\u09c2\u09c3\u09c7\u09c8\u09cb\u09cc\u09cd])
169
+ result: 'h'
170
+
171
+ # Rule III
172
+ - pattern: ([ବ]=?)(?=[\u09AC])
173
+ result: 'bba'
174
+ # Rule V
175
+ - pattern: \u0981(?=[কখগঘঙচছজঝঞটঠডডঢঢণতৎথদধন])
176
+ result: "m̐"
177
+ - pattern: (?<=)\u0b01(?=\b)
178
+ result: "m̐"
179
+
180
+ characters:
181
+
182
+ # Vowels and Diphthongs
183
+
184
+ 'অ': 'a'
185
+ 'আ': 'ā'
186
+ 'ই': 'i'
187
+ 'ঈ': 'ī'
188
+ 'উ': 'u'
189
+ 'ঊ': 'ū'
190
+ 'এ': 'e'
191
+ 'ঐ': 'ai'
192
+ 'ও': 'o'
193
+ 'ঔ': 'au'
194
+ 'ঋ': 'ṛ'
195
+ 'ৠ': 'ṝ'
196
+ 'ঌ': 'ḹ'
197
+
198
+ # Consonants
199
+ # Gutturals
200
+ 'ক': 'ka'
201
+ 'খ': 'kha'
202
+ 'গ': 'ga'
203
+ 'ঘ': 'gha'
204
+ 'ঙ': 'ṅa'
205
+
206
+ # Palatals
207
+ 'চ': 'ca'
208
+ 'ছ': 'cha'
209
+ 'জ': 'ja'
210
+ 'ঝ': 'jha'
211
+ 'ঞ': 'ña'
212
+
213
+ # Cerebrals
214
+ 'ট': 'ṭa'
215
+ 'ঠ': 'ṭha'
216
+ 'ড': 'ḍa'
217
+ 'ড়': 'ṛa'
218
+ 'ড়': 'ṛa'
219
+ 'ঢ': 'ḍha'
220
+ 'ঢ়': 'ṛha'
221
+ 'ণ': 'ṇa'
222
+
223
+ # Dentals
224
+ 'ত': 'ta'
225
+ 'ৎ': 'ṯa'
226
+ 'থ': 'tha'
227
+ 'দ': 'da'
228
+ 'ধ': 'dha'
229
+ 'ন': 'na'
230
+
231
+ # Labials
232
+ 'প': 'pa'
233
+ 'ফ': 'pha'
234
+ 'ব': 'ba' # see Note 3
235
+ 'ভ': 'bha'
236
+ 'ম': 'ma'
237
+
238
+ # Semivowels
239
+ 'য': 'ya'
240
+ 'য়': 'ẏa'
241
+ 'য়': 'ẏa'
242
+ 'র': 'ra'
243
+ 'ল': 'la'
244
+
245
+ # Sibilants
246
+ 'শ': 'śa'
247
+ 'ষ': 'sha'
248
+ 'স': 'sa'
249
+
250
+ # Aspirate
251
+ 'হ': 'ha'
252
+
253
+ # Anusvāra
254
+ 'ং': 'ṃ'
255
+
256
+ # Bisarga
257
+ 'ঃ': 'ḥ'
258
+
259
+ # Candrabindu (anunāsika)
260
+ '\u0981': 'n̐'
261
+
262
+ # Abagraha (see Note 5)
263
+ 'ऽ': '’' # (apostrophe)
264
+
265
+
266
+ # Medials # Needed for connecting consonants
267
+
268
+ '\u09be': 'ā'
269
+ '\u09bf': 'i'
270
+ '\u09c0': 'ī'
271
+ '\u09c1': 'u'
272
+ '\u09c2': 'ū'
273
+ '\u09c3': 'ṛ'
274
+ '\u09c7': 'e'
275
+ '\u09c8': 'ai'
276
+ '\u09cb': 'o'
277
+ '\u09cc': 'au'
278
+ '\u09cd': ''
279
+
280
+ #Numbers
281
+
282
+ '০': '0'
283
+ '১': '1'
284
+ '২': '2'
285
+ '৩': '3'
286
+ '৪': '4'
287
+ '৫': '5'
288
+ '৬': '6'
289
+ '৭': '7'
290
+ '৮': '8'
291
+ '৯': '9'