interscript 0.1.7 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.adoc +1 -3
- data/aliases.json +1 -0
- data/lib/interscript.rb +8 -3
- data/lib/interscript/fs.rb +27 -0
- data/lib/interscript/mapping.rb +3 -1
- data/lib/interscript/opal.rb +142 -3
- data/lib/interscript/opal/entrypoint.rb +8 -0
- data/lib/interscript/opal/exports.rb +11 -0
- data/lib/interscript/opal/maps.js.erb +2 -4
- data/lib/interscript/version.rb +1 -1
- data/maps/alalc-ara-Arab-Latn-1997.yaml +5 -5
- data/maps/alalc-asm-Deva-Latn-1997.yaml +104 -10
- data/maps/alalc-asm-Deva-Latn-2012.yaml +18 -3
- data/maps/alalc-aze-Arab-Latn-1997.yaml +376 -0
- data/maps/alalc-ben-Beng-Latn-1997.yaml +291 -0
- data/maps/alalc-div-Thaa-Latn-1997.yaml +211 -0
- data/maps/alalc-hin-Deva-Latn-1997.yaml +102 -10
- data/maps/alalc-hin-Deva-Latn-2011.yaml +19 -1
- data/maps/alalc-kan-Kana-Latn-1997.yaml +274 -0
- data/maps/alalc-kan-Kana-Latn-2011.yaml +63 -0
- data/maps/alalc-ori-Orya-Latn-1997.yaml +284 -0
- data/maps/alalc-ori-Orya-Latn-2011.yaml +67 -0
- data/maps/alalc-pra-Deva-Latn-2012.yaml +2 -2
- data/maps/alalc-san-Deva-Latn-2012.yaml +78 -9
- data/maps/alalc-tel-Telu-Latn-1997.yaml +284 -0
- data/maps/alalc-tel-Telu-Latn-2011.yaml +64 -0
- data/maps/az-aze-Cyrl-Latn-1939.yaml +105 -0
- data/maps/az-aze-Cyrl-Latn-1958.yaml +45 -0
- data/maps/bgnpcgn-ara-Arab-Latn-1956.yaml +3 -1
- data/maps/bgnpcgn-aze-Cyrl-Latn-1993.yaml +111 -104
- data/maps/bgnpcgn-bal-Arab-Latn-2008.yaml +329 -0
- data/maps/bgnpcgn-bul-Cyrl-Latn-1952.yaml +1 -1
- data/maps/bgnpcgn-div-Thaa-Latn-1988.yaml +75 -0
- data/maps/bgnpcgn-far-Latn-Latn-1964.yaml +28 -0
- data/maps/bgnpcgn-isl-Latn-Latn-1964.yaml +37 -0
- data/maps/bgnpcgn-kaz-Cyrl-Latn-1979.yaml +247 -0
- data/maps/bgnpcgn-kir-Cyrl-Latn-1979.yaml +218 -0
- data/maps/bgnpcgn-kur-Arab-Latn-2007.yaml +249 -0
- data/maps/bgnpcgn-per-Arab-Latn-1958.yaml +2 -0
- data/maps/bgnpcgn-prs-Arab-Latn-2007.yaml +87 -53
- data/maps/bgnpcgn-pus-Arab-Latn-1968.yaml +377 -0
- data/maps/bgnpcgn-srp-Cyrl-Latn-1962.yaml +73 -0
- data/maps/bgnpcgn-urd-Arab-Latn-2007.yaml +459 -0
- data/maps/{bis-knd-Knda-Latn-13194-1991.yaml → bis-kan-Kana-Latn-13194-1991.yaml} +2 -2
- data/maps/bis-ori-Orya-Latn-13194-1991.yaml +17 -2
- data/maps/iso-ara-Arab-Latn-233-1984.yaml +1 -1
- data/maps/{iso-kan-Knda-Latn-15919-2001.yaml → iso-kan-Kana-Latn-15919-2001.yaml} +1 -1
- data/maps/{mns-mon-Cyrl-Latn-5217-2012.yaml → masm-mon-Cyrl-Latn-5217-2012.yaml} +2 -2
- data/maps/{mns-mon-Latn-Cyrl-5217-2012.yaml → masm-mon-Latn-Cyrl-5217-2012.yaml} +1 -1
- data/maps/mv-div-Thaa-Latn-1987.yaml +200 -0
- data/maps/odni-ara-Arab-Latn-2004.yaml +137 -0
- data/maps/odni-ara-Arab-Latn-2015.yaml +20 -130
- data/maps/odni-bul-Cyrl-Latn-2005.yaml +90 -0
- data/maps/odni-fas-Arab-Latn-2004.yaml +276 -0
- data/maps/odni-hin-Deva-Latn-2004.yaml +182 -0
- data/maps/odni-mkd-Cyrl-Latn-2005.yaml +21 -0
- data/maps/odni-prs-Arab-Latn-2004.yaml +123 -0
- data/maps/{odni-per-Arab-Latn-2015.yaml → odni-prs-Arab-Latn-2015.yaml} +0 -0
- data/maps/odni-srp-Cyrl-Latn-2005.yaml +36 -0
- data/maps/odni-tuk-Cyrl-Latn-2015.yaml +170 -0
- data/maps/odni-ukr-Cyrl-Latn-2015.yaml +4 -0
- data/maps/un-ara-Arab-Latn-2017.yaml +1 -1
- data/maps/un-asm-Beng-Latn-1972.yaml +223 -0
- data/maps/un-guj-Gujr-Latn-1972.yaml +229 -0
- data/maps/un-hin-Deva-Latn-2016.yaml +104 -10
- data/maps/un-kan-Kana-Latn-2016.yaml +254 -0
- data/maps/un-mal-Mlym-Latn-1972.yaml +251 -0
- data/maps/un-mar-Deva-Latn-2016.yaml +24 -13
- data/maps/un-nep-Deva-Latn-1972.yaml +40 -121
- data/maps/un-ori-Orya-Latn-1972.yaml +247 -0
- data/maps/un-pan-Guru-Latn-1972.yaml +402 -0
- data/maps/un-prs-Arab-Latn-1967.yaml +236 -0
- data/maps/un-tam-Taml-Latn-1972.yaml +194 -0
- data/maps/un-tel-Telu-Latn-1972.yaml +270 -0
- data/maps/un-urd-Arab-Latn-1972.yaml +405 -0
- data/maps/var-amh-Ethi-Latn-eae-2003.yaml +466 -0
- data/maps/var-gez-Ethi-Latn-eae-2003.yaml +76 -0
- data/spec/interscript/filenames_spec.rb +6 -369
- data/spec/interscript_spec.rb +10 -2
- metadata +50 -7
- data/lib/interscript/opal/map_translate.rb +0 -7
|
@@ -0,0 +1,377 @@
|
|
|
1
|
+
---
|
|
2
|
+
authority_id: bgnpcgn
|
|
3
|
+
id: 1968
|
|
4
|
+
language: iso-639-2:pus
|
|
5
|
+
source_script: Arab
|
|
6
|
+
destination_script: Latn
|
|
7
|
+
name: BGN/PCGN Romanization System -- Pashto (1968)
|
|
8
|
+
url: https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/693760/ROMANIZATION_OF_PASHTO.pdf
|
|
9
|
+
creation_date: 1968
|
|
10
|
+
confirmation_date: 2017-11
|
|
11
|
+
description: |
|
|
12
|
+
Pashto is an Indo-Iranian language and is one of two
|
|
13
|
+
nationally official languages in Afghanistan and one of
|
|
14
|
+
five regionally recognised languages in Pakistan. The
|
|
15
|
+
romanization system presented here may be applied to all
|
|
16
|
+
Pashto geographical names. Although the BGN/PCGN policy for
|
|
17
|
+
geographical names in Afghanistan is to apply the BGN/PCGN
|
|
18
|
+
national system of romanization for Afghanistan (2007),
|
|
19
|
+
which incorporates Dari elements, when applied to a Pashto
|
|
20
|
+
geographical name, the romanized results of the BGN/PCGN
|
|
21
|
+
national system for Afghanistan are the same as those of
|
|
22
|
+
this Pashto romanization system. The Pashto alphabet uses
|
|
23
|
+
a modified form of the Perso-Arabic script, and contains
|
|
24
|
+
twelve additional consonants not present in standard
|
|
25
|
+
Arabic, as well as three additional vowel characters and an
|
|
26
|
+
additional vowel point. Consonants: پ ټ چ څ ځ ډ ړ ږ ژ ښ گ ڼ
|
|
27
|
+
Vowels: ى ۍ ې; Vowel Point: ٙ. The points used in Arabic to
|
|
28
|
+
mark short vowels and certain other diacritical marks are
|
|
29
|
+
not written in Pashto. Consequently, a reference source may
|
|
30
|
+
sometimes be required to aid correct identification of the
|
|
31
|
+
standard spellings and proper vowels and elimination of
|
|
32
|
+
dialectal and idiosyncratic variations. In the interests of
|
|
33
|
+
clarity, a column showing vowel pointing from Arabic to
|
|
34
|
+
indicate short vowels has been included in the examples
|
|
35
|
+
below, alongside the unpointed form that will usually be
|
|
36
|
+
encountered. However it should be noted that the
|
|
37
|
+
pronunciation of short vowels will vary. (Note: it is
|
|
38
|
+
recommended that a font such as Scheherazade, available
|
|
39
|
+
from www.sil.org, which includes the Unicode extended
|
|
40
|
+
Arabic sub-range, be used to view this system.)
|
|
41
|
+
|
|
42
|
+
notes:
|
|
43
|
+
- 1. Alif ( ا ) should be romanized as follows
|
|
44
|
+
a. Initially, it indicates that the word begins with a vowel or
|
|
45
|
+
diphthong; the alif itself is not romanized, but rather the
|
|
46
|
+
short vowel it “carries” is romanized; e.g., Aslam Zhrandah
|
|
47
|
+
ه َد ن ژر سلَم َأ ميړ → b. When it carries a
|
|
48
|
+
maddah ()آ (see vowel table, row 3), it represents ā;
|
|
49
|
+
e.g., Band. Mīṟ د ن ب َ آب → Āb c. Medially and
|
|
50
|
+
finally it represents ā (see table 2, row 2); e.g., ۍ
|
|
51
|
+
ماڼ → Māṉêy d. Medially and finally in words of Arabic
|
|
52
|
+
origin, alif may serve as the bearer of hamzah, e.g.
|
|
53
|
+
رأس → ra’s. See also note 4.
|
|
54
|
+
|
|
55
|
+
- 2. The characters tsē ( څ ) and dzē ( ځ ) may be
|
|
56
|
+
romanized t͡ s and d͡ z (the combining double breve (
|
|
57
|
+
Unicode 0361) appearing over the digraph) when for special
|
|
58
|
+
reasons it is desired that confusion be avoided between
|
|
59
|
+
ت (t) plus س (s) and between د (d) plus ز (z),
|
|
60
|
+
respectively.
|
|
61
|
+
|
|
62
|
+
- 3. Occasionally the character sequences ه ك , ه ز ,
|
|
63
|
+
ه س , and ه گ occur . They may be romanized k·h, z·
|
|
64
|
+
h, s·h, and g·h in order to differentiate these
|
|
65
|
+
romanizations from the digraphs kh, zh, sh, and gh, which
|
|
66
|
+
are used to represent the characters خ , ژ, ش , and
|
|
67
|
+
غ respectively .
|
|
68
|
+
|
|
69
|
+
- 4. Hamzah ( ء ) should be romanized as follows a. In
|
|
70
|
+
word-initial position, where it will appear either above or
|
|
71
|
+
below alif ( أ or إ ), it indicates a short vowel and should
|
|
72
|
+
not itself be romanized. In other positions it should be
|
|
73
|
+
romanized by an apostrophe, e.g. جُزء → juz’. b.
|
|
74
|
+
Yeh with hamzah ( ئ ) should be romanized êy, unless it
|
|
75
|
+
represents the compound (iẕāfah) morpheme, in which case it
|
|
76
|
+
is romanized according to note 9 below.
|
|
77
|
+
|
|
78
|
+
- 5. The division of words utilized in Pashto writing is
|
|
79
|
+
followed in romanization, except that the elements –ābād, -
|
|
80
|
+
khwā, -shahr, -zādah, -zay and -ullāh are always romanized
|
|
81
|
+
as part of the preceding word, e.g. آباد ت م َ ْح
|
|
82
|
+
ر َ → Raḩmatābād and الله ت م َ ْح ر َ →
|
|
83
|
+
Raḩmatullāh. However, when the word for God ( الله )
|
|
84
|
+
appears as a standalone word it should be written Allāh.
|
|
85
|
+
Note also the “dagger alif” ( ٙ) above the second ل (lām)
|
|
86
|
+
in the word الله ; this, like the short vowels, is not
|
|
87
|
+
written in Pashto but should be romanized ā, like a full-
|
|
88
|
+
size alif. Persian derivational endings such as –vand and
|
|
89
|
+
endings of Turkish origin such as –lar, -lī, -lū, -i, -u, -
|
|
90
|
+
si, and –su, should be written together with the preceding
|
|
91
|
+
word.
|
|
92
|
+
|
|
93
|
+
- 6. The Pashto preposition د should be romanized dê in
|
|
94
|
+
agreement with its pronunciation, despite the fact that
|
|
95
|
+
it is sometimes pointed with kasrah ( ٙ ).
|
|
96
|
+
|
|
97
|
+
- 7. In names of Arabic origin, the l of the definite article
|
|
98
|
+
al/ul is assimilated before the ‘sun letters’ t, s̄ , d,
|
|
99
|
+
z̄ , r, z, s, sh, ş, ẕ, ţ, z̧ , l and n. In romanization,
|
|
100
|
+
the article will be written al or its assimilated
|
|
101
|
+
equivalent in name-initial position but ul or its
|
|
102
|
+
assimilated equivalent elsewhere; the article should be
|
|
103
|
+
separated from the name it precedes and should not be
|
|
104
|
+
capitalized, except at the beginning of a name, e.g. جَبَل
|
|
105
|
+
السَرَاج → Jabal us Sarāj
|
|
106
|
+
|
|
107
|
+
- 8. In Arabic names, a shaddah, ٙ is used to denote the
|
|
108
|
+
doubling of a particular consonant character, e.g. مَّد
|
|
109
|
+
َح م ُ → Muḩammad. However, in Pashto this ‘doubling’
|
|
110
|
+
is frequently omitted in both Perso-Arabic script and the
|
|
111
|
+
resulting romanization. Guidance on doubling may be taken
|
|
112
|
+
from an authoritative names source, such as an Afghan
|
|
113
|
+
government source or Pashto dictionary; for example, it is
|
|
114
|
+
usual to see Ḩājī without and ‘Abbās with the doubled
|
|
115
|
+
consonant. The doubled y consonant is almost always
|
|
116
|
+
retained, as in Sayyid or Qayyūm
|
|
117
|
+
|
|
118
|
+
- 9. The iẕāfah morpheme is not a grammatical feature of
|
|
119
|
+
Pashto and, if encountered in a linguistically hybrid
|
|
120
|
+
geographical name (i.e. combining features of both Pashto
|
|
121
|
+
and Dari), it should be treated according to the BGN/PCGN
|
|
122
|
+
national system of romanization for Afghanistan, 2007, as –
|
|
123
|
+
e, unless the preceding word ends with a silent heh ()ه
|
|
124
|
+
or a vowel when it should be shown –ye, e.g.
|
|
125
|
+
غر حِصار → Ghar-e Ḩişār;
|
|
126
|
+
قَلعَهٔ نَو →
|
|
127
|
+
Qal‘ah-ye Now.
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
- 10. The character sequence خو when followed by ا or
|
|
131
|
+
ی ,should be romanized khw, although the w is either not
|
|
132
|
+
pronounced, or only weakly pronounced; e.g. خواجه →
|
|
133
|
+
khwājah.
|
|
134
|
+
|
|
135
|
+
- 11. An inventory of letter-diacritic combinations in addition to the unmodified letters of the
|
|
136
|
+
basic Roman script is
|
|
137
|
+
‘ (U+2018)
|
|
138
|
+
ʼ (U+2019)
|
|
139
|
+
Ā (U+0100)
|
|
140
|
+
ā (U+0101)
|
|
141
|
+
Á (U+00C1)
|
|
142
|
+
á (U+00E1)
|
|
143
|
+
Ḏ (U+0044+0331)
|
|
144
|
+
ḏ (U+0064+0331)
|
|
145
|
+
Ē (U+0112)
|
|
146
|
+
ē (U+0113)
|
|
147
|
+
Ê (U+00CA)
|
|
148
|
+
ê (U+00EA)
|
|
149
|
+
Ḩ (U+1E28)
|
|
150
|
+
ḩ (U+1E29)
|
|
151
|
+
Ī (U+012A)
|
|
152
|
+
ī (U+012B)
|
|
153
|
+
N̄ (U+004E+0304)
|
|
154
|
+
n̄ (U+006E+0304)
|
|
155
|
+
Ō (U+014C)
|
|
156
|
+
ō (U+014D)
|
|
157
|
+
Ṟ (U+0052+0331)
|
|
158
|
+
ṟ (U+0072+0331)
|
|
159
|
+
Ş (U+015E)
|
|
160
|
+
ş (U+015F)
|
|
161
|
+
S̄ (U+0053+0304)
|
|
162
|
+
s̄ (U+0073+0304)
|
|
163
|
+
Ṯ (U+0054+0331)
|
|
164
|
+
ṯ (U+0074+0331)
|
|
165
|
+
Ţ (U+0162)
|
|
166
|
+
ţ (U+0163)
|
|
167
|
+
Ū (U+016A)
|
|
168
|
+
ū (U+016B)
|
|
169
|
+
Z̧ (U+005A+0327)
|
|
170
|
+
z̧ (U+007A+0327)
|
|
171
|
+
Z̄ (U+005A+0304)
|
|
172
|
+
z̄ (U+007A+0304)
|
|
173
|
+
Ẕ (U+005A+0331)
|
|
174
|
+
ẕ (U+007A+0331)
|
|
175
|
+
Z͟ H (U+005A+0048+035F)
|
|
176
|
+
z͟ h (U+007A+0068+035F)
|
|
177
|
+
|
|
178
|
+
tests:
|
|
179
|
+
- source: بَغْلان
|
|
180
|
+
expected: Baghlān
|
|
181
|
+
|
|
182
|
+
- source: پُوټَكَى
|
|
183
|
+
expected: Pōṯakay
|
|
184
|
+
|
|
185
|
+
- source: شِيرِين تَگَاب
|
|
186
|
+
expected: Shīrīn Tagāb
|
|
187
|
+
|
|
188
|
+
- source: کُوْټ
|
|
189
|
+
expected: Kōṯ
|
|
190
|
+
|
|
191
|
+
- source: ثَابِر
|
|
192
|
+
expected: S̄ābir
|
|
193
|
+
|
|
194
|
+
- source: جَلال آبَاد
|
|
195
|
+
expected: Jalālābād
|
|
196
|
+
|
|
197
|
+
- source: چَارِيكَار
|
|
198
|
+
expected: Chārīkār
|
|
199
|
+
|
|
200
|
+
- source: ځَدْرَاڼ
|
|
201
|
+
expected: Dzadrāṉ
|
|
202
|
+
|
|
203
|
+
- source: څَوکۍ
|
|
204
|
+
expected: Tsowkêy
|
|
205
|
+
|
|
206
|
+
- source: حَضْرَتِ إِمَام
|
|
207
|
+
expected: Ḩaẕrat-e Imām
|
|
208
|
+
|
|
209
|
+
- source: خُوْسْت
|
|
210
|
+
expected: Khōst
|
|
211
|
+
|
|
212
|
+
- source: سْپِين بُوْلْدَک
|
|
213
|
+
expected: Spīn Bōldak
|
|
214
|
+
|
|
215
|
+
- source: ډَنْډ وَ پَتَان
|
|
216
|
+
expected: Ḏanḏ Wa Patān
|
|
217
|
+
|
|
218
|
+
- source: كَنْدَهَار
|
|
219
|
+
expected: Kandahār
|
|
220
|
+
|
|
221
|
+
- source: أَنْدَړ
|
|
222
|
+
expected: Andaṟ
|
|
223
|
+
|
|
224
|
+
- source: كُنْدُز
|
|
225
|
+
expected: Kunduz
|
|
226
|
+
|
|
227
|
+
- source: مِير أَسْلَم ژْرَنْدَه
|
|
228
|
+
expected: Mīr Aslam Zhrandah
|
|
229
|
+
|
|
230
|
+
- source: ږِيرَه
|
|
231
|
+
expected: Z͟hīrah
|
|
232
|
+
|
|
233
|
+
- source: سَمَنْگَان
|
|
234
|
+
expected: Samangān
|
|
235
|
+
|
|
236
|
+
- source: كښٙتَه كَلا
|
|
237
|
+
expected: Ks͟hêtah Kalā
|
|
238
|
+
|
|
239
|
+
- source: قَيْصَار
|
|
240
|
+
expected: Qayşār
|
|
241
|
+
|
|
242
|
+
- source: فَيض آبَاد
|
|
243
|
+
expected: Faīẕābād
|
|
244
|
+
|
|
245
|
+
- source: حَضْرَتِ سُلْطَان
|
|
246
|
+
expected: Ḩaẕrat-e Sulţān
|
|
247
|
+
|
|
248
|
+
- source: ظَاهِر كَلا
|
|
249
|
+
expected: Z̧āhir Kalā
|
|
250
|
+
|
|
251
|
+
- source: پُلِ عَلَم
|
|
252
|
+
expected: Pul-e ‘Alam
|
|
253
|
+
|
|
254
|
+
- source: غَزْنِي
|
|
255
|
+
expected: Ghaznī
|
|
256
|
+
|
|
257
|
+
- source: مَزَارِ شَرِيف
|
|
258
|
+
expected: Mazār-e Sharīf
|
|
259
|
+
|
|
260
|
+
- source: قَيْصَار
|
|
261
|
+
expected: Qayşār
|
|
262
|
+
|
|
263
|
+
- source: كَنْدَهَار
|
|
264
|
+
expected: Kandahār
|
|
265
|
+
|
|
266
|
+
- source: گَرْدېز
|
|
267
|
+
expected: Gardēz
|
|
268
|
+
|
|
269
|
+
- source: کَابُل
|
|
270
|
+
expected: Kābul
|
|
271
|
+
|
|
272
|
+
- source: مَيمَنَه
|
|
273
|
+
expected: Maīmanah
|
|
274
|
+
|
|
275
|
+
- source: خَان آبَاد
|
|
276
|
+
expected: Khānābād
|
|
277
|
+
|
|
278
|
+
- source: مَاڼۍ
|
|
279
|
+
expected: Māṉêy
|
|
280
|
+
|
|
281
|
+
- source: وَاخَان
|
|
282
|
+
expected: Wākhān
|
|
283
|
+
|
|
284
|
+
- source: يَنْگِي قَلعَه
|
|
285
|
+
expected: Yangī Qal‘ah
|
|
286
|
+
|
|
287
|
+
- source: جَلال آبَاد
|
|
288
|
+
expected: Jalālābād
|
|
289
|
+
|
|
290
|
+
- source: مُرْغَاب کَابُل
|
|
291
|
+
expected: Murghāb Kābul
|
|
292
|
+
|
|
293
|
+
- source: گٙردُون
|
|
294
|
+
expected: Gêrdōn
|
|
295
|
+
|
|
296
|
+
- source: آب بَنْد
|
|
297
|
+
expected: Āb Band
|
|
298
|
+
|
|
299
|
+
- source: سْپِين بُوْلْدَک
|
|
300
|
+
expected: Spīn Bōldak
|
|
301
|
+
|
|
302
|
+
- source: جَوزجَان
|
|
303
|
+
expected: Jowzjān
|
|
304
|
+
|
|
305
|
+
- source: گَرْدېز
|
|
306
|
+
expected: Gardēz
|
|
307
|
+
|
|
308
|
+
- source: مَیدان شَهْر
|
|
309
|
+
expected: Maīdān Shahr
|
|
310
|
+
|
|
311
|
+
- source: ډَنْډِ سُفْلىٰ
|
|
312
|
+
expected: Ḏanḏ-e Suflá
|
|
313
|
+
|
|
314
|
+
- source: جَبَل السَرَاج
|
|
315
|
+
expected: Jabal us Sarāj
|
|
316
|
+
map:
|
|
317
|
+
inherit: bgnpcgn-prs-Arab-Latn-2007
|
|
318
|
+
postrules:
|
|
319
|
+
- pattern: (?<=\b)(?<!\b[‘|’|'|-])[\u0061-\uFFFF]
|
|
320
|
+
result: "upcase"
|
|
321
|
+
# don't capitalize definite article in the middle of a sentence
|
|
322
|
+
- pattern : ' Ut T' # الت
|
|
323
|
+
result: ' ut T'
|
|
324
|
+
- pattern : ' Us̄ S̄' # الث
|
|
325
|
+
result: ' us̄ S̄'
|
|
326
|
+
- pattern : ' Ud D' # الد
|
|
327
|
+
result: ' ud D'
|
|
328
|
+
- pattern : ' Uz̄ Z̄' # الذ
|
|
329
|
+
result: ' uz̄ Z̄'
|
|
330
|
+
- pattern : ' Ur R' # الر
|
|
331
|
+
result: ' ur R'
|
|
332
|
+
- pattern : ' Uz Z' # الز
|
|
333
|
+
result: ' uz Z'
|
|
334
|
+
- pattern : ' Us S' # الس
|
|
335
|
+
result: ' us S'
|
|
336
|
+
- pattern : ' Ush Sh' # الش
|
|
337
|
+
result: ' ush Sh'
|
|
338
|
+
- pattern : ' Uş Ş' # الص
|
|
339
|
+
result: ' uş Ş'
|
|
340
|
+
- pattern : ' Uẕ Ẕ' # الض
|
|
341
|
+
result: ' uẕ Ẕ'
|
|
342
|
+
- pattern : ' Uţ Ţ' # الط
|
|
343
|
+
result: ' uţ Ţ'
|
|
344
|
+
- pattern : ' Uz̧ Z̧' # الظ
|
|
345
|
+
result: ' uz̧ Z̧'
|
|
346
|
+
- pattern : ' Ul L' # الل
|
|
347
|
+
result: ' ul L'
|
|
348
|
+
- pattern : ' Un N' # الن
|
|
349
|
+
result: ' un N'
|
|
350
|
+
characters:
|
|
351
|
+
|
|
352
|
+
'\u0650': 'i' # ِ kasra
|
|
353
|
+
'\u064f': 'u' # ُ damma
|
|
354
|
+
|
|
355
|
+
'\u0650\b' : '-e' # ِ kasra
|
|
356
|
+
|
|
357
|
+
'\s\u0627\u0644\u0644\u0651\u064e\u0647' : 'ullāh' # Note5
|
|
358
|
+
'\u0652' : '' # ْ sokoon
|
|
359
|
+
'\u0659': 'ê'
|
|
360
|
+
|
|
361
|
+
# Sun letters
|
|
362
|
+
'\b\u0627\u0644\u062a' : 'ut t' # الت
|
|
363
|
+
'\b\u0627\u0644\u062b' : 'us̄ s̄' # الث
|
|
364
|
+
'\b\u0627\u0644\u062f' : 'ud d' # الد
|
|
365
|
+
'\b\u0627\u0644\u0630' : 'uz̄ z̄' # الذ
|
|
366
|
+
'\b\u0627\u0644\u0631' : 'ur r' # الر
|
|
367
|
+
'\b\u0627\u0644\u0632' : 'uz z' # الز
|
|
368
|
+
'\b\u0627\u0644\u0633' : 'us s' # الس
|
|
369
|
+
'\b\u0627\u0644\u0634' : 'ush sh' # الش
|
|
370
|
+
'\b\u0627\u0644\u0635' : 'uş ş' # الص
|
|
371
|
+
'\b\u0627\u0644\u0636' : 'uẕ ẕ' # الض
|
|
372
|
+
'\b\u0627\u0644\u0637' : 'uţ ţ' # الط
|
|
373
|
+
'\b\u0627\u0644\u0638' : 'uz̧ z̧' # الظ
|
|
374
|
+
'\b\u0627\u0644\u0644' : 'ul l' # الل
|
|
375
|
+
'\b\u0627\u0644\u0646' : 'un n' # الن
|
|
376
|
+
|
|
377
|
+
'\u0626': 'êy' # ئ
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
---
|
|
2
|
+
authority_id: bgnpcgn
|
|
3
|
+
id: 1962
|
|
4
|
+
language: iso-639-2:srp
|
|
5
|
+
source_script: Cyrl
|
|
6
|
+
destination_script: Latn
|
|
7
|
+
name: TRANSLITERATION OF SERBIAN CYRILLIC
|
|
8
|
+
creation_date: 1962
|
|
9
|
+
confirmation_date: 1962
|
|
10
|
+
description: |
|
|
11
|
+
Serbo-Croatian, the official national language of Yugoslavia, is a single literary language.
|
|
12
|
+
In Serbian areas it is written in the cyrillic (Serbian) alphabet, while in Croatian areas it is
|
|
13
|
+
written in the roman (Croatian) alphabet.
|
|
14
|
+
Both the BGN and PCGN use the standard Croatian equivalents for romanizing the Serbian cyrillic
|
|
15
|
+
alphabet whenever romanized names are not available.
|
|
16
|
+
|
|
17
|
+
notes:
|
|
18
|
+
- The digraph dj(Dj) will occasionally be found as the Croatian equivalent of ђ(Ђ),
|
|
19
|
+
but the use of dj should be limited to those instances where it is found in roman sources.
|
|
20
|
+
|
|
21
|
+
tests:
|
|
22
|
+
- source: Шупља Стена
|
|
23
|
+
expected: Šuplja Stena
|
|
24
|
+
- source: Чукарица
|
|
25
|
+
expected: Čukarica
|
|
26
|
+
- source: Црна Трава
|
|
27
|
+
expected: Crna Trava
|
|
28
|
+
- source: Херцег Нови
|
|
29
|
+
expected: Herceg Novi
|
|
30
|
+
- source: Улцињ
|
|
31
|
+
expected: Ulcinj
|
|
32
|
+
- source: Ужице
|
|
33
|
+
expected: Užice
|
|
34
|
+
- source: Тресаначка Река
|
|
35
|
+
expected: Tresanačka Reka
|
|
36
|
+
- source: Сјеница
|
|
37
|
+
expected: Sjenica
|
|
38
|
+
- source: Рожаје
|
|
39
|
+
expected: Rožaje
|
|
40
|
+
- source: Пљевља
|
|
41
|
+
expected: Pljevlja
|
|
42
|
+
- source: Оџаци
|
|
43
|
+
expected: Odžaci
|
|
44
|
+
- source: Никшић
|
|
45
|
+
expected: Nikšić
|
|
46
|
+
- source: Медвеђа
|
|
47
|
+
expected: Medveđa
|
|
48
|
+
- source: Лозница
|
|
49
|
+
expected: Loznica
|
|
50
|
+
- source: Књажевац
|
|
51
|
+
expected: Knjaževac
|
|
52
|
+
- source: Зрењанин
|
|
53
|
+
expected: Zrenjanin
|
|
54
|
+
- source: Житорађа
|
|
55
|
+
expected: Žitorađa
|
|
56
|
+
- source: Ервеник
|
|
57
|
+
expected: Ervenik
|
|
58
|
+
- source: Доње Љупче
|
|
59
|
+
expected: Donje Ljupče
|
|
60
|
+
- source: Гусиње
|
|
61
|
+
expected: Gusinje
|
|
62
|
+
- source: ГУСИЊЕ
|
|
63
|
+
expected: GUSINJE
|
|
64
|
+
- source: Врњачка Бања
|
|
65
|
+
expected: Vrnjačka Banja
|
|
66
|
+
- source: Бијело Поље
|
|
67
|
+
expected: Bijelo Polje
|
|
68
|
+
- source: Алибунар
|
|
69
|
+
expected: Alibunar
|
|
70
|
+
|
|
71
|
+
map:
|
|
72
|
+
inherit: bgnpcgn-srp-Cyrl-Latn-2005
|
|
73
|
+
postrules:
|
|
@@ -0,0 +1,459 @@
|
|
|
1
|
+
---
|
|
2
|
+
authority_id: bgnpcgn
|
|
3
|
+
id: 2007
|
|
4
|
+
language: iso-639-2:urd
|
|
5
|
+
source_script: Arab
|
|
6
|
+
destination_script: Latn
|
|
7
|
+
name: BGN/PCGN Romanization System -- Urdu (2007)
|
|
8
|
+
alias:
|
|
9
|
+
ogc11122:
|
|
10
|
+
code: uas_Arab2Latn_BGN_2007
|
|
11
|
+
description: Unified Afghan Romanization System US Board on Geographic Names (BGN)/The Permanent Committee on Geographical Names (PCGN) 2007
|
|
12
|
+
url: https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/693788/ROMANIZATION_OF_URDU.pdf
|
|
13
|
+
creation_date: 2007
|
|
14
|
+
confirmation_date: 2017-11
|
|
15
|
+
description: |
|
|
16
|
+
The following is the approved romanization system for
|
|
17
|
+
deriving standard spellings of Urdu geographical names for
|
|
18
|
+
Pakistan. It was jointly adopted by BGN and PCGN at the
|
|
19
|
+
23rd BGN/PCGN Conference in Washington, DC, in 2007 and it
|
|
20
|
+
is based on the Hunterian romanization system for Urdu,
|
|
21
|
+
which has been used by the Surveys of India and Pakistan
|
|
22
|
+
for romanizing Urdu geographical names for more than one
|
|
23
|
+
hundred years. The BGN/PCGN system laid out below includes
|
|
24
|
+
diacritical marks in order that the original script can be
|
|
25
|
+
derived from the romanized form (i.e. it is reversible).
|
|
26
|
+
For desk users requiring a diacritic-free form, these
|
|
27
|
+
diacritics can simply be removed. In every case the same
|
|
28
|
+
basic Roman-script characters are kept as are used in the
|
|
29
|
+
Hunterian system. The BGN/PCGN forms have further been
|
|
30
|
+
designed to harmonize with the BGN/PCGN Persian
|
|
31
|
+
romanization system.
|
|
32
|
+
notes:
|
|
33
|
+
- 1. When the vowel sign zīr ( ِ) occurs word-finally in the
|
|
34
|
+
first element of a compound, it is assumed to mark the
|
|
35
|
+
Persian izafat
|
|
36
|
+
morpheme, and is romanized -e, not i.
|
|
37
|
+
- 2. The source of almost all example names is the 1951
|
|
38
|
+
Census of Pakistan, Village List, Northwest Frontier
|
|
39
|
+
Province, Chitral
|
|
40
|
+
State. Office of the Provincial Superintendent of Census,
|
|
41
|
+
North-West Frontier Province, Peshawar.
|
|
42
|
+
- 3. No examples of aspirated dental r (rh, رھ) were found,
|
|
43
|
+
though this phoneme is assumed to be part of the phonology
|
|
44
|
+
of
|
|
45
|
+
Urdu, and was therefore left out of Table 2.
|
|
46
|
+
- 4. Note that the short vowels in the Urdu examples are not
|
|
47
|
+
pointed.
|
|
48
|
+
- 5. Occasionally, sequences of /z/ or /s/ plus /h/ may be
|
|
49
|
+
encountered, i.e. z·h, s·h. These may be romanized with the
|
|
50
|
+
Unicode
|
|
51
|
+
'center dot' (U+00B7) separating the two letters, to
|
|
52
|
+
distinguish them from the digraphs /zh/ and /sh/.
|
|
53
|
+
- Commented tests are blocked by this issue https://github.com/interscript/interscript/issues/572
|
|
54
|
+
depends on the different ways of handling ي to y or e AND و to u or o
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
tests:
|
|
58
|
+
# - source: بوغدِی
|
|
59
|
+
# expected: Boghdī
|
|
60
|
+
|
|
61
|
+
- source: پَالِير
|
|
62
|
+
expected: Pālīr
|
|
63
|
+
|
|
64
|
+
# - source: بیزوت كَلے
|
|
65
|
+
# expected: Bezot Kale
|
|
66
|
+
|
|
67
|
+
# - source: عَمَل كوٹ
|
|
68
|
+
# expected: ‘Amal Koṭ
|
|
69
|
+
|
|
70
|
+
- source: ثَابِر
|
|
71
|
+
expected: S̄ābir
|
|
72
|
+
|
|
73
|
+
- source: شَاه نَثَار ميلة
|
|
74
|
+
expected: Shāh Nas̄ār Mylah
|
|
75
|
+
|
|
76
|
+
# - source: بَرجُو ميلَه
|
|
77
|
+
# expected: Barjū Melah
|
|
78
|
+
|
|
79
|
+
- source: چَپرِی
|
|
80
|
+
expected: Chaprī
|
|
81
|
+
|
|
82
|
+
- source: أَحمَد خَان كَلے
|
|
83
|
+
expected: Aḩmad Khān Kale
|
|
84
|
+
|
|
85
|
+
# - source: آكَا خيل
|
|
86
|
+
# expected: Ākā Khel
|
|
87
|
+
|
|
88
|
+
- source: دُرَانِي
|
|
89
|
+
expected: Durānī
|
|
90
|
+
|
|
91
|
+
- source: ڈَنگِیلا
|
|
92
|
+
expected: Ḍangīlā
|
|
93
|
+
|
|
94
|
+
- source: ذَرَانِی
|
|
95
|
+
expected: Z̄arānī
|
|
96
|
+
|
|
97
|
+
- source: بُركِي
|
|
98
|
+
expected: Burkī
|
|
99
|
+
|
|
100
|
+
- source: گِیدَڑَه
|
|
101
|
+
expected: Gīdaṛah
|
|
102
|
+
|
|
103
|
+
- source: عَلِي زَائِي
|
|
104
|
+
expected: ‘Alī Zā’ī
|
|
105
|
+
|
|
106
|
+
# - source: ژوب
|
|
107
|
+
# expected: Zhob
|
|
108
|
+
|
|
109
|
+
- source: بِسَاتُو
|
|
110
|
+
expected: Bisātū
|
|
111
|
+
|
|
112
|
+
- source: أَحمَدِي شَامَا
|
|
113
|
+
expected: Aḩmadī Shāmā
|
|
114
|
+
|
|
115
|
+
- source: اَصَالَت كَلے
|
|
116
|
+
expected: Aşālat Kale
|
|
117
|
+
|
|
118
|
+
- source: خَضَر خَان
|
|
119
|
+
expected: Khaẕar Khān
|
|
120
|
+
|
|
121
|
+
- source: سُلْطَان
|
|
122
|
+
expected: Sulţān
|
|
123
|
+
|
|
124
|
+
- source: عَزَم سَيِّد نُور كَلے
|
|
125
|
+
expected: ‘Azam Sayyid Nūr Kale
|
|
126
|
+
|
|
127
|
+
# - source: عَلَم شير
|
|
128
|
+
# expected: ‘Alam Sher
|
|
129
|
+
|
|
130
|
+
- source: بغَاكِي
|
|
131
|
+
expected: Bghākī
|
|
132
|
+
|
|
133
|
+
# - source: مُظَفَر كوٹ
|
|
134
|
+
# expected: Muz̧afar Koṭ
|
|
135
|
+
|
|
136
|
+
- source: حَقدَرَه
|
|
137
|
+
expected: Ḩaqdarah
|
|
138
|
+
|
|
139
|
+
- source: کَچکِینَہ
|
|
140
|
+
expected: Kachkīnah
|
|
141
|
+
|
|
142
|
+
- source: بَاگَن
|
|
143
|
+
expected: Bāgan
|
|
144
|
+
|
|
145
|
+
- source: بُلبَلَک
|
|
146
|
+
expected: Bulbalak
|
|
147
|
+
|
|
148
|
+
- source: بِلیَامِین
|
|
149
|
+
expected: Bilyāmīn
|
|
150
|
+
|
|
151
|
+
- source: نَہر
|
|
152
|
+
expected: Nahr
|
|
153
|
+
|
|
154
|
+
# - source: جوکَالِیَاں
|
|
155
|
+
# expected: Jokālīāñ
|
|
156
|
+
|
|
157
|
+
- source: اَرَوْالِی
|
|
158
|
+
expected: Arawālī
|
|
159
|
+
|
|
160
|
+
# - source: هیروشاه
|
|
161
|
+
# expected: Heroshāh
|
|
162
|
+
|
|
163
|
+
- source: مَہردِی
|
|
164
|
+
expected: Mahrdī
|
|
165
|
+
|
|
166
|
+
- source: بَڑھ
|
|
167
|
+
expected: Baṛh
|
|
168
|
+
|
|
169
|
+
# - source: شِیوَاؤ
|
|
170
|
+
# expected: Shīwā’o
|
|
171
|
+
|
|
172
|
+
- source: یَاردَا کَلے
|
|
173
|
+
expected: Yārdā Kale
|
|
174
|
+
|
|
175
|
+
- source: بهَائِي خَان
|
|
176
|
+
expected: Bhā’ī Khān
|
|
177
|
+
|
|
178
|
+
- source: پھاشک
|
|
179
|
+
expected: Phāshk
|
|
180
|
+
|
|
181
|
+
- source: تھَلّ
|
|
182
|
+
expected: Thall
|
|
183
|
+
|
|
184
|
+
- source: پَٹھان ريَا
|
|
185
|
+
expected: Paṭhān Ryā
|
|
186
|
+
|
|
187
|
+
- source: جھِیل
|
|
188
|
+
expected: Jhīl
|
|
189
|
+
|
|
190
|
+
- source: غَزْنِي سْپِين
|
|
191
|
+
expected: Ghaznī Spīn
|
|
192
|
+
|
|
193
|
+
- source: بَادشَاه چھُم
|
|
194
|
+
expected: Bādshāh Chhum
|
|
195
|
+
|
|
196
|
+
- source: سِندھ
|
|
197
|
+
expected: Sindh
|
|
198
|
+
|
|
199
|
+
- source: ڈھَنڈ
|
|
200
|
+
expected: Ḍhanḍ
|
|
201
|
+
|
|
202
|
+
# - source: غوزگَڑھِی
|
|
203
|
+
# expected: Ghozgaṛhī
|
|
204
|
+
|
|
205
|
+
# - source: دوغَل گاکھَر
|
|
206
|
+
# expected: Doghal Gākhar
|
|
207
|
+
|
|
208
|
+
- source: خَان گھَڑِی
|
|
209
|
+
expected: Khān Ghaṛī
|
|
210
|
+
|
|
211
|
+
- source: غُلَامَک كَلے
|
|
212
|
+
expected: Ghulāmak Kale
|
|
213
|
+
|
|
214
|
+
# - source: کاراخیل
|
|
215
|
+
# expected: Kārākhel
|
|
216
|
+
|
|
217
|
+
- source: خَپیَنگا
|
|
218
|
+
expected: Khapyangā
|
|
219
|
+
|
|
220
|
+
- source: گَندَه كَلے
|
|
221
|
+
expected: Gandah Kale
|
|
222
|
+
|
|
223
|
+
# - source: گُلونَا ڈھيرِي
|
|
224
|
+
# expected: Gulonā Ḍherī
|
|
225
|
+
|
|
226
|
+
# - source: خيرَه دِين
|
|
227
|
+
# expected: Kherah Dīn
|
|
228
|
+
|
|
229
|
+
- source: مَورپِتھِی
|
|
230
|
+
expected: Maurpithī
|
|
231
|
+
|
|
232
|
+
- source: درے پلارِی
|
|
233
|
+
expected: Dre Plārī
|
|
234
|
+
|
|
235
|
+
- source: آگرَہ
|
|
236
|
+
expected: Āgrah
|
|
237
|
+
|
|
238
|
+
- source: ڈَنڈَر
|
|
239
|
+
expected: Ḍanḍar
|
|
240
|
+
|
|
241
|
+
# - source: گِیدو
|
|
242
|
+
# expected: Gīdo
|
|
243
|
+
|
|
244
|
+
- source: گُبازانَہ
|
|
245
|
+
expected: Gubāzānah
|
|
246
|
+
|
|
247
|
+
# - source: اُوشو
|
|
248
|
+
# expected: Ūsho
|
|
249
|
+
|
|
250
|
+
- source: حَےدَر عَلِی كَلے
|
|
251
|
+
expected: Ḩaidar ‘Alī Kale
|
|
252
|
+
|
|
253
|
+
- source: تَودَہ چِینَہ
|
|
254
|
+
expected: Taudah Chīnah
|
|
255
|
+
|
|
256
|
+
- source: مُوسى خَان كَلے
|
|
257
|
+
expected: Mūsá Khān Kale
|
|
258
|
+
|
|
259
|
+
- source: مُلَّا بَاغ
|
|
260
|
+
expected: Mullā Bāgh
|
|
261
|
+
|
|
262
|
+
map:
|
|
263
|
+
postrules:
|
|
264
|
+
- pattern: (?<=\b)(?<!\b[‘|’|'|-])[\u0061-\uFFFF]
|
|
265
|
+
result: "upcase"
|
|
266
|
+
# don't capitalize definite article in the middle of a sentence
|
|
267
|
+
- pattern : ' At T' # الت
|
|
268
|
+
result: ' at T'
|
|
269
|
+
- pattern : ' As̄ S̄' # الث
|
|
270
|
+
result: ' as̄ S̄'
|
|
271
|
+
- pattern : ' Ad D' # الد
|
|
272
|
+
result: ' ad D'
|
|
273
|
+
- pattern : ' Az̄ Z̄' # الذ
|
|
274
|
+
result: ' az̄ Z̄'
|
|
275
|
+
- pattern : ' Ar R' # الر
|
|
276
|
+
result: ' ar R'
|
|
277
|
+
- pattern : ' Az Z' # الز
|
|
278
|
+
result: ' az Z'
|
|
279
|
+
- pattern : ' As S' # الس
|
|
280
|
+
result: ' as S'
|
|
281
|
+
- pattern : ' Ash Sh' # الش
|
|
282
|
+
result: ' ash Sh'
|
|
283
|
+
- pattern : ' Aş Ş' # الص
|
|
284
|
+
result: ' aş Ş'
|
|
285
|
+
- pattern : ' Aẕ Ẕ' # الض
|
|
286
|
+
result: ' aẕ Ẕ'
|
|
287
|
+
- pattern : ' Aţ Ţ' # الط
|
|
288
|
+
result: ' aţ Ţ'
|
|
289
|
+
- pattern : ' Az̧ Z̧' # الظ
|
|
290
|
+
result: ' az̧ Z̧'
|
|
291
|
+
- pattern : ' Al L' # الل
|
|
292
|
+
result: ' al L'
|
|
293
|
+
- pattern : ' An N' # الن
|
|
294
|
+
result: ' an N'
|
|
295
|
+
- pattern: " Al " # ال
|
|
296
|
+
result: " al "
|
|
297
|
+
characters:
|
|
298
|
+
# special rules
|
|
299
|
+
|
|
300
|
+
'\s(?=\u0622\u0628\u064E\u0627\u062F)': '' # space followed by abad is removed
|
|
301
|
+
'\ufdf2': 'Allāh' # See note 5
|
|
302
|
+
|
|
303
|
+
# Vowels, Diphthongs, and Diacritical Marks
|
|
304
|
+
'\u064e' : 'a' # َ fatha
|
|
305
|
+
'\u064e(?=\u0629)' : '' # َ fatha followed by ta' marboota
|
|
306
|
+
'\u064e(?=a[h|t])' : '' # َ fatha followed by ta' marboota, handling different order of conversion
|
|
307
|
+
|
|
308
|
+
'\u0652' : '' # ْ sokoon
|
|
309
|
+
'\u0659': 'ê'
|
|
310
|
+
|
|
311
|
+
'\u0650[\u064a|\u06cc]' : 'ī' # ـِي kasra followed by ي
|
|
312
|
+
'\u0650' : 'i' # kasra
|
|
313
|
+
'\u06d2' : 'e' # ـے
|
|
314
|
+
|
|
315
|
+
'\u0622' : 'ā' # آ
|
|
316
|
+
'\u064e\u0627' : 'ā' # ـَا fatha followed by ا
|
|
317
|
+
'\u0627' : 'ā' # ا
|
|
318
|
+
'\b\u0627' : '' # ا
|
|
319
|
+
|
|
320
|
+
'\u0648' : 'o' # و # suspect: the same key '\u0648' is defined again as 'w' under "consonant characters" — confirm which mapping is intended
|
|
321
|
+
'\u064f' : 'u' # ُ damma
|
|
322
|
+
'\u064f\u0648' : 'ū' # ـُو damma followed by و
|
|
323
|
+
|
|
324
|
+
'\u064e\u06d2' : 'ai' # ـے
|
|
325
|
+
'\u064e\u0648' : 'au' # ـَو
|
|
326
|
+
'\u0670': 'á' # ىٰ
|
|
327
|
+
'\u0649': 'á' # ىٰ
|
|
328
|
+
|
|
329
|
+
# shadda
|
|
330
|
+
'\u0628\u0651' : 'bb' # ب
|
|
331
|
+
'\u062a\u0651' : 'tt' # ت
|
|
332
|
+
'\u062b\u0651' : 's̄s̄' # ث doubled by shadda; ث is romanized s̄ in the consonant table of this map
|
|
333
|
+
'\u062c\u0651' : 'jj' # ج
|
|
334
|
+
'\u062d\u0651' : 'ḩḩ' # ح doubled by shadda; ح is romanized ḩ in the consonant table of this map
|
|
335
|
+
'\u062e\u0651' : 'khkh' # خ
|
|
336
|
+
'\u062f\u0651' : 'dd' # د
|
|
337
|
+
'\u0630\u0651' : 'z̄z̄' # ذ
|
|
338
|
+
'\u0631\u0651' : 'rr' # ر
|
|
339
|
+
'\u0632\u0651' : 'zz' # ز
|
|
340
|
+
'\u0633\u0651' : 'ss' # س
|
|
341
|
+
'\u0634\u0651' : 'shsh' # ش doubled by shadda, written twice like the other digraphs (khkh, ghgh)
|
|
342
|
+
'\u0635\u0651' : 'şş' # ص
|
|
343
|
+
'\u0636\u0651' : 'ẕẕ' # ض doubled by shadda; ض is romanized ẕ in the consonant table of this map
|
|
344
|
+
'\u0637\u0651' : 'ţţ' # ط
|
|
345
|
+
'\u0638\u0651' : 'z̧z̧' # ظ
|
|
346
|
+
'\u063a\u0651' : 'ghgh' # غ
|
|
347
|
+
'\u0641\u0651' : 'ff' # ف
|
|
348
|
+
'\u0642\u0651' : 'qq' # ق
|
|
349
|
+
'\u0643\u0651' : 'kk' # ك
|
|
350
|
+
'\u0644\u0651' : 'll' # ل
|
|
351
|
+
'\u0645\u0651' : 'mm' # م
|
|
352
|
+
'\u0646\u0651' : 'nn' # ن
|
|
353
|
+
'\u0647\u0651' : 'hh' # ه
|
|
354
|
+
'\u0648\u0651' : 'ww' # و
|
|
355
|
+
'[\u064a\u06cc]\u0651' : 'yy' # ي (U+064A or U+06CC) doubled by shadda; no '|' in the class, which would match a literal pipe
|
|
356
|
+
|
|
357
|
+
# NOTE 1
|
|
358
|
+
'\u0650\b' : '-e' # ِ kasra
|
|
359
|
+
'\u0674' : '-e' # ٴ
|
|
360
|
+
'\u0654' : '-e' # ٔ
|
|
361
|
+
|
|
362
|
+
'\u0650\u064a\u0651\u064e' : 'īy' # ـِيَّ
|
|
363
|
+
'\u0650\u064a(?=\u064e|\u064f)' : 'iy' # ـِي kasra followed by ي, when a fatha or damma follows (second alternative needs the backslash, otherwise it matches the literal text "u064f")
|
|
364
|
+
'\u064e\u0649' : 'ay' # ـَى fatha followed by ى which is ا not ي
|
|
365
|
+
'\u064e\u0648\u0652' : 'aw' # ـَوْ
|
|
366
|
+
'\u064e\u064a\u0652' : 'ay' # ـَيْ
|
|
367
|
+
'\u0650\u06cc\u0651\u064e' : 'īy' # ـِيَّ
|
|
368
|
+
'\u064e\u064a' : 'aī' # ـَي
|
|
369
|
+
'\u064e\u06cc' : 'aī' # ـَي
|
|
370
|
+
# - '-ye'
|
|
371
|
+
|
|
372
|
+
|
|
373
|
+
# ta' marboota
|
|
374
|
+
'\u0629' : 'at' # ة in the middle of the sentence
|
|
375
|
+
'\u0629$' : 'ah'
|
|
376
|
+
'(?<=\b\u0627\u0644[\u0600-\u06ff]{2})\u0629' : 'ah'
|
|
377
|
+
'(?<=\b\u0627\u0644[\u0600-\u06ff]{3})\u0629' : 'ah'
|
|
378
|
+
'(?<=\b\u0627\u0644[\u0600-\u06ff]{4})\u0629' : 'ah'
|
|
379
|
+
'(?<=\b\u0627\u0644[\u0600-\u06ff]{5})\u0629' : 'ah'
|
|
380
|
+
'(?<=\b\u0627\u0644[\u0600-\u06ff]{6})\u0629' : 'ah'
|
|
381
|
+
'(?<=\b\u0627\u0644[\u0600-\u06ff]{7})\u0629' : 'ah'
|
|
382
|
+
'(?<=\b\u0627\u0644[\u0600-\u06ff]{8})\u0629' : 'ah'
|
|
383
|
+
'(?<=\b\u0627\u0644[\u0600-\u06ff]{9})\u0629' : 'ah'
|
|
384
|
+
'(?<=\b\u0627\u0644[\u0600-\u06ff]{10})\u0629' : 'ah'
|
|
385
|
+
'(?<=\b\u0627\u0644[\u0600-\u06ff]{11})\u0629' : 'ah'
|
|
386
|
+
'(?<=\b\u0627\u0644[\u0600-\u06ff]{12})\u0629' : 'ah'
|
|
387
|
+
'(?<=\b\u0627\u0644[\u0600-\u06ff]{13})\u0629' : 'ah'
|
|
388
|
+
|
|
389
|
+
|
|
390
|
+
|
|
391
|
+
'\u0621' : '’' # ء
|
|
392
|
+
'\u0624' : '’' # ؤ
|
|
393
|
+
'\u0624\b' : '’o' # ؤ
|
|
394
|
+
'\u0626' : '’' # ئ
|
|
395
|
+
|
|
396
|
+
'\u0623' : '' # أ
|
|
397
|
+
'\u0625' : '' # إ
|
|
398
|
+
# See note B
|
|
399
|
+
'\b\u0627\u0644' : 'al ' # ال
|
|
400
|
+
# '\uFE8E' : '' # ﺎ
|
|
401
|
+
|
|
402
|
+
# Sun letters
|
|
403
|
+
'\b\u0627\u0644\u062a' : 'at t' # الت
|
|
404
|
+
'\b\u0627\u0644\u062b' : 'as̄ s̄' # الث
|
|
405
|
+
'\b\u0627\u0644\u062f' : 'ad d' # الد
|
|
406
|
+
'\b\u0627\u0644\u0630' : 'az̄ z̄' # الذ
|
|
407
|
+
'\b\u0627\u0644\u0631' : 'ar r' # الر
|
|
408
|
+
'\b\u0627\u0644\u0632' : 'az z' # الز
|
|
409
|
+
'\b\u0627\u0644\u0633' : 'as s' # الس
|
|
410
|
+
'\b\u0627\u0644\u0634' : 'ash sh' # الش
|
|
411
|
+
'\b\u0627\u0644\u0635' : 'aş ş' # الص
|
|
412
|
+
'\b\u0627\u0644\u0636' : 'aẕ ẕ' # الض
|
|
413
|
+
'\b\u0627\u0644\u0637' : 'aţ ţ' # الط
|
|
414
|
+
'\b\u0627\u0644\u0638' : 'az̧ z̧' # الظ
|
|
415
|
+
'\b\u0627\u0644\u0644' : 'al l' # الل
|
|
416
|
+
'\b\u0627\u0644\u0646' : 'an n' # الن
|
|
417
|
+
|
|
418
|
+
|
|
419
|
+
# consonant characters
|
|
420
|
+
|
|
421
|
+
'\u0628' : 'b' # ب
|
|
422
|
+
'\u067E' : 'p' # پ
|
|
423
|
+
'\u062a' : 't' # ت
|
|
424
|
+
'\u0679' : 'ṭ' # ٹ
|
|
425
|
+
'\u062B' : 's̄' # ث
|
|
426
|
+
'\u062c' : 'j' # ج
|
|
427
|
+
'\u0686' : 'ch' # چ
|
|
428
|
+
'\u062d' : 'ḩ' # ح
|
|
429
|
+
'\u062e' : 'kh' # خ
|
|
430
|
+
'\u062f' : 'd' # د
|
|
431
|
+
'\u0688' : 'ḍ' # ڈ
|
|
432
|
+
'\u0630' : 'z̄' # ذ
|
|
433
|
+
'\u0631' : 'r' # ر
|
|
434
|
+
'\u0691' : 'ṛ' # ڑ
|
|
435
|
+
'\u0632' : 'z' # ز
|
|
436
|
+
'\u0698' : 'zh' # ژ
|
|
437
|
+
'\u0633' : 's' # س
|
|
438
|
+
'\u0634' : 'sh' # ش
|
|
439
|
+
'\u0635' : 'ş' # ص
|
|
440
|
+
'\u0636' : 'ẕ' # ض
|
|
441
|
+
'\u0637' : 'ţ' # ط
|
|
442
|
+
'\u0638' : 'z̧' # ظ
|
|
443
|
+
'\u0639' : '‘' # ع
|
|
444
|
+
'\u063a' : 'gh' # غ
|
|
445
|
+
'\u0641' : 'f' # ف
|
|
446
|
+
'\u0642' : 'q' # ق
|
|
447
|
+
'\u0643' : 'k' # ك
|
|
448
|
+
'\u06A9' : 'k' # ک
|
|
449
|
+
'\u06AF' : 'g' # گ
|
|
450
|
+
'\u0644' : 'l' # ل
|
|
451
|
+
'\u0645' : 'm' # م
|
|
452
|
+
'\u0646' : 'n' # ن
|
|
453
|
+
'\u06BA' : 'ñ' # ڼ
|
|
454
|
+
'[\u0647\u06c1\u06be]' : 'h' # ه (U+0647, U+06C1 or U+06BE); no '|' in the class, otherwise a literal pipe in the input is romanized as 'h'
|
|
455
|
+
'\u0648' : 'w' # و
|
|
456
|
+
'[\u064a\u06cc]' : 'y' # ي (U+064A or U+06CC); no '|' in the class, otherwise a literal pipe in the input is romanized as 'y'
|
|
457
|
+
# '\u0649' : 'y' # ي
|
|
458
|
+
'\u06D0' : 'ē' # ې
|
|
459
|
+
'\u06CD' : 'êy' # ۍ
|