interscript 0.1.7 → 0.1.9

Sign up to get free protection for your applications and to get access to all the features.
Files changed (82) hide show
  1. checksums.yaml +4 -4
  2. data/README.adoc +1 -3
  3. data/aliases.json +1 -0
  4. data/lib/interscript.rb +8 -3
  5. data/lib/interscript/fs.rb +27 -0
  6. data/lib/interscript/mapping.rb +3 -1
  7. data/lib/interscript/opal.rb +142 -3
  8. data/lib/interscript/opal/entrypoint.rb +8 -0
  9. data/lib/interscript/opal/exports.rb +11 -0
  10. data/lib/interscript/opal/maps.js.erb +2 -4
  11. data/lib/interscript/version.rb +1 -1
  12. data/maps/alalc-ara-Arab-Latn-1997.yaml +5 -5
  13. data/maps/alalc-asm-Deva-Latn-1997.yaml +104 -10
  14. data/maps/alalc-asm-Deva-Latn-2012.yaml +18 -3
  15. data/maps/alalc-aze-Arab-Latn-1997.yaml +376 -0
  16. data/maps/alalc-ben-Beng-Latn-1997.yaml +291 -0
  17. data/maps/alalc-div-Thaa-Latn-1997.yaml +211 -0
  18. data/maps/alalc-hin-Deva-Latn-1997.yaml +102 -10
  19. data/maps/alalc-hin-Deva-Latn-2011.yaml +19 -1
  20. data/maps/alalc-kan-Kana-Latn-1997.yaml +274 -0
  21. data/maps/alalc-kan-Kana-Latn-2011.yaml +63 -0
  22. data/maps/alalc-ori-Orya-Latn-1997.yaml +284 -0
  23. data/maps/alalc-ori-Orya-Latn-2011.yaml +67 -0
  24. data/maps/alalc-pra-Deva-Latn-2012.yaml +2 -2
  25. data/maps/alalc-san-Deva-Latn-2012.yaml +78 -9
  26. data/maps/alalc-tel-Telu-Latn-1997.yaml +284 -0
  27. data/maps/alalc-tel-Telu-Latn-2011.yaml +64 -0
  28. data/maps/az-aze-Cyrl-Latn-1939.yaml +105 -0
  29. data/maps/az-aze-Cyrl-Latn-1958.yaml +45 -0
  30. data/maps/bgnpcgn-ara-Arab-Latn-1956.yaml +3 -1
  31. data/maps/bgnpcgn-aze-Cyrl-Latn-1993.yaml +111 -104
  32. data/maps/bgnpcgn-bal-Arab-Latn-2008.yaml +329 -0
  33. data/maps/bgnpcgn-bul-Cyrl-Latn-1952.yaml +1 -1
  34. data/maps/bgnpcgn-div-Thaa-Latn-1988.yaml +75 -0
  35. data/maps/bgnpcgn-far-Latn-Latn-1964.yaml +28 -0
  36. data/maps/bgnpcgn-isl-Latn-Latn-1964.yaml +37 -0
  37. data/maps/bgnpcgn-kaz-Cyrl-Latn-1979.yaml +247 -0
  38. data/maps/bgnpcgn-kir-Cyrl-Latn-1979.yaml +218 -0
  39. data/maps/bgnpcgn-kur-Arab-Latn-2007.yaml +249 -0
  40. data/maps/bgnpcgn-per-Arab-Latn-1958.yaml +2 -0
  41. data/maps/bgnpcgn-prs-Arab-Latn-2007.yaml +87 -53
  42. data/maps/bgnpcgn-pus-Arab-Latn-1968.yaml +377 -0
  43. data/maps/bgnpcgn-srp-Cyrl-Latn-1962.yaml +73 -0
  44. data/maps/bgnpcgn-urd-Arab-Latn-2007.yaml +459 -0
  45. data/maps/{bis-knd-Knda-Latn-13194-1991.yaml → bis-kan-Kana-Latn-13194-1991.yaml} +2 -2
  46. data/maps/bis-ori-Orya-Latn-13194-1991.yaml +17 -2
  47. data/maps/iso-ara-Arab-Latn-233-1984.yaml +1 -1
  48. data/maps/{iso-kan-Knda-Latn-15919-2001.yaml → iso-kan-Kana-Latn-15919-2001.yaml} +1 -1
  49. data/maps/{mns-mon-Cyrl-Latn-5217-2012.yaml → masm-mon-Cyrl-Latn-5217-2012.yaml} +2 -2
  50. data/maps/{mns-mon-Latn-Cyrl-5217-2012.yaml → masm-mon-Latn-Cyrl-5217-2012.yaml} +1 -1
  51. data/maps/mv-div-Thaa-Latn-1987.yaml +200 -0
  52. data/maps/odni-ara-Arab-Latn-2004.yaml +137 -0
  53. data/maps/odni-ara-Arab-Latn-2015.yaml +20 -130
  54. data/maps/odni-bul-Cyrl-Latn-2005.yaml +90 -0
  55. data/maps/odni-fas-Arab-Latn-2004.yaml +276 -0
  56. data/maps/odni-hin-Deva-Latn-2004.yaml +182 -0
  57. data/maps/odni-mkd-Cyrl-Latn-2005.yaml +21 -0
  58. data/maps/odni-prs-Arab-Latn-2004.yaml +123 -0
  59. data/maps/{odni-per-Arab-Latn-2015.yaml → odni-prs-Arab-Latn-2015.yaml} +0 -0
  60. data/maps/odni-srp-Cyrl-Latn-2005.yaml +36 -0
  61. data/maps/odni-tuk-Cyrl-Latn-2015.yaml +170 -0
  62. data/maps/odni-ukr-Cyrl-Latn-2015.yaml +4 -0
  63. data/maps/un-ara-Arab-Latn-2017.yaml +1 -1
  64. data/maps/un-asm-Beng-Latn-1972.yaml +223 -0
  65. data/maps/un-guj-Gujr-Latn-1972.yaml +229 -0
  66. data/maps/un-hin-Deva-Latn-2016.yaml +104 -10
  67. data/maps/un-kan-Kana-Latn-2016.yaml +254 -0
  68. data/maps/un-mal-Mlym-Latn-1972.yaml +251 -0
  69. data/maps/un-mar-Deva-Latn-2016.yaml +24 -13
  70. data/maps/un-nep-Deva-Latn-1972.yaml +40 -121
  71. data/maps/un-ori-Orya-Latn-1972.yaml +247 -0
  72. data/maps/un-pan-Guru-Latn-1972.yaml +402 -0
  73. data/maps/un-prs-Arab-Latn-1967.yaml +236 -0
  74. data/maps/un-tam-Taml-Latn-1972.yaml +194 -0
  75. data/maps/un-tel-Telu-Latn-1972.yaml +270 -0
  76. data/maps/un-urd-Arab-Latn-1972.yaml +405 -0
  77. data/maps/var-amh-Ethi-Latn-eae-2003.yaml +466 -0
  78. data/maps/var-gez-Ethi-Latn-eae-2003.yaml +76 -0
  79. data/spec/interscript/filenames_spec.rb +6 -369
  80. data/spec/interscript_spec.rb +10 -2
  81. metadata +50 -7
  82. data/lib/interscript/opal/map_translate.rb +0 -7
@@ -0,0 +1,377 @@
1
+ ---
2
+ authority_id: bgnpcgn
3
+ id: 2007
4
+ language: iso-639-3:prs
5
+ source_script: Arab
6
+ destination_script: Latn
7
+ name: BGN/PCGN Romanization System -- Pashto (1968)
8
+ url: https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/693760/ROMANIZATION_OF_PASHTO.pdf
9
+ creation_date: 1968
10
+ confirmation_date: 2017-11
11
+ description: |
12
+ Pashto is an Indo-Iranian language and is one of two
13
+ nationally official languages in Afghanistan and one of
14
+ five regionally recognised languages in Pakistan. The
15
+ romanization system presented here may be applied to all
16
+ Pashto geographical names. Although the BGN/PCGN policy for
17
+ geographical names in Afghanistan is to apply the BGN/PCGN
18
+ national system of romanization for Afghanistan (2007),
19
+ which incorporates Dari elements, when applied to a Pashto
20
+ geographical name, the romanized results of the BGN/PCGN
21
+ national system for Afghanistan are the same as those of
22
+ this Pashto romanization system. The Pashto alphabet uses
23
+ a modified form of the Perso-Arabic script, and contains
24
+ twelve additional consonants not present in standard
25
+ Arabic, as well as three additional vowel characters and an
26
+ additional vowel point. ڼ گ ښ ژ ږ ړ ډ ځ څ چ ټ پ :Consonants
27
+ ٙ :Point Vowel; ې ۍ ى :Vowels The points used in Arabic to
28
+ mark short vowels and certain other diacritical marks are
29
+ not written in Pashto. Consequently, a reference source may
30
+ sometimes be required to aid correct identification of the
31
+ standard spellings and proper vowels and elimination of
32
+ dialectal and idiosyncratic variations. In the interests of
33
+ clarity, a column showing vowel pointing from Arabic to
34
+ indicate short vowels has been included in the examples
35
+ below, alongside the unpointed form that will usually be
36
+ encountered. However it should be noted that the
37
+ pronunciation of short vowels will vary. (Note: it is
38
+ recommended that a font such as Scheherazade, available
39
+ from www.sil.org, which includes the Unicode extended
40
+ Arabic sub-range, be used to view this system.)
41
+
42
+ notes:
43
+ - 1. Alif ( ‫ا‬ ) should be romanized as follows
44
+ a. Initially, it indicates that the word begins with a vowel or
45
+ diphthong; the alif itself is not romanized, but rather the
46
+ short vowel it “carries” is romanized; e.g.,
47
+ ‫ه‬ َ‫د‬ ‫ن‬ ‫ژر‬ ‫سلَم‬ َ‫أ‬ ‫ميړ‬ → Mīṟ Aslam Zhrandah. b. When it carries a
48
+ maddah (‫)آ‬ (see vowel table, row 3), it represents ā;
49
+ e.g., ‫د‬ ‫ن‬ ‫ب‬ َ ‫آب‬ → Āb Band. c. Medially and
50
+ finally it represents ā (see table 2, row 2); e.g., ‫ۍ‬
51
+ ‫ماڼ‬ → Māṉêy d. Medially and finally in words of Arabic
52
+ origin, alif may serve as the bearer of hamzah, e.g.
53
+ ‫رأس‬ → ra’s. See also note 4.
54
+
55
+ - 2. The characters tsē ( ‫څ‬ ) and dzē ( ‫ځ‬ ) may be
56
+ romanized t͡ s and d͡ z (the combining double breve (
57
+ Unicode 0361) appearing over the digraph) when for special
58
+ reasons it is desired that confusion be avoided between
59
+ ‫ت‬ (t) plus ‫س‬ (s) and between ‫د‬ (d) plus ‫ز‬ (z),
60
+ respectively.
61
+
62
+ - 3. Occasionally the character sequences ‫ه‬ ‫ك‬ , ‫ه‬ ‫ز‬ ,
63
+ ‫ه‬ ‫س‬ , and ‫ه‬ ‫گ‬ occur . They may be romanized k·h, z·
64
+ h, s·h, and g·h in order to differentiate these
65
+ romanizations from the digraphs kh, zh, sh, and gh, which
66
+ are used to represent the characters ‫خ‬ , ‫ژ‬, ‫ش‬ , and
67
+ ‫غ‬ respectively .
68
+
69
+ - 4. Hamzah ( ‫ء‬ ) should be romanized as follows a. In
70
+ word-initial position, where it will appear either above or
71
+ below alif ( ‫أ‬ or ‫إ‬ ), it indicates a short vowel and should not itself
72
+ be romanized. In other positions it should be romanized by an
73
+ apostrophe, e.g. ‫جُزء‬ → juz’. b.
74
+ Yeh with hamzah ( ‫ئ‬ ) should be romanized êy, unless it
75
+ represents the compound (iẕāfah) morpheme, in which case it
76
+ is romanized according to note 9 below.
77
+
78
+ - 5. The division of words utilized in Pashto writing is
79
+ followed in romanization, except that the elements –ābād, -
80
+ khwā, -shahr, -zādah, -zay and -ullāh are always romanized
81
+ as part of the preceding word, e.g. ‫آباد‬ ‫ت‬ ‫م‬ َ ْ‫ح‬
82
+ ‫ر‬ َ → Raḩmatābād and ‫الله‬ ‫ت‬ ‫م‬ َ ْ‫ح‬ ‫ر‬ َ →
83
+ Raḩmatullāh. However, when the word for God ( ‫الله‬ )
84
+ appears as a standalone word it should be written Allāh.
85
+ Note also the “dagger alif” ( ٙ) above the second ‫ل‬ (lām)
86
+ in the word ‫الله‬ ; this, like the short vowels, is not
87
+ written in Pashto but should be romanized ā, like a full-
88
+ size alif. Persian derivational endings such as –vand and
89
+ endings of Turkish origin such as –lar, -lī, -lū, -i, -u, -
90
+ si, and –su, should be written together with the preceding
91
+ word.
92
+
93
+ - 6. The Pashto preposition ‫د‬ should be romanized dê in
94
+ agreement with its pronunciation, despite the fact that
95
+ it is sometimes pointed with kasrah ( ٙ ).
96
+
97
+ - 7. In names of Arabic origin, the l of the definite article
98
+ al/ul is assimilated before the ‘sun letters’ t, s̄ , d,
99
+ z̄ , r, z, s, sh, ş, ẕ, ţ, z̧ , l and n. In romanization,
100
+ the article will be written al or its assimilated
101
+ equivalent in name-initial position but ul or its
102
+ assimilated equivalent elsewhere; the article should be
103
+ separated from the name it precedes and should not be
104
+ capitalized, except at the beginning of a name, e.g. جَبَل
105
+ السَرَاج → Jabal us Sarāj
106
+
107
+ - 8. In Arabic names, a shaddah, ٙ is used to denote the
108
+ doubling of a particular consonant character, e.g. ‫مَّد‬
109
+ َ‫ح‬ ‫م‬ ُ → Muḩammad. However, in Pashto this ‘doubling’
110
+ is frequently omitted in both Perso-Arabic script and the
111
+ resulting romanization. Guidance on doubling may be taken
112
+ from an authoritative names source, such as an Afghan
113
+ government source or Pashto dictionary; for example, it is
114
+ usual to see Ḩājī without and ‘Abbās with the doubled
115
+ consonant. The doubled y consonant is almost always
116
+ retained, as in Sayyid or Qayyūm
117
+
118
+ - 9. The iẕāfah morpheme is not a grammatical feature of
119
+ Pashto and, if encountered in a linguistically hybrid
120
+ geographical name (i.e. combining features of both Pashto
121
+ and Dari), it should be treated according to the BGN/PCGN
122
+ national system of romanization for Afghanistan, 2007, as –
123
+ e, unless the preceding word ends with a silent heh (‫)ه‬
124
+ or a vowel when it should be shown –ye, e.g.
125
+ ‫صار‬ ‫ح‬ ِ ‫غر‬ → Ghar-e Ḩişār;
126
+ ‫و‬ ‫ن‬ َ ‫ه‬ ٔ ‫لع‬ َ ‫ق‬ َ →
127
+ Qal‘ah-ye Now.
128
+
129
+
130
+ - 10. The character sequence خو when followed by ‫ا‬ or
131
+ ‫ی‬ , should be romanized khw, although the w is either not
132
+ pronounced, or only weakly pronounced; e.g. ‫خواجه‬ →
133
+ khwājah.
134
+
135
+ - 11. An inventory of letter-diacritic combinations in addition to the unmodified letters of the
136
+ basic Roman script is
137
+ ‘ (U+2018)
138
+ ʼ (U+2019)
139
+ Ā (U+0100)
140
+ ā (U+0101)
141
+ Á (U+00C1)
142
+ á (U+00E1)
143
+ Ḏ (U+0044+0331)
144
+ ḏ (U+0064+0331)
145
+ Ē (U+0112)
146
+ ē (U+0113)
147
+ Ê (U+00CA)
148
+ ê (U+00EA)
149
+ Ḩ (U+1E28)
150
+ ḩ (U+1E29)
151
+ Ī (U+012A)
152
+ ī (U+012B)
153
+ N̄ (U+004E+0304)
154
+ n̄ (U+006E+0304)
155
+ Ō (U+014C)
156
+ ō (U+014D)
157
+ Ṟ (U+0052+0331)
158
+ ṟ (U+0072+0331)
159
+ Ş (U+015E)
160
+ ş (U+015F)
161
+ S̄ (U+0053+0304)
162
+ s̄ (U+0073+0304)
163
+ Ṯ (U+0054+0331)
164
+ ṯ (U+0074+0331)
165
+ Ţ (U+0162)
166
+ ţ (U+0163)
167
+ Ū (U+016A)
168
+ ū (U+016B)
169
+ Z̧ (U+005A+0327)
170
+ z̧ (U+007A+0327)
171
+ Z̄ (U+005A+0304)
172
+ z̄ (U+007A+0304)
173
+ Ẕ (U+005A+0331)
174
+ ẕ (U+007A+0331)
175
+ Z͟ H (U+005A+0048+035F)
176
+ z͟ h (U+007A+0068+035F)
177
+
178
+ tests:
179
+ - source: بَغْلان
180
+ expected: Baghlān
181
+
182
+ - source: پُوټَكَى
183
+ expected: Pōṯakay
184
+
185
+ - source: شِيرِين تَگَاب
186
+ expected: Shīrīn Tagāb
187
+
188
+ - source: کُوْټ
189
+ expected: Kōṯ
190
+
191
+ - source: ثَابِر
192
+ expected: S̄ābir
193
+
194
+ - source: جَلال آبَاد
195
+ expected: Jalālābād
196
+
197
+ - source: چَارِيكَار
198
+ expected: Chārīkār
199
+
200
+ - source: ځَدْرَاڼ
201
+ expected: Dzadrāṉ
202
+
203
+ - source: څَوکۍ
204
+ expected: Tsowkêy
205
+
206
+ - source: حَضْرَتِ إِمَام
207
+ expected: Ḩaẕrat-e Imām
208
+
209
+ - source: خُوْسْت
210
+ expected: Khōst
211
+
212
+ - source: سْپِين بُوْلْدَک
213
+ expected: Spīn Bōldak
214
+
215
+ - source: ډَنْډ وَ پَتَان
216
+ expected: Ḏanḏ Wa Patān
217
+
218
+ - source: كَنْدَهَار
219
+ expected: Kandahār
220
+
221
+ - source: أَنْدَړ
222
+ expected: Andaṟ
223
+
224
+ - source: كُنْدُز
225
+ expected: Kunduz
226
+
227
+ - source: مِير أَسْلَم ژْرَنْدَه
228
+ expected: Mīr Aslam Zhrandah
229
+
230
+ - source: ږِيرَه
231
+ expected: Z͟hīrah
232
+
233
+ - source: سَمَنْگَان
234
+ expected: Samangān
235
+
236
+ - source: كښٙتَه كَلا
237
+ expected: Ks͟hêtah Kalā
238
+
239
+ - source: قَيْصَار
240
+ expected: Qayşār
241
+
242
+ - source: فَيض آبَاد
243
+ expected: Faīẕābād
244
+
245
+ - source: حَضْرَتِ سُلْطَان
246
+ expected: Ḩaẕrat-e Sulţān
247
+
248
+ - source: ظَاهِر كَلا
249
+ expected: Z̧āhir Kalā
250
+
251
+ - source: پُلِ عَلَم
252
+ expected: Pul-e ‘Alam
253
+
254
+ - source: غَزْنِي
255
+ expected: Ghaznī
256
+
257
+ - source: مَزَارِ شَرِيف
258
+ expected: Mazār-e Sharīf
259
+
260
+ - source: قَيْصَار
261
+ expected: Qayşār
262
+
263
+ - source: كَنْدَهَار
264
+ expected: Kandahār
265
+
266
+ - source: گَرْدېز
267
+ expected: Gardēz
268
+
269
+ - source: کَابُل
270
+ expected: Kābul
271
+
272
+ - source: مَيمَنَه
273
+ expected: Maīmanah
274
+
275
+ - source: خَان آبَاد
276
+ expected: Khānābād
277
+
278
+ - source: مَاڼۍ
279
+ expected: Māṉêy
280
+
281
+ - source: وَاخَان
282
+ expected: Wākhān
283
+
284
+ - source: يَنْگِي قَلعَه
285
+ expected: Yangī Qal‘ah
286
+
287
+ - source: جَلال آبَاد
288
+ expected: Jalālābād
289
+
290
+ - source: مُرْغَاب کَابُل
291
+ expected: Murghāb Kābul
292
+
293
+ - source: گٙردُون
294
+ expected: Gêrdōn
295
+
296
+ - source: آب بَنْد
297
+ expected: Āb Band
298
+
299
+ - source: سْپِين بُوْلْدَک
300
+ expected: Spīn Bōldak
301
+
302
+ - source: جَوزجَان
303
+ expected: Jowzjān
304
+
305
+ - source: گَرْدېز
306
+ expected: Gardēz
307
+
308
+ - source: مَیدان شَهْر
309
+ expected: Maīdān Shahr
310
+
311
+ - source: ډَنْډِ سُفْلىٰ
312
+ expected: Ḏanḏ-e Suflá
313
+
314
+ - source: جَبَل السَرَاج
315
+ expected: Jabal us Sarāj
316
+ map:
317
+ inherit: bgnpcgn-prs-Arab-Latn-2007
318
+ postrules:
319
+ - pattern: (?<=\b)(?<!\b[‘|’|'|-])[\u0061-\uFFFF]
320
+ result: "upcase"
321
+ # don't capitalize definite article in the middle of a sentence
322
+ - pattern : ' Ut T' # الت
323
+ result: ' ut T'
324
+ - pattern : ' Us̄ S̄' # الث
325
+ result: ' us̄ S̄'
326
+ - pattern : ' Ud D' # الد
327
+ result: ' ud D'
328
+ - pattern : ' Uz̄ Z̄' # الذ
329
+ result: ' uz̄ Z̄'
330
+ - pattern : ' Ur R' # الر
331
+ result: ' ur R'
332
+ - pattern : ' Uz Z' # الز
333
+ result: ' uz Z'
334
+ - pattern : ' Us S' # الس
335
+ result: ' us S'
336
+ - pattern : ' Ush Sh' # الش
337
+ result: ' ush Sh'
338
+ - pattern : ' Uş Ş' # الص
339
+ result: ' uş Ş'
340
+ - pattern : ' Uẕ Ẕ' # الض
341
+ result: ' uẕ Ẕ'
342
+ - pattern : ' Uţ Ţ' # الط
343
+ result: ' uţ Ţ'
344
+ - pattern : ' Uz̧ Z̧' # الظ
345
+ result: ' uz̧ Z̧'
346
+ - pattern : ' Ul L' # الل
347
+ result: ' ul L'
348
+ - pattern : ' Un n' # الن
349
+ result: ' un N'
350
+ characters:
351
+
352
+ '\u0650': 'i' # ِ kasra
353
+ '\u064f': 'u' # ُ damma
354
+
355
+ '\u0650\b' : '-e' # ِ kasra
356
+
357
+ '\s\u0627\u0644\u0644\u0651\u064e\u0647' : 'ullāh' # Note5
358
+ '\u0652' : '' # ْ sokoon
359
+ '\u0659': 'ê'
360
+
361
+ # Sun letters
362
+ '\b\u0627\u0644\u062a' : 'ut t' # الت
363
+ '\b\u0627\u0644\u062b' : 'us̄ s̄' # الث
364
+ '\b\u0627\u0644\u062f' : 'ud d' # الد
365
+ '\b\u0627\u0644\u0630' : 'uz̄ z̄' # الذ
366
+ '\b\u0627\u0644\u0631' : 'ur r' # الر
367
+ '\b\u0627\u0644\u0632' : 'uz z' # الز
368
+ '\b\u0627\u0644\u0633' : 'us s' # الس
369
+ '\b\u0627\u0644\u0634' : 'ush sh' # الش
370
+ '\b\u0627\u0644\u0635' : 'uş ş' # الص
371
+ '\b\u0627\u0644\u0636' : 'uẕ ẕ' # الض
372
+ '\b\u0627\u0644\u0637' : 'uţ ţ' # الط
373
+ '\b\u0627\u0644\u0638' : 'uz̧ z̧' # الظ
374
+ '\b\u0627\u0644\u0644' : 'ul l' # الل
375
+ '\b\u0627\u0644\u0646' : 'un n' # الن
376
+
377
+ '\u0626': 'êy' # ئ
@@ -0,0 +1,73 @@
1
+ ---
2
+ authority_id: bgnpcgn
3
+ id: 1962
4
+ language: iso-639-2:srp
5
+ source_script: Cyrl
6
+ destination_script: Latn
7
+ name: TRANSLITERATION OF SERBIAN CYRILLIC
8
+ creation_date: 1962
9
+ confirmation_date: 1962
10
+ description: |
11
+ Serbo-Croatian, the official national language of Yugoslavia, is a single literary language.
12
+ In Serbian areas it is written in the cyrillic (Serbian) alphabet, while in Croatian areas it is
13
+ written in the roman (Croatian) alphabet.
14
+ Both the BGN and PCGN use the standard Croatian equivalents for romanizing the Serbian cyrillic
15
+ alphabet whenever romanized names are not available.
16
+
17
+ notes:
18
+ - The digraph dj(Dj) will occasionally be found as the Croatian equivalent of ђ(Ђ),
19
+ but the use of dj should be limited to those instances where it is found in roman sources.
20
+
21
+ tests:
22
+ - source: Шупља Стена
23
+ expected: Šuplja Stena
24
+ - source: Чукарица
25
+ expected: Čukarica
26
+ - source: Црна Трава
27
+ expected: Crna Trava
28
+ - source: Херцег Нови
29
+ expected: Herceg Novi
30
+ - source: Улцињ
31
+ expected: Ulcinj
32
+ - source: Ужице
33
+ expected: Užice
34
+ - source: Тресаначка Река
35
+ expected: Tresanačka Reka
36
+ - source: Сјеница
37
+ expected: Sjenica
38
+ - source: Рожаје
39
+ expected: Rožaje
40
+ - source: Пљевља
41
+ expected: Pljevlja
42
+ - source: Оџаци
43
+ expected: Odžaci
44
+ - source: Никшић
45
+ expected: Nikšić
46
+ - source: Медвеђа
47
+ expected: Medveđa
48
+ - source: Лозница
49
+ expected: Loznica
50
+ - source: Књажевац
51
+ expected: Knjaževac
52
+ - source: Зрењанин
53
+ expected: Zrenjanin
54
+ - source: Житорађа
55
+ expected: Žitorađa
56
+ - source: Ервеник
57
+ expected: Ervenik
58
+ - source: Доње Љупче
59
+ expected: Donje Ljupče
60
+ - source: Гусиње
61
+ expected: Gusinje
62
+ - source: ГУСИЊЕ
63
+ expected: GUSINJE
64
+ - source: Врњачка Бања
65
+ expected: Vrnjačka Banja
66
+ - source: Бијело Поље
67
+ expected: Bijelo Polje
68
+ - source: Алибунар
69
+ expected: Alibunar
70
+
71
+ map:
72
+ inherit: bgnpcgn-srp-Cyrl-Latn-2005
73
+ postrules:
@@ -0,0 +1,459 @@
1
+ ---
2
+ authority_id: bgnpcgn
3
+ id: 2007
4
+ language: iso-639-2:urd
5
+ source_script: Arab
6
+ destination_script: Latn
7
+ name: BGN/PCGN Romanization System -- Urdu (2007)
8
+ alias:
9
+ ogc11122:
10
+ code: uas_Arab2Latn_BGN_2007
11
+ description: Unified Afghan Romanization System US Board on Geographic Names (BGN)/The Permanent Committee on Geographical Names (PCGN) 2007
12
+ url: https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/693788/ROMANIZATION_OF_URDU.pdf
13
+ creation_date: 2007
14
+ confirmation_date: 2017-11
15
+ description: |
16
+ The following is the approved romanization system for
17
+ deriving standard spellings of Urdu geographical names for
18
+ Pakistan. It was jointly adopted by BGN and PCGN at the
19
+ 23rd BGN/PCGN Conference in Washington, DC, in 2007 and it
20
+ is based on the Hunterian romanization system for Urdu,
21
+ which has been used by the Surveys of India and Pakistan
22
+ for romanizing Urdu geographical names for more than one
23
+ hundred years. The BGN/PCGN system laid out below includes
24
+ diacritical marks in order that the original script can be
25
+ derived from the romanized form (i.e. it is reversible).
26
+ For desk users requiring a diacritic-free form, these
27
+ diacritics can simply be removed. In every case the same
28
+ basic Roman-script characters are kept as are used in the
29
+ Hunterian system. The BGN/PCGN forms have further been
30
+ designed to harmonize with the BGN/PCGN Persian
31
+ romanization system.
32
+ notes:
33
+ - 1. When the vowel sign zīr ( ِ) occurs word-finally in the
34
+ first element of a compound, it is assumed to mark the
35
+ Persian izafat
36
+ morpheme, and is romanized -e, not i.
37
+ - 2. The source of almost all example names is the 1951
38
+ Census of Pakistan, Village List, Northwest Frontier
39
+ Province, Chitral
40
+ State. Office of the Provincial Superintendant of Census,
41
+ North-West Frontier Province, Peshawar.
42
+ - 3. No examples of aspirated dental r (rh, رھ) were found,
43
+ though this phoneme is assumed to be part of the phonology
44
+ of
45
+ Urdu, and was therefore left out of Table 2.
46
+ - 4. Note that the short vowels in the Urdu examples are not
47
+ pointed.
48
+ - 5. Occasionally, sequences of /z/ or /s/ plus /h/ may be
49
+ encountered, i.e. z·h, s·h. These may be romanized with the
50
+ Unicode
51
+ 'center dot' (U+00B7) separating the two letters, to
52
+ distinguish them from the digraphs /zh/ and /sh/.
53
+ - Commented tests are blocked by this issue https://github.com/interscript/interscript/issues/572
54
+ depends on the different ways of handling ي to y or e AND و to u or o
55
+
56
+
57
+ tests:
58
+ # - source: بوغدِی
59
+ # expected: Boghdī
60
+
61
+ - source: پَالِير
62
+ expected: Pālīr
63
+
64
+ # - source: بیزوت كَلے
65
+ # expected: Bezot Kale
66
+
67
+ # - source: عَمَل كوٹ
68
+ # expected: ‘Amal Koṭ
69
+
70
+ - source: ثَابِر
71
+ expected: S̄ābir
72
+
73
+ - source: شَاه نَثَار ميلة
74
+ expected: Shāh Nas̄ār Mylah
75
+
76
+ # - source: بَرجُو ميلَه
77
+ # expected: Barjū Melah
78
+
79
+ - source: چَپرِی
80
+ expected: Chaprī
81
+
82
+ - source: أَحمَد خَان كَلے
83
+ expected: Aḩmad Khān Kale
84
+
85
+ # - source: آكَا خيل
86
+ # expected: Ākā Khel
87
+
88
+ - source: دُرَانِي
89
+ expected: Durānī
90
+
91
+ - source: ڈَنگِیلا
92
+ expected: Ḍangīlā
93
+
94
+ - source: ذَرَانِی
95
+ expected: Z̄arānī
96
+
97
+ - source: بُركِي
98
+ expected: Burkī
99
+
100
+ - source: گِیدَڑَه
101
+ expected: Gīdaṛah
102
+
103
+ - source: عَلِي زَائِي
104
+ expected: ‘Alī Zā’ī
105
+
106
+ # - source: ژوب
107
+ # expected: Zhob
108
+
109
+ - source: بِسَاتُو
110
+ expected: Bisātū
111
+
112
+ - source: أَحمَدِي شَامَا
113
+ expected: Aḩmadī Shāmā
114
+
115
+ - source: اَصَالَت كَلے
116
+ expected: Aşālat Kale
117
+
118
+ - source: خَضَر خَان
119
+ expected: Khaẕar Khān
120
+
121
+ - source: سُلْطَان
122
+ expected: Sulţān
123
+
124
+ - source: عَزَم سَيِّد نُور كَلے
125
+ expected: ‘Azam Sayyid Nūr Kale
126
+
127
+ # - source: عَلَم شير
128
+ # expected: ‘Alam Sher
129
+
130
+ - source: بغَاكِي
131
+ expected: Bghākī
132
+
133
+ # - source: مُظَفَر كوٹ
134
+ # expected: Muz̧afar Koṭ
135
+
136
+ - source: حَقدَرَه
137
+ expected: Ḩaqdarah
138
+
139
+ - source: کَچکِینَہ
140
+ expected: Kachkīnah
141
+
142
+ - source: بَاگَن
143
+ expected: Bāgan
144
+
145
+ - source: بُلبَلَک
146
+ expected: Bulbalak
147
+
148
+ - source: بِلیَامِین
149
+ expected: Bilyāmīn
150
+
151
+ - source: نَہر
152
+ expected: Nahr
153
+
154
+ # - source: جوکَالِیَاں
155
+ # expected: Jokālīāñ
156
+
157
+ - source: اَرَوْالِی
158
+ expected: Arawālī
159
+
160
+ # - source: هیروشاه
161
+ # expected: Heroshāh
162
+
163
+ - source: مَہردِی
164
+ expected: Mahrdī
165
+
166
+ - source: بَڑھ
167
+ expected: Baṛh
168
+
169
+ # - source: شِیوَاؤ
170
+ # expected: Shīwā’o
171
+
172
+ - source: یَاردَا کَلے
173
+ expected: Yārdā Kale
174
+
175
+ - source: بهَائِي خَان
176
+ expected: Bhā’ī Khān
177
+
178
+ - source: پھاشک
179
+ expected: Phāshk
180
+
181
+ - source: تھَلّ
182
+ expected: Thall
183
+
184
+ - source: پَٹھان ريَا
185
+ expected: Paṭhān Ryā
186
+
187
+ - source: جھِیل
188
+ expected: Jhīl
189
+
190
+ - source: غَزْنِي سْپِين
191
+ expected: Ghaznī Spīn
192
+
193
+ - source: بَادشَاه چھُم
194
+ expected: Bādshāh Chhum
195
+
196
+ - source: سِندھ
197
+ expected: Sindh
198
+
199
+ - source: ڈھَنڈ
200
+ expected: Ḍhanḍ
201
+
202
+ # - source: غوزگَڑھِی
203
+ # expected: Ghozgaṛhī
204
+
205
+ # - source: دوغَل گاکھَر
206
+ # expected: Doghal Gākhar
207
+
208
+ - source: خَان گھَڑِی
209
+ expected: Khān Ghaṛī
210
+
211
+ - source: غُلَامَک كَلے
212
+ expected: Ghulāmak Kale
213
+
214
+ # - source: کاراخیل
215
+ # expected: Kārākhel
216
+
217
+ - source: خَپیَنگا
218
+ expected: Khapyangā
219
+
220
+ - source: گَندَه كَلے
221
+ expected: Gandah Kale
222
+
223
+ # - source: گُلونَا ڈھيرِي
224
+ # expected: Gulonā Ḍherī
225
+
226
+ # - source: خيرَه دِين
227
+ # expected: Kherah Dīn
228
+
229
+ - source: مَورپِتھِی
230
+ expected: Maurpithī
231
+
232
+ - source: درے پلارِی
233
+ expected: Dre Plārī
234
+
235
+ - source: آگرَہ
236
+ expected: Āgrah
237
+
238
+ - source: ڈَنڈَر
239
+ expected: Ḍanḍar
240
+
241
+ # - source: گِیدو
242
+ # expected: Gīdo
243
+
244
+ - source: گُبازانَہ
245
+ expected: Gubāzānah
246
+
247
+ # - source: اُوشو
248
+ # expected: Ūsho
249
+
250
+ - source: حَےدَر عَلِی كَلے
251
+ expected: Ḩaidar ‘Alī Kale
252
+
253
+ - source: تَودَہ چِینَہ
254
+ expected: Taudah Chīnah
255
+
256
+ - source: مُوسى خَان كَلے
257
+ expected: Mūsá Khān Kale
258
+
259
+ - source: مُلَّا بَاغ
260
+ expected: Mullā Bāgh
261
+
262
+ map:
263
+ postrules:
264
+ - pattern: (?<=\b)(?<!\b[‘|’|'|-])[\u0061-\uFFFF]
265
+ result: "upcase"
266
+ # don't capitalize definite article in the middle of a sentence
267
+ - pattern : ' At T' # الت
268
+ result: ' at T'
269
+ - pattern : ' As̄ S̄' # الث
270
+ result: ' as̄ S̄'
271
+ - pattern : ' Ad D' # الد
272
+ result: ' ad D'
273
+ - pattern : ' Az̄ Z̄' # الذ
274
+ result: ' az̄ Z̄'
275
+ - pattern : ' Ar R' # الر
276
+ result: ' ar R'
277
+ - pattern : ' Az Z' # الز
278
+ result: ' az Z'
279
+ - pattern : ' As S' # الس
280
+ result: ' as S'
281
+ - pattern : ' Ash Sh' # الش
282
+ result: ' ash Sh'
283
+ - pattern : ' Aş Ş' # الص
284
+ result: ' aş Ş'
285
+ - pattern : ' Aẕ Ẕ' # الض
286
+ result: ' aẕ Ẕ'
287
+ - pattern : ' Aţ Ţ' # الط
288
+ result: ' aţ Ţ'
289
+ - pattern : ' Az̧ Z̧' # الظ
290
+ result: ' az̧ Z̧'
291
+ - pattern : ' Al L' # الل
292
+ result: ' al L'
293
+ - pattern : ' An N' # الن
294
+ result: ' an N'
295
+ - pattern: " Al " # ال
296
+ result: " al "
297
+ characters:
298
+ # special rules
299
+
300
+ '\s(?=\u0622\u0628\u064E\u0627\u062F)': '' # space followed by abad is removed
301
+ '\ufdf2': 'Allāh' # See note 5
302
+
303
+ # Vowels, Diphthongs, and Diacritical Marks
304
+ '\u064e' : 'a' # َ fatha
305
+ '\u064e(?=\u0629)' : '' # َ fatha followed by ta' marboota
306
+ '\u064e(?=a[h|t])' : '' # َ fatha followed by ta' marboota, handling different order of conversion
307
+
308
+ '\u0652' : '' # ْ sokoon
309
+ '\u0659': 'ê'
310
+
311
+ '\u0650[\u064a|\u06cc]' : 'ī' # ـِي kasra followed by ي
312
+ '\u0650' : 'i' # kasra
313
+ '\u06d2' : 'e' # ـے
314
+
315
+ '\u0622' : 'ā' # آ
316
+ '\u064e\u0627' : 'ā' # ـَا fatha followed by ا
317
+ '\u0627' : 'ā' # ا
318
+ '\b\u0627' : '' # ا
319
+
320
+ '\u0648' : 'o' # و # suspect
321
+ '\u064f' : 'u' # ُ damma
322
+ '\u064f\u0648' : 'ū' # ـُو damma followed by و
323
+
324
+ '\u064e\u06d2' : 'ai' # ـے
325
+ '\u064e\u0648' : 'au' # ـَو
326
+ '\u0670': 'á' # ىٰ
327
+ '\u0649': 'á' # ىٰ
328
+
329
+ # shadda
330
+ '\u0628\u0651' : 'bb' # ب
331
+ '\u062a\u0651' : 'tt' # ت
332
+ '\u062b\u0651' : 'thth' # ث
333
+ '\u062c\u0651' : 'jj' # ج
334
+ '\u062d\u0651' : 'ẖẖ' # ح
335
+ '\u062e\u0651' : 'khkh' # خ
336
+ '\u062f\u0651' : 'dd' # د
337
+ '\u0630\u0651' : 'z̄z̄' # ذ
338
+ '\u0631\u0651' : 'rr' # ر
339
+ '\u0632\u0651' : 'zz' # ز
340
+ '\u0633\u0651' : 'ss' # س
341
+ '\u0634\u0651' : 'sh' # ش
342
+ '\u0635\u0651' : 'şş' # ص
343
+ '\u0636\u0651' : 'ḏḏ' # ض
344
+ '\u0637\u0651' : 'ţţ' # ط
345
+ '\u0638\u0651' : 'z̧z̧' # ظ
346
+ '\u063a\u0651' : 'ghgh' # غ
347
+ '\u0641\u0651' : 'ff' # ف
348
+ '\u0642\u0651' : 'qq' # ق
349
+ '\u0643\u0651' : 'kk' # ك
350
+ '\u0644\u0651' : 'll' # ل
351
+ '\u0645\u0651' : 'mm' # م
352
+ '\u0646\u0651' : 'nn' # ن
353
+ '\u0647\u0651' : 'hh' # ه
354
+ '\u0648\u0651' : 'ww' # و
355
+ '[\u064a|\u06cc]\u0651' : 'yy' # ي
356
+
357
+ # NOTE 1
358
+ '\u0650\b' : '-e' # ِ kasra
359
+ '\u0674' : '-e' # ٴ
360
+ '\u0654' : '-e' # ٔ
361
+
362
+ '\u0650\u064a\u0651\u064e' : 'īy' # ـِيَّ
363
+ '\u0650\u064a(?=\u064e|u064f)' : 'iy' # ـِي kasra followed by ي
364
+ '\u064e\u0649' : 'ay' # ـَى fatha followed by ى which is ا not ي
365
+ '\u064e\u0648\u0652' : 'aw' # ـَوْ
366
+ '\u064e\u064a\u0652' : 'ay' # ـَيْ
367
+ '\u0650\u06cc\u0651\u064e' : 'īy' # ـِيَّ
368
+ '\u064e\u064a' : 'aī' # ـَي
369
+ '\u064e\u06cc' : 'aī' # ـَي
370
+ # - '-ye'
371
+
372
+
373
+ # ta' marboota
374
+ '\u0629' : 'at' # ة in the middle of the sentence
375
+ '\u0629$' : 'ah'
376
+ '(?<=\b\u0627\u0644[\u0600-\u06ff]{2})\u0629' : 'ah'
377
+ '(?<=\b\u0627\u0644[\u0600-\u06ff]{3})\u0629' : 'ah'
378
+ '(?<=\b\u0627\u0644[\u0600-\u06ff]{4})\u0629' : 'ah'
379
+ '(?<=\b\u0627\u0644[\u0600-\u06ff]{5})\u0629' : 'ah'
380
+ '(?<=\b\u0627\u0644[\u0600-\u06ff]{6})\u0629' : 'ah'
381
+ '(?<=\b\u0627\u0644[\u0600-\u06ff]{7})\u0629' : 'ah'
382
+ '(?<=\b\u0627\u0644[\u0600-\u06ff]{8})\u0629' : 'ah'
383
+ '(?<=\b\u0627\u0644[\u0600-\u06ff]{9})\u0629' : 'ah'
384
+ '(?<=\b\u0627\u0644[\u0600-\u06ff]{10})\u0629' : 'ah'
385
+ '(?<=\b\u0627\u0644[\u0600-\u06ff]{11})\u0629' : 'ah'
386
+ '(?<=\b\u0627\u0644[\u0600-\u06ff]{12})\u0629' : 'ah'
387
+ '(?<=\b\u0627\u0644[\u0600-\u06ff]{13})\u0629' : 'ah'
388
+
389
+
390
+
391
+ '\u0621' : '’' # ء
392
+ '\u0624' : '’' # ؤ
393
+ '\u0624\b' : '’o' # ؤ
394
+ '\u0626' : '’' # ئ
395
+
396
+ '\u0623' : '' # أ
397
+ '\u0625' : '' # إ
398
+ # See note B
399
+ '\b\u0627\u0644' : 'al ' # ال
400
+ # '\uFE8E' : '' # ﺎ
401
+
402
+ # Sun letters
403
+ '\b\u0627\u0644\u062a' : 'at t' # الت
404
+ '\b\u0627\u0644\u062b' : 'as̄ s̄' # الث
405
+ '\b\u0627\u0644\u062f' : 'ad d' # الد
406
+ '\b\u0627\u0644\u0630' : 'az̄ z̄' # الذ
407
+ '\b\u0627\u0644\u0631' : 'ar r' # الر
408
+ '\b\u0627\u0644\u0632' : 'az z' # الز
409
+ '\b\u0627\u0644\u0633' : 'as s' # الس
410
+ '\b\u0627\u0644\u0634' : 'ash sh' # الش
411
+ '\b\u0627\u0644\u0635' : 'aş ş' # الص
412
+ '\b\u0627\u0644\u0636' : 'aẕ ẕ' # الض
413
+ '\b\u0627\u0644\u0637' : 'aţ ţ' # الط
414
+ '\b\u0627\u0644\u0638' : 'az̧ z̧' # الظ
415
+ '\b\u0627\u0644\u0644' : 'al l' # الل
416
+ '\b\u0627\u0644\u0646' : 'an n' # الن
417
+
418
+
419
+ # consonant characters
420
+
421
+ '\u0628' : 'b' # ب
422
+ '\u067E' : 'p' # پ
423
+ '\u062a' : 't' # ت
424
+ '\u0679' : 'ṭ' # ٹ
425
+ '\u062B' : 's̄' # ث
426
+ '\u062c' : 'j' # ج
427
+ '\u0686' : 'ch' # ‫چ‬
428
+ '\u062d' : 'ḩ' # ح
429
+ '\u062e' : 'kh' # خ
430
+ '\u062f' : 'd' # د
431
+ '\u0688' : 'ḍ' # ‫ڈ
432
+ '\u0630' : 'z̄' # ذ
433
+ '\u0631' : 'r' # ر
434
+ '\u0691' : 'ṛ' # ڑ
435
+ '\u0632' : 'z' # ز
436
+ '\u0698' : 'zh' # ‫ژ‬
437
+ '\u0633' : 's' # س
438
+ '\u0634' : 'sh' # ش
439
+ '\u0635' : 'ş' # ص
440
+ '\u0636' : 'ẕ' # ض
441
+ '\u0637' : 'ţ' # ط
442
+ '\u0638' : 'z̧' # ظ
443
+ '\u0639' : '‘' # ع
444
+ '\u063a' : 'gh' # غ
445
+ '\u0641' : 'f' # ف
446
+ '\u0642' : 'q' # ق
447
+ '\u0643' : 'k' # ك
448
+ '\u06A9' : 'k' # ک
449
+ '\u06AF' : 'g' # ‫گ‬
450
+ '\u0644' : 'l' # ل
451
+ '\u0645' : 'm' # م
452
+ '\u0646' : 'n' # ن
453
+ '\u06BA' : 'ñ' # ڼ
454
+ '[\u0647|\u06c1|\u06be]' : 'h' # ه
455
+ '\u0648' : 'w' # و
456
+ '[\u064a|\u06cc]' : 'y' # ي
457
+ # '\u0649' : 'y' # ي
458
+ '\u06D0' : 'ē' # ې
459
+ '\u06CD' : 'êy' # ‫ۍ‬