interscript 0.1.7 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. checksums.yaml +4 -4
  2. data/README.adoc +1 -3
  3. data/aliases.json +1 -0
  4. data/lib/interscript.rb +8 -3
  5. data/lib/interscript/fs.rb +27 -0
  6. data/lib/interscript/mapping.rb +3 -1
  7. data/lib/interscript/opal.rb +142 -3
  8. data/lib/interscript/opal/entrypoint.rb +8 -0
  9. data/lib/interscript/opal/exports.rb +11 -0
  10. data/lib/interscript/opal/maps.js.erb +2 -4
  11. data/lib/interscript/version.rb +1 -1
  12. data/maps/alalc-ara-Arab-Latn-1997.yaml +5 -5
  13. data/maps/alalc-asm-Deva-Latn-1997.yaml +104 -10
  14. data/maps/alalc-asm-Deva-Latn-2012.yaml +18 -3
  15. data/maps/alalc-aze-Arab-Latn-1997.yaml +376 -0
  16. data/maps/alalc-ben-Beng-Latn-1997.yaml +291 -0
  17. data/maps/alalc-div-Thaa-Latn-1997.yaml +211 -0
  18. data/maps/alalc-hin-Deva-Latn-1997.yaml +102 -10
  19. data/maps/alalc-hin-Deva-Latn-2011.yaml +19 -1
  20. data/maps/alalc-kan-Kana-Latn-1997.yaml +274 -0
  21. data/maps/alalc-kan-Kana-Latn-2011.yaml +63 -0
  22. data/maps/alalc-ori-Orya-Latn-1997.yaml +284 -0
  23. data/maps/alalc-ori-Orya-Latn-2011.yaml +67 -0
  24. data/maps/alalc-pra-Deva-Latn-2012.yaml +2 -2
  25. data/maps/alalc-san-Deva-Latn-2012.yaml +78 -9
  26. data/maps/alalc-tel-Telu-Latn-1997.yaml +284 -0
  27. data/maps/alalc-tel-Telu-Latn-2011.yaml +64 -0
  28. data/maps/az-aze-Cyrl-Latn-1939.yaml +105 -0
  29. data/maps/az-aze-Cyrl-Latn-1958.yaml +45 -0
  30. data/maps/bgnpcgn-ara-Arab-Latn-1956.yaml +3 -1
  31. data/maps/bgnpcgn-aze-Cyrl-Latn-1993.yaml +111 -104
  32. data/maps/bgnpcgn-bal-Arab-Latn-2008.yaml +329 -0
  33. data/maps/bgnpcgn-bul-Cyrl-Latn-1952.yaml +1 -1
  34. data/maps/bgnpcgn-div-Thaa-Latn-1988.yaml +75 -0
  35. data/maps/bgnpcgn-far-Latn-Latn-1964.yaml +28 -0
  36. data/maps/bgnpcgn-isl-Latn-Latn-1964.yaml +37 -0
  37. data/maps/bgnpcgn-kaz-Cyrl-Latn-1979.yaml +247 -0
  38. data/maps/bgnpcgn-kir-Cyrl-Latn-1979.yaml +218 -0
  39. data/maps/bgnpcgn-kur-Arab-Latn-2007.yaml +249 -0
  40. data/maps/bgnpcgn-per-Arab-Latn-1958.yaml +2 -0
  41. data/maps/bgnpcgn-prs-Arab-Latn-2007.yaml +87 -53
  42. data/maps/bgnpcgn-pus-Arab-Latn-1968.yaml +377 -0
  43. data/maps/bgnpcgn-srp-Cyrl-Latn-1962.yaml +73 -0
  44. data/maps/bgnpcgn-urd-Arab-Latn-2007.yaml +459 -0
  45. data/maps/{bis-knd-Knda-Latn-13194-1991.yaml → bis-kan-Kana-Latn-13194-1991.yaml} +2 -2
  46. data/maps/bis-ori-Orya-Latn-13194-1991.yaml +17 -2
  47. data/maps/iso-ara-Arab-Latn-233-1984.yaml +1 -1
  48. data/maps/{iso-kan-Knda-Latn-15919-2001.yaml → iso-kan-Kana-Latn-15919-2001.yaml} +1 -1
  49. data/maps/{mns-mon-Cyrl-Latn-5217-2012.yaml → masm-mon-Cyrl-Latn-5217-2012.yaml} +2 -2
  50. data/maps/{mns-mon-Latn-Cyrl-5217-2012.yaml → masm-mon-Latn-Cyrl-5217-2012.yaml} +1 -1
  51. data/maps/mv-div-Thaa-Latn-1987.yaml +200 -0
  52. data/maps/odni-ara-Arab-Latn-2004.yaml +137 -0
  53. data/maps/odni-ara-Arab-Latn-2015.yaml +20 -130
  54. data/maps/odni-bul-Cyrl-Latn-2005.yaml +90 -0
  55. data/maps/odni-fas-Arab-Latn-2004.yaml +276 -0
  56. data/maps/odni-hin-Deva-Latn-2004.yaml +182 -0
  57. data/maps/odni-mkd-Cyrl-Latn-2005.yaml +21 -0
  58. data/maps/odni-prs-Arab-Latn-2004.yaml +123 -0
  59. data/maps/{odni-per-Arab-Latn-2015.yaml → odni-prs-Arab-Latn-2015.yaml} +0 -0
  60. data/maps/odni-srp-Cyrl-Latn-2005.yaml +36 -0
  61. data/maps/odni-tuk-Cyrl-Latn-2015.yaml +170 -0
  62. data/maps/odni-ukr-Cyrl-Latn-2015.yaml +4 -0
  63. data/maps/un-ara-Arab-Latn-2017.yaml +1 -1
  64. data/maps/un-asm-Beng-Latn-1972.yaml +223 -0
  65. data/maps/un-guj-Gujr-Latn-1972.yaml +229 -0
  66. data/maps/un-hin-Deva-Latn-2016.yaml +104 -10
  67. data/maps/un-kan-Kana-Latn-2016.yaml +254 -0
  68. data/maps/un-mal-Mlym-Latn-1972.yaml +251 -0
  69. data/maps/un-mar-Deva-Latn-2016.yaml +24 -13
  70. data/maps/un-nep-Deva-Latn-1972.yaml +40 -121
  71. data/maps/un-ori-Orya-Latn-1972.yaml +247 -0
  72. data/maps/un-pan-Guru-Latn-1972.yaml +402 -0
  73. data/maps/un-prs-Arab-Latn-1967.yaml +236 -0
  74. data/maps/un-tam-Taml-Latn-1972.yaml +194 -0
  75. data/maps/un-tel-Telu-Latn-1972.yaml +270 -0
  76. data/maps/un-urd-Arab-Latn-1972.yaml +405 -0
  77. data/maps/var-amh-Ethi-Latn-eae-2003.yaml +466 -0
  78. data/maps/var-gez-Ethi-Latn-eae-2003.yaml +76 -0
  79. data/spec/interscript/filenames_spec.rb +6 -369
  80. data/spec/interscript_spec.rb +10 -2
  81. metadata +50 -7
  82. data/lib/interscript/opal/map_translate.rb +0 -7
@@ -0,0 +1,249 @@
1
+ ---
2
+ authority_id: bgnpcgn
3
+ id: 2007
4
+ language: kur
5
+ source_script: Arab
6
+ destination_script: Latn
7
+ name: ROMANIZATION OF KURDISH -- BGN/PCGN 2007
8
+ url: https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/693727/ROMANIZATION_OF_KURDISH.pdf
9
+ creation_date: 2007
10
+ confirmation_date: 2017-12
11
+ description: |
12
+ The tabulation below is applicable to the Kurdish language as a
13
+ whole. It is based for the most part on the Hawar Roman alphabet used
14
+ in the Library of Congress Standard Kurdish Orthography Table, but it
15
+ also incorporates certain non-Hawar elements found in A Kurdish-English
16
+ Dictionary (Taufiq Wahby & C J Edmonds, OUP, 1966). The tabulation
17
+ covers both major varieties of the Kurdish language: Kurmanji and
18
+ Sorani. Kurmanji is spoken principally in Turkey and in Iraq north of
19
+ the Great Zab River (Dahūk/Dihok Governorate). It is generally written
20
+ in Roman script, and usually employs the Roman orthography. Sorani is
21
+ spoken principally in Iraq south of the Great Zab river (Arbīl/Hewlêr
22
+ and As Sulaymānīyah/Slêmanî governorates). It is generally written in
23
+ Perso-Arabic script, and usually employs the Perso-Arabic script
24
+ orthography.
25
+
26
+ Kurdish forms of geographical names in Turkey will usually be found
27
+ in Roman script, and so no romanization process will be required. The
28
+ digraph options for consonant letters '\u0686', '\u0634', and '\u063A'
29
+ will not be encountered for such names. In Iraq, Syria, and Iran,
30
+ Kurdish will usually be encountered in Perso-Arabic script, in which
31
+ case it should be romanized into the corresponding Roman script form.
32
+ Kurdish geographical names for places and features outside Turkey,
33
+ found in Roman script form, should, where necessary and if possible, be
34
+ tailored to fit the orthography of the Romanization shown below and
35
+ should employ the digraph options for consonant letters '\u0686',
36
+ '\u0634', and '\u063A'.
37
+
38
+ notes:
39
+
40
+ - In pure Kurdish words hamza is borne by yā’ ( ئ ) and occurs only
41
+ before initial vowels; it is not romanized. Medial and final hamza in
42
+ Arabic borrowings are romanized by ’ (apostrophe – Unicode encoding
43
+ 2019).
44
+
45
+ - The letters ث ذ ص ض ط ظ do not occur in pure Kurdish words. In Arabic
46
+ borrowings some writers retain these letters, others substitute س ز س ز
47
+ ت ز respectively. Only the letters ط ض and ص are catered for in the
48
+ Library of Congress tabulation, as reflected in lines 16-18 of the
49
+ above Consonant table. Words of obvious Arabic origin occurring in a
50
+ Kurdish toponymic environment will be treated as Kurdish rather than
51
+ Arabic, as will words of other non-Kurdish origins.
52
+
53
+ - The digraph options appearing in rows 6, 15 and 20 of the consonants
54
+ table should be used for Kurdish geographical names in Iraq, Iran, and
55
+ Syria. The single character options should be used for Kurdish
56
+ geographical names in Turkey.
57
+
58
+ - ڨ is used to represent v in foreign words. Some southern Kurdish
59
+ writers use it to represent the v in borrowings from northern Kurdish
60
+ dialects. و is pronounced as a v in the north and as a w elsewhere.
61
+
62
+ - Hā’ can be used as a vowel or a consonant. The initial (ه) and medial
63
+ (forms are used for the consonant h, Consonant table, row 31, while the
64
+ final (ه) and independent (forms are used to represent the vowel e,
65
+ Vowel table, row 1. Therefore, when used as a consonant, the final and
66
+ independent forms of hā’ will be seen as ‘ه’ instead of ‘and ‘ه’,
67
+ respectively. For example, مهه meh, (“month”). When used as ‘e’, the
68
+ hā’ behaves like the letters alif (ا), wāw (و), dāl (د), and rā (ر), in
69
+ that it never joins to the following letter (i.e., it has no medial
70
+ form). Consequently, the following letter will display the initial
71
+ form, e.g. هەولێر Hewlêr (unless there is only one following letter, in
72
+ which case it will be written in the independent form, e.g. ماوەت
73
+ Mawet). As with other vowels (see special rules 2 and 3), initial e is
74
+ preceded by the kursî hamza, yielding initial ئه , e.g. ئهني enî
75
+ “forehead”.
76
+
77
+ - In pure Kurdish words, the vowel ى is always long î, e.g. كانى ماسێ
78
+ Kanî Masê. When it represents îzafe, it is also romanized î and joined
79
+ by means of a hyphen to its preceding word e.g. پارێزگاى دهۆك Parêzga-î
80
+ Dihok.
81
+
82
+ - |
83
+ An inventory of letter-diacritic combinations, used in addition to
84
+ the unmodified letters of the basic Roman script in the Romanization of
85
+ Kurdish, with their Unicode encoding, is:
86
+
87
+ '‘': '2018' , '’': '2019'
88
+ 'Ç': '00C7' , 'ç': '00E7'
89
+ 'Ḍ': '1E0C' , 'ḍ': '1E0D'
90
+ 'Ê': '00CA' , 'ê': '00EA'
91
+
92
+ # There is no single Unicode encoding for these letter-diacritic combinations.
93
+ 'Ḧ': '0048+0308' , 'ḧ': '0068+0308'
94
+ 'Î': '00CE' , 'î': '00EE'
95
+ 'Ł': '0141' , 'ł': '0142'
96
+ 'Ö': '00D6' , 'ö': '00F6'
97
+ 'Ṟ': '1E5E' , 'ṟ': '1E5F'
98
+ 'Ş': '015E' , 'ş': '015F'
99
+ 'Ṣ': '1E62' , 'ṣ': '1E63'
100
+ 'Ṭ': '1E6C' , 'ṭ': '1E6D'
101
+ 'Û': '00DB' , 'û': '00FB'
102
+ 'Ü': '00DC' , 'ü': '00FC'
103
+ 'Ẍ': '1E8C' , 'ẍ': '1E8D'
104
+
105
+ - The Romanization column shows only lowercase forms but, when
106
+ romanizing, uppercase and lowercase Roman letters as appropriate should
107
+ be used.
108
+
109
+ # Special Rules
110
+ - The conjunction و (and) should be rendered u if the
111
+ preceding word ends in a consonant, and w if the preceding
112
+ word ends in a vowel. It should be separated by spaces from
113
+ the preceding and following words.
114
+
115
+ - In the Perso-Arabic orthography for Kurdish, all vowels are
116
+ written, with the exception of the short i, which is
117
+ expressed with a kasrah under the preceding consonant (ِ).
118
+ In Perso-Arabic script, the kasrah will rarely be written (
119
+ e.g., كرن kirin “to do”). Like all Kurdish vowels, the
120
+ short i will be preceded by a kursî hamza (ئ) if it appears
121
+ at the beginning of a word (see 3 below; see row 4 of vowel
122
+ table).
123
+
124
+ - In the Perso-Arabic orthography for Kurdish, when a vowel
125
+ comes at the beginning of a word, or when a vowel directly
126
+ follows another vowel, a kursî hamza (ئ) precedes it (e.g.,
127
+ ئاگر agir “fire”).
128
+
129
+ - A Kurdish word will never start with alif (ا). A Kurdish
130
+ word may begin with a yā’ (ي) or wāw (و), but only when
131
+ they are used as a consonant, when they will be romanized
132
+ as y and w, respectively.
133
+
134
+ - When preceded by a consonant, yā’ (ي) and wāw (و) should be
135
+ romanized î and u, respectively. When preceded by a vowel (
136
+ including short i, which is not written), yā’ (ي) and wāw (و)
137
+ should be romanized y and w, respectively.
138
+
139
+ - The Arabic sign shaddah ( ّ ) denoting a doubled consonant
140
+ is not used in Kurdish; doubled consonants, which are rare,
141
+ are written twice e.g. موحەممەد Muḧemmed; موسەننا Musanna.
142
+ Shaddah might be used in Arabic borrowings but, as in
143
+ unpointed Arabic, would generally be omitted.
144
+
145
+ - Particles such as له le (= at, in, on) and به be (= to,
146
+ for, by, with) should be written separately from their
147
+ following word, e.g. له كوردستانێ Le Kurdistanê “in
148
+ Kurdistan”
149
+
150
+ - Occasionally the character sequences چه, سه and گه occur.
151
+ They may be romanized c·h, s·h, and g·h in order to
152
+ differentiate those romanizations from the digraphs ch, sh,
153
+ and gh.
154
+
155
+ tests:
156
+ - source: كاني ماسێ
157
+ expected: Kanî Masê
158
+
159
+ - source: كِرِن
160
+ expected: Kirin
161
+
162
+ - source: ئاگِر
163
+ expected: Agir
164
+
165
+ - source: موحەممەد
166
+ expected: Muḧemmed
167
+
168
+ # - source: موسەننا # issue 604
169
+ # expected: Musanna
170
+
171
+ - source: لەكوردِستانێ
172
+ expected: Le Kurdistanê
173
+
174
+ map:
175
+ postrules:
176
+ - pattern: (?<=\b)(?<!\b[‘|’|'])[\u0061-\uFFFF]
177
+ result: "upcase"
178
+
179
+ characters:
180
+
181
+ '\u0650' : 'i' # ِ kasra special rule 2
182
+ '\u0644\u06d5' : 'le ' # special rule 7
183
+ '\u0628\u06d5' : 'be ' # special rule 7
184
+ # Note 1
185
+ '\u0621' : '’' # ء
186
+ '\u0624' : '’' # ؤ
187
+ '\u0626' : '’' # ئ
188
+
189
+ "(?<=[\u0621|\u0628|\u067E|\u062A|\u062C|\u0686|\u062D|\u062E|\u062F|\u0631|\u0695|\u0632|\u0698|\u0633|\u0634|\u0635|\u0636|\u0637|\u0639|\u063A|\u0641|\u06A8|\u0642|\u06A9|\u0643|\u06AF|\u0644|\u06B5|\u0645|\u0646|\u0648|\u0647|\u064A])\u0648" : 'u' # special note 4/5
190
+ "(?<=[\u0621|\u0628|\u067E|\u062A|\u062C|\u0686|\u062D|\u062E|\u062F|\u0631|\u0695|\u0632|\u0698|\u0633|\u0634|\u0635|\u0636|\u0637|\u0639|\u063A|\u0641|\u06A8|\u0642|\u06A9|\u0643|\u06AF|\u0644|\u06B5|\u0645|\u0646|\u0648|\u0647|\u064A])\u064A" : 'î' # special note 4/5
191
+ '\u0621': '’' # ء (see note 1 and 7)
192
+ '\u0628': 'b' # ب
193
+ '\u067E': 'p' # پ
194
+ '\u062A': 't' # ت (see note 2)
195
+ '\u062C': 'c' # ج
196
+ '\u0686': # چ (see notes 3 and 7)
197
+ - 'ch'
198
+ - 'ç'
199
+ '\u062D': 'ḧ' # ح
200
+ '\u062E': 'x' # خ
201
+ '\u062F': 'd' # د
202
+ '\u0631': 'r' # ر
203
+ '\u0695': 'ṟ' # ڕ (Formerly written ڒ ڔ or رر according to typeface available; may vary on older sources. See note 7.)
204
+ '\u0632': 'z' # ز (see note 2)
205
+ '\u0698': 'j' # ژ
206
+ '\u0633': 's' # س (see note 2)
207
+ '\u0634': # ش (see notes 3 and 7)
208
+ - 'sh'
209
+ - 'ş'
210
+ '\u0635': 'ṣ' # ص (see notes 2 and 7)
211
+ '\u0636': 'ḍ' # ض (see notes 2 and 7)
212
+ '\u0637': 'ṭ' # ط (see notes 2 and 7)
213
+ '\u0639': '‘' # ع (see note 7)
214
+ '\u063A': # غ (see notes 3 and 7)
215
+ - 'gh'
216
+ - 'ẍ'
217
+ '\u0641': 'f' # ف
218
+ '\u06A8': 'v' # ڨ (see note 4)
219
+ '\u0642': 'q' # ق
220
+ '\u06A9': 'k' # ک
221
+ '\u0643': 'k' # ك
222
+ '\u06AF': 'g' # گ
223
+ '\u0644': 'l' # ل
224
+ '\u06B5': 'ł' # ڵ (Formerly written ڶ according to type available; may vary on older sources. See note 7)
225
+ '\u0645': 'm' # م
226
+ '\u0646': 'n' # ن
227
+ '\u0648': 'w' # و (see note 4)
228
+ '\u0647': 'h' # ه (see note 5)
229
+ '\u064A': 'y' # ي
230
+
231
+ # VOWELS
232
+ '\u0647\b': 'e' # See notes 1 and 5
233
+ '\u06D5': 'e' # See notes 1 and 5
234
+ '\u0626\u06D5': 'e' # See notes 1 and 5
235
+ '\u0627': 'a' # See note 1
236
+ '\u0626\u0627': 'a' # See note 1
237
+ '\u064A': 'î' # See notes 1, 6 and 7
238
+ '\u0626\u064A': 'î' # See notes 1, 6 and 7
239
+ '\u0626': 'i'
240
+ '\u06CE': 'ê' # See note 7
241
+ '\u0626\u06CE': 'ê' # See note 7
242
+ '\u0648': 'u'
243
+ '\u0626\u0648': 'u'
244
+ '\u0648\u0648': 'û' # See note 7
245
+ '\u0626\u0648\u0648': 'û' # See note 7
246
+ '\u06C6': 'o'
247
+ '\u0626\u06C6': 'o'
248
+ '\u0648': 'ö' # Rare; previously written وي . See note 7
249
+ '\u06CA': 'ü' # Only appearing in some dialects and only in old sources. Often equated to /û/ (row 7 above). Sometimes written يو See note 7.
@@ -196,6 +196,8 @@ tests:
196
196
  - source: اللَّه آبَاد
197
197
  expected: Allāhābād
198
198
 
199
+ - source: اِيران
200
+ expected: Īrān
199
201
  map:
200
202
  postrules:
201
203
  - pattern: (?<=\b)(?<!\b[‘|’|'|-])[\u0061-\uFFFF]
@@ -203,172 +203,172 @@ notes:
203
203
 
204
204
  tests:
205
205
  - source: بَغْلان
206
- expected: baghlān
206
+ expected: Baghlān
207
207
 
208
208
  - source: پُوټَكَى
209
- expected: pōṯakay
209
+ expected: Pōṯakay
210
210
 
211
211
  - source: شِيرِين تَگَاب
212
- expected: shīrīn tagāb
212
+ expected: Shīrīn Tagāb
213
213
 
214
214
  - source: کُوْټ
215
- expected: kōṯ
215
+ expected: Kōṯ
216
216
 
217
217
  - source: ثَابِر
218
- expected: s̄ābir
218
+ expected: S̄ābir
219
219
 
220
220
  - source: جَلال آبَاد
221
- expected: jalālābād
221
+ expected: Jalālābād
222
222
 
223
223
  - source: چَارِيكَار
224
- expected: chārīkār
224
+ expected: Chārīkār
225
225
 
226
226
  - source: ځَدْرَاڼ
227
- expected: dzadrāṉ
227
+ expected: Dzadrāṉ
228
228
 
229
229
  - source: څَوکۍ
230
- expected: tsowkêy
230
+ expected: Tsowkêy
231
231
 
232
232
  - source: حَضْرَتِ إِمَام
233
- expected: ḩaẕrat-e imām
233
+ expected: Ḩaẕrat-e Imām
234
234
 
235
235
  - source: خُوْسْت
236
- expected: khōst
236
+ expected: Khōst
237
237
 
238
238
  - source: سْپِين بُوْلْدَک
239
- expected: spīn bōldak
239
+ expected: Spīn Bōldak
240
240
 
241
241
  - source: ډَنْډ وَ پَتَان
242
- expected: ḏanḏ wa patān
242
+ expected: Ḏanḏ Wa Patān
243
243
 
244
244
  # - source: گُذَرْگَاهٔ نور
245
- # expected: guz̄argāh-e nūr
245
+ # expected: Guz̄argāh-e nūr
246
246
 
247
247
  - source: كَنْدَهَار
248
- expected: kandahār
248
+ expected: Kandahār
249
249
 
250
250
  - source: أَنْدَړ
251
- expected: andaṟ
251
+ expected: Andaṟ
252
252
 
253
253
  - source: كُنْدُز
254
- expected: kunduz
254
+ expected: Kunduz
255
255
 
256
256
  - source: مِير أَسْلَم ژْرَنْدَه
257
- expected: mīr aslam zhrandah
257
+ expected: Mīr Aslam Zhrandah
258
258
 
259
259
  - source: ږِيرَه
260
- expected: z͟hīrah
260
+ expected: Z͟hīrah
261
261
 
262
262
  - source: سَمَنْگَان
263
- expected: samangān
263
+ expected: Samangān
264
264
 
265
265
  # - source: مَزَارِ شَريف
266
- # expected: mazār-e sharīf
266
+ # expected: Mazār-e sharīf
267
267
 
268
268
  - source: كښٙتَه كَلا
269
- expected: ks͟hêtah kalā
269
+ expected: Ks͟hêtah Kalā
270
270
 
271
271
  - source: قَيْصَار
272
- expected: qayşār
272
+ expected: Qayşār
273
273
 
274
274
  - source: فَيض آبَاد
275
- expected: faīẕābād
275
+ expected: Faīẕābād
276
276
 
277
277
  - source: حَضْرَتِ سُلْطَان
278
- expected: ḩaẕrat-e sulţān
278
+ expected: Ḩaẕrat-e Sulţān
279
279
 
280
280
  - source: ظَاهِر كَلا
281
- expected: z̧āhir kalā
281
+ expected: Z̧āhir Kalā
282
282
 
283
283
  - source: پُلِ عَلَم
284
- expected: pul-e ‘alam
284
+ expected: Pul-e ‘Alam
285
285
 
286
286
  - source: غَزْنِي
287
- expected: ghaznī
287
+ expected: Ghaznī
288
288
 
289
289
  - source: مَزَارِ شَرِيف
290
- expected: mazār-e sharīf
290
+ expected: Mazār-e Sharīf
291
291
 
292
292
  - source: قَيْصَار
293
- expected: qayşār
293
+ expected: Qayşār
294
294
 
295
295
  - source: كَنْدَهَار
296
- expected: kandahār
296
+ expected: Kandahār
297
297
 
298
298
  - source: گَرْدېز
299
- expected: gardēz
299
+ expected: Gardēz
300
300
 
301
301
  - source: کَابُل
302
- expected: kābul
302
+ expected: Kābul
303
303
 
304
304
  - source: مَيمَنَه
305
- expected: maīmanah
305
+ expected: Maīmanah
306
306
 
307
307
  - source: خَان آبَاد
308
- expected: khānābād
308
+ expected: Khānābād
309
309
 
310
310
  - source: مَاڼۍ
311
- expected: māṉêy
311
+ expected: Māṉêy
312
312
 
313
313
  - source: وَاخَان
314
- expected: wākhān
314
+ expected: Wākhān
315
315
 
316
316
  # - source: هِرَات
317
- # expected: herāt
317
+ # expected: Herāt
318
318
 
319
319
  - source: يَنْگِي قَلعَه
320
- expected: yangī qal‘ah
320
+ expected: Yangī Qal‘ah
321
321
 
322
322
  - source: جَلال آبَاد
323
- expected: jalālābād
323
+ expected: Jalālābād
324
324
 
325
325
  # - source: هِرات پُلِ حِصَار
326
326
  # expected: Herāt Pul-e Ḩişār
327
327
 
328
328
  - source: مُرْغَاب کَابُل
329
- expected: murghāb kābul
329
+ expected: Murghāb Kābul
330
330
 
331
331
  - source: گٙردُون
332
- expected: gêrdōn
332
+ expected: Gêrdōn
333
333
 
334
334
  - source: آب بَنْد
335
- expected: āb band
335
+ expected: Āb Band
336
336
 
337
337
  - source: سْپِين بُوْلْدَک
338
- expected: spīn bōldak
338
+ expected: Spīn Bōldak
339
339
 
340
340
  # - source: بَالا بُلُوک
341
341
  # expected: Bālā Bulūk
342
342
 
343
343
  - source: جَوزجَان
344
- expected: jowzjān
344
+ expected: Jowzjān
345
345
 
346
346
  # - source: غَزْنِى سْپِين
347
- # expected: ghaznī spīn
347
+ # expected: Ghaznī spīn
348
348
 
349
349
  # - source: ريگ مَيوَنْد
350
350
  # expected: Maywand, Rēg
351
351
 
352
352
  - source: گَرْدېز
353
- expected: gardēz
353
+ expected: Gardēz
354
354
 
355
355
  - source: مَیدان شَهْر
356
- expected: maīdān shahr
356
+ expected: Maīdān Shahr
357
357
 
358
358
  - source: ډَنْډِ سُفْلىٰ
359
- expected: ḏanḏ-e suflá
359
+ expected: Ḏanḏ-e Suflá
360
360
 
361
361
  # - source: څَوْکۍ
362
362
  # expected: Tsowkêy
363
363
 
364
364
  # - source: هَوائِي ډَگَر
365
- # expected: hawā’ī ḏagar
365
+ # expected: Hawā’ī ḏagar
366
366
 
367
367
  # - source: مَزارِ شَريف
368
- # expected: mazār-e sharīf
368
+ # expected: Mazār-e sharīf
369
369
 
370
370
  # - source: دايکندی
371
- # expected: dāykundī
371
+ # expected: Dāykundī
372
372
 
373
373
  # - source: زيارت
374
374
  # expected: Zīārat
@@ -380,9 +380,43 @@ tests:
380
380
  # expected: Myā
381
381
 
382
382
  - source: جَبَل السَرَاج
383
- expected: jabal as sarāj
383
+ expected: Jabal as Sarāj
384
384
 
385
385
  map:
386
+ postrules:
387
+ - pattern: (?<=\b)(?<!\b[‘|’|'|-])[\u0061-\uFFFF]
388
+ result: "upcase"
389
+ # don't capitalize definite article in the middle of a sentence
390
+ - pattern : ' At T' # الت
391
+ result: ' at T'
392
+ - pattern : ' As̄ S̄' # الث
393
+ result: ' as̄ S̄'
394
+ - pattern : ' Ad D' # الد
395
+ result: ' ad D'
396
+ - pattern : ' Az̄ Z̄' # الذ
397
+ result: ' az̄ Z̄'
398
+ - pattern : ' Ar R' # الر
399
+ result: ' ar R'
400
+ - pattern : ' Az Z' # الز
401
+ result: ' az Z'
402
+ - pattern : ' As S' # الس
403
+ result: ' as S'
404
+ - pattern : ' Ash Sh' # الش
405
+ result: ' ash Sh'
406
+ - pattern : ' Aş Ş' # الص
407
+ result: ' aş Ş'
408
+ - pattern : ' Aẕ Ẕ' # الض
409
+ result: ' aẕ Ẕ'
410
+ - pattern : ' Aţ Ţ' # الط
411
+ result: ' aţ Ţ'
412
+ - pattern : ' Az̧ Z̧' # الظ
413
+ result: ' az̧ Z̧'
414
+ - pattern : ' Al L' # الل
415
+ result: ' al L'
416
+ - pattern : ' An N' # الن
417
+ result: ' an N'
418
+ - pattern: " Al " # ال
419
+ result: " al "
386
420
  characters:
387
421
 
388
422
  # word-medial or word-final form where so appearing in a word.