interscript 0.1.7 → 0.1.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.adoc +1 -3
- data/aliases.json +1 -0
- data/lib/interscript.rb +8 -3
- data/lib/interscript/fs.rb +27 -0
- data/lib/interscript/mapping.rb +3 -1
- data/lib/interscript/opal.rb +142 -3
- data/lib/interscript/opal/entrypoint.rb +8 -0
- data/lib/interscript/opal/exports.rb +11 -0
- data/lib/interscript/opal/maps.js.erb +2 -4
- data/lib/interscript/version.rb +1 -1
- data/maps/alalc-ara-Arab-Latn-1997.yaml +5 -5
- data/maps/alalc-asm-Deva-Latn-1997.yaml +104 -10
- data/maps/alalc-asm-Deva-Latn-2012.yaml +18 -3
- data/maps/alalc-aze-Arab-Latn-1997.yaml +376 -0
- data/maps/alalc-ben-Beng-Latn-1997.yaml +291 -0
- data/maps/alalc-div-Thaa-Latn-1997.yaml +211 -0
- data/maps/alalc-hin-Deva-Latn-1997.yaml +102 -10
- data/maps/alalc-hin-Deva-Latn-2011.yaml +19 -1
- data/maps/alalc-kan-Kana-Latn-1997.yaml +274 -0
- data/maps/alalc-kan-Kana-Latn-2011.yaml +63 -0
- data/maps/alalc-ori-Orya-Latn-1997.yaml +284 -0
- data/maps/alalc-ori-Orya-Latn-2011.yaml +67 -0
- data/maps/alalc-pra-Deva-Latn-2012.yaml +2 -2
- data/maps/alalc-san-Deva-Latn-2012.yaml +78 -9
- data/maps/alalc-tel-Telu-Latn-1997.yaml +284 -0
- data/maps/alalc-tel-Telu-Latn-2011.yaml +64 -0
- data/maps/az-aze-Cyrl-Latn-1939.yaml +105 -0
- data/maps/az-aze-Cyrl-Latn-1958.yaml +45 -0
- data/maps/bgnpcgn-ara-Arab-Latn-1956.yaml +3 -1
- data/maps/bgnpcgn-aze-Cyrl-Latn-1993.yaml +111 -104
- data/maps/bgnpcgn-bal-Arab-Latn-2008.yaml +329 -0
- data/maps/bgnpcgn-bul-Cyrl-Latn-1952.yaml +1 -1
- data/maps/bgnpcgn-div-Thaa-Latn-1988.yaml +75 -0
- data/maps/bgnpcgn-far-Latn-Latn-1964.yaml +28 -0
- data/maps/bgnpcgn-isl-Latn-Latn-1964.yaml +37 -0
- data/maps/bgnpcgn-kaz-Cyrl-Latn-1979.yaml +247 -0
- data/maps/bgnpcgn-kir-Cyrl-Latn-1979.yaml +218 -0
- data/maps/bgnpcgn-kur-Arab-Latn-2007.yaml +249 -0
- data/maps/bgnpcgn-per-Arab-Latn-1958.yaml +2 -0
- data/maps/bgnpcgn-prs-Arab-Latn-2007.yaml +87 -53
- data/maps/bgnpcgn-pus-Arab-Latn-1968.yaml +377 -0
- data/maps/bgnpcgn-srp-Cyrl-Latn-1962.yaml +73 -0
- data/maps/bgnpcgn-urd-Arab-Latn-2007.yaml +459 -0
- data/maps/{bis-knd-Knda-Latn-13194-1991.yaml → bis-kan-Kana-Latn-13194-1991.yaml} +2 -2
- data/maps/bis-ori-Orya-Latn-13194-1991.yaml +17 -2
- data/maps/iso-ara-Arab-Latn-233-1984.yaml +1 -1
- data/maps/{iso-kan-Knda-Latn-15919-2001.yaml → iso-kan-Kana-Latn-15919-2001.yaml} +1 -1
- data/maps/{mns-mon-Cyrl-Latn-5217-2012.yaml → masm-mon-Cyrl-Latn-5217-2012.yaml} +2 -2
- data/maps/{mns-mon-Latn-Cyrl-5217-2012.yaml → masm-mon-Latn-Cyrl-5217-2012.yaml} +1 -1
- data/maps/mv-div-Thaa-Latn-1987.yaml +200 -0
- data/maps/odni-ara-Arab-Latn-2004.yaml +137 -0
- data/maps/odni-ara-Arab-Latn-2015.yaml +20 -130
- data/maps/odni-bul-Cyrl-Latn-2005.yaml +90 -0
- data/maps/odni-fas-Arab-Latn-2004.yaml +276 -0
- data/maps/odni-hin-Deva-Latn-2004.yaml +182 -0
- data/maps/odni-mkd-Cyrl-Latn-2005.yaml +21 -0
- data/maps/odni-prs-Arab-Latn-2004.yaml +123 -0
- data/maps/{odni-per-Arab-Latn-2015.yaml → odni-prs-Arab-Latn-2015.yaml} +0 -0
- data/maps/odni-srp-Cyrl-Latn-2005.yaml +36 -0
- data/maps/odni-tuk-Cyrl-Latn-2015.yaml +170 -0
- data/maps/odni-ukr-Cyrl-Latn-2015.yaml +4 -0
- data/maps/un-ara-Arab-Latn-2017.yaml +1 -1
- data/maps/un-asm-Beng-Latn-1972.yaml +223 -0
- data/maps/un-guj-Gujr-Latn-1972.yaml +229 -0
- data/maps/un-hin-Deva-Latn-2016.yaml +104 -10
- data/maps/un-kan-Kana-Latn-2016.yaml +254 -0
- data/maps/un-mal-Mlym-Latn-1972.yaml +251 -0
- data/maps/un-mar-Deva-Latn-2016.yaml +24 -13
- data/maps/un-nep-Deva-Latn-1972.yaml +40 -121
- data/maps/un-ori-Orya-Latn-1972.yaml +247 -0
- data/maps/un-pan-Guru-Latn-1972.yaml +402 -0
- data/maps/un-prs-Arab-Latn-1967.yaml +236 -0
- data/maps/un-tam-Taml-Latn-1972.yaml +194 -0
- data/maps/un-tel-Telu-Latn-1972.yaml +270 -0
- data/maps/un-urd-Arab-Latn-1972.yaml +405 -0
- data/maps/var-amh-Ethi-Latn-eae-2003.yaml +466 -0
- data/maps/var-gez-Ethi-Latn-eae-2003.yaml +76 -0
- data/spec/interscript/filenames_spec.rb +6 -369
- data/spec/interscript_spec.rb +10 -2
- metadata +50 -7
- data/lib/interscript/opal/map_translate.rb +0 -7
@@ -0,0 +1,249 @@
|
|
1
|
+
---
|
2
|
+
authority_id: bgnpcgn
|
3
|
+
id: 2007
|
4
|
+
language: kur
|
5
|
+
source_script: Arab
|
6
|
+
destination_script: Latn
|
7
|
+
name: ROMANIZATION OF KURDISH -- BGN/PCGN 2007
|
8
|
+
url: https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/693727/ROMANIZATION_OF_KURDISH.pdf
|
9
|
+
creation_date: 2007
|
10
|
+
confirmation_date: 2017-12
|
11
|
+
description: |
|
12
|
+
The tabulation below is applicable to the Kurdish language as a
|
13
|
+
whole. It is based for the most part on the Hawar Roman alphabet used
|
14
|
+
in the Library of Congress Standard Kurdish Orthography Table, but it
|
15
|
+
also incorporates certain non-Hawar elements found in A Kurdish-English
|
16
|
+
Dictionary (Taufiq Wahby & C J Edmonds, OUP, 1966). The tabulation
|
17
|
+
covers both major varieties of the Kurdish language: Kurmanji and
|
18
|
+
Sorani. Kurmanji is spoken principally in Turkey and in Iraq north of
|
19
|
+
the Great Zab River (Dahūk/Dihok Governorate). It is generally written
|
20
|
+
in Roman script, and usually employs the Roman orthography. Sorani is
|
21
|
+
spoken principally in Iraq south of the Great Zab river (Arbīl/Hewlêr
|
22
|
+
and As Sulaymānīyah/Slêmanî governorates). It is generally written in
|
23
|
+
Perso-Arabic script, and usually employs the Perso-Arabic script
|
24
|
+
orthography.
|
25
|
+
|
26
|
+
Kurdish forms of geographical names in Turkey will usually be found
|
27
|
+
in Roman script, and so no romanization process will be required. The
|
28
|
+
digraph options for consonant letters '\u0686', '\u0634', and '\u063A'
|
29
|
+
will not be encountered for such names. In Iraq, Syria, and Iran,
|
30
|
+
Kurdish will usually be encountered in Perso-Arabic script, in which
|
31
|
+
case it should be romanized into the corresponding Roman script form.
|
32
|
+
Kurdish geographical names for places and features outside Turkey,
|
33
|
+
found in Roman script form, should, where necessary and if possible, be
|
34
|
+
tailored to fit the orthography of the Romanization shown below and
|
35
|
+
should employ the digraph options for consonant letters '\u0686',
|
36
|
+
'\u0634', and '\u063A'.
|
37
|
+
|
38
|
+
notes:
|
39
|
+
|
40
|
+
- In pure Kurdish words hamza is borne by yā’ ( ئ ) and occurs only
|
41
|
+
before initial vowels; it is not romanized. Medial and final hamza in
|
42
|
+
Arabic borrowings are romanized by ’ (apostrophe – Unicode encoding
|
43
|
+
2019).
|
44
|
+
|
45
|
+
- The letters ث ذ ص ض ط ظ do not occur in pure Kurdish words. In Arabic
|
46
|
+
borrowings some writers retain these letters, others substitute س ز س ز
|
47
|
+
ت ز respectively. Only the letters ط ض and ص are catered for in the
|
48
|
+
Library of Congress tabulation, as reflected in lines 16-18 of the
|
49
|
+
above Consonant table. Words of obvious Arabic origin occurring in a
|
50
|
+
Kurdish toponymic environment will be treated as Kurdish rather than
|
51
|
+
Arabic, as will words of other non-Kurdish origins.
|
52
|
+
|
53
|
+
- The digraph options appearing in rows 6, 15 and 20 of the consonants
|
54
|
+
table should be used for Kurdish geographical names in Iraq, Iran, and
|
55
|
+
Syria. The single character options should be used for Kurdish
|
56
|
+
geographical names in Turkey.
|
57
|
+
|
58
|
+
- ڨ is used to represent v in foreign words. Some southern Kurdish
|
59
|
+
writers use it to represent the v in borrowings from northern Kurdish
|
60
|
+
dialects. و is pronounced as a v in the north and as a w elsewhere.
|
61
|
+
|
62
|
+
- Hā’ can be used as a vowel or a consonant. The initial (ه) and medial
|
63
|
+
(forms are used for the consonant h, Consonant table, row 31, while the
|
64
|
+
final (ه) and independent (forms are used to represent the vowel e,
|
65
|
+
Vowel table, row 1. Therefore, when used as a consonant, the final and
|
66
|
+
independent forms of hā’ will be seen as ‘ه’ instead of ‘and ‘ه’,
|
67
|
+
respectively. For example, مهه meh, (“month”). When used as ‘e’, the
|
68
|
+
hā’ behaves like the letters alif (ا) , wāw, dāl (د) , and rā (ر) , in
|
69
|
+
that it never joins to the following letter (i.e., it has no medial
|
70
|
+
form). Consequently, the following letter will display the initial
|
71
|
+
form, e.g. هەولێر Hewlêr (unless there is only one following letter, in
|
72
|
+
which case it will be written in the independent form, e.g. ماوەت
|
73
|
+
Mawet). As with other vowels (see special rules 2 and 3), initial e is
|
74
|
+
preceded by the kursî hamza, yielding initial ئه , e.g. ئهني enî
|
75
|
+
“forehead”.
|
76
|
+
|
77
|
+
- In pure Kurdish words, the vowel ى is always long î, e.g. كانى ماسێ
|
78
|
+
Kanî Masê. When it represents îzafe, it is also romanized î and joined
|
79
|
+
by means of a hyphen to its preceding word e.g. پارێزگاى دهۆك Parêzga-î
|
80
|
+
Dihok.
|
81
|
+
|
82
|
+
- |
|
83
|
+
An inventory of letter-diacritic combinations, used in addition to
|
84
|
+
the unmodified letters of the basic Roman script in the Romanization of
|
85
|
+
Kurdish, with their Unicode encoding, is:
|
86
|
+
|
87
|
+
'‘': '2018' , '’': '2019'
|
88
|
+
'Ç': '00C7' , 'ç': '00E7'
|
89
|
+
'Ḍ': '1E0C' , 'ḍ': '1E0D'
|
90
|
+
'Ê': '00CA' , 'ê': '00EA'
|
91
|
+
|
92
|
+
# There is no single Unicode encoding for these letter-diacritic combinations.
|
93
|
+
'Ḧ': '0048+0308' , 'ḧ': '0068+0308'
|
94
|
+
'Î': '00CE' , 'î': '00EE'
|
95
|
+
'Ł': '0141' , 'ł': '0142'
|
96
|
+
'Ö': '00D6' , 'ö': '00F6'
|
97
|
+
'Ṟ': '1E5E' , 'ṟ': '1E5F'
|
98
|
+
'Ş': '015E' , 'ş': '015F'
|
99
|
+
'Ṣ': '1E62' , 'ṣ': '1E63'
|
100
|
+
'Ṭ': '1E6C' , 'ṭ': '1E6D'
|
101
|
+
'Û': '00DB' , 'û': '00FB'
|
102
|
+
'Ü': '00DC' , 'ü': '00FC'
|
103
|
+
'Ẍ': '1E8C' , 'ẍ': '1E8D'
|
104
|
+
|
105
|
+
- The Romanization column shows only lowercase forms but, when
|
106
|
+
romanizing, uppercase and lowercase Roman letters as appropriate should
|
107
|
+
be used.
|
108
|
+
|
109
|
+
# Special Rules
|
110
|
+
- The conjunction و (and) should be rendered u if the
|
111
|
+
preceding word ends in a consonant, and w if the preceding
|
112
|
+
word ends in a vowel. It should be separated by spaces from
|
113
|
+
the preceding and following words.
|
114
|
+
|
115
|
+
- In the Perso-Arabic orthography for Kurdish, all vowels are
|
116
|
+
written, with the exception of the short i, which is
|
117
|
+
expressed with a kasrah under the preceding consonant (ِ).
|
118
|
+
In Perso-Arabic script, the kasrah will rarely be written (
|
119
|
+
e.g., كرن kirin “to do”). Like all Kurdish vowels, the
|
120
|
+
short i will be preceded by a kursî hamza (ئ) if it appears
|
121
|
+
at the beginning of a word (see 3 below; see row 4 of vowel
|
122
|
+
table).
|
123
|
+
|
124
|
+
- In the Perso-Arabic orthography for Kurdish, when a vowel
|
125
|
+
comes at the beginning of a word, or when a vowel directly
|
126
|
+
follows another vowel, a kursî hamza (ئ) precedes it (e.g.,
|
127
|
+
ئاگر agir “fire”).
|
128
|
+
|
129
|
+
- A Kurdish word will never start with alif (ا). A Kurdish
|
130
|
+
word may begin with a yā’ (ي) or wāw (و), but only when
|
131
|
+
they are used as a consonant, when they will be romanized
|
132
|
+
as y and w, respectively.
|
133
|
+
|
134
|
+
- When preceded by a consonant, yā’ (ي) and wāw (و) should be
|
135
|
+
romanized î and u, respectively. When preceded by a vowel (
|
136
|
+
including short i, which is not written), yā’ (ي) and wāw (
|
137
|
+
و) should be romanized y and w, respectively.
|
138
|
+
|
139
|
+
- The Arabic sign shaddah ( ّ ) denoting a doubled consonant
|
140
|
+
is not used in Kurdish; doubled consonants, which are rare,
|
141
|
+
are written twice e.g. موحەممەد Muḧemmed; موسەننا Musanna.
|
142
|
+
Shaddah might be used in Arabic borrowings but, as in
|
143
|
+
unpointed Arabic, would generally be omitted.
|
144
|
+
|
145
|
+
- Particles such as له le (= at, in, on) and به be (= to,
|
146
|
+
for, by, with) should be written separately from their
|
147
|
+
following word, e.g. له كوردستانێ Le Kurdistanê “in
|
148
|
+
Kurdistan”
|
149
|
+
|
150
|
+
- Occasionally the character sequences چه, سه and گه occur.
|
151
|
+
They may be romanized c·h, s·h, and g·h in order to
|
152
|
+
differentiate those romanizations from the digraphs ch, sh,
|
153
|
+
and gh.
|
154
|
+
|
155
|
+
tests:
|
156
|
+
- source: كاني ماسێ
|
157
|
+
expected: Kanî Masê
|
158
|
+
|
159
|
+
- source: كِرِن
|
160
|
+
expected: Kirin
|
161
|
+
|
162
|
+
- source: ئاگِر
|
163
|
+
expected: Agir
|
164
|
+
|
165
|
+
- source: موحەممەد
|
166
|
+
expected: Muḧemmed
|
167
|
+
|
168
|
+
# - source: موسەننا # issue 604
|
169
|
+
# expected: Musanna
|
170
|
+
|
171
|
+
- source: لەكوردِستانێ
|
172
|
+
expected: Le Kurdistanê
|
173
|
+
|
174
|
+
map:
|
175
|
+
postrules:
|
176
|
+
- pattern: (?<=\b)(?<!\b[‘|’|'])[\u0061-\uFFFF]
|
177
|
+
result: "upcase"
|
178
|
+
|
179
|
+
characters:
|
180
|
+
|
181
|
+
'\u0650' : 'i' # ِ kasra special rule 2
|
182
|
+
'\u0644\u06d5' : 'le ' # special rule 7
|
183
|
+
'\u0628\u06d5' : 'be ' # special rule 7
|
184
|
+
# Note 1
|
185
|
+
'\u0621' : '’' # ء
|
186
|
+
'\u0624' : '’' # ؤ
|
187
|
+
'\u0626' : '’' # ئ
|
188
|
+
|
189
|
+
"(?<=[\u0621|\u0628|\u067E|\u062A|\u062C|\u0686|\u062D|\u062E|\u062F|\u0631|\u0695|\u0632|\u0698|\u0633|\u0634|\u0635|\u0636|\u0637|\u0639|\u063A|\u0341|\u06A8|\u0642|\u06A9|\u0643|\u06AF|\u0644|\u06B5|\u0645|\u0646|\u0648|\u0647|\u064A])\u0648" : 'u' # special note 4/5
|
190
|
+
"(?<=[\u0621|\u0628|\u067E|\u062A|\u062C|\u0686|\u062D|\u062E|\u062F|\u0631|\u0695|\u0632|\u0698|\u0633|\u0634|\u0635|\u0636|\u0637|\u0639|\u063A|\u0341|\u06A8|\u0642|\u06A9|\u0643|\u06AF|\u0644|\u06B5|\u0645|\u0646|\u0648|\u0647|\u064A])\u064A" : 'î' # special note 4/5
|
191
|
+
'\u0621': '’' # ء (see note 1 and 7)
|
192
|
+
'\u0628': 'b' # ب
|
193
|
+
'\u067E': 'p' # پ
|
194
|
+
'\u062A': 't' # ت (see note 2)
|
195
|
+
'\u062C': 'c' # ج
|
196
|
+
'\u0686': # چ (see notes 3 and 7)
|
197
|
+
- 'ch'
|
198
|
+
- 'ç'
|
199
|
+
'\u062D': 'ḧ' # ح
|
200
|
+
'\u062E': 'x' # خ
|
201
|
+
'\u062F': 'd' # د
|
202
|
+
'\u0631': 'r' # ر
|
203
|
+
'\u0695': 'ṟ' # ڕ (Formerly written ڒ ڔ or رر according to typeface available; may vary on older sources. See note 7.)
|
204
|
+
'\u0632': 'z' # ز (see note 2)
|
205
|
+
'\u0698': 'j' # ژ
|
206
|
+
'\u0633': 's' # س (see note 2)
|
207
|
+
'\u0634': # ش (see notes 3 and 7)
|
208
|
+
- 'sh'
|
209
|
+
- 'ş'
|
210
|
+
'\u0635': 'ṣ' # ص (see notes 2 and 7)
|
211
|
+
'\u0636': 'ḍ' # ض (see notes 2 and 7)
|
212
|
+
'\u0637': 'ṭ' # ط (see notes 2 and 7)
|
213
|
+
'\u0639': '‘' # ع (see note 7)
|
214
|
+
'\u063A': # غ (see notes 3 and 7)
|
215
|
+
- 'gh'
|
216
|
+
- 'ẍ'
|
217
|
+
'\u0641': 'f' # ف
|
218
|
+
'\u06A8': 'v' # ڨ (see note 4)
|
219
|
+
'\u0642': 'q' # ق
|
220
|
+
'\u06A9': 'k' # ک
|
221
|
+
'\u0643': 'k' # ك
|
222
|
+
'\u06AF': 'g' # گ
|
223
|
+
'\u0644': 'l' # ل
|
224
|
+
'\u06B5': 'ł' # ڵ (Formerly written ڶ according to type available; may vary on older sources. See note 7)
|
225
|
+
'\u0645': 'm' # م
|
226
|
+
'\u0646': 'n' # ن
|
227
|
+
'\u0648': 'w' # و (see note 4)
|
228
|
+
'\u0647': 'h' # ه (see note 5)
|
229
|
+
'\u064A': 'y' # ي
|
230
|
+
|
231
|
+
# VOWELS
|
232
|
+
'\u0647\b': 'e' # See notes 1 and 5
|
233
|
+
'\u06D5': 'e' # See notes 1 and 5
|
234
|
+
'\u0626\u06D5': 'e' # See notes 1 and 5
|
235
|
+
'\u0627': 'a' # See note 1
|
236
|
+
'\u0626\u0627': 'a' # See note 1
|
237
|
+
'\u064A': 'î' # See notes 1, 6 and 7
|
238
|
+
'\u0626\u064A': 'î' # See notes 1, 6 and 7
|
239
|
+
'\u0626': 'i'
|
240
|
+
'\u06CE': 'ê' # See note 7
|
241
|
+
'\u0626\u06CE': 'ê' # See note 7
|
242
|
+
'\u0648': 'u'
|
243
|
+
'\u0626\u0648': 'u'
|
244
|
+
'\u0648\u0648': 'û' # See note 7
|
245
|
+
'\u0626\u0648\u0648': 'û' # See note 7
|
246
|
+
'\u06C6': 'o'
|
247
|
+
'\u0626\u06C6': 'o'
|
248
|
+
'\u0648': 'ö' # Rare; previously written وي . See note 7
|
249
|
+
'\u06CA': 'ü' # Only appearing in some dialects and only in old sources. Often equated to /û/ (row 7 above). Sometimes written يو See note 7.
|
@@ -203,172 +203,172 @@ notes:
|
|
203
203
|
|
204
204
|
tests:
|
205
205
|
- source: بَغْلان
|
206
|
-
expected:
|
206
|
+
expected: Baghlān
|
207
207
|
|
208
208
|
- source: پُوټَكَى
|
209
|
-
expected:
|
209
|
+
expected: Pōṯakay
|
210
210
|
|
211
211
|
- source: شِيرِين تَگَاب
|
212
|
-
expected:
|
212
|
+
expected: Shīrīn Tagāb
|
213
213
|
|
214
214
|
- source: کُوْټ
|
215
|
-
expected:
|
215
|
+
expected: Kōṯ
|
216
216
|
|
217
217
|
- source: ثَابِر
|
218
|
-
expected:
|
218
|
+
expected: S̄ābir
|
219
219
|
|
220
220
|
- source: جَلال آبَاد
|
221
|
-
expected:
|
221
|
+
expected: Jalālābād
|
222
222
|
|
223
223
|
- source: چَارِيكَار
|
224
|
-
expected:
|
224
|
+
expected: Chārīkār
|
225
225
|
|
226
226
|
- source: ځَدْرَاڼ
|
227
|
-
expected:
|
227
|
+
expected: Dzadrāṉ
|
228
228
|
|
229
229
|
- source: څَوکۍ
|
230
|
-
expected:
|
230
|
+
expected: Tsowkêy
|
231
231
|
|
232
232
|
- source: حَضْرَتِ إِمَام
|
233
|
-
expected:
|
233
|
+
expected: Ḩaẕrat-e Imām
|
234
234
|
|
235
235
|
- source: خُوْسْت
|
236
|
-
expected:
|
236
|
+
expected: Khōst
|
237
237
|
|
238
238
|
- source: سْپِين بُوْلْدَک
|
239
|
-
expected:
|
239
|
+
expected: Spīn Bōldak
|
240
240
|
|
241
241
|
- source: ډَنْډ وَ پَتَان
|
242
|
-
expected:
|
242
|
+
expected: Ḏanḏ Wa Patān
|
243
243
|
|
244
244
|
# - source: گُذَرْگَاهٔ نور
|
245
|
-
# expected:
|
245
|
+
# expected: Guz̄argāh-e nūr
|
246
246
|
|
247
247
|
- source: كَنْدَهَار
|
248
|
-
expected:
|
248
|
+
expected: Kandahār
|
249
249
|
|
250
250
|
- source: أَنْدَړ
|
251
|
-
expected:
|
251
|
+
expected: Andaṟ
|
252
252
|
|
253
253
|
- source: كُنْدُز
|
254
|
-
expected:
|
254
|
+
expected: Kunduz
|
255
255
|
|
256
256
|
- source: مِير أَسْلَم ژْرَنْدَه
|
257
|
-
expected:
|
257
|
+
expected: Mīr Aslam Zhrandah
|
258
258
|
|
259
259
|
- source: ږِيرَه
|
260
|
-
expected:
|
260
|
+
expected: Z͟hīrah
|
261
261
|
|
262
262
|
- source: سَمَنْگَان
|
263
|
-
expected:
|
263
|
+
expected: Samangān
|
264
264
|
|
265
265
|
# - source: مَزَارِ شَريف
|
266
|
-
# expected:
|
266
|
+
# expected: Mazār-e sharīf
|
267
267
|
|
268
268
|
- source: كښٙتَه كَلا
|
269
|
-
expected:
|
269
|
+
expected: Ks͟hêtah Kalā
|
270
270
|
|
271
271
|
- source: قَيْصَار
|
272
|
-
expected:
|
272
|
+
expected: Qayşār
|
273
273
|
|
274
274
|
- source: فَيض آبَاد
|
275
|
-
expected:
|
275
|
+
expected: Faīẕābād
|
276
276
|
|
277
277
|
- source: حَضْرَتِ سُلْطَان
|
278
|
-
expected:
|
278
|
+
expected: Ḩaẕrat-e Sulţān
|
279
279
|
|
280
280
|
- source: ظَاهِر كَلا
|
281
|
-
expected:
|
281
|
+
expected: Z̧āhir Kalā
|
282
282
|
|
283
283
|
- source: پُلِ عَلَم
|
284
|
-
expected:
|
284
|
+
expected: Pul-e ‘Alam
|
285
285
|
|
286
286
|
- source: غَزْنِي
|
287
|
-
expected:
|
287
|
+
expected: Ghaznī
|
288
288
|
|
289
289
|
- source: مَزَارِ شَرِيف
|
290
|
-
expected:
|
290
|
+
expected: Mazār-e Sharīf
|
291
291
|
|
292
292
|
- source: قَيْصَار
|
293
|
-
expected:
|
293
|
+
expected: Qayşār
|
294
294
|
|
295
295
|
- source: كَنْدَهَار
|
296
|
-
expected:
|
296
|
+
expected: Kandahār
|
297
297
|
|
298
298
|
- source: گَرْدېز
|
299
|
-
expected:
|
299
|
+
expected: Gardēz
|
300
300
|
|
301
301
|
- source: کَابُل
|
302
|
-
expected:
|
302
|
+
expected: Kābul
|
303
303
|
|
304
304
|
- source: مَيمَنَه
|
305
|
-
expected:
|
305
|
+
expected: Maīmanah
|
306
306
|
|
307
307
|
- source: خَان آبَاد
|
308
|
-
expected:
|
308
|
+
expected: Khānābād
|
309
309
|
|
310
310
|
- source: مَاڼۍ
|
311
|
-
expected:
|
311
|
+
expected: Māṉêy
|
312
312
|
|
313
313
|
- source: وَاخَان
|
314
|
-
expected:
|
314
|
+
expected: Wākhān
|
315
315
|
|
316
316
|
# - source: هِرَات
|
317
|
-
# expected:
|
317
|
+
# expected: Herāt
|
318
318
|
|
319
319
|
- source: يَنْگِي قَلعَه
|
320
|
-
expected:
|
320
|
+
expected: Yangī Qal‘ah
|
321
321
|
|
322
322
|
- source: جَلال آبَاد
|
323
|
-
expected:
|
323
|
+
expected: Jalālābād
|
324
324
|
|
325
325
|
# - source: هِرات پُلِ حِصَار
|
326
326
|
# expected: Herāt Pul-e Ḩişār
|
327
327
|
|
328
328
|
- source: مُرْغَاب کَابُل
|
329
|
-
expected:
|
329
|
+
expected: Murghāb Kābul
|
330
330
|
|
331
331
|
- source: گٙردُون
|
332
|
-
expected:
|
332
|
+
expected: Gêrdōn
|
333
333
|
|
334
334
|
- source: آب بَنْد
|
335
|
-
expected:
|
335
|
+
expected: Āb Band
|
336
336
|
|
337
337
|
- source: سْپِين بُوْلْدَک
|
338
|
-
expected:
|
338
|
+
expected: Spīn Bōldak
|
339
339
|
|
340
340
|
# - source: بَالا بُلُوک
|
341
341
|
# expected: Bālā Bulūk
|
342
342
|
|
343
343
|
- source: جَوزجَان
|
344
|
-
expected:
|
344
|
+
expected: Jowzjān
|
345
345
|
|
346
346
|
# - source: غَزْنِى سْپِين
|
347
|
-
# expected:
|
347
|
+
# expected: Ghaznī spīn
|
348
348
|
|
349
349
|
# - source: ريگ مَيوَنْد
|
350
350
|
# expected: Maywand, Rēg
|
351
351
|
|
352
352
|
- source: گَرْدېز
|
353
|
-
expected:
|
353
|
+
expected: Gardēz
|
354
354
|
|
355
355
|
- source: مَیدان شَهْر
|
356
|
-
expected:
|
356
|
+
expected: Maīdān Shahr
|
357
357
|
|
358
358
|
- source: ډَنْډِ سُفْلىٰ
|
359
|
-
expected:
|
359
|
+
expected: Ḏanḏ-e Suflá
|
360
360
|
|
361
361
|
# - source: څَوْکۍ
|
362
362
|
# expected: Tsowkêy
|
363
363
|
|
364
364
|
# - source: هَوائِي ډَگَر
|
365
|
-
# expected:
|
365
|
+
# expected: Hawā’ī ḏagar
|
366
366
|
|
367
367
|
# - source: مَزارِ شَريف
|
368
|
-
# expected:
|
368
|
+
# expected: Mazār-e sharīf
|
369
369
|
|
370
370
|
# - source: دايکندی
|
371
|
-
# expected:
|
371
|
+
# expected: Dāykundī
|
372
372
|
|
373
373
|
# - source: زيارت
|
374
374
|
# expected: Zīārat
|
@@ -380,9 +380,43 @@ tests:
|
|
380
380
|
# expected: Myā
|
381
381
|
|
382
382
|
- source: جَبَل السَرَاج
|
383
|
-
expected:
|
383
|
+
expected: Jabal as Sarāj
|
384
384
|
|
385
385
|
map:
|
386
|
+
postrules:
|
387
|
+
- pattern: (?<=\b)(?<!\b[‘|’|'|-])[\u0061-\uFFFF]
|
388
|
+
result: "upcase"
|
389
|
+
# don't capitalize definite article in the middle of a sentence
|
390
|
+
- pattern : ' At T' # الت
|
391
|
+
result: ' at T'
|
392
|
+
- pattern : ' As̄ S̄' # الث
|
393
|
+
result: ' as̄ S̄'
|
394
|
+
- pattern : ' Ad D' # الد
|
395
|
+
result: ' ad D'
|
396
|
+
- pattern : ' Az̄ Z̄' # الذ
|
397
|
+
result: ' az̄ Z̄'
|
398
|
+
- pattern : ' Ar R' # الر
|
399
|
+
result: ' ar R'
|
400
|
+
- pattern : ' Az Z' # الز
|
401
|
+
result: ' az Z'
|
402
|
+
- pattern : ' As S' # الس
|
403
|
+
result: ' as S'
|
404
|
+
- pattern : ' Ash Sh' # الش
|
405
|
+
result: ' ash Sh'
|
406
|
+
- pattern : ' Aş Ş' # الص
|
407
|
+
result: ' aş Ş'
|
408
|
+
- pattern : ' Aẕ Ẕ' # الض
|
409
|
+
result: ' aẕ Ẕ'
|
410
|
+
- pattern : ' Aţ Ţ' # الط
|
411
|
+
result: ' aţ Ţ'
|
412
|
+
- pattern : ' Az̧ Z̧' # الظ
|
413
|
+
result: ' az̧ Z̧'
|
414
|
+
- pattern : ' Al L' # الل
|
415
|
+
result: ' al L'
|
416
|
+
- pattern : ' An N' # الن
|
417
|
+
result: ' an N'
|
418
|
+
- pattern: " Al " # ال
|
419
|
+
result: " al "
|
386
420
|
characters:
|
387
421
|
|
388
422
|
# word-medial or word-final form where so appearing in a word.
|