interscript 0.1.7 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.adoc +1 -3
- data/aliases.json +1 -0
- data/lib/interscript.rb +8 -3
- data/lib/interscript/fs.rb +27 -0
- data/lib/interscript/mapping.rb +3 -1
- data/lib/interscript/opal.rb +142 -3
- data/lib/interscript/opal/entrypoint.rb +8 -0
- data/lib/interscript/opal/exports.rb +11 -0
- data/lib/interscript/opal/maps.js.erb +2 -4
- data/lib/interscript/version.rb +1 -1
- data/maps/alalc-ara-Arab-Latn-1997.yaml +5 -5
- data/maps/alalc-asm-Deva-Latn-1997.yaml +104 -10
- data/maps/alalc-asm-Deva-Latn-2012.yaml +18 -3
- data/maps/alalc-aze-Arab-Latn-1997.yaml +376 -0
- data/maps/alalc-ben-Beng-Latn-1997.yaml +291 -0
- data/maps/alalc-div-Thaa-Latn-1997.yaml +211 -0
- data/maps/alalc-hin-Deva-Latn-1997.yaml +102 -10
- data/maps/alalc-hin-Deva-Latn-2011.yaml +19 -1
- data/maps/alalc-kan-Kana-Latn-1997.yaml +274 -0
- data/maps/alalc-kan-Kana-Latn-2011.yaml +63 -0
- data/maps/alalc-ori-Orya-Latn-1997.yaml +284 -0
- data/maps/alalc-ori-Orya-Latn-2011.yaml +67 -0
- data/maps/alalc-pra-Deva-Latn-2012.yaml +2 -2
- data/maps/alalc-san-Deva-Latn-2012.yaml +78 -9
- data/maps/alalc-tel-Telu-Latn-1997.yaml +284 -0
- data/maps/alalc-tel-Telu-Latn-2011.yaml +64 -0
- data/maps/az-aze-Cyrl-Latn-1939.yaml +105 -0
- data/maps/az-aze-Cyrl-Latn-1958.yaml +45 -0
- data/maps/bgnpcgn-ara-Arab-Latn-1956.yaml +3 -1
- data/maps/bgnpcgn-aze-Cyrl-Latn-1993.yaml +111 -104
- data/maps/bgnpcgn-bal-Arab-Latn-2008.yaml +329 -0
- data/maps/bgnpcgn-bul-Cyrl-Latn-1952.yaml +1 -1
- data/maps/bgnpcgn-div-Thaa-Latn-1988.yaml +75 -0
- data/maps/bgnpcgn-far-Latn-Latn-1964.yaml +28 -0
- data/maps/bgnpcgn-isl-Latn-Latn-1964.yaml +37 -0
- data/maps/bgnpcgn-kaz-Cyrl-Latn-1979.yaml +247 -0
- data/maps/bgnpcgn-kir-Cyrl-Latn-1979.yaml +218 -0
- data/maps/bgnpcgn-kur-Arab-Latn-2007.yaml +249 -0
- data/maps/bgnpcgn-per-Arab-Latn-1958.yaml +2 -0
- data/maps/bgnpcgn-prs-Arab-Latn-2007.yaml +87 -53
- data/maps/bgnpcgn-pus-Arab-Latn-1968.yaml +377 -0
- data/maps/bgnpcgn-srp-Cyrl-Latn-1962.yaml +73 -0
- data/maps/bgnpcgn-urd-Arab-Latn-2007.yaml +459 -0
- data/maps/{bis-knd-Knda-Latn-13194-1991.yaml → bis-kan-Kana-Latn-13194-1991.yaml} +2 -2
- data/maps/bis-ori-Orya-Latn-13194-1991.yaml +17 -2
- data/maps/iso-ara-Arab-Latn-233-1984.yaml +1 -1
- data/maps/{iso-kan-Knda-Latn-15919-2001.yaml → iso-kan-Kana-Latn-15919-2001.yaml} +1 -1
- data/maps/{mns-mon-Cyrl-Latn-5217-2012.yaml → masm-mon-Cyrl-Latn-5217-2012.yaml} +2 -2
- data/maps/{mns-mon-Latn-Cyrl-5217-2012.yaml → masm-mon-Latn-Cyrl-5217-2012.yaml} +1 -1
- data/maps/mv-div-Thaa-Latn-1987.yaml +200 -0
- data/maps/odni-ara-Arab-Latn-2004.yaml +137 -0
- data/maps/odni-ara-Arab-Latn-2015.yaml +20 -130
- data/maps/odni-bul-Cyrl-Latn-2005.yaml +90 -0
- data/maps/odni-fas-Arab-Latn-2004.yaml +276 -0
- data/maps/odni-hin-Deva-Latn-2004.yaml +182 -0
- data/maps/odni-mkd-Cyrl-Latn-2005.yaml +21 -0
- data/maps/odni-prs-Arab-Latn-2004.yaml +123 -0
- data/maps/{odni-per-Arab-Latn-2015.yaml → odni-prs-Arab-Latn-2015.yaml} +0 -0
- data/maps/odni-srp-Cyrl-Latn-2005.yaml +36 -0
- data/maps/odni-tuk-Cyrl-Latn-2015.yaml +170 -0
- data/maps/odni-ukr-Cyrl-Latn-2015.yaml +4 -0
- data/maps/un-ara-Arab-Latn-2017.yaml +1 -1
- data/maps/un-asm-Beng-Latn-1972.yaml +223 -0
- data/maps/un-guj-Gujr-Latn-1972.yaml +229 -0
- data/maps/un-hin-Deva-Latn-2016.yaml +104 -10
- data/maps/un-kan-Kana-Latn-2016.yaml +254 -0
- data/maps/un-mal-Mlym-Latn-1972.yaml +251 -0
- data/maps/un-mar-Deva-Latn-2016.yaml +24 -13
- data/maps/un-nep-Deva-Latn-1972.yaml +40 -121
- data/maps/un-ori-Orya-Latn-1972.yaml +247 -0
- data/maps/un-pan-Guru-Latn-1972.yaml +402 -0
- data/maps/un-prs-Arab-Latn-1967.yaml +236 -0
- data/maps/un-tam-Taml-Latn-1972.yaml +194 -0
- data/maps/un-tel-Telu-Latn-1972.yaml +270 -0
- data/maps/un-urd-Arab-Latn-1972.yaml +405 -0
- data/maps/var-amh-Ethi-Latn-eae-2003.yaml +466 -0
- data/maps/var-gez-Ethi-Latn-eae-2003.yaml +76 -0
- data/spec/interscript/filenames_spec.rb +6 -369
- data/spec/interscript_spec.rb +10 -2
- metadata +50 -7
- data/lib/interscript/opal/map_translate.rb +0 -7
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
---
|
|
2
|
+
authority_id: bgnpcgn
|
|
3
|
+
id: 2007
|
|
4
|
+
language: kur
|
|
5
|
+
source_script: Arab
|
|
6
|
+
destination_script: Latn
|
|
7
|
+
name: ROMANIZATION OF KURDISH -- BGN/PCGN 2007
|
|
8
|
+
url: https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/693727/ROMANIZATION_OF_KURDISH.pdf
|
|
9
|
+
creation_date: 2007
|
|
10
|
+
confirmation_date: 2017-12
|
|
11
|
+
description: |
|
|
12
|
+
The tabulation below is applicable to the Kurdish language as a
|
|
13
|
+
whole. It is based for the most part on the Hawar Roman alphabet used
|
|
14
|
+
in the Library of Congress Standard Kurdish Orthography Table, but it
|
|
15
|
+
also incorporates certain non-Hawar elements found in A Kurdish-English
|
|
16
|
+
Dictionary (Taufiq Wahby & C J Edmonds, OUP, 1966). The tabulation
|
|
17
|
+
covers both major varieties of the Kurdish language: Kurmanji and
|
|
18
|
+
Sorani. Kurmanji is spoken principally in Turkey and in Iraq north of
|
|
19
|
+
the Great Zab River (Dahūk/Dihok Governorate). It is generally written
|
|
20
|
+
in Roman script, and usually employs the Roman orthography. Sorani is
|
|
21
|
+
spoken principally in Iraq south of the Great Zab river (Arbīl/Hewlêr
|
|
22
|
+
and As Sulaymānīyah/Slêmanî governorates). It is generally written in
|
|
23
|
+
Perso-Arabic script, and usually employs the Perso-Arabic script
|
|
24
|
+
orthography.
|
|
25
|
+
|
|
26
|
+
Kurdish forms of geographical names in Turkey will usually be found
|
|
27
|
+
in Roman script, and so no romanization process will be required. The
|
|
28
|
+
digraph options for consonant letters '\u0686', '\u0634', and '\u063A'
|
|
29
|
+
will not be encountered for such names. In Iraq, Syria, and Iran,
|
|
30
|
+
Kurdish will usually be encountered in Perso-Arabic script, in which
|
|
31
|
+
case it should be romanized into the corresponding Roman script form.
|
|
32
|
+
Kurdish geographical names for places and features outside Turkey,
|
|
33
|
+
found in Roman script form, should, where necessary and if possible, be
|
|
34
|
+
tailored to fit the orthography of the Romanization shown below and
|
|
35
|
+
should employ the digraph options for consonant letters '\u0686',
|
|
36
|
+
'\u0634', and '\u063A'.
|
|
37
|
+
|
|
38
|
+
notes:
|
|
39
|
+
|
|
40
|
+
- In pure Kurdish words hamza is borne by yā’ ( ئ ) and occurs only
|
|
41
|
+
before initial vowels; it is not romanized. Medial and final hamza in
|
|
42
|
+
Arabic borrowings are romanized by ’ (apostrophe – Unicode encoding
|
|
43
|
+
2019).
|
|
44
|
+
|
|
45
|
+
- The letters ث ذ ص ض ط ظ do not occur in pure Kurdish words. In Arabic
|
|
46
|
+
borrowings some writers retain these letters, others substitute س ز س ز
|
|
47
|
+
ت ز respectively. Only the letters ط ض and ص are catered for in the
|
|
48
|
+
Library of Congress tabulation, as reflected in lines 16-18 of the
|
|
49
|
+
above Consonant table. Words of obvious Arabic origin occurring in a
|
|
50
|
+
Kurdish toponymic environment will be treated as Kurdish rather than
|
|
51
|
+
Arabic, as will words of other non-Kurdish origins.
|
|
52
|
+
|
|
53
|
+
- The digraph options appearing in rows 6, 15 and 20 of the consonants
|
|
54
|
+
table should be used for Kurdish geographical names in Iraq, Iran, and
|
|
55
|
+
Syria. The single character options should be used for Kurdish
|
|
56
|
+
geographical names in Turkey.
|
|
57
|
+
|
|
58
|
+
- ڨ is used to represent v in foreign words. Some southern Kurdish
|
|
59
|
+
writers use it to represent the v in borrowings from northern Kurdish
|
|
60
|
+
dialects. و is pronounced as a v in the north and as a w elsewhere.
|
|
61
|
+
|
|
62
|
+
- Hā’ can be used as a vowel or a consonant. The initial (ه) and medial
|
|
63
|
+
(ـهـ) forms are used for the consonant h, Consonant table, row 31, while the
|
|
64
|
+
final (ـه) and independent (ه) forms are used to represent the vowel e,
|
|
65
|
+
Vowel table, row 1. Therefore, when used as a consonant, the final and
|
|
66
|
+
independent forms of hā’ will be seen as ‘ه’ instead of ‘ـه’ and ‘ه’,
|
|
67
|
+
respectively. For example, مهه meh, (“month”). When used as ‘e’, the
|
|
68
|
+
hā’ behaves like the letters alif (ا) , wāw, dāl (د) , and rā (ر) , in
|
|
69
|
+
that it never joins to the following letter (i.e., it has no medial
|
|
70
|
+
form). Consequently, the following letter will display the initial
|
|
71
|
+
form, e.g. هەولێر Hewlêr (unless there is only one following letter, in
|
|
72
|
+
which case it will be written in the independent form, e.g. ماوەت
|
|
73
|
+
Mawet). As with other vowels (see special rules 2 and 3), initial e is
|
|
74
|
+
preceded by the kursî hamza, yielding initial ئه , e.g. ئهني enî
|
|
75
|
+
“forehead”.
|
|
76
|
+
|
|
77
|
+
- In pure Kurdish words, the vowel ى is always long î, e.g. كانى ماسێ
|
|
78
|
+
Kanî Masê. When it represents îzafe, it is also romanized î and joined
|
|
79
|
+
by means of a hyphen to its preceding word e.g. پارێزگاى دهۆك Parêzga-î
|
|
80
|
+
Dihok.
|
|
81
|
+
|
|
82
|
+
- |
|
|
83
|
+
An inventory of letter-diacritic combinations, used in addition to
|
|
84
|
+
the unmodified letters of the basic Roman script in the Romanization of
|
|
85
|
+
Kurdish, with their Unicode encoding, is:
|
|
86
|
+
|
|
87
|
+
'‘': '2018' , '’': '2019'
|
|
88
|
+
'Ç': '00C7' , 'ç': '00E7'
|
|
89
|
+
'Ḍ': '1E0C' , 'ḍ': '1E0D'
|
|
90
|
+
'Ê': '00CA' , 'ê': '00EA'
|
|
91
|
+
|
|
92
|
+
# There is no single Unicode encoding for these letter-diacritic combinations.
|
|
93
|
+
'Ḧ': '0048+0308' , 'ḧ': '0068+0308'
|
|
94
|
+
'Î': '00CE' , 'î': '00EE'
|
|
95
|
+
'Ł': '0141' , 'ł': '0142'
|
|
96
|
+
'Ö': '00D6' , 'ö': '00F6'
|
|
97
|
+
'Ṟ': '1E5E' , 'ṟ': '1E5F'
|
|
98
|
+
'Ş': '015E' , 'ş': '015F'
|
|
99
|
+
'Ṣ': '1E62' , 'ṣ': '1E63'
|
|
100
|
+
'Ṭ': '1E6C' , 'ṭ': '1E6D'
|
|
101
|
+
'Û': '00DB' , 'û': '00FB'
|
|
102
|
+
'Ü': '00DC' , 'ü': '00FC'
|
|
103
|
+
'Ẍ': '1E8C' , 'ẍ': '1E8D'
|
|
104
|
+
|
|
105
|
+
- The Romanization column shows only lowercase forms but, when
|
|
106
|
+
romanizing, uppercase and lowercase Roman letters as appropriate should
|
|
107
|
+
be used.
|
|
108
|
+
|
|
109
|
+
# Special Rules
|
|
110
|
+
- The conjunction و (and) should be rendered u if the
|
|
111
|
+
preceding word ends in a consonant, and w if the preceding
|
|
112
|
+
word ends in a vowel. It should be separated by spaces from
|
|
113
|
+
the preceding and following words.
|
|
114
|
+
|
|
115
|
+
- In the Perso-Arabic orthography for Kurdish, all vowels are
|
|
116
|
+
written, with the exception of the short i, which is
|
|
117
|
+
expressed with a kasrah under the preceding consonant (ِ).
|
|
118
|
+
In Perso-Arabic script, the kasrah will rarely be written (
|
|
119
|
+
e.g., كرن kirin “to do”). Like all Kurdish vowels, the
|
|
120
|
+
short i will be preceded by a kursî hamza (ئ) if it appears
|
|
121
|
+
at the beginning of a word (see 3 below; see row 4 of vowel
|
|
122
|
+
table).
|
|
123
|
+
|
|
124
|
+
- In the Perso-Arabic orthography for Kurdish, when a vowel
|
|
125
|
+
comes at the beginning of a word, or when a vowel directly
|
|
126
|
+
follows another vowel, a kursî hamza (ئ) precedes it (e.g.,
|
|
127
|
+
ئاگر agir “fire”).
|
|
128
|
+
|
|
129
|
+
- A Kurdish word will never start with alif (ا). A Kurdish
|
|
130
|
+
word may begin with a yā’ (ي) or wāw (و), but only when
|
|
131
|
+
they are used as a consonant, when they will be romanized
|
|
132
|
+
as y and w, respectively.
|
|
133
|
+
|
|
134
|
+
- When preceded by a consonant, yā’ (ي) and wāw (و) should be
|
|
135
|
+
romanized î and u, respectively. When preceded by a vowel (
|
|
136
|
+
including short i, which is not written), yā’ (ي) and wāw (
|
|
137
|
+
و) should be romanized y and w, respectively.
|
|
138
|
+
|
|
139
|
+
- The Arabic sign shaddah ( ّ ) denoting a doubled consonant
|
|
140
|
+
is not used in Kurdish; doubled consonants, which are rare,
|
|
141
|
+
are written twice e.g. موحەممەد Muḧemmed; موسەننا Musanna.
|
|
142
|
+
Shaddah might be used in Arabic borrowings but, as in
|
|
143
|
+
unpointed Arabic, would generally be omitted.
|
|
144
|
+
|
|
145
|
+
- Particles such as له le (= at, in, on) and به be (= to,
|
|
146
|
+
for, by, with) should be written separately from their
|
|
147
|
+
following word, e.g. له كوردستانێ Le Kurdistanê “in
|
|
148
|
+
Kurdistan”
|
|
149
|
+
|
|
150
|
+
- Occasionally the character sequences چه, سه and گه occur.
|
|
151
|
+
They may be romanized c·h, s·h, and g·h in order to
|
|
152
|
+
differentiate those romanizations from the digraphs ch, sh,
|
|
153
|
+
and gh.
|
|
154
|
+
|
|
155
|
+
tests:
|
|
156
|
+
- source: كاني ماسێ
|
|
157
|
+
expected: Kanî Masê
|
|
158
|
+
|
|
159
|
+
- source: كِرِن
|
|
160
|
+
expected: Kirin
|
|
161
|
+
|
|
162
|
+
- source: ئاگِر
|
|
163
|
+
expected: Agir
|
|
164
|
+
|
|
165
|
+
- source: موحەممەد
|
|
166
|
+
expected: Muḧemmed
|
|
167
|
+
|
|
168
|
+
# - source: موسەننا # issue 604
|
|
169
|
+
# expected: Musanna
|
|
170
|
+
|
|
171
|
+
- source: لەكوردِستانێ
|
|
172
|
+
expected: Le Kurdistanê
|
|
173
|
+
|
|
174
|
+
map:
|
|
175
|
+
postrules:
|
|
176
|
+
- pattern: (?<=\b)(?<!\b[‘|’|'])[\u0061-\uFFFF]
|
|
177
|
+
result: "upcase"
|
|
178
|
+
|
|
179
|
+
characters:
|
|
180
|
+
|
|
181
|
+
'\u0650' : 'i' # ِ kasra special rule 2
|
|
182
|
+
'\u0644\u06d5' : 'le ' # special rule 7
|
|
183
|
+
'\u0628\u06d5' : 'be ' # special rule 7
|
|
184
|
+
# Note 1
|
|
185
|
+
'\u0621' : '’' # ء
|
|
186
|
+
'\u0624' : '’' # ؤ
|
|
187
|
+
'\u0626' : '’' # ئ
|
|
188
|
+
|
|
189
|
+
"(?<=[\u0621|\u0628|\u067E|\u062A|\u062C|\u0686|\u062D|\u062E|\u062F|\u0631|\u0695|\u0632|\u0698|\u0633|\u0634|\u0635|\u0636|\u0637|\u0639|\u063A|\u0641|\u06A8|\u0642|\u06A9|\u0643|\u06AF|\u0644|\u06B5|\u0645|\u0646|\u0648|\u0647|\u064A])\u0648" : 'u' # special note 4/5
|
|
190
|
+
"(?<=[\u0621|\u0628|\u067E|\u062A|\u062C|\u0686|\u062D|\u062E|\u062F|\u0631|\u0695|\u0632|\u0698|\u0633|\u0634|\u0635|\u0636|\u0637|\u0639|\u063A|\u0641|\u06A8|\u0642|\u06A9|\u0643|\u06AF|\u0644|\u06B5|\u0645|\u0646|\u0648|\u0647|\u064A])\u064A" : 'î' # special note 4/5
|
|
191
|
+
'\u0621': '’' # ء (see note 1 and 7)
|
|
192
|
+
'\u0628': 'b' # ب
|
|
193
|
+
'\u067E': 'p' # پ
|
|
194
|
+
'\u062A': 't' # ت (see note 2)
|
|
195
|
+
'\u062C': 'c' # ج
|
|
196
|
+
'\u0686': # چ (see notes 3 and 7)
|
|
197
|
+
- 'ch'
|
|
198
|
+
- 'ç'
|
|
199
|
+
'\u062D': 'ḧ' # ح
|
|
200
|
+
'\u062E': 'x' # خ
|
|
201
|
+
'\u062F': 'd' # د
|
|
202
|
+
'\u0631': 'r' # ر
|
|
203
|
+
'\u0695': 'ṟ' # ڕ (Formerly written ڒ ڔ or رر according to typeface available; may vary on older sources. See note 7.)
|
|
204
|
+
'\u0632': 'z' # ز (see note 2)
|
|
205
|
+
'\u0698': 'j' # ژ
|
|
206
|
+
'\u0633': 's' # س (see note 2)
|
|
207
|
+
'\u0634': # ش (see notes 3 and 7)
|
|
208
|
+
- 'sh'
|
|
209
|
+
- 'ş'
|
|
210
|
+
'\u0635': 'ṣ' # ص (see notes 2 and 7)
|
|
211
|
+
'\u0636': 'ḍ' # ض (see notes 2 and 7)
|
|
212
|
+
'\u0637': 'ṭ' # ط (see notes 2 and 7)
|
|
213
|
+
'\u0639': '‘' # ع (see note 7)
|
|
214
|
+
'\u063A': # غ (see notes 3 and 7)
|
|
215
|
+
- 'gh'
|
|
216
|
+
- 'ẍ'
|
|
217
|
+
'\u0641': 'f' # ف
|
|
218
|
+
'\u06A8': 'v' # ڨ (see note 4)
|
|
219
|
+
'\u0642': 'q' # ق
|
|
220
|
+
'\u06A9': 'k' # ک
|
|
221
|
+
'\u0643': 'k' # ك
|
|
222
|
+
'\u06AF': 'g' # گ
|
|
223
|
+
'\u0644': 'l' # ل
|
|
224
|
+
'\u06B5': 'ł' # ڵ (Formerly written ڶ according to type available; may vary on older sources. See note 7)
|
|
225
|
+
'\u0645': 'm' # م
|
|
226
|
+
'\u0646': 'n' # ن
|
|
227
|
+
'\u0648': 'w' # و (see note 4)
|
|
228
|
+
'\u0647': 'h' # ه (see note 5)
|
|
229
|
+
'\u064A': 'y' # ي
|
|
230
|
+
|
|
231
|
+
# VOWELS
|
|
232
|
+
'\u0647\b': 'e' # See notes 1 and 5
|
|
233
|
+
'\u06D5': 'e' # See notes 1 and 5
|
|
234
|
+
'\u0626\u06D5': 'e' # See notes 1 and 5
|
|
235
|
+
'\u0627': 'a' # See note 1
|
|
236
|
+
'\u0626\u0627': 'a' # See note 1
|
|
237
|
+
'\u064A': 'î' # See notes 1, 6 and 7
|
|
238
|
+
'\u0626\u064A': 'î' # See notes 1, 6 and 7
|
|
239
|
+
'\u0626': 'i'
|
|
240
|
+
'\u06CE': 'ê' # See note 7
|
|
241
|
+
'\u0626\u06CE': 'ê' # See note 7
|
|
242
|
+
'\u0648': 'u'
|
|
243
|
+
'\u0626\u0648': 'u'
|
|
244
|
+
'\u0648\u0648': 'û' # See note 7
|
|
245
|
+
'\u0626\u0648\u0648': 'û' # See note 7
|
|
246
|
+
'\u06C6': 'o'
|
|
247
|
+
'\u0626\u06C6': 'o'
|
|
248
|
+
'\u0648': 'ö' # Rare; previously written وي . See note 7
|
|
249
|
+
'\u06CA': 'ü' # Only appearing in some dialects and only in old sources. Often equated to /û/ (row 7 above). Sometimes written يو See note 7.
|
|
@@ -203,172 +203,172 @@ notes:
|
|
|
203
203
|
|
|
204
204
|
tests:
|
|
205
205
|
- source: بَغْلان
|
|
206
|
-
expected:
|
|
206
|
+
expected: Baghlān
|
|
207
207
|
|
|
208
208
|
- source: پُوټَكَى
|
|
209
|
-
expected:
|
|
209
|
+
expected: Pōṯakay
|
|
210
210
|
|
|
211
211
|
- source: شِيرِين تَگَاب
|
|
212
|
-
expected:
|
|
212
|
+
expected: Shīrīn Tagāb
|
|
213
213
|
|
|
214
214
|
- source: کُوْټ
|
|
215
|
-
expected:
|
|
215
|
+
expected: Kōṯ
|
|
216
216
|
|
|
217
217
|
- source: ثَابِر
|
|
218
|
-
expected:
|
|
218
|
+
expected: S̄ābir
|
|
219
219
|
|
|
220
220
|
- source: جَلال آبَاد
|
|
221
|
-
expected:
|
|
221
|
+
expected: Jalālābād
|
|
222
222
|
|
|
223
223
|
- source: چَارِيكَار
|
|
224
|
-
expected:
|
|
224
|
+
expected: Chārīkār
|
|
225
225
|
|
|
226
226
|
- source: ځَدْرَاڼ
|
|
227
|
-
expected:
|
|
227
|
+
expected: Dzadrāṉ
|
|
228
228
|
|
|
229
229
|
- source: څَوکۍ
|
|
230
|
-
expected:
|
|
230
|
+
expected: Tsowkêy
|
|
231
231
|
|
|
232
232
|
- source: حَضْرَتِ إِمَام
|
|
233
|
-
expected:
|
|
233
|
+
expected: Ḩaẕrat-e Imām
|
|
234
234
|
|
|
235
235
|
- source: خُوْسْت
|
|
236
|
-
expected:
|
|
236
|
+
expected: Khōst
|
|
237
237
|
|
|
238
238
|
- source: سْپِين بُوْلْدَک
|
|
239
|
-
expected:
|
|
239
|
+
expected: Spīn Bōldak
|
|
240
240
|
|
|
241
241
|
- source: ډَنْډ وَ پَتَان
|
|
242
|
-
expected:
|
|
242
|
+
expected: Ḏanḏ Wa Patān
|
|
243
243
|
|
|
244
244
|
# - source: گُذَرْگَاهٔ نور
|
|
245
|
-
# expected:
|
|
245
|
+
# expected: Guz̄argāh-e nūr
|
|
246
246
|
|
|
247
247
|
- source: كَنْدَهَار
|
|
248
|
-
expected:
|
|
248
|
+
expected: Kandahār
|
|
249
249
|
|
|
250
250
|
- source: أَنْدَړ
|
|
251
|
-
expected:
|
|
251
|
+
expected: Andaṟ
|
|
252
252
|
|
|
253
253
|
- source: كُنْدُز
|
|
254
|
-
expected:
|
|
254
|
+
expected: Kunduz
|
|
255
255
|
|
|
256
256
|
- source: مِير أَسْلَم ژْرَنْدَه
|
|
257
|
-
expected:
|
|
257
|
+
expected: Mīr Aslam Zhrandah
|
|
258
258
|
|
|
259
259
|
- source: ږِيرَه
|
|
260
|
-
expected:
|
|
260
|
+
expected: Z͟hīrah
|
|
261
261
|
|
|
262
262
|
- source: سَمَنْگَان
|
|
263
|
-
expected:
|
|
263
|
+
expected: Samangān
|
|
264
264
|
|
|
265
265
|
# - source: مَزَارِ شَريف
|
|
266
|
-
# expected:
|
|
266
|
+
# expected: Mazār-e sharīf
|
|
267
267
|
|
|
268
268
|
- source: كښٙتَه كَلا
|
|
269
|
-
expected:
|
|
269
|
+
expected: Ks͟hêtah Kalā
|
|
270
270
|
|
|
271
271
|
- source: قَيْصَار
|
|
272
|
-
expected:
|
|
272
|
+
expected: Qayşār
|
|
273
273
|
|
|
274
274
|
- source: فَيض آبَاد
|
|
275
|
-
expected:
|
|
275
|
+
expected: Faīẕābād
|
|
276
276
|
|
|
277
277
|
- source: حَضْرَتِ سُلْطَان
|
|
278
|
-
expected:
|
|
278
|
+
expected: Ḩaẕrat-e Sulţān
|
|
279
279
|
|
|
280
280
|
- source: ظَاهِر كَلا
|
|
281
|
-
expected:
|
|
281
|
+
expected: Z̧āhir Kalā
|
|
282
282
|
|
|
283
283
|
- source: پُلِ عَلَم
|
|
284
|
-
expected:
|
|
284
|
+
expected: Pul-e ‘Alam
|
|
285
285
|
|
|
286
286
|
- source: غَزْنِي
|
|
287
|
-
expected:
|
|
287
|
+
expected: Ghaznī
|
|
288
288
|
|
|
289
289
|
- source: مَزَارِ شَرِيف
|
|
290
|
-
expected:
|
|
290
|
+
expected: Mazār-e Sharīf
|
|
291
291
|
|
|
292
292
|
- source: قَيْصَار
|
|
293
|
-
expected:
|
|
293
|
+
expected: Qayşār
|
|
294
294
|
|
|
295
295
|
- source: كَنْدَهَار
|
|
296
|
-
expected:
|
|
296
|
+
expected: Kandahār
|
|
297
297
|
|
|
298
298
|
- source: گَرْدېز
|
|
299
|
-
expected:
|
|
299
|
+
expected: Gardēz
|
|
300
300
|
|
|
301
301
|
- source: کَابُل
|
|
302
|
-
expected:
|
|
302
|
+
expected: Kābul
|
|
303
303
|
|
|
304
304
|
- source: مَيمَنَه
|
|
305
|
-
expected:
|
|
305
|
+
expected: Maīmanah
|
|
306
306
|
|
|
307
307
|
- source: خَان آبَاد
|
|
308
|
-
expected:
|
|
308
|
+
expected: Khānābād
|
|
309
309
|
|
|
310
310
|
- source: مَاڼۍ
|
|
311
|
-
expected:
|
|
311
|
+
expected: Māṉêy
|
|
312
312
|
|
|
313
313
|
- source: وَاخَان
|
|
314
|
-
expected:
|
|
314
|
+
expected: Wākhān
|
|
315
315
|
|
|
316
316
|
# - source: هِرَات
|
|
317
|
-
# expected:
|
|
317
|
+
# expected: Herāt
|
|
318
318
|
|
|
319
319
|
- source: يَنْگِي قَلعَه
|
|
320
|
-
expected:
|
|
320
|
+
expected: Yangī Qal‘ah
|
|
321
321
|
|
|
322
322
|
- source: جَلال آبَاد
|
|
323
|
-
expected:
|
|
323
|
+
expected: Jalālābād
|
|
324
324
|
|
|
325
325
|
# - source: هِرات پُلِ حِصَار
|
|
326
326
|
# expected: Herāt Pul-e Ḩişār
|
|
327
327
|
|
|
328
328
|
- source: مُرْغَاب کَابُل
|
|
329
|
-
expected:
|
|
329
|
+
expected: Murghāb Kābul
|
|
330
330
|
|
|
331
331
|
- source: گٙردُون
|
|
332
|
-
expected:
|
|
332
|
+
expected: Gêrdōn
|
|
333
333
|
|
|
334
334
|
- source: آب بَنْد
|
|
335
|
-
expected:
|
|
335
|
+
expected: Āb Band
|
|
336
336
|
|
|
337
337
|
- source: سْپِين بُوْلْدَک
|
|
338
|
-
expected:
|
|
338
|
+
expected: Spīn Bōldak
|
|
339
339
|
|
|
340
340
|
# - source: بَالا بُلُوک
|
|
341
341
|
# expected: Bālā Bulūk
|
|
342
342
|
|
|
343
343
|
- source: جَوزجَان
|
|
344
|
-
expected:
|
|
344
|
+
expected: Jowzjān
|
|
345
345
|
|
|
346
346
|
# - source: غَزْنِى سْپِين
|
|
347
|
-
# expected:
|
|
347
|
+
# expected: Ghaznī spīn
|
|
348
348
|
|
|
349
349
|
# - source: ريگ مَيوَنْد
|
|
350
350
|
# expected: Maywand, Rēg
|
|
351
351
|
|
|
352
352
|
- source: گَرْدېز
|
|
353
|
-
expected:
|
|
353
|
+
expected: Gardēz
|
|
354
354
|
|
|
355
355
|
- source: مَیدان شَهْر
|
|
356
|
-
expected:
|
|
356
|
+
expected: Maīdān Shahr
|
|
357
357
|
|
|
358
358
|
- source: ډَنْډِ سُفْلىٰ
|
|
359
|
-
expected:
|
|
359
|
+
expected: Ḏanḏ-e Suflá
|
|
360
360
|
|
|
361
361
|
# - source: څَوْکۍ
|
|
362
362
|
# expected: Tsowkêy
|
|
363
363
|
|
|
364
364
|
# - source: هَوائِي ډَگَر
|
|
365
|
-
# expected:
|
|
365
|
+
# expected: Hawā’ī ḏagar
|
|
366
366
|
|
|
367
367
|
# - source: مَزارِ شَريف
|
|
368
|
-
# expected:
|
|
368
|
+
# expected: Mazār-e sharīf
|
|
369
369
|
|
|
370
370
|
# - source: دايکندی
|
|
371
|
-
# expected:
|
|
371
|
+
# expected: Dāykundī
|
|
372
372
|
|
|
373
373
|
# - source: زيارت
|
|
374
374
|
# expected: Zīārat
|
|
@@ -380,9 +380,43 @@ tests:
|
|
|
380
380
|
# expected: Myā
|
|
381
381
|
|
|
382
382
|
- source: جَبَل السَرَاج
|
|
383
|
-
expected:
|
|
383
|
+
expected: Jabal as Sarāj
|
|
384
384
|
|
|
385
385
|
map:
|
|
386
|
+
postrules:
|
|
387
|
+
- pattern: (?<=\b)(?<!\b[‘|’|'|-])[\u0061-\uFFFF]
|
|
388
|
+
result: "upcase"
|
|
389
|
+
# don't capitalize the definite article in the middle of a sentence
|
|
390
|
+
- pattern : ' At T' # الت
|
|
391
|
+
result: ' at T'
|
|
392
|
+
- pattern : ' As̄ S̄' # الث
|
|
393
|
+
result: ' as̄ S̄'
|
|
394
|
+
- pattern : ' Ad D' # الد
|
|
395
|
+
result: ' ad D'
|
|
396
|
+
- pattern : ' Az̄ Z̄' # الذ
|
|
397
|
+
result: ' az̄ Z̄'
|
|
398
|
+
- pattern : ' Ar R' # الر
|
|
399
|
+
result: ' ar R'
|
|
400
|
+
- pattern : ' Az Z' # الز
|
|
401
|
+
result: ' az Z'
|
|
402
|
+
- pattern : ' As S' # الس
|
|
403
|
+
result: ' as S'
|
|
404
|
+
- pattern : ' Ash Sh' # الش
|
|
405
|
+
result: ' ash Sh'
|
|
406
|
+
- pattern : ' Aş Ş' # الص
|
|
407
|
+
result: ' aş Ş'
|
|
408
|
+
- pattern : ' Aẕ Ẕ' # الض
|
|
409
|
+
result: ' aẕ Ẕ'
|
|
410
|
+
- pattern : ' Aţ Ţ' # الط
|
|
411
|
+
result: ' aţ Ţ'
|
|
412
|
+
- pattern : ' Az̧ Z̧' # الظ
|
|
413
|
+
result: ' az̧ Z̧'
|
|
414
|
+
- pattern : ' Al L' # الل
|
|
415
|
+
result: ' al L'
|
|
416
|
+
- pattern : ' An N' # الن
|
|
417
|
+
result: ' an N'
|
|
418
|
+
- pattern: " Al " # ال
|
|
419
|
+
result: " al "
|
|
386
420
|
characters:
|
|
387
421
|
|
|
388
422
|
# word-medial or word-final form where so appearing in a word.
|