interscript 0.1.7 → 0.1.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.adoc +1 -3
- data/aliases.json +1 -0
- data/lib/interscript.rb +8 -3
- data/lib/interscript/fs.rb +27 -0
- data/lib/interscript/mapping.rb +3 -1
- data/lib/interscript/opal.rb +142 -3
- data/lib/interscript/opal/entrypoint.rb +8 -0
- data/lib/interscript/opal/exports.rb +11 -0
- data/lib/interscript/opal/maps.js.erb +2 -4
- data/lib/interscript/version.rb +1 -1
- data/maps/alalc-ara-Arab-Latn-1997.yaml +5 -5
- data/maps/alalc-asm-Deva-Latn-1997.yaml +104 -10
- data/maps/alalc-asm-Deva-Latn-2012.yaml +18 -3
- data/maps/alalc-aze-Arab-Latn-1997.yaml +376 -0
- data/maps/alalc-ben-Beng-Latn-1997.yaml +291 -0
- data/maps/alalc-div-Thaa-Latn-1997.yaml +211 -0
- data/maps/alalc-hin-Deva-Latn-1997.yaml +102 -10
- data/maps/alalc-hin-Deva-Latn-2011.yaml +19 -1
- data/maps/alalc-kan-Kana-Latn-1997.yaml +274 -0
- data/maps/alalc-kan-Kana-Latn-2011.yaml +63 -0
- data/maps/alalc-ori-Orya-Latn-1997.yaml +284 -0
- data/maps/alalc-ori-Orya-Latn-2011.yaml +67 -0
- data/maps/alalc-pra-Deva-Latn-2012.yaml +2 -2
- data/maps/alalc-san-Deva-Latn-2012.yaml +78 -9
- data/maps/alalc-tel-Telu-Latn-1997.yaml +284 -0
- data/maps/alalc-tel-Telu-Latn-2011.yaml +64 -0
- data/maps/az-aze-Cyrl-Latn-1939.yaml +105 -0
- data/maps/az-aze-Cyrl-Latn-1958.yaml +45 -0
- data/maps/bgnpcgn-ara-Arab-Latn-1956.yaml +3 -1
- data/maps/bgnpcgn-aze-Cyrl-Latn-1993.yaml +111 -104
- data/maps/bgnpcgn-bal-Arab-Latn-2008.yaml +329 -0
- data/maps/bgnpcgn-bul-Cyrl-Latn-1952.yaml +1 -1
- data/maps/bgnpcgn-div-Thaa-Latn-1988.yaml +75 -0
- data/maps/bgnpcgn-far-Latn-Latn-1964.yaml +28 -0
- data/maps/bgnpcgn-isl-Latn-Latn-1964.yaml +37 -0
- data/maps/bgnpcgn-kaz-Cyrl-Latn-1979.yaml +247 -0
- data/maps/bgnpcgn-kir-Cyrl-Latn-1979.yaml +218 -0
- data/maps/bgnpcgn-kur-Arab-Latn-2007.yaml +249 -0
- data/maps/bgnpcgn-per-Arab-Latn-1958.yaml +2 -0
- data/maps/bgnpcgn-prs-Arab-Latn-2007.yaml +87 -53
- data/maps/bgnpcgn-pus-Arab-Latn-1968.yaml +377 -0
- data/maps/bgnpcgn-srp-Cyrl-Latn-1962.yaml +73 -0
- data/maps/bgnpcgn-urd-Arab-Latn-2007.yaml +459 -0
- data/maps/{bis-knd-Knda-Latn-13194-1991.yaml → bis-kan-Kana-Latn-13194-1991.yaml} +2 -2
- data/maps/bis-ori-Orya-Latn-13194-1991.yaml +17 -2
- data/maps/iso-ara-Arab-Latn-233-1984.yaml +1 -1
- data/maps/{iso-kan-Knda-Latn-15919-2001.yaml → iso-kan-Kana-Latn-15919-2001.yaml} +1 -1
- data/maps/{mns-mon-Cyrl-Latn-5217-2012.yaml → masm-mon-Cyrl-Latn-5217-2012.yaml} +2 -2
- data/maps/{mns-mon-Latn-Cyrl-5217-2012.yaml → masm-mon-Latn-Cyrl-5217-2012.yaml} +1 -1
- data/maps/mv-div-Thaa-Latn-1987.yaml +200 -0
- data/maps/odni-ara-Arab-Latn-2004.yaml +137 -0
- data/maps/odni-ara-Arab-Latn-2015.yaml +20 -130
- data/maps/odni-bul-Cyrl-Latn-2005.yaml +90 -0
- data/maps/odni-fas-Arab-Latn-2004.yaml +276 -0
- data/maps/odni-hin-Deva-Latn-2004.yaml +182 -0
- data/maps/odni-mkd-Cyrl-Latn-2005.yaml +21 -0
- data/maps/odni-prs-Arab-Latn-2004.yaml +123 -0
- data/maps/{odni-per-Arab-Latn-2015.yaml → odni-prs-Arab-Latn-2015.yaml} +0 -0
- data/maps/odni-srp-Cyrl-Latn-2005.yaml +36 -0
- data/maps/odni-tuk-Cyrl-Latn-2015.yaml +170 -0
- data/maps/odni-ukr-Cyrl-Latn-2015.yaml +4 -0
- data/maps/un-ara-Arab-Latn-2017.yaml +1 -1
- data/maps/un-asm-Beng-Latn-1972.yaml +223 -0
- data/maps/un-guj-Gujr-Latn-1972.yaml +229 -0
- data/maps/un-hin-Deva-Latn-2016.yaml +104 -10
- data/maps/un-kan-Kana-Latn-2016.yaml +254 -0
- data/maps/un-mal-Mlym-Latn-1972.yaml +251 -0
- data/maps/un-mar-Deva-Latn-2016.yaml +24 -13
- data/maps/un-nep-Deva-Latn-1972.yaml +40 -121
- data/maps/un-ori-Orya-Latn-1972.yaml +247 -0
- data/maps/un-pan-Guru-Latn-1972.yaml +402 -0
- data/maps/un-prs-Arab-Latn-1967.yaml +236 -0
- data/maps/un-tam-Taml-Latn-1972.yaml +194 -0
- data/maps/un-tel-Telu-Latn-1972.yaml +270 -0
- data/maps/un-urd-Arab-Latn-1972.yaml +405 -0
- data/maps/var-amh-Ethi-Latn-eae-2003.yaml +466 -0
- data/maps/var-gez-Ethi-Latn-eae-2003.yaml +76 -0
- data/spec/interscript/filenames_spec.rb +6 -369
- data/spec/interscript_spec.rb +10 -2
- metadata +50 -7
- data/lib/interscript/opal/map_translate.rb +0 -7
@@ -0,0 +1,45 @@
|
|
1
|
+
---
|
2
|
+
authority_id: az
|
3
|
+
id: 1958
|
4
|
+
language: iso-639-2:aze
|
5
|
+
source_script: Cyrl
|
6
|
+
destination_script: Latn
|
7
|
+
url: https://omniglot.com/writing/azeri.htm
|
8
|
+
creation_date: 1958
|
9
|
+
description: |
|
10
|
+
In 1939 Joseph Stalin ordered the Cyrillic alphabet to be used by Azeri speakers in the Soviet Union.
|
11
|
+
|
12
|
+
notes:
|
13
|
+
- In 1947, the letter Цц was excluded from the alphabet. Previously, it was used for Russian borrowings
|
14
|
+
- In 1958, the letters Ээ, Юю, Яя were eliminated, and the letter Йй was replaced by Јј
|
15
|
+
|
16
|
+
tests:
|
17
|
+
- source: Юя
|
18
|
+
expected: Юя
|
19
|
+
# from internet
|
20
|
+
- source: Азәрбајҹан әлифбасы
|
21
|
+
expected: Azərbaycan əlifbası
|
22
|
+
- source: |
|
23
|
+
Бүтүн инсанлар ләјагәт вә һүгугларына ҝөрә азад бәрабәр доғулурлар.
|
24
|
+
Онларын шүурлары вә виҹданлары вар вә бир-бирләринә мүнасибәтдә гардашлыг руһунда давранмалыдырлар.
|
25
|
+
expected: |
|
26
|
+
Bütün insanlar ləyaqət və hüquqlarına görə azad bərabər doğulurlar.
|
27
|
+
Onların şüurları və vicdanları var və bir-birlərinə münasibətdə qardaşlıq ruhunda davranmalıdırlar.
|
28
|
+
|
29
|
+
map:
|
30
|
+
inherit: az-aze-Cyrl-Latn-1939
|
31
|
+
|
32
|
+
characters:
|
33
|
+
"\u0408": "Y" # Ј note[2]
|
34
|
+
"\u0419": ~ # Й note[2]
|
35
|
+
"\u0426": ~ # Ц note[1]
|
36
|
+
"\u042D": ~ # Э note[2]
|
37
|
+
"\u042E": ~ # Ю note[2]
|
38
|
+
"\u042F": ~ # Я note[2]
|
39
|
+
|
40
|
+
"\u0458": "y" # ј note[2]
|
41
|
+
"\u0439": ~ # й note[2]
|
42
|
+
"\u0446": ~ # ц note[1]
|
43
|
+
"\u044D": ~ # э note[2]
|
44
|
+
"\u044E": ~ # ю note[2]
|
45
|
+
"\u044F": ~ # я note[2]
|
@@ -269,6 +269,8 @@ tests:
|
|
269
269
|
- source: زَاڴُورَة
|
270
270
|
expected: Zāgūrah
|
271
271
|
|
272
|
+
- source: اِيران
|
273
|
+
expected: Īrān
|
272
274
|
|
273
275
|
map:
|
274
276
|
postrules:
|
@@ -301,7 +303,7 @@ map:
|
|
301
303
|
result: ' az̧ Z̧'
|
302
304
|
- pattern : ' Al L' # الل
|
303
305
|
result: ' al L'
|
304
|
-
- pattern : '
|
306
|
+
- pattern : ' An N' # الن
|
305
307
|
result: ' an N'
|
306
308
|
- pattern: " Al " # ال
|
307
309
|
result: " al "
|
@@ -1,104 +1,111 @@
|
|
1
|
-
---
|
2
|
-
authority_id: bgnpcgn
|
3
|
-
id: 1993
|
4
|
-
language: iso-639-2:aze
|
5
|
-
source_script: Cyrl
|
6
|
-
destination_script: Latn
|
7
|
-
name: AZERBAIJANI TABLE OF CORRESPONDENCES CYRILLIC-ROMAN -- BGN/PCGN 1993 Agreement
|
8
|
-
url: https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/816656/TABLE_OF_CORRESPONDENCES_FOR_AZERBAIJANI.pdf
|
9
|
-
creation_date: 1993
|
10
|
-
confirmation date: 2019-06
|
11
|
-
description: |
|
12
|
-
Azerbaijani, also known as Azeri, is the official language of the Republic of Azerbaijan. In 1991, the Azerbaijani government adopted the Roman alphabet to replace the existing Cyrillic alphabet. The presentation below provides a table of correspondences between the former Cyrillic alphabet and the current Roman alphabet. When Azerbaijani Roman-alphabet spellings are not available, this table can be used to convert Azerbaijani Cyrillic spellings.
|
13
|
-
|
14
|
-
notes:
|
15
|
-
|
16
|
-
- The special letter Ə, ə known as schwa, should be reproduced in that form whenever encountered. The characters Ə (Unicode 04D8) and ə (Unicode 04D9) should be used for schwa when writing in the Cyrillic script, but characters Ə (Unicode 018F) and ə (Unicode 0259) should be used when writing in the Roman alphabet. In those instances when it cannot be reproduced, however, the letter Ä ä may be substituted for it (see below).
|
17
|
-
|
18
|
-
- The obsolete characters й, э, ю, and я should be romanized ẏ, ė, yu., and ya.
|
19
|
-
|
20
|
-
- Unicode values are shown with the uppercase Cyrillic character first, followed by the lowercase character. It is not known whether there exists an uppercase ‘J’ specific to the Cyrillic character set.
|
21
|
-
|
22
|
-
- |
|
23
|
-
An inventory of letter-diacritic combinations, with their Unicode encoding, in addition to the unmodified letters of the basic Roman script is:
|
24
|
-
Ğ (U+011E), ğ (U+011F)
|
25
|
-
Ə (U+018F), ə (U+0259)
|
26
|
-
İ (U+0130), ı (U+0131)
|
27
|
-
Ö (U+00D6), ö (U+00F6)
|
28
|
-
Ü (U+00DC), ü (U+00FC)
|
29
|
-
Ç (U+00C7), ç (U+00E7)
|
30
|
-
Ş (U+015E), ş (U+015F)
|
31
|
-
|
32
|
-
- The Roman-script columns show only lowercase forms but, when applying the table, uppercase and lowercase Roman letters as appropriate should be used.
|
33
|
-
|
34
|
-
tests:
|
35
|
-
- source:
|
36
|
-
expected:
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
1
|
+
---
|
2
|
+
authority_id: bgnpcgn
|
3
|
+
id: 1993
|
4
|
+
language: iso-639-2:aze
|
5
|
+
source_script: Cyrl
|
6
|
+
destination_script: Latn
|
7
|
+
name: AZERBAIJANI TABLE OF CORRESPONDENCES CYRILLIC-ROMAN -- BGN/PCGN 1993 Agreement
|
8
|
+
url: https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/816656/TABLE_OF_CORRESPONDENCES_FOR_AZERBAIJANI.pdf
|
9
|
+
creation_date: 1993
|
10
|
+
confirmation date: 2019-06
|
11
|
+
description: |
|
12
|
+
Azerbaijani, also known as Azeri, is the official language of the Republic of Azerbaijan. In 1991, the Azerbaijani government adopted the Roman alphabet to replace the existing Cyrillic alphabet. The presentation below provides a table of correspondences between the former Cyrillic alphabet and the current Roman alphabet. When Azerbaijani Roman-alphabet spellings are not available, this table can be used to convert Azerbaijani Cyrillic spellings.
|
13
|
+
|
14
|
+
notes:
|
15
|
+
|
16
|
+
- The special letter Ə, ə known as schwa, should be reproduced in that form whenever encountered. The characters Ə (Unicode 04D8) and ə (Unicode 04D9) should be used for schwa when writing in the Cyrillic script, but characters Ə (Unicode 018F) and ə (Unicode 0259) should be used when writing in the Roman alphabet. In those instances when it cannot be reproduced, however, the letter Ä ä may be substituted for it (see below).
|
17
|
+
|
18
|
+
- The obsolete characters й, э, ю, and я should be romanized ẏ, ė, yu., and ya.
|
19
|
+
|
20
|
+
- Unicode values are shown with the uppercase Cyrillic character first, followed by the lowercase character. It is not known whether there exists an uppercase ‘J’ specific to the Cyrillic character set.
|
21
|
+
|
22
|
+
- |
|
23
|
+
An inventory of letter-diacritic combinations, with their Unicode encoding, in addition to the unmodified letters of the basic Roman script is:
|
24
|
+
Ğ (U+011E), ğ (U+011F)
|
25
|
+
Ə (U+018F), ə (U+0259)
|
26
|
+
İ (U+0130), ı (U+0131)
|
27
|
+
Ö (U+00D6), ö (U+00F6)
|
28
|
+
Ü (U+00DC), ü (U+00FC)
|
29
|
+
Ç (U+00C7), ç (U+00E7)
|
30
|
+
Ş (U+015E), ş (U+015F)
|
31
|
+
|
32
|
+
- The Roman-script columns show only lowercase forms but, when applying the table, uppercase and lowercase Roman letters as appropriate should be used.
|
33
|
+
|
34
|
+
tests:
|
35
|
+
- source: Азәрбајҹан әлифбасы
|
36
|
+
expected: Azərbaycan əlifbası
|
37
|
+
- source: |
|
38
|
+
Бүтүн инсанлар ләјагәт вә һүгугларына ҝөрә азад бәрабәр доғулурлар.
|
39
|
+
Онларын шүурлары вә виҹданлары вар вә бир-бирләринә мүнасибәтдә гардашлыг руһунда давранмалыдырлар.
|
40
|
+
expected: |
|
41
|
+
Bütün insanlar ləyaqət və hüquqlarına görə azad bərabər doğulurlar.
|
42
|
+
Onların şüurları və vicdanları var və bir-birlərinə münasibətdə qardaşlıq ruhunda davranmalıdırlar.
|
43
|
+
|
44
|
+
|
45
|
+
map:
|
46
|
+
characters:
|
47
|
+
"\u0410": "A" # А
|
48
|
+
"\u0411": "B" # Б
|
49
|
+
"\u0412": "V" # В
|
50
|
+
"\u0413": "Q" # Г
|
51
|
+
"\u0492": "\u011E" # Ғ
|
52
|
+
"\u0414": "D" # Д
|
53
|
+
"\u0415": "E" # Е
|
54
|
+
"\u04D8": "\u018F" # Ә
|
55
|
+
"\u0416": "J" # Ж
|
56
|
+
"\u0417": "Z" # З
|
57
|
+
"\u0418": "\u0130" # И
|
58
|
+
"\u042B": "I" # Ы
|
59
|
+
"\u0408": "Y" # Ј
|
60
|
+
"\u041A": "K" # К
|
61
|
+
"\u049C": "G" # Ҝ
|
62
|
+
"\u041B": "L" # Л
|
63
|
+
"\u041C": "M" # М
|
64
|
+
"\u041D": "N" # Н
|
65
|
+
"\u041E": "O" # О
|
66
|
+
"\u04E8": "\u00D6" # Ө
|
67
|
+
"\u041F": "P" # П
|
68
|
+
"\u0420": "R" # Р
|
69
|
+
"\u0421": "S" # С
|
70
|
+
"\u0422": "T" # Т
|
71
|
+
"\u0423": "U" # У
|
72
|
+
"\u04AE": "\u00DC" # Ү
|
73
|
+
"\u0424": "F" # Ф
|
74
|
+
"\u0425": "X" # Х
|
75
|
+
"\u04BA": "H" # Һ
|
76
|
+
"\u0427": "\u00C7" # Ч
|
77
|
+
"\u04B8": "C" # Ҹ
|
78
|
+
"\u0428": "\u015E" # Ш
|
79
|
+
|
80
|
+
"\u0430": "a" # а
|
81
|
+
"\u0431": "b" # б
|
82
|
+
"\u0432": "v" # в
|
83
|
+
"\u0433": "q" # г
|
84
|
+
"\u0493": "\u011F" # ғ
|
85
|
+
"\u0434": "d" # д
|
86
|
+
"\u0435": "e" # е
|
87
|
+
"\u04D9": "\u0259" # ә
|
88
|
+
"\u0436": "j" # ж
|
89
|
+
"\u0437": "z" # з
|
90
|
+
"\u0438": "i" # и
|
91
|
+
"\u044B": "\u0131" # ы
|
92
|
+
"\u0458": "y" # ј
|
93
|
+
"\u043A": "k" # к
|
94
|
+
"\u049D": "g" # ҝ
|
95
|
+
"\u043B": "l" # л
|
96
|
+
"\u043C": "m" # м
|
97
|
+
"\u043D": "n" # н
|
98
|
+
"\u043E": "o" # о
|
99
|
+
"\u04E9": "\u00F6" # ө
|
100
|
+
"\u043F": "p" # п
|
101
|
+
"\u0440": "r" # р
|
102
|
+
"\u0441": "s" # с
|
103
|
+
"\u0442": "t" # т
|
104
|
+
"\u0443": "u" # у
|
105
|
+
"\u04AF": "\u00FC" # ү
|
106
|
+
"\u0444": "f" # ф
|
107
|
+
"\u0445": "x" # х
|
108
|
+
"\u04BB": "h" # һ
|
109
|
+
"\u0447": "\u00E7" # ч
|
110
|
+
"\u04B9": "c" # ҹ
|
111
|
+
"\u0448": "\u015F" # ш
|
@@ -0,0 +1,329 @@
|
|
1
|
+
---
|
2
|
+
authority_id: bgnpcgn
|
3
|
+
id: 2008
|
4
|
+
language: bal
|
5
|
+
source_script: Arab
|
6
|
+
destination_script: Latn
|
7
|
+
name: ROMANIZATION OF BALUCHI -- BGN/PCGN 2008 System
|
8
|
+
url: https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/693687/ROMANIZATION_OF_BALUCHI.pdf
|
9
|
+
creation_date: 2008
|
10
|
+
confirmation date: 2017-11
|
11
|
+
description: |
|
12
|
+
The following is the BGN/PCGN-approved romanization
|
13
|
+
system for deriving standard spellings of Baluchi
|
14
|
+
geographic names. The romanization system is based on
|
15
|
+
the Hunterian system of romanization, which has been
|
16
|
+
used by the Surveys of India and Pakistan for
|
17
|
+
romanizing Baluchi geographic names for more than one
|
18
|
+
hundred years. The romanization system is compatible
|
19
|
+
with all dialects of Baluchi, including Eastern
|
20
|
+
Baluchi, Western Baluchi, and Southern Baluchi.
|
21
|
+
|
22
|
+
The BGN/PCGN system laid out below includes diacritical
|
23
|
+
marks in order that the original script can be derived
|
24
|
+
from the romanized form (i.e. it is reversible). For
|
25
|
+
desk users requiring a diacritic-free form, these
|
26
|
+
diacritics can simply be removed. In almost every case
|
27
|
+
the same basic Roman-script characters are kept as are
|
28
|
+
used in the Hunterian system. The BGN/PCGN forms have
|
29
|
+
further been designed to harmonize with the BGN/PCGN
|
30
|
+
Urdu romanization system. In rigorous romanization
|
31
|
+
(i.e. including diacritics), retroflexion is marked by
|
32
|
+
a sub-dot, and aspiration is marked by an apostrophe,
|
33
|
+
where confusion with fricative digraphs could arise.
|
34
|
+
For letters used only in Arabic loan words, the
|
35
|
+
rigorous forms have further been designed to harmonize
|
36
|
+
with the BGN/PCGN Persian romanization system.
|
37
|
+
|
38
|
+
notes:
|
39
|
+
- Occasionally, sequences of /z/ or /s/ plus /h/ may be
|
40
|
+
encountered, i.e. z·h, s·h. These may be romanized with the
|
41
|
+
Unicode 'center dot' (U+00B7) separating the two letters,
|
42
|
+
to distinguish them from the digraphs /zh/ and /sh/.
|
43
|
+
|
44
|
+
- The character ة is found very rarely in Baluchi, principally in certain Arabic religious terms, e.g. zakāt
|
45
|
+
('alms'). It should be romanized t.
|
46
|
+
|
47
|
+
- When the letters ال are found, representing the Arabic
|
48
|
+
definite article, the ل is assimilated to a following 'sun letter' ت, ث, د,
|
49
|
+
ذ, ر, ز, س, ش, ص, ض, ط, ظ, ل or ن and is romanized t, t͟h, d, d͟h, r, z, s, sh, ş, ẕ, ţ, z̧, l, n accordingly.
|
50
|
+
|
51
|
+
- In romanization, the suffixes ءَ (-ā, singular definite)
|
52
|
+
and ءِ (-ay, possessive) are connected to the previous word
|
53
|
+
by a hyphen, though they are usually written separately.
|
54
|
+
|
55
|
+
- The word for 'and', written as و or ءُ, should be
|
56
|
+
romanized as –u-, linked by hyphens to the two words it
|
57
|
+
connects; e.g.,
|
58
|
+
سند و ہند → Sind-u-Hind ('The Gangetic Plain').
|
59
|
+
|
60
|
+
- Except as specified in notes 4 and 5, word division in romanization should follow word division in the Baluchi script.
|
61
|
+
|
62
|
+
- Note that the short vowels in the Baluchi examples are not pointed.
|
63
|
+
|
64
|
+
- Certain initial, medial and final characters are not
|
65
|
+
readily available in a Unicode-encoded font in a standalone form.
|
66
|
+
|
67
|
+
- The Romanization columns show only lowercase forms but,
|
68
|
+
when romanizing, uppercase and lowercase Roman letters as
|
69
|
+
appropriate should be used.
|
70
|
+
|
71
|
+
tests:
|
72
|
+
# commented tests are blocked by https://github.com/interscript/interscript/issues/620
|
73
|
+
# 'cultivable patch of riverbed'
|
74
|
+
- source: بےنٹَگ
|
75
|
+
expected: Benṭag
|
76
|
+
|
77
|
+
# 'Japan'
|
78
|
+
- source: جاپان
|
79
|
+
expected: Jāpān
|
80
|
+
|
81
|
+
- source: اَرَبِستان
|
82
|
+
expected: Arabistān
|
83
|
+
|
84
|
+
- source: بُنجاه
|
85
|
+
expected: Bunjāh
|
86
|
+
|
87
|
+
- source: بَلوچِستان
|
88
|
+
expected: Balochistān
|
89
|
+
|
90
|
+
# 'village'
|
91
|
+
- source: حَلق
|
92
|
+
expected: Ḩalq
|
93
|
+
|
94
|
+
# 'foothills or skirts of a mountain'
|
95
|
+
- source: دامان
|
96
|
+
expected: Dāmān
|
97
|
+
|
98
|
+
- source: ڈاڈَر
|
99
|
+
expected: Ḍāḍar
|
100
|
+
|
101
|
+
# 'tomb'
|
102
|
+
- source: گُمبُذ
|
103
|
+
expected: Gumbud͟h
|
104
|
+
|
105
|
+
# 'crossroads'
|
106
|
+
- source: چار راہ
|
107
|
+
expected: Chār Rāh
|
108
|
+
|
109
|
+
# 'market'
|
110
|
+
- source: بازار
|
111
|
+
expected: Bāzār
|
112
|
+
|
113
|
+
- source: سےبِى
|
114
|
+
expected: Sebī
|
115
|
+
|
116
|
+
# - source: اِيشيا
|
117
|
+
# expected: Eshyā
|
118
|
+
|
119
|
+
|
120
|
+
# # 'homeland'
|
121
|
+
# - source: وَطَن
|
122
|
+
# expected: Waţan
|
123
|
+
|
124
|
+
# 'Bandar Abbas'
|
125
|
+
- source: عَبّاس
|
126
|
+
expected: ‘Abbās
|
127
|
+
|
128
|
+
# 'Taiwan'
|
129
|
+
- source: فارموسا
|
130
|
+
expected: Fārmosā
|
131
|
+
|
132
|
+
- source: ڈاک
|
133
|
+
expected: Ḍāk
|
134
|
+
|
135
|
+
# 'stream, irrigated area, pasture'
|
136
|
+
- source: مَلّ
|
137
|
+
expected: Mall
|
138
|
+
|
139
|
+
# - source: ہ یرات
|
140
|
+
# expected: Herāt
|
141
|
+
|
142
|
+
# 'Philippines'
|
143
|
+
- source: فِلپائِن
|
144
|
+
expected: Filpā’in
|
145
|
+
|
146
|
+
- source: مُرگاپ
|
147
|
+
expected: Murgāp
|
148
|
+
|
149
|
+
# - source: مَرو
|
150
|
+
# expected: Marw
|
151
|
+
|
152
|
+
|
153
|
+
map:
|
154
|
+
postrules:
|
155
|
+
- pattern: (?<=\b)(?<!\b[‘|’|'])[\u0061-\uFFFF]
|
156
|
+
result: "upcase"
|
157
|
+
|
158
|
+
characters:
|
159
|
+
|
160
|
+
# consonant characters
|
161
|
+
|
162
|
+
'\u0628' : 'b' # ب
|
163
|
+
'\u067E' : 'p' # پ
|
164
|
+
'\u062a' : 't' # ت
|
165
|
+
'\u0679' : 'ṭ' # see note 8 ٹ
|
166
|
+
'\u067C' : 'ṭ' # see note 8 ټ
|
167
|
+
'\u062B' : 't͟h' # see note 8 ث
|
168
|
+
'\u067F' : 't͟h' # see note 8 ٿ
|
169
|
+
'\u062c' : 'j' # ج
|
170
|
+
'\u0686' : 'ch' # چ
|
171
|
+
'\u062d' : 'ḩ' # ح
|
172
|
+
'\u062e' : 'kh' # خ
|
173
|
+
'\u062f' : 'd' # د
|
174
|
+
'\u0688' : 'ḍ' # ڈ
|
175
|
+
'\u0689' : 'ḍ' # ډ
|
176
|
+
'\u0630' : 'd͟h' # ذ
|
177
|
+
'\u0631' : 'r' # ر
|
178
|
+
'\u0691' : 'ṛ' # see note 8 ڑ
|
179
|
+
'\u0693' : 'ṛ' # see note 8 ړ
|
180
|
+
'\u0632' : 'z' # ز
|
181
|
+
'\u0698' : 'zh' # ژ
|
182
|
+
'\u0633' : 's' # س
|
183
|
+
'\u0634' : 'sh' # ش
|
184
|
+
'\u0635' : 'ş' # ص
|
185
|
+
'\u0636' : 'ẕ' # ض
|
186
|
+
'\u0637' : 'ţ' # ط
|
187
|
+
'\u0638' : 'z̧' # ظ
|
188
|
+
'\u0639' : '‘' # ع
|
189
|
+
'\u063a' : 'gh' # غ
|
190
|
+
'\u0641' : 'f' # ف
|
191
|
+
'\u0642' : 'q' # ق
|
192
|
+
'\u0643' : 'k' # ك
|
193
|
+
'\u06A9' : 'k' # ک
|
194
|
+
'\u06AF' : 'g' # گ
|
195
|
+
'\u0644' : 'l' # ل
|
196
|
+
'\u0645' : 'm' # م
|
197
|
+
'\u0646' : 'n' # ن
|
198
|
+
'\u06BA' : 'ñ' # ں
|
199
|
+
'\u0648' : 'w' # و
|
200
|
+
'\u0647' : 'h' # ه
|
201
|
+
'\u06C1' : 'h'
|
202
|
+
'\u06BE' : 'h'
|
203
|
+
'\u0621' : '’' # ء
|
204
|
+
'\u0626' : '’' # ئ
|
205
|
+
'\u0649' : 'y' # ى
|
206
|
+
'\u064A' : 'y' # ي
|
207
|
+
|
208
|
+
|
209
|
+
# Aspiration is only contrastive in Eastern Baluchi
|
210
|
+
'\u0628\u06BE' : 'bh'
|
211
|
+
|
212
|
+
# Aspiration is only contrastive in Eastern Baluchi
|
213
|
+
'\u067E\u06BE' : 'ph'
|
214
|
+
|
215
|
+
# Aspiration is only contrastive in Eastern Baluchi.
|
216
|
+
# Apostrophe distinguishes from fricative /th/.
|
217
|
+
'\u062A\u06BE' : 'th’'
|
218
|
+
|
219
|
+
# Aspiration is only contrastive in Eastern Baluchi
|
220
|
+
'\u0679\u06BE' : 'ṭh'
|
221
|
+
|
222
|
+
# Aspiration is only contrastive in Eastern Baluchi
|
223
|
+
'\u062C\u06BE' : 'jh'
|
224
|
+
|
225
|
+
# Aspiration is only contrastive in Eastern Baluchi
|
226
|
+
'\u0686\u06BE' : 'chh'
|
227
|
+
|
228
|
+
# Aspiration is only contrastive in Eastern Baluchi.
|
229
|
+
# Apostrophe distinguishes from fricative /dh/
|
230
|
+
'\u062F\u06BE' : 'dh’'
|
231
|
+
|
232
|
+
# Aspiration is only contrastive in Eastern Baluchi
|
233
|
+
'\u0688\u06BE' : 'ḍh'
|
234
|
+
|
235
|
+
# Aspiration is only contrastive in Eastern Baluchi
|
236
|
+
'\u0691\u06BE' : '\u1E5B\u0068'
|
237
|
+
|
238
|
+
# Aspiration is only contrastive in Eastern Baluchi.
|
239
|
+
# Apostrophe distinguishes from fricative /kh/
|
240
|
+
'\u06A9\u06BE' : 'kh’'
|
241
|
+
|
242
|
+
# Aspiration is only contrastive in Eastern Baluchi.
|
243
|
+
# Apostrophe distinguishes from fricative /gh/
|
244
|
+
'\u06AF\u06BE' : 'gh’' #
|
245
|
+
'\u0644\u0627' : 'lā' #
|
246
|
+
'\u06A9\u0627' : 'kā' #
|
247
|
+
'\u06AF\u0627' : 'gā' #
|
248
|
+
'\u06A9\u0644' : 'kl' #
|
249
|
+
'\u06AF\u0644' : 'gl' #
|
250
|
+
|
251
|
+
# Vowels, Diphthongs, and Diacritical Marks
|
252
|
+
'\u0650\u0649' : 'ī' # ـِى
|
253
|
+
'\u0650' : 'i' # ِ
|
254
|
+
'\u06D2' : 'e' # ـے
|
255
|
+
'\b\u0627' : '' # ا
|
256
|
+
'\u0627' : 'ā' # ا
|
257
|
+
'\u0622' : 'ā' # آ
|
258
|
+
'\u064E' : 'a' # َ
|
259
|
+
'\u0648' : 'o' # و
|
260
|
+
'\u064F' : 'u' # ُ
|
261
|
+
'\u064F\u0648' : 'ū' # ـُو
|
262
|
+
'\u064E\u06D2' : 'ay' # ـَے
|
263
|
+
'\u064E\u0648' : 'aw' # ـَو
|
264
|
+
'\u0652' : '' # Not Romanized
|
265
|
+
'\u0670' : 'á' #
|
266
|
+
|
267
|
+
'\u0628\u0651' : 'bb' # ب
|
268
|
+
'\u067E\u0651' : 'pp' # پ
|
269
|
+
'\u062a\u0651' : 'tt' # ت
|
270
|
+
'\u0679\u0651' : 'ṭṭ' # see note 8 ٹ
|
271
|
+
'\u067C\u0651' : 'ṭṭ' # see note 8 ټ
|
272
|
+
'\u062B\u0651' : 't͟ht͟h' # see note 8 ث
|
273
|
+
'\u067F\u0651' : 't͟ht͟h' # see note 8 ٿ
|
274
|
+
'\u062c\u0651' : 'jj' # ج
|
275
|
+
'\u0686\u0651' : 'chch' # چ
|
276
|
+
'\u062d\u0651' : 'ḩḩ' # ح
|
277
|
+
'\u062e\u0651' : 'khkh' # خ
|
278
|
+
'\u062f\u0651' : 'dd' # د
|
279
|
+
'\u0688\u0651' : 'ḍḍ' # ڈ
|
280
|
+
'\u0689\u0651' : 'ḍḍ' # ډ
|
281
|
+
'\u0630\u0651' : 'd͟hd͟h' # ذ
|
282
|
+
'\u0631\u0651' : 'rr' # ر
|
283
|
+
'\u0691\u0651' : 'ṛṛ' # see note 8 ڑ
|
284
|
+
'\u0693\u0651' : 'ṛṛ' # see note 8 ړ
|
285
|
+
'\u0632\u0651' : 'zz' # ز
|
286
|
+
'\u0698\u0651' : 'zhzh' # ژ
|
287
|
+
'\u0633\u0651' : 'ss' # س
|
288
|
+
'\u0634\u0651' : 'shsh' # ش
|
289
|
+
'\u0635\u0651' : 'şş' # ص
|
290
|
+
'\u0636\u0651' : 'ẕẕ' # ض
|
291
|
+
'\u0637\u0651' : 'ţţ' # ط
|
292
|
+
'\u0638\u0651' : 'z̧z̧' # ظ
|
293
|
+
'\u0639\u0651' : '‘‘' # ع
|
294
|
+
'\u063a\u0651' : 'ghgh' # غ
|
295
|
+
'\u0641\u0651' : 'ff' # ف
|
296
|
+
'\u0642\u0651' : 'qq' # ق
|
297
|
+
'\u0643\u0651' : 'kk' # ك
|
298
|
+
'\u06A9\u0651' : 'kk' # ک
|
299
|
+
'\u06AF\u0651' : 'gg' # گ
|
300
|
+
'\u0644\u0651' : 'll' # ل
|
301
|
+
'\u0645\u0651' : 'mm' # م
|
302
|
+
'\u0646\u0651' : 'nn' # ن
|
303
|
+
'\u06BA\u0651' : 'ññ' # ں
|
304
|
+
'\u0648\u0651' : 'ww' # و
|
305
|
+
'\u0647\u0651' : 'hh' # ه
|
306
|
+
'\u06C1\u0651' : 'hh'
|
307
|
+
'\u06BE\u0651' : 'hh'
|
308
|
+
'\u0621\u0651' : '’’' # ء
|
309
|
+
'\u0626\u0651' : '’’' # ئ
|
310
|
+
'\u0649\u0651' : 'yy' # ى
|
311
|
+
|
312
|
+
'\u0621\u064E' : '-ā' # see note 4
|
313
|
+
'\u0621\u0650' : '-ay' # see note 4
|
314
|
+
|
315
|
+
# Numerals
|
316
|
+
'۰' : '0'
|
317
|
+
'۱' : '1'
|
318
|
+
'۲' : '2'
|
319
|
+
'۳' : '3'
|
320
|
+
'۴' : '4'
|
321
|
+
'۵' : '5'
|
322
|
+
'۶' : '6'
|
323
|
+
'۷' : '7'
|
324
|
+
'۸' : '8'
|
325
|
+
'۹' : '9'
|
326
|
+
# Although Perso-Arabic script is written from right to
|
327
|
+
# left, numerical expressions, e.g. ۱۹۶۸ → 1968, are
|
328
|
+
# written from left to right. A comma is inserted into
|
329
|
+
# longer sequences, either after thousands, millions, etc.
|