interscript 0.1.7 → 0.1.9

Sign up to get free protection for your applications and to get access to all the features.
Files changed (82) hide show
  1. checksums.yaml +4 -4
  2. data/README.adoc +1 -3
  3. data/aliases.json +1 -0
  4. data/lib/interscript.rb +8 -3
  5. data/lib/interscript/fs.rb +27 -0
  6. data/lib/interscript/mapping.rb +3 -1
  7. data/lib/interscript/opal.rb +142 -3
  8. data/lib/interscript/opal/entrypoint.rb +8 -0
  9. data/lib/interscript/opal/exports.rb +11 -0
  10. data/lib/interscript/opal/maps.js.erb +2 -4
  11. data/lib/interscript/version.rb +1 -1
  12. data/maps/alalc-ara-Arab-Latn-1997.yaml +5 -5
  13. data/maps/alalc-asm-Deva-Latn-1997.yaml +104 -10
  14. data/maps/alalc-asm-Deva-Latn-2012.yaml +18 -3
  15. data/maps/alalc-aze-Arab-Latn-1997.yaml +376 -0
  16. data/maps/alalc-ben-Beng-Latn-1997.yaml +291 -0
  17. data/maps/alalc-div-Thaa-Latn-1997.yaml +211 -0
  18. data/maps/alalc-hin-Deva-Latn-1997.yaml +102 -10
  19. data/maps/alalc-hin-Deva-Latn-2011.yaml +19 -1
  20. data/maps/alalc-kan-Kana-Latn-1997.yaml +274 -0
  21. data/maps/alalc-kan-Kana-Latn-2011.yaml +63 -0
  22. data/maps/alalc-ori-Orya-Latn-1997.yaml +284 -0
  23. data/maps/alalc-ori-Orya-Latn-2011.yaml +67 -0
  24. data/maps/alalc-pra-Deva-Latn-2012.yaml +2 -2
  25. data/maps/alalc-san-Deva-Latn-2012.yaml +78 -9
  26. data/maps/alalc-tel-Telu-Latn-1997.yaml +284 -0
  27. data/maps/alalc-tel-Telu-Latn-2011.yaml +64 -0
  28. data/maps/az-aze-Cyrl-Latn-1939.yaml +105 -0
  29. data/maps/az-aze-Cyrl-Latn-1958.yaml +45 -0
  30. data/maps/bgnpcgn-ara-Arab-Latn-1956.yaml +3 -1
  31. data/maps/bgnpcgn-aze-Cyrl-Latn-1993.yaml +111 -104
  32. data/maps/bgnpcgn-bal-Arab-Latn-2008.yaml +329 -0
  33. data/maps/bgnpcgn-bul-Cyrl-Latn-1952.yaml +1 -1
  34. data/maps/bgnpcgn-div-Thaa-Latn-1988.yaml +75 -0
  35. data/maps/bgnpcgn-far-Latn-Latn-1964.yaml +28 -0
  36. data/maps/bgnpcgn-isl-Latn-Latn-1964.yaml +37 -0
  37. data/maps/bgnpcgn-kaz-Cyrl-Latn-1979.yaml +247 -0
  38. data/maps/bgnpcgn-kir-Cyrl-Latn-1979.yaml +218 -0
  39. data/maps/bgnpcgn-kur-Arab-Latn-2007.yaml +249 -0
  40. data/maps/bgnpcgn-per-Arab-Latn-1958.yaml +2 -0
  41. data/maps/bgnpcgn-prs-Arab-Latn-2007.yaml +87 -53
  42. data/maps/bgnpcgn-pus-Arab-Latn-1968.yaml +377 -0
  43. data/maps/bgnpcgn-srp-Cyrl-Latn-1962.yaml +73 -0
  44. data/maps/bgnpcgn-urd-Arab-Latn-2007.yaml +459 -0
  45. data/maps/{bis-knd-Knda-Latn-13194-1991.yaml → bis-kan-Kana-Latn-13194-1991.yaml} +2 -2
  46. data/maps/bis-ori-Orya-Latn-13194-1991.yaml +17 -2
  47. data/maps/iso-ara-Arab-Latn-233-1984.yaml +1 -1
  48. data/maps/{iso-kan-Knda-Latn-15919-2001.yaml → iso-kan-Kana-Latn-15919-2001.yaml} +1 -1
  49. data/maps/{mns-mon-Cyrl-Latn-5217-2012.yaml → masm-mon-Cyrl-Latn-5217-2012.yaml} +2 -2
  50. data/maps/{mns-mon-Latn-Cyrl-5217-2012.yaml → masm-mon-Latn-Cyrl-5217-2012.yaml} +1 -1
  51. data/maps/mv-div-Thaa-Latn-1987.yaml +200 -0
  52. data/maps/odni-ara-Arab-Latn-2004.yaml +137 -0
  53. data/maps/odni-ara-Arab-Latn-2015.yaml +20 -130
  54. data/maps/odni-bul-Cyrl-Latn-2005.yaml +90 -0
  55. data/maps/odni-fas-Arab-Latn-2004.yaml +276 -0
  56. data/maps/odni-hin-Deva-Latn-2004.yaml +182 -0
  57. data/maps/odni-mkd-Cyrl-Latn-2005.yaml +21 -0
  58. data/maps/odni-prs-Arab-Latn-2004.yaml +123 -0
  59. data/maps/{odni-per-Arab-Latn-2015.yaml → odni-prs-Arab-Latn-2015.yaml} +0 -0
  60. data/maps/odni-srp-Cyrl-Latn-2005.yaml +36 -0
  61. data/maps/odni-tuk-Cyrl-Latn-2015.yaml +170 -0
  62. data/maps/odni-ukr-Cyrl-Latn-2015.yaml +4 -0
  63. data/maps/un-ara-Arab-Latn-2017.yaml +1 -1
  64. data/maps/un-asm-Beng-Latn-1972.yaml +223 -0
  65. data/maps/un-guj-Gujr-Latn-1972.yaml +229 -0
  66. data/maps/un-hin-Deva-Latn-2016.yaml +104 -10
  67. data/maps/un-kan-Kana-Latn-2016.yaml +254 -0
  68. data/maps/un-mal-Mlym-Latn-1972.yaml +251 -0
  69. data/maps/un-mar-Deva-Latn-2016.yaml +24 -13
  70. data/maps/un-nep-Deva-Latn-1972.yaml +40 -121
  71. data/maps/un-ori-Orya-Latn-1972.yaml +247 -0
  72. data/maps/un-pan-Guru-Latn-1972.yaml +402 -0
  73. data/maps/un-prs-Arab-Latn-1967.yaml +236 -0
  74. data/maps/un-tam-Taml-Latn-1972.yaml +194 -0
  75. data/maps/un-tel-Telu-Latn-1972.yaml +270 -0
  76. data/maps/un-urd-Arab-Latn-1972.yaml +405 -0
  77. data/maps/var-amh-Ethi-Latn-eae-2003.yaml +466 -0
  78. data/maps/var-gez-Ethi-Latn-eae-2003.yaml +76 -0
  79. data/spec/interscript/filenames_spec.rb +6 -369
  80. data/spec/interscript_spec.rb +10 -2
  81. metadata +50 -7
  82. data/lib/interscript/opal/map_translate.rb +0 -7
@@ -0,0 +1,45 @@
1
+ ---
2
+ authority_id: az
3
+ id: 1958
4
+ language: iso-639-2:aze
5
+ source_script: Cyrl
6
+ destination_script: Latn
7
+ url: https://omniglot.com/writing/azeri.htm
8
+ creation_date: 1958
9
+ description: |
10
+ In 1939 Joseph Stalin ordered the Cyrillic alphabet to be used by Azeri speakers in the Soviet Union.
11
+
12
+ notes:
13
+ - In 1947, the letter Цц was excluded from the alphabet. Previously, it was used for Russian borrowings
14
+ - In 1958, the letters Ээ, Юю, Яя were eliminated, and the letter Йй was replaced by Јј
15
+
16
+ tests:
17
+ - source: Юя
18
+ expected: Юя
19
+ # from internet
20
+ - source: Азәрбајҹан әлифбасы
21
+ expected: Azərbaycan əlifbası
22
+ - source: |
23
+ Бүтүн инсанлар ләјагәт вә һүгугларына ҝөрә азад бәрабәр доғулурлар.
24
+ Онларын шүурлары вә виҹданлары вар вә бир-бирләринә мүнасибәтдә гардашлыг руһунда давранмалыдырлар.
25
+ expected: |
26
+ Bütün insanlar ləyaqət və hüquqlarına görə azad bərabər doğulurlar.
27
+ Onların şüurları və vicdanları var və bir-birlərinə münasibətdə qardaşlıq ruhunda davranmalıdırlar.
28
+
29
+ map:
30
+ inherit: az-aze-Cyrl-Latn-1939
31
+
32
+ characters:
33
+ "\u0408": "Y" # Ј note[2]
34
+ "\u0419": ~ # Й note[2]
35
+ "\u0426": ~ # Ц note[1]
36
+ "\u042D": ~ # Э note[2]
37
+ "\u042E": ~ # Ю note[2]
38
+ "\u042F": ~ # Я note[2]
39
+
40
+ "\u0458": "y" # ј note[2]
41
+ "\u0439": ~ # й note[2]
42
+ "\u0446": ~ # ц note[1]
43
+ "\u044D": ~ # э note[2]
44
+ "\u044E": ~ # ю note[2]
45
+ "\u044F": ~ # я note[2]
@@ -269,6 +269,8 @@ tests:
269
269
  - source: زَاڴُورَة
270
270
  expected: Zāgūrah
271
271
 
272
+ - source: اِيران
273
+ expected: Īrān
272
274
 
273
275
  map:
274
276
  postrules:
@@ -301,7 +303,7 @@ map:
301
303
  result: ' az̧ Z̧'
302
304
  - pattern : ' Al L' # الل
303
305
  result: ' al L'
304
- - pattern : ' an n' # الن
306
+ - pattern : ' An N' # الن
305
307
  result: ' an N'
306
308
  - pattern: " Al " # ال
307
309
  result: " al "
@@ -1,104 +1,111 @@
1
- ---
2
- authority_id: bgnpcgn
3
- id: 1993
4
- language: iso-639-2:aze
5
- source_script: Cyrl
6
- destination_script: Latn
7
- name: AZERBAIJANI TABLE OF CORRESPONDENCES CYRILLIC-ROMAN -- BGN/PCGN 1993 Agreement
8
- url: https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/816656/TABLE_OF_CORRESPONDENCES_FOR_AZERBAIJANI.pdf
9
- creation_date: 1993
10
- confirmation date: 2019-06
11
- description: |
12
- Azerbaijani, also known as Azeri, is the official language of the Republic of Azerbaijan. In 1991, the Azerbaijani government adopted the Roman alphabet to replace the existing Cyrillic alphabet. The presentation below provides a table of correspondences between the former Cyrillic alphabet and the current Roman alphabet. When Azerbaijani Roman-alphabet spellings are not available, this table can be used to convert Azerbaijani Cyrillic spellings.
13
-
14
- notes:
15
-
16
- - The special letter Ə, ə known as schwa, should be reproduced in that form whenever encountered. The characters Ə (Unicode 04D8) and ə (Unicode 04D9) should be used for schwa when writing in the Cyrillic script, but characters Ə (Unicode 018F) and ə (Unicode 0259) should be used when writing in the Roman alphabet. In those instances when it cannot be reproduced, however, the letter Ä ä may be substituted for it (see below).
17
-
18
- - The obsolete characters й, э, ю, and я should be romanized ẏ, ė, yu., and ya.
19
-
20
- - Unicode values are shown with the uppercase Cyrillic character first, followed by the lowercase character. It is not known whether there exists an uppercase ‘J’ specific to the Cyrillic character set.
21
-
22
- - |
23
- An inventory of letter-diacritic combinations, with their Unicode encoding, in addition to the unmodified letters of the basic Roman script is:
24
- Ğ (U+011E), ğ (U+011F)
25
- Ə (U+018F), ə (U+0259)
26
- İ (U+0130), ı (U+0131)
27
- Ö (U+00D6), ö (U+00F6)
28
- Ü (U+00DC), ü (U+00FC)
29
- Ç (U+00C7), ç (U+00E7)
30
- Ş (U+015E), ş (U+015F)
31
-
32
- - The Roman-script columns show only lowercase forms but, when applying the table, uppercase and lowercase Roman letters as appropriate should be used.
33
-
34
- tests:
35
- - source:
36
- expected:
37
-
38
- map:
39
- characters:
40
- '\u0410' : 'A'
41
- '\u0411' : 'B'
42
- '\u0412' : 'G'
43
- '\u0413' : 'V'
44
- '\u0492' : 'Ğ'
45
- '\u0414' : 'D'
46
- '\u0415' : 'E'
47
- '\u04D8' : 'Ә'
48
- '\u0416' : 'J'
49
- '\u0417' : 'Z'
50
- '\u0418' : 'I'
51
- '\u042B' : 'İ'
52
- '\u0408' : 'Y'
53
- '\u041A' : 'K'
54
- '\u049C' : 'G'
55
- '\u041B' : 'L'
56
- '\u041C' : 'M'
57
- '\u041D' : 'N'
58
- '\u041E' : 'O'
59
- '\u04E8' : 'Ö'
60
- '\u041F' : 'P'
61
- '\u0420' : 'R'
62
- '\u0421' : 'S'
63
- '\u0422' : 'T'
64
- '\u0423' : 'U'
65
- '\u04AE' : 'Ü'
66
- '\u0424' : 'F'
67
- '\u0425' : 'X'
68
- '\u04BA' : 'H'
69
- '\u0427' : 'Ç'
70
- '\u04B8' : 'C'
71
- '\u0428' : 'Ş'
72
-
73
- '\u0430' : 'a'
74
- '\u0431' : 'b'
75
- '\u0432' : 'v'
76
- '\u0433' : 'g'
77
- '\u0493' : 'ğ'
78
- '\u0434' : 'd'
79
- '\u0435' : 'e'
80
- '\u04D9' : 'ә'
81
- '\u0436' : 'j'
82
- '\u0437' : 'z'
83
- '\u0438' : 'i'
84
- '\u044B' : 'ı'
85
- '\u0458' : 'y'
86
- '\u043A' : 'k'
87
- '\u049D' : 'g'
88
- '\u043B' : 'l'
89
- '\u043C' : 'm'
90
- '\u043D' : 'n'
91
- '\u043E' : 'o'
92
- '\u04E9' : 'ö'
93
- '\u043F' : 'p'
94
- '\u0440' : 'r'
95
- '\u0441' : 's'
96
- '\u0442' : 't'
97
- '\u0443' : 'u'
98
- '\u04AF' : 'ü'
99
- '\u0444' : 'f'
100
- '\u0445' : 'x'
101
- '\u04BB' : 'h'
102
- '\u0447' : 'ç'
103
- '\u04B9' : 'c'
104
- '\u0448' : 'ş'
1
+ ---
2
+ authority_id: bgnpcgn
3
+ id: 1993
4
+ language: iso-639-2:aze
5
+ source_script: Cyrl
6
+ destination_script: Latn
7
+ name: AZERBAIJANI TABLE OF CORRESPONDENCES CYRILLIC-ROMAN -- BGN/PCGN 1993 Agreement
8
+ url: https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/816656/TABLE_OF_CORRESPONDENCES_FOR_AZERBAIJANI.pdf
9
+ creation_date: 1993
10
+ confirmation date: 2019-06
11
+ description: |
12
+ Azerbaijani, also known as Azeri, is the official language of the Republic of Azerbaijan. In 1991, the Azerbaijani government adopted the Roman alphabet to replace the existing Cyrillic alphabet. The presentation below provides a table of correspondences between the former Cyrillic alphabet and the current Roman alphabet. When Azerbaijani Roman-alphabet spellings are not available, this table can be used to convert Azerbaijani Cyrillic spellings.
13
+
14
+ notes:
15
+
16
+ - The special letter Ə, ə known as schwa, should be reproduced in that form whenever encountered. The characters Ə (Unicode 04D8) and ə (Unicode 04D9) should be used for schwa when writing in the Cyrillic script, but characters Ə (Unicode 018F) and ə (Unicode 0259) should be used when writing in the Roman alphabet. In those instances when it cannot be reproduced, however, the letter Ä ä may be substituted for it (see below).
17
+
18
+ - The obsolete characters й, э, ю, and я should be romanized ẏ, ė, ẏu, and ẏa.
19
+
20
+ - Unicode values are shown with the uppercase Cyrillic character first, followed by the lowercase character. It is not known whether there exists an uppercase ‘J’ specific to the Cyrillic character set.
21
+
22
+ - |
23
+ An inventory of letter-diacritic combinations, with their Unicode encoding, in addition to the unmodified letters of the basic Roman script is:
24
+ Ğ (U+011E), ğ (U+011F)
25
+ Ə (U+018F), ə (U+0259)
26
+ İ (U+0130), ı (U+0131)
27
+ Ö (U+00D6), ö (U+00F6)
28
+ Ü (U+00DC), ü (U+00FC)
29
+ Ç (U+00C7), ç (U+00E7)
30
+ Ş (U+015E), ş (U+015F)
31
+
32
+ - The Roman-script columns show only lowercase forms but, when applying the table, uppercase and lowercase Roman letters as appropriate should be used.
33
+
34
+ tests:
35
+ - source: Азәрбајҹан әлифбасы
36
+ expected: Azərbaycan əlifbası
37
+ - source: |
38
+ Бүтүн инсанлар ләјагәт вә һүгугларына ҝөрә азад бәрабәр доғулурлар.
39
+ Онларын шүурлары вә виҹданлары вар вә бир-бирләринә мүнасибәтдә гардашлыг руһунда давранмалыдырлар.
40
+ expected: |
41
+ Bütün insanlar ləyaqət və hüquqlarına görə azad bərabər doğulurlar.
42
+ Onların şüurları və vicdanları var və bir-birlərinə münasibətdə qardaşlıq ruhunda davranmalıdırlar.
43
+
44
+
45
+ map:
46
+ characters:
47
+ "\u0410": "A" # А
48
+ "\u0411": "B" # Б
49
+ "\u0412": "V" # В
50
+ "\u0413": "Q" # Г
51
+ "\u0492": "\u011E" # Ғ
52
+ "\u0414": "D" # Д
53
+ "\u0415": "E" # Е
54
+ "\u04D8": "\u018F" # Ә
55
+ "\u0416": "J" # Ж
56
+ "\u0417": "Z" # З
57
+ "\u0418": "\u0130" # И
58
+ "\u042B": "I" # Ы
59
+ "\u0408": "Y" # Ј
60
+ "\u041A": "K" # К
61
+ "\u049C": "G" # Ҝ
62
+ "\u041B": "L" # Л
63
+ "\u041C": "M" # М
64
+ "\u041D": "N" # Н
65
+ "\u041E": "O" # О
66
+ "\u04E8": "\u00D6" # Ө
67
+ "\u041F": "P" # П
68
+ "\u0420": "R" # Р
69
+ "\u0421": "S" # С
70
+ "\u0422": "T" # Т
71
+ "\u0423": "U" # У
72
+ "\u04AE": "\u00DC" # Ү
73
+ "\u0424": "F" # Ф
74
+ "\u0425": "X" # Х
75
+ "\u04BA": "H" # Һ
76
+ "\u0427": "\u00C7" # Ч
77
+ "\u04B8": "C" # Ҹ
78
+ "\u0428": "\u015E" # Ш
79
+
80
+ "\u0430": "a" # а
81
+ "\u0431": "b" # б
82
+ "\u0432": "v" # в
83
+ "\u0433": "q" # г
84
+ "\u0493": "\u011F" # ғ
85
+ "\u0434": "d" # д
86
+ "\u0435": "e" # е
87
+ "\u04D9": "\u0259" # ә
88
+ "\u0436": "j" # ж
89
+ "\u0437": "z" # з
90
+ "\u0438": "i" # и
91
+ "\u044B": "\u0131" # ы
92
+ "\u0458": "y" # ј
93
+ "\u043A": "k" # к
94
+ "\u049D": "g" # ҝ
95
+ "\u043B": "l" # л
96
+ "\u043C": "m" # м
97
+ "\u043D": "n" # н
98
+ "\u043E": "o" # о
99
+ "\u04E9": "\u00F6" # ө
100
+ "\u043F": "p" # п
101
+ "\u0440": "r" # р
102
+ "\u0441": "s" # с
103
+ "\u0442": "t" # т
104
+ "\u0443": "u" # у
105
+ "\u04AF": "\u00FC" # ү
106
+ "\u0444": "f" # ф
107
+ "\u0445": "x" # х
108
+ "\u04BB": "h" # һ
109
+ "\u0447": "\u00E7" # ч
110
+ "\u04B9": "c" # ҹ
111
+ "\u0448": "\u015F" # ш
@@ -0,0 +1,329 @@
1
+ ---
2
+ authority_id: bgnpcgn
3
+ id: 2008
4
+ language: bal
5
+ source_script: Arab
6
+ destination_script: Latn
7
+ name: ROMANIZATION OF BALUCHI -- BGN/PCGN 2008 System
8
+ url: https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/693687/ROMANIZATION_OF_BALUCHI.pdf
9
+ creation_date: 2008
10
+ confirmation date: 2017-11
11
+ description: |
12
+ The following is the BGN/PCGN-approved romanization
13
+ system for deriving standard spellings of Baluchi
14
+ geographic names. The romanization system is based on
15
+ the Hunterian system of romanization, which has been
16
+ used by the Surveys of India and Pakistan for
17
+ romanizing Baluchi geographic names for more than one
18
+ hundred years. The romanization system is compatible
19
+ with all dialects of Baluchi, including Eastern
20
+ Baluchi, Western Baluchi, and Southern Baluchi.
21
+
22
+ The BGN/PCGN system laid out below includes diacritical
23
+ marks in order that the original script can be derived
24
+ from the romanized form (i.e. it is reversible). For
25
+ desk users requiring a diacritic-free form, these
26
+ diacritics can simply be removed. In almost every case
27
+ the same basic Roman-script characters are kept as are
28
+ used in the Hunterian system. The BGN/PCGN forms have
29
+ further been designed to harmonize with the BGN/PCGN
30
+ Urdu romanization system. In rigorous romanization
31
+ (i.e. including diacritics), retroflexion is marked by
32
+ a sub-dot, and aspiration is marked by an apostrophe,
33
+ where confusion with fricative digraphs could arise.
34
+ For letters used only in Arabic loan words, the
35
+ rigorous forms have further been designed to harmonize
36
+ with the BGN/PCGN Persian romanization system.
37
+
38
+ notes:
39
+ - Occasionally, sequences of /z/ or /s/ plus /h/ may be
40
+ encountered, i.e. z·h, s·h. These may be romanized with the
41
+ Unicode 'center dot' (U+00B7) separating the two letters,
42
+ to distinguish them from the digraphs /zh/ and /sh/.
43
+
44
+ - The character ة is found very rarely in Baluchi, principally in certain Arabic religious terms, e.g. zakāt
45
+ ('alms'). It should be romanized t.
46
+
47
+ - When the letters ال are found, representing the Arabic
48
+ definite article, the ل is assimilated to a following 'sun letter' ت, ث, د, ذ, ر, ز, س, ش, ص, ض, ط, ظ,
49
+ ل or ن and is romanized t, t͟h, d, d͟h, r, z, s, sh, ş, ẕ, ţ, z̧, l, n accordingly.
50
+
51
+ - In romanization, the suffixes ءَ (-ā, singular definite)
52
+ and ءِ (-ay, possessive) are connected to the previous word
53
+ by a hyphen, though they are usually written separately.
54
+
55
+ - The word for 'and', written as و or ءُ, should be
56
+ romanized as -u-, linked by hyphens to the two words it
57
+ connects; e.g.,
58
+ سند و ہند → Sind-u-Hind ('The Gangetic Plain').
59
+
60
+ - Except as specified in notes 4 and 5, word division in romanization should follow word division in the Baluchi script.
61
+
62
+ - Note that the short vowels in the Baluchi examples are not pointed.
63
+
64
+ - Certain initial, medial and final characters are not
65
+ readily available in a Unicode-encoded font in a standalone form.
66
+
67
+ - The Romanization columns show only lowercase forms but,
68
+ when romanizing, uppercase and lowercase Roman letters as
69
+ appropriate should be used.
70
+
71
+ tests:
72
+ # commented tests are blocked by https://github.com/interscript/interscript/issues/620
73
+ # 'cultivable patch of riverbed'
74
+ - source: بےنٹَگ
75
+ expected: Benṭag
76
+
77
+ # 'Japan'
78
+ - source: جاپان
79
+ expected: Jāpān
80
+
81
+ - source: اَرَبِستان
82
+ expected: Arabistān
83
+
84
+ - source: بُنجاه
85
+ expected: Bunjāh
86
+
87
+ - source: بَلوچِستان
88
+ expected: Balochistān
89
+
90
+ # 'village'
91
+ - source: حَلق
92
+ expected: Ḩalq
93
+
94
+ # 'foothills or skirts of a mountain'
95
+ - source: دامان
96
+ expected: Dāmān
97
+
98
+ - source: ڈاڈَر
99
+ expected: Ḍāḍar
100
+
101
+ # 'tomb'
102
+ - source: گُمبُذ
103
+ expected: Gumbud͟h
104
+
105
+ # 'crossroads'
106
+ - source: چار راہ
107
+ expected: Chār Rāh
108
+
109
+ # 'market'
110
+ - source: بازار
111
+ expected: Bāzār
112
+
113
+ - source: سےبِى
114
+ expected: Sebī
115
+
116
+ # - source: اِيشيا
117
+ # expected: Eshyā
118
+
119
+
120
+ # # 'homeland'
121
+ # - source: وَطَن
122
+ # expected: Waţan
123
+
124
+ # 'Bandar Abbas'
125
+ - source: عَبّاس
126
+ expected: ‘Abbās
127
+
128
+ # 'Taiwan'
129
+ - source: فارموسا
130
+ expected: Fārmosā
131
+
132
+ - source: ڈاک
133
+ expected: Ḍāk
134
+
135
+ # 'stream, irrigated area, pasture'
136
+ - source: مَلّ
137
+ expected: Mall
138
+
139
+ # - source: ہ یرات
140
+ # expected: Herāt
141
+
142
+ # 'Philippines'
143
+ - source: فِلپائِن
144
+ expected: Filpā’in
145
+
146
+ - source: مُرگاپ
147
+ expected: Murgāp
148
+
149
+ # - source: مَرو
150
+ # expected: Marw
151
+
152
+
153
+ map:
154
+ postrules:
155
+ - pattern: (?<=\b)(?<!\b[‘|’|'])[\u0061-\uFFFF]
156
+ result: "upcase"
157
+
158
+ characters:
159
+
160
+ # consonant characters
161
+
162
+ '\u0628' : 'b' # ب
163
+ '\u067E' : 'p' # پ
164
+ '\u062a' : 't' # ت
165
+ '\u0679' : 'ṭ' # see note 8 ٹ
166
+ '\u067C' : 'ṭ' # see note 8 ټ
167
+ '\u062B' : 't͟h' # see note 8 ث
168
+ '\u067F' : 't͟h' # see note 8 ٿ
169
+ '\u062c' : 'j' # ج
170
+ '\u0686' : 'ch' # ‫چ‬
171
+ '\u062d' : 'ḩ' # ح
172
+ '\u062e' : 'kh' # خ
173
+ '\u062f' : 'd' # د
174
+ '\u0688' : 'ḍ' # ڈ
175
+ '\u0689' : 'ḍ' # ‫ډ‬
176
+ '\u0630' : 'd͟h' # ذ
177
+ '\u0631' : 'r' # ر
178
+ '\u0691' : 'ṛ' # see note 8 ڑ
179
+ '\u0693' : 'ṛ' # see note 8 ړ
180
+ '\u0632' : 'z' # ز
181
+ '\u0698' : 'zh' # ‫ژ‬
182
+ '\u0633' : 's' # س
183
+ '\u0634' : 'sh' # ش
184
+ '\u0635' : 'ş' # ص
185
+ '\u0636' : 'ẕ' # ض
186
+ '\u0637' : 'ţ' # ط
187
+ '\u0638' : 'z̧' # ظ
188
+ '\u0639' : '‘' # ع
189
+ '\u063a' : 'gh' # غ
190
+ '\u0641' : 'f' # ف
191
+ '\u0642' : 'q' # ق
192
+ '\u0643' : 'k' # ك
193
+ '\u06A9' : 'k' # ک
194
+ '\u06AF' : 'g' # ‫گ‬
195
+ '\u0644' : 'l' # ل
196
+ '\u0645' : 'm' # م
197
+ '\u0646' : 'n' # ن
198
+ '\u06BA' : 'ñ' # ں
199
+ '\u0648' : 'w' # و
200
+ '\u0647' : 'h' # ه
201
+ '\u06C1' : 'h'
202
+ '\u06BE' : 'h'
203
+ '\u0621' : '’' # ء
204
+ '\u0626' : '’' # ئ
205
+ '\u0649' : 'y' # ي
206
+ '\u064A' : 'y' # ي
207
+
208
+
209
+ # Aspiration is only contrastive in Eastern Baluchi
210
+ '\u0628\u06BE' : 'bh'
211
+
212
+ # Aspiration is only contrastive in Eastern Baluchi
213
+ '\u067E\u06BE' : 'ph'
214
+
215
+ # Aspiration is only contrastive in Eastern Baluchi.
216
+ # Apostrophe distinguishes from fricative /th/.
217
+ '\u062A\u06BE' : 'th’'
218
+
219
+ # Aspiration is only contrastive in Eastern Baluchi
220
+ '\u0679\u06BE' : 'ṭh'
221
+
222
+ # Aspiration is only contrastive in Eastern Baluchi
223
+ '\u062C\u06BE' : 'jh'
224
+
225
+ # Aspiration is only contrastive in Eastern Baluchi
226
+ '\u0686\u06BE' : 'chh'
227
+
228
+ # Aspiration is only contrastive in Eastern Baluchi.
229
+ # Apostrophe distinguishes from fricative /dh/
230
+ '\u062D\u06BE' : 'dh’'
231
+
232
+ # Aspiration is only contrastive in Eastern Baluchi
233
+ '\u0688\u06BE' : 'ḍh'
234
+
235
+ # Aspiration is only contrastive in Eastern Baluchi
236
+ '\u0631\u06BE' : '\u1E5B\u0068'
237
+
238
+ # Aspiration is only contrastive in Eastern Baluchi.
239
+ # Apostrophe distinguishes from fricative /kh/
240
+ '\u06A9\u06BE' : 'kh’'
241
+
242
+ # Aspiration is only contrastive in Eastern Baluchi.
243
+ # Apostrophe distinguishes from fricative /gh/
244
+ '\u06AF\u06BE' : 'gh’' #
245
+ '\u0644\u0627' : 'lā' #
246
+ '\u06A9\u0627' : 'kā' #
247
+ '\u06AF\u0627' : 'gā' #
248
+ '\u06A9\u0644' : 'kl' #
249
+ '\u06AF\u0644' : 'gl' #
250
+
251
+ # Vowels, Diphthongs, and Diacritical Marks
252
+ '\u0650\u0649' : 'ī' # ـِي
253
+ '\u0650' : 'i' # ِ
254
+ '\u06D2' : 'e' # ـے
255
+ '\b\u0627' : '' # ا
256
+ '\u0627' : 'ā' # ا
257
+ '\u0622' : 'ā' # آ
258
+ '\u064E' : 'a' # َ
259
+ '\u0648' : 'o' # و
260
+ '\u064F' : 'u' # ُ
261
+ '\u064F\u0648' : 'ū' # ـُو
262
+ '\u064E\u06D2' : 'ay' # ـَي
263
+ '\u064E\u0648' : 'aw' # ـَو
264
+ '\u0652' : '' # Not Romanized
265
+ '\u0670' : 'á' #
266
+
267
+ '\u0628\u0651' : 'bb' # ب
268
+ '\u067E\u0651' : 'pp' # پ
269
+ '\u062a\u0651' : 'tt' # ت
270
+ '\u0679\u0651' : 'ṭṭ' # see note 8 ٹ
271
+ '\u067C\u0651' : 'ṭṭ' # see note 8 ټ
272
+ '\u062B\u0651' : 't͟ht͟h' # see note 8 ث
273
+ '\u067F\u0651' : 't͟ht͟h' # see note 8 ٿ
274
+ '\u062c\u0651' : 'jj' # ج
275
+ '\u0686\u0651' : 'chch' # ‫چ‬
276
+ '\u062d\u0651' : 'ḩḩ' # ح
277
+ '\u062e\u0651' : 'khkh' # خ
278
+ '\u062f\u0651' : 'dd' # د
279
+ '\u0688\u0651' : 'ḍḍ' # ڈ
280
+ '\u0689\u0651' : 'ḍḍ' # ‫ډ‬
281
+ '\u0630\u0651' : 'd͟hd͟h' # ذ
282
+ '\u0631\u0651' : 'rr' # ر
283
+ '\u0691\u0651' : 'ṛṛ' # see note 8 ڑ
284
+ '\u0693\u0651' : 'ṛṛ' # see note 8 ړ
285
+ '\u0632\u0651' : 'zz' # ز
286
+ '\u0698\u0651' : 'zhzh' # ‫ژ‬
287
+ '\u0633\u0651' : 'ss' # س
288
+ '\u0634\u0651' : 'shsh' # ش
289
+ '\u0635\u0651' : 'şş' # ص
290
+ '\u0636\u0651' : 'ẕẕ' # ض
291
+ '\u0637\u0651' : 'ţţ' # ط
292
+ '\u0638\u0651' : 'z̧z̧' # ظ
293
+ '\u0639\u0651' : '‘‘' # ع
294
+ '\u063a\u0651' : 'ghgh' # غ
295
+ '\u0641\u0651' : 'ff' # ف
296
+ '\u0642\u0651' : 'qq' # ق
297
+ '\u0643\u0651' : 'kk' # ك
298
+ '\u06A9\u0651' : 'kk' # ک
299
+ '\u06AF\u0651' : 'gg' # ‫گ‬
300
+ '\u0644\u0651' : 'll' # ل
301
+ '\u0645\u0651' : 'mm' # م
302
+ '\u0646\u0651' : 'nn' # ن
303
+ '\u06BA\u0651' : 'ññ' # ں
304
+ '\u0648\u0651' : 'ww' # و
305
+ '\u0647\u0651' : 'hh' # ه
306
+ '\u06C1\u0651' : 'hh'
307
+ '\u06BE\u0651' : 'hh'
308
+ '\u0621\u0651' : '’’' # ء
309
+ '\u0626\u0651' : '’’' # ئ
310
+ '\u0649\u0651' : 'yy' # ي
311
+
312
+ '\u0621\u064E' : '-ā' # see note 4
313
+ '\u0621\u0650' : '-ay' # see note 4
314
+
315
+ # Numerals
316
+ '۰' : '0'
317
+ '۱' : '1'
318
+ '۲' : '2'
319
+ '۳' : '3'
320
+ '۴' : '4'
321
+ '۵' : '5'
322
+ '۶' : '6'
323
+ '۷' : '7'
324
+ '۸' : '8'
325
+ '۹' : '9'
326
+ # Although Perso-Arabic script is written from right to
327
+ # left, numerical expressions, e.g. ۱۹۶۸ → 1968, are
328
+ # written from left to right. A comma is inserted into
329
+ # longer sequences, either after thousands, millions, etc.