interscript 0.1.2 → 0.1.7

Sign up to get free protection for your applications and to get access to all the features.
Files changed (261) hide show
  1. checksums.yaml +4 -4
  2. data/README.adoc +250 -17
  3. data/lib/g2pwrapper.py +34 -0
  4. data/lib/interscript.rb +142 -20
  5. data/lib/interscript/command.rb +28 -0
  6. data/lib/interscript/fs.rb +69 -0
  7. data/lib/interscript/mapping.rb +142 -0
  8. data/lib/interscript/opal.rb +57 -0
  9. data/lib/interscript/opal/entrypoint.rb +12 -0
  10. data/lib/interscript/opal/map_translate.rb +7 -0
  11. data/lib/interscript/opal/maps.js.erb +10 -0
  12. data/lib/interscript/version.rb +1 -1
  13. data/lib/model-7 +0 -0
  14. data/lib/tha-pt-b-7 +0 -0
  15. data/maps/acadsin-zho-Hani-Latn-2002.yaml +38916 -0
  16. data/maps/alalc-amh-Ethi-Latn-1997.yaml +513 -0
  17. data/maps/alalc-amh-Ethi-Latn-2011.yaml +138 -0
  18. data/maps/alalc-ara-Arab-Latn-1997.yaml +1287 -0
  19. data/maps/alalc-asm-Deva-Latn-1997.yaml +165 -0
  20. data/maps/alalc-asm-Deva-Latn-2012.yaml +40 -0
  21. data/maps/alalc-aze-Cyrl-Latn-1997.yaml +145 -0
  22. data/maps/alalc-bel-Cyrl-Latn-1997.yaml +129 -0
  23. data/maps/alalc-ben-Beng-Latn-2017.yaml +130 -0
  24. data/maps/alalc-bul-Cyrl-Latn-1997.yaml +98 -0
  25. data/maps/alalc-ell-Grek-Latn-1997.yaml +628 -0
  26. data/maps/alalc-ell-Grek-Latn-2010.yaml +626 -0
  27. data/maps/alalc-guj-Gujr-Latn-1997.yaml +266 -0
  28. data/maps/alalc-guj-Gujr-Latn-2011.yaml +64 -0
  29. data/maps/alalc-hin-Deva-Latn-1997.yaml +211 -0
  30. data/maps/alalc-hin-Deva-Latn-2011.yaml +47 -0
  31. data/maps/alalc-kat-Geok-Latn-1997.yaml +111 -0
  32. data/maps/alalc-kat-Geor-Latn-1997.yaml +150 -0
  33. data/maps/alalc-kor-Hang-Latn-1997.yaml +98 -0
  34. data/maps/alalc-mal-Mlym-Latn-1997.yaml +303 -0
  35. data/maps/alalc-mal-Mlym-Latn-2012.yaml +73 -0
  36. data/maps/alalc-mar-Deva-Latn-1997.yaml +189 -0
  37. data/maps/alalc-mar-Deva-Latn-2011.yaml +45 -0
  38. data/maps/alalc-mkd-Cyrl-Latn-1997.yaml +114 -0
  39. data/maps/alalc-mkd-Cyrl-Latn-2013.yaml +103 -0
  40. data/maps/alalc-mon-Cyrl-Latn-1997.yaml +220 -0
  41. data/maps/alalc-pan-Guru-Latn-1997.yaml +256 -0
  42. data/maps/alalc-pan-Guru-Latn-2011.yaml +78 -0
  43. data/maps/alalc-per-Arab-Latn-1997.yaml +375 -0
  44. data/maps/alalc-pli-Deva-Latn-2012.yaml +144 -0
  45. data/maps/alalc-pra-Deva-Latn-2012.yaml +47 -0
  46. data/maps/alalc-rus-Cyrl-Latn-1997.yaml +225 -0
  47. data/maps/alalc-rus-Cyrl-Latn-2012.yaml +162 -0
  48. data/maps/alalc-san-Deva-Latn-2012.yaml +172 -0
  49. data/maps/alalc-sin-Sinh-Latn-1997.yaml +292 -0
  50. data/maps/alalc-sin-Sinh-Latn-2011.yaml +71 -0
  51. data/maps/alalc-srp-Cyrl-Latn-1997.yaml +118 -0
  52. data/maps/alalc-srp-Cyrl-Latn-2013.yaml +135 -0
  53. data/maps/alalc-tam-Taml-Latn-1997.yaml +62 -0
  54. data/maps/alalc-tam-Taml-Latn-2011.yaml +58 -0
  55. data/maps/alalc-ukr-Cyrl-Latn-1997.yaml +145 -0
  56. data/maps/alalc-ukr-Cyrl-Latn-2011.yaml +16 -0
  57. data/maps/apcbg-bul-Cyrl-Latn-1995.yaml +283 -0
  58. data/maps/{bas-rus-Cyrl-Latn-bss.yaml → bas-rus-Cyrl-Latn-2017-bss.yaml} +58 -33
  59. data/maps/{bas-rus-Cyrl-Latn-oss.yaml → bas-rus-Cyrl-Latn-2017-oss.yaml} +55 -35
  60. data/maps/bgn-jpn-Hrkt-Latn-1962.yaml +292 -0
  61. data/maps/bgn-kor-Hang-Latn-1943.yaml +35 -0
  62. data/maps/bgn-kor-Kore-Latn-1943.yaml +31 -0
  63. data/maps/bgna-bul-Cyrl-Latn-2006.yaml +208 -0
  64. data/maps/bgna-bul-Cyrl-Latn-2009.yaml +208 -0
  65. data/maps/bgnpcgn-amh-Ethi-Latn-1967.yaml +532 -0
  66. data/maps/bgnpcgn-ara-Arab-Latn-1956.yaml +596 -0
  67. data/maps/bgnpcgn-arm-Armn-Latn-1981.yaml +2 -3
  68. data/maps/bgnpcgn-aze-Cyrl-Latn-1993.yaml +104 -0
  69. data/maps/bgnpcgn-bak-Cyrl-Latn-2007.yaml +188 -0
  70. data/maps/bgnpcgn-bel-Cyrl-Latn-1979.yaml +289 -0
  71. data/maps/bgnpcgn-bul-Cyrl-Latn-1952.yaml +119 -0
  72. data/maps/bgnpcgn-bul-Cyrl-Latn-2013.yaml +15 -65
  73. data/maps/bgnpcgn-che-Cyrl-Latn-2008.yaml +184 -0
  74. data/maps/bgnpcgn-ell-Grek-Latn-1962.yaml +705 -0
  75. data/maps/bgnpcgn-ell-Grek-Latn-1996.yaml +23 -0
  76. data/maps/{bgnpcgn-per-Arab-Latn-1956.yaml → bgnpcgn-fas-Arab-Latn-1956.yaml} +5 -2
  77. data/maps/bgnpcgn-jpn-Hrkt-Latn-1976.yaml +257 -0
  78. data/maps/bgnpcgn-kat-Geor-Latn-1981.yaml +131 -0
  79. data/maps/bgnpcgn-kat-Geor-Latn-2009.yaml +42 -0
  80. data/maps/bgnpcgn-kor-Hang-Latn-kn-1945.yaml +253 -0
  81. data/maps/bgnpcgn-kor-Hang-Latn-rok-2011.yaml +48 -0
  82. data/maps/bgnpcgn-kor-Kore-Latn-rok-2011.yaml +48 -0
  83. data/maps/bgnpcgn-mkd-Cyrl-Latn-1981.yaml +163 -0
  84. data/maps/bgnpcgn-mkd-Cyrl-Latn-2013.yaml +190 -0
  85. data/maps/bgnpcgn-mon-Cyrl-Latn-1964.yaml +223 -0
  86. data/maps/bgnpcgn-nep-Deva-Latn-2011.yaml +230 -0
  87. data/maps/bgnpcgn-per-Arab-Latn-1958.yaml +336 -0
  88. data/maps/bgnpcgn-prs-Arab-Latn-2007.yaml +639 -0
  89. data/maps/bgnpcgn-prs-Arab-Latn-yaghoubi.yaml +459 -0
  90. data/maps/bgnpcgn-rue-Cyrl-Latn-2016.yaml +168 -0
  91. data/maps/bgnpcgn-rus-Cyrl-Latn-1947.yaml +150 -65
  92. data/maps/bgnpcgn-srp-Cyrl-Latn-2005.yaml +170 -0
  93. data/maps/bgnpcgn-tat-Cyrl-Latn-2007.yaml +220 -0
  94. data/maps/bgnpcgn-tgk-Cyrl-Latn-1994.yaml +240 -0
  95. data/maps/bgnpcgn-ukr-Cyrl-Latn-1965.yaml +80 -4
  96. data/maps/bgnpcgn-ukr-Cyrl-Latn-2019.yaml +119 -0
  97. data/maps/bgnpcgn-uzb-Cyrl-Latn-1979.yaml +127 -0
  98. data/maps/bgnpcgn-uzb-Cyrl-Latn-2000.yaml +82 -0
  99. data/maps/bgnpcgn-zho-Hans-Latn-1979.yaml +7456 -0
  100. data/maps/bis-asm-Beng-Latn-13194-1991.yaml +159 -0
  101. data/maps/bis-ben-Beng-Latn-13194-1991.yaml +156 -0
  102. data/maps/bis-dev-Deva-Latn-13194-1991.yaml +184 -0
  103. data/maps/bis-guj-Gujr-Latn-13194-1991.yaml +181 -0
  104. data/maps/bis-knd-Knda-Latn-13194-1991.yaml +173 -0
  105. data/maps/bis-mlm-Mlym-Latn-13194-1991.yaml +176 -0
  106. data/maps/bis-ori-Orya-Latn-13194-1991.yaml +160 -0
  107. data/maps/bis-pnj-Guru-Latn-13194-1991.yaml +175 -0
  108. data/maps/bis-tel-Telu-Latn-13194-1991.yaml +170 -0
  109. data/maps/bis-tml-Taml-Latn-13194-1991.yaml +155 -0
  110. data/maps/by-bel-Cyrl-Latn-1998.yaml +172 -0
  111. data/maps/by-bel-Cyrl-Latn-2007.yaml +115 -0
  112. data/maps/din-grc-Grek-Latn-31634-2011-t1.yaml +899 -0
  113. data/maps/din-hin-Deva-Latn-33904-2018.yaml +100 -0
  114. data/maps/din-kat-Geor-Latn-32707-2010.yaml +145 -0
  115. data/maps/din-mar-Deva-Latn-33904-2018.yaml +84 -0
  116. data/maps/din-nep-Deva-Latn-33904-2018.yaml +119 -0
  117. data/maps/din-pli-Deva-Latn-33904-2018.yaml +75 -0
  118. data/maps/din-pra-Deva-Latn-33904-2018.yaml +63 -0
  119. data/maps/din-san-Deva-Latn-33904-2018.yaml +338 -0
  120. data/maps/din-tam-Taml-Latn-33903-2016.yaml +213 -0
  121. data/maps/dos-nep-Deva-Latn-1997.yaml +47 -0
  122. data/maps/elot-ell-Grek-Latn-743-1982-tl.yaml +684 -0
  123. data/maps/elot-ell-Grek-Latn-743-1982-ts.yaml +680 -0
  124. data/maps/elot-ell-Grek-Latn-743-2001-tl.yaml +19 -0
  125. data/maps/elot-ell-Grek-Latn-743-2001-ts.yaml +31 -0
  126. data/maps/ggg-kat-Geor-Latn-2002.yaml +92 -0
  127. data/maps/gki-bel-Cyrl-Latn-1992.yaml +33 -0
  128. data/maps/gki-bel-Cyrl-Latn-2000.yaml +201 -0
  129. data/maps/gost-rus-Cyrl-Latn-16876-71-1983.yaml +190 -0
  130. data/maps/gost-rus-Cyrl-Latn-7.79-2000-2002.yaml +157 -0
  131. data/maps/hk-yue-Hani-Latn-1888.yaml +38497 -0
  132. data/maps/icao-bel-Cyrl-Latn-9303.yaml +109 -98
  133. data/maps/icao-bul-Cyrl-Latn-9303.yaml +2 -7
  134. data/maps/{icao-per-Arab-Latn-9303.yaml → icao-fas-Arab-Latn-9303.yaml} +6 -8
  135. data/maps/icao-heb-Hebr-Latn-9303.yaml +119 -125
  136. data/maps/icao-mkd-Cyrl-Latn-9303.yaml +2 -3
  137. data/maps/icao-rus-Cyrl-Latn-9303.yaml +2 -4
  138. data/maps/icao-srp-Cyrl-Latn-9303.yaml +2 -3
  139. data/maps/icao-ukr-Cyrl-Latn-9303.yaml +2 -4
  140. data/maps/iso-ara-Arab-Latn-233-1984.yaml +323 -0
  141. data/maps/iso-asm-Beng-Latn-15919-2001.yaml +75 -0
  142. data/maps/iso-ben-Beng-Latn-15919-2001.yaml +175 -0
  143. data/maps/iso-ell-Grek-Latn-843-1997-t1.yaml +613 -0
  144. data/maps/iso-ell-Grek-Latn-843-1997-t2.yaml +44 -0
  145. data/maps/iso-guj-Gujr-Latn-15919-2001.yaml +220 -0
  146. data/maps/iso-hin-Deva-Latn-15919-2001.yaml +87 -0
  147. data/maps/iso-inc-Deva-Latn-15919-2001.yaml +61 -0
  148. data/maps/iso-jpn-Hrkt-Latn-3602-1989.yaml +66 -0
  149. data/maps/iso-kan-Knda-Latn-15919-2001.yaml +220 -0
  150. data/maps/iso-kat-Geor-Latn-9984-1996.yaml +145 -0
  151. data/maps/iso-kor-Hang-Latn-1996-method1.yaml +240 -0
  152. data/maps/iso-kor-Hang-Latn-1996-method2.yaml +226 -0
  153. data/maps/iso-mal-Mlym-Latn-15919-2001.yaml +281 -0
  154. data/maps/iso-mar-Deva-Latn-15919-2001.yaml +75 -0
  155. data/maps/iso-nep-Deva-Latn-15919-2001.yaml +87 -0
  156. data/maps/iso-ori-Orya-Latn-15919-2001.yaml +193 -0
  157. data/maps/iso-pan-Guru-Latn-15919-2001.yaml +222 -0
  158. data/maps/iso-pli-Beng-Latn-15919-2001.yaml +73 -0
  159. data/maps/iso-pli-Deva-Latn-15919-2001.yaml +74 -0
  160. data/maps/iso-pli-Sinh-Latn-15919-2001.yaml +219 -0
  161. data/maps/iso-pli-Thai-Latn-15919-2001.yaml +55 -0
  162. data/maps/iso-pra-Deva-Latn-15919-2001.yaml +59 -0
  163. data/maps/iso-prs-Arab-Latn-233-3-1999.yaml +366 -0
  164. data/maps/{iso-rus-Cyrl-Latn-iso9.yaml → iso-rus-Cyrl-Latn-9-1995.yaml} +4 -6
  165. data/maps/iso-san-Deva-Latn-15919-2001.yaml +220 -0
  166. data/maps/iso-tam-Taml-Latn-15919-2001.yaml +159 -0
  167. data/maps/iso-tel-Telu-Latn-15919-2001.yaml +220 -0
  168. data/maps/iso-tha-Thai-Latn-11940-1998.yaml +109 -0
  169. data/maps/kp-kor-Hang-Latn-2002.yaml +909 -0
  170. data/maps/lshk-yue-Hani-Latn-jyutping-1993.yaml +44820 -0
  171. data/maps/mext-jpn-Hrkt-Latn-1954.yaml +411 -0
  172. data/maps/mns-mon-Cyrl-Latn-5217-2012.yaml +163 -0
  173. data/maps/mns-mon-Latn-Cyrl-5217-2012.yaml +200 -0
  174. data/maps/moct-kor-Hang-Latn-2000.yaml +807 -0
  175. data/maps/mofa-jpn-Hrkt-Latn-1989.yaml +541 -0
  176. data/maps/mvd-bel-Cyrl-Latn-2008.yaml +225 -0
  177. data/maps/mvd-bel-Cyrl-Latn-2010.yaml +63 -0
  178. data/maps/mvd-rus-Cyrl-Latn-2008.yaml +109 -0
  179. data/maps/mvd-rus-Cyrl-Latn-2010.yaml +37 -0
  180. data/maps/odni-ara-Arab-Latn-2015.yaml +425 -0
  181. data/maps/odni-aze-Cyrl-Latn-2015.yaml +144 -0
  182. data/maps/odni-bel-Cyrl-Latn-2015.yaml +148 -0
  183. data/maps/odni-bul-Cyrl-Latn-2015.yaml +96 -0
  184. data/maps/odni-che-Cyrl-Latn-2015.yaml +169 -0
  185. data/maps/odni-fas-Arab-Latn-2015.yaml +406 -0
  186. data/maps/odni-hin-Deva-Latn-2015.yaml +258 -0
  187. data/maps/odni-kat-Geor-Latn-2015.yaml +87 -0
  188. data/maps/odni-kaz-Cyrl-Latn-2015.yaml +148 -0
  189. data/maps/odni-kir-Cyrl-Latn-2015.yaml +136 -0
  190. data/maps/odni-kor-Hang-Latn-2015.yaml +375 -0
  191. data/maps/odni-mkd-Cyrl-Latn-2015.yaml +122 -0
  192. data/maps/odni-per-Arab-Latn-2015.yaml +228 -0
  193. data/maps/odni-rus-Cyrl-Latn-2015.yaml +77 -0
  194. data/maps/odni-srp-Cyrl-Latn-2015.yaml +129 -0
  195. data/maps/odni-tat-Cyrl-Latn-2015.yaml +142 -0
  196. data/maps/odni-tgk-Cyrl-Latn-2015.yaml +148 -0
  197. data/maps/odni-uig-Cyrl-Latn-2015.yaml +138 -0
  198. data/maps/odni-ukr-Cyrl-Latn-2015.yaml +157 -0
  199. data/maps/odni-urd-Arab-Latn-2015.yaml +221 -0
  200. data/maps/odni-uzb-Cyrl-Latn-2015.yaml +166 -0
  201. data/maps/royin-tha-Thai-Latn-1939-generic.yaml +90 -0
  202. data/maps/royin-tha-Thai-Latn-1968.yaml +183 -0
  203. data/maps/royin-tha-Thai-Latn-1999-chained.yaml +180 -0
  204. data/maps/royin-tha-Thai-Latn-1999.yaml +80 -0
  205. data/maps/{cn-chn-Hans-Latn-pinyin.yaml → sac-zho-Hans-Latn-1979.yaml} +11 -8
  206. data/maps/sasm-mon-Mong-Latn-general-1978.yaml +389 -0
  207. data/maps/sasm-mon-Mong-Latn-phonetic-1978.yaml +354 -0
  208. data/maps/ses-ara-Arab-Latn-1930.yaml +283 -0
  209. data/maps/stategeocadastre-ukr-Cyrl-Latn-1993.yaml +222 -0
  210. data/maps/ua-ukr-Cyrl-Latn-1996.yaml +197 -0
  211. data/maps/ua-ukr-Cyrl-Latn-2007.yaml +75 -0
  212. data/maps/ua-ukr-Cyrl-Latn-2010.yaml +192 -0
  213. data/maps/un-amh-Ethi-Latn-2016.yaml +602 -0
  214. data/maps/un-ara-Arab-Latn-1971.yaml +139 -0
  215. data/maps/un-ara-Arab-Latn-1972.yaml +159 -0
  216. data/maps/un-ara-Arab-Latn-2017.yaml +420 -0
  217. data/maps/un-bel-Cyrl-Latn-2007.yaml +114 -0
  218. data/maps/un-ben-Beng-Latn-2016.yaml +534 -0
  219. data/maps/un-ell-Grek-Latn-1987-phonetic.yaml +780 -0
  220. data/maps/un-ell-Grek-Latn-1987-tl.yaml +31 -0
  221. data/maps/un-ell-Grek-Latn-1987-ts.yaml +19 -0
  222. data/maps/un-hin-Deva-Latn-2016.yaml +222 -0
  223. data/maps/un-mar-Deva-Latn-2016.yaml +91 -0
  224. data/maps/un-mon-Mong-Latn-general-2013.yaml +264 -0
  225. data/maps/un-mon-Mong-Latn-phonetic-2013.yaml +264 -0
  226. data/maps/un-nep-Deva-Latn-1972.yaml +350 -0
  227. data/maps/un-nep-Deva-Latn-2013.yaml +74 -0
  228. data/maps/un-rus-Cyrl-Latn-1987.yaml +166 -0
  229. data/maps/un-ukr-Cyrl-Latn-1998.yaml +53 -0
  230. data/maps/un-ukr-Cyrl-Latn-2012.yaml +162 -0
  231. data/maps/var-hin-Deva-Latn-hunterian-1872.yaml +221 -0
  232. data/maps/var-jpn-Hrkt-Latn-hepburn-1886.yaml +406 -0
  233. data/maps/var-jpn-Hrkt-Latn-hepburn-1954.yaml +386 -0
  234. data/maps/var-kor-Hang-Hang-jamo.yaml +11193 -0
  235. data/maps/var-kor-Hang-Latn-mr-1939.yaml +1054 -0
  236. data/maps/var-kor-Kore-Hang-2013.yaml +59754 -0
  237. data/maps/var-kor-Kore-Latn-mr-1939.yaml +36 -0
  238. data/maps/var-mar-Deva-Latn-hunterian-1872.yaml +43 -0
  239. data/maps/var-mon-Mong-Latn-1930.yaml +102 -0
  240. data/maps/var-mon-Mong-Latn-lessing.yaml +272 -0
  241. data/maps/var-mon-Mong-Latn-vpmc.yaml +274 -0
  242. data/maps/var-pra-Deva-Latn-iast-1912.yaml +30 -0
  243. data/maps/var-san-Deva-Latn-iast-1912.yaml +149 -0
  244. data/maps/var-tha-Thai-Thai-phonemic.yaml +59 -0
  245. data/maps/var-tha-Thai-Zsym-ipa.yaml +301 -0
  246. data/maps/var-zho-Hani-Latn-wd-1979.yaml +38912 -0
  247. data/spec/interscript/filenames_spec.rb +384 -0
  248. data/spec/interscript/mapping_spec.rb +42 -0
  249. data/spec/interscript_spec.rb +23 -5
  250. data/spec/spec_helper.rb +3 -1
  251. metadata +364 -34
  252. data/bin/interscript +0 -20
  253. data/bin/rspec +0 -29
  254. data/maps/bgnpcgn-chn-Hans-Latn-pinyin.yaml +0 -7503
  255. data/maps/historic-jpn-Hrkt-Latn-hepburn.yaml +0 -336
  256. data/maps/icao-gre-Grek-Latn-9303.yaml +0 -101
  257. data/maps/mext-jpn-Hrkt-Latn-hepburn.yaml +0 -330
  258. data/maps/mext-jpn-Hrkt-Latn-kunrei.yaml +0 -308
  259. data/maps/un-jpn-Hrkt-Latn-hepburn.yaml +0 -313
  260. data/maps/un-jpn-Hrkt-Latn-kunrei.yaml +0 -354
  261. data/maps/un-mon-Mong-Latn-2013.yaml +0 -80
@@ -0,0 +1,596 @@
1
+ ---
2
+ authority_id: bgnpcgn
3
+ id: 1956
4
+ language: iso-639-2:ara
5
+ source_script: Arab
6
+ destination_script: Latn
7
+ name: ROMANIZATION OF ARABIC -- BGN/PCGN 1956 System
8
+ alias:
9
+ ogc11122:
10
+ code: ara_Arab2Latn_BGN_1956
11
+ description: Arabic US Board on Geographic Names(BGN)/Permanent Committee on Geographical Names for British Official Use(PCGN) 1956 System
12
+ url: https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/858000/ROMANIZATION_OF_ARABIC.pdf
13
+ creation_date: 1956
14
+ confirmation_date: 2019-12
15
+ description: |
16
+
17
+ This System was adopted by the BGN in 1946 and by the PCGN in 1956 and is applied by BGN and PCGN in the systematic romanization of Arabic geographical names in Bahrain, Egypt, Iraq, Jordan, Kuwait, Libya, Oman, Qatar, Saudi Arabia, Syria, the United Arab Emirates, Yemen, the West Bank and Gaza Strip.
18
+
19
+ Uniform results in the romanization of Arabic are difficult to obtain, since vowel points and diacritical marks are generally omitted from both handwriting and printed script. It follows that for correct identification of the words which appear in any particular name, knowledge of its standard Arabic-script spelling including proper pointing, and recognition of dialectal and idiosyncratic deviations are essential.
20
+
21
+ In order to bring about uniformity in the Roman-script spelling of geographical names in Arabic-language areas, the system is based insofar as possible on fully pointed Modern Standard Arabic (MSA). In the interest of clarity, vowel pointing to indicate short vowels has been applied to the examples given below, and examples of the, more usual, unpointed script have also been provided; it should also be noted that the dots which occur on some characters of the Arabic script are not vowels but rather are an integral part of the base consonant.
22
+
23
+ Arabic script is written from right to left, and does not make a distinction between upper and lower case.
24
+
25
+
26
+
27
+ notes: |
28
+
29
+ - (NOTE 1) The symbol ◌ is used in this system to symbolise any Arabic consonant character. It is not itself an Arabic letter.
30
+
31
+ - (NOTE 2) Hamzah (ء) is written in Arabic in association with most instances of initial alif, except those which belong to the definite article al or which bear a maddah (see note 9). Hamzah is written above the alif ( أَ) if the accompanying short vowel is a fatḩah or ḑammah and usually below the alif (إ) if the accompanying short vowel is a kasrah.
32
+ When the purpose is to indicate the presence of a glottal stop, hamzah is written over medial alif ( أ ), wāw (ؤ) and yā’, typically without dots (ئ); or following final alif ( أ ء ), these characters serving only to “bear” the hamzah. Hamzah following kasrah ( ) is written (ئ); the yā’ is usually in the initial or medial form and the dots are omitted e.g. bi’r ( بئ ر ).
33
+ Hamzah following ḑammah ( ) is written (ؤ). Hamzah following a long vowel is written without a bearer and is positioned on the line of print like a regular character, e.g. صنعاء Şan‘ā’. The romanization of hamzah (’ - Unicode encoding 2019) should always be carefully distinguished from that of ‘ayn (‘ - Unicode encoding 2018).
34
+
35
+ - (NOTE 3) Alif (ا) occurs with the following uses
36
+ a. Initially, it indicates that the word begins with a vowel or diphthong; the alif itself is not romanized, but rather “carries” the short vowel, which is romanized; e.g., ظب ي أبو → Abū Z̧aby.
37
+
38
+ b. With maddah (آ – row 18 in the vowel table), it is represented ā; e.g., مُ عيط آلب و → Ālbū Mu‘ayţ. See also note 9.
39
+
40
+ c. Medially and finally it is represented ā; e.g., ب ا ب → Bāb, صيدا → Şaydā.
41
+
42
+ d. Medially and finally, alif may serve as the bearer of hamzah, e.g. رأس → ra’s. See also note 2.
43
+
44
+ - (NOTE 4) The tā’ marbūţah character (ة), which looks like hā’ with two dots above and occurs only at the end of words, is romanized h, except in an iḑāfah noun phrase construction, where it is romanized t, in accordance with pronunciation. e.g. Muḩāfaz̧ah (as an isolated word) but Muḩāfaz̧at Baghdād. In exceptional cases, when it is necessary to distinguish it from the tā’ marbūţah, the ending fatḩah + hā’ ( ه ) may be romanized a·h when the character hā’ (ه) is pronounced as such. Example Muntaza·h. (see also special rule 13). The tā marbūţah is always preceded by the short vowel fatḩah ( ) and is therefore romanized as ah or at, except when it is preceded by alif when it is romanized āh (not āah), e.g. Ḩamāh (حماة ), and as āt within an iḑāfah construction.
45
+
46
+ - (NOTE 5) The character yā’ (in final form but without dots) preceded by the vowel point fatḩah is known as alif maqşūrah. This character may also be pointed ى and should be romanized á. See character 7 in the vowel table.
47
+
48
+ - (NOTE 6) The classical Arabic grammatical endings written with the nunation symbols (tanwīn) may be romanized, when necessary, by an, in, un. In modern spoken Arabic, these endings have become silent and should not be romanized e.g. classical alifun; modern alif.
49
+
50
+ - (NOTE 7) Doubled consonant sounds are represented in Arabic script by placing a shaddah ( ) over a consonant character, although like the short vowels the shaddah may not always be written. In romanization the letter should be doubled, e.g. Quwwah, ‘Abbās. However, the combination of the consonant character yā’ with a shaddah preceded by a kasrah ( ي ) at the end of a word is romanized ī, e.g. Gharbī; a word ending kasrah + yā’ with a shaddah + tā’ marbūţah is romanized īyah (rather than iyyah), e.g. ال س ل يمانِ ية
51
+ is romanized As Sulaymānīyah and not As Sulaymāniyyah; and when the kasrah + yā’ + shaddah combination is followed by the sound masculine plural ending ( يين or يون ) it should be romanized as –īyīn/īyūn, e.g. ساحة العباسيين should be romanized as Sāḩat al ‘Abbāsīyīn.
52
+
53
+ - (NOTE 8) Hamzat al waşl (ٱ), which is utilized only in the pointing of classical Arabic, is romanized ’ as illustrated in the classical form of its name hamzatu’l waşli.
54
+
55
+ - (NOTE 9) Since maddah ( أ ), which is placed over alif ( أ ), often occurs in word-initial position, no confusion results from the use of ā for alif maddah ( أ ) as well as for fatḩah followed by alif ( اَ ).
56
+
57
+ - (NOTE 10) The ligature لا represents lām-alif, and should be romanized lā.
58
+
59
+ - (NOTE 11) In word initial position the combination Alif +Wāw (او ) is sometimes used to render an initial long vowel sound in words of non-Arabic origin. Where this is clearly the case it should be romanized Ū. In words of Arabic or uncertain origin it should be romanized Aw. In word-medial or word-final position it should always be romanized āw. Similarly the combination Alif +Yā’ (اي ) is romanized Ī to render an initial long vowel sound but as āy in word-medial or word-final position.
60
+
61
+ # SPECIAL RULES
62
+
63
+ - The Arabic definite article al (ال ) should be treated as follows:
64
+ a. Initial definite articles should be capitalized and hyphens should not be used to connect parts of names, e.g. Ash Shāriqah. When appearing medially in a name the initial ‘a’ should be lower case, e.g. Tall al Laḩm.
65
+
66
+ b. When the definite article precedes a word beginning with one of the “sun letters” t, th, d, dh, r, z, s, sh, ş, ḑ, ţ, z̧, l, or n – the l is assimilated in pronunciation and romanization, thus yielding, for example, the romanization Ar Riyāḑ, rather than Al Riyāḑ for ال ريا ض .
67
+
68
+ c. If sources contradict over the inclusion or non-inclusion of the definite article in a name, preference should be given to the form with the article.
69
+
70
+ - Conjunctions and prepositions should be romanized according to their written form in Arabic script and should be lower case. In cases where the conjunction or preposition ends in a long or short vowel any assimilated pronunciation should not be shown in the romanized form. e.g. Khabb wa ash Sha‘f (خب والشعف ).
71
+
72
+ There are two exceptions to this rule
73
+
74
+ a. In the case of the preposition li (ل), where the alif of the definite article is assimilated in the written form as well as pronunciation, the written form should be shown in romanization as follows Mişr liţ Ţayarān (مصر للطيران ); Ash Sharikah al ‘Āmmah lil Maghāzil (الشركة العامة للمغازل ).
75
+
76
+ b. In the case of the preposition bi (ب), the alif of the definite article is assimilated in pronunciation and, although the alif remains in the written form the short vowel it carries changes from ‘a’ to ‘i’. For example Al Qaryah bid Duwayr (القرية بالدوير ) but Ad Duwayr (الدوير ); and Al Ḩarajah bil Qur’ān (الحرجة بالقرآن ) but Al Qur’ān (القرآن ).
77
+
78
+
79
+ - The Arabic word for God ( لله) should be written Allāh. The alif khanjarīyah (dagger alif) ( ) above the second ل (lām) in the word لله , like the short vowels, is not usually written but should be romanized ā, like a full-size alif. This diacritical mark appears in a few other Arabic words, for instance on the alif maqşūrah as described in note 5.
80
+
81
+ - Names which consist of noun phrases (see also note 4) should be written as separate words. The definite article within such names should be romanized al, not ul, e.g., ‘Abd Allāh, ‘Abd ar Raḩmān, Dhū al Faqār, and as noted in special rule 1, the medial al should be lower case.
82
+
83
+ - The Arabic word بن should be romanized Bin rather than Ibn whenever written without alif, that is between two proper nouns, e.g., ‘Umar Bin al Khaţţāb. Where it appears with alif (ابن), it should be romanized Ibn.
84
+
85
+ - The Turkish word Paşa should be romanized from Arabic script as Bāshā. The Turkish word Bey should be romanized as Bey in Egyptian names, no matter how it is written in Arabic-language sources, but in other Arabic areas it should be romanized as Bak where written بك and as Bayk when written بيك .
86
+
87
+ - The modern colloquial word Sīdī (سيدي ) should be given precedence over the classical form Sayyidī. This does not preclude the spelling Sayyidī if the latter is indicated by the Arabic script or other evidence – for instance, if the yā’ is written with a shaddah ( ّ ).
88
+
89
+ - The colloquial word Bū should not be changed to the standard form Abū.
90
+
91
+ - The colloquial word for water, written مي ة on Arabic maps, should be romanized Mayyat.
92
+
93
+ - Place names of Aramaic origin in Syria often contain initial consonant clusters consisting of b plus another consonant such as l or h. In romanization, the clusters bl, bh, etc., should be so represented.
94
+
95
+ - In names containing the Arabic word for back, ridge, or hill, appearing as either ظهر (Z̧ahr) or ضه ر (Ḑahr) in Arabic sources, the word should be romanized to reflect the particular Arabic spelling shown. Where sources differ, preference should be given to the form found on the most authoritative source.
96
+
97
+ - In formal Arabic, the spelling of some words ending in a long vowel character may change according to that word’s grammatical function in a sentence. For example, the personal name Abū Bakr (ابو بكر ) would become Abī Bakr (ابي بكر ) when preceded by a generic in an iḑāfah construction (used in Moroccan Arabic Script) e.g. Shāri‘ Abī Bakr (شارع ابي بكر – Abu Bakr Street). The spelling of such words as found on the most authoritative source should be used in the romanized form of the name. Other common words affected by this rule are Banū/Banī (sons of…) and Dhū/Dhī (owner of ...). Examples of names in this category include Jabal Abā aş Şabbān (جبل ابا الصبان ) and Muḩāfaz̧at Dhī Qār ( محافظة ذي قار ).
98
+
99
+ - Occasionally the character sequences كه, ده, سه, and ته occur. They may be romanized k·h, d·h, s·h, and t·h in order to differentiate these romanizations from the digraphs kh, dh, sh, and th, which are used to represent the characters خ, ذ, ش, and ث respectively. See also note 4.
100
+
101
+
102
+ tests:
103
+
104
+ - source: قُرآن
105
+ expected: Qur’ān
106
+
107
+ - source: أَبُو ظَبْي
108
+ expected: Abū Z̧aby
109
+
110
+ - source: بِئْر زَيْت
111
+ expected: Bi’r Zayt
112
+
113
+ - source: أُمّ العَمَد
114
+ expected: Umm al ‘Amad
115
+
116
+ - source: البَحرَيْن
117
+ expected: Al Baḩrayn
118
+
119
+ - source: الكُوت
120
+ expected: Al Kūt
121
+
122
+ - source: الثُّلَيْثُوَات
123
+ expected: Ath Thulaythuwāt
124
+
125
+ - source: الجَزِيرَة
126
+ expected: Al Jazīrah
127
+
128
+ - source: المَحْمُودِيَّة
129
+ expected: Al Maḩmūdīyah
130
+
131
+ - source: خَيْبَر
132
+ expected: Khaybar
133
+
134
+ - source: دَمَنْهُور
135
+ expected: Damanhūr
136
+
137
+ - source: ذَهَب
138
+ expected: Dhahab
139
+
140
+ - source: الرَّوْضة
141
+ expected: Ar Rawḑah
142
+
143
+ - source: زُوَارَة
144
+ expected: Zuwārah
145
+
146
+ - source: السُّلَيْمانِيَّة
147
+ expected: As Sulaymānīyah
148
+
149
+ - source: الشَّام
150
+ expected: Ash Shām
151
+
152
+ - source: قَيْصُومَة
153
+ expected: Qayşūmah
154
+
155
+ - source: ضَوْر
156
+ expected: Ḑawr
157
+
158
+ - source: القُنَيْطِرَة
159
+ expected: Al Qunayţirah
160
+
161
+ - source: ظُفَار
162
+ expected: Z̧ufār
163
+
164
+ - source: أَبُو عَرِيش
165
+ expected: Abū ‘Arīsh
166
+
167
+ - source: بَغْداد
168
+ expected: Baghdād
169
+
170
+ - source: الفُرات
171
+ expected: Al Furāt
172
+
173
+ - source: قَطَر
174
+ expected: Qaţar
175
+
176
+ - source: الكُوَيْت
177
+ expected: Al Kuwayt
178
+
179
+ - source: حَلَب
180
+ expected: Ḩalab
181
+
182
+ - source: مَكَّة
183
+ expected: Makkah
184
+
185
+ - source: نَخْل
186
+ expected: Nakhl
187
+
188
+ - source: جَبَل هارُون
189
+ expected: Jabal Hārūn
190
+
191
+ - source: وادِي غَضَا
192
+ expected: Wādī Ghaḑā
193
+
194
+ - source: اليَمَن
195
+ expected: Al Yaman
196
+
197
+ - source: القاهِرَة
198
+ expected: Al Qāhirah
199
+
200
+ - source: المَدِينَة المُنَوَّرَة
201
+ expected: Al Madīnah al Munawwarah
202
+
203
+ - source: مُحَافَظَة دِمَشْق
204
+ expected: Muḩāfaz̧at Dimashq
205
+
206
+ - source: البَصْرَة
207
+ expected: Al Başrah
208
+
209
+ - source: الرِّيَاض
210
+ expected: Ar Riyāḑ
211
+
212
+ - source: القُدْس
213
+ expected: Al Quds
214
+
215
+ - source: بَاب المَنْدَب
216
+ expected: Bāb al Mandab
217
+
218
+ - source: المَدِينة
219
+ expected: Al Madīnah
220
+
221
+ - source: صُور
222
+ expected: Şūr
223
+
224
+ - source: مَرْسَىٰ مَطْرُوح
225
+ expected: Marsá Maţrūḩ
226
+
227
+ - source: صَيْدَا
228
+ expected: Şaydā
229
+
230
+ - source: الدَّوحَة
231
+ expected: Ad Dawḩah
232
+
233
+ - source: مُحَمَّد
234
+ expected: Muḩammad
235
+
236
+ - source: أُوزُونْلَار
237
+ expected: Ūzūnlār
238
+
239
+ - source: أَوْسَط
240
+ expected: Awsaţ
241
+
242
+ - source: سَنَاو
243
+ expected: Sanāw
244
+
245
+ - source: اِيرَان
246
+ expected: Īrān
247
+
248
+ - source: تَلّ السَّرَاي
249
+ expected: Tall as Sarāy
250
+
251
+ - source: آلْبُو مُعَيْط
252
+ expected: Ālbū Mu‘ayţ
253
+
254
+ - source: سَلْمان پَاك
255
+ expected: Salmān Pāk
256
+
257
+ - source: تَلّ كُوچِك الصَّغِير
258
+ expected: Tall Kūchik aş Şaghīr
259
+
260
+ # - source: مَزََّة ڤِيلَّات غَرْبِيَّة
261
+ # expected: Mazzah Vīllāt Gharbīyah
262
+
263
+ - source: ڨَفْصَة
264
+ expected: Gafşah
265
+
266
+ - source: تَلّ گَمْر
267
+ expected: Tall Gamr
268
+
269
+ - source: زَاڴُورَة
270
+ expected: Zāgūrah
271
+
272
+
273
+ map:
274
+ postrules:
275
+ - pattern: '(?<=\b)(?<!\b[‘|’])[\u0061-\uFFFF]'
276
+ result: "upcase"
277
+ # don't capitalize the definite article in the middle of a sentence
278
+ - pattern : ' At T' # الت
279
+ result: ' at T'
280
+ - pattern : ' Ath Th' # الث
281
+ result: ' ath Th'
282
+ - pattern : ' Ad D' # الد
283
+ result: ' ad D'
284
+ - pattern : ' Adh Dh' # الذ
285
+ result: ' adh Dh'
286
+ - pattern : ' Ar R' # الر
287
+ result: ' ar R'
288
+ - pattern : ' Az Z' # الز
289
+ result: ' az Z'
290
+ - pattern : ' As S' # الس
291
+ result: ' as S'
292
+ - pattern : ' Ash Sh' # الش
293
+ result: ' ash Sh'
294
+ - pattern : ' Aş Ş' # الص
295
+ result: ' aş Ş'
296
+ - pattern : ' Aḑ Ḑ' # الض
297
+ result: ' aḑ Ḑ'
298
+ - pattern : ' Aţ Ţ' # الط
299
+ result: ' aţ Ţ'
300
+ - pattern : ' Az̧ Z̧' # الظ
301
+ result: ' az̧ Z̧'
302
+ - pattern : ' Al L' # الل
303
+ result: ' al L'
304
+ pattern : ' An N' # الن
305
+ result: ' an N'
306
+ - pattern: " Al " # ال
307
+ result: " al "
308
+
309
+ characters:
310
+
311
+ # Modified/Non-Standard Arabic Script Characters
312
+
313
+ '\u067E': 'p'
314
+ '\u0686': 'ch'
315
+ '\u06A4': 'v'
316
+ # Used in Tunisian Arabic Script.
317
+ '\u06A8': 'g'
318
+ # Used principally in Iraq, but also sometimes used in other Arabic speaking countries to represent the ‘g’ sound.
319
+ '\u06AF': 'g'
320
+ # Used in Moroccan Arabic Script.
321
+ '\u06B4': 'g' # ڭ
322
+ '\u06AD': 'g'
323
+
324
+
325
+
326
+ # pointing
327
+
328
+
329
+ # Note 11
330
+ '\b\u0627\u0648': 'ū' #او
331
+ '\b\u0627\u0648\u0652' : 'aw' # اوْ
332
+ '\u0627\u0648': 'āw' #او in word medial or final position
333
+
334
+ '\b\u0627\u064A': 'ī' # اي in word initial position (see Note 11)
335
+ '\u0627\u064A' : 'āy' # اي in word medial or final position
336
+
337
+
338
+ '\u064e' : 'a' # َ fatha
339
+ '\u064e(?=\u0629)' : '' # َ fatha followed by ta' marboota
340
+ '\u064e(?=a[h|t])' : '' # َ fatha followed by ta' marboota, handling different order of conversion
341
+ '\u0650' : 'i' # ِ kasra
342
+ '\u064f' : 'u' # ُ damma
343
+
344
+ '\u064e\u0627' : 'ā' # ـَا fatha followed by ا
345
+ '\u0650\u064a' : 'ī' # ـِي kasra followed by ي
346
+ '\u064f\u0648' : 'ū' # ـُو damma followed by و
347
+ '\u064f\u0648(?=\u064e|\u064f)' : 'uw' # ـُو damma followed by و, when a fatha or damma follows
348
+ '\u064e\u0649' : 'á' # ـَى fatha followed by ى which is ا not ي Note 5
349
+ '\u064e\u0649\u0670' : 'á' # Note 5
350
+ '\u0649\u0670': 'á' # See Note 5
351
+ '\u0652' : '' # ْ sukun: not romanized, indicates the absence of a short vowel
352
+ '\u064e\u064a\u0652' : 'ay' # ـَيْ
353
+ '\u064e\u064a' : 'aī' # ـَي
354
+
355
+ '\u064e\u0648\u0652' : 'aw' # ـَوْ
356
+ '\u064b': '' # See Note 6
357
+ '\u064d': '' # See Note 6
358
+ '\u064c': '' # See Note 6
359
+
360
+
361
+ # special pointed letters
362
+ '\u0639\u064e' : '‘a' # عَ
363
+ '\u0639\u0650' : '‘i' # عِ
364
+ '\u0639\u064f' : '‘u' # عُ
365
+
366
+ # Note 2
367
+ '\u0623' : ''
368
+ # '\u0623\u064e' : 'a' # أَ
369
+ # '\u0625\u0650' : 'i' # إِ
370
+ # '\u0623\u064f' : 'u' # أُ
371
+ # '\u0623\u064f\u0648' : 'ū' # أُ
372
+
373
+ # handle MacOS regex difference
374
+ '\u0639\u064f\u0648' : '‘ū' # عُو damma followed by و
375
+
376
+ '\u0650\u064a\u0651\u064e' : 'īy' # ـِيَّ
377
+ '\u0650\u064a(?=\u064e|\u064f)' : 'iy' # ـِي kasra followed by ي, when a fatha or damma follows
378
+
379
+ # not romanized in word-initial position (see Note 2)
380
+ '\u0621': '’'
381
+
382
+ '\b\u0622' : 'ā' # آ in word initial position (see Notes 3 and 9)
383
+ '\u0622': '’ā' # آ in word medial position (see Notes 3 and 9)
384
+ '\u0671': '’' # See Note 8
385
+ '\u0626' : "’" # ئ
386
+ '\u0627': 'ā' # See Notes 3 and 10
387
+
388
+ '\b\u0627\u064f\u0648' : 'ū' # اُو
389
+ '\b\u0627\u0650\u064a' : 'ī' # اي
390
+ '\b\u0627\u0644' : 'al ' # ال
391
+
392
+ # Sun letters
393
+
394
+ '\b\u0627\u0644\u062a\u0651?' : 'at t' # الت
395
+ '\b\u0627\u0644\u062b\u0651?' : 'ath th' # الث
396
+ '\b\u0627\u0644\u062f\u0651?' : 'ad d' # الد
397
+ '\b\u0627\u0644\u0630\u0651?' : 'adh dh' # الذ
398
+ '\b\u0627\u0644\u0631\u0651?' : 'ar r' # الر
399
+ '\b\u0627\u0644\u0632\u0651?' : 'az z' # الز
400
+ '\b\u0627\u0644\u0633\u0651?' : 'as s' # الس
401
+ '\b\u0627\u0644\u0634\u0651?' : 'ash sh' # الش
402
+ '\b\u0627\u0644\u0635\u0651?' : 'aş ş' # الص
403
+ '\b\u0627\u0644\u0636\u0651?' : 'aḑ ḑ' # الض
404
+ '\b\u0627\u0644\u0637\u0651?' : 'aţ ţ' # الط
405
+ '\b\u0627\u0644\u0638\u0651?' : 'az̧ z̧' # الظ
406
+ '\b\u0627\u0644\u0644\u0651?' : 'al l' # الل
407
+ '\b\u0627\u0644\u0646\u0651?' : 'an n' # الن
408
+
409
+ # shadda Note 7
410
+
411
+ '\u0628\u0651' : 'bb' # ب
412
+ '\u062a\u0651' : 'tt' # ت
413
+ '\u062b\u0651' : 'thth' # ث
414
+ '\u062c\u0651' : 'jj' # ج
415
+ '\u062d\u0651' : 'ḩḩ' # ح
416
+ '\u062e\u0651' : 'khkh' # خ
417
+ '\u062f\u0651' : 'dd' # د
418
+ '\u0630\u0651' : 'dhdh' # ذ
419
+ '\u0631\u0651' : 'rr' # ر
420
+ '\u0632\u0651' : 'zz' # ز
421
+ '\u0633\u0651' : 'ss' # س
422
+ '\u0634\u0651' : 'shsh' # ش
423
+ '\u0635\u0651' : 'şş' # ص
424
+ '\u0636\u0651' : 'ḑḑ' # ض
425
+ '\u0637\u0651' : 'ţţ' # ط
426
+ '\u0638\u0651' : 'z̧z̧' # ظ
427
+ '\u063a\u0651' : 'ghgh' # غ
428
+ '\u0641\u0651' : 'ff' # ف
429
+ '\u0642\u0651' : 'qq' # ق
430
+ '\u0643\u0651' : 'kk' # ك
431
+ '\u0644\u0651' : 'll' # ل
432
+ '\u0645\u0651' : 'mm' # م
433
+ '\u0646\u0651' : 'nn' # ن
434
+ '\u0647\u0651' : 'hh' # ه
435
+ '\u0648\u0651' : 'ww' # و
436
+ '\u064a\u0651' : 'yy' # ي
437
+
438
+ # ta' marboota See Note 4
439
+
440
+ '\u0629' : 'at' # ة in the middle of the sentence
441
+ '\u0629$' : 'ah'
442
+ '(?<=\b\u0627\u0644[\u0600-\u06ff]{2})\u0629' : 'ah'
443
+ '(?<=\b\u0627\u0644[\u0600-\u06ff]{3})\u0629' : 'ah'
444
+ '(?<=\b\u0627\u0644[\u0600-\u06ff]{4})\u0629' : 'ah'
445
+ '(?<=\b\u0627\u0644[\u0600-\u06ff]{5})\u0629' : 'ah'
446
+ '(?<=\b\u0627\u0644[\u0600-\u06ff]{6})\u0629' : 'ah'
447
+ '(?<=\b\u0627\u0644[\u0600-\u06ff]{7})\u0629' : 'ah'
448
+ '(?<=\b\u0627\u0644[\u0600-\u06ff]{8})\u0629' : 'ah'
449
+ '(?<=\b\u0627\u0644[\u0600-\u06ff]{9})\u0629' : 'ah'
450
+ '(?<=\b\u0627\u0644[\u0600-\u06ff]{10})\u0629' : 'ah'
451
+ '(?<=\b\u0627\u0644[\u0600-\u06ff]{11})\u0629' : 'ah'
452
+ '(?<=\b\u0627\u0644[\u0600-\u06ff]{12})\u0629' : 'ah'
453
+ '(?<=\b\u0627\u0644[\u0600-\u06ff]{13})\u0629' : 'ah'
454
+
455
+
456
+ # standard consonant characters
457
+
458
+ '\u0628' : 'b' # ب
459
+ '\uFE91' : 'b' # ﺑ
460
+ '\uFE92' : 'b' # ﺒ
461
+ '\uFE90' : 'b' # ﺐ
462
+
463
+ '\u062a' : 't' # ت
464
+ '\ufe97' : 't' # ﺗ
465
+ '\ufe98' : 't' # ﺘ
466
+ '\ufe96' : 't' # ﺖ
467
+
468
+ '\u062b' : 'th' # ث
469
+ '\ufe9b' : 'th' # ﺛ
470
+ '\ufe9c' : 'th' # ﺜ
471
+ '\ufe9a' : 'th' # ﺚ
472
+
473
+ '\u062c' : 'j' # ج
474
+ '\ufe9f' : 'j' # ﺟ
475
+ '\ufea0' : 'j' # ﺠ
476
+ '\ufe9e' : 'j' # ﺞ
477
+
478
+ '\u062d' : 'ḩ' # ح
479
+ '\ufea3' : 'ḩ' # ﺣ
480
+ '\ufea4' : 'ḩ' # ﺤ
481
+ '\ufea2' : 'ḩ' # ﺢ
482
+
483
+ '\u062e' : 'kh' # خ
484
+ '\ufea7' : 'kh' # ﺧ
485
+ '\ufea8' : 'kh' # ﺨ
486
+ '\ufea6' : 'kh' # ﺦ
487
+
488
+ '\u062f' : 'd' # د
489
+ '\ufeaa' : 'd' # ﺪ
490
+
491
+ '\u0630' : 'dh' # ذ
492
+ '\ufeac' : 'dh' # ﺬ
493
+
494
+ '\u0631' : 'r' # ر
495
+ '\ufeae' : 'r' # ﺮ
496
+
497
+ '\u0632' : 'z' # ز
498
+ '\ufeb0' : 'z' # ﺰ
499
+
500
+ '\u0633' : 's' # س
501
+ '\ufeb3' : 's' # ﺳ
502
+ '\ufeb4' : 's' # ﺴ
503
+ '\ufeb2' : 's' # ﺲ
504
+
505
+ '\u0634' : 'sh' # ش
506
+ '\ufeb7' : 'sh' # ﺷ
507
+ '\ufeb8' : 'sh' # ﺸ
508
+ '\ufeb6' : 'sh' # ﺶ
509
+
510
+ '\u0635' : 'ş' # ص
511
+ '\ufebb' : 'ş' # ﺻ
512
+ '\ufebc' : 'ş' # ﺼ
513
+ '\ufeba' : 'ş' # ﺺ
514
+
515
+ '\u0636' : 'ḑ' # ض
516
+ '\ufebf' : 'ḑ' # ﺿ
517
+ '\ufec0' : 'ḑ' # ﻀ
518
+ '\ufebe' : 'ḑ' # ﺾ
519
+
520
+ '\u0637' : 'ţ' # ط
521
+ '\ufec3' : 'ţ' # ﻃ
522
+ '\ufec4' : 'ţ' # ﻄ
523
+ '\ufec2' : 'ţ' # ﻂ
524
+
525
+ '\u0638' : 'z̧' # ظ
526
+ '\ufec7' : 'z̧' # ﻇ
527
+ '\ufec8' : 'z̧' # ﻈ
528
+ '\ufec6' : 'z̧' # ﻆ
529
+
530
+ '\u0639' : '‘' # ع
531
+ '\ufecb' : '‘' # ﻋ
532
+ '\ufecc' : '‘' # ﻌ
533
+ '\ufeca' : '‘' # ﻊ
534
+
535
+ '\u063a' : 'gh' # غ
536
+ '\ufecf' : 'gh' # ﻏ
537
+ '\ufed0' : 'gh' # ﻐ
538
+ '\ufece' : 'gh' # ﻎ
539
+
540
+ '\u0641' : 'f' # ف
541
+ '\ufed3' : 'f' # ﻓ
542
+ '\ufed4' : 'f' # ﻔ
543
+ '\ufed2' : 'f' # ﻒ
544
+
545
+ '\u0642' : 'q' # ق
546
+ '\ufed7' : 'q' # ﻗ
547
+ '\ufed8' : 'q' # ﻘ
548
+ '\ufed6' : 'q' # ﻖ
549
+
550
+ '\u0643' : 'k' # ك
551
+ '\ufedb' : 'k' # ﻛ
552
+ '\ufedc' : 'k' # ﻜ
553
+ '\ufeda' : 'k' # ﻚ
554
+
555
+ '\u0644' : 'l' # ل
556
+ '\ufedf' : 'l' # ﻟ
557
+ '\ufee0' : 'l' # ﻠ
558
+ '\ufede' : 'l' # ﻞ
559
+
560
+ '\u0645' : 'm' # م
561
+ '\ufee3' : 'm' # ﻣ
562
+ '\ufee4' : 'm' # ﻤ
563
+ '\ufee2' : 'm' # ﻢ
564
+
565
+ '\u0646' : 'n' # ن
566
+ '\ufee7' : 'n' # ﻧ
567
+ '\ufee8' : 'n' # ﻨ
568
+ '\ufee6' : 'n' # ﻦ
569
+
570
+ '\u0647' : 'h' # ه
571
+ '\ufeeb' : 'h' # ﻫ
572
+ '\ufeec' : 'h' # ﻬ
573
+ '\ufeea' : 'h' # ﻪ
574
+
575
+ '\u0648' : 'w' # و
576
+ '\ufeee' : 'w' # ﻮ
577
+
578
+ '\u064a' : 'y' # ي
579
+ '\ufef3' : 'y' # ﻳ
580
+ '\ufef4' : 'y' # ﻴ
581
+ '\ufef1' : 'y' # ﻱ
582
+
583
+
584
+ # NUMERALS
585
+
586
+ # Although Perso-Arabic script is written from right to left, numerical expressions, e.g. ۱۹٦۸ → 1968, are written from left to right.
587
+ '۰': '0'
588
+ '۱': '1'
589
+ '۲': '2'
590
+ '۳': '3'
591
+ '٤': '4'
592
+ '٥': '5'
593
+ '٦': '6'
594
+ '۷': '7'
595
+ '۸': '8'
596
+ '۹': '9'