interscript 0.1.5 → 0.1.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (116) hide show
  1. checksums.yaml +4 -4
  2. data/lib/__pycache__/g2pwrapper.cpython-38.pyc +0 -0
  3. data/lib/interscript.rb +5 -1
  4. data/lib/interscript/fs.rb +3 -1
  5. data/lib/interscript/mapping.rb +2 -2
  6. data/lib/interscript/opal.rb +5 -1
  7. data/lib/interscript/opal/maps.js.erb +7 -4
  8. data/lib/interscript/version.rb +1 -1
  9. data/maps/acadsin-zho-Hani-Latn-2002.yaml +1 -1
  10. data/maps/alalc-amh-Ethi-Latn-1997.yaml +509 -0
  11. data/maps/alalc-amh-Ethi-Latn-2011.yaml +138 -0
  12. data/maps/alalc-ara-Arab-Latn-1997.yaml +1283 -0
  13. data/maps/alalc-asm-Deva-Latn-1997.yaml +159 -0
  14. data/maps/alalc-aze-Cyrl-Latn-1997.yaml +1 -1
  15. data/maps/{alalc-bel-cyrl-latn-1997.yaml → alalc-bel-Cyrl-Latn-1997.yaml} +2 -2
  16. data/maps/alalc-ell-Grek-Latn-1997.yaml +2 -3
  17. data/maps/alalc-ell-Grek-Latn-2010.yaml +2 -3
  18. data/maps/alalc-hin-Deva-Latn-2020.yaml +159 -0
  19. data/maps/alalc-kat-Geok-Latn-1997.yaml +1 -2
  20. data/maps/alalc-kor-Hang-Latn-1997.yaml +1 -1
  21. data/maps/alalc-mar-Deva-Latn-1997.yaml +170 -0
  22. data/maps/{alalc-mkd-cyrl-latn-1997.yaml → alalc-mkd-Cyrl-Latn-1997.yaml} +0 -0
  23. data/maps/alalc-pan-Deva-Latn-1997.yaml +237 -0
  24. data/maps/alalc-rus-Cyrl-Latn-1997.yaml +1 -2
  25. data/maps/alalc-rus-Cyrl-Latn-2012.yaml +2 -2
  26. data/maps/{alalc-srp-cyrl-latn-2013.yaml → alalc-srp-Cyrl-Latn-2013.yaml} +0 -0
  27. data/maps/bas-rus-Cyrl-Latn-2017-bss.yaml +0 -1
  28. data/maps/bgn-jpn-Hrkt-Latn-1962.yaml +0 -2
  29. data/maps/bgn-kor-Hang-Latn-1943.yaml +1 -1
  30. data/maps/bgn-kor-Kore-Latn-1943.yaml +1 -1
  31. data/maps/bgnpcgn-amh-Ethi-Latn-1967.yaml +528 -0
  32. data/maps/bgnpcgn-ara-Arab-Latn-1956.yaml +592 -0
  33. data/maps/bgnpcgn-bak-Cyrl-Latn-2007.yaml +5 -5
  34. data/maps/{bgnpcgn-bel-cyrl-latn-1979.yaml → bgnpcgn-bel-Cyrl-Latn-1979.yaml} +0 -0
  35. data/maps/bgnpcgn-ell-Grek-Latn-1962.yaml +3 -4
  36. data/maps/bgnpcgn-ell-Grek-Latn-1996.yaml +0 -1
  37. data/maps/bgnpcgn-kat-Geor-Latn-2009.yaml +0 -1
  38. data/maps/bgnpcgn-kor-Hang-Latn-kn-1945.yaml +17 -17
  39. data/maps/bgnpcgn-kor-Hang-Latn-rok-2011.yaml +2 -2
  40. data/maps/bgnpcgn-kor-Kore-Latn-rok-2011.yaml +2 -2
  41. data/maps/bgnpcgn-mkd-Cyrl-Latn-2013.yaml +1 -1
  42. data/maps/bgnpcgn-nep-Deva-Latn-2011.yaml +200 -0
  43. data/maps/bgnpcgn-per-Arab-Latn-1956.yaml +0 -1
  44. data/maps/bgnpcgn-ukr-Cyrl-Latn-1965.yaml +0 -1
  45. data/maps/bis-asm-Beng-Latn-13194-1991.yaml +159 -0
  46. data/maps/bis-ben-Beng-Latn-13194-1991.yaml +156 -0
  47. data/maps/bis-dev-Deva-Latn-13194-1991.yaml +184 -0
  48. data/maps/bis-gjr-Gujr-Latn-13194-1991.yaml +166 -0
  49. data/maps/bis-knd-Knda-Latn-13194-1991.yaml +173 -0
  50. data/maps/bis-mlm-Mlym-Latn-13194-1991.yaml +176 -0
  51. data/maps/bis-ori-Orya-Latn-13194-1991.yaml +160 -0
  52. data/maps/bis-pnj-Guru-Latn-13194-1991.yaml +175 -0
  53. data/maps/bis-tel-Telu-Latn-13194-1991.yaml +170 -0
  54. data/maps/bis-tml-Taml-Latn-13194-1991.yaml +155 -0
  55. data/maps/by-bel-Cyrl-Latn-1998.yaml +4 -4
  56. data/maps/by-bel-Cyrl-Latn-2007.yaml +3 -3
  57. data/maps/dos-nep-Deva-Latn-1997.yaml +33 -0
  58. data/maps/elot-ell-Grek-Latn-743-1982-tl.yaml +4 -5
  59. data/maps/elot-ell-Grek-Latn-743-1982-ts.yaml +4 -5
  60. data/maps/elot-ell-Grek-Latn-743-2001-tl.yaml +0 -1
  61. data/maps/elot-ell-Grek-Latn-743-2001-ts.yaml +0 -1
  62. data/maps/ggg-kat-Geor-Latn-2002.yaml +0 -1
  63. data/maps/{gki-bel-cyrl-latn-1992.yaml → gki-bel-Cyrl-Latn-1992.yaml} +1 -1
  64. data/maps/{gki-bel-cyrl-latn-2000.yaml → gki-bel-Cyrl-Latn-2000.yaml} +1 -1
  65. data/maps/{gost-rus-cyrl-latn-16876-71-1983.yaml → gost-rus-Cyrl-Latn-16876-71-1983.yaml} +1 -1
  66. data/maps/icao-bel-Cyrl-Latn-9303.yaml +0 -5
  67. data/maps/icao-bul-Cyrl-Latn-9303.yaml +0 -4
  68. data/maps/icao-per-Arab-Latn-9303.yaml +0 -1
  69. data/maps/icao-rus-Cyrl-Latn-9303.yaml +0 -1
  70. data/maps/icao-ukr-Cyrl-Latn-9303.yaml +0 -1
  71. data/maps/iso-ara-Arab-Latn-233-1984.yaml +323 -0
  72. data/maps/iso-ell-Grek-Latn-843-1997-t1.yaml +4 -5
  73. data/maps/iso-ell-Grek-Latn-843-1997-t2.yaml +1 -2
  74. data/maps/iso-rus-Cyrl-Latn-9-1995.yaml +0 -1
  75. data/maps/iso-tha-Thai-Latn-11940-1998.yaml +1 -1
  76. data/maps/kp-kor-Hang-Latn-2002.yaml +4 -4
  77. data/maps/lshk-yue-Hani-Latn-jyutping-1993.yaml +2 -2
  78. data/maps/mvd-bel-Cyrl-Latn-2008.yaml +4 -4
  79. data/maps/mvd-bel-Cyrl-Latn-2010.yaml +4 -4
  80. data/maps/mvd-rus-Cyrl-Latn-2008.yaml +1 -2
  81. data/maps/mvd-rus-Cyrl-Latn-2010.yaml +4 -4
  82. data/maps/nil-kor-Hang-Hang-jamo.yaml +3 -3
  83. data/maps/odni-aze-Cyrl-Latn-2015.yaml +1 -1
  84. data/maps/odni-bel-Cyrl-Latn-2015.yaml +1 -1
  85. data/maps/odni-bul-Cyrl-Latn-2015.yaml +3 -3
  86. data/maps/odni-hin-Deva-Latn-2015.yaml +258 -0
  87. data/maps/odni-kat-Geor-Latn-2015.yaml +0 -1
  88. data/maps/{odni-mkd-cyrl-latn-2015.yaml → odni-mkd-Cyrl-Latn-2015.yaml} +0 -0
  89. data/maps/odni-rus-Cyrl-Latn-2015.yaml +1 -1
  90. data/maps/odni-srp-Cyrl-Latn-2015.yaml +2 -2
  91. data/maps/odni-urd-Arab-Latn-2015.yaml +221 -0
  92. data/maps/odni-uzb-Cyrl-Latn-2015.yaml +1 -2
  93. data/maps/royin-tha-Thai-Latn-1939-generic.yaml +4 -4
  94. data/maps/royin-tha-Thai-Latn-1968.yaml +4 -4
  95. data/maps/royin-tha-Thai-Latn-1999-chained.yaml +4 -4
  96. data/maps/royin-tha-Thai-Latn-1999.yaml +3 -3
  97. data/maps/{ses-ara-arab-latn-1930.yaml → ses-ara-Arab-Latn-1930.yaml} +7 -3
  98. data/maps/un-ara-Arab-Latn-1971.yaml +16 -4
  99. data/maps/un-ara-Arab-Latn-1972.yaml +14 -7
  100. data/maps/un-ara-Arab-Latn-2017.yaml +56 -19
  101. data/maps/un-bel-Cyrl-Latn-2007.yaml +3 -3
  102. data/maps/un-ell-Grek-Latn-1987-tl.yaml +1 -2
  103. data/maps/un-ell-Grek-Latn-1987-ts.yaml +0 -1
  104. data/maps/un-ell-Grek-Latn-phonetic-1987.yaml +42 -42
  105. data/maps/un-mon-Mong-Latn-2013.yaml +9 -3
  106. data/maps/un-nep-Deva-Latn-1972.yaml +163 -0
  107. data/maps/un-rus-Cyrl-Latn-1987.yaml +1 -1
  108. data/maps/{un-ukr-cyrl-latn-1998.yaml → un-ukr-Cyrl-Latn-1998.yaml} +1 -1
  109. data/maps/ungegn-amh-Ethi-Latn-2016.yaml +575 -0
  110. data/maps/var-kor-Hang-Latn-mr-1939.yaml +2 -2
  111. data/maps/var-kor-Kore-Hang-2013.yaml +1 -1
  112. data/maps/var-kor-Kore-Latn-mr-1939.yaml +1 -2
  113. data/maps/var-tha-Thai-Thai-phonemic.yaml +5 -5
  114. data/maps/var-tha-Thai-Zsym-ipa.yaml +12 -12
  115. data/maps/var-zho-Hani-Latn-1979.yaml +7 -7
  116. metadata +41 -15
@@ -9,10 +9,10 @@ url: https://unstats.un.org/unsd/geoinfo/UNGEGN/docs/20th-gegn-docs/20th_gegn_WP
9
9
  creation_date: 1998
10
10
  description: |
11
11
  The national system of romanization for Belarusian was approved by the State Committee on Land Resources,
12
- Geodesy and Cartography, Republic of Belarus, on 20 March, 1998. This scheme was also supported by
12
+ Geodesy and Cartography, Republic of Belarus, on 20 March, 1998. This scheme was also supported by
13
13
  the Y. Kolas Institute of Linguistics and the Republic Committee on Toponymy at the Belarusian Academy of Sciences.
14
- While the system is still based on GOST 1983, it takes more precisely into account the peculiarities of
15
- the Belarusian orthography. The system is reversible though there may exist some ambiguous consonant combinations.
14
+ While the system is still based on GOST 1983, it takes more precisely into account the peculiarities of
15
+ the Belarusian orthography. The system is reversible though there may exist some ambiguous consonant combinations.
16
16
 
17
17
  tests: # the same as in by-bel-Cyrl-Latn-2007
18
18
  - source: Аршанскi
@@ -107,7 +107,7 @@ tests: # the same as in by-bel-Cyrl-Latn-2007
107
107
  expected: Viazynka
108
108
 
109
109
  map:
110
- inherit: gost-rus-cyrl-latn-16876-71-1983
110
+ inherit: gost-rus-Cyrl-Latn-16876-71-1983
111
111
 
112
112
  rules:
113
113
  - pattern: (?<=[БбВвГгДдЖжЗзЙйКкЛлМмНнПпРрСсТтФфХхЦцЧчШш])\u0415 # Е after consonants
@@ -8,11 +8,11 @@ name: REGULATORY LEGAL ACTS OF THE NATIONAL BANK, NATIONAL ACADEMY OF SCIENCES O
8
8
  url: http://www.pravo.by/pdf/2007-159/2007-159(027-028).pdf
9
9
  creation_date: 2007
10
10
  description: |
11
- RESOLUTION OF THE STATE COMMITTEE
11
+ RESOLUTION OF THE STATE COMMITTEE
12
12
  ON PROPERTY OF THE REPUBLIC OF BELARUS June 11, 2007 No. 38
13
13
 
14
- 8/16668 (06/18/2007) On amendments and additions to the Instructions
15
- for the transliteration of geographical names of the
14
+ 8/16668 (06/18/2007) On amendments and additions to the Instructions
15
+ for the transliteration of geographical names of the
16
16
  Republic of Belarus in letters of the Latin alphabet
17
17
 
18
18
  Based on the Regulation on the State Property Committee of the Republic of Belarus,
@@ -0,0 +1,33 @@
1
+ ---
2
+ authority_id: dos
3
+ id: 1997
4
+ language: nep
5
+ source_script: Deva
6
+ destination_script: Latn
7
+ name: Nepali Romanization, 1997
8
+ url: http://nationalgeoportal.gov.np/old/pdf/translation2.pdf
9
+ creation_date: 1997
10
+ description: |
11
+ Survey Department, Ministry of Land Management, Cooperatives and Poverty Alleviation, Government of Nepal.
12
+
13
+ notes:
14
+ - |
15
+ ं (anusvara) is rendered by
16
+ ṅ before क, ख, ग, and घ
17
+ ñ before च, छ, ज, and झ
18
+ ṇ before ट, ठ, ड, and ढ
19
+ n before त, थ, द, and ध
20
+ ṁ before य, र, ल, व, श, ष, स
21
+
22
+ tests:
23
+ - source: "दुःख"
24
+ expected: "duhkh"
25
+
26
+ map:
27
+
28
+ inherit: "bgnpcgn-nep-Deva-Latn-2011"
29
+
30
+ characters:
31
+
32
+ # Bisarga
33
+ 'ः': 'h'
@@ -5,7 +5,7 @@ language: ell
5
5
  source_script: Grek
6
6
  destination_script: Latn
7
7
  name: ELOT 743:1982 (transliteration)
8
- url:
8
+ url:
9
9
  creation_date: 1982
10
10
  description: |
11
11
  ELOT 743:1982 transliteration table for Greek
@@ -22,7 +22,7 @@ tests:
22
22
 
23
23
  expected: |
24
24
  Éna práma mónon me parakíni̱se ki eména na grápso̱ óti toúti̱n ti̱n patrída ti̱n échomen óloi mazí, kai sofoí ki amatheís kai ploúsioi kai fto̱choí kai politikoí kai stratio̱tikoí kai oi pléon mikróteroi ánthro̱poi; ósoi ago̱nistí̱kamen, analógo̱s o katheís, échomen na zí̱somen edó̱. To loipón doulépsamen óloi mazí, na ti̱n fylámen ki óloi mazí kai na mi̱n légei oúte o dynatós «egó̱» oúte o adýnatos. Xérete póte na légei o katheís «egó̱»? Ótan ago̱nisteí mónos tou kai fkiásei í̱ chalásei, na légei «egó̱»; ótan ómo̱s ago̱nízontai polloí kai fkiánoun, tóte na léne «emeís». Eímaste eis to «emeís» ki óchi eis to «egó̱». Kai eis to exí̱s na máthomen gnó̱si̱, an thélomen na fkiásomen cho̱rión, na zí̱somen óloi mazí.
25
-
25
+
26
26
  Giánni̱s Makrygiánni̱s.
27
27
 
28
28
 
@@ -70,7 +70,7 @@ tests:
70
70
  expected: Taÿ́getos
71
71
  - source: σπρέυ
72
72
  expected: spréy
73
-
73
+
74
74
  - source: Αθήνα
75
75
  expected: Athí̱na
76
76
  - source: Άγιον Όρος
@@ -566,7 +566,7 @@ map:
566
566
  - pattern: (?<=[Οο])\u03C5 # υ (after Ο)
567
567
  result: u
568
568
  - pattern: (?<=[Οο])\u03CD # ύ (after Ο)
569
- result: ú
569
+ result: ú
570
570
  - pattern: (?<=[ΆάΈέΉήΌό])\u03A5 # Άυ, Έυ, Ήυ, Όυ
571
571
  result: Υ
572
572
  - pattern: (?<=[ΆάΈέΉήΌό])\u03C5 # Άυ, Έυ, Ήυ, Όυ
@@ -682,4 +682,3 @@ map:
682
682
 
683
683
  "\u0387": ";" # ·
684
684
  "\u00B7": ";" # ·
685
-
@@ -5,7 +5,7 @@ language: ell
5
5
  source_script: Grek
6
6
  destination_script: Latn
7
7
  name: ELOT 743:1982
8
- url:
8
+ url:
9
9
  creation_date: 1982
10
10
  description: |
11
11
  ELOT 743:1982 transcription table for Greek.
@@ -20,7 +20,7 @@ tests:
20
20
 
21
21
  expected: |
22
22
  Éna práma mónon me parakínise ki eména na grápso óti toútin tin patrída tin échomen óloi mazí, kai sofoí ki amatheís kai ploúsioi kai ftochoí kai politikoí kai stratiotikoí kai oi pléon mikróteroi ánthropoi; ósoi agonistíkamen, analógos o katheís, échomen na zísomen edó. To loipón doulépsamen óloi mazí, na tin fylámen ki óloi mazí kai na min légei oúte o dynatós «egó» oúte o adýnatos. Xérete póte na légei o katheís «egó»? Ótan agonisteí mónos tou kai fkiásei í chalásei, na légei «egó»; ótan ómos agonízontai polloí kai fkiánoun, tóte na léne «emeís». Eímaste eis to «emeís» ki óchi eis to «egó». Kai eis to exís na máthomen gnósi, an thélomen na fkiásomen chorión, na zísomen óloi mazí.
23
-
23
+
24
24
  Giánnis Makrygiánnis.
25
25
 
26
26
  - source: ΑΘΗΝΑ
@@ -67,7 +67,7 @@ tests:
67
67
  expected: Taÿ́getos
68
68
  - source: σπρέυ
69
69
  expected: spréy
70
-
70
+
71
71
  - source: Αθήνα
72
72
  expected: Athína
73
73
  - source: Άγιον Όρος
@@ -563,7 +563,7 @@ map:
563
563
  - pattern: (?<=[Οο])\u03C5 # υ (after Ο)
564
564
  result: u
565
565
  - pattern: (?<=[Οο])\u03CD # ύ (after Ο)
566
- result: ú
566
+ result: ú
567
567
  - pattern: (?<=[ΆάΈέΉήΌό])\u03A5 # Άυ, Έυ, Ήυ, Όυ
568
568
  result: Υ
569
569
  - pattern: (?<=[ΆάΈέΉήΌό])\u03C5 # Άυ, Έυ, Ήυ, Όυ
@@ -678,4 +678,3 @@ map:
678
678
 
679
679
  "\u0387": ";" # ·
680
680
  "\u00B7": ";" # ·
681
-
@@ -17,4 +17,3 @@ map:
17
17
  character_separator: ""
18
18
  word_separator: " "
19
19
  inherit: "iso-ell-Grek-Latn-843-1997-t1"
20
-
@@ -29,4 +29,3 @@ map:
29
29
  character_separator: ""
30
30
  word_separator: " "
31
31
  inherit: "iso-ell-Grek-Latn-843-1997-t2"
32
-
@@ -86,4 +86,3 @@ map:
86
86
  '\u10ee' : 'kh' # ხ
87
87
  '\u10ef' : 'j' # ჯ
88
88
  '\u10f0' : 'h' # ჰ
89
-
@@ -20,7 +20,7 @@ tests:
20
20
  # "Kanyukhi" in GNDB `bel_Cyrl2Latn_GBO_1992`, but that is clearly bgnpcgn-ukr-Cyrl-Latn-1965
21
21
 
22
22
  map:
23
- inherit: gost-rus-cyrl-latn-16876-71-1983
23
+ inherit: gost-rus-Cyrl-Latn-16876-71-1983
24
24
 
25
25
  characters:
26
26
  '\u0406' : 'I' # І
@@ -8,7 +8,7 @@ name: On approval of the Instructions for the transliteration of geographical na
8
8
  url: https://registr.by/doc/103003
9
9
  creation_date: 2000
10
10
  description: |
11
- Act name:
11
+ Act name:
12
12
  On approval of the Instructions for the transliteration of geographical names of
13
13
  the Republic of Belarus in letters of the Latin alphabet
14
14
  Type of act, adoption authority, date and number of adoption (publication):
@@ -24,7 +24,7 @@ description: |
24
24
  In 1978, COMECON adopted GOST 16876-71 with minor modifications as its official transliteration standard,
25
25
  under the name of SEV 1362-78 (Russian: СЭВ 1362-78).
26
26
 
27
- In 1982, In accordance with Order No. 169 of April 16, 1982, GOST 16876-71 / ST SEV 1362-78 was put into effect on May 1, 1982.
27
+ In 1982, in accordance with Order No. 169 of April 16, 1982, GOST 16876-71 / ST SEV 1362-78 was put into effect on May 1, 1982.
28
28
 
29
29
  In 1983, in accordance with Order No. 231 of May 16, 1983, additional guidelines were released (check notes[2])
30
30
 
@@ -134,8 +134,3 @@ map:
134
134
  "\u0457": "i" # ї
135
135
  "\u0453": "g" # ѓ
136
136
  "\u0456": "i" # і
137
-
138
-
139
-
140
-
141
-
@@ -116,7 +116,3 @@ map:
116
116
  "\u0454": "ie" # є
117
117
  "\u0457": "i" # ї
118
118
  "\u0453": "g" # ѓ
119
-
120
-
121
-
122
-
@@ -101,4 +101,3 @@ map:
101
101
  "\u06D0": "Y" # ې
102
102
  "\u06D2": "XYB" # ے
103
103
  "\u06D3": "XBE" # ۓ
104
-
@@ -115,4 +115,3 @@ map:
115
115
  "\u0454": "ie" # є
116
116
  "\u0457": "i" # ї
117
117
  "\u0453": "g" # ѓ
118
-
@@ -117,4 +117,3 @@ map:
117
117
  "\u0454": "ie" # є
118
118
  "\u0457": "i" # ї
119
119
  "\u0453": "g" # ѓ
120
-
@@ -0,0 +1,323 @@
1
+ ---
2
+ authority_id: iso
3
+ id: 233-1984
4
+ language: ara
5
+ source_script: Arab
6
+ destination_script: Latn
7
+ name: ISO 233:1984 Documentation — Transliteration of Arabic characters into Latin characters
8
+ url:
9
+ - https://www.iso.org/standard/4117.html
10
+ - http://transliteration.eki.ee/pdf/Arabic_2.2.pdf
11
+ - http://www.eki.ee/wgrs/rom1_ar.pdf
12
+ creation_date: 1984
13
+ confirmation_date: 2018-06
14
+ description: |
15
+ Is one of a series of International Standards dealing with
16
+ the conversion of systems of writing, following the
17
+ principles of stringent conversion in order to permit
18
+ international information exchange. Its aim is to provide a
19
+ means for international communication of written messages
20
+ in a form which permits the automatic transmission and
21
+ reconstitution of these by men or machines. Cancels and
22
+ replaces ISO Recommendation R 233-1961
23
+ notes:
24
+ - |
25
+ The transliteration ISO 233:1984 WRT ara-arab-latn-2017 gives every character and diacritical mark a unique
26
+ equivalent and e.g. long vowels in Arabic ā, ī and ū are consequently written a’, iy and uw
27
+ respectively in the ISO transliteration. Other main correspondences
28
+ ث is ṯ instead of th
29
+ ج is ǧ instead of j
30
+ ح is ḥ instead of ẖ
31
+ خ is ẖ instead of kh
32
+ ذ is ḏ instead of dh
33
+ ش is š instead of sh
34
+ ص is ṣ instead of s̱
35
+ ض is ḍ instead of ḏ
36
+ ط is ṭ instead of ṯ
37
+ ظ is ẓ instead of d͟h
38
+ غ is ġ instead of gh
39
+ ة is ẗ instead of h/t
40
+ ى is ỳ
41
+ ـِي is iy instead of ī
42
+ ـُو is uw instead of ū
43
+ ـَا is a’ instead of ā
44
+ ـَى is aỳ instead of á
45
+
46
+ tests:
47
+
48
+ - source: مِصر
49
+ expected: Miṣr
50
+
51
+ - source: قَطَر
52
+ expected: Qaṭar
53
+
54
+ - source: الجُمهُورِيَّة العِرَاقِيَّة
55
+ expected: Al Ǧumhuwriyaẗ al ‘Ira’qiyaẗ
56
+
57
+ - source: جُمهُورِيَّة مِصر العَرَبِيَّة
58
+ expected: Ǧumhuwriyaẗ Miṣr al ‘Arabiyaẗ
59
+
60
+ - source: الرِيَاض
61
+ expected: Ar Riya’ḍ
62
+
63
+ - source: الشارِقة
64
+ expected: Aš Šâriqaẗ
65
+
66
+ map:
67
+ postrules:
68
+ - pattern: (?<=\b)(?<!\b[‘|’|'])[\u0061-\uFFFF]
69
+ result: "upcase"
70
+ # don't capitalize the definite article in the middle of a sentence
71
+ - pattern : ' At T' # الت
72
+ result: ' at T'
73
+ - pattern : ' Aṯ Ṯ' # الث
74
+ result: ' aṯ Ṯ'
75
+ - pattern : ' Ad D' # الد
76
+ result: ' ad D'
77
+ - pattern : ' Aḏ Ḏ' # الذ
78
+ result: ' aḏ Ḏ'
79
+ - pattern : ' Ar R' # الر
80
+ result: ' ar R'
81
+ - pattern : ' Az Z' # الز
82
+ result: ' az Z'
83
+ - pattern : ' As S' # الس
84
+ result: ' as S'
85
+ - pattern : ' Aš Š' # الش
86
+ result: ' aš Š'
87
+ - pattern : ' Aṣ Ṣ' # الص
88
+ result: ' aṣ Ṣ'
89
+ - pattern : ' Aḍ Ḍ' # الض
90
+ result: ' aḍ Ḍ'
91
+ - pattern : ' Aṭ Ṭ' # الط
92
+ result: ' aṭ Ṭ'
93
+ - pattern : ' Aẓ Ẓ' # الظ
94
+ result: ' aẓ Ẓ'
95
+ - pattern : ' Al L' # الل
96
+ result: ' al L'
97
+ - pattern : ' An N' # الن
98
+ result: ' an N'
99
+ - pattern: " Al " # ال
100
+ result: " al "
101
+
102
+ characters:
103
+
104
+ # pointing
105
+ '\u064e' : 'a' # َ fatha
106
+ '\u064e(?=\u0629)' : '' # َ fatha followed by ta' marboota
107
+ '\u0650' : 'i' # ِ kasra
108
+ '\u064f' : 'u' # ُ damma
109
+ '\u0652' : '' # ْ sokoon, see note A below
110
+
111
+ # special pointed letters
112
+ # special pointed letters
113
+ '\u0639\u064e' : '‘a' # عَ
114
+ '\u0639\u0650' : '‘i' # عِ
115
+ '\u0639\u064f' : '‘ū' # عُ
116
+ # handle MacOS regex difference
117
+ '\u0639\u064f\u0648' : '‘ū' # عُو damma followed by و
118
+
119
+ '\u0650\u064a' : 'iy' # ـِي kasra followed by ي
120
+ '\u0650\u064a\u0651\u064e' : 'iy' # ـِيَّ
121
+ '\u0650\u064a(?=\u064e|\u064f)' : 'iy' # ـِي kasra followed by ي
122
+ '\u064e\u0627' : 'a’' # ـَا fatha followed by ا
123
+ '\u064e\u0649' : 'aỳ' # ـَى fatha followed by ى which is ا not ي
124
+ '\u064f\u0648' : 'uw' # ـُو damma followed by و
125
+ '\u064e\u0648\u0652' : 'aw' # ـَوْ
126
+ '\u064e\u064a\u0652' : 'ay' # ـَيْ
127
+
128
+ # Sun letters
129
+
130
+ '\b\u0627\u0644\u062a' : 'at t' # الت
131
+ '\b\u0627\u0644\u062b' : 'aṯ ṯ' # الث
132
+ '\b\u0627\u0644\u062f' : 'ad d' # الد
133
+ '\b\u0627\u0644\u0630' : 'aḏ ḏ' # الذ
134
+ '\b\u0627\u0644\u0631' : 'ar r' # الر
135
+ '\b\u0627\u0644\u0632' : 'az z' # الز
136
+ '\b\u0627\u0644\u0633' : 'as s' # الس
137
+ '\b\u0627\u0644\u0634' : 'aš š' # الش
138
+ '\b\u0627\u0644\u0635' : 'aṣ ṣ' # الص
139
+ '\b\u0627\u0644\u0636' : 'aḍ ḍ' # الض
140
+ '\b\u0627\u0644\u0637' : 'aṭ ṭ' # الط
141
+ '\b\u0627\u0644\u0638' : 'aẓ ẓ' # الظ
142
+ '\b\u0627\u0644\u0644' : 'al l' # الل
143
+ '\b\u0627\u0644\u0646' : 'an n' # الن
144
+
145
+ # ta' marboota in iso-233-1984 is all the same `aẗ`
146
+ '\u0629' : 'aẗ' # ة in the middle of the sentence
147
+
148
+ # Shadda
149
+
150
+
151
+ '\u0628\u0651' : 'bb' # ب
152
+ '\u062a\u0651' : 'tt' # ت
153
+ '\u062b\u0651' : 'ṯṯ' # ث
154
+ '\u062c\u0651' : 'ǧǧ' # ج
155
+ '\u062d\u0651' : 'ḥḥ' # ح
156
+ '\u062e\u0651' : 'ẖẖ' # خ
157
+ '\u062f\u0651' : 'dd' # د
158
+ '\u0630\u0651' : 'ḏḏ' # ذ
159
+ '\u0631\u0651' : 'rr' # ر
160
+ '\u0632\u0651' : 'zz' # ز
161
+ '\u0633\u0651' : 'ss' # س
162
+ '\u0634\u0651' : 'šš' # ش
163
+ '\u0635\u0651' : 'ṣṣ' # ص
164
+ '\u0636\u0651' : 'ḍḍ' # ض
165
+ '\u0637\u0651' : 'ṭṭ' # ط
166
+ '\u0638\u0651' : 'ẓẓ' # ظ
167
+ '\u063a\u0651' : 'ġġ' # غ
168
+ '\u0641\u0651' : 'ff' # ف
169
+ '\u0642\u0651' : 'qq' # ق
170
+ '\u0643\u0651' : 'kk' # ك
171
+ '\u0644\u0651' : 'll' # ل
172
+ '\u0645\u0651' : 'mm' # م
173
+ '\u0646\u0651' : 'nn' # ن
174
+ '\u0647\u0651' : 'hh' # ه
175
+ '\u0648\u0651' : 'ww' # و
176
+ '\u064a\u0651' : 'yy' # ي
177
+
178
+
179
+ '\u0622' : '’â' # آ
180
+
181
+ '\u0627' : 'â' # ا
182
+
183
+ '\u0649' : 'ỳ' # ى
184
+
185
+ '\u0626' : "'" # ئ
186
+
187
+
188
+ '\u0621' : # ء
189
+ - '’'
190
+ - '' # see note A
191
+
192
+ '\u0623' : 'a' # أ
193
+
194
+ # See note B
195
+ '\b\u0627\u0644' : 'al ' # ال
196
+ # '\uFE8E' : '' # ﺎ
197
+
198
+ '\u0628' : 'b' # ب
199
+ '\uFE91' : 'b' # ﺑ
200
+ '\uFE92' : 'b' # ﺒ
201
+ '\uFE90' : 'b' # ﺐ
202
+
203
+ # See note C
204
+ '\u062a' : 't' # ت
205
+ '\ufe97' : 't' # ﺗ
206
+ '\ufe98' : 't' # ﺘ
207
+ '\ufe96' : 't' # ﺖ
208
+
209
+ '\u062b' : 'ṯ' # ث
210
+ '\ufe9b' : 'ṯ' # ﺛ
211
+ '\ufe9c' : 'ṯ' # ﺜ
212
+ '\ufe9a' : 'ṯ' # ﺚ
213
+
214
+ '\u062c' : 'ǧ' # ج
215
+ '\ufe9f' : 'ǧ' # ﺟ
216
+ '\ufea0' : 'ǧ' # ﺠ
217
+ '\ufe9e' : 'ǧ' # ﺞ
218
+
219
+ '\u062d' : 'ḥ' # ح
220
+ '\ufea3' : 'ḥ' # ﺣ
221
+ '\ufea4' : 'ḥ' # ﺤ
222
+ '\ufea2' : 'ḥ' # ﺢ
223
+
224
+ '\u062e' : 'ẖ' # خ
225
+ '\ufea7' : 'ẖ' # ﺧ
226
+ '\ufea8' : 'ẖ' # ﺨ
227
+ '\ufea6' : 'ẖ' # ﺦ
228
+
229
+ '\u062f' : 'd' # د
230
+ '\ufeaa' : 'd' # ﺪ
231
+
232
+ '\u0630' : 'ḏ' # ذ
233
+ '\ufeac' : 'ḏ' # ﺬ
234
+
235
+ '\u0631' : 'r' # ر
236
+ '\ufeae' : 'r' # ﺮ
237
+
238
+ '\u0632' : 'z' # ز
239
+ '\ufeb0' : 'z' # ﺰ
240
+
241
+ '\u0633' : 's' # س
242
+ '\ufeb3' : 's' # ﺳ
243
+ '\ufeb4' : 's' # ﺴ
244
+ '\ufeb2' : 's' # ﺲ
245
+
246
+ '\u0634' : 'š' # ش
247
+ '\ufeb7' : 'š' # ﺷ
248
+ '\ufeb8' : 'š' # ﺸ
249
+ '\ufeb6' : 'š' # ﺶ
250
+
251
+ '\u0635' : 'ṣ' # ص
252
+ '\ufebb' : 'ṣ' # ﺻ
253
+ '\ufebc' : 'ṣ' # ﺼ
254
+ '\ufeba' : 'ṣ' # ﺺ
255
+
256
+ '\u0636' : 'ḍ' # ض
257
+ '\ufebf' : 'ḍ' # ﺿ
258
+ '\ufec0' : 'ḍ' # ﻀ
259
+ '\ufebe' : 'ḍ' # ﺾ
260
+
261
+ '\u0637' : 'ṭ' # ط
262
+ '\ufec3' : 'ṭ' # ﻃ
263
+ '\ufec4' : 'ṭ' # ﻄ
264
+ '\ufec2' : 'ṭ' # ﻂ
265
+
266
+ '\u0638' : 'ẓ' # ظ
267
+ '\ufec7' : 'ẓ' # ﻇ
268
+ '\ufec8' : 'ẓ' # ﻈ
269
+ '\ufec6' : 'ẓ' # ﻆ
270
+
271
+ '\u0639' : '‘' # ع
272
+ '\ufecb' : '‘' # ﻋ
273
+ '\ufecc' : '‘' # ﻌ
274
+ '\ufeca' : '‘' # ﻊ
275
+
276
+ '\u063a' : 'ġ' # غ
277
+ '\ufecf' : 'ġ' # ﻏ
278
+ '\ufed0' : 'ġ' # ﻐ
279
+ '\ufece' : 'ġ' # ﻎ
280
+
281
+ '\u0641' : 'f' # ف
282
+ '\ufed3' : 'f' # ﻓ
283
+ '\ufed4' : 'f' # ﻔ
284
+ '\ufed2' : 'f' # ﻒ
285
+
286
+ '\u0642' : 'q' # ق
287
+ '\ufed7' : 'q' # ﻗ
288
+ '\ufed8' : 'q' # ﻘ
289
+ '\ufed6' : 'q' # ﻖ
290
+
291
+ '\u0643' : 'k' # ك
292
+ '\ufedb' : 'k' # ﻛ
293
+ '\ufedc' : 'k' # ﻜ
294
+ '\ufeda' : 'k' # ﻚ
295
+
296
+ '\u0644' : 'l' # ل
297
+ '\ufedf' : 'l' # ﻟ
298
+ '\ufee0' : 'l' # ﻠ
299
+ '\ufede' : 'l' # ﻞ
300
+
301
+ '\u0645' : 'm' # م
302
+ '\ufee3' : 'm' # ﻣ
303
+ '\ufee4' : 'm' # ﻤ
304
+ '\ufee2' : 'm' # ﻢ
305
+
306
+ '\u0646' : 'n' # ن
307
+ '\ufee7' : 'n' # ﻧ
308
+ '\ufee8' : 'n' # ﻨ
309
+ '\ufee6' : 'n' # ﻦ
310
+
311
+ # See note C
312
+ '\u0647' : 'h' # ه
313
+ '\ufeeb' : 'h' # ﻫ
314
+ '\ufeec' : 'h' # ﻬ
315
+ '\ufeea' : 'h' # ﻪ
316
+
317
+ '\u0648' : 'w' # و
318
+ '\ufeee' : 'w' # ﻮ
319
+
320
+ '\u064a' : 'y' # ي
321
+ '\ufef3' : 'y' # ﻳ
322
+ '\ufef4' : 'y' # ﻴ
323
+ '\ufef1' : 'y' # ﻱ