RubyGems - interscript - Versions diffs - 0.1.4 → 0.1.5 - Mend

interscript 0.1.4 → 0.1.5

Files changed (25) hide show

checksums.yaml +4 -4
data/README.adoc +9 -8
data/lib/__pycache__/g2pwrapper.cpython-38.pyc +0 -0
data/lib/interscript-opal.rb +2 -0
data/lib/interscript.rb +46 -56
data/lib/interscript/command.rb +3 -2
data/lib/interscript/fs.rb +69 -0
data/lib/interscript/mapping.rb +35 -18
data/lib/interscript/opal.rb +23 -0
data/lib/interscript/opal/maps.js.erb +7 -0
data/lib/interscript/opal_map_translate.rb +12 -0
data/lib/interscript/version.rb +1 -1
data/maps/{bgnpcgn-chn-Hans-Latn-1979.yaml → bgnpcgn-zho-Hans-Latn-1979.yaml} +1 -1
data/maps/odni-aze-Cyrl-Latn-2015.yaml +144 -0
data/maps/odni-kaz-Cyrl-Latn-2015.yaml +148 -0
data/maps/odni-kir-Cyrl-Latn-2015.yaml +136 -0
data/maps/odni-mkd-cyrl-latn-2015.yaml +122 -0
data/maps/odni-tat-Cyrl-Latn-2015.yaml +142 -0
data/maps/odni-tgk-Cyrl-Latn-2015.yaml +148 -0
data/maps/odni-uig-Cyrl-Latn-2015.yaml +138 -0
data/maps/ses-ara-arab-latn-1930.yaml +275 -0
data/maps/un-ara-Arab-Latn-1971.yaml +127 -0
data/maps/un-ara-Arab-Latn-1972.yaml +152 -0
data/maps/un-ara-Arab-Latn-2017.yaml +383 -0
metadata +89 -2

data/maps/ses-ara-arab-latn-1930.yaml ADDED

@@ -0,0 +1,275 @@
+---
+# Metadata for the Survey of Egypt System (SES) map. The rules under `map`
+# are applied on top of the map named in `map.inherit`.
+authority_id: ses
+id: 1930
+language: ara
+source_script: Arab
+destination_script: Latn
+name: Survey of Egypt System (SES) of romanization
+url: http://www.eki.ee/wgrs/rom1_ar.pdf
+creation_date: 1930
+confirmation date: 2018-06
+description: |
+  The current United Nations recommended romanization
+  system was approved in 2017 (resolution XI/3), based on
+  the system adopted by Arabic experts at the conference
+  held in Beirut in 2007, the Unified Arabic
+  Transliteration System, taking into account the
+  practical amendments and corrections carried out and
+  agreed upon by the representatives of the Arabic-
+  speaking countries at the Fourth Arab Conference on
+  Geographical Names, held in Beirut in 2008, and some
+  clarifications and amendments agreed in Riyadh in 2017.
+  Previously, the United Nations had approved a
+  romanization system in 1972 (resolution II/8), based on the
+  system adopted by Arabic experts at the conference
+  held at Beirut in 1971 with the practical amendments carried out
+  and agreed upon by the representatives of the Arabic-speaking
+  countries at their conference. The table was published in volume
+  II of the conference report.
+  In UN resolution XI/3 it is specifically stated that the
+  system was recommended for the “romanization of the
+  geographical names within those Arabic-speaking countries
+  where this system is officially adopted”. There is
+  evidence of its partial implementation in Jordan, Oman and
+  Saudi Arabia. The UNGEGN Working Group on Romanization
+  Systems intends to continue monitoring the UN system’s
+  implementation across Arabic-speaking countries.
+  In some countries there exist local romanization schemes
+  or practices. The geographical names of Algeria, Djibouti,
+  Mauritania, Morocco and Tunisia are generally rendered in
+  the traditional manner which conforms to the principles of
+  the French orthography.
+  The previous UN-approved system is still found in
+  considerable international usage.
+  Arabic is written from right to left. The Arabic script
+  usually omits vowel points and diacritical marks from
+  writing which makes it difficult to obtain uniform results
+  in the romanization of Arabic. It is essential to identify
+  correctly the words which appear in any particular name
+  and to know the standard Arabic-script spelling including
+  the relevant vowels. One must also take into account
+  dialectal and idiosyncratic deviations. The romanization
+  is generally reversible though there may be some ambiguous
+  letter sequences (dh, kh, sh, th) which may also point to
+  combinations of Arabic characters in addition to the
+  respective single characters.
+notes:
+  - |
+    The Survey of Egypt System (SES) of romanization has the following correspondences with
+    the UN system:
+    á = a #  ـَى fatha followed by ى which is ا not ي
+    ā = â (a) # ـَا fatha followed by alef // آ
+    -ah (ة-) = a # ة ta' marboota at the end of a sentence
+    aw = ô (au) # ـَوْ
+    ay = ei (ai) # ـَيْ
+    ḏ = ḍ # ض
+    dh = dh (z) # ذ
+    d͟h = ẓ (d) # ظ
+    ẖ = ḥ # ح
+    ī = î
+    j = g (j)
+    q = q (k)
+    s = s (c)
+    s̱ = ṣ
+    ṯ = ṭ
+    th = th (t)
+    ū = û
+    ‘ = ‛
+  - |
+    The variants in parentheses are used depending on pronunciation and tradition. Not all the
+    variations have been given above. The article is always written el- (El-Kafr el-Qadîm, Sharm
+    el-Sheikh).
+tests:
+  # Examples taken from:
+  # https://unstats.un.org/unsd/geoinfo/geonames/
+  - source: شَرم الشَيْخ
+    expected: sharm el-sheikh
+  - source: الكَفر القَدِيم
+    expected: el-kafr el-qadîm
+map:
+  inherit: "un-ara-Arab-Latn-2017"
+  characters:
+    # special pointed letters: ع followed by a short vowel mark
+    '\u0639\u064e' : '‛a'  # عَ
+    '\u0639\u0650' : '‛i'  # عِ
+    # short damma is a plain 'u'; the long 'û' is only produced by the
+    # damma + و rules below
+    '\u0639\u064f' : '‛u'  # عُ
+    # handle MacOS regex difference
+    '\u0639\u064f\u0648' : '‛û'  # عُو damma followed by و
+    '\u0650\u064a' : 'î' # ـِي kasra followed by ي
+    '\u0650\u064a\u0651\u064e' : 'îy' # ـِيَّ
+    '\u064f\u0648' : 'û'  # ـُو damma followed by و
+    '\u064e\u0627' :   # ـَا fatha followed by ا
+      - 'â'
+      - 'a'
+    '\u064e\u0649' : 'a'  # ـَى fatha followed by ى which is ا not ي
+    '\u064e\u0648\u0652' :  # ـَوْ
+      - 'ô'
+      - 'au'
+    '\u064e\u064a\u0652' :   # ـَيْ
+      - 'ei'
+      - 'ai'
+    '\u0622' :   # آ
+      - 'â'
+      - 'a'
+    # ta' marboota
+    # ة (U+0629) directly before the `$` anchor is written 'a'
+    '\u0629$' : 'a'
+    # ة preceded by a word-initial ال and then exactly N characters from the
+    # U+0600-U+06FF range is also written 'a'; one rule per N = 2..13.
+    # NOTE(review): presumably spelled out one width at a time because the
+    # regex engine only accepts fixed-width lookbehinds - confirm.
+    '(?<=\b\u0627\u0644[\u0600-\u06ff]{2})\u0629' : 'a'
+    '(?<=\b\u0627\u0644[\u0600-\u06ff]{3})\u0629' : 'a'
+    '(?<=\b\u0627\u0644[\u0600-\u06ff]{4})\u0629' : 'a'
+    '(?<=\b\u0627\u0644[\u0600-\u06ff]{5})\u0629' : 'a'
+    '(?<=\b\u0627\u0644[\u0600-\u06ff]{6})\u0629' : 'a'
+    '(?<=\b\u0627\u0644[\u0600-\u06ff]{7})\u0629' : 'a'
+    '(?<=\b\u0627\u0644[\u0600-\u06ff]{8})\u0629' : 'a'
+    '(?<=\b\u0627\u0644[\u0600-\u06ff]{9})\u0629' : 'a'
+    '(?<=\b\u0627\u0644[\u0600-\u06ff]{10})\u0629' : 'a'
+    '(?<=\b\u0627\u0644[\u0600-\u06ff]{11})\u0629' : 'a'
+    '(?<=\b\u0627\u0644[\u0600-\u06ff]{12})\u0629' : 'a'
+    '(?<=\b\u0627\u0644[\u0600-\u06ff]{13})\u0629' : 'a'
+    # Sun letters
+    '\b\u0627\u0644\u062a' : 'el-t'  # الت
+    '\b\u0627\u0644\u062b' :  # الث
+      - 'el-th'
+      - 'el-t'
+    '\b\u0627\u0644\u062f' : 'el-d' # الد
+    '\b\u0627\u0644\u0630' :  # الذ
+      - 'el-dh'
+      - 'el-z'
+    '\b\u0627\u0644\u0631' : 'el-r' # الر
+    '\b\u0627\u0644\u0632' : 'el-z'  # الز
+    '\b\u0627\u0644\u0633' : # الس
+      - 'el-s'
+      - 'el-c'
+    '\b\u0627\u0644\u0634' : 'el-sh' # الش
+    '\b\u0627\u0644\u0635' : 'el-ṣ'  # الص
+    '\b\u0627\u0644\u0636' : 'el-ḍ'  # الض
+    '\b\u0627\u0644\u0637' : 'el-ṭ'  # الط
+    '\b\u0627\u0644\u0638' :  # الظ
+      - 'el-ẓ'
+      - 'el-d'
+    '\b\u0627\u0644\u0644' : 'el-l'  # الل
+    '\b\u0627\u0644\u0646' : 'el-n'  # الن
+    # shadda: a consonant followed by U+0651 is written doubled; where two
+    # values are listed they are the variants given in the notes
+    '\u062b\u0651' :  # ث
+      - 'thth'
+      - 'tt'
+    '\u062c\u0651' :  # ج
+      - 'gg'
+      - 'jj'
+    '\u062d\u0651' : 'ḥḥ'  # ح
+    '\u062e\u0651' : 'khkh'  # خ
+    '\u0633\u0651' :  # س
+      - 'ss'
+      - 'cc'
+    '\u0635\u0651' : 'ṣṣ'  # ص
+    '\u0636\u0651' : 'ḍḍ'  # ض
+    '\u0637\u0651' : 'ṭṭ'  # ط
+    '\u0638\u0651' :  # ظ
+      - 'ẓẓ'
+      - 'dd'
+    '\u0642\u0651' :  # ق
+      - 'qq'
+      - 'kk'
+    # definite article: word-initial ال with no specific following letter.
+    # NOTE(review): presumably the fallback when none of the longer
+    # article + sun letter keys applies - confirm the engine's match order.
+    '\b\u0627\u0644' : 'el-'  # ال
+    # normal letters
+    '\u062c' : # ج
+      - 'g'
+      - 'j'
+    '\ufe9f' :  # ﺟ
+      - 'g'
+      - 'j'
+    '\ufea0' :  # ﺠ
+      - 'g'
+      - 'j'
+    '\ufe9e' :  # ﺞ
+      - 'g'
+      - 'j'
+    '\u062d' : 'ḥ' # ح
+    '\ufea3' : 'ḥ' # ﺣ
+    '\ufea4' : 'ḥ' # ﺤ
+    '\ufea2' : 'ḥ' # ﺢ
+    '\u062e' : 'kh'  # خ
+    '\ufea7' : 'kh'  # ﺧ
+    '\ufea8' : 'kh'  # ﺨ
+    '\ufea6' : 'kh'  # ﺦ
+    '\u0630' : # ذ
+      - 'dh'
+      - 'z'
+    '\ufeac' :  # ﺬ
+      - 'dh'
+      - 'z'
+    '\u0633' :  # س
+      - 's'
+      - 'c'
+    '\ufeb3' :  # ﺳ
+      - 's'
+      - 'c'
+    '\ufeb4' :  # ﺴ
+      - 's'
+      - 'c'
+    '\ufeb2' :  # ﺲ
+      - 's'
+      - 'c'
+    '\u0635' : 'ṣ'  # ص
+    '\ufebb' : 'ṣ'  # ﺻ
+    '\ufebc' : 'ṣ'  # ﺼ
+    '\ufeba' : 'ṣ'  # ﺺ
+    '\u0636' : 'ḍ'  # ض
+    '\ufebf' : 'ḍ'  # ﺿ
+    '\ufec0' : 'ḍ'  # ﻀ
+    '\ufebe' : 'ḍ'  # ﺾ
+    '\u0637' : 'ṭ'  # ط
+    '\ufec3' : 'ṭ'  # ﻃ
+    '\ufec4' : 'ṭ'  # ﻄ
+    '\ufec2' : 'ṭ'  # ﻂ
+    '\u0639' : '‛'  # ع
+    '\ufecb' : '‛'  # ﻋ
+    '\ufecc' : '‛'  # ﻌ
+    '\ufeca' : '‛'  # ﻊ
+    # ظ and its presentation forms U+FEC7, U+FEC8, U+FEC6
+    '\u0638' :  # ظ
+      - 'ẓ'
+      - 'd'
+    '\ufec7' :  # ﻇ
+      - 'ẓ'
+      - 'd'
+    '\ufec8' :  # ﻈ
+      - 'ẓ'
+      - 'd'
+    '\ufec6' :  # ﻆ
+      - 'ẓ'
+      - 'd'
+    '\u0642' :  # ق
+      - 'q'
+      - 'k'
+    '\ufed7' :  # ﻗ
+      - 'q'
+      - 'k'
+    '\ufed8' :  # ﻘ
+      - 'q'
+      - 'k'
+    '\ufed6' :  # ﻖ
+      - 'q'
+      - 'k'

data/maps/un-ara-Arab-Latn-1971.yaml ADDED

@@ -0,0 +1,127 @@
+---
+authority_id: ungegn
+id: 1971
+language: ara
+source_script: Arab
+destination_script: Latn
+name: 1971 "Beirut system"
+url: https://unstats.un.org/unsd/geoinfo/UNGEGN/docs/2nd-uncsgn-docs/E_Conf61_4_Add1_e.pdf
+creation_date: 1971
+confirmation date: 2018-06
+description: |
+  The current United Nations recommended romanization
+  system was approved in 2017 (resolution XI/3), based on
+  the system adopted by Arabic experts at the conference
+  held in Beirut in 2007, the Unified Arabic
+  Transliteration System, taking into account the
+  practical amendments and corrections carried out and
+  agreed upon by the representatives of the Arabic-
+  speaking countries at the Fourth Arab Conference on
+  Geographical Names, held in Beirut in 2008, and some
+  clarifications and amendments agreed in Riyadh in 2017.
+  Previously, the United Nations had approved a
+  romanization system in 1972 (resolution II/8), based on the
+  system adopted by Arabic experts at the conference
+  held at Beirut in 1971 with the practical amendments carried out
+  and agreed upon by the representatives of the Arabic-speaking
+  countries at their conference. The table was published in volume
+  II of the conference report.
+  In UN resolution XI/3 it is specifically stated that the
+  system was recommended for the “romanization of the
+  geographical names within those Arabic-speaking countries
+  where this system is officially adopted”. There is
+  evidence of its partial implementation in Jordan, Oman and
+  Saudi Arabia. The UNGEGN Working Group on Romanization
+  Systems intends to continue monitoring the UN system’s
+  implementation across Arabic-speaking countries.
+  In some countries there exist local romanization schemes
+  or practices. The geographical names of Algeria, Djibouti,
+  Mauritania, Morocco and Tunisia are generally rendered in
+  the traditional manner which conforms to the principles of
+  the French orthography.
+  The previous UN-approved system is still found in
+  considerable international usage.
+  Arabic is written from right to left. The Arabic script
+  usually omits vowel points and diacritical marks from
+  writing which makes it difficult to obtain uniform results
+  in the romanization of Arabic. It is essential to identify
+  correctly the words which appear in any particular name
+  and to know the standard Arabic-script spelling including
+  the relevant vowels. One must also take into account
+  dialectal and idiosyncratic deviations. The romanization
+  is generally reversible though there may be some ambiguous
+  letter sequences (dh, kh, sh, th) which may also point to
+  combinations of Arabic characters in addition to the
+  respective single characters.
+notes:
+  - |
+    ث is t͟h (th with sub macron)
+    خ is k͟h (kh with sub macron)
+    ذ is d͟h (dh with sub macron)
+    ش is s͟h (sh with sub macron)
+    ظ is z͟h (zh with sub macron)
+    غ is g͟h (gh with sub macron)
+    The previous UN 1972 System had the following differences:
+    the character (ظ) was romanized as z̧ instead of d͟h;
+    the cedilla (¸) was used instead of sub-macron (_) in all characters with sub-macrons.
+tests:
+  # Examples taken from:
+  # https://unstats.un.org/unsd/geoinfo/UNGEGN/docs/2nd-uncsgn-docs/E_Conf61_4_Add1_e.pdf
+  # page 31 (38 digital)
+  - source: خَيبَر
+    expected: k͟haybar
+  - source: ظَهران
+    expected: z͟hahrān
+  - source: القُدس
+    expected: al quds
+# Rules specific to the 1971 system, applied on top of the inherited map.
+# They cover the six letters whose romanization uses the sub-macron digraphs
+# listed in the notes (t͟h, k͟h, d͟h, s͟h, z͟h, g͟h).
+map:
+  inherit: "un-ara-Arab-Latn-2017"
+  characters:
+    # sun letters: word-initial article ال before the letter
+    '\b\u0627\u0644\u062b' : 'at͟h t͟h'  # الث
+    '\b\u0627\u0644\u0630' : 'ad͟h d͟h'  # الذ
+    '\b\u0627\u0644\u0634' : 'as͟h s͟h' # الش
+    '\b\u0627\u0644\u0638' : 'az͟h z͟h' # الظ
+    # shadda: the letter followed by U+0651 is written doubled
+    '\u062e\u0651' : 'k͟hk͟h'  # خ
+    '\u0630\u0651' : 'd͟hd͟h'  # ذ
+    '\u0634\u0651' : 's͟hs͟h'  # ش (doubled like the other shadda entries)
+    '\u0638\u0651' : 'z͟hz͟h'  # ظ
+    '\u063a\u0651' : 'g͟hg͟h'  # غ
+    # plain letters and their presentation forms
+    '\u062b' : 't͟h'  # ث
+    '\ufe9b' : 't͟h'  # ﺛ
+    '\ufe9c' : 't͟h'  # ﺜ
+    '\ufe9a' : 't͟h'  # ﺚ
+    '\u062e' : 'k͟h'  # خ
+    '\ufea7' : 'k͟h'  # ﺧ
+    '\ufea8' : 'k͟h'  # ﺨ
+    '\ufea6' : 'k͟h'  # ﺦ
+    '\u063a' : 'g͟h'  # غ
+    '\ufecf' : 'g͟h'  # ﻏ
+    '\ufed0' : 'g͟h'  # ﻐ
+    '\ufece' : 'g͟h'  # ﻎ
+    '\u0630' : 'd͟h'  # ذ
+    '\ufeac' : 'd͟h'  # ﺬ
+    '\u0634' : 's͟h'  # ش
+    '\ufeb7' : 's͟h'  # ﺷ
+    '\ufeb8' : 's͟h'  # ﺸ
+    '\ufeb6' : 's͟h'  # ﺶ
+    '\u0638' : 'z͟h'  # ظ
+    '\ufec7' : 'z͟h'  # ﻇ
+    '\ufec8' : 'z͟h'  # ﻈ
+    '\ufec6' : 'z͟h'  # ﻆ

data/maps/un-ara-Arab-Latn-1972.yaml ADDED

@@ -0,0 +1,152 @@
+---
+authority_id: ungegn
+id: 1972
+language: ara
+source_script: Arab
+destination_script: Latn
+name: ROMANIZATION OF ARABIC -- UNGEGN 1972 System
+url: http://www.eki.ee/wgrs/obs_rom_vers/rom1_ar_v4_0.pdf
+creation_date: 1972
+confirmation date: 2018-06
+description: |
+  The United Nations recommended romanization
+  system was approved in 1972 (resolution II/8),
+  based on the system adopted by Arabic experts at
+  the conference held at Beirut in 1971 with the
+  practical amendments carried out and agreed upon
+  by the representatives of the Arabic-speaking
+  countries at their conference. The table was
+  published in volume II of the conference report.
+  In the UN resolution it was specifically
+  pointed out that the system was recommended "for
+  the romanization of the geographical names within
+  those Arabic-speaking countries where this system
+  is officially acknowledged". It cannot be
+  definitely ascertained which of the
+  Arabic-speaking countries have adopted this system
+  officially, especially since 2007 when there are
+  efforts by the Arabic Division to promote a
+  modification of the UN system (ADEGN
+  romanization, see the section on other
+  romanization systems below), with varying
+  success. Judging by the use of names in
+  international cartographic products which rely
+  mostly on national sources it appears that the UN
+  system or its modification is more or less
+  current in Iraq, Kuwait, Libya, Saudi Arabia,
+  United Arab Emirates and Yemen, there and in some
+  other countries the system is often used without
+  diacritical marks. For the geographical names of
+  the Syrian Arab Republic international maps
+  favour the UN system while the local usage seems
+  to prefer a French-oriented romanization. Also in
+  Egypt and Sudan there exist local romanization
+  schemes or practices side by side with the UN
+  system. The geographical names of Algeria,
+  Djibouti, Mauritania, Morocco and Tunisia are
+  generally rendered in the traditional manner
+  which conforms to the principles of the French
+  orthography. Resolution 7 of the Seventh UN
+  Conference on the Standardization of Geographical
+  Names (1998) recommended that "the League of Arab
+  States should, through its specialized
+  structures, continue its efforts to organize a
+  conference with a view to considering the
+  difficulties encountered in applying the amended
+  Beirut system of 1972 for the romanization of
+  Arabic script, and submit, as soon as possible, a
+  solution to the United Nations Group of Experts
+  on Geographical Names". At the Eighth UN
+  Conference on the Standardization of Geographical
+  Names (2002), the Arabic Division of the UN Group
+  of Experts announced that it had finalised
+  proposed modifications to the UN recommended
+  romanization system. These proposals would be
+  submitted to the League of Arab States for
+  approval. Arabic is written from right to left.
+  The Arabic script usually omits vowel points and
+  diacritical marks from writing which makes it
+  difficult to obtain uniform results in the
+  romanization of Arabic. It is essential to
+  identify correctly the words which appear in any
+  particular name and to know the standard Arabic-
+  script spelling including proper pointing. One
+  must also take into account dialectal and
+  idiosyncratic deviations. The romanization is
+  generally reversible though there are some
+  ambiguous letter sequences (dh, kh, sh, th) which
+  may also point to combinations of Arabic
+  characters in addition to the respective single
+  characters.
+notes:
+  - |
+    The previous UN 1972 System had the following differences:
+    the character (ظ) was romanized as z̧ instead of d͟h;
+    ح, ص, ض the cedilla (¸) was used instead of sub-macron (_) in all characters with sub-macrons.
+  - |
+    When the definite article al precedes a word beginning with one of the "sun letters" (t,
+    th, d, dh, r, z, s, sh, ş, ḑ, ţ, z̧, l, n) the l of the definite article is assimilated with the first
+    consonant of the word: الشارقة ash-Shāriqah.
+tests:
+  # Examples taken from:
+  # https://unstats.un.org/unsd/geoinfo/geonames/
+  - source: مِصر
+    expected: mişr
+  - source: قَطَر
+    expected: qaţar
+  - source: الجُمهُورِيَّة العِراقِيَّة
+    expected: al jumhūrīyah al ‘irāqīyah
+  - source: جُمهُورِيَّة مِصر العَرَبِيَّة
+    expected: jumhūrīyat mişr al ‘arabīyah
+  - source: الرِيَاض
+    expected: ar riyāḑ
+  - source: الشارِقة
+    expected: ash shāriqah
+# Rules specific to the 1972 system, applied on top of the inherited map.
+# They cover the letters written with a cedilla (ḩ, ş, ḑ, ţ, z̧).
+map:
+  inherit: "un-ara-Arab-Latn-2017"
+  characters:
+    # sun letters: word-initial article ال before the letter
+    '\b\u0627\u0644\u0635' : 'aş ş'  # الص
+    '\b\u0627\u0644\u0636' : 'aḑ ḑ'  # الض
+    '\b\u0627\u0644\u0637' : 'aţ ţ'  # الط
+    # ظ is overridden below as a plain letter and under shadda, so the
+    # article form needs the same cedilla spelling.
+    # NOTE(review): assumes the inherited article rule for ظ would otherwise
+    # win - confirm against un-ara-Arab-Latn-2017.
+    '\b\u0627\u0644\u0638' : 'az̧ z̧'  # الظ
+    # shadda: the letter followed by U+0651 is written doubled
+    '\u062d\u0651' : 'ḩḩ'  # ح
+    '\u0635\u0651' : 'şş'  # ص
+    '\u0636\u0651' : 'ḑḑ'  # ض
+    '\u0637\u0651' : 'ţţ'  # ط
+    '\u0638\u0651' : 'z̧z̧'  # ظ
+    # plain letters and their presentation forms
+    '\u062d' : 'ḩ' # ح
+    '\ufea3' : 'ḩ' # ﺣ
+    '\ufea4' : 'ḩ' # ﺤ
+    '\ufea2' : 'ḩ' # ﺢ
+    '\u0635' : 'ş'  # ص
+    '\ufebb' : 'ş'  # ﺻ
+    '\ufebc' : 'ş'  # ﺼ
+    '\ufeba' : 'ş'  # ﺺ
+    '\u0636' : 'ḑ'  # ض
+    '\ufebf' : 'ḑ'  # ﺿ
+    '\ufec0' : 'ḑ'  # ﻀ
+    '\ufebe' : 'ḑ'  # ﺾ
+    '\u0637' : 'ţ'  # ط
+    '\ufec3' : 'ţ'  # ﻃ
+    '\ufec4' : 'ţ'  # ﻄ
+    '\ufec2' : 'ţ'  # ﻂ
+    '\u0638' : 'z̧'  # ظ
+    '\ufec7' : 'z̧'  # ﻇ
+    '\ufec8' : 'z̧'  # ﻈ
+    '\ufec6' : 'z̧'  # ﻆ