interscript 0.1.1 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (170) hide show
  1. checksums.yaml +4 -4
  2. data/README.adoc +250 -17
  3. data/bin/interscript +38 -17
  4. data/bin/setup +8 -0
  5. data/lib/__pycache__/g2pwrapper.cpython-38.pyc +0 -0
  6. data/lib/g2pwrapper.py +34 -0
  7. data/lib/interscript-opal.rb +2 -0
  8. data/lib/interscript.rb +138 -20
  9. data/lib/interscript/command.rb +28 -0
  10. data/lib/interscript/fs.rb +71 -0
  11. data/lib/interscript/mapping.rb +142 -0
  12. data/lib/interscript/opal.rb +27 -0
  13. data/lib/interscript/opal/maps.js.erb +10 -0
  14. data/lib/interscript/opal_map_translate.rb +12 -0
  15. data/lib/interscript/version.rb +1 -1
  16. data/lib/model-7 +0 -0
  17. data/lib/tha-pt-b-7 +0 -0
  18. data/maps/acadsin-zho-Hani-Latn-2002.yaml +38912 -0
  19. data/maps/alalc-amh-Ethi-Latn-1997.yaml +509 -0
  20. data/maps/alalc-amh-Ethi-Latn-2011.yaml +138 -0
  21. data/maps/alalc-ara-Arab-Latn-1997.yaml +1283 -0
  22. data/maps/alalc-asm-Deva-Latn-1997.yaml +159 -0
  23. data/maps/alalc-aze-Cyrl-Latn-1997.yaml +141 -0
  24. data/maps/alalc-bel-Cyrl-Latn-1997.yaml +125 -0
  25. data/maps/alalc-ben-Beng-Latn-2017.yaml +130 -0
  26. data/maps/alalc-bul-Cyrl-Latn-1997.yaml +94 -0
  27. data/maps/alalc-ell-Grek-Latn-1997.yaml +624 -0
  28. data/maps/alalc-ell-Grek-Latn-2010.yaml +627 -0
  29. data/maps/alalc-hin-Deva-Latn-2020.yaml +159 -0
  30. data/maps/alalc-kat-Geok-Latn-1997.yaml +111 -0
  31. data/maps/alalc-kat-Geor-Latn-1997.yaml +146 -0
  32. data/maps/alalc-kor-Hang-Latn-1997.yaml +94 -0
  33. data/maps/alalc-mar-Deva-Latn-1997.yaml +170 -0
  34. data/maps/alalc-mkd-Cyrl-Latn-1997.yaml +114 -0
  35. data/maps/alalc-mkd-Cyrl-Latn-2013.yaml +103 -0
  36. data/maps/alalc-pan-Deva-Latn-1997.yaml +237 -0
  37. data/maps/alalc-rus-Cyrl-Latn-1997.yaml +221 -0
  38. data/maps/alalc-rus-Cyrl-Latn-2012.yaml +162 -0
  39. data/maps/alalc-srp-Cyrl-Latn-1997.yaml +114 -0
  40. data/maps/alalc-srp-Cyrl-Latn-2013.yaml +135 -0
  41. data/maps/alalc-ukr-Cyrl-Latn-1997.yaml +141 -0
  42. data/maps/alalc-ukr-Cyrl-Latn-2011.yaml +16 -0
  43. data/maps/apcbg-bul-Cyrl-Latn-1995.yaml +283 -0
  44. data/maps/bas-rus-Cyrl-Latn-2017-bss.yaml +174 -0
  45. data/maps/bas-rus-Cyrl-Latn-2017-oss.yaml +169 -0
  46. data/maps/bgn-jpn-Hrkt-Latn-1962.yaml +292 -0
  47. data/maps/bgn-kor-Hang-Latn-1943.yaml +31 -0
  48. data/maps/bgn-kor-Kore-Latn-1943.yaml +31 -0
  49. data/maps/bgna-bul-Cyrl-Latn-2006.yaml +208 -0
  50. data/maps/bgna-bul-Cyrl-Latn-2009.yaml +208 -0
  51. data/maps/bgnpcgn-amh-Ethi-Latn-1967.yaml +528 -0
  52. data/maps/bgnpcgn-ara-Arab-Latn-1956.yaml +592 -0
  53. data/maps/bgnpcgn-arm-Armn-Latn-1981.yaml +108 -0
  54. data/maps/bgnpcgn-aze-Cyrl-Latn-1993.yaml +104 -0
  55. data/maps/bgnpcgn-bak-Cyrl-Latn-2007.yaml +184 -0
  56. data/maps/bgnpcgn-bel-Cyrl-Latn-1979.yaml +285 -0
  57. data/maps/bgnpcgn-bul-Cyrl-Latn-1952.yaml +115 -0
  58. data/maps/bgnpcgn-bul-Cyrl-Latn-2013.yaml +38 -0
  59. data/maps/bgnpcgn-ell-Grek-Latn-1962.yaml +701 -0
  60. data/maps/bgnpcgn-ell-Grek-Latn-1996.yaml +19 -0
  61. data/maps/bgnpcgn-jpn-Hrkt-Latn-1976.yaml +257 -0
  62. data/maps/bgnpcgn-kat-Geor-Latn-1981.yaml +127 -0
  63. data/maps/bgnpcgn-kat-Geor-Latn-2009.yaml +42 -0
  64. data/maps/bgnpcgn-kor-Hang-Latn-kn-1945.yaml +253 -0
  65. data/maps/bgnpcgn-kor-Hang-Latn-rok-2011.yaml +48 -0
  66. data/maps/bgnpcgn-kor-Kore-Latn-rok-2011.yaml +48 -0
  67. data/maps/bgnpcgn-mkd-Cyrl-Latn-1981.yaml +159 -0
  68. data/maps/bgnpcgn-mkd-Cyrl-Latn-2013.yaml +190 -0
  69. data/maps/bgnpcgn-nep-Deva-Latn-2011.yaml +200 -0
  70. data/maps/bgnpcgn-per-Arab-Latn-1956.yaml +92 -0
  71. data/maps/bgnpcgn-rus-Cyrl-Latn-1947.yaml +314 -0
  72. data/maps/bgnpcgn-srp-Cyrl-Latn-2005.yaml +166 -0
  73. data/maps/bgnpcgn-ukr-Cyrl-Latn-1965.yaml +162 -0
  74. data/maps/bgnpcgn-ukr-Cyrl-Latn-2019.yaml +208 -0
  75. data/maps/bgnpcgn-zho-Hans-Latn-1979.yaml +7456 -0
  76. data/maps/bis-asm-Beng-Latn-13194-1991.yaml +159 -0
  77. data/maps/bis-ben-Beng-Latn-13194-1991.yaml +156 -0
  78. data/maps/bis-dev-Deva-Latn-13194-1991.yaml +184 -0
  79. data/maps/bis-gjr-Gujr-Latn-13194-1991.yaml +166 -0
  80. data/maps/bis-knd-Knda-Latn-13194-1991.yaml +173 -0
  81. data/maps/bis-mlm-Mlym-Latn-13194-1991.yaml +176 -0
  82. data/maps/bis-ori-Orya-Latn-13194-1991.yaml +160 -0
  83. data/maps/bis-pnj-Guru-Latn-13194-1991.yaml +175 -0
  84. data/maps/bis-tel-Telu-Latn-13194-1991.yaml +170 -0
  85. data/maps/bis-tml-Taml-Latn-13194-1991.yaml +155 -0
  86. data/maps/by-bel-Cyrl-Latn-1998.yaml +168 -0
  87. data/maps/by-bel-Cyrl-Latn-2007.yaml +115 -0
  88. data/maps/dos-nep-Deva-Latn-1997.yaml +33 -0
  89. data/maps/elot-ell-Grek-Latn-743-1982-tl.yaml +684 -0
  90. data/maps/elot-ell-Grek-Latn-743-1982-ts.yaml +680 -0
  91. data/maps/elot-ell-Grek-Latn-743-2001-tl.yaml +19 -0
  92. data/maps/elot-ell-Grek-Latn-743-2001-ts.yaml +31 -0
  93. data/maps/ggg-kat-Geor-Latn-2002.yaml +88 -0
  94. data/maps/gki-bel-Cyrl-Latn-1992.yaml +33 -0
  95. data/maps/gki-bel-Cyrl-Latn-2000.yaml +201 -0
  96. data/maps/gost-rus-Cyrl-Latn-16876-71-1983.yaml +186 -0
  97. data/maps/hk-yue-Hani-Latn-1888.yaml +38497 -0
  98. data/maps/icao-bel-Cyrl-Latn-9303.yaml +136 -0
  99. data/maps/icao-bul-Cyrl-Latn-9303.yaml +118 -0
  100. data/maps/icao-heb-Hebr-Latn-9303.yaml +151 -0
  101. data/maps/icao-mkd-Cyrl-Latn-9303.yaml +117 -0
  102. data/maps/icao-per-Arab-Latn-9303.yaml +103 -0
  103. data/maps/icao-rus-Cyrl-Latn-9303.yaml +117 -0
  104. data/maps/icao-srp-Cyrl-Latn-9303.yaml +117 -0
  105. data/maps/icao-ukr-Cyrl-Latn-9303.yaml +119 -0
  106. data/maps/iso-ara-Arab-Latn-233-1984.yaml +323 -0
  107. data/maps/iso-ell-Grek-Latn-843-1997-t1.yaml +609 -0
  108. data/maps/iso-ell-Grek-Latn-843-1997-t2.yaml +40 -0
  109. data/maps/iso-jpn-Hrkt-Latn-3602-1989.yaml +62 -0
  110. data/maps/iso-rus-Cyrl-Latn-9-1995.yaml +271 -0
  111. data/maps/iso-tha-Thai-Latn-11940-1998.yaml +109 -0
  112. data/maps/kp-kor-Hang-Latn-2002.yaml +901 -0
  113. data/maps/lshk-yue-Hani-Latn-jyutping-1993.yaml +44820 -0
  114. data/maps/mext-jpn-Hrkt-Latn-1954.yaml +411 -0
  115. data/maps/moct-kor-Hang-Latn-2000.yaml +803 -0
  116. data/maps/mofa-jpn-Hrkt-Latn-1989.yaml +541 -0
  117. data/maps/mvd-bel-Cyrl-Latn-2008.yaml +225 -0
  118. data/maps/mvd-bel-Cyrl-Latn-2010.yaml +63 -0
  119. data/maps/mvd-rus-Cyrl-Latn-2008.yaml +109 -0
  120. data/maps/mvd-rus-Cyrl-Latn-2010.yaml +37 -0
  121. data/maps/nil-kor-Hang-Hang-jamo.yaml +11193 -0
  122. data/maps/odni-aze-Cyrl-Latn-2015.yaml +144 -0
  123. data/maps/odni-bel-Cyrl-Latn-2015.yaml +148 -0
  124. data/maps/odni-bul-Cyrl-Latn-2015.yaml +96 -0
  125. data/maps/odni-hin-Deva-Latn-2015.yaml +258 -0
  126. data/maps/odni-kat-Geor-Latn-2015.yaml +87 -0
  127. data/maps/odni-kaz-Cyrl-Latn-2015.yaml +148 -0
  128. data/maps/odni-kir-Cyrl-Latn-2015.yaml +136 -0
  129. data/maps/odni-mkd-Cyrl-Latn-2015.yaml +122 -0
  130. data/maps/odni-rus-Cyrl-Latn-2015.yaml +77 -0
  131. data/maps/odni-srp-Cyrl-Latn-2015.yaml +129 -0
  132. data/maps/odni-tat-Cyrl-Latn-2015.yaml +142 -0
  133. data/maps/odni-tgk-Cyrl-Latn-2015.yaml +148 -0
  134. data/maps/odni-uig-Cyrl-Latn-2015.yaml +138 -0
  135. data/maps/odni-ukr-Cyrl-Latn-2015.yaml +157 -0
  136. data/maps/odni-urd-Arab-Latn-2015.yaml +221 -0
  137. data/maps/odni-uzb-Cyrl-Latn-2015.yaml +166 -0
  138. data/maps/royin-tha-Thai-Latn-1939-generic.yaml +90 -0
  139. data/maps/royin-tha-Thai-Latn-1968.yaml +179 -0
  140. data/maps/royin-tha-Thai-Latn-1999-chained.yaml +180 -0
  141. data/maps/royin-tha-Thai-Latn-1999.yaml +76 -0
  142. data/maps/sac-zho-Hans-Latn-1979.yaml +24759 -0
  143. data/maps/ses-ara-Arab-Latn-1930.yaml +279 -0
  144. data/maps/stategeocadastre-ukr-Cyrl-Latn-1993.yaml +222 -0
  145. data/maps/ua-ukr-Cyrl-Latn-1996.yaml +193 -0
  146. data/maps/un-ara-Arab-Latn-1971.yaml +139 -0
  147. data/maps/un-ara-Arab-Latn-1972.yaml +159 -0
  148. data/maps/un-ara-Arab-Latn-2017.yaml +420 -0
  149. data/maps/un-bel-Cyrl-Latn-2007.yaml +114 -0
  150. data/maps/un-ben-Beng-Latn-2016.yaml +534 -0
  151. data/maps/un-ell-Grek-Latn-1987-tl.yaml +31 -0
  152. data/maps/un-ell-Grek-Latn-1987-ts.yaml +19 -0
  153. data/maps/un-ell-Grek-Latn-phonetic-1987.yaml +780 -0
  154. data/maps/un-mon-Mong-Latn-2013.yaml +99 -0
  155. data/maps/un-nep-Deva-Latn-1972.yaml +163 -0
  156. data/maps/un-rus-Cyrl-Latn-1987.yaml +166 -0
  157. data/maps/un-ukr-Cyrl-Latn-1998.yaml +30 -0
  158. data/maps/ungegn-amh-Ethi-Latn-2016.yaml +575 -0
  159. data/maps/var-jpn-Hrkt-Latn-hepburn-1886.yaml +406 -0
  160. data/maps/var-jpn-Hrkt-Latn-hepburn-1954.yaml +386 -0
  161. data/maps/var-kor-Hang-Latn-mr-1939.yaml +1054 -0
  162. data/maps/var-kor-Kore-Hang-2013.yaml +59754 -0
  163. data/maps/var-kor-Kore-Latn-mr-1939.yaml +36 -0
  164. data/maps/var-tha-Thai-Thai-phonemic.yaml +59 -0
  165. data/maps/var-tha-Thai-Zsym-ipa.yaml +301 -0
  166. data/maps/var-zho-Hani-Latn-1979.yaml +38908 -0
  167. data/spec/interscript/mapping_spec.rb +42 -0
  168. data/spec/interscript_spec.rb +26 -0
  169. data/spec/spec_helper.rb +3 -0
  170. metadata +298 -18
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: def03b1856a5b9d98e397f378e9958627dfe19077b90293797cc6fc6b81bce3e
4
- data.tar.gz: d105d03d4c918bfb5a478a4928a8d15cd59dd0be4a04c990bae0b38e52df7a23
3
+ metadata.gz: 7557ea1ca381562c61be7dbbeab2ea4adc42ef57ce857ef86acd62a08e5ce588
4
+ data.tar.gz: 5099d4a7bf07817155d620db716af452607b8565f4e08d37aa805f486556e0e1
5
5
  SHA512:
6
- metadata.gz: 8c8e1535bc0d688b54da2ebcf29e8c846e07f315c5fe3cc456f918f15b9d46e3ff87ebfb3dc0b40efcc6769f236002b3784fc9e1febfe133b2e81f7a7c5a09a0
7
- data.tar.gz: 6c11004204682f6b476d4764ac68cb5bbba8a1b53ac6afc268edd2204d3746e953ffbef51ab4bb8c5661ff959bb51c05447622a1e79fa7e012c40a2e77830d0e
6
+ metadata.gz: f8a738a34aba269c0a01b4d123f01cc423a3c296541f302f65a6043bd2b618ef2c702f32d469866ee7c488624985b45ed2eee2ad9bbe2955f230b5b45472c364
7
+ data.tar.gz: a3a66af7fcb9d8c82bcf927b17ea68686298bccec2f6e8dfac25796e7013b36f68fa9f130bd2fb9e7bc214a528896cafdd41a2f464886f09c504f376c19065d8
@@ -1,48 +1,263 @@
1
- = Interscript: Interoperable Script Conversion Systems and a Ruby implementation
1
+ = Interscript: Interoperable Script Conversion Systems, with a Ruby implementation
2
2
 
3
- == Introducation
3
+ image:https://github.com/interscript/interscript/workflows/test/badge.svg["Ruby build status", link="https://github.com/interscript/interscript/actions?workflow=test"]
4
+ image:https://github.com/interscript/interscript/workflows/js/badge.svg["JavaScript build status", link="https://github.com/interscript/interscript/actions?workflow=js"]
4
5
 
5
- This repository contains a number of transliteration schemes from:
6
+ == Introduction
6
7
 
8
+ This repository contains interoperable transliteration schemes from:
9
+
10
+ * ALA-LC
7
11
  * BGN/PCGN
8
12
  * ICAO
9
13
  * ISO
10
14
  * UN (by UNGEGN)
15
+ * Many, many other script conversion system authorities.
11
16
 
12
17
  The goal is to achieve interoperable transliteration schemes allowing quality comparisons.
13
18
 
14
19
 
15
- == STATUS (work in progress!)
16
20
 
17
- These transliteration systems currently work:
21
+ == Demonstration
22
+
23
+ These transliteration systems are used in the demo:
18
24
 
19
25
  `bgnpcgn-rus-Cyrl-Latn-1947`:: BGN/PCGN Romanization of Russian
20
- `iso-rus-Cyrl-Latn-iso9`:: ISO 9 Romanization of Russian
26
+ `iso-rus-Cyrl-Latn-9-1995`:: ISO 9 Romanization of Russian
21
27
  `icao-rus-Cyrl-Latn-9303`:: ICAO MRZ Romanization of Russian
22
- `bas-rus-Cyrl-Latn-bss`:: Bulgaria Academy of Science Streamlined System for Russian
28
+ `bas-rus-Cyrl-Latn-2017-bss`:: Bulgaria Academy of Science Streamlined System for Russian
29
+
30
+ image:demo/20191118-interscript-demo-cast.gif["interscript screencast"]
31
+
32
+
33
+ == Installation
34
+
35
+ === Prerequisites
36
+
37
+ Linux:
38
+
39
+ [source,sh]
40
+ ----
41
+ apt-get install swig python3-setuptools
42
+ ----
43
+
44
+ Windows:
23
45
 
46
+ [source,sh]
47
+ ----
48
+ choco install --no-progress swig
49
+ ----
50
+
51
+ Interscript depends on Python and the https://github.com/sequitur-g2p/sequitur-g2p[`sequitur-g2p`] module:
52
+
53
+ [source,sh]
54
+ ----
55
+ pip3 install setuptools numpy
56
+ curl -sSL -o sequitur-g2p.zip https://github.com/sequitur-g2p/sequitur-g2p/archive/806273f.zip
57
+ pip3 install sequitur-g2p.zip
58
+ ----
59
+
60
+ Interscript depends on Ruby. Once you manage to install Ruby, it's easy.
61
+
62
+ [source,sh]
63
+ ----
64
+ gem install interscript
65
+ ----
24
66
 
25
67
  == Usage
26
68
 
69
+ Assume you have a file ready in the source script like this:
70
+
71
+ [source,sh]
72
+ ----
73
+ cat <<EOT > rus-Cyrl.txt
74
+ Эх, тройка! птица тройка, кто тебя выдумал? знать, у бойкого народа ты
75
+ могла только родиться, в той земле, что не любит шутить, а
76
+ ровнем-гладнем разметнулась на полсвета, да и ступай считать версты,
77
+ пока не зарябит тебе в очи. И не хитрый, кажись, дорожный снаряд, не
78
+ железным схвачен винтом, а наскоро живьём с одним топором да долотом
79
+ снарядил и собрал тебя ярославский расторопный мужик. Не в немецких
80
+ ботфортах ямщик: борода да рукавицы, и сидит чёрт знает на чём; а
81
+ привстал, да замахнулся, да затянул песню — кони вихрем, спицы в
82
+ колесах смешались в один гладкий круг, только дрогнула дорога, да
83
+ вскрикнул в испуге остановившийся пешеход — и вон она понеслась,
84
+ понеслась, понеслась!
85
+
86
+ Н.В. Гоголь
87
+ EOT
88
+ ----
89
+
90
+ You can run `interscript` on this text using different transliteration systems.
91
+
92
+ [source,sh]
93
+ ----
94
+ interscript rus-Cyrl.txt \
95
+ --system=bgnpcgn-rus-Cyrl-Latn-1947 \
96
+ --output=bgnpcgn-rus-Latn.txt
97
+
98
+ interscript rus-Cyrl.txt \
99
+ --system=iso-rus-Cyrl-Latn-9-1995 \
100
+ --output=iso-rus-Latn.txt
101
+
102
+ interscript rus-Cyrl.txt \
103
+ --system=icao-rus-Cyrl-Latn-9303 \
104
+ --output=icao-rus-Latn.txt
105
+
106
+ interscript rus-Cyrl.txt \
107
+ --system=bas-rus-Cyrl-Latn-2017-bss \
108
+ --output=bas-rus-Latn.txt
109
+ ----
110
+
111
+ It is then easy to see the exact differences in rendering between the systems.
112
+
113
+ [source,sh]
114
+ ----
115
+ diff bgnpcgn-rus-Latn.txt bas-rus-Latn.txt
116
+ ----
117
+
118
+ == Adding transliteration system
119
+
120
+ Transliteration systems are stored in the `maps/` directory as YAML files.
121
+ You can create a new file and add it to the directory.
122
+
123
+ The file should be named as `<system-code>.yaml`, where `system-code`
124
+ is in accordance with
125
+ http://calconnect.gitlab.io/tc-localization/csd-transcription-systems[ISO/CC 24229].
126
+
127
+ === File structure
128
+
129
+ [source,yaml]
130
+ ----
131
+ authority_id: bgnpcgn
132
+ id: 1947
133
+ language: rus
134
+ source_script: Cyrl
135
+ destination_script: Latn
136
+ name: ROMANIZATION OF RUSSIAN, BGN/PCGN 1947 System
137
+ url: https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/807920/ROMANIZATION_OF_RUSSIAN.pdf
138
+ creation_date: 1947
139
+ confirmation_date: 2019-06
140
+ description: The BGN/PCGN system for Russian was adopted ...
141
+
142
+ notes:
143
+ - The character e should be romanized ye initially, after the vowel ...
144
+
145
+ tests:
146
+ - source: ДЛИННОЕ ПОКРЫВАЛО
147
+ expected: DLINNOYE POKRYVALO
148
+ - source: Еловая шишка
149
+ expected: Yelovaya shishka
150
+
151
+ map:
152
+ rules:
153
+ - pattern: (?<=[АаЕеЁёИиОоУуЫыЭэЮюЯяЙйЪъЬь])\u0415 # Е after a, e, ё, и, о, у, ы, э, ю, я, й, ъ, ь
154
+ result: Ye
155
+ - pattern: \b\u0415 # Е initially
156
+ result: Ye
157
+
158
+ characters:
159
+ "\u0410": "A"
160
+ "\u0411": "B"
161
+ "\u0412": "V"
162
+ ----
163
+
164
+
165
+ === Rules
166
+
167
+ The subsection `rules` is placed under the `map` key. All rules are applied in the order in which they are listed, before the `characters` subsection is applied. Rules apply to the original text, not to the result of previously applied rules.
168
+
169
+ Each rule has `pattern` and `result` elements.
170
+
171
+ Pattern is a regular expression. It should be written as a string, without the `//` or `%r{}` delimiters. For example `\b\u0415`. If a rule depends on the preceding or following content, lookbehind or lookahead can be used. For example, a rule with the pattern `(?<=[АаЕеЁёИиОоУуЫыЭэЮюЯяЙйЪъЬь])\u0415` finds every Е after the upper or lower case symbols a, e, ё, и, о, у, ы, э, ю, я, й, ъ, ь.
172
+
173
+ Result is the replacement for the pattern's match. It can contain a string, Unicode characters specified by hexadecimal numbers, or a captured group reference. A string containing a hexadecimal number or a captured group reference should be double quoted. For example `"Y\u00eb"` or `"\\1\u00b7\\2"`. Captured groups are referred to by a double backslash followed by the group's number.
174
+
175
+ Because rules are applied in order, multiple rules applicable to the same segment of a string can be addressed by rule ordering, and rules take priority over characters. For example:
176
+
177
+ [source,yaml]
178
+ ----
179
+ map:
180
+ rules:
181
+ - pattern: \u03B3\u03B3 # γ (before Γ, Ξ, Χ)
182
+ result: ng
183
+ - pattern: (?<![Γγ])\u03B3(?=[ΕεέΗηήΙιίΥυύ]) # γ (before front vowels)
184
+ result: y
185
+ ----
186
+
187
+ (γι maps to `yi`; but γγ maps to `ng`. In the case of γγι, the first rule takes priority, and the transliteration is `ngi`: it makes the second rule impossible.)
188
+
189
+ [source,yaml]
190
+ ----
191
+ map:
192
+ rules:
193
+ - pattern: (?<=\b)\u03BC[πΠ] # μπ (initially)
194
+ result: b
195
+ - pattern: \u03BC[πΠ] # μπ (medially)
196
+ result: mb
197
+ ----
198
+
199
+ (The first rule applies at the start of a word; the second rule does not specify a context, as it applies in all other cases not covered by the first rule.)
200
+
201
+ [source,yaml]
202
+ ----
203
+ map:
204
+ rules:
205
+ - pattern: ";"
206
+ result: "?"
207
+
208
+ characters:
209
+ "\u00B7": ";"
210
+ ----
211
+
212
+ (This guarantees that any `;` are converted to `?` before any new `;` are introduced; because all three are Latin script, they could be mixed up in ordering.)
213
+
214
+ Normally rules "`bleed`" each other: once a rule applies to a segment, that segment cannot trigger other rules, because it is already converted to Roman. Exceptionally, it will be necessary to have a rule add or remove characters in the original script, rather than transliterate them, so that the same context can be invoked by two rules in succession:
215
+
216
+ [source,yaml]
217
+ ----
218
+ map:
219
+ rules:
220
+ - pattern: (?<=[АаЕеЁёИиОоУуЫыЭэЮюЯя])\u042b # Ы after any vowel character
221
+ result: "\u00b7Ы"
222
+ - pattern: \u042b(?=[АаУуЫыЭэ]) # Ы before а, у, ы, or э
223
+ result: "Ы\u00b7"
224
+ ----
225
+
226
+ (If the result were `\u00B7Y`, the second rule could not be applied afterwards; but we want ОЫУ to transliterate as `O·Y·U`. In order to make that happen, we preserve the Ы during the rules phase, resulting in О·Ы·У; we only convert the letters to Roman script in the `characters` phase.)
227
+
228
+ === Testing transliteration systems
229
+
230
+ To test all transliteration systems in the `maps/` directory, run:
27
231
 
28
232
  [source,sh]
29
233
  ----
30
- # Transliterating Russian Cyrillic to Latin using the Streamlined System for Russian
31
- interscript samples/rus-Cyrl.txt --system=bas-rus-Cyrl-Latn-bss --output=rus-Latn.txt
234
+ bundle exec rspec
235
+ ----
236
+
237
+ The command takes `source` texts from the `tests` section, transforms
238
+ them using `rules` and `characters` from the `map` key, and compares the
239
+ results with the `expected` text of each test.
240
+
241
+ To test a specific transliteration system, set the environment variable
242
+ `TRANSLIT_SYSTEM` to the system code of the desired system
243
+ (i.e. the "`basename`" of the system's YAML file):
32
244
 
33
- # Transliterating Russian Cyrillic to Latin using the BGN/PCGN Romanization of Russian
34
- interscript samples/rus-Cyrl.txt --system=bgnpcgn-rus-Cyrl-Latn-1947 --output=rus-Latn.txt
245
+ [source,sh]
246
+ ----
247
+ TRANSLIT_SYSTEM=bgnpcgn-rus-Cyrl-Latn-1947 bundle exec rspec
35
248
  ----
36
249
 
37
250
 
38
251
  == ISCS system codes
39
252
 
40
- The system code identifying a script conversion system has a few components:
253
+ In accordance with
254
+ http://calconnect.gitlab.io/tc-localization/csd-transcription-systems[ISO/CC 24229],
255
+ the system code identifying a script conversion system has the following components:
41
256
 
42
- e.g. `bgnpcgn-rus-Cyrl-Latn-1947`
257
+ e.g. `bgnpcgn-rus-Cyrl-Latn-1947`:
43
258
 
44
259
  `bgnpcgn`:: the authority identifier
45
- `rus`:: an ISO 639-2 3-letter language code that this system applies to
260
+ `rus`:: an ISO 639-{1,2,3,5} language code that this system applies to (For 639-2, use (T) code)
46
261
  `Cyrl`:: an ISO 15924 script code, identifying the source script
47
262
  `Latn`:: an ISO 15924 script code, identifying the target script
48
263
  `1947`:: an identifier unit within the authority to identify this system
@@ -53,13 +268,31 @@ e.g. `bgnpcgn-rus-Cyrl-Latn-1947`
53
268
  Currently the schemes cover Cyrillic, Armenian, Greek, Arabic and Hebrew.
54
269
 
55
270
 
56
- == Sources
271
+ == Samples to play with
57
272
 
58
273
  * `rus-Cyrl-1.txt`: Copied from the XLS output from http://www.primorsk.vybory.izbirkom.ru/region/primorsk?action=show&global=true&root=254017025&tvd=4254017212287&vrn=100100067795849&prver=0&pronetvd=0&region=25&sub_region=25&type=242&vibid=4254017212287
59
274
 
60
275
  * `rus-Cyrl-2.txt`: Copied from the XLS output from http://www.yaroslavl.vybory.izbirkom.ru/region/yaroslavl?action=show&root=764013001&tvd=4764013188704&vrn=4764013188693&prver=0&pronetvd=0&region=76&sub_region=76&type=426&vibid=4764013188704
61
276
 
62
277
 
63
- == Credits
278
+ == References
279
+
280
+ Reference documents are located at the
281
+ https://github.com/interscript/interscript-references[interscript-references repository].
282
+ Some specifications that have distribution limitations may not be reproduced there.
283
+
284
+
285
+ == Links to system definitions
286
+
287
+ * https://www.iso.org/committee/48750.html[ISO/TC 46 (see standards published by WG 3)]
288
+ * http://geonames.nga.mil/gns/html/romanization.html[BGN/PCGN and BGN Romanization systems (BGN)]
289
+ * https://www.gov.uk/government/publications/romanization-systems[BGN/PCGN Romanization systems (PCGN)]
290
+ * https://www.loc.gov/catdir/cpso/roman.html[ALA-LC Romanization systems in current use]
291
+ * http://catdir.loc.gov/catdir/cpso/roman.html[ALA-LC Romanization systems from 1997]
292
+ * http://www.eki.ee/wgrs/[UN Romanization systems]
293
+ * http://www.eki.ee/knab/kblatyl2.htm[EKI KNAB systems]
294
+
295
+ == Copyright and license
296
+
297
+ This is a Ribose project. Copyright Ribose.
64
298
 
65
- This is a Ribose project.
@@ -1,20 +1,41 @@
1
1
  #!/usr/bin/env ruby
2
+
2
3
  require 'rubygems'
3
- require_relative '../lib/interscript'
4
-
5
- if ARGV.empty?
6
- puts "write source file, source format, and output file"
7
- else
8
- args = Hash[ARGV.flat_map { |s| s.scan(/--?([^=\s]+)(?:=(\S+))?/) }]
9
- input = ARGV[0]
10
- system_code = args["system"]
11
- output_file = args["output"]
12
-
13
- raise "Please enter the system code with --system={system_code}" unless system_code
14
-
15
- if output_file
16
- Interscript.transliterate_file(system_code, input, output_file)
17
- else
18
- puts Interscript.transliterate(system_code, IO.read(input))
19
- end
4
+
5
+ # resolve bin path, ignoring symlinks
6
+ require 'pathname'
7
+ bin_file = Pathname.new(__FILE__).realpath
8
+
9
+ # add self to libpath
10
+ $LOAD_PATH.unshift File.expand_path("../../lib", bin_file)
11
+
12
+ # Fixes https://github.com/rubygems/rubygems/issues/1420
13
+ require 'rubygems/specification'
14
+
15
+ class Gem::Specification
16
+ def this; self; end
20
17
  end
18
+
19
+ require 'interscript/command'
20
+
21
+ if ARGV.any? && !Interscript::Command.all_tasks.key?(ARGV.first)
22
+ ARGV.unshift :translit
23
+ end
24
+ Interscript::Command.start ARGV
25
+
26
+ # if ARGV.empty?
27
+ # puts "write source file, source format, and output file"
28
+ # else
29
+ # args = Hash[ARGV.flat_map { |s| s.scan(/--?([^=\s]+)(?:=(\S+))?/) }]
30
+ # input = ARGV[0]
31
+ # system_code = args["system"]
32
+ # output_file = args["output"]
33
+
34
+ # raise "Please enter the system code with --system={system_code}" unless system_code
35
+
36
+ # if output_file
37
+ # Interscript.transliterate_file(system_code, input, output_file)
38
+ # else
39
+ # puts Interscript.transliterate(system_code, IO.read(input))
40
+ # end
41
+ # end
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,34 @@
1
+ import g2p, SequiturTool
2
+ import numpy
3
+
4
+ def transliterate(model, word):
5
+
6
+ class Struct:
7
+ def __init__(self, **entries):
8
+ self.__dict__.update(entries)
9
+
10
+ model_path = {
11
+ 'pythainlp_lexicon': './lib/model-7',
12
+ 'wiktionary_phonemic': './lib/tha-pt-b-7'
13
+ }
14
+
15
+ connector_dict = {
16
+ 'pythainlp_lexicon': '',
17
+ 'wiktionary_phonemic': '-'
18
+ }
19
+
20
+
21
+ modelFile = model_path[model]
22
+ connector = connector_dict[model]
23
+
24
+ options = Struct(**{'profile': None, 'resource_usage': None, 'psyco': None, 'tempdir': None, 'trainSample': None, 'develSample': None, 'testSample': None, 'checkpoint': None, 'resume_from_checkpoint': None, 'shouldTranspose': None, 'modelFile': modelFile , 'newModelFile': None, 'shouldTestContinuously': None, 'shouldSelfTest': None, 'lengthConstraints': None, 'shouldSuppressNewMultigrams': None, 'viterbi': None, 'shouldRampUp': None, 'shouldWipeModel': None, 'shouldInitializeWithCounts': None, 'minIterations': 20, 'maxIterations': 100, 'eager_discount_adjustment': None, 'fixed_discount': None, 'encoding': 'UTF-8', 'phoneme_to_phoneme': None, 'test_segmental': None, 'testResult': None, 'applySample': None, 'applyWord': word, 'variants_mass': None, 'variants_number': None, 'fakeTranslator': None, 'stack_limit': None})
25
+
26
+ loadSample = g2p.loadG2PSample
27
+
28
+ model = SequiturTool.procureModel(options, loadSample)
29
+ if not model:
30
+ return 1
31
+ translator = g2p.Translator(model)
32
+ del model
33
+
34
+ return connector.join(translator(tuple(word)))
@@ -0,0 +1,2 @@
1
+ require "opal"
2
+ require "interscript"
@@ -1,39 +1,157 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require 'yaml'
3
+ require "interscript/opal/maps" if RUBY_ENGINE == "opal"
4
+ require "interscript/mapping"
4
5
 
5
6
  # Transliteration
6
7
  module Interscript
7
- SYSTEM_DEFINITIONS_PATH = File.expand_path('../maps', __dir__)
8
+
9
+ class InvalidSystemError < StandardError; end
10
+ class ExternalProcessNotRecognizedError < StandardError; end
11
+ class ExternalProcessUnavailableError < StandardError; end
12
+
13
+ if RUBY_ENGINE == 'opal'
14
+ require "interscript/opal"
15
+ extend Opal
16
+ else
17
+ require "interscript/fs"
18
+ extend Fs
19
+ end
8
20
 
9
21
  class << self
10
- def transliterate_file(system_code, input_file, output_file)
11
- input = File.read(input_file)
12
- output = transliterate(system_code, input)
13
22
 
14
- File.open(output_file, "w") do |f|
15
- f.puts(output)
23
+ def transliterate(system_code, string, maps={})
24
+ unless maps.has_key? system_code
25
+ maps[system_code] = Interscript::Mapping.for(system_code)
26
+ end
27
+ # mapping = Interscript::Mapping.for(system_code)
28
+ mapping = maps[system_code]
29
+
30
+ # First, apply chained transliteration as specified in the list `chain`
31
+ chain = mapping.chain.dup
32
+ while chain.length > 0
33
+ string = transliterate(chain.shift, string, maps)
34
+ end
35
+
36
+ # Then, apply the rest of the map
37
+ separator = mapping.character_separator || ""
38
+ word_separator = mapping.word_separator || ""
39
+ title_case = mapping.title_case
40
+ downcase = mapping.downcase
41
+
42
+ # charmap = mapping.characters&.sort_by { |k, _v| k.size }&.reverse&.to_h
43
+ # dictmap = mapping.dictionary&.sort_by { |k, _v| k.size }&.reverse&.to_h
44
+ charmap = mapping.characters_hash
45
+ dictmap = mapping.dictionary_hash
46
+ trie = mapping.dictionary_trie
47
+
48
+ string = external_processing(mapping, string)
49
+
50
+ pos = 0
51
+ while pos < string.to_s.size
52
+ m = 0
53
+ wordmatch = ""
54
+
55
+ # Using Trie, find the longest matching substring
56
+ while (pos + m < string.to_s.size) && (trie.partial_word?string[pos..pos+m])
57
+ wordmatch = string[pos..pos+m] if trie.word?string[pos..pos+m]
58
+ m += 1
59
+ end
60
+
61
+ m = wordmatch.length
62
+ if m > 0
63
+ repl = dictmap[string[pos..pos+m-1]]
64
+ string = sub_replace(string, pos, m, repl)
65
+ pos += repl.length
66
+ else
67
+ pos += 1
68
+ end
69
+ end
70
+
71
+ output = string.clone
72
+ offsets = Array.new string.to_s.size, 1
73
+
74
+ # mapping.rules.each do |r|
75
+ # string.to_s.scan(/#{r['pattern']}/) do |matches|
76
+ # match = Regexp.last_match
77
+ # pos = match.offset(0).first
78
+ # result = r['result'].clone
79
+ # matches.each.with_index { |v, i| result.sub!(/\\#{i + 1}/, v) } if matches.is_a? Array
80
+ # result.upcase! if up_case_around?(string, pos)
81
+ # output[offsets[0...pos].sum, match[0].size] = result
82
+ # offsets[pos] += result.size - match[0].size
83
+ # end
84
+ # end
85
+
86
+ mapping.rules.each do |r|
87
+ next unless output
88
+ re = mkregexp(r["pattern"])
89
+ output = output.gsub(re, r["result"])
90
+ end
91
+
92
+ charmap.each do |k, v|
93
+ while (match = output&.match(/#{k}/))
94
+ pos = match.offset(0).first
95
+ result = !downcase && up_case_around?(output, pos) ? v.upcase : v
96
+
97
+ # if more than one, choose the first one
98
+ result = result[0] if result.is_a?(Array)
99
+
100
+ output = sub_replace(
101
+ output,
102
+ pos,
103
+ match[0].size,
104
+ add_separator(separator, pos, result)
105
+ )
106
+ end
107
+ end
108
+
109
+ mapping.postrules.each do |r|
110
+ next unless output
111
+ re = mkregexp(r["pattern"])
112
+ output = if r["result"] == "upcase"
113
+ output.gsub(re, &:upcase)
114
+ else
115
+ output.gsub(re, r["result"])
116
+ end
16
117
  end
17
- puts "Output written to: #{output_file}"
118
+
119
+ return unless output
120
+
121
+ output = output.sub(/^(.)/, &:upcase) if title_case
122
+ if word_separator != ''
123
+ output = output.gsub(/#{word_separator}#{separator}/u, word_separator)
124
+
125
+ if title_case
126
+ output = output.gsub(/#{word_separator}(.)/u, &:upcase)
127
+ end
128
+ end
129
+
130
+ output.unicode_normalize
18
131
  end
19
132
 
20
- def load_system_definition(system_code)
21
- YAML.load_file(File.join(SYSTEM_DEFINITIONS_PATH, "#{system_code}.yaml"))
133
+ private
134
+
135
+ def add_separator(separator, pos, result)
136
+ pos == 0 ? result : separator + result
22
137
  end
23
138
 
24
- def transliterate(system_code, string)
25
- system = load_system_definition(system_code)
139
+ def up_case_around?(string, pos)
140
+ return false if string[pos] == string[pos].downcase
26
141
 
27
- rules = system["map"]["rules"] || []
28
- charmap = system["map"]["characters"] || {}
142
+ i = pos - 1
143
+ i -= 1 while i.positive? && string[i] !~ Regexp.new(ALPHA_REGEXP)
144
+ before = i >= 0 && i < pos ? string[i].to_s.strip : ''
29
145
 
30
- rules.each do |r|
31
- string.gsub! %r{#{r["pattern"]}}, r["result"]
32
- end
146
+ i = pos + 1
147
+ i += 1 while i < string.size - 1 && string[i] !~ Regexp.new(ALPHA_REGEXP)
148
+ after = i > pos ? string[i].to_s.strip : ''
33
149
 
34
- string.split('').map do |char|
35
- charmap[char] || char
36
- end.join('')
150
+ before_uc = !before.empty? && before == before.upcase
151
+ after_uc = !after.empty? && after == after.upcase
152
+ # before_uc && (after.empty? || after_uc) || after_uc && (before.empty? || before_uc)
153
+ before_uc || after_uc
37
154
  end
155
+
38
156
  end
39
157
  end