interscript 0.1.7 → 2.1.0b1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (314) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +11 -0
  3. data/.rspec +3 -0
  4. data/Gemfile +29 -0
  5. data/LICENSE.adoc +31 -0
  6. data/README.md +3 -0
  7. data/Rakefile +116 -0
  8. data/bin/console +14 -0
  9. data/bin/interscript +5 -0
  10. data/bin/maps_analyze_staging +168 -0
  11. data/bin/maps_debug_compilers +58 -0
  12. data/bin/maps_debug_ordering +88 -0
  13. data/bin/maps_debug_ruby_compile +24 -0
  14. data/bin/maps_debug_step_by_step +44 -0
  15. data/bin/maps_optimize_order +112 -0
  16. data/bin/maps_v1_analyze_regexps +45 -0
  17. data/bin/maps_v1_to_v2 +426 -0
  18. data/bin/setup +8 -0
  19. data/exe/interscript +6 -0
  20. data/interscript.gemspec +31 -0
  21. data/lib/interscript.rb +83 -133
  22. data/lib/interscript/command.rb +5 -5
  23. data/lib/interscript/compiler.rb +22 -0
  24. data/lib/interscript/compiler/javascript.rb +292 -0
  25. data/lib/interscript/compiler/ruby.rb +262 -0
  26. data/lib/interscript/dsl.rb +68 -0
  27. data/lib/interscript/dsl/aliases.rb +23 -0
  28. data/lib/interscript/dsl/document.rb +46 -0
  29. data/lib/interscript/dsl/group.rb +45 -0
  30. data/lib/interscript/dsl/group/parallel.rb +6 -0
  31. data/lib/interscript/dsl/items.rb +89 -0
  32. data/lib/interscript/dsl/metadata.rb +68 -0
  33. data/lib/interscript/dsl/stage.rb +6 -0
  34. data/lib/interscript/dsl/symbol_mm.rb +11 -0
  35. data/lib/interscript/dsl/tests.rb +12 -0
  36. data/lib/interscript/interpreter.rb +251 -0
  37. data/lib/interscript/node.rb +25 -0
  38. data/lib/interscript/node/alias_def.rb +15 -0
  39. data/lib/interscript/node/dependency.rb +13 -0
  40. data/lib/interscript/node/document.rb +45 -0
  41. data/lib/interscript/node/group.rb +34 -0
  42. data/lib/interscript/node/group/parallel.rb +9 -0
  43. data/lib/interscript/node/group/sequential.rb +2 -0
  44. data/lib/interscript/node/item.rb +52 -0
  45. data/lib/interscript/node/item/alias.rb +42 -0
  46. data/lib/interscript/node/item/any.rb +76 -0
  47. data/lib/interscript/node/item/capture.rb +50 -0
  48. data/lib/interscript/node/item/group.rb +51 -0
  49. data/lib/interscript/node/item/repeat.rb +40 -0
  50. data/lib/interscript/node/item/stage.rb +23 -0
  51. data/lib/interscript/node/item/string.rb +51 -0
  52. data/lib/interscript/node/metadata.rb +18 -0
  53. data/lib/interscript/node/rule.rb +6 -0
  54. data/lib/interscript/node/rule/funcall.rb +18 -0
  55. data/lib/interscript/node/rule/run.rb +15 -0
  56. data/lib/interscript/node/rule/sub.rb +68 -0
  57. data/lib/interscript/node/stage.rb +19 -0
  58. data/lib/interscript/node/tests.rb +15 -0
  59. data/lib/interscript/stdlib.rb +211 -0
  60. data/lib/interscript/utils/regexp_converter.rb +283 -0
  61. data/lib/interscript/version.rb +1 -1
  62. data/lib/interscript/visualize.rb +61 -0
  63. data/lib/interscript/visualize/group.html.erb +59 -0
  64. data/lib/interscript/visualize/json.rb +57 -0
  65. data/lib/interscript/visualize/map.html.erb +46 -0
  66. data/lib/interscript/visualize/nodes.rb +89 -0
  67. data/requirements.txt +1 -0
  68. metadata +78 -416
  69. data/README.adoc +0 -298
  70. data/lib/g2pwrapper.py +0 -34
  71. data/lib/interscript/fs.rb +0 -69
  72. data/lib/interscript/mapping.rb +0 -142
  73. data/lib/interscript/opal.rb +0 -57
  74. data/lib/interscript/opal/entrypoint.rb +0 -12
  75. data/lib/interscript/opal/map_translate.rb +0 -7
  76. data/lib/interscript/opal/maps.js.erb +0 -10
  77. data/lib/model-7 +0 -0
  78. data/lib/tha-pt-b-7 +0 -0
  79. data/maps/acadsin-zho-Hani-Latn-2002.yaml +0 -38916
  80. data/maps/alalc-amh-Ethi-Latn-1997.yaml +0 -513
  81. data/maps/alalc-amh-Ethi-Latn-2011.yaml +0 -138
  82. data/maps/alalc-ara-Arab-Latn-1997.yaml +0 -1287
  83. data/maps/alalc-asm-Deva-Latn-1997.yaml +0 -165
  84. data/maps/alalc-asm-Deva-Latn-2012.yaml +0 -40
  85. data/maps/alalc-aze-Cyrl-Latn-1997.yaml +0 -145
  86. data/maps/alalc-bel-Cyrl-Latn-1997.yaml +0 -129
  87. data/maps/alalc-ben-Beng-Latn-2017.yaml +0 -130
  88. data/maps/alalc-bul-Cyrl-Latn-1997.yaml +0 -98
  89. data/maps/alalc-ell-Grek-Latn-1997.yaml +0 -628
  90. data/maps/alalc-ell-Grek-Latn-2010.yaml +0 -626
  91. data/maps/alalc-guj-Gujr-Latn-1997.yaml +0 -266
  92. data/maps/alalc-guj-Gujr-Latn-2011.yaml +0 -64
  93. data/maps/alalc-hin-Deva-Latn-1997.yaml +0 -211
  94. data/maps/alalc-hin-Deva-Latn-2011.yaml +0 -47
  95. data/maps/alalc-kat-Geok-Latn-1997.yaml +0 -111
  96. data/maps/alalc-kat-Geor-Latn-1997.yaml +0 -150
  97. data/maps/alalc-kor-Hang-Latn-1997.yaml +0 -98
  98. data/maps/alalc-mal-Mlym-Latn-1997.yaml +0 -303
  99. data/maps/alalc-mal-Mlym-Latn-2012.yaml +0 -73
  100. data/maps/alalc-mar-Deva-Latn-1997.yaml +0 -189
  101. data/maps/alalc-mar-Deva-Latn-2011.yaml +0 -45
  102. data/maps/alalc-mkd-Cyrl-Latn-1997.yaml +0 -114
  103. data/maps/alalc-mkd-Cyrl-Latn-2013.yaml +0 -103
  104. data/maps/alalc-mon-Cyrl-Latn-1997.yaml +0 -220
  105. data/maps/alalc-pan-Guru-Latn-1997.yaml +0 -256
  106. data/maps/alalc-pan-Guru-Latn-2011.yaml +0 -78
  107. data/maps/alalc-per-Arab-Latn-1997.yaml +0 -375
  108. data/maps/alalc-pli-Deva-Latn-2012.yaml +0 -144
  109. data/maps/alalc-pra-Deva-Latn-2012.yaml +0 -47
  110. data/maps/alalc-rus-Cyrl-Latn-1997.yaml +0 -225
  111. data/maps/alalc-rus-Cyrl-Latn-2012.yaml +0 -162
  112. data/maps/alalc-san-Deva-Latn-2012.yaml +0 -172
  113. data/maps/alalc-sin-Sinh-Latn-1997.yaml +0 -292
  114. data/maps/alalc-sin-Sinh-Latn-2011.yaml +0 -71
  115. data/maps/alalc-srp-Cyrl-Latn-1997.yaml +0 -118
  116. data/maps/alalc-srp-Cyrl-Latn-2013.yaml +0 -135
  117. data/maps/alalc-tam-Taml-Latn-1997.yaml +0 -62
  118. data/maps/alalc-tam-Taml-Latn-2011.yaml +0 -58
  119. data/maps/alalc-ukr-Cyrl-Latn-1997.yaml +0 -145
  120. data/maps/alalc-ukr-Cyrl-Latn-2011.yaml +0 -16
  121. data/maps/apcbg-bul-Cyrl-Latn-1995.yaml +0 -283
  122. data/maps/bas-rus-Cyrl-Latn-2017-bss.yaml +0 -174
  123. data/maps/bas-rus-Cyrl-Latn-2017-oss.yaml +0 -169
  124. data/maps/bgn-jpn-Hrkt-Latn-1962.yaml +0 -292
  125. data/maps/bgn-kor-Hang-Latn-1943.yaml +0 -35
  126. data/maps/bgn-kor-Kore-Latn-1943.yaml +0 -31
  127. data/maps/bgna-bul-Cyrl-Latn-2006.yaml +0 -208
  128. data/maps/bgna-bul-Cyrl-Latn-2009.yaml +0 -208
  129. data/maps/bgnpcgn-amh-Ethi-Latn-1967.yaml +0 -532
  130. data/maps/bgnpcgn-ara-Arab-Latn-1956.yaml +0 -596
  131. data/maps/bgnpcgn-arm-Armn-Latn-1981.yaml +0 -108
  132. data/maps/bgnpcgn-aze-Cyrl-Latn-1993.yaml +0 -104
  133. data/maps/bgnpcgn-bak-Cyrl-Latn-2007.yaml +0 -188
  134. data/maps/bgnpcgn-bel-Cyrl-Latn-1979.yaml +0 -289
  135. data/maps/bgnpcgn-bul-Cyrl-Latn-1952.yaml +0 -119
  136. data/maps/bgnpcgn-bul-Cyrl-Latn-2013.yaml +0 -42
  137. data/maps/bgnpcgn-che-Cyrl-Latn-2008.yaml +0 -184
  138. data/maps/bgnpcgn-ell-Grek-Latn-1962.yaml +0 -705
  139. data/maps/bgnpcgn-ell-Grek-Latn-1996.yaml +0 -23
  140. data/maps/bgnpcgn-fas-Arab-Latn-1956.yaml +0 -96
  141. data/maps/bgnpcgn-jpn-Hrkt-Latn-1976.yaml +0 -257
  142. data/maps/bgnpcgn-kat-Geor-Latn-1981.yaml +0 -131
  143. data/maps/bgnpcgn-kat-Geor-Latn-2009.yaml +0 -42
  144. data/maps/bgnpcgn-kor-Hang-Latn-kn-1945.yaml +0 -253
  145. data/maps/bgnpcgn-kor-Hang-Latn-rok-2011.yaml +0 -48
  146. data/maps/bgnpcgn-kor-Kore-Latn-rok-2011.yaml +0 -48
  147. data/maps/bgnpcgn-mkd-Cyrl-Latn-1981.yaml +0 -163
  148. data/maps/bgnpcgn-mkd-Cyrl-Latn-2013.yaml +0 -190
  149. data/maps/bgnpcgn-mon-Cyrl-Latn-1964.yaml +0 -223
  150. data/maps/bgnpcgn-nep-Deva-Latn-2011.yaml +0 -230
  151. data/maps/bgnpcgn-per-Arab-Latn-1958.yaml +0 -336
  152. data/maps/bgnpcgn-prs-Arab-Latn-2007.yaml +0 -639
  153. data/maps/bgnpcgn-prs-Arab-Latn-yaghoubi.yaml +0 -459
  154. data/maps/bgnpcgn-rue-Cyrl-Latn-2016.yaml +0 -168
  155. data/maps/bgnpcgn-rus-Cyrl-Latn-1947.yaml +0 -318
  156. data/maps/bgnpcgn-srp-Cyrl-Latn-2005.yaml +0 -170
  157. data/maps/bgnpcgn-tat-Cyrl-Latn-2007.yaml +0 -220
  158. data/maps/bgnpcgn-tgk-Cyrl-Latn-1994.yaml +0 -240
  159. data/maps/bgnpcgn-ukr-Cyrl-Latn-1965.yaml +0 -166
  160. data/maps/bgnpcgn-ukr-Cyrl-Latn-2019.yaml +0 -119
  161. data/maps/bgnpcgn-uzb-Cyrl-Latn-1979.yaml +0 -127
  162. data/maps/bgnpcgn-uzb-Cyrl-Latn-2000.yaml +0 -82
  163. data/maps/bgnpcgn-zho-Hans-Latn-1979.yaml +0 -7456
  164. data/maps/bis-asm-Beng-Latn-13194-1991.yaml +0 -159
  165. data/maps/bis-ben-Beng-Latn-13194-1991.yaml +0 -156
  166. data/maps/bis-dev-Deva-Latn-13194-1991.yaml +0 -184
  167. data/maps/bis-guj-Gujr-Latn-13194-1991.yaml +0 -181
  168. data/maps/bis-knd-Knda-Latn-13194-1991.yaml +0 -173
  169. data/maps/bis-mlm-Mlym-Latn-13194-1991.yaml +0 -176
  170. data/maps/bis-ori-Orya-Latn-13194-1991.yaml +0 -160
  171. data/maps/bis-pnj-Guru-Latn-13194-1991.yaml +0 -175
  172. data/maps/bis-tel-Telu-Latn-13194-1991.yaml +0 -170
  173. data/maps/bis-tml-Taml-Latn-13194-1991.yaml +0 -155
  174. data/maps/by-bel-Cyrl-Latn-1998.yaml +0 -172
  175. data/maps/by-bel-Cyrl-Latn-2007.yaml +0 -115
  176. data/maps/din-grc-Grek-Latn-31634-2011-t1.yaml +0 -899
  177. data/maps/din-hin-Deva-Latn-33904-2018.yaml +0 -100
  178. data/maps/din-kat-Geor-Latn-32707-2010.yaml +0 -145
  179. data/maps/din-mar-Deva-Latn-33904-2018.yaml +0 -84
  180. data/maps/din-nep-Deva-Latn-33904-2018.yaml +0 -119
  181. data/maps/din-pli-Deva-Latn-33904-2018.yaml +0 -75
  182. data/maps/din-pra-Deva-Latn-33904-2018.yaml +0 -63
  183. data/maps/din-san-Deva-Latn-33904-2018.yaml +0 -338
  184. data/maps/din-tam-Taml-Latn-33903-2016.yaml +0 -213
  185. data/maps/dos-nep-Deva-Latn-1997.yaml +0 -47
  186. data/maps/elot-ell-Grek-Latn-743-1982-tl.yaml +0 -684
  187. data/maps/elot-ell-Grek-Latn-743-1982-ts.yaml +0 -680
  188. data/maps/elot-ell-Grek-Latn-743-2001-tl.yaml +0 -19
  189. data/maps/elot-ell-Grek-Latn-743-2001-ts.yaml +0 -31
  190. data/maps/ggg-kat-Geor-Latn-2002.yaml +0 -92
  191. data/maps/gki-bel-Cyrl-Latn-1992.yaml +0 -33
  192. data/maps/gki-bel-Cyrl-Latn-2000.yaml +0 -201
  193. data/maps/gost-rus-Cyrl-Latn-16876-71-1983.yaml +0 -190
  194. data/maps/gost-rus-Cyrl-Latn-7.79-2000-2002.yaml +0 -157
  195. data/maps/hk-yue-Hani-Latn-1888.yaml +0 -38497
  196. data/maps/icao-bel-Cyrl-Latn-9303.yaml +0 -136
  197. data/maps/icao-bul-Cyrl-Latn-9303.yaml +0 -118
  198. data/maps/icao-fas-Arab-Latn-9303.yaml +0 -103
  199. data/maps/icao-heb-Hebr-Latn-9303.yaml +0 -151
  200. data/maps/icao-mkd-Cyrl-Latn-9303.yaml +0 -117
  201. data/maps/icao-rus-Cyrl-Latn-9303.yaml +0 -117
  202. data/maps/icao-srp-Cyrl-Latn-9303.yaml +0 -117
  203. data/maps/icao-ukr-Cyrl-Latn-9303.yaml +0 -119
  204. data/maps/iso-ara-Arab-Latn-233-1984.yaml +0 -323
  205. data/maps/iso-asm-Beng-Latn-15919-2001.yaml +0 -75
  206. data/maps/iso-ben-Beng-Latn-15919-2001.yaml +0 -175
  207. data/maps/iso-ell-Grek-Latn-843-1997-t1.yaml +0 -613
  208. data/maps/iso-ell-Grek-Latn-843-1997-t2.yaml +0 -44
  209. data/maps/iso-guj-Gujr-Latn-15919-2001.yaml +0 -220
  210. data/maps/iso-hin-Deva-Latn-15919-2001.yaml +0 -87
  211. data/maps/iso-inc-Deva-Latn-15919-2001.yaml +0 -61
  212. data/maps/iso-jpn-Hrkt-Latn-3602-1989.yaml +0 -66
  213. data/maps/iso-kan-Knda-Latn-15919-2001.yaml +0 -220
  214. data/maps/iso-kat-Geor-Latn-9984-1996.yaml +0 -145
  215. data/maps/iso-kor-Hang-Latn-1996-method1.yaml +0 -240
  216. data/maps/iso-kor-Hang-Latn-1996-method2.yaml +0 -226
  217. data/maps/iso-mal-Mlym-Latn-15919-2001.yaml +0 -281
  218. data/maps/iso-mar-Deva-Latn-15919-2001.yaml +0 -75
  219. data/maps/iso-nep-Deva-Latn-15919-2001.yaml +0 -87
  220. data/maps/iso-ori-Orya-Latn-15919-2001.yaml +0 -193
  221. data/maps/iso-pan-Guru-Latn-15919-2001.yaml +0 -222
  222. data/maps/iso-pli-Beng-Latn-15919-2001.yaml +0 -73
  223. data/maps/iso-pli-Deva-Latn-15919-2001.yaml +0 -74
  224. data/maps/iso-pli-Sinh-Latn-15919-2001.yaml +0 -219
  225. data/maps/iso-pli-Thai-Latn-15919-2001.yaml +0 -55
  226. data/maps/iso-pra-Deva-Latn-15919-2001.yaml +0 -59
  227. data/maps/iso-prs-Arab-Latn-233-3-1999.yaml +0 -366
  228. data/maps/iso-rus-Cyrl-Latn-9-1995.yaml +0 -271
  229. data/maps/iso-san-Deva-Latn-15919-2001.yaml +0 -220
  230. data/maps/iso-tam-Taml-Latn-15919-2001.yaml +0 -159
  231. data/maps/iso-tel-Telu-Latn-15919-2001.yaml +0 -220
  232. data/maps/iso-tha-Thai-Latn-11940-1998.yaml +0 -109
  233. data/maps/kp-kor-Hang-Latn-2002.yaml +0 -909
  234. data/maps/lshk-yue-Hani-Latn-jyutping-1993.yaml +0 -44820
  235. data/maps/mext-jpn-Hrkt-Latn-1954.yaml +0 -411
  236. data/maps/mns-mon-Cyrl-Latn-5217-2012.yaml +0 -163
  237. data/maps/mns-mon-Latn-Cyrl-5217-2012.yaml +0 -200
  238. data/maps/moct-kor-Hang-Latn-2000.yaml +0 -807
  239. data/maps/mofa-jpn-Hrkt-Latn-1989.yaml +0 -541
  240. data/maps/mvd-bel-Cyrl-Latn-2008.yaml +0 -225
  241. data/maps/mvd-bel-Cyrl-Latn-2010.yaml +0 -63
  242. data/maps/mvd-rus-Cyrl-Latn-2008.yaml +0 -109
  243. data/maps/mvd-rus-Cyrl-Latn-2010.yaml +0 -37
  244. data/maps/odni-ara-Arab-Latn-2015.yaml +0 -425
  245. data/maps/odni-aze-Cyrl-Latn-2015.yaml +0 -144
  246. data/maps/odni-bel-Cyrl-Latn-2015.yaml +0 -148
  247. data/maps/odni-bul-Cyrl-Latn-2015.yaml +0 -96
  248. data/maps/odni-che-Cyrl-Latn-2015.yaml +0 -169
  249. data/maps/odni-fas-Arab-Latn-2015.yaml +0 -406
  250. data/maps/odni-hin-Deva-Latn-2015.yaml +0 -258
  251. data/maps/odni-kat-Geor-Latn-2015.yaml +0 -87
  252. data/maps/odni-kaz-Cyrl-Latn-2015.yaml +0 -148
  253. data/maps/odni-kir-Cyrl-Latn-2015.yaml +0 -136
  254. data/maps/odni-kor-Hang-Latn-2015.yaml +0 -375
  255. data/maps/odni-mkd-Cyrl-Latn-2015.yaml +0 -122
  256. data/maps/odni-per-Arab-Latn-2015.yaml +0 -228
  257. data/maps/odni-rus-Cyrl-Latn-2015.yaml +0 -77
  258. data/maps/odni-srp-Cyrl-Latn-2015.yaml +0 -129
  259. data/maps/odni-tat-Cyrl-Latn-2015.yaml +0 -142
  260. data/maps/odni-tgk-Cyrl-Latn-2015.yaml +0 -148
  261. data/maps/odni-uig-Cyrl-Latn-2015.yaml +0 -138
  262. data/maps/odni-ukr-Cyrl-Latn-2015.yaml +0 -157
  263. data/maps/odni-urd-Arab-Latn-2015.yaml +0 -221
  264. data/maps/odni-uzb-Cyrl-Latn-2015.yaml +0 -166
  265. data/maps/royin-tha-Thai-Latn-1939-generic.yaml +0 -90
  266. data/maps/royin-tha-Thai-Latn-1968.yaml +0 -183
  267. data/maps/royin-tha-Thai-Latn-1999-chained.yaml +0 -180
  268. data/maps/royin-tha-Thai-Latn-1999.yaml +0 -80
  269. data/maps/sac-zho-Hans-Latn-1979.yaml +0 -24763
  270. data/maps/sasm-mon-Mong-Latn-general-1978.yaml +0 -389
  271. data/maps/sasm-mon-Mong-Latn-phonetic-1978.yaml +0 -354
  272. data/maps/ses-ara-Arab-Latn-1930.yaml +0 -283
  273. data/maps/stategeocadastre-ukr-Cyrl-Latn-1993.yaml +0 -222
  274. data/maps/ua-ukr-Cyrl-Latn-1996.yaml +0 -197
  275. data/maps/ua-ukr-Cyrl-Latn-2007.yaml +0 -75
  276. data/maps/ua-ukr-Cyrl-Latn-2010.yaml +0 -192
  277. data/maps/un-amh-Ethi-Latn-2016.yaml +0 -602
  278. data/maps/un-ara-Arab-Latn-1971.yaml +0 -139
  279. data/maps/un-ara-Arab-Latn-1972.yaml +0 -159
  280. data/maps/un-ara-Arab-Latn-2017.yaml +0 -420
  281. data/maps/un-bel-Cyrl-Latn-2007.yaml +0 -114
  282. data/maps/un-ben-Beng-Latn-2016.yaml +0 -534
  283. data/maps/un-ell-Grek-Latn-1987-phonetic.yaml +0 -780
  284. data/maps/un-ell-Grek-Latn-1987-tl.yaml +0 -31
  285. data/maps/un-ell-Grek-Latn-1987-ts.yaml +0 -19
  286. data/maps/un-hin-Deva-Latn-2016.yaml +0 -222
  287. data/maps/un-mar-Deva-Latn-2016.yaml +0 -91
  288. data/maps/un-mon-Mong-Latn-general-2013.yaml +0 -264
  289. data/maps/un-mon-Mong-Latn-phonetic-2013.yaml +0 -264
  290. data/maps/un-nep-Deva-Latn-1972.yaml +0 -350
  291. data/maps/un-nep-Deva-Latn-2013.yaml +0 -74
  292. data/maps/un-rus-Cyrl-Latn-1987.yaml +0 -166
  293. data/maps/un-ukr-Cyrl-Latn-1998.yaml +0 -53
  294. data/maps/un-ukr-Cyrl-Latn-2012.yaml +0 -162
  295. data/maps/var-hin-Deva-Latn-hunterian-1872.yaml +0 -221
  296. data/maps/var-jpn-Hrkt-Latn-hepburn-1886.yaml +0 -406
  297. data/maps/var-jpn-Hrkt-Latn-hepburn-1954.yaml +0 -386
  298. data/maps/var-kor-Hang-Hang-jamo.yaml +0 -11193
  299. data/maps/var-kor-Hang-Latn-mr-1939.yaml +0 -1054
  300. data/maps/var-kor-Kore-Hang-2013.yaml +0 -59754
  301. data/maps/var-kor-Kore-Latn-mr-1939.yaml +0 -36
  302. data/maps/var-mar-Deva-Latn-hunterian-1872.yaml +0 -43
  303. data/maps/var-mon-Mong-Latn-1930.yaml +0 -102
  304. data/maps/var-mon-Mong-Latn-lessing.yaml +0 -272
  305. data/maps/var-mon-Mong-Latn-vpmc.yaml +0 -274
  306. data/maps/var-pra-Deva-Latn-iast-1912.yaml +0 -30
  307. data/maps/var-san-Deva-Latn-iast-1912.yaml +0 -149
  308. data/maps/var-tha-Thai-Thai-phonemic.yaml +0 -59
  309. data/maps/var-tha-Thai-Zsym-ipa.yaml +0 -301
  310. data/maps/var-zho-Hani-Latn-wd-1979.yaml +0 -38912
  311. data/spec/interscript/filenames_spec.rb +0 -384
  312. data/spec/interscript/mapping_spec.rb +0 -42
  313. data/spec/interscript_spec.rb +0 -29
  314. data/spec/spec_helper.rb +0 -3
data/README.adoc DELETED
@@ -1,298 +0,0 @@
1
- = Interscript: Interoperable Script Conversion Systems, with a Ruby implementation
2
-
3
- image:https://github.com/interscript/interscript/workflows/test/badge.svg["Ruby build status", link="https://github.com/interscript/interscript/actions?workflow=test"]
4
- image:https://github.com/interscript/interscript/workflows/js/badge.svg["JavaScript build status", link="https://github.com/interscript/interscript/actions?workflow=js"]
5
-
6
- == Introduction
7
-
8
- This repository contains interoperable transliteration schemes from:
9
-
10
- * ALA-LC
11
- * BGN/PCGN
12
- * ICAO
13
- * ISO
14
- * UN (by UNGEGN)
15
- * Many, many other script conversion system authorities.
16
-
17
- The goal is to achieve interoperable transliteration schemes allowing quality comparisons.
18
-
19
-
20
-
21
- == Demonstration
22
-
23
- These transliteration systems are used in the demo:
24
-
25
- `bgnpcgn-rus-Cyrl-Latn-1947`:: BGN/PCGN Romanization of Russian
26
- `iso-rus-Cyrl-Latn-9-1995`:: ISO 9 Romanization of Russian
27
- `icao-rus-Cyrl-Latn-9303`:: ICAO MRZ Romanization of Russian
28
- `bas-rus-Cyrl-Latn-2017-bss`:: Bulgarian Academy of Sciences Streamlined System for Russian
29
-
30
- image:demo/20191118-interscript-demo-cast.gif["interscript screencast"]
31
-
32
-
33
- == Installation
34
-
35
- === Prerequisites
36
-
37
- Linux:
38
-
39
- [source,sh]
40
- ----
41
- apt-get install swig python3-setuptools
42
- ----
43
-
44
- Windows:
45
-
46
- [source,sh]
47
- ----
48
- choco install --no-progress swig
49
- ----
50
-
51
- Interscript depends on Python and the https://github.com/sequitur-g2p/sequitur-g2p[`sequitur-g2p`] module
52
-
53
- [source,sh]
54
- ----
55
- pip3 install setuptools numpy
56
- curl -sSL -o sequitur-g2p.zip https://github.com/sequitur-g2p/sequitur-g2p/archive/806273f.zip
57
- pip3 install sequitur-g2p.zip
58
- ----
59
-
60
- Interscript depends on Ruby. Once you manage to install Ruby, it's easy.
61
-
62
- [source,sh]
63
- ----
64
- gem install interscript
65
- ----
66
-
67
- == Usage
68
-
69
- Assume you have a file ready in the source script like this:
70
-
71
- [source,sh]
72
- ----
73
- cat <<EOT > rus-Cyrl.txt
74
- Эх, тройка! птица тройка, кто тебя выдумал? знать, у бойкого народа ты
75
- могла только родиться, в той земле, что не любит шутить, а
76
- ровнем-гладнем разметнулась на полсвета, да и ступай считать версты,
77
- пока не зарябит тебе в очи. И не хитрый, кажись, дорожный снаряд, не
78
- железным схвачен винтом, а наскоро живьём с одним топором да долотом
79
- снарядил и собрал тебя ярославский расторопный мужик. Не в немецких
80
- ботфортах ямщик: борода да рукавицы, и сидит чёрт знает на чём; а
81
- привстал, да замахнулся, да затянул песню — кони вихрем, спицы в
82
- колесах смешались в один гладкий круг, только дрогнула дорога, да
83
- вскрикнул в испуге остановившийся пешеход — и вон она понеслась,
84
- понеслась, понеслась!
85
-
86
- Н.В. Гоголь
87
- EOT
88
- ----
89
-
90
- You can run `interscript` on this text using different transliteration systems.
91
-
92
- [source,sh]
93
- ----
94
- interscript rus-Cyrl.txt \
95
- --system=bgnpcgn-rus-Cyrl-Latn-1947 \
96
- --output=bgnpcgn-rus-Latn.txt
97
-
98
- interscript rus-Cyrl.txt \
99
- --system=iso-rus-Cyrl-Latn-9-1995 \
100
- --output=iso-rus-Latn.txt
101
-
102
- interscript rus-Cyrl.txt \
103
- --system=icao-rus-Cyrl-Latn-9303 \
104
- --output=icao-rus-Latn.txt
105
-
106
- interscript rus-Cyrl.txt \
107
- --system=bas-rus-Cyrl-Latn-2017-bss \
108
- --output=bas-rus-Latn.txt
109
- ----
110
-
111
- It is then easy to see the exact differences in rendering between the systems.
112
-
113
- [source,sh]
114
- ----
115
- diff bgnpcgn-rus-Latn.txt bas-rus-Latn.txt
116
- ----
117
-
118
- == Adding transliteration system
119
-
120
- Transliteration systems are stored in the `maps/` directory as YAML files.
121
- You can create a new file and add it to the directory.
122
-
123
- The file should be named as `<system-code>.yaml`, where `system-code`
124
- is in accordance with
125
- http://calconnect.gitlab.io/tc-localization/csd-transcription-systems[ISO/CC 24229].
126
-
127
- === File structure
128
-
129
- [source,yaml]
130
- ----
131
- authority_id: bgnpcgn
132
- id: 1947
133
- language: rus
134
- source_script: Cyrl
135
- destination_script: Latn
136
- name: ROMANIZATION OF RUSSIAN, BGN/PCGN 1947 System
137
- url: https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/807920/ROMANIZATION_OF_RUSSIAN.pdf
138
- creation_date: 1947
139
- confirmation_date: 2019-06
140
- description: The BGN/PCGN system for Russian was adopted ...
141
-
142
- notes:
143
- - The character e should be romanized ye initially, after the vowel ...
144
-
145
- tests:
146
- - source: ДЛИННОЕ ПОКРЫВАЛО
147
- expected: DLINNOYE POKRYVALO
148
- - source: Еловая шишка
149
- expected: Yelovaya shishka
150
-
151
- map:
152
- rules:
153
- - pattern: (?<=[АаЕеЁёИиОоУуЫыЭэЮюЯяЙйЪъЬь])\u0415 # Е after a, e, ё, и, о, у, ы, э, ю, я, й, ъ, ь
154
- result: Ye
155
- - pattern: \b\u0415 # Е initially
156
- result: Ye
157
-
158
- characters:
159
- "\u0410": "A"
160
- "\u0411": "B"
161
- "\u0412": "V"
162
- ----
163
-
164
-
165
- === Rules
166
-
167
- The `rules` subsection is placed under the `map` key. All rules are applied in the order in which they are listed, before the `characters` subsection is applied. Rules apply to the original text, not to the result of previously applied rules.
168
-
169
- Each rule has `pattern` and `result` elements.
170
-
171
- Pattern is a regular expression. It should be written as a string without the `//` or `%r{}` delimiters, for example `\b\u0415`. If a rule depends on the preceding or following content, lookbehind or lookahead can be used. For example, a rule with the pattern `(?<=[АаЕеЁёИиОоУуЫыЭэЮюЯяЙйЪъЬь])\u0415` finds every Е that follows an upper- or lower-case a, e, ё, и, о, у, ы, э, ю, я, й, ъ, ь.
172
-
173
- Result is the replacement for the pattern's match. It can contain a string, Unicode characters specified by hexadecimal numbers, or a captured group reference. A string containing a hexadecimal number or a captured group reference should be double quoted, for example `"Y\u00eb"` or `"\\1\u00b7\\2"`. Captured groups are referred to by a double backslash followed by the group's number.
174
-
175
- Because rules are applied in order, multiple rules applicable to the same segment of a string can be addressed by rule ordering, and rules can be used as priority over characters. For example:
176
-
177
- [source,yaml]
178
- ----
179
- map:
180
- rules:
181
- - pattern: \u03B3\u03B3 # γ (before Γ, Ξ, Χ)
182
- result: ng
183
- - pattern: (?<![Γγ])\u03B3(?=[ΕεέΗηήΙιίΥυύ]) # γ (before front vowels)
184
- result: y
185
- ----
186
-
187
- (γι maps to `yi`; but γγ maps to `ng`. In the case of γγι, the first rule takes priority, and the transliteration is `ngi`: it makes the second rule impossible.)
188
-
189
- [source,yaml]
190
- ----
191
- map:
192
- rules:
193
- - pattern: (?<=\b)\u03BC[πΠ] # μπ (initially)
194
- result: b
195
- - pattern: \u03BC[πΠ] # μπ (medially)
196
- result: mb
197
- ----
198
-
199
- (The first rule applies at the start of a word; the second rule does not specify a context, as it applies in all other cases not covered by the first rule.)
200
-
201
- [source,yaml]
202
- ----
203
- map:
204
- rules:
205
- - pattern: ";"
206
- result: "?"
207
-
208
- characters:
209
- "\u00B7": ";"
210
- ----
211
-
212
- (This guarantees that any `;` are converted to `?` before any new `;` are introduced; because all three are Latin script, they could be mixed up in ordering.)
213
-
214
- Normally rules "`bleed`" each other: once a rule applies to a segment, that segment cannot trigger other rules, because it is already converted to Roman. Exceptionally, it will be necessary to have a rule add or remove characters in the original script, rather than transliterate them, so that the same context can be invoked by two rules in succession:
215
-
216
- [source,yaml]
217
- ----
218
- map:
219
- rules:
220
- - pattern: (?<=[АаЕеЁёИиОоУуЫыЭэЮюЯя])\u042b # Ы after any vowel character
221
- result: "\u00b7Ы"
222
- - pattern: \u042b(?=[АаУуЫыЭэ]) # Ы before а, у, ы, or э
223
- result: "Ы\u00b7"
224
- ----
225
-
226
- (If the result were `\u00B7Y`, the second rule could not be applied afterwards; but we want ОЫУ to transliterate as `O·Y·U`. In order to make that happen, we preserve the Ы during the rules phase, resulting in О·Ы·У; we only convert the letters to Roman script in the `characters` phase.)
227
-
228
- === Testing transliteration systems
229
-
230
- To test all transliteration systems in the `maps/` directory, run:
231
-
232
- [source,sh]
233
- ----
234
- bundle exec rspec
235
- ----
236
-
237
- The command takes `source` texts from the `tests` section, transforms
238
- them using `rules` and `characters` from the `map` key, and compares the
239
- results with the `expected` text of the same test entry.
240
-
241
- To test a specific transliteration system, set the environment variable
242
- `TRANSLIT_SYSTEM` to the system code of the desired system
243
- (i.e. the "`basename`" of the system's YAML file):
244
-
245
- [source,sh]
246
- ----
247
- TRANSLIT_SYSTEM=bgnpcgn-rus-Cyrl-Latn-1947 bundle exec rspec
248
- ----
249
-
250
-
251
- == ISCS system codes
252
-
253
- In accordance with
254
- http://calconnect.gitlab.io/tc-localization/csd-transcription-systems[ISO/CC 24229],
255
- the system code identifying a script conversion system has the following components:
256
-
257
- e.g. `bgnpcgn-rus-Cyrl-Latn-1947`:
258
-
259
- `bgnpcgn`:: the authority identifier
260
- `rus`:: an ISO 639-{1,2,3,5} language code that this system applies to (For 639-2, use (T) code)
261
- `Cyrl`:: an ISO 15924 script code, identifying the source script
262
- `Latn`:: an ISO 15924 script code, identifying the target script
263
- `1947`:: an identifier unit within the authority to identify this system
264
-
265
-
266
- == Covered languages
267
-
268
- Currently the schemes cover Cyrillic, Armenian, Greek, Arabic and Hebrew.
269
-
270
-
271
- == Samples to play with
272
-
273
- * `rus-Cyrl-1.txt`: Copied from the XLS output from http://www.primorsk.vybory.izbirkom.ru/region/primorsk?action=show&global=true&root=254017025&tvd=4254017212287&vrn=100100067795849&prver=0&pronetvd=0&region=25&sub_region=25&type=242&vibid=4254017212287
274
-
275
- * `rus-Cyrl-2.txt`: Copied from the XLS output from http://www.yaroslavl.vybory.izbirkom.ru/region/yaroslavl?action=show&root=764013001&tvd=4764013188704&vrn=4764013188693&prver=0&pronetvd=0&region=76&sub_region=76&type=426&vibid=4764013188704
276
-
277
-
278
- == References
279
-
280
- Reference documents are located at the
281
- https://github.com/interscript/interscript-references[interscript-references repository].
282
- Some specifications that have distribution limitations may not be reproduced there.
283
-
284
-
285
- == Links to system definitions
286
-
287
- * https://www.iso.org/committee/48750.html[ISO/TC 46 (see standards published by WG 3)]
288
- * http://geonames.nga.mil/gns/html/romanization.html[BGN/PCGN and BGN Romanization systems (BGN)]
289
- * https://www.gov.uk/government/publications/romanization-systems[BGN/PCGN Romanization systems (PCGN)]
290
- * https://www.loc.gov/catdir/cpso/roman.html[ALA-LC Romanization systems in current use]
291
- * http://catdir.loc.gov/catdir/cpso/roman.html[ALA-LC Romanization systems from 1997]
292
- * http://www.eki.ee/wgrs/[UN Romanization systems]
293
- * http://www.eki.ee/knab/kblatyl2.htm[EKI KNAB systems]
294
-
295
- == Copyright and license
296
-
297
- This is a Ribose project. Copyright Ribose.
298
-
data/lib/g2pwrapper.py DELETED
@@ -1,34 +0,0 @@
1
- import g2p, SequiturTool
2
- import numpy
3
-
4
- def transliterate(model, word):
5
-
6
- class Struct:
7
- def __init__(self, **entries):
8
- self.__dict__.update(entries)
9
-
10
- model_path = {
11
- 'pythainlp_lexicon': './lib/model-7',
12
- 'wiktionary_phonemic': './lib/tha-pt-b-7'
13
- }
14
-
15
- connector_dict = {
16
- 'pythainlp_lexicon': '',
17
- 'wiktionary_phonemic': '-'
18
- }
19
-
20
-
21
- modelFile = model_path[model]
22
- connector = connector_dict[model]
23
-
24
- options = Struct(**{'profile': None, 'resource_usage': None, 'psyco': None, 'tempdir': None, 'trainSample': None, 'develSample': None, 'testSample': None, 'checkpoint': None, 'resume_from_checkpoint': None, 'shouldTranspose': None, 'modelFile': modelFile , 'newModelFile': None, 'shouldTestContinuously': None, 'shouldSelfTest': None, 'lengthConstraints': None, 'shouldSuppressNewMultigrams': None, 'viterbi': None, 'shouldRampUp': None, 'shouldWipeModel': None, 'shouldInitializeWithCounts': None, 'minIterations': 20, 'maxIterations': 100, 'eager_discount_adjustment': None, 'fixed_discount': None, 'encoding': 'UTF-8', 'phoneme_to_phoneme': None, 'test_segmental': None, 'testResult': None, 'applySample': None, 'applyWord': word, 'variants_mass': None, 'variants_number': None, 'fakeTranslator': None, 'stack_limit': None})
25
-
26
- loadSample = g2p.loadG2PSample
27
-
28
- model = SequiturTool.procureModel(options, loadSample)
29
- if not model:
30
- return 1
31
- translator = g2p.Translator(model)
32
- del model
33
-
34
- return connector.join(translator(tuple(word)))
@@ -1,69 +0,0 @@
1
- require 'pathname'
2
-
3
- module Interscript
4
- module Fs
5
- def sub_replace(string, pos, size, repl)
6
- string[pos..pos + size - 1] = repl
7
- string
8
- end
9
-
10
- def root_path
11
- @root_path ||= Pathname.new(File.join(File.dirname(__dir__), ".."))
12
- end
13
-
14
- def transliterate_file(system_code, input_file, output_file, maps={})
15
- input = File.read(input_file)
16
- output = transliterate(system_code, input, maps)
17
-
18
- File.open(output_file, 'w') do |f|
19
- f.puts(output)
20
- end
21
-
22
- puts "Output written to: #{output_file}"
23
- output_file
24
- end
25
-
26
- def import_python_modules
27
- begin
28
- pyimport :g2pwrapper
29
- rescue
30
- pyimport :sys
31
- sys.path.append(root_path.to_s + "/lib/")
32
- pyimport :g2pwrapper
33
- end
34
- end
35
-
36
- def external_process(process_name, string)
37
- import_python_modules
38
-
39
- case process_name
40
- when 'sequitur.pythainlp_lexicon'
41
- return g2pwrapper.transliterate('pythainlp_lexicon', string)
42
- when 'sequitur.wiktionary_phonemic'
43
- return g2pwrapper.transliterate('wiktionary_phonemic', string)
44
- else
45
- raise ExternalProcessNotRecognizedError.new
46
- end
47
-
48
- rescue
49
- raise ExternalProcessUnavailableError.new
50
- end
51
-
52
- def external_processing(mapping, string)
53
- # Segmentation
54
- string = external_process(mapping.segmentation, string) if mapping.segmentation
55
-
56
- # Transliteration/Transcription
57
- string = external_process(mapping.transcription, string) if mapping.transcription
58
-
59
- string
60
- end
61
-
62
- private
63
-
64
- def mkregexp(regexpstring)
65
- /#{regexpstring}/u
66
- end
67
-
68
- end
69
- end
@@ -1,142 +0,0 @@
1
- require 'rambling-trie'
2
- require 'yaml' unless RUBY_ENGINE == 'opal'
3
- require 'json'
4
-
5
- module Interscript
6
-
7
- class Mapping
8
- attr_reader(
9
- :id,
10
- :url,
11
- :name,
12
- :notes,
13
- :rules,
14
- :tests,
15
- :language,
16
- :postrules,
17
- :characters,
18
- :description,
19
- :authority_id,
20
- :creation_date,
21
- :source_script,
22
- :destination_script,
23
- :chain,
24
- :character_separator,
25
- :word_separator,
26
- :title_case,
27
- :downcase,
28
- :dictionary,
29
- :characters_hash,
30
- :dictionary_hash,
31
- :segmentation,
32
- :transcription,
33
- :dictionary_trie
34
- )
35
-
36
- def initialize(system_code, options = {})
37
- @system_code = system_code
38
- @depth = options.fetch(:depth, 0).to_i
39
-
40
- unless RUBY_ENGINE == 'opal'
41
- @system_path = options.fetch(:system_code, default_path)
42
- end
43
-
44
- load_and_serialize_system_mappings
45
- end
46
-
47
- def self.for(system_code, options = {})
48
- new(system_code, options)
49
- end
50
-
51
- def load_and_serialize_system_mappings
52
- return if depth >= 5
53
-
54
- mappings = load_system_mappings
55
- serialize_system_mappings(mappings)
56
- end
57
-
58
- private
59
-
60
- attr_reader :depth, :system_code, :system_path
61
-
62
- def system_code_file
63
- [system_code, "yaml"].join(".")
64
- end
65
-
66
- def default_path
67
- @default_path ||= Interscript.root_path.join("maps")
68
- end
69
-
70
- def load_system_mappings
71
- if RUBY_ENGINE == 'opal'
72
- load_opal_mappings
73
- else
74
- load_fs_mappings
75
- end
76
- end
77
-
78
- def load_opal_mappings
79
- JSON.parse(`InterscriptMaps[#{system_code}]`)
80
- end
81
-
82
- def load_fs_mappings
83
- YAML.load_file(system_path.join(system_code_file))
84
- rescue Errno::ENOENT
85
- raise Interscript::InvalidSystemError.new("No system mappings found")
86
- end
87
-
88
- def serialize_system_mappings(mappings)
89
- @id = mappings.fetch("id", nil)
90
- @url = mappings.fetch("url", nil)
91
- @name = mappings.fetch("name", nil)
92
- @notes = mappings.fetch("notes", nil)
93
- @tests = mappings.fetch("tests", [])
94
- @language = mappings.fetch("language", nil)
95
- @description = mappings.fetch("description", nil)
96
- @authority_id = mappings.fetch("authority_id", nil)
97
- @creation_date = mappings.fetch("creation_date", nil)
98
- @source_script = mappings.fetch("source_script", nil)
99
- @destination_script = mappings.fetch("destination_script", nil)
100
- @chain = mappings.fetch("chain", [])
101
- @character_separator = mappings["map"]["character_separator"] || nil
102
- @word_separator = mappings["map"]["word_separator"] || nil
103
- @title_case = mappings["map"]["title_case"] || false
104
- @downcase = mappings["map"]["downcase"] || false
105
- @rules = mappings["map"]["rules"] || []
106
- @postrules = mappings["map"]["postrules"] || []
107
- @characters = mappings["map"]["characters"] || {}
108
- @dictionary = mappings["map"]["dictionary"] || {}
109
- @segmentation = mappings["map"]["segementation"] || nil
110
- @transcription = mappings["map"]["transcription"] || nil
111
-
112
- include_inherited_mappings(mappings)
113
- build_hashes
114
- build_trie
115
- end
116
-
117
- def include_inherited_mappings(mappings)
118
- inherit_systems = [].push(mappings["map"]["inherit"]).flatten
119
-
120
- inherit_systems.each do |inherit_system|
121
- next unless inherit_system
122
-
123
- inherited_mapping = Mapping.for(inherit_system, depth: depth + 1)
124
-
125
- @rules = [rules, inherited_mapping.rules].flatten
126
- @postrules = [inherited_mapping.postrules, postrules].flatten
127
- @characters = (inherited_mapping.characters|| {}).merge(characters)
128
- @dictionary = (inherited_mapping.dictionary|| {}).merge(dictionary)
129
- end
130
- end
131
-
132
- def build_hashes
133
- @characters_hash = characters&.sort_by { |k, _v| k.size }&.reverse&.to_h
134
- @dictionary_hash = dictionary&.sort_by { |k, _v| k.size }&.reverse&.to_h
135
- end
136
-
137
- def build_trie
138
- @dictionary_trie = Rambling::Trie.create
139
- dictionary_trie.concat dictionary.keys
140
- end
141
- end
142
- end