interscript 0.1.6 → 2.1.0a9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (226) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +11 -0
  3. data/.rspec +3 -0
  4. data/Gemfile +29 -0
  5. data/LICENSE.adoc +31 -0
  6. data/README.md +3 -0
  7. data/Rakefile +53 -0
  8. data/bin/console +14 -0
  9. data/bin/interscript +3 -39
  10. data/bin/maps_analyze_staging +168 -0
  11. data/bin/maps_debug_compilers +58 -0
  12. data/bin/maps_debug_ordering +88 -0
  13. data/bin/maps_debug_ruby_compile +24 -0
  14. data/bin/maps_debug_step_by_step +44 -0
  15. data/bin/maps_optimize_order +112 -0
  16. data/bin/maps_v1_analyze_regexps +45 -0
  17. data/bin/maps_v1_to_v2 +426 -0
  18. data/exe/interscript +6 -0
  19. data/interscript.gemspec +31 -0
  20. data/lib/interscript.rb +81 -127
  21. data/lib/interscript/command.rb +5 -5
  22. data/lib/interscript/compiler.rb +22 -0
  23. data/lib/interscript/compiler/javascript.rb +292 -0
  24. data/lib/interscript/compiler/ruby.rb +262 -0
  25. data/lib/interscript/dsl.rb +67 -0
  26. data/lib/interscript/dsl/aliases.rb +23 -0
  27. data/lib/interscript/dsl/document.rb +46 -0
  28. data/lib/interscript/dsl/group.rb +45 -0
  29. data/lib/interscript/dsl/group/parallel.rb +6 -0
  30. data/lib/interscript/dsl/items.rb +89 -0
  31. data/lib/interscript/dsl/metadata.rb +26 -0
  32. data/lib/interscript/dsl/stage.rb +6 -0
  33. data/lib/interscript/dsl/symbol_mm.rb +11 -0
  34. data/lib/interscript/dsl/tests.rb +12 -0
  35. data/lib/interscript/interpreter.rb +251 -0
  36. data/lib/interscript/node.rb +25 -0
  37. data/lib/interscript/node/alias_def.rb +15 -0
  38. data/lib/interscript/node/dependency.rb +13 -0
  39. data/lib/interscript/node/document.rb +45 -0
  40. data/lib/interscript/node/group.rb +34 -0
  41. data/lib/interscript/node/group/parallel.rb +9 -0
  42. data/lib/interscript/node/group/sequential.rb +2 -0
  43. data/lib/interscript/node/item.rb +52 -0
  44. data/lib/interscript/node/item/alias.rb +42 -0
  45. data/lib/interscript/node/item/any.rb +61 -0
  46. data/lib/interscript/node/item/capture.rb +50 -0
  47. data/lib/interscript/node/item/group.rb +51 -0
  48. data/lib/interscript/node/item/repeat.rb +40 -0
  49. data/lib/interscript/node/item/stage.rb +23 -0
  50. data/lib/interscript/node/item/string.rb +51 -0
  51. data/lib/interscript/node/metadata.rb +18 -0
  52. data/lib/interscript/node/rule.rb +6 -0
  53. data/lib/interscript/node/rule/funcall.rb +18 -0
  54. data/lib/interscript/node/rule/run.rb +15 -0
  55. data/lib/interscript/node/rule/sub.rb +65 -0
  56. data/lib/interscript/node/stage.rb +19 -0
  57. data/lib/interscript/node/tests.rb +15 -0
  58. data/lib/interscript/stdlib.rb +211 -0
  59. data/lib/interscript/utils/regexp_converter.rb +283 -0
  60. data/lib/interscript/version.rb +1 -1
  61. data/requirements.txt +1 -0
  62. metadata +75 -339
  63. data/README.adoc +0 -298
  64. data/bin/rspec +0 -29
  65. data/lib/__pycache__/g2pwrapper.cpython-38.pyc +0 -0
  66. data/lib/g2pwrapper.py +0 -34
  67. data/lib/interscript-opal.rb +0 -2
  68. data/lib/interscript/fs.rb +0 -71
  69. data/lib/interscript/mapping.rb +0 -142
  70. data/lib/interscript/opal.rb +0 -27
  71. data/lib/interscript/opal/maps.js.erb +0 -10
  72. data/lib/interscript/opal_map_translate.rb +0 -12
  73. data/lib/model-7 +0 -0
  74. data/lib/tha-pt-b-7 +0 -0
  75. data/maps/acadsin-zho-Hani-Latn-2002.yaml +0 -38912
  76. data/maps/alalc-amh-Ethi-Latn-1997.yaml +0 -509
  77. data/maps/alalc-amh-Ethi-Latn-2011.yaml +0 -138
  78. data/maps/alalc-ara-Arab-Latn-1997.yaml +0 -1283
  79. data/maps/alalc-asm-Deva-Latn-1997.yaml +0 -159
  80. data/maps/alalc-aze-Cyrl-Latn-1997.yaml +0 -141
  81. data/maps/alalc-bel-Cyrl-Latn-1997.yaml +0 -125
  82. data/maps/alalc-ben-Beng-Latn-2017.yaml +0 -130
  83. data/maps/alalc-bul-Cyrl-Latn-1997.yaml +0 -94
  84. data/maps/alalc-ell-Grek-Latn-1997.yaml +0 -624
  85. data/maps/alalc-ell-Grek-Latn-2010.yaml +0 -627
  86. data/maps/alalc-hin-Deva-Latn-2020.yaml +0 -159
  87. data/maps/alalc-kat-Geok-Latn-1997.yaml +0 -111
  88. data/maps/alalc-kat-Geor-Latn-1997.yaml +0 -146
  89. data/maps/alalc-kor-Hang-Latn-1997.yaml +0 -94
  90. data/maps/alalc-mar-Deva-Latn-1997.yaml +0 -170
  91. data/maps/alalc-mkd-Cyrl-Latn-1997.yaml +0 -114
  92. data/maps/alalc-mkd-Cyrl-Latn-2013.yaml +0 -103
  93. data/maps/alalc-pan-Deva-Latn-1997.yaml +0 -237
  94. data/maps/alalc-rus-Cyrl-Latn-1997.yaml +0 -221
  95. data/maps/alalc-rus-Cyrl-Latn-2012.yaml +0 -162
  96. data/maps/alalc-srp-Cyrl-Latn-1997.yaml +0 -114
  97. data/maps/alalc-srp-Cyrl-Latn-2013.yaml +0 -135
  98. data/maps/alalc-ukr-Cyrl-Latn-1997.yaml +0 -141
  99. data/maps/alalc-ukr-Cyrl-Latn-2011.yaml +0 -16
  100. data/maps/apcbg-bul-Cyrl-Latn-1995.yaml +0 -283
  101. data/maps/bas-rus-Cyrl-Latn-2017-bss.yaml +0 -174
  102. data/maps/bas-rus-Cyrl-Latn-2017-oss.yaml +0 -169
  103. data/maps/bgn-jpn-Hrkt-Latn-1962.yaml +0 -292
  104. data/maps/bgn-kor-Hang-Latn-1943.yaml +0 -31
  105. data/maps/bgn-kor-Kore-Latn-1943.yaml +0 -31
  106. data/maps/bgna-bul-Cyrl-Latn-2006.yaml +0 -208
  107. data/maps/bgna-bul-Cyrl-Latn-2009.yaml +0 -208
  108. data/maps/bgnpcgn-amh-Ethi-Latn-1967.yaml +0 -528
  109. data/maps/bgnpcgn-ara-Arab-Latn-1956.yaml +0 -592
  110. data/maps/bgnpcgn-arm-Armn-Latn-1981.yaml +0 -108
  111. data/maps/bgnpcgn-aze-Cyrl-Latn-1993.yaml +0 -104
  112. data/maps/bgnpcgn-bak-Cyrl-Latn-2007.yaml +0 -184
  113. data/maps/bgnpcgn-bel-Cyrl-Latn-1979.yaml +0 -285
  114. data/maps/bgnpcgn-bul-Cyrl-Latn-1952.yaml +0 -115
  115. data/maps/bgnpcgn-bul-Cyrl-Latn-2013.yaml +0 -38
  116. data/maps/bgnpcgn-ell-Grek-Latn-1962.yaml +0 -701
  117. data/maps/bgnpcgn-ell-Grek-Latn-1996.yaml +0 -19
  118. data/maps/bgnpcgn-jpn-Hrkt-Latn-1976.yaml +0 -257
  119. data/maps/bgnpcgn-kat-Geor-Latn-1981.yaml +0 -127
  120. data/maps/bgnpcgn-kat-Geor-Latn-2009.yaml +0 -42
  121. data/maps/bgnpcgn-kor-Hang-Latn-kn-1945.yaml +0 -253
  122. data/maps/bgnpcgn-kor-Hang-Latn-rok-2011.yaml +0 -48
  123. data/maps/bgnpcgn-kor-Kore-Latn-rok-2011.yaml +0 -48
  124. data/maps/bgnpcgn-mkd-Cyrl-Latn-1981.yaml +0 -159
  125. data/maps/bgnpcgn-mkd-Cyrl-Latn-2013.yaml +0 -190
  126. data/maps/bgnpcgn-nep-Deva-Latn-2011.yaml +0 -200
  127. data/maps/bgnpcgn-per-Arab-Latn-1956.yaml +0 -92
  128. data/maps/bgnpcgn-rus-Cyrl-Latn-1947.yaml +0 -314
  129. data/maps/bgnpcgn-srp-Cyrl-Latn-2005.yaml +0 -166
  130. data/maps/bgnpcgn-ukr-Cyrl-Latn-1965.yaml +0 -162
  131. data/maps/bgnpcgn-ukr-Cyrl-Latn-2019.yaml +0 -208
  132. data/maps/bgnpcgn-zho-Hans-Latn-1979.yaml +0 -7456
  133. data/maps/bis-asm-Beng-Latn-13194-1991.yaml +0 -159
  134. data/maps/bis-ben-Beng-Latn-13194-1991.yaml +0 -156
  135. data/maps/bis-dev-Deva-Latn-13194-1991.yaml +0 -184
  136. data/maps/bis-gjr-Gujr-Latn-13194-1991.yaml +0 -166
  137. data/maps/bis-knd-Knda-Latn-13194-1991.yaml +0 -173
  138. data/maps/bis-mlm-Mlym-Latn-13194-1991.yaml +0 -176
  139. data/maps/bis-ori-Orya-Latn-13194-1991.yaml +0 -160
  140. data/maps/bis-pnj-Guru-Latn-13194-1991.yaml +0 -175
  141. data/maps/bis-tel-Telu-Latn-13194-1991.yaml +0 -170
  142. data/maps/bis-tml-Taml-Latn-13194-1991.yaml +0 -155
  143. data/maps/by-bel-Cyrl-Latn-1998.yaml +0 -168
  144. data/maps/by-bel-Cyrl-Latn-2007.yaml +0 -115
  145. data/maps/dos-nep-Deva-Latn-1997.yaml +0 -33
  146. data/maps/elot-ell-Grek-Latn-743-1982-tl.yaml +0 -684
  147. data/maps/elot-ell-Grek-Latn-743-1982-ts.yaml +0 -680
  148. data/maps/elot-ell-Grek-Latn-743-2001-tl.yaml +0 -19
  149. data/maps/elot-ell-Grek-Latn-743-2001-ts.yaml +0 -31
  150. data/maps/ggg-kat-Geor-Latn-2002.yaml +0 -88
  151. data/maps/gki-bel-Cyrl-Latn-1992.yaml +0 -33
  152. data/maps/gki-bel-Cyrl-Latn-2000.yaml +0 -201
  153. data/maps/gost-rus-Cyrl-Latn-16876-71-1983.yaml +0 -186
  154. data/maps/hk-yue-Hani-Latn-1888.yaml +0 -38497
  155. data/maps/icao-bel-Cyrl-Latn-9303.yaml +0 -136
  156. data/maps/icao-bul-Cyrl-Latn-9303.yaml +0 -118
  157. data/maps/icao-heb-Hebr-Latn-9303.yaml +0 -151
  158. data/maps/icao-mkd-Cyrl-Latn-9303.yaml +0 -117
  159. data/maps/icao-per-Arab-Latn-9303.yaml +0 -103
  160. data/maps/icao-rus-Cyrl-Latn-9303.yaml +0 -117
  161. data/maps/icao-srp-Cyrl-Latn-9303.yaml +0 -117
  162. data/maps/icao-ukr-Cyrl-Latn-9303.yaml +0 -119
  163. data/maps/iso-ara-Arab-Latn-233-1984.yaml +0 -323
  164. data/maps/iso-ell-Grek-Latn-843-1997-t1.yaml +0 -609
  165. data/maps/iso-ell-Grek-Latn-843-1997-t2.yaml +0 -40
  166. data/maps/iso-jpn-Hrkt-Latn-3602-1989.yaml +0 -62
  167. data/maps/iso-rus-Cyrl-Latn-9-1995.yaml +0 -271
  168. data/maps/iso-tha-Thai-Latn-11940-1998.yaml +0 -109
  169. data/maps/kp-kor-Hang-Latn-2002.yaml +0 -901
  170. data/maps/lshk-yue-Hani-Latn-jyutping-1993.yaml +0 -44820
  171. data/maps/mext-jpn-Hrkt-Latn-1954.yaml +0 -411
  172. data/maps/moct-kor-Hang-Latn-2000.yaml +0 -803
  173. data/maps/mofa-jpn-Hrkt-Latn-1989.yaml +0 -541
  174. data/maps/mvd-bel-Cyrl-Latn-2008.yaml +0 -225
  175. data/maps/mvd-bel-Cyrl-Latn-2010.yaml +0 -63
  176. data/maps/mvd-rus-Cyrl-Latn-2008.yaml +0 -109
  177. data/maps/mvd-rus-Cyrl-Latn-2010.yaml +0 -37
  178. data/maps/nil-kor-Hang-Hang-jamo.yaml +0 -11193
  179. data/maps/odni-aze-Cyrl-Latn-2015.yaml +0 -144
  180. data/maps/odni-bel-Cyrl-Latn-2015.yaml +0 -148
  181. data/maps/odni-bul-Cyrl-Latn-2015.yaml +0 -96
  182. data/maps/odni-hin-Deva-Latn-2015.yaml +0 -258
  183. data/maps/odni-kat-Geor-Latn-2015.yaml +0 -87
  184. data/maps/odni-kaz-Cyrl-Latn-2015.yaml +0 -148
  185. data/maps/odni-kir-Cyrl-Latn-2015.yaml +0 -136
  186. data/maps/odni-mkd-Cyrl-Latn-2015.yaml +0 -122
  187. data/maps/odni-rus-Cyrl-Latn-2015.yaml +0 -77
  188. data/maps/odni-srp-Cyrl-Latn-2015.yaml +0 -129
  189. data/maps/odni-tat-Cyrl-Latn-2015.yaml +0 -142
  190. data/maps/odni-tgk-Cyrl-Latn-2015.yaml +0 -148
  191. data/maps/odni-uig-Cyrl-Latn-2015.yaml +0 -138
  192. data/maps/odni-ukr-Cyrl-Latn-2015.yaml +0 -157
  193. data/maps/odni-urd-Arab-Latn-2015.yaml +0 -221
  194. data/maps/odni-uzb-Cyrl-Latn-2015.yaml +0 -166
  195. data/maps/royin-tha-Thai-Latn-1939-generic.yaml +0 -90
  196. data/maps/royin-tha-Thai-Latn-1968.yaml +0 -179
  197. data/maps/royin-tha-Thai-Latn-1999-chained.yaml +0 -180
  198. data/maps/royin-tha-Thai-Latn-1999.yaml +0 -76
  199. data/maps/sac-zho-Hans-Latn-1979.yaml +0 -24759
  200. data/maps/ses-ara-Arab-Latn-1930.yaml +0 -279
  201. data/maps/stategeocadastre-ukr-Cyrl-Latn-1993.yaml +0 -222
  202. data/maps/ua-ukr-Cyrl-Latn-1996.yaml +0 -193
  203. data/maps/un-ara-Arab-Latn-1971.yaml +0 -139
  204. data/maps/un-ara-Arab-Latn-1972.yaml +0 -159
  205. data/maps/un-ara-Arab-Latn-2017.yaml +0 -420
  206. data/maps/un-bel-Cyrl-Latn-2007.yaml +0 -114
  207. data/maps/un-ben-Beng-Latn-2016.yaml +0 -534
  208. data/maps/un-ell-Grek-Latn-1987-tl.yaml +0 -31
  209. data/maps/un-ell-Grek-Latn-1987-ts.yaml +0 -19
  210. data/maps/un-ell-Grek-Latn-phonetic-1987.yaml +0 -780
  211. data/maps/un-mon-Mong-Latn-2013.yaml +0 -99
  212. data/maps/un-nep-Deva-Latn-1972.yaml +0 -163
  213. data/maps/un-rus-Cyrl-Latn-1987.yaml +0 -166
  214. data/maps/un-ukr-Cyrl-Latn-1998.yaml +0 -30
  215. data/maps/ungegn-amh-Ethi-Latn-2016.yaml +0 -575
  216. data/maps/var-jpn-Hrkt-Latn-hepburn-1886.yaml +0 -406
  217. data/maps/var-jpn-Hrkt-Latn-hepburn-1954.yaml +0 -386
  218. data/maps/var-kor-Hang-Latn-mr-1939.yaml +0 -1054
  219. data/maps/var-kor-Kore-Hang-2013.yaml +0 -59754
  220. data/maps/var-kor-Kore-Latn-mr-1939.yaml +0 -36
  221. data/maps/var-tha-Thai-Thai-phonemic.yaml +0 -59
  222. data/maps/var-tha-Thai-Zsym-ipa.yaml +0 -301
  223. data/maps/var-zho-Hani-Latn-1979.yaml +0 -38908
  224. data/spec/interscript/mapping_spec.rb +0 -42
  225. data/spec/interscript_spec.rb +0 -26
  226. data/spec/spec_helper.rb +0 -3
data/README.adoc DELETED
@@ -1,298 +0,0 @@
1
- = Interscript: Interoperable Script Conversion Systems, with a Ruby implementation
2
-
3
- image:https://github.com/interscript/interscript/workflows/test/badge.svg["Ruby build status", link="https://github.com/interscript/interscript/actions?workflow=test"]
4
- image:https://github.com/interscript/interscript/workflows/js/badge.svg["JavaScript build status", link="https://github.com/interscript/interscript/actions?workflow=js"]
5
-
6
- == Introduction
7
-
8
- This repository contains interoperable transliteration schemes from:
9
-
10
- * ALA-LC
11
- * BGN/PCGN
12
- * ICAO
13
- * ISO
14
- * UN (by UNGEGN)
15
- * Many, many other script conversion system authorities.
16
-
17
- The goal is to achieve interoperable transliteration schemes allowing quality comparisons.
18
-
19
-
20
-
21
- == Demonstration
22
-
23
- These transliteration systems are used in the demo:
24
-
25
- `bgnpcgn-rus-Cyrl-Latn-1947`:: BGN/PCGN Romanization of Russian
26
- `iso-rus-Cyrl-Latn-9-1995`:: ISO 9 Romanization of Russian
27
- `icao-rus-Cyrl-Latn-9303`:: ICAO MRZ Romanization of Russian
28
- `bas-rus-Cyrl-Latn-2017-bss`:: Bulgarian Academy of Sciences Streamlined System for Russian
29
-
30
- image:demo/20191118-interscript-demo-cast.gif["interscript screencast"]
31
-
32
-
33
- == Installation
34
-
35
- === Prerequisites
36
-
37
- Linux:
38
-
39
- [source,sh]
40
- ----
41
- apt-get install swig python3-setuptools
42
- ----
43
-
44
- Windows:
45
-
46
- [source,sh]
47
- ----
48
- choco install --no-progress swig
49
- ----
50
-
51
- Interscript depends on Python and the https://github.com/sequitur-g2p/sequitur-g2p[`sequitur-g2p`] module
52
-
53
- [source,sh]
54
- ----
55
- pip3 install setuptools numpy
56
- curl -sSL -o sequitur-g2p.zip https://github.com/sequitur-g2p/sequitur-g2p/archive/806273f.zip
57
- pip3 install sequitur-g2p.zip
58
- ----
59
-
60
- Interscript depends on Ruby. Once you manage to install Ruby, it's easy.
61
-
62
- [source,sh]
63
- ----
64
- gem install interscript
65
- ----
66
-
67
- == Usage
68
-
69
- Assume you have a file ready in the source script like this:
70
-
71
- [source,sh]
72
- ----
73
- cat <<EOT > rus-Cyrl.txt
74
- Эх, тройка! птица тройка, кто тебя выдумал? знать, у бойкого народа ты
75
- могла только родиться, в той земле, что не любит шутить, а
76
- ровнем-гладнем разметнулась на полсвета, да и ступай считать версты,
77
- пока не зарябит тебе в очи. И не хитрый, кажись, дорожный снаряд, не
78
- железным схвачен винтом, а наскоро живьём с одним топором да долотом
79
- снарядил и собрал тебя ярославский расторопный мужик. Не в немецких
80
- ботфортах ямщик: борода да рукавицы, и сидит чёрт знает на чём; а
81
- привстал, да замахнулся, да затянул песню — кони вихрем, спицы в
82
- колесах смешались в один гладкий круг, только дрогнула дорога, да
83
- вскрикнул в испуге остановившийся пешеход — и вон она понеслась,
84
- понеслась, понеслась!
85
-
86
- Н.В. Гоголь
87
- EOT
88
- ----
89
-
90
- You can run `interscript` on this text using different transliteration systems.
91
-
92
- [source,sh]
93
- ----
94
- interscript rus-Cyrl.txt \
95
- --system=bgnpcgn-rus-Cyrl-Latn-1947 \
96
- --output=bgnpcgn-rus-Latn.txt
97
-
98
- interscript rus-Cyrl.txt \
99
- --system=iso-rus-Cyrl-Latn-9-1995 \
100
- --output=iso-rus-Latn.txt
101
-
102
- interscript rus-Cyrl.txt \
103
- --system=icao-rus-Cyrl-Latn-9303 \
104
- --output=icao-rus-Latn.txt
105
-
106
- interscript rus-Cyrl.txt \
107
- --system=bas-rus-Cyrl-Latn-2017-bss \
108
- --output=bas-rus-Latn.txt
109
- ----
110
-
111
- It is then easy to see the exact differences in rendering between the systems.
112
-
113
- [source,sh]
114
- ----
115
- diff bgnpcgn-rus-Latn.txt bas-rus-Latn.txt
116
- ----
117
-
118
- == Adding transliteration system
119
-
120
- Transliteration systems are stored in the `maps/` directory as YAML files.
121
- You can create a new file and add it to the directory.
122
-
123
- The file should be named as `<system-code>.yaml`, where `system-code`
124
- is in accordance with
125
- http://calconnect.gitlab.io/tc-localization/csd-transcription-systems[ISO/CC 24229].
126
-
127
- === File structure
128
-
129
- [source,yaml]
130
- ----
131
- authority_id: bgnpcgn
132
- id: 1947
133
- language: rus
134
- source_script: Cyrl
135
- destination_script: Latn
136
- name: ROMANIZATION OF RUSSIAN, BGN/PCGN 1947 System
137
- url: https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/807920/ROMANIZATION_OF_RUSSIAN.pdf
138
- creation_date: 1947
139
- confirmation_date: 2019-06
140
- description: The BGN/PCGN system for Russian was adopted ...
141
-
142
- notes:
143
- - The character e should be romanized ye initially, after the vowel ...
144
-
145
- tests:
146
- - source: ДЛИННОЕ ПОКРЫВАЛО
147
- expected: DLINNOYE POKRYVALO
148
- - source: Еловая шишка
149
- expected: Yelovaya shishka
150
-
151
- map:
152
- rules:
153
- - pattern: (?<=[АаЕеЁёИиОоУуЫыЭэЮюЯяЙйЪъЬь])\u0415 # Е after a, e, ё, и, о, у, ы, э, ю, я, й, ъ, ь
154
- result: Ye
155
- - pattern: \b\u0415 # Е initially
156
- result: Ye
157
-
158
- characters:
159
- "\u0410": "A"
160
- "\u0411": "B"
161
- "\u0412": "V"
162
- ----
163
-
164
-
165
- === Rules
166
-
167
- The `rules` subsection is placed under the `map` key. All rules are applied in the order in which they are listed, before the `characters` subsection is applied. Rules apply to the original text, not to the result of previously applied rules.
168
-
169
- Each rule has `pattern` and `result` elements.
170
-
171
- A pattern is a regular expression. It should be written as a string without the `//` or `%r{}` delimiters, for example `\b\u0415`. If a rule depends on the preceding or following content, a lookbehind or lookahead can be used. For example, a rule with the pattern `(?<=[АаЕеЁёИиОоУуЫыЭэЮюЯяЙйЪъЬь])\u0415` finds every Е that follows an upper- or lower-case a, e, ё, и, о, у, ы, э, ю, я, й, ъ, or ь.
172
-
173
- The result is the replacement for the pattern's match. It can contain a plain string, Unicode characters specified by hexadecimal number, or a captured group reference. A string containing a hexadecimal escape or a captured group reference should be double-quoted, for example `"Y\u00eb"` or `"\\1\u00b7\\2"`. Captured groups are referenced by a double backslash followed by the group's number.
174
-
175
- Because rules are applied in order, multiple rules applicable to the same segment of a string can be addressed by rule ordering, and rules can be used as priority over characters. For example:
176
-
177
- [source,yaml]
178
- ----
179
- map:
180
- rules:
181
- - pattern: \u03B3\u03B3 # γ (before Γ, Ξ, Χ)
182
- result: ng
183
- - pattern: (?<![Γγ])\u03B3(?=[ΕεέΗηήΙιίΥυύ]) # γ (before front vowels)
184
- result: y
185
- ----
186
-
187
- (γι maps to `yi`; but γγ maps to `ng`. In the case of γγι, the first rule takes priority, and the transliteration is `ngi`: it makes the second rule impossible.)
188
-
189
- [source,yaml]
190
- ----
191
- map:
192
- rules:
193
- - pattern: (?<=\b)\u03BC[πΠ] # μπ (initially)
194
- result: b
195
- - pattern: \u03BC[πΠ] # μπ (medially)
196
- result: mb
197
- ----
198
-
199
- (The first rule applies at the start of a word; the second rule does not specify a context, as it applies in all other cases not covered by the first rule.)
200
-
201
- [source,yaml]
202
- ----
203
- map:
204
- rules:
205
- - pattern: ";"
206
- result: "?"
207
-
208
- characters:
209
- "\u00B7": ";"
210
- ----
211
-
212
- (This guarantees that any `;` are converted to `?` before any new `;` are introduced; because all three are Latin script, they could be mixed up in ordering.)
213
-
214
- Normally rules "`bleed`" each other: once a rule applies to a segment, that segment cannot trigger other rules, because it is already converted to Roman. Exceptionally, it will be necessary to have a rule add or remove characters in the original script, rather than transliterate them, so that the same context can be invoked by two rules in succession:
215
-
216
- [source,yaml]
217
- ----
218
- map:
219
- rules:
220
- - pattern: (?<=[АаЕеЁёИиОоУуЫыЭэЮюЯя])\u042b # Ы after any vowel character
221
- result: "\u00b7Ы"
222
- - pattern: \u042b(?=[АаУуЫыЭэ]) # Ы before а, у, ы, or э
223
- result: "Ы\u00b7"
224
- ----
225
-
226
- (If the result were `\u00B7Y`, the second rule could not be applied afterwards; but we want ОЫУ to transliterate as `O·Y·U`. In order to make that happen, we preserve the Ы during the rules phase, resulting in О·Ы·У; we only convert the letters to Roman script in the `characters` phase.)
227
-
228
- === Testing transliteration systems
229
-
230
- To test all transliteration systems in the `maps/` directory, run:
231
-
232
- [source,sh]
233
- ----
234
- bundle exec rspec
235
- ----
236
-
237
- The command takes `source` texts from the `tests` section, transforms
238
- them using `rules` and `characters` from the `map` key, and compares the
239
- results with the `expected` text of the same test entry.
240
-
241
- To test a specific transliteration system, set the environment variable
242
- `TRANSLIT_SYSTEM` to the system code of the desired system
243
- (i.e. the "`basename`" of the system's YAML file):
244
-
245
- [source,sh]
246
- ----
247
- TRANSLIT_SYSTEM=bgnpcgn-rus-Cyrl-Latn-1947 bundle exec rspec
248
- ----
249
-
250
-
251
- == ISCS system codes
252
-
253
- In accordance with
254
- http://calconnect.gitlab.io/tc-localization/csd-transcription-systems[ISO/CC 24229],
255
- the system code identifying a script conversion system has the following components:
256
-
257
- e.g. `bgnpcgn-rus-Cyrl-Latn-1947`:
258
-
259
- `bgnpcgn`:: the authority identifier
260
- `rus`:: an ISO 639-{1,2,3,5} language code that this system applies to (For 639-2, use (T) code)
261
- `Cyrl`:: an ISO 15924 script code, identifying the source script
262
- `Latn`:: an ISO 15924 script code, identifying the target script
263
- `1947`:: an identifier unit within the authority to identify this system
264
-
265
-
266
- == Covered languages
267
-
268
- Currently the schemes cover Cyrillic, Armenian, Greek, Arabic and Hebrew.
269
-
270
-
271
- == Samples to play with
272
-
273
- * `rus-Cyrl-1.txt`: Copied from the XLS output from http://www.primorsk.vybory.izbirkom.ru/region/primorsk?action=show&global=true&root=254017025&tvd=4254017212287&vrn=100100067795849&prver=0&pronetvd=0&region=25&sub_region=25&type=242&vibid=4254017212287
274
-
275
- * `rus-Cyrl-2.txt`: Copied from the XLS output from http://www.yaroslavl.vybory.izbirkom.ru/region/yaroslavl?action=show&root=764013001&tvd=4764013188704&vrn=4764013188693&prver=0&pronetvd=0&region=76&sub_region=76&type=426&vibid=4764013188704
276
-
277
-
278
- == References
279
-
280
- Reference documents are located at the
281
- https://github.com/interscript/interscript-references[interscript-references repository].
282
- Some specifications that have distribution limitations may not be reproduced there.
283
-
284
-
285
- == Links to system definitions
286
-
287
- * https://www.iso.org/committee/48750.html[ISO/TC 46 (see standards published by WG 3)]
288
- * http://geonames.nga.mil/gns/html/romanization.html[BGN/PCGN and BGN Romanization systems (BGN)]
289
- * https://www.gov.uk/government/publications/romanization-systems[BGN/PCGN Romanization systems (PCGN)]
290
- * https://www.loc.gov/catdir/cpso/roman.html[ALA-LC Romanization systems in current use]
291
- * http://catdir.loc.gov/catdir/cpso/roman.html[ALA-LC Romanization systems from 1997]
292
- * http://www.eki.ee/wgrs/[UN Romanization systems]
293
- * http://www.eki.ee/knab/kblatyl2.htm[EKI KNAB systems]
294
-
295
- == Copyright and license
296
-
297
- This is a Ribose project. Copyright Ribose.
298
-
data/bin/rspec DELETED
@@ -1,29 +0,0 @@
1
- #!/usr/bin/env ruby
2
- # frozen_string_literal: true
3
-
4
- #
5
- # This file was generated by Bundler.
6
- #
7
- # The application 'rspec' is installed as part of a gem, and
8
- # this file is here to facilitate running it.
9
- #
10
-
11
- require "pathname"
12
- ENV["BUNDLE_GEMFILE"] ||= File.expand_path("../../Gemfile",
13
- Pathname.new(__FILE__).realpath)
14
-
15
- bundle_binstub = File.expand_path("../bundle", __FILE__)
16
-
17
- if File.file?(bundle_binstub)
18
- if File.read(bundle_binstub, 300) =~ /This file was generated by Bundler/
19
- load(bundle_binstub)
20
- else
21
- abort("Your `bin/bundle` was not generated by Bundler, so this binstub cannot run.
22
- Replace `bin/bundle` by running `bundle binstubs bundler --force`, then run this command again.")
23
- end
24
- end
25
-
26
- require "rubygems"
27
- require "bundler/setup"
28
-
29
- load Gem.bin_path("rspec-core", "rspec")
data/lib/g2pwrapper.py DELETED
@@ -1,34 +0,0 @@
1
- import g2p, SequiturTool
2
- import numpy
3
-
4
- def transliterate(model, word):
5
-
6
- class Struct:
7
- def __init__(self, **entries):
8
- self.__dict__.update(entries)
9
-
10
- model_path = {
11
- 'pythainlp_lexicon': './lib/model-7',
12
- 'wiktionary_phonemic': './lib/tha-pt-b-7'
13
- }
14
-
15
- connector_dict = {
16
- 'pythainlp_lexicon': '',
17
- 'wiktionary_phonemic': '-'
18
- }
19
-
20
-
21
- modelFile = model_path[model]
22
- connector = connector_dict[model]
23
-
24
- options = Struct(**{'profile': None, 'resource_usage': None, 'psyco': None, 'tempdir': None, 'trainSample': None, 'develSample': None, 'testSample': None, 'checkpoint': None, 'resume_from_checkpoint': None, 'shouldTranspose': None, 'modelFile': modelFile , 'newModelFile': None, 'shouldTestContinuously': None, 'shouldSelfTest': None, 'lengthConstraints': None, 'shouldSuppressNewMultigrams': None, 'viterbi': None, 'shouldRampUp': None, 'shouldWipeModel': None, 'shouldInitializeWithCounts': None, 'minIterations': 20, 'maxIterations': 100, 'eager_discount_adjustment': None, 'fixed_discount': None, 'encoding': 'UTF-8', 'phoneme_to_phoneme': None, 'test_segmental': None, 'testResult': None, 'applySample': None, 'applyWord': word, 'variants_mass': None, 'variants_number': None, 'fakeTranslator': None, 'stack_limit': None})
25
-
26
- loadSample = g2p.loadG2PSample
27
-
28
- model = SequiturTool.procureModel(options, loadSample)
29
- if not model:
30
- return 1
31
- translator = g2p.Translator(model)
32
- del model
33
-
34
- return connector.join(translator(tuple(word)))
data/lib/interscript-opal.rb DELETED
@@ -1,2 +0,0 @@
1
- require "opal"
2
- require "interscript"
data/lib/interscript/fs.rb DELETED
@@ -1,71 +0,0 @@
1
- require 'pathname'
2
-
3
- module Interscript
4
- module Fs
5
- ALPHA_REGEXP = '[[:alpha:]]'
6
-
7
- def sub_replace(string, pos, size, repl)
8
- string[pos..pos + size - 1] = repl
9
- string
10
- end
11
-
12
- def root_path
13
- @root_path ||= Pathname.new(File.join(File.dirname(__dir__), ".."))
14
- end
15
-
16
- def transliterate_file(system_code, input_file, output_file, maps={})
17
- input = File.read(input_file)
18
- output = transliterate(system_code, input, maps)
19
-
20
- File.open(output_file, 'w') do |f|
21
- f.puts(output)
22
- end
23
-
24
- puts "Output written to: #{output_file}"
25
- output_file
26
- end
27
-
28
- def import_python_modules
29
- begin
30
- pyimport :g2pwrapper
31
- rescue
32
- pyimport :sys
33
- sys.path.append(root_path.to_s + "/lib/")
34
- pyimport :g2pwrapper
35
- end
36
- end
37
-
38
- def external_process(process_name, string)
39
- import_python_modules
40
-
41
- case process_name
42
- when 'sequitur.pythainlp_lexicon'
43
- return g2pwrapper.transliterate('pythainlp_lexicon', string)
44
- when 'sequitur.wiktionary_phonemic'
45
- return g2pwrapper.transliterate('wiktionary_phonemic', string)
46
- else
47
- raise ExternalProcessNotRecognizedError.new
48
- end
49
-
50
- rescue
51
- raise ExternalProcessUnavailableError.new
52
- end
53
-
54
- def external_processing(mapping, string)
55
- # Segmentation
56
- string = external_process(mapping.segmentation, string) if mapping.segmentation
57
-
58
- # Transliteration/Transcription
59
- string = external_process(mapping.transcription, string) if mapping.transcription
60
-
61
- string
62
- end
63
-
64
- private
65
-
66
- def mkregexp(regexpstring)
67
- /#{regexpstring}/u
68
- end
69
-
70
- end
71
- end
data/lib/interscript/mapping.rb DELETED
@@ -1,142 +0,0 @@
1
- require 'rambling-trie'
2
- require 'yaml' unless RUBY_ENGINE == 'opal'
3
- require 'json'
4
-
5
- module Interscript
6
-
7
- class Mapping
8
- attr_reader(
9
- :id,
10
- :url,
11
- :name,
12
- :notes,
13
- :rules,
14
- :tests,
15
- :language,
16
- :postrules,
17
- :characters,
18
- :description,
19
- :authority_id,
20
- :creation_date,
21
- :source_script,
22
- :destination_script,
23
- :chain,
24
- :character_separator,
25
- :word_separator,
26
- :title_case,
27
- :downcase,
28
- :dictionary,
29
- :characters_hash,
30
- :dictionary_hash,
31
- :segmentation,
32
- :transcription,
33
- :dictionary_trie
34
- )
35
-
36
- def initialize(system_code, options = {})
37
- @system_code = system_code
38
- @depth = options.fetch(:depth, 0).to_i
39
-
40
- unless RUBY_ENGINE == 'opal'
41
- @system_path = options.fetch(:system_code, default_path)
42
- end
43
-
44
- load_and_serialize_system_mappings
45
- end
46
-
47
- def self.for(system_code, options = {})
48
- new(system_code, options)
49
- end
50
-
51
- def load_and_serialize_system_mappings
52
- return if depth >= 5
53
-
54
- mappings = load_system_mappings
55
- serialize_system_mappings(mappings)
56
- end
57
-
58
- private
59
-
60
- attr_reader :depth, :system_code, :system_path
61
-
62
- def system_code_file
63
- [system_code, "yaml"].join(".")
64
- end
65
-
66
- def default_path
67
- @default_path ||= Interscript.root_path.join("maps")
68
- end
69
-
70
- def load_system_mappings
71
- if RUBY_ENGINE == 'opal'
72
- load_opal_mappings
73
- else
74
- load_fs_mappings
75
- end
76
- end
77
-
78
- def load_opal_mappings
79
- JSON.parse(`InterscriptMaps[#{system_code}]`)
80
- end
81
-
82
- def load_fs_mappings
83
- YAML.load_file(system_path.join(system_code_file))
84
- rescue Errno::ENOENT
85
- raise Interscript::InvalidSystemError.new("No system mappings found")
86
- end
87
-
88
- def serialize_system_mappings(mappings)
89
- @id = mappings.fetch("id", nil)
90
- @url = mappings.fetch("url", nil)
91
- @name = mappings.fetch("name", nil)
92
- @notes = mappings.fetch("notes", nil)
93
- @tests = mappings.fetch("tests", [])
94
- @language = mappings.fetch("language", nil)
95
- @description = mappings.fetch("description", nil)
96
- @authority_id = mappings.fetch("authority_id", nil)
97
- @creation_date = mappings.fetch("creation_date", nil)
98
- @source_script = mappings.fetch("source_script", nil)
99
- @destination_script = mappings.fetch("destination_script", nil)
100
- @chain = mappings.fetch("chain", [])
101
- @character_separator = mappings["map"]["character_separator"] || nil
102
- @word_separator = mappings["map"]["word_separator"] || nil
103
- @title_case = mappings["map"]["title_case"] || false
104
- @downcase = mappings["map"]["downcase"] || false
105
- @rules = mappings["map"]["rules"] || []
106
- @postrules = mappings["map"]["postrules"] || []
107
- @characters = mappings["map"]["characters"] || {}
108
- @dictionary = mappings["map"]["dictionary"] || {}
109
- @segmentation = mappings["map"]["segementation"] || nil
110
- @transcription = mappings["map"]["transcription"] || nil
111
-
112
- include_inherited_mappings(mappings)
113
- build_hashes
114
- build_trie
115
- end
116
-
117
- def include_inherited_mappings(mappings)
118
- inherit_systems = [].push(mappings["map"]["inherit"]).flatten
119
-
120
- inherit_systems.each do |inherit_system|
121
- next unless inherit_system
122
-
123
- inherited_mapping = Mapping.for(inherit_system, depth: depth + 1)
124
-
125
- @rules = [inherited_mapping.rules, rules].flatten
126
- @postrules = [inherited_mapping.postrules, postrules].flatten
127
- @characters = (inherited_mapping.characters|| {}).merge(characters)
128
- @dictionary = (inherited_mapping.dictionary|| {}).merge(dictionary)
129
- end
130
- end
131
-
132
- def build_hashes
133
- @characters_hash = characters&.sort_by { |k, _v| k.size }&.reverse&.to_h
134
- @dictionary_hash = dictionary&.sort_by { |k, _v| k.size }&.reverse&.to_h
135
- end
136
-
137
- def build_trie
138
- @dictionary_trie = Rambling::Trie.create
139
- dictionary_trie.concat dictionary.keys
140
- end
141
- end
142
- end