interscript 0.1.2 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (120) hide show
  1. checksums.yaml +4 -4
  2. data/README.adoc +246 -14
  3. data/bin/interscript +38 -17
  4. data/bin/setup +8 -0
  5. data/lib/g2pwrapper.py +34 -0
  6. data/lib/interscript.rb +140 -16
  7. data/lib/interscript/command.rb +27 -0
  8. data/lib/interscript/mapping.rb +125 -0
  9. data/lib/interscript/version.rb +1 -1
  10. data/lib/model-7 +0 -0
  11. data/lib/tha-pt-b-7 +0 -0
  12. data/maps/acadsin-zho-Hani-Latn-2002.yaml +38912 -0
  13. data/maps/alalc-bel-cyrl-latn-1997.yaml +125 -0
  14. data/maps/alalc-ben-Beng-Latn-2017.yaml +130 -0
  15. data/maps/alalc-bul-Cyrl-Latn-1997.yaml +94 -0
  16. data/maps/alalc-ell-Grek-Latn-1997.yaml +625 -0
  17. data/maps/alalc-ell-Grek-Latn-2010.yaml +628 -0
  18. data/maps/alalc-kat-Geok-Latn-1997.yaml +112 -0
  19. data/maps/alalc-kat-Geor-Latn-1997.yaml +146 -0
  20. data/maps/alalc-kor-Hang-Latn-1997.yaml +94 -0
  21. data/maps/alalc-mkd-Cyrl-Latn-2013.yaml +103 -0
  22. data/maps/alalc-mkd-cyrl-latn-1997.yaml +114 -0
  23. data/maps/alalc-srp-Cyrl-Latn-1997.yaml +114 -0
  24. data/maps/alalc-srp-cyrl-latn-2013.yaml +135 -0
  25. data/maps/alalc-ukr-Cyrl-Latn-1997.yaml +141 -0
  26. data/maps/alalc-ukr-Cyrl-Latn-2011.yaml +16 -0
  27. data/maps/apcbg-bul-Cyrl-Latn-1995.yaml +283 -0
  28. data/maps/{bas-rus-Cyrl-Latn-bss.yaml → bas-rus-Cyrl-Latn-2017-bss.yaml} +57 -31
  29. data/maps/{bas-rus-Cyrl-Latn-oss.yaml → bas-rus-Cyrl-Latn-2017-oss.yaml} +54 -34
  30. data/maps/bgn-jpn-Hrkt-Latn-1962.yaml +294 -0
  31. data/maps/bgn-kor-Hang-Latn-1943.yaml +31 -0
  32. data/maps/bgn-kor-Kore-Latn-1943.yaml +31 -0
  33. data/maps/bgna-bul-Cyrl-Latn-2006.yaml +208 -0
  34. data/maps/bgna-bul-Cyrl-Latn-2009.yaml +208 -0
  35. data/maps/bgnpcgn-arm-Armn-Latn-1981.yaml +1 -2
  36. data/maps/bgnpcgn-aze-Cyrl-Latn-1993.yaml +104 -0
  37. data/maps/bgnpcgn-bel-cyrl-latn-1979.yaml +285 -0
  38. data/maps/bgnpcgn-bul-Cyrl-Latn-1952.yaml +115 -0
  39. data/maps/bgnpcgn-bul-Cyrl-Latn-2013.yaml +10 -64
  40. data/maps/bgnpcgn-chn-Hans-Latn-1979.yaml +7456 -0
  41. data/maps/bgnpcgn-ell-Grek-Latn-1962.yaml +702 -0
  42. data/maps/bgnpcgn-ell-Grek-Latn-1996.yaml +20 -0
  43. data/maps/bgnpcgn-jpn-Hrkt-Latn-1976.yaml +257 -0
  44. data/maps/bgnpcgn-kat-Geor-Latn-1981.yaml +127 -0
  45. data/maps/bgnpcgn-kat-Geor-Latn-2009.yaml +43 -0
  46. data/maps/bgnpcgn-kor-Hang-Latn-kn-1945.yaml +253 -0
  47. data/maps/bgnpcgn-kor-Hang-Latn-rok-2011.yaml +48 -0
  48. data/maps/bgnpcgn-kor-Kore-Latn-rok-2011.yaml +48 -0
  49. data/maps/bgnpcgn-mkd-Cyrl-Latn-1981.yaml +159 -0
  50. data/maps/bgnpcgn-mkd-Cyrl-Latn-2013.yaml +190 -0
  51. data/maps/bgnpcgn-rus-Cyrl-Latn-1947.yaml +145 -64
  52. data/maps/bgnpcgn-srp-Cyrl-Latn-2005.yaml +166 -0
  53. data/maps/bgnpcgn-ukr-Cyrl-Latn-1965.yaml +75 -2
  54. data/maps/bgnpcgn-ukr-Cyrl-Latn-2019.yaml +208 -0
  55. data/maps/by-bel-Cyrl-Latn-1998.yaml +168 -0
  56. data/maps/by-bel-Cyrl-Latn-2007.yaml +115 -0
  57. data/maps/elot-ell-Grek-Latn-743-1982-tl.yaml +685 -0
  58. data/maps/elot-ell-Grek-Latn-743-1982-ts.yaml +681 -0
  59. data/maps/elot-ell-Grek-Latn-743-2001-tl.yaml +20 -0
  60. data/maps/elot-ell-Grek-Latn-743-2001-ts.yaml +32 -0
  61. data/maps/ggg-kat-Geor-Latn-2002.yaml +89 -0
  62. data/maps/gki-bel-cyrl-latn-1992.yaml +33 -0
  63. data/maps/gki-bel-cyrl-latn-2000.yaml +201 -0
  64. data/maps/gost-rus-cyrl-latn-16876-71-1983.yaml +186 -0
  65. data/maps/hk-yue-Hani-Latn-1888.yaml +38497 -0
  66. data/maps/icao-bel-Cyrl-Latn-9303.yaml +108 -92
  67. data/maps/icao-bul-Cyrl-Latn-9303.yaml +1 -2
  68. data/maps/icao-heb-Hebr-Latn-9303.yaml +118 -124
  69. data/maps/icao-mkd-Cyrl-Latn-9303.yaml +1 -2
  70. data/maps/icao-per-Arab-Latn-9303.yaml +5 -6
  71. data/maps/icao-rus-Cyrl-Latn-9303.yaml +1 -2
  72. data/maps/icao-srp-Cyrl-Latn-9303.yaml +1 -2
  73. data/maps/icao-ukr-Cyrl-Latn-9303.yaml +1 -2
  74. data/maps/iso-ell-Grek-Latn-843-1997-t1.yaml +610 -0
  75. data/maps/iso-ell-Grek-Latn-843-1997-t2.yaml +41 -0
  76. data/maps/iso-jpn-Hrkt-Latn-3602-1989.yaml +62 -0
  77. data/maps/{iso-rus-Cyrl-Latn-iso9.yaml → iso-rus-Cyrl-Latn-9-1995.yaml} +2 -3
  78. data/maps/iso-tha-Thai-Latn-11940-1998.yaml +109 -0
  79. data/maps/kp-kor-Hang-Latn-2002.yaml +901 -0
  80. data/maps/lshk-yue-Hani-Latn-jyutping-1993.yaml +44820 -0
  81. data/maps/mext-jpn-Hrkt-Latn-1954.yaml +411 -0
  82. data/maps/moct-kor-Hang-Latn-2000.yaml +803 -0
  83. data/maps/mofa-jpn-Hrkt-Latn-1989.yaml +541 -0
  84. data/maps/nil-kor-Hang-Hang-jamo.yaml +11193 -0
  85. data/maps/odni-kat-Geor-Latn-2015.yaml +88 -0
  86. data/maps/odni-ukr-Cyrl-Latn-2015.yaml +157 -0
  87. data/maps/royin-tha-Thai-Latn-1939-generic.yaml +90 -0
  88. data/maps/royin-tha-Thai-Latn-1968.yaml +179 -0
  89. data/maps/royin-tha-Thai-Latn-1999-chained.yaml +180 -0
  90. data/maps/royin-tha-Thai-Latn-1999.yaml +76 -0
  91. data/maps/{cn-chn-Hans-Latn-pinyin.yaml → sac-zho-Hans-Latn-1979.yaml} +6 -7
  92. data/maps/stategeocadastre-ukr-Cyrl-Latn-1993.yaml +222 -0
  93. data/maps/ua-ukr-Cyrl-Latn-1996.yaml +193 -0
  94. data/maps/un-bel-Cyrl-Latn-2007.yaml +114 -0
  95. data/maps/un-ben-Beng-Latn-2016.yaml +534 -0
  96. data/maps/un-ell-Grek-Latn-1987-tl.yaml +32 -0
  97. data/maps/un-ell-Grek-Latn-1987-ts.yaml +20 -0
  98. data/maps/un-ell-Grek-Latn-phonetic-1987.yaml +780 -0
  99. data/maps/un-mon-Mong-Latn-2013.yaml +19 -6
  100. data/maps/un-rus-Cyrl-Latn-1987.yaml +166 -0
  101. data/maps/un-ukr-cyrl-latn-1998.yaml +30 -0
  102. data/maps/var-jpn-Hrkt-Latn-hepburn-1886.yaml +406 -0
  103. data/maps/var-jpn-Hrkt-Latn-hepburn-1954.yaml +386 -0
  104. data/maps/var-kor-Hang-Latn-mr-1939.yaml +1054 -0
  105. data/maps/var-kor-Kore-Hang-2013.yaml +59754 -0
  106. data/maps/var-kor-Kore-Latn-mr-1939.yaml +37 -0
  107. data/maps/var-tha-Thai-Thai-phonemic.yaml +59 -0
  108. data/maps/var-tha-Thai-Zsym-ipa.yaml +301 -0
  109. data/maps/var-zho-Hani-Latn-1979.yaml +38908 -0
  110. data/spec/interscript/mapping_spec.rb +42 -0
  111. data/spec/interscript_spec.rb +20 -5
  112. data/spec/spec_helper.rb +3 -1
  113. metadata +149 -24
  114. data/maps/bgnpcgn-chn-Hans-Latn-pinyin.yaml +0 -7503
  115. data/maps/historic-jpn-Hrkt-Latn-hepburn.yaml +0 -336
  116. data/maps/icao-gre-Grek-Latn-9303.yaml +0 -101
  117. data/maps/mext-jpn-Hrkt-Latn-hepburn.yaml +0 -330
  118. data/maps/mext-jpn-Hrkt-Latn-kunrei.yaml +0 -308
  119. data/maps/un-jpn-Hrkt-Latn-hepburn.yaml +0 -313
  120. data/maps/un-jpn-Hrkt-Latn-kunrei.yaml +0 -354
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 736d2d149984ce550327443c83f4f8b65ad3a46c106bb2c5d30392292b9e2ed6
4
- data.tar.gz: 0dd4633aeccdfb1acfc618fe8708aafd4b7dac8b051fc6e6ac0420fdacc46066
3
+ metadata.gz: 643981da933194b2464ea279e9d31b9fcd9d32519c5cd236ed805855c93755ad
4
+ data.tar.gz: f54c4303bb02f0a873cfdf96287d78321648cee19c685bf338cb9f8e2f642c56
5
5
  SHA512:
6
- metadata.gz: 156e919c03e8e7a7a0ce804b0d5402df833783bc82037d5fd06b75b4464cee12d4c426a88911394963888205ff4e7bc71e15eb19a6096cc1e71cd7d406efc3a1
7
- data.tar.gz: c821069c94ba9e06d2a70b7e0b20916d1222fabf0a4daf7f5014999e24aac8d218beeb59b0316c885bfb28d84a11fb50d4b4216b157a6089f282811ef7ffa590
6
+ metadata.gz: 2d8cfd0d60e2d41d8b1e31b4e61353b0bc7fd5ac4fc426d4304ccc86bc0bb6d84b4b4a2a6e44bb342afa6c20202a4bca4180a1f5037c73072e246038c6f36f1f
7
+ data.tar.gz: 2a5fffac1de98702494f69d55b2de5200684195b0f7948619bfa2ae9f3f97810c731868f2550578f5ad97a9db9fa72d9c2abad24451437b7e08673dfc1cd97d8
@@ -1,45 +1,259 @@
1
- = Interscript: Interoperable Script Conversion Systems and a Ruby implementation
1
+ = Interscript: Interoperable Script Conversion Systems, with a Ruby implementation
2
2
 
3
- == Introducation
3
+ image:https://github.com/interscript/interscript/workflows/test/badge.svg["Build Status", link="https://github.com/interscript/interscript/actions?workflow=test"]
4
4
 
5
- This repository contains a number of transliteration schemes from:
5
+ == Introduction
6
6
 
7
+ This repository contains interoperable transliteration schemes from:
8
+
9
+ * ALA-LC
7
10
  * BGN/PCGN
8
11
  * ICAO
9
12
  * ISO
10
13
  * UN (by UNGEGN)
14
+ * Many, many other script conversion system authorities.
11
15
 
12
16
  The goal is to achieve interoperable transliteration schemes allowing quality comparisons.
13
17
 
14
18
 
15
- == STATUS (work in progress!)
16
19
 
17
- These transliteration systems currently work:
20
+ == Demonstration
21
+
22
+ These transliteration systems are used in the demo:
18
23
 
19
24
  `bgnpcgn-rus-Cyrl-Latn-1947`:: BGN/PCGN Romanization of Russian
20
25
  `iso-rus-Cyrl-Latn-9-1995`:: ISO 9 Romanization of Russian
21
26
  `icao-rus-Cyrl-Latn-9303`:: ICAO MRZ Romanization of Russian
22
27
  `bas-rus-Cyrl-Latn-2017-bss`:: Bulgaria Academy of Science Streamlined System for Russian
23
28
 
29
+ image:demo/20191118-interscript-demo-cast.gif["interscript screencast"]
30
+
31
+
32
+ == Installation
33
+
34
+ === Prerequisites
35
+
36
+ Linux:
37
+
38
+ [source,sh]
39
+ ----
40
+ apt-get install swig python3-setuptools
41
+ ----
42
+
43
+ Windows:
44
+
45
+ [source,sh]
46
+ ----
47
+ choco install --no-progress swig
48
+ ----
49
+
50
+ Interscript depends on Python and the https://github.com/sequitur-g2p/sequitur-g2p[`sequitur-g2p`] module
51
+
52
+ [source,sh]
53
+ ----
54
+ pip3 install setuptools numpy
55
+ curl -sSL -o sequitur-g2p.zip https://github.com/sequitur-g2p/sequitur-g2p/archive/806273f.zip
56
+ pip3 install sequitur-g2p.zip
57
+ ----
58
+
59
+ Interscript depends on Ruby. Once you manage to install Ruby, it's easy.
60
+
61
+ [source,sh]
62
+ ----
63
+ gem install interscript
64
+ ----
24
65
 
25
66
  == Usage
26
67
 
68
+ Assume you have a file ready in the source script like this:
69
+
70
+ [source,sh]
71
+ ----
72
+ cat <<EOT > rus-Cyrl.txt
73
+ Эх, тройка! птица тройка, кто тебя выдумал? знать, у бойкого народа ты
74
+ могла только родиться, в той земле, что не любит шутить, а
75
+ ровнем-гладнем разметнулась на полсвета, да и ступай считать версты,
76
+ пока не зарябит тебе в очи. И не хитрый, кажись, дорожный снаряд, не
77
+ железным схвачен винтом, а наскоро живьём с одним топором да долотом
78
+ снарядил и собрал тебя ярославский расторопный мужик. Не в немецких
79
+ ботфортах ямщик: борода да рукавицы, и сидит чёрт знает на чём; а
80
+ привстал, да замахнулся, да затянул песню — кони вихрем, спицы в
81
+ колесах смешались в один гладкий круг, только дрогнула дорога, да
82
+ вскрикнул в испуге остановившийся пешеход — и вон она понеслась,
83
+ понеслась, понеслась!
84
+
85
+ Н.В. Гоголь
86
+ EOT
87
+ ----
88
+
89
+ You can run `interscript` on this text using different transliteration systems.
90
+
91
+ [source,sh]
92
+ ----
93
+ interscript rus-Cyrl.txt \
94
+ --system=bgnpcgn-rus-Cyrl-Latn-1947 \
95
+ --output=bgnpcgn-rus-Latn.txt
96
+
97
+ interscript rus-Cyrl.txt \
98
+ --system=iso-rus-Cyrl-Latn-9-1995 \
99
+ --output=iso-rus-Latn.txt
100
+
101
+ interscript rus-Cyrl.txt \
102
+ --system=icao-rus-Cyrl-Latn-9303 \
103
+ --output=icao-rus-Latn.txt
104
+
105
+ interscript rus-Cyrl.txt \
106
+ --system=bas-rus-Cyrl-Latn-2017-bss \
107
+ --output=bas-rus-Latn.txt
108
+ ----
109
+
110
+ It is then easy to see the exact differences in rendering between the systems.
111
+
112
+ [source,sh]
113
+ ----
114
+ diff bgnpcgn-rus-Latn.txt bas-rus-Latn.txt
115
+ ----
116
+
117
+ == Adding a transliteration system
118
+
119
+ Transliteration systems are stored in the `maps/` directory as YAML files.
120
+ You can create a new file and add it to the directory.
121
+
122
+ The file should be named as `<system-code>.yaml`, where `system-code`
123
+ is in accordance with
124
+ http://calconnect.gitlab.io/tc-localization/csd-transcription-systems[ISO/CC 24229].
125
+
126
+ === File structure
127
+
128
+ [source,yaml]
129
+ ----
130
+ authority_id: bgnpcgn
131
+ id: 1947
132
+ language: rus
133
+ source_script: Cyrl
134
+ destination_script: Latn
135
+ name: ROMANIZATION OF RUSSIAN, BGN/PCGN 1947 System
136
+ url: https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/807920/ROMANIZATION_OF_RUSSIAN.pdf
137
+ creation_date: 1947
138
+ confirmation_date: 2019-06
139
+ description: The BGN/PCGN system for Russian was adopted ...
140
+
141
+ notes:
142
+ - The character e should be romanized ye initially, after the vowel ...
143
+
144
+ tests:
145
+ - source: ДЛИННОЕ ПОКРЫВАЛО
146
+ expected: DLINNOYE POKRYVALO
147
+ - source: Еловая шишка
148
+ expected: Yelovaya shishka
149
+
150
+ map:
151
+ rules:
152
+ - pattern: (?<=[АаЕеЁёИиОоУуЫыЭэЮюЯяЙйЪъЬь])\u0415 # Е after a, e, ё, и, о, у, ы, э, ю, я, й, ъ, ь
153
+ result: Ye
154
+ - pattern: \b\u0415 # Е initially
155
+ result: Ye
156
+
157
+ characters:
158
+ "\u0410": "A"
159
+ "\u0411": "B"
160
+ "\u0412": "V"
161
+ ----
162
+
163
+
164
+ === Rules
165
+
166
+ The subsection `rules` is placed under the `map` key. All rules are applied in the order in which they are listed, before the `characters` subsection is applied. Rules apply to the original text, not to the result of previously applied rules.
167
+
168
+ Each rule has `pattern` and `result` elements.
169
+
170
+ Pattern is a regular expression. It should be written as a string, without the `//` or `%r{}` delimiters; for example, `\b\u0415`. If a rule depends on the preceding or following content, lookbehind or lookahead can be used. For example, a rule with the pattern `(?<=[АаЕеЁёИиОоУуЫыЭэЮюЯяЙйЪъЬь])\u0415` finds every Е that follows one of the upper- or lower-case symbols a, e, ё, и, о, у, ы, э, ю, я, й, ъ, ь.
171
+
172
+ Result is the replacement for the pattern's match. It can contain plain text, Unicode characters specified by a hexadecimal number, and captured-group references. A string containing a hexadecimal escape or a captured-group reference should be double-quoted; for example, `"Y\u00eb"` or `"\\1\u00b7\\2"`. Captured groups are referenced by a double backslash followed by the group's number.
173
+
174
+ Because rules are applied in order, multiple rules applicable to the same segment of a string can be addressed by rule ordering, and rules can be used as priority over characters. For example:
175
+
176
+ [source,yaml]
177
+ ----
178
+ map:
179
+ rules:
180
+ - pattern: \u03B3\u03B3 # γ (before Γ, Ξ, Χ)
181
+ result: ng
182
+ - pattern: (?<![Γγ])\u03B3(?=[ΕεέΗηήΙιίΥυύ]) # γ (before front vowels)
183
+ result: y
184
+ ----
185
+
186
+ (γι maps to `yi`; but γγ maps to `ng`. In the case of γγι, the first rule takes priority, and the transliteration is `ngi`: it makes the second rule impossible.)
187
+
188
+ [source,yaml]
189
+ ----
190
+ map:
191
+ rules:
192
+ - pattern: (?<=\b)\u03BC[πΠ] # μπ (initially)
193
+ result: b
194
+ - pattern: \u03BC[πΠ] # μπ (medially)
195
+ result: mb
196
+ ----
197
+
198
+ (The first rule applies at the start of a word; the second rule does not specify a context, as it applies in all other cases not covered by the first rule.)
199
+
200
+ [source,yaml]
201
+ ----
202
+ map:
203
+ rules:
204
+ - pattern: ";"
205
+ result: "?"
206
+
207
+ characters:
208
+ "\u00B7": ";"
209
+ ----
210
+
211
+ (This guarantees that any `;` are converted to `?` before any new `;` are introduced; because all three are Latin script, they could be mixed up in ordering.)
212
+
213
+ Normally rules "`bleed`" each other: once a rule applies to a segment, that segment cannot trigger other rules, because it is already converted to Roman. Exceptionally, it will be necessary to have a rule add or remove characters in the original script, rather than transliterate them, so that the same context can be invoked by two rules in succession:
214
+
215
+ [source,yaml]
216
+ ----
217
+ map:
218
+ rules:
219
+ - pattern: (?<=[АаЕеЁёИиОоУуЫыЭэЮюЯя])\u042b # Ы after any vowel character
220
+ result: "\u00b7Ы"
221
+ - pattern: \u042b(?=[АаУуЫыЭэ]) # Ы before а, у, ы, or э
222
+ result: "Ы\u00b7"
223
+ ----
224
+
225
+ (If the result were `\u00B7Y`, the second rule could not be applied afterwards; but we want ОЫУ to transliterate as `O·Y·U`. In order to make that happen, we preserve the Ы during the rules phase, resulting in О·Ы·У; we only convert the letters to Roman script in the `characters` phase.)
226
+
227
+ === Testing transliteration systems
228
+
229
+ To test all transliteration systems in the `maps/` directory, run:
27
230
 
28
231
  [source,sh]
29
232
  ----
30
- # Transliterating Russian Cyrillic to Latin using the Streamlined System for Russian
31
- interscript samples/rus-Cyrl.txt --system=bas-rus-Cyrl-Latn-bss --output=rus-Latn.txt
233
+ bundle exec rspec
234
+ ----
235
+
236
+ The command takes the `source` texts from the `tests` section, transforms
237
+ them using the `rules` and `characters` from the `map` key, and compares the
238
+ results with the corresponding `expected` texts.
239
+
240
+ To test a specific transliteration system, set the environment variable
241
+ `TRANSLIT_SYSTEM` to the system code of the desired system
242
+ (i.e. the "`basename`" of the system's YAML file):
32
243
 
33
- # Transliterating Russian Cyrillic to Latin using the BGN/PCGN Romanization of Russian
34
- interscript samples/rus-Cyrl.txt --system=bgnpcgn-rus-Cyrl-Latn-1947 --output=rus-Latn.txt
244
+ [source,sh]
245
+ ----
246
+ TRANSLIT_SYSTEM=bgnpcgn-rus-Cyrl-Latn-1947 bundle exec rspec
35
247
  ----
36
248
 
37
249
 
38
250
  == ISCS system codes
39
251
 
40
- The system code identifying a script conversion system has a few components:
252
+ In accordance with
253
+ http://calconnect.gitlab.io/tc-localization/csd-transcription-systems[ISO/CC 24229],
254
+ the system code identifying a script conversion system has the following components:
41
255
 
42
- e.g. `bgnpcgn-rus-Cyrl-Latn-1947`
256
+ e.g. `bgnpcgn-rus-Cyrl-Latn-1947`:
43
257
 
44
258
  `bgnpcgn`:: the authority identifier
45
259
  `rus`:: an ISO 639-2 3-letter language code that this system applies to
@@ -53,13 +267,31 @@ e.g. `bgnpcgn-rus-Cyrl-Latn-1947`
53
267
  Currently the schemes cover Cyrillic, Armenian, Greek, Arabic and Hebrew.
54
268
 
55
269
 
56
- == Sources
270
+ == Samples to play with
57
271
 
58
272
  * `rus-Cyrl-1.txt`: Copied from the XLS output from http://www.primorsk.vybory.izbirkom.ru/region/primorsk?action=show&global=true&root=254017025&tvd=4254017212287&vrn=100100067795849&prver=0&pronetvd=0&region=25&sub_region=25&type=242&vibid=4254017212287
59
273
 
60
274
  * `rus-Cyrl-2.txt`: Copied from the XLS output from http://www.yaroslavl.vybory.izbirkom.ru/region/yaroslavl?action=show&root=764013001&tvd=4764013188704&vrn=4764013188693&prver=0&pronetvd=0&region=76&sub_region=76&type=426&vibid=4764013188704
61
275
 
62
276
 
63
- == Credits
277
+ == References
278
+
279
+ Reference documents are located at the
280
+ https://github.com/interscript/interscript-references[interscript-references repository].
281
+ Some specifications that have distribution limitations may not be reproduced there.
282
+
283
+
284
+ == Links to system definitions
285
+
286
+ * https://www.iso.org/committee/48750.html[ISO/TC 46 (see standards published by WG 3)]
287
+ * http://geonames.nga.mil/gns/html/romanization.html[BGN/PCGN and BGN Romanization systems (BGN)]
288
+ * https://www.gov.uk/government/publications/romanization-systems[BGN/PCGN Romanization systems (PCGN)]
289
+ * https://www.loc.gov/catdir/cpso/roman.html[ALA-LC Romanization systems in current use]
290
+ * http://catdir.loc.gov/catdir/cpso/roman.html[ALA-LC Romanization systems from 1997]
291
+ * http://www.eki.ee/wgrs/[UN Romanization systems]
292
+ * http://www.eki.ee/knab/kblatyl2.htm[EKI KNAB systems]
293
+
294
+ == Copyright and license
295
+
296
+ This is a Ribose project. Copyright Ribose.
64
297
 
65
- This is a Ribose project.
@@ -1,20 +1,41 @@
1
1
  #!/usr/bin/env ruby
2
+
2
3
  require 'rubygems'
3
- require_relative '../lib/interscript'
4
-
5
- if ARGV.empty?
6
- puts "write source file, source format, and output file"
7
- else
8
- args = Hash[ARGV.flat_map { |s| s.scan(/--?([^=\s]+)(?:=(\S+))?/) }]
9
- input = ARGV[0]
10
- system_code = args["system"]
11
- output_file = args["output"]
12
-
13
- raise "Please enter the system code with --system={system_code}" unless system_code
14
-
15
- if output_file
16
- Interscript.transliterate_file(system_code, input, output_file)
17
- else
18
- puts Interscript.transliterate(system_code, IO.read(input))
19
- end
4
+
5
+ # resolve bin path, ignoring symlinks
6
+ require 'pathname'
7
+ bin_file = Pathname.new(__FILE__).realpath
8
+
9
+ # add self to libpath
10
+ $LOAD_PATH.unshift File.expand_path("../../lib", bin_file)
11
+
12
+ # Fixes https://github.com/rubygems/rubygems/issues/1420
13
+ require 'rubygems/specification'
14
+
15
+ class Gem::Specification
16
+ def this; self; end
20
17
  end
18
+
19
+ require 'interscript/command'
20
+
21
+ if ARGV.any? && !Interscript::Command.all_tasks.key?(ARGV.first)
22
+ ARGV.unshift :translit
23
+ end
24
+ Interscript::Command.start ARGV
25
+
26
+ # if ARGV.empty?
27
+ # puts "write source file, source format, and output file"
28
+ # else
29
+ # args = Hash[ARGV.flat_map { |s| s.scan(/--?([^=\s]+)(?:=(\S+))?/) }]
30
+ # input = ARGV[0]
31
+ # system_code = args["system"]
32
+ # output_file = args["output"]
33
+
34
+ # raise "Please enter the system code with --system={system_code}" unless system_code
35
+
36
+ # if output_file
37
+ # Interscript.transliterate_file(system_code, input, output_file)
38
+ # else
39
+ # puts Interscript.transliterate(system_code, IO.read(input))
40
+ # end
41
+ # end
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,34 @@
1
+ import g2p, SequiturTool
2
+ import numpy
3
+
4
+ def transliterate(model, word):
5
+
6
+ class Struct:
7
+ def __init__(self, **entries):
8
+ self.__dict__.update(entries)
9
+
10
+ model_path = {
11
+ 'pythainlp_lexicon': './lib/model-7',
12
+ 'wiktionary_phonemic': './lib/tha-pt-b-7'
13
+ }
14
+
15
+ connector_dict = {
16
+ 'pythainlp_lexicon': '',
17
+ 'wiktionary_phonemic': '-'
18
+ }
19
+
20
+
21
+ modelFile = model_path[model]
22
+ connector = connector_dict[model]
23
+
24
+ options = Struct(**{'profile': None, 'resource_usage': None, 'psyco': None, 'tempdir': None, 'trainSample': None, 'develSample': None, 'testSample': None, 'checkpoint': None, 'resume_from_checkpoint': None, 'shouldTranspose': None, 'modelFile': modelFile , 'newModelFile': None, 'shouldTestContinuously': None, 'shouldSelfTest': None, 'lengthConstraints': None, 'shouldSuppressNewMultigrams': None, 'viterbi': None, 'shouldRampUp': None, 'shouldWipeModel': None, 'shouldInitializeWithCounts': None, 'minIterations': 20, 'maxIterations': 100, 'eager_discount_adjustment': None, 'fixed_discount': None, 'encoding': 'UTF-8', 'phoneme_to_phoneme': None, 'test_segmental': None, 'testResult': None, 'applySample': None, 'applyWord': word, 'variants_mass': None, 'variants_number': None, 'fakeTranslator': None, 'stack_limit': None})
25
+
26
+ loadSample = g2p.loadG2PSample
27
+
28
+ model = SequiturTool.procureModel(options, loadSample)
29
+ if not model:
30
+ return 1
31
+ translator = g2p.Translator(model)
32
+ del model
33
+
34
+ return connector.join(translator(tuple(word)))
@@ -1,39 +1,163 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require 'yaml'
3
+ require "yaml"
4
+ require "interscript/mapping"
4
5
 
5
6
  # Transliteration
6
7
  module Interscript
7
- SYSTEM_DEFINITIONS_PATH = File.expand_path('../maps', __dir__)
8
8
 
9
9
  class << self
10
- def transliterate_file(system_code, input_file, output_file)
10
+ def root_path
11
+ @root_path ||= Pathname.new(File.dirname(__dir__))
12
+ end
13
+
14
+ def transliterate_file(system_code, input_file, output_file, maps)
11
15
  input = File.read(input_file)
12
- output = transliterate(system_code, input)
16
+ output = transliterate(system_code, input, maps)
13
17
 
14
- File.open(output_file, "w") do |f|
18
+ File.open(output_file, 'w') do |f|
15
19
  f.puts(output)
16
20
  end
17
21
  puts "Output written to: #{output_file}"
18
22
  end
19
23
 
20
- def load_system_definition(system_code)
21
- YAML.load_file(File.join(SYSTEM_DEFINITIONS_PATH, "#{system_code}.yaml"))
24
+ def import_python_modules
25
+ begin
26
+ pyimport :g2pwrapper
27
+ rescue
28
+ pyimport :sys
29
+ sys.path.append(root_path.to_s+"/lib/")
30
+ pyimport :g2pwrapper
31
+ end
32
+ end
33
+
34
+ def external_process(process_name, string)
35
+ import_python_modules
36
+ case process_name
37
+ when 'sequitur.pythainlp_lexicon'
38
+ return g2pwrapper.transliterate('pythainlp_lexicon', string)
39
+ when 'sequitur.wiktionary_phonemic'
40
+ return g2pwrapper.transliterate('wiktionary_phonemic', string)
41
+ else
42
+ puts "Invalid Process"
43
+ end
22
44
  end
23
45
 
24
- def transliterate(system_code, string)
25
- system = load_system_definition(system_code)
46
+ def transliterate(system_code, string, maps={})
47
+ if (!maps.has_key?system_code)
48
+ maps[system_code] = Interscript::Mapping.for(system_code)
49
+ end
50
+ # mapping = Interscript::Mapping.for(system_code)
51
+ mapping = maps[system_code]
52
+
26
53
 
27
- rules = system["map"]["rules"] || []
28
- charmap = system["map"]["characters"] || {}
54
+ # First, apply chained transliteration as specified in the list `chain`
55
+ chain = mapping.chain.dup
56
+ while chain.length > 0
57
+ string = transliterate(chain.shift, string, maps)
58
+ end
59
+
60
+ # Then, apply the rest of the map
61
+ separator = mapping.character_separator || ""
62
+ word_separator = mapping.word_separator || ""
63
+ title_case = mapping.title_case
64
+ downcase = mapping.downcase
65
+
66
+ # charmap = mapping.characters&.sort_by { |k, _v| k.size }&.reverse&.to_h
67
+ # dictmap = mapping.dictionary&.sort_by { |k, _v| k.size }&.reverse&.to_h
68
+ charmap = mapping.characters_hash
69
+ dictmap = mapping.dictionary_hash
70
+ trie = mapping.dictionary_trie
71
+
72
+ # Segmentation
73
+ string = external_process(mapping.segmentation, string) if mapping.segmentation
29
74
 
30
- rules.each do |r|
31
- string.gsub! %r{#{r["pattern"]}}, r["result"]
75
+ # Transliteration/Transcription
76
+ string = external_process(mapping.transcription, string) if mapping.transcription
77
+
78
+ pos = 0
79
+ while pos < string.to_s.size
80
+ m = 0
81
+ wordmatch = ""
82
+
83
+ # Using Trie, find the longest matching substring
84
+ while (pos + m < string.to_s.size) && (trie.partial_word?string[pos..pos+m])
85
+ wordmatch = string[pos..pos+m] if trie.word?string[pos..pos+m]
86
+ m += 1
87
+ end
88
+ m = wordmatch.length
89
+ if m > 0
90
+ repl = dictmap[string[pos..pos+m-1]]
91
+ string[pos..pos+m-1] = repl
92
+ pos += repl.length
93
+ else
94
+ pos += 1
95
+ end
32
96
  end
33
97
 
34
- string.split('').map do |char|
35
- charmap[char] || char
36
- end.join('')
98
+ output = string.clone
99
+ offsets = Array.new string.to_s.size, 1
100
+
101
+ # mapping.rules.each do |r|
102
+ # string.to_s.scan(/#{r['pattern']}/) do |matches|
103
+ # match = Regexp.last_match
104
+ # pos = match.offset(0).first
105
+ # result = r['result'].clone
106
+ # matches.each.with_index { |v, i| result.sub!(/\\#{i + 1}/, v) } if matches.is_a? Array
107
+ # result.upcase! if up_case_around?(string, pos)
108
+ # output[offsets[0...pos].sum, match[0].size] = result
109
+ # offsets[pos] += result.size - match[0].size
110
+ # end
111
+ # end
112
+ mapping.rules.each do |r|
113
+ output.gsub!(/#{r['pattern']}/, r['result'])
114
+ end
115
+
116
+ charmap.each do |k, v|
117
+ while (match = output&.match(/#{k}/))
118
+ pos = match.offset(0).first
119
+ result = !downcase && up_case_around?(output, pos) ? v.upcase : v
120
+ result = result[0] if result.is_a?(Array) # if more than one, choose the first one
121
+ output[pos, match[0].size] = add_separator(separator, pos, result)
122
+ end
123
+ end
124
+
125
+ mapping.postrules.each do |r|
126
+ output.gsub!(/#{r['pattern']}/, r['result'])
127
+ end
128
+
129
+ if output
130
+ output.sub!(/^(.)/, &:upcase) if title_case
131
+ if word_separator != ''
132
+ output.gsub!(/#{word_separator}#{separator}/,word_separator)
133
+ output.gsub!(/#{word_separator}(.)/, &:upcase) if title_case
134
+ end
135
+ end
136
+
137
+ output ? output.unicode_normalize : output
138
+ end
139
+
140
+ private
141
+
142
+ def add_separator(separator, pos, result)
143
+ pos == 0 ? result : separator + result
144
+ end
145
+
146
+ def up_case_around?(string, pos)
147
+ return false if string[pos] == string[pos].downcase
148
+
149
+ i = pos - 1
150
+ i -= 1 while i.positive? && string[i] !~ /[[:alpha:]]/
151
+ before = i >= 0 && i < pos ? string[i].to_s.strip : ''
152
+
153
+ i = pos + 1
154
+ i += 1 while i < string.size - 1 && string[i] !~ /[[:alpha:]]/
155
+ after = i > pos ? string[i].to_s.strip : ''
156
+
157
+ before_uc = !before.empty? && before == before.upcase
158
+ after_uc = !after.empty? && after == after.upcase
159
+ # before_uc && (after.empty? || after_uc) || after_uc && (before.empty? || before_uc)
160
+ before_uc || after_uc
37
161
  end
38
162
  end
39
163
  end