interscript 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120) hide show
  1. checksums.yaml +4 -4
  2. data/README.adoc +246 -14
  3. data/bin/interscript +38 -17
  4. data/bin/setup +8 -0
  5. data/lib/g2pwrapper.py +34 -0
  6. data/lib/interscript.rb +140 -16
  7. data/lib/interscript/command.rb +27 -0
  8. data/lib/interscript/mapping.rb +125 -0
  9. data/lib/interscript/version.rb +1 -1
  10. data/lib/model-7 +0 -0
  11. data/lib/tha-pt-b-7 +0 -0
  12. data/maps/acadsin-zho-Hani-Latn-2002.yaml +38912 -0
  13. data/maps/alalc-bel-cyrl-latn-1997.yaml +125 -0
  14. data/maps/alalc-ben-Beng-Latn-2017.yaml +130 -0
  15. data/maps/alalc-bul-Cyrl-Latn-1997.yaml +94 -0
  16. data/maps/alalc-ell-Grek-Latn-1997.yaml +625 -0
  17. data/maps/alalc-ell-Grek-Latn-2010.yaml +628 -0
  18. data/maps/alalc-kat-Geok-Latn-1997.yaml +112 -0
  19. data/maps/alalc-kat-Geor-Latn-1997.yaml +146 -0
  20. data/maps/alalc-kor-Hang-Latn-1997.yaml +94 -0
  21. data/maps/alalc-mkd-Cyrl-Latn-2013.yaml +103 -0
  22. data/maps/alalc-mkd-cyrl-latn-1997.yaml +114 -0
  23. data/maps/alalc-srp-Cyrl-Latn-1997.yaml +114 -0
  24. data/maps/alalc-srp-cyrl-latn-2013.yaml +135 -0
  25. data/maps/alalc-ukr-Cyrl-Latn-1997.yaml +141 -0
  26. data/maps/alalc-ukr-Cyrl-Latn-2011.yaml +16 -0
  27. data/maps/apcbg-bul-Cyrl-Latn-1995.yaml +283 -0
  28. data/maps/{bas-rus-Cyrl-Latn-bss.yaml → bas-rus-Cyrl-Latn-2017-bss.yaml} +57 -31
  29. data/maps/{bas-rus-Cyrl-Latn-oss.yaml → bas-rus-Cyrl-Latn-2017-oss.yaml} +54 -34
  30. data/maps/bgn-jpn-Hrkt-Latn-1962.yaml +294 -0
  31. data/maps/bgn-kor-Hang-Latn-1943.yaml +31 -0
  32. data/maps/bgn-kor-Kore-Latn-1943.yaml +31 -0
  33. data/maps/bgna-bul-Cyrl-Latn-2006.yaml +208 -0
  34. data/maps/bgna-bul-Cyrl-Latn-2009.yaml +208 -0
  35. data/maps/bgnpcgn-arm-Armn-Latn-1981.yaml +1 -2
  36. data/maps/bgnpcgn-aze-Cyrl-Latn-1993.yaml +104 -0
  37. data/maps/bgnpcgn-bel-cyrl-latn-1979.yaml +285 -0
  38. data/maps/bgnpcgn-bul-Cyrl-Latn-1952.yaml +115 -0
  39. data/maps/bgnpcgn-bul-Cyrl-Latn-2013.yaml +10 -64
  40. data/maps/bgnpcgn-chn-Hans-Latn-1979.yaml +7456 -0
  41. data/maps/bgnpcgn-ell-Grek-Latn-1962.yaml +702 -0
  42. data/maps/bgnpcgn-ell-Grek-Latn-1996.yaml +20 -0
  43. data/maps/bgnpcgn-jpn-Hrkt-Latn-1976.yaml +257 -0
  44. data/maps/bgnpcgn-kat-Geor-Latn-1981.yaml +127 -0
  45. data/maps/bgnpcgn-kat-Geor-Latn-2009.yaml +43 -0
  46. data/maps/bgnpcgn-kor-Hang-Latn-kn-1945.yaml +253 -0
  47. data/maps/bgnpcgn-kor-Hang-Latn-rok-2011.yaml +48 -0
  48. data/maps/bgnpcgn-kor-Kore-Latn-rok-2011.yaml +48 -0
  49. data/maps/bgnpcgn-mkd-Cyrl-Latn-1981.yaml +159 -0
  50. data/maps/bgnpcgn-mkd-Cyrl-Latn-2013.yaml +190 -0
  51. data/maps/bgnpcgn-rus-Cyrl-Latn-1947.yaml +145 -64
  52. data/maps/bgnpcgn-srp-Cyrl-Latn-2005.yaml +166 -0
  53. data/maps/bgnpcgn-ukr-Cyrl-Latn-1965.yaml +75 -2
  54. data/maps/bgnpcgn-ukr-Cyrl-Latn-2019.yaml +208 -0
  55. data/maps/by-bel-Cyrl-Latn-1998.yaml +168 -0
  56. data/maps/by-bel-Cyrl-Latn-2007.yaml +115 -0
  57. data/maps/elot-ell-Grek-Latn-743-1982-tl.yaml +685 -0
  58. data/maps/elot-ell-Grek-Latn-743-1982-ts.yaml +681 -0
  59. data/maps/elot-ell-Grek-Latn-743-2001-tl.yaml +20 -0
  60. data/maps/elot-ell-Grek-Latn-743-2001-ts.yaml +32 -0
  61. data/maps/ggg-kat-Geor-Latn-2002.yaml +89 -0
  62. data/maps/gki-bel-cyrl-latn-1992.yaml +33 -0
  63. data/maps/gki-bel-cyrl-latn-2000.yaml +201 -0
  64. data/maps/gost-rus-cyrl-latn-16876-71-1983.yaml +186 -0
  65. data/maps/hk-yue-Hani-Latn-1888.yaml +38497 -0
  66. data/maps/icao-bel-Cyrl-Latn-9303.yaml +108 -92
  67. data/maps/icao-bul-Cyrl-Latn-9303.yaml +1 -2
  68. data/maps/icao-heb-Hebr-Latn-9303.yaml +118 -124
  69. data/maps/icao-mkd-Cyrl-Latn-9303.yaml +1 -2
  70. data/maps/icao-per-Arab-Latn-9303.yaml +5 -6
  71. data/maps/icao-rus-Cyrl-Latn-9303.yaml +1 -2
  72. data/maps/icao-srp-Cyrl-Latn-9303.yaml +1 -2
  73. data/maps/icao-ukr-Cyrl-Latn-9303.yaml +1 -2
  74. data/maps/iso-ell-Grek-Latn-843-1997-t1.yaml +610 -0
  75. data/maps/iso-ell-Grek-Latn-843-1997-t2.yaml +41 -0
  76. data/maps/iso-jpn-Hrkt-Latn-3602-1989.yaml +62 -0
  77. data/maps/{iso-rus-Cyrl-Latn-iso9.yaml → iso-rus-Cyrl-Latn-9-1995.yaml} +2 -3
  78. data/maps/iso-tha-Thai-Latn-11940-1998.yaml +109 -0
  79. data/maps/kp-kor-Hang-Latn-2002.yaml +901 -0
  80. data/maps/lshk-yue-Hani-Latn-jyutping-1993.yaml +44820 -0
  81. data/maps/mext-jpn-Hrkt-Latn-1954.yaml +411 -0
  82. data/maps/moct-kor-Hang-Latn-2000.yaml +803 -0
  83. data/maps/mofa-jpn-Hrkt-Latn-1989.yaml +541 -0
  84. data/maps/nil-kor-Hang-Hang-jamo.yaml +11193 -0
  85. data/maps/odni-kat-Geor-Latn-2015.yaml +88 -0
  86. data/maps/odni-ukr-Cyrl-Latn-2015.yaml +157 -0
  87. data/maps/royin-tha-Thai-Latn-1939-generic.yaml +90 -0
  88. data/maps/royin-tha-Thai-Latn-1968.yaml +179 -0
  89. data/maps/royin-tha-Thai-Latn-1999-chained.yaml +180 -0
  90. data/maps/royin-tha-Thai-Latn-1999.yaml +76 -0
  91. data/maps/{cn-chn-Hans-Latn-pinyin.yaml → sac-zho-Hans-Latn-1979.yaml} +6 -7
  92. data/maps/stategeocadastre-ukr-Cyrl-Latn-1993.yaml +222 -0
  93. data/maps/ua-ukr-Cyrl-Latn-1996.yaml +193 -0
  94. data/maps/un-bel-Cyrl-Latn-2007.yaml +114 -0
  95. data/maps/un-ben-Beng-Latn-2016.yaml +534 -0
  96. data/maps/un-ell-Grek-Latn-1987-tl.yaml +32 -0
  97. data/maps/un-ell-Grek-Latn-1987-ts.yaml +20 -0
  98. data/maps/un-ell-Grek-Latn-phonetic-1987.yaml +780 -0
  99. data/maps/un-mon-Mong-Latn-2013.yaml +19 -6
  100. data/maps/un-rus-Cyrl-Latn-1987.yaml +166 -0
  101. data/maps/un-ukr-cyrl-latn-1998.yaml +30 -0
  102. data/maps/var-jpn-Hrkt-Latn-hepburn-1886.yaml +406 -0
  103. data/maps/var-jpn-Hrkt-Latn-hepburn-1954.yaml +386 -0
  104. data/maps/var-kor-Hang-Latn-mr-1939.yaml +1054 -0
  105. data/maps/var-kor-Kore-Hang-2013.yaml +59754 -0
  106. data/maps/var-kor-Kore-Latn-mr-1939.yaml +37 -0
  107. data/maps/var-tha-Thai-Thai-phonemic.yaml +59 -0
  108. data/maps/var-tha-Thai-Zsym-ipa.yaml +301 -0
  109. data/maps/var-zho-Hani-Latn-1979.yaml +38908 -0
  110. data/spec/interscript/mapping_spec.rb +42 -0
  111. data/spec/interscript_spec.rb +20 -5
  112. data/spec/spec_helper.rb +3 -1
  113. metadata +149 -24
  114. data/maps/bgnpcgn-chn-Hans-Latn-pinyin.yaml +0 -7503
  115. data/maps/historic-jpn-Hrkt-Latn-hepburn.yaml +0 -336
  116. data/maps/icao-gre-Grek-Latn-9303.yaml +0 -101
  117. data/maps/mext-jpn-Hrkt-Latn-hepburn.yaml +0 -330
  118. data/maps/mext-jpn-Hrkt-Latn-kunrei.yaml +0 -308
  119. data/maps/un-jpn-Hrkt-Latn-hepburn.yaml +0 -313
  120. data/maps/un-jpn-Hrkt-Latn-kunrei.yaml +0 -354
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 736d2d149984ce550327443c83f4f8b65ad3a46c106bb2c5d30392292b9e2ed6
4
- data.tar.gz: 0dd4633aeccdfb1acfc618fe8708aafd4b7dac8b051fc6e6ac0420fdacc46066
3
+ metadata.gz: 643981da933194b2464ea279e9d31b9fcd9d32519c5cd236ed805855c93755ad
4
+ data.tar.gz: f54c4303bb02f0a873cfdf96287d78321648cee19c685bf338cb9f8e2f642c56
5
5
  SHA512:
6
- metadata.gz: 156e919c03e8e7a7a0ce804b0d5402df833783bc82037d5fd06b75b4464cee12d4c426a88911394963888205ff4e7bc71e15eb19a6096cc1e71cd7d406efc3a1
7
- data.tar.gz: c821069c94ba9e06d2a70b7e0b20916d1222fabf0a4daf7f5014999e24aac8d218beeb59b0316c885bfb28d84a11fb50d4b4216b157a6089f282811ef7ffa590
6
+ metadata.gz: 2d8cfd0d60e2d41d8b1e31b4e61353b0bc7fd5ac4fc426d4304ccc86bc0bb6d84b4b4a2a6e44bb342afa6c20202a4bca4180a1f5037c73072e246038c6f36f1f
7
+ data.tar.gz: 2a5fffac1de98702494f69d55b2de5200684195b0f7948619bfa2ae9f3f97810c731868f2550578f5ad97a9db9fa72d9c2abad24451437b7e08673dfc1cd97d8
@@ -1,45 +1,259 @@
1
- = Interscript: Interoperable Script Conversion Systems and a Ruby implementation
1
+ = Interscript: Interoperable Script Conversion Systems, with a Ruby implementation
2
2
 
3
- == Introducation
3
+ image:https://github.com/interscript/interscript/workflows/test/badge.svg["Build Status", link="https://github.com/interscript/interscript/actions?workflow=test"]
4
4
 
5
- This repository contains a number of transliteration schemes from:
5
+ == Introduction
6
6
 
7
+ This repository contains interoperable transliteration schemes from:
8
+
9
+ * ALA-LC
7
10
  * BGN/PCGN
8
11
  * ICAO
9
12
  * ISO
10
13
  * UN (by UNGEGN)
14
+ * Many, many other script conversion system authorities.
11
15
 
12
16
  The goal is to achieve interoperable transliteration schemes allowing quality comparisons.
13
17
 
14
18
 
15
- == STATUS (work in progress!)
16
19
 
17
- These transliteration systems currently work:
20
+ == Demonstration
21
+
22
+ These transliteration systems are used in the demo:
18
23
 
19
24
  `bgnpcgn-rus-Cyrl-Latn-1947`:: BGN/PCGN Romanization of Russian
20
25
  `iso-rus-Cyrl-Latn-iso9`:: ISO 9 Romanization of Russian
21
26
  `icao-rus-Cyrl-Latn-9303`:: ICAO MRZ Romanization of Russian
22
27
  `bas-rus-Cyrl-Latn-bss`:: Bulgaria Academy of Science Streamlined System for Russian
23
28
 
29
+ image:demo/20191118-interscript-demo-cast.gif["interscript screencast"]
30
+
31
+
32
+ == Installation
33
+
34
+ === Prerequisites
35
+
36
+ Linux:
37
+
38
+ [source,sh]
39
+ ----
40
+ apt-get install swig python3-setuptools
41
+ ----
42
+
43
+ Windows:
44
+
45
+ [source,sh]
46
+ ----
47
+ choco install --no-progress swig
48
+ ----
49
+
50
+ Interscript depends on Python and the https://github.com/sequitur-g2p/sequitur-g2p[`sequitur-g2p`] module
51
+
52
+ [source,sh]
53
+ ----
54
+ pip3 install setuptools numpy
55
+ curl -sSL -o sequitur-g2p.zip https://github.com/sequitur-g2p/sequitur-g2p/archive/806273f.zip
56
+ pip3 install sequitur-g2p.zip
57
+ ----
58
+
59
+ Interscript depends on Ruby. Once you manage to install Ruby, it's easy.
60
+
61
+ [source,sh]
62
+ ----
63
+ gem install interscript
64
+ ----
24
65
 
25
66
  == Usage
26
67
 
68
+ Assume you have a file ready in the source script like this:
69
+
70
+ [source,sh]
71
+ ----
72
+ cat <<EOT > rus-Cyrl.txt
73
+ Эх, тройка! птица тройка, кто тебя выдумал? знать, у бойкого народа ты
74
+ могла только родиться, в той земле, что не любит шутить, а
75
+ ровнем-гладнем разметнулась на полсвета, да и ступай считать версты,
76
+ пока не зарябит тебе в очи. И не хитрый, кажись, дорожный снаряд, не
77
+ железным схвачен винтом, а наскоро живьём с одним топором да долотом
78
+ снарядил и собрал тебя ярославский расторопный мужик. Не в немецких
79
+ ботфортах ямщик: борода да рукавицы, и сидит чёрт знает на чём; а
80
+ привстал, да замахнулся, да затянул песню — кони вихрем, спицы в
81
+ колесах смешались в один гладкий круг, только дрогнула дорога, да
82
+ вскрикнул в испуге остановившийся пешеход — и вон она понеслась,
83
+ понеслась, понеслась!
84
+
85
+ Н.В. Гоголь
86
+ EOT
87
+ ----
88
+
89
+ You can run `interscript` on this text using different transliteration systems.
90
+
91
+ [source,sh]
92
+ ----
93
+ interscript rus-Cyrl.txt \
94
+ --system=bgnpcgn-rus-Cyrl-Latn-1947 \
95
+ --output=bgnpcgn-rus-Latn.txt
96
+
97
+ interscript rus-Cyrl.txt \
98
+ --system=iso-rus-Cyrl-Latn-iso9 \
99
+ --output=iso-rus-Latn.txt
100
+
101
+ interscript rus-Cyrl.txt \
102
+ --system=icao-rus-Cyrl-Latn-9303 \
103
+ --output=icao-rus-Latn.txt
104
+
105
+ interscript rus-Cyrl.txt \
106
+ --system=bas-rus-Cyrl-Latn-bss \
107
+ --output=bas-rus-Latn.txt
108
+ ----
109
+
110
+ It is then easy to see the exact differences in rendering between the systems.
111
+
112
+ [source,sh]
113
+ ----
114
+ diff bgnpcgn-rus-Latn.txt bas-rus-Latn.txt
115
+ ----
116
+
117
+ == Adding a transliteration system
118
+
119
+ Transliteration systems are stored in the `maps/` directory as YAML files.
120
+ You can create a new file and add it to the directory.
121
+
122
+ The file should be named as `<system-code>.yaml`, where `system-code`
123
+ is in accordance with
124
+ http://calconnect.gitlab.io/tc-localization/csd-transcription-systems[ISO/CC 24229].
125
+
126
+ === File structure
127
+
128
+ [source,yaml]
129
+ ----
130
+ authority_id: bgnpcgn
131
+ id: 1947
132
+ language: rus
133
+ source_script: Cyrl
134
+ destination_script: Latn
135
+ name: ROMANIZATION OF RUSSIAN, BGN/PCGN 1947 System
136
+ url: https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/807920/ROMANIZATION_OF_RUSSIAN.pdf
137
+ creation_date: 1947
138
+ confirmation_date: 2019-06
139
+ description: The BGN/PCGN system for Russian was adopted ...
140
+
141
+ notes:
142
+ - The character e should be romanized ye initially, after the vowel ...
143
+
144
+ tests:
145
+ - source: ДЛИННОЕ ПОКРЫВАЛО
146
+ expected: DLINNOYE POKRYVALO
147
+ - source: Еловая шишка
148
+ expected: Yelovaya shishka
149
+
150
+ map:
151
+ rules:
152
+ - pattern: (?<=[АаЕеЁёИиОоУуЫыЭэЮюЯяЙйЪъЬь])\u0415 # Е after a, e, ё, и, о, у, ы, э, ю, я, й, ъ, ь
153
+ result: Ye
154
+ - pattern: \b\u0415 # Е initially
155
+ result: Ye
156
+
157
+ characters:
158
+ "\u0410": "A"
159
+ "\u0411": "B"
160
+ "\u0412": "V"
161
+ ----
162
+
163
+
164
+ === Rules
165
+
166
+ The subsection `rules` is placed under the `map` key. All rules are applied in the order in which they are listed, before the `characters` subsection is applied. Rules apply to the original text, not to the result of previously applied rules.
167
+
168
+ Each rule has `pattern` and `result` elements.
169
+
170
+ The pattern is a regular expression. It should be written as a string, without `//` or `%r{}` delimiters. For example `\b\u0415`. If a rule depends on the preceding or following content, lookahead or lookbehind can be used. For example, a rule with the pattern `(?<=[АаЕеЁёИиОоУуЫыЭэЮюЯяЙйЪъЬь])\u0415` finds every Е after upper or lower case symbols a, e, ё, и, о, у, ы, э, ю, я, й, ъ, ь.
171
+
172
+ The result is the replacement for the pattern's match. It can contain a string, Unicode characters specified by hexadecimal numbers, or a captured group reference. A string containing a hexadecimal escape or a captured group reference should be double-quoted. For example `"Y\u00eb"` or `"\\1\u00b7\\2"`. Captured groups are referred to by a double backslash followed by the group's number.
173
+
174
+ Because rules are applied in order, multiple rules applicable to the same segment of a string can be addressed by rule ordering, and rules can be used as priority over characters. For example:
175
+
176
+ [source,yaml]
177
+ ----
178
+ map:
179
+ rules:
180
+ - pattern: \u03B3\u03B3 # γ (before Γ, Ξ, Χ)
181
+ result: ng
182
+ - pattern: (?<![Γγ])\u03B3(?=[ΕεέΗηήΙιίΥυύ]) # γ (before front vowels)
183
+ result: y
184
+ ----
185
+
186
+ (γι maps to `yi`; but γγ maps to `ng`. In the case of γγι, the first rule takes priority, and the transliteration is `ngi`: it makes the second rule impossible.)
187
+
188
+ [source,yaml]
189
+ ----
190
+ map:
191
+ rules:
192
+ - pattern: (?<=\b)\u03BC[πΠ] # μπ (initially)
193
+ result: b
194
+ - pattern: \u03BC[πΠ] # μπ (medially)
195
+ result: mb
196
+ ----
197
+
198
+ (The first rule applies at the start of a word; the second rule does not specify a context, as it applies in all other cases not covered by the first rule.)
199
+
200
+ [source,yaml]
201
+ ----
202
+ map:
203
+ rules:
204
+ - pattern: ";"
205
+ result: "?"
206
+
207
+ characters:
208
+ "\u00B7": ";"
209
+ ----
210
+
211
+ (This guarantees that any `;` are converted to `?` before any new `;` are introduced; because all three are Latin script, they could be mixed up in ordering.)
212
+
213
+ Normally rules "`bleed`" each other: once a rule applies to a segment, that segment cannot trigger other rules, because it is already converted to Roman. Exceptionally, it will be necessary to have a rule add or remove characters in the original script, rather than transliterate them, so that the same context can be invoked by two rules in succession:
214
+
215
+ [source,yaml]
216
+ ----
217
+ map:
218
+ rules:
219
+ - pattern: (?<=[АаЕеЁёИиОоУуЫыЭэЮюЯя])\u042b # Ы after any vowel character
220
+ result: "\u00b7Ы"
221
+ - pattern: \u042b(?=[АаУуЫыЭэ]) # Ы before а, у, ы, or э
222
+ result: "Ы\u00b7"
223
+ ----
224
+
225
+ (If the result were `\u00B7Y`, the second rule could not be applied afterwards; but we want ОЫУ to transliterate as `O·Y·U`. In order to make that happen, we preserve the Ы during the rules phase, resulting in О·Ы·У; we only convert the letters to Roman script in the `characters` phase.)
226
+
227
+ === Testing transliteration systems
228
+
229
+ To test all transliteration systems in the `maps/` directory, run:
27
230
 
28
231
  [source,sh]
29
232
  ----
30
- # Transliterating Russian Cyrillic to Latin using the Streamlined System for Russian
31
- interscript samples/rus-Cyrl.txt --system=bas-rus-Cyrl-Latn-bss --output=rus-Latn.txt
233
+ bundle exec rspec
234
+ ----
235
+
236
+ The command takes `source` texts from the `tests` section, transforms
237
+ them using `rules` and `characters` from the `map` key, and compares the
238
+ results with the `expected` texts from the same `tests` section.
239
+
240
+ To test a specific transliteration system, set the environment variable
241
+ `TRANSLIT_SYSTEM` to the system code of the desired system
242
+ (i.e. the "`basename`" of the system's YAML file):
32
243
 
33
- # Transliterating Russian Cyrillic to Latin using the BGN/PCGN Romanization of Russian
34
- interscript samples/rus-Cyrl.txt --system=bgnpcgn-rus-Cyrl-Latn-1947 --output=rus-Latn.txt
244
+ [source,sh]
245
+ ----
246
+ TRANSLIT_SYSTEM=bgnpcgn-rus-Cyrl-Latn-1947 bundle exec rspec
35
247
  ----
36
248
 
37
249
 
38
250
  == ISCS system codes
39
251
 
40
- The system code identifying a script conversion system has a few components:
252
+ In accordance with
253
+ http://calconnect.gitlab.io/tc-localization/csd-transcription-systems[ISO/CC 24229],
254
+ the system code identifying a script conversion system has the following components:
41
255
 
42
- e.g. `bgnpcgn-rus-Cyrl-Latn-1947`
256
+ e.g. `bgnpcgn-rus-Cyrl-Latn-1947`:
43
257
 
44
258
  `bgnpcgn`:: the authority identifier
45
259
  `rus`:: an ISO 639-2 3-letter language code that this system applies to
@@ -53,13 +267,31 @@ e.g. `bgnpcgn-rus-Cyrl-Latn-1947`
53
267
  Currently the schemes cover Cyrillic, Armenian, Greek, Arabic and Hebrew.
54
268
 
55
269
 
56
- == Sources
270
+ == Samples to play with
57
271
 
58
272
  * `rus-Cyrl-1.txt`: Copied from the XLS output from http://www.primorsk.vybory.izbirkom.ru/region/primorsk?action=show&global=true&root=254017025&tvd=4254017212287&vrn=100100067795849&prver=0&pronetvd=0&region=25&sub_region=25&type=242&vibid=4254017212287
59
273
 
60
274
  * `rus-Cyrl-2.txt`: Copied from the XLS output from http://www.yaroslavl.vybory.izbirkom.ru/region/yaroslavl?action=show&root=764013001&tvd=4764013188704&vrn=4764013188693&prver=0&pronetvd=0&region=76&sub_region=76&type=426&vibid=4764013188704
61
275
 
62
276
 
63
- == Credits
277
+ == References
278
+
279
+ Reference documents are located at the
280
+ https://github.com/interscript/interscript-references[interscript-references repository].
281
+ Some specifications that have distribution limitations may not be reproduced there.
282
+
283
+
284
+ == Links to system definitions
285
+
286
+ * https://www.iso.org/committee/48750.html[ISO/TC 46 (see standards published by WG 3)]
287
+ * http://geonames.nga.mil/gns/html/romanization.html[BGN/PCGN and BGN Romanization systems (BGN)]
288
+ * https://www.gov.uk/government/publications/romanization-systems[BGN/PCGN Romanization systems (PCGN)]
289
+ * https://www.loc.gov/catdir/cpso/roman.html[ALA-LC Romanization systems in current use]
290
+ * http://catdir.loc.gov/catdir/cpso/roman.html[ALA-LC Romanization systems from 1997]
291
+ * http://www.eki.ee/wgrs/[UN Romanization systems]
292
+ * http://www.eki.ee/knab/kblatyl2.htm[EKI KNAB systems]
293
+
294
+ == Copyright and license
295
+
296
+ This is a Ribose project. Copyright Ribose.
64
297
 
65
- This is a Ribose project.
@@ -1,20 +1,41 @@
1
1
  #!/usr/bin/env ruby
2
+
2
3
  require 'rubygems'
3
- require_relative '../lib/interscript'
4
-
5
- if ARGV.empty?
6
- puts "write source file, source format, and output file"
7
- else
8
- args = Hash[ARGV.flat_map { |s| s.scan(/--?([^=\s]+)(?:=(\S+))?/) }]
9
- input = ARGV[0]
10
- system_code = args["system"]
11
- output_file = args["output"]
12
-
13
- raise "Please enter the system code with --system={system_code}" unless system_code
14
-
15
- if output_file
16
- Interscript.transliterate_file(system_code, input, output_file)
17
- else
18
- puts Interscript.transliterate(system_code, IO.read(input))
19
- end
4
+
5
+ # resolve bin path, ignoring symlinks
6
+ require 'pathname'
7
+ bin_file = Pathname.new(__FILE__).realpath
8
+
9
+ # add self to libpath
10
+ $LOAD_PATH.unshift File.expand_path("../../lib", bin_file)
11
+
12
+ # Fixes https://github.com/rubygems/rubygems/issues/1420
13
+ require 'rubygems/specification'
14
+
15
+ class Gem::Specification
16
+ def this; self; end
20
17
  end
18
+
19
+ require 'interscript/command'
20
+
21
+ if ARGV.any? && !Interscript::Command.all_tasks.key?(ARGV.first)
22
+ ARGV.unshift :translit
23
+ end
24
+ Interscript::Command.start ARGV
25
+
26
+ # if ARGV.empty?
27
+ # puts "write source file, source format, and output file"
28
+ # else
29
+ # args = Hash[ARGV.flat_map { |s| s.scan(/--?([^=\s]+)(?:=(\S+))?/) }]
30
+ # input = ARGV[0]
31
+ # system_code = args["system"]
32
+ # output_file = args["output"]
33
+
34
+ # raise "Please enter the system code with --system={system_code}" unless system_code
35
+
36
+ # if output_file
37
+ # Interscript.transliterate_file(system_code, input, output_file)
38
+ # else
39
+ # puts Interscript.transliterate(system_code, IO.read(input))
40
+ # end
41
+ # end
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,34 @@
1
+ import g2p, SequiturTool
2
+ import numpy
3
+
4
+ def transliterate(model, word):
5
+
6
+ class Struct:
7
+ def __init__(self, **entries):
8
+ self.__dict__.update(entries)
9
+
10
+ model_path = {
11
+ 'pythainlp_lexicon': './lib/model-7',
12
+ 'wiktionary_phonemic': './lib/tha-pt-b-7'
13
+ }
14
+
15
+ connector_dict = {
16
+ 'pythainlp_lexicon': '',
17
+ 'wiktionary_phonemic': '-'
18
+ }
19
+
20
+
21
+ modelFile = model_path[model]
22
+ connector = connector_dict[model]
23
+
24
+ options = Struct(**{'profile': None, 'resource_usage': None, 'psyco': None, 'tempdir': None, 'trainSample': None, 'develSample': None, 'testSample': None, 'checkpoint': None, 'resume_from_checkpoint': None, 'shouldTranspose': None, 'modelFile': modelFile , 'newModelFile': None, 'shouldTestContinuously': None, 'shouldSelfTest': None, 'lengthConstraints': None, 'shouldSuppressNewMultigrams': None, 'viterbi': None, 'shouldRampUp': None, 'shouldWipeModel': None, 'shouldInitializeWithCounts': None, 'minIterations': 20, 'maxIterations': 100, 'eager_discount_adjustment': None, 'fixed_discount': None, 'encoding': 'UTF-8', 'phoneme_to_phoneme': None, 'test_segmental': None, 'testResult': None, 'applySample': None, 'applyWord': word, 'variants_mass': None, 'variants_number': None, 'fakeTranslator': None, 'stack_limit': None})
25
+
26
+ loadSample = g2p.loadG2PSample
27
+
28
+ model = SequiturTool.procureModel(options, loadSample)
29
+ if not model:
30
+ return 1
31
+ translator = g2p.Translator(model)
32
+ del model
33
+
34
+ return connector.join(translator(tuple(word)))
@@ -1,39 +1,163 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require 'yaml'
3
+ require "yaml"
4
+ require "interscript/mapping"
4
5
 
5
6
  # Transliteration
6
7
  module Interscript
7
- SYSTEM_DEFINITIONS_PATH = File.expand_path('../maps', __dir__)
8
8
 
9
9
  class << self
10
- def transliterate_file(system_code, input_file, output_file)
10
+ def root_path
11
+ @root_path ||= Pathname.new(File.dirname(__dir__))
12
+ end
13
+
14
+ def transliterate_file(system_code, input_file, output_file, maps)
11
15
  input = File.read(input_file)
12
- output = transliterate(system_code, input)
16
+ output = transliterate(system_code, input, maps)
13
17
 
14
- File.open(output_file, "w") do |f|
18
+ File.open(output_file, 'w') do |f|
15
19
  f.puts(output)
16
20
  end
17
21
  puts "Output written to: #{output_file}"
18
22
  end
19
23
 
20
- def load_system_definition(system_code)
21
- YAML.load_file(File.join(SYSTEM_DEFINITIONS_PATH, "#{system_code}.yaml"))
24
+ def import_python_modules
25
+ begin
26
+ pyimport :g2pwrapper
27
+ rescue
28
+ pyimport :sys
29
+ sys.path.append(root_path.to_s+"/lib/")
30
+ pyimport :g2pwrapper
31
+ end
32
+ end
33
+
34
+ def external_process(process_name, string)
35
+ import_python_modules
36
+ case process_name
37
+ when 'sequitur.pythainlp_lexicon'
38
+ return g2pwrapper.transliterate('pythainlp_lexicon', string)
39
+ when 'sequitur.wiktionary_phonemic'
40
+ return g2pwrapper.transliterate('wiktionary_phonemic', string)
41
+ else
42
+ puts "Invalid Process"
43
+ end
22
44
  end
23
45
 
24
- def transliterate(system_code, string)
25
- system = load_system_definition(system_code)
46
+ def transliterate(system_code, string, maps={})
47
+ if (!maps.has_key?system_code)
48
+ maps[system_code] = Interscript::Mapping.for(system_code)
49
+ end
50
+ # mapping = Interscript::Mapping.for(system_code)
51
+ mapping = maps[system_code]
52
+
26
53
 
27
- rules = system["map"]["rules"] || []
28
- charmap = system["map"]["characters"] || {}
54
+ # First, apply chained transliteration as specified in the list `chain`
55
+ chain = mapping.chain.dup
56
+ while chain.length > 0
57
+ string = transliterate(chain.shift, string, maps)
58
+ end
59
+
60
+ # Then, apply the rest of the map
61
+ separator = mapping.character_separator || ""
62
+ word_separator = mapping.word_separator || ""
63
+ title_case = mapping.title_case
64
+ downcase = mapping.downcase
65
+
66
+ # charmap = mapping.characters&.sort_by { |k, _v| k.size }&.reverse&.to_h
67
+ # dictmap = mapping.dictionary&.sort_by { |k, _v| k.size }&.reverse&.to_h
68
+ charmap = mapping.characters_hash
69
+ dictmap = mapping.dictionary_hash
70
+ trie = mapping.dictionary_trie
71
+
72
+ # Segmentation
73
+ string = external_process(mapping.segmentation, string) if mapping.segmentation
29
74
 
30
- rules.each do |r|
31
- string.gsub! %r{#{r["pattern"]}}, r["result"]
75
+ # Transliteration/Transcription
76
+ string = external_process(mapping.transcription, string) if mapping.transcription
77
+
78
+ pos = 0
79
+ while pos < string.to_s.size
80
+ m = 0
81
+ wordmatch = ""
82
+
83
+ # Using Trie, find the longest matching substring
84
+ while (pos + m < string.to_s.size) && (trie.partial_word?string[pos..pos+m])
85
+ wordmatch = string[pos..pos+m] if trie.word?string[pos..pos+m]
86
+ m += 1
87
+ end
88
+ m = wordmatch.length
89
+ if m > 0
90
+ repl = dictmap[string[pos..pos+m-1]]
91
+ string[pos..pos+m-1] = repl
92
+ pos += repl.length
93
+ else
94
+ pos += 1
95
+ end
32
96
  end
33
97
 
34
- string.split('').map do |char|
35
- charmap[char] || char
36
- end.join('')
98
+ output = string.clone
99
+ offsets = Array.new string.to_s.size, 1
100
+
101
+ # mapping.rules.each do |r|
102
+ # string.to_s.scan(/#{r['pattern']}/) do |matches|
103
+ # match = Regexp.last_match
104
+ # pos = match.offset(0).first
105
+ # result = r['result'].clone
106
+ # matches.each.with_index { |v, i| result.sub!(/\\#{i + 1}/, v) } if matches.is_a? Array
107
+ # result.upcase! if up_case_around?(string, pos)
108
+ # output[offsets[0...pos].sum, match[0].size] = result
109
+ # offsets[pos] += result.size - match[0].size
110
+ # end
111
+ # end
112
+ mapping.rules.each do |r|
113
+ output.gsub!(/#{r['pattern']}/, r['result'])
114
+ end
115
+
116
+ charmap.each do |k, v|
117
+ while (match = output&.match(/#{k}/))
118
+ pos = match.offset(0).first
119
+ result = !downcase && up_case_around?(output, pos) ? v.upcase : v
120
+ result = result[0] if result.is_a?(Array) # if more than one, choose the first one
121
+ output[pos, match[0].size] = add_separator(separator, pos, result)
122
+ end
123
+ end
124
+
125
+ mapping.postrules.each do |r|
126
+ output.gsub!(/#{r['pattern']}/, r['result'])
127
+ end
128
+
129
+ if output
130
+ output.sub!(/^(.)/, &:upcase) if title_case
131
+ if word_separator != ''
132
+ output.gsub!(/#{word_separator}#{separator}/,word_separator)
133
+ output.gsub!(/#{word_separator}(.)/, &:upcase) if title_case
134
+ end
135
+ end
136
+
137
+ output ? output.unicode_normalize : output
138
+ end
139
+
140
+ private
141
+
142
+ def add_separator(separator, pos, result)
143
+ pos == 0 ? result : separator + result
144
+ end
145
+
146
+ def up_case_around?(string, pos)
147
+ return false if string[pos] == string[pos].downcase
148
+
149
+ i = pos - 1
150
+ i -= 1 while i.positive? && string[i] !~ /[[:alpha:]]/
151
+ before = i >= 0 && i < pos ? string[i].to_s.strip : ''
152
+
153
+ i = pos + 1
154
+ i += 1 while i < string.size - 1 && string[i] !~ /[[:alpha:]]/
155
+ after = i > pos ? string[i].to_s.strip : ''
156
+
157
+ before_uc = !before.empty? && before == before.upcase
158
+ after_uc = !after.empty? && after == after.upcase
159
+ # before_uc && (after.empty? || after_uc) || after_uc && (before.empty? || before_uc)
160
+ before_uc || after_uc
37
161
  end
38
162
  end
39
163
  end