interscript 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.adoc +246 -14
- data/bin/interscript +38 -17
- data/bin/setup +8 -0
- data/lib/g2pwrapper.py +34 -0
- data/lib/interscript.rb +140 -16
- data/lib/interscript/command.rb +27 -0
- data/lib/interscript/mapping.rb +125 -0
- data/lib/interscript/version.rb +1 -1
- data/lib/model-7 +0 -0
- data/lib/tha-pt-b-7 +0 -0
- data/maps/acadsin-zho-Hani-Latn-2002.yaml +38912 -0
- data/maps/alalc-bel-cyrl-latn-1997.yaml +125 -0
- data/maps/alalc-ben-Beng-Latn-2017.yaml +130 -0
- data/maps/alalc-bul-Cyrl-Latn-1997.yaml +94 -0
- data/maps/alalc-ell-Grek-Latn-1997.yaml +625 -0
- data/maps/alalc-ell-Grek-Latn-2010.yaml +628 -0
- data/maps/alalc-kat-Geok-Latn-1997.yaml +112 -0
- data/maps/alalc-kat-Geor-Latn-1997.yaml +146 -0
- data/maps/alalc-kor-Hang-Latn-1997.yaml +94 -0
- data/maps/alalc-mkd-Cyrl-Latn-2013.yaml +103 -0
- data/maps/alalc-mkd-cyrl-latn-1997.yaml +114 -0
- data/maps/alalc-srp-Cyrl-Latn-1997.yaml +114 -0
- data/maps/alalc-srp-cyrl-latn-2013.yaml +135 -0
- data/maps/alalc-ukr-Cyrl-Latn-1997.yaml +141 -0
- data/maps/alalc-ukr-Cyrl-Latn-2011.yaml +16 -0
- data/maps/apcbg-bul-Cyrl-Latn-1995.yaml +283 -0
- data/maps/{bas-rus-Cyrl-Latn-bss.yaml → bas-rus-Cyrl-Latn-2017-bss.yaml} +57 -31
- data/maps/{bas-rus-Cyrl-Latn-oss.yaml → bas-rus-Cyrl-Latn-2017-oss.yaml} +54 -34
- data/maps/bgn-jpn-Hrkt-Latn-1962.yaml +294 -0
- data/maps/bgn-kor-Hang-Latn-1943.yaml +31 -0
- data/maps/bgn-kor-Kore-Latn-1943.yaml +31 -0
- data/maps/bgna-bul-Cyrl-Latn-2006.yaml +208 -0
- data/maps/bgna-bul-Cyrl-Latn-2009.yaml +208 -0
- data/maps/bgnpcgn-arm-Armn-Latn-1981.yaml +1 -2
- data/maps/bgnpcgn-aze-Cyrl-Latn-1993.yaml +104 -0
- data/maps/bgnpcgn-bel-cyrl-latn-1979.yaml +285 -0
- data/maps/bgnpcgn-bul-Cyrl-Latn-1952.yaml +115 -0
- data/maps/bgnpcgn-bul-Cyrl-Latn-2013.yaml +10 -64
- data/maps/bgnpcgn-chn-Hans-Latn-1979.yaml +7456 -0
- data/maps/bgnpcgn-ell-Grek-Latn-1962.yaml +702 -0
- data/maps/bgnpcgn-ell-Grek-Latn-1996.yaml +20 -0
- data/maps/bgnpcgn-jpn-Hrkt-Latn-1976.yaml +257 -0
- data/maps/bgnpcgn-kat-Geor-Latn-1981.yaml +127 -0
- data/maps/bgnpcgn-kat-Geor-Latn-2009.yaml +43 -0
- data/maps/bgnpcgn-kor-Hang-Latn-kn-1945.yaml +253 -0
- data/maps/bgnpcgn-kor-Hang-Latn-rok-2011.yaml +48 -0
- data/maps/bgnpcgn-kor-Kore-Latn-rok-2011.yaml +48 -0
- data/maps/bgnpcgn-mkd-Cyrl-Latn-1981.yaml +159 -0
- data/maps/bgnpcgn-mkd-Cyrl-Latn-2013.yaml +190 -0
- data/maps/bgnpcgn-rus-Cyrl-Latn-1947.yaml +145 -64
- data/maps/bgnpcgn-srp-Cyrl-Latn-2005.yaml +166 -0
- data/maps/bgnpcgn-ukr-Cyrl-Latn-1965.yaml +75 -2
- data/maps/bgnpcgn-ukr-Cyrl-Latn-2019.yaml +208 -0
- data/maps/by-bel-Cyrl-Latn-1998.yaml +168 -0
- data/maps/by-bel-Cyrl-Latn-2007.yaml +115 -0
- data/maps/elot-ell-Grek-Latn-743-1982-tl.yaml +685 -0
- data/maps/elot-ell-Grek-Latn-743-1982-ts.yaml +681 -0
- data/maps/elot-ell-Grek-Latn-743-2001-tl.yaml +20 -0
- data/maps/elot-ell-Grek-Latn-743-2001-ts.yaml +32 -0
- data/maps/ggg-kat-Geor-Latn-2002.yaml +89 -0
- data/maps/gki-bel-cyrl-latn-1992.yaml +33 -0
- data/maps/gki-bel-cyrl-latn-2000.yaml +201 -0
- data/maps/gost-rus-cyrl-latn-16876-71-1983.yaml +186 -0
- data/maps/hk-yue-Hani-Latn-1888.yaml +38497 -0
- data/maps/icao-bel-Cyrl-Latn-9303.yaml +108 -92
- data/maps/icao-bul-Cyrl-Latn-9303.yaml +1 -2
- data/maps/icao-heb-Hebr-Latn-9303.yaml +118 -124
- data/maps/icao-mkd-Cyrl-Latn-9303.yaml +1 -2
- data/maps/icao-per-Arab-Latn-9303.yaml +5 -6
- data/maps/icao-rus-Cyrl-Latn-9303.yaml +1 -2
- data/maps/icao-srp-Cyrl-Latn-9303.yaml +1 -2
- data/maps/icao-ukr-Cyrl-Latn-9303.yaml +1 -2
- data/maps/iso-ell-Grek-Latn-843-1997-t1.yaml +610 -0
- data/maps/iso-ell-Grek-Latn-843-1997-t2.yaml +41 -0
- data/maps/iso-jpn-Hrkt-Latn-3602-1989.yaml +62 -0
- data/maps/{iso-rus-Cyrl-Latn-iso9.yaml → iso-rus-Cyrl-Latn-9-1995.yaml} +2 -3
- data/maps/iso-tha-Thai-Latn-11940-1998.yaml +109 -0
- data/maps/kp-kor-Hang-Latn-2002.yaml +901 -0
- data/maps/lshk-yue-Hani-Latn-jyutping-1993.yaml +44820 -0
- data/maps/mext-jpn-Hrkt-Latn-1954.yaml +411 -0
- data/maps/moct-kor-Hang-Latn-2000.yaml +803 -0
- data/maps/mofa-jpn-Hrkt-Latn-1989.yaml +541 -0
- data/maps/nil-kor-Hang-Hang-jamo.yaml +11193 -0
- data/maps/odni-kat-Geor-Latn-2015.yaml +88 -0
- data/maps/odni-ukr-Cyrl-Latn-2015.yaml +157 -0
- data/maps/royin-tha-Thai-Latn-1939-generic.yaml +90 -0
- data/maps/royin-tha-Thai-Latn-1968.yaml +179 -0
- data/maps/royin-tha-Thai-Latn-1999-chained.yaml +180 -0
- data/maps/royin-tha-Thai-Latn-1999.yaml +76 -0
- data/maps/{cn-chn-Hans-Latn-pinyin.yaml → sac-zho-Hans-Latn-1979.yaml} +6 -7
- data/maps/stategeocadastre-ukr-Cyrl-Latn-1993.yaml +222 -0
- data/maps/ua-ukr-Cyrl-Latn-1996.yaml +193 -0
- data/maps/un-bel-Cyrl-Latn-2007.yaml +114 -0
- data/maps/un-ben-Beng-Latn-2016.yaml +534 -0
- data/maps/un-ell-Grek-Latn-1987-tl.yaml +32 -0
- data/maps/un-ell-Grek-Latn-1987-ts.yaml +20 -0
- data/maps/un-ell-Grek-Latn-phonetic-1987.yaml +780 -0
- data/maps/un-mon-Mong-Latn-2013.yaml +19 -6
- data/maps/un-rus-Cyrl-Latn-1987.yaml +166 -0
- data/maps/un-ukr-cyrl-latn-1998.yaml +30 -0
- data/maps/var-jpn-Hrkt-Latn-hepburn-1886.yaml +406 -0
- data/maps/var-jpn-Hrkt-Latn-hepburn-1954.yaml +386 -0
- data/maps/var-kor-Hang-Latn-mr-1939.yaml +1054 -0
- data/maps/var-kor-Kore-Hang-2013.yaml +59754 -0
- data/maps/var-kor-Kore-Latn-mr-1939.yaml +37 -0
- data/maps/var-tha-Thai-Thai-phonemic.yaml +59 -0
- data/maps/var-tha-Thai-Zsym-ipa.yaml +301 -0
- data/maps/var-zho-Hani-Latn-1979.yaml +38908 -0
- data/spec/interscript/mapping_spec.rb +42 -0
- data/spec/interscript_spec.rb +20 -5
- data/spec/spec_helper.rb +3 -1
- metadata +149 -24
- data/maps/bgnpcgn-chn-Hans-Latn-pinyin.yaml +0 -7503
- data/maps/historic-jpn-Hrkt-Latn-hepburn.yaml +0 -336
- data/maps/icao-gre-Grek-Latn-9303.yaml +0 -101
- data/maps/mext-jpn-Hrkt-Latn-hepburn.yaml +0 -330
- data/maps/mext-jpn-Hrkt-Latn-kunrei.yaml +0 -308
- data/maps/un-jpn-Hrkt-Latn-hepburn.yaml +0 -313
- data/maps/un-jpn-Hrkt-Latn-kunrei.yaml +0 -354
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 643981da933194b2464ea279e9d31b9fcd9d32519c5cd236ed805855c93755ad
|
|
4
|
+
data.tar.gz: f54c4303bb02f0a873cfdf96287d78321648cee19c685bf338cb9f8e2f642c56
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 2d8cfd0d60e2d41d8b1e31b4e61353b0bc7fd5ac4fc426d4304ccc86bc0bb6d84b4b4a2a6e44bb342afa6c20202a4bca4180a1f5037c73072e246038c6f36f1f
|
|
7
|
+
data.tar.gz: 2a5fffac1de98702494f69d55b2de5200684195b0f7948619bfa2ae9f3f97810c731868f2550578f5ad97a9db9fa72d9c2abad24451437b7e08673dfc1cd97d8
|
data/README.adoc
CHANGED
|
@@ -1,45 +1,259 @@
|
|
|
1
|
-
= Interscript: Interoperable Script Conversion Systems
|
|
1
|
+
= Interscript: Interoperable Script Conversion Systems, with a Ruby implementation
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
image:https://github.com/interscript/interscript/workflows/test/badge.svg["Build Status", link="https://github.com/interscript/interscript/actions?workflow=test"]
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
== Introduction
|
|
6
6
|
|
|
7
|
+
This repository contains interoperable transliteration schemes from:
|
|
8
|
+
|
|
9
|
+
* ALA-LC
|
|
7
10
|
* BGN/PCGN
|
|
8
11
|
* ICAO
|
|
9
12
|
* ISO
|
|
10
13
|
* UN (by UNGEGN)
|
|
14
|
+
* Many, many other script conversion system authorities.
|
|
11
15
|
|
|
12
16
|
The goal is to achieve interoperable transliteration schemes allowing quality comparisons.
|
|
13
17
|
|
|
14
18
|
|
|
15
|
-
== STATUS (work in progress!)
|
|
16
19
|
|
|
17
|
-
|
|
20
|
+
== Demonstration
|
|
21
|
+
|
|
22
|
+
These transliteration systems are used in the demo:
|
|
18
23
|
|
|
19
24
|
`bgnpcgn-rus-Cyrl-Latn-1947`:: BGN/PCGN Romanization of Russian
|
|
20
25
|
`iso-rus-Cyrl-Latn-9-1995`:: ISO 9 Romanization of Russian
|
|
21
26
|
`icao-rus-Cyrl-Latn-9303`:: ICAO MRZ Romanization of Russian
|
|
22
27
|
`bas-rus-Cyrl-Latn-2017-bss`:: Bulgarian Academy of Sciences Streamlined System for Russian
|
|
23
28
|
|
|
29
|
+
image:demo/20191118-interscript-demo-cast.gif["interscript screencast"]
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
== Installation
|
|
33
|
+
|
|
34
|
+
=== Prerequisites
|
|
35
|
+
|
|
36
|
+
Linux:
|
|
37
|
+
|
|
38
|
+
[source,sh]
|
|
39
|
+
----
|
|
40
|
+
apt-get install swig python3-setuptools
|
|
41
|
+
----
|
|
42
|
+
|
|
43
|
+
Windows:
|
|
44
|
+
|
|
45
|
+
[source,sh]
|
|
46
|
+
----
|
|
47
|
+
choco install --no-progress swig
|
|
48
|
+
----
|
|
49
|
+
|
|
50
|
+
Interscript depends on Python and the https://github.com/sequitur-g2p/sequitur-g2p[`sequitur-g2p`] module
|
|
51
|
+
|
|
52
|
+
[source,sh]
|
|
53
|
+
----
|
|
54
|
+
pip3 install setuptools numpy
|
|
55
|
+
curl -sSL -o sequitur-g2p.zip https://github.com/sequitur-g2p/sequitur-g2p/archive/806273f.zip
|
|
56
|
+
pip3 install sequitur-g2p.zip
|
|
57
|
+
----
|
|
58
|
+
|
|
59
|
+
Interscript depends on Ruby. Once you manage to install Ruby, it's easy.
|
|
60
|
+
|
|
61
|
+
[source,sh]
|
|
62
|
+
----
|
|
63
|
+
gem install interscript
|
|
64
|
+
----
|
|
24
65
|
|
|
25
66
|
== Usage
|
|
26
67
|
|
|
68
|
+
Assume you have a file ready in the source script like this:
|
|
69
|
+
|
|
70
|
+
[source,sh]
|
|
71
|
+
----
|
|
72
|
+
cat <<EOT > rus-Cyrl.txt
|
|
73
|
+
Эх, тройка! птица тройка, кто тебя выдумал? знать, у бойкого народа ты
|
|
74
|
+
могла только родиться, в той земле, что не любит шутить, а
|
|
75
|
+
ровнем-гладнем разметнулась на полсвета, да и ступай считать версты,
|
|
76
|
+
пока не зарябит тебе в очи. И не хитрый, кажись, дорожный снаряд, не
|
|
77
|
+
железным схвачен винтом, а наскоро живьём с одним топором да долотом
|
|
78
|
+
снарядил и собрал тебя ярославский расторопный мужик. Не в немецких
|
|
79
|
+
ботфортах ямщик: борода да рукавицы, и сидит чёрт знает на чём; а
|
|
80
|
+
привстал, да замахнулся, да затянул песню — кони вихрем, спицы в
|
|
81
|
+
колесах смешались в один гладкий круг, только дрогнула дорога, да
|
|
82
|
+
вскрикнул в испуге остановившийся пешеход — и вон она понеслась,
|
|
83
|
+
понеслась, понеслась!
|
|
84
|
+
|
|
85
|
+
Н.В. Гоголь
|
|
86
|
+
EOT
|
|
87
|
+
----
|
|
88
|
+
|
|
89
|
+
You can run `interscript` on this text using different transliteration systems.
|
|
90
|
+
|
|
91
|
+
[source,sh]
|
|
92
|
+
----
|
|
93
|
+
interscript rus-Cyrl.txt \
|
|
94
|
+
--system=bgnpcgn-rus-Cyrl-Latn-1947 \
|
|
95
|
+
--output=bgnpcgn-rus-Latn.txt
|
|
96
|
+
|
|
97
|
+
interscript rus-Cyrl.txt \
|
|
98
|
+
--system=iso-rus-Cyrl-Latn-9-1995 \
|
|
99
|
+
--output=iso-rus-Latn.txt
|
|
100
|
+
|
|
101
|
+
interscript rus-Cyrl.txt \
|
|
102
|
+
--system=icao-rus-Cyrl-Latn-9303 \
|
|
103
|
+
--output=icao-rus-Latn.txt
|
|
104
|
+
|
|
105
|
+
interscript rus-Cyrl.txt \
|
|
106
|
+
--system=bas-rus-Cyrl-Latn-2017-bss \
|
|
107
|
+
--output=bas-rus-Latn.txt
|
|
108
|
+
----
|
|
109
|
+
|
|
110
|
+
It is then easy to see the exact differences in rendering between the systems.
|
|
111
|
+
|
|
112
|
+
[source,sh]
|
|
113
|
+
----
|
|
114
|
+
diff bgnpcgn-rus-Latn.txt bas-rus-Latn.txt
|
|
115
|
+
----
|
|
116
|
+
|
|
117
|
+
== Adding a transliteration system
|
|
118
|
+
|
|
119
|
+
Transliteration systems are stored in the `maps/` directory as YAML files.
|
|
120
|
+
You can create a new file and add it to the directory.
|
|
121
|
+
|
|
122
|
+
The file should be named as `<system-code>.yaml`, where `system-code`
|
|
123
|
+
is in accordance with
|
|
124
|
+
http://calconnect.gitlab.io/tc-localization/csd-transcription-systems[ISO/CC 24229].
|
|
125
|
+
|
|
126
|
+
=== File structure
|
|
127
|
+
|
|
128
|
+
[source,yaml]
|
|
129
|
+
----
|
|
130
|
+
authority_id: bgnpcgn
|
|
131
|
+
id: 1947
|
|
132
|
+
language: rus
|
|
133
|
+
source_script: Cyrl
|
|
134
|
+
destination_script: Latn
|
|
135
|
+
name: ROMANIZATION OF RUSSIAN, BGN/PCGN 1947 System
|
|
136
|
+
url: https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/807920/ROMANIZATION_OF_RUSSIAN.pdf
|
|
137
|
+
creation_date: 1947
|
|
138
|
+
confirmation_date: 2019-06
|
|
139
|
+
description: The BGN/PCGN system for Russian was adopted ...
|
|
140
|
+
|
|
141
|
+
notes:
|
|
142
|
+
- The character e should be romanized ye initially, after the vowel ...
|
|
143
|
+
|
|
144
|
+
tests:
|
|
145
|
+
- source: ДЛИННОЕ ПОКРЫВАЛО
|
|
146
|
+
expected: DLINNOYE POKRYVALO
|
|
147
|
+
- source: Еловая шишка
|
|
148
|
+
expected: Yelovaya shishka
|
|
149
|
+
|
|
150
|
+
map:
|
|
151
|
+
rules:
|
|
152
|
+
- pattern: (?<=[АаЕеЁёИиОоУуЫыЭэЮюЯяЙйЪъЬь])\u0415 # Е after a, e, ё, и, о, у, ы, э, ю, я, й, ъ, ь
|
|
153
|
+
result: Ye
|
|
154
|
+
- pattern: \b\u0415 # Е initially
|
|
155
|
+
result: Ye
|
|
156
|
+
|
|
157
|
+
characters:
|
|
158
|
+
"\u0410": "A"
|
|
159
|
+
"\u0411": "B"
|
|
160
|
+
"\u0412": "V"
|
|
161
|
+
----
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
=== Rules
|
|
165
|
+
|
|
166
|
+
The `rules` subsection is placed under the `map` key. All rules are applied, in the order in which they are listed, before the `characters` subsection is applied. Rules apply to the original text, not to the result of applying previous rules.
|
|
167
|
+
|
|
168
|
+
Each rule has `pattern` and `result` elements.
|
|
169
|
+
|
|
170
|
+
The pattern is a regular expression. It should be written as a string, without the `//` or `%r{}` delimiters; for example, `\b\u0415`. If a rule depends on the preceding or following content, a lookbehind or lookahead can be used. For example, a rule with the pattern `(?<=[АаЕеЁёИиОоУуЫыЭэЮюЯяЙйЪъЬь])\u0415` finds every Е that follows an upper- or lower-case a, e, ё, и, о, у, ы, э, ю, я, й, ъ, ь.
|
|
171
|
+
|
|
172
|
+
The result is the replacement for the pattern's match. It can contain plain text, Unicode characters specified by hexadecimal escapes, and captured-group references. A string containing a hexadecimal escape or a captured-group reference should be double-quoted; for example, `"Y\u00eb"` or `"\\1\u00b7\\2"`. Captured groups are referred to by a double backslash followed by the group's number.
|
|
173
|
+
|
|
174
|
+
Because rules are applied in order, multiple rules applicable to the same segment of a string can be addressed by rule ordering, and rules can be used as priority over characters. For example:
|
|
175
|
+
|
|
176
|
+
[source,yaml]
|
|
177
|
+
----
|
|
178
|
+
map:
|
|
179
|
+
rules:
|
|
180
|
+
- pattern: \u03B3\u03B3 # γ (before Γ, Ξ, Χ)
|
|
181
|
+
result: ng
|
|
182
|
+
- pattern: (?<![Γγ])\u03B3(?=[ΕεέΗηήΙιίΥυύ]) # γ (before front vowels)
|
|
183
|
+
result: y
|
|
184
|
+
----
|
|
185
|
+
|
|
186
|
+
(γι maps to `yi`; but γγ maps to `ng`. In the case of γγι, the first rule takes priority, and the transliteration is `ngi`: it makes the second rule impossible.)
|
|
187
|
+
|
|
188
|
+
[source,yaml]
|
|
189
|
+
----
|
|
190
|
+
map:
|
|
191
|
+
rules:
|
|
192
|
+
- pattern: (?<=\b)\u03BC[πΠ] # μπ (initially)
|
|
193
|
+
result: b
|
|
194
|
+
- pattern: \u03BC[πΠ] # μπ (medially)
|
|
195
|
+
result: mb
|
|
196
|
+
----
|
|
197
|
+
|
|
198
|
+
(The first rule applies at the start of a word; the second rule does not specify a context, as it applies in all other cases not covered by the first rule.)
|
|
199
|
+
|
|
200
|
+
[source,yaml]
|
|
201
|
+
----
|
|
202
|
+
map:
|
|
203
|
+
rules:
|
|
204
|
+
- pattern: ";"
|
|
205
|
+
result: "?"
|
|
206
|
+
|
|
207
|
+
characters:
|
|
208
|
+
"\u00B7": ";"
|
|
209
|
+
----
|
|
210
|
+
|
|
211
|
+
(This guarantees that any `;` are converted to `?` before any new `;` are introduced; because all three are Latin script, they could be mixed up in ordering.)
|
|
212
|
+
|
|
213
|
+
Normally rules "`bleed`" each other: once a rule applies to a segment, that segment cannot trigger other rules, because it is already converted to Roman. Exceptionally, it will be necessary to have a rule add or remove characters in the original script, rather than transliterate them, so that the same context can be invoked by two rules in succession:
|
|
214
|
+
|
|
215
|
+
[source,yaml]
|
|
216
|
+
----
|
|
217
|
+
map:
|
|
218
|
+
rules:
|
|
219
|
+
- pattern: (?<=[АаЕеЁёИиОоУуЫыЭэЮюЯя])\u042b # Ы after any vowel character
|
|
220
|
+
result: "\u00b7Ы"
|
|
221
|
+
- pattern: \u042b(?=[АаУуЫыЭэ]) # Ы before а, у, ы, or э
|
|
222
|
+
result: "Ы\u00b7"
|
|
223
|
+
----
|
|
224
|
+
|
|
225
|
+
(If the result were `\u00B7Y`, the second rule could not be applied afterwards; but we want ОЫУ to transliterate as `O·Y·U`. In order to make that happen, we preserve the Ы during the rules phase, resulting in О·Ы·У; we only convert the letters to Roman script in the `characters` phase.)
|
|
226
|
+
|
|
227
|
+
=== Testing transliteration systems
|
|
228
|
+
|
|
229
|
+
To test all transliteration systems in the `maps/` directory, run:
|
|
27
230
|
|
|
28
231
|
[source,sh]
|
|
29
232
|
----
|
|
30
|
-
|
|
31
|
-
|
|
233
|
+
bundle exec rspec
|
|
234
|
+
----
|
|
235
|
+
|
|
236
|
+
The command takes `source` texts from the `tests` section, transforms
|
|
237
|
+
them using `rules` and `characters` from the `map` key, and compares the
|
|
238
|
+
results with the `expected:` text of the same test entry.
|
|
239
|
+
|
|
240
|
+
To test a specific transliteration system, set the environment variable
|
|
241
|
+
`TRANSLIT_SYSTEM` to the system code of the desired system
|
|
242
|
+
(i.e. the "`basename`" of the system's YAML file):
|
|
32
243
|
|
|
33
|
-
|
|
34
|
-
|
|
244
|
+
[source,sh]
|
|
245
|
+
----
|
|
246
|
+
TRANSLIT_SYSTEM=bgnpcgn-rus-Cyrl-Latn-1947 bundle exec rspec
|
|
35
247
|
----
|
|
36
248
|
|
|
37
249
|
|
|
38
250
|
== ISCS system codes
|
|
39
251
|
|
|
40
|
-
|
|
252
|
+
In accordance with
|
|
253
|
+
http://calconnect.gitlab.io/tc-localization/csd-transcription-systems[ISO/CC 24229],
|
|
254
|
+
the system code identifying a script conversion system has the following components:
|
|
41
255
|
|
|
42
|
-
e.g. `bgnpcgn-rus-Cyrl-Latn-1947
|
|
256
|
+
e.g. `bgnpcgn-rus-Cyrl-Latn-1947`:
|
|
43
257
|
|
|
44
258
|
`bgnpcgn`:: the authority identifier
|
|
45
259
|
`rus`:: an ISO 639-2 3-letter language code that this system applies to
|
|
@@ -53,13 +267,31 @@ e.g. `bgnpcgn-rus-Cyrl-Latn-1947`
|
|
|
53
267
|
Currently the schemes cover Cyrillic, Armenian, Greek, Arabic and Hebrew.
|
|
54
268
|
|
|
55
269
|
|
|
56
|
-
==
|
|
270
|
+
== Samples to play with
|
|
57
271
|
|
|
58
272
|
* `rus-Cyrl-1.txt`: Copied from the XLS output from http://www.primorsk.vybory.izbirkom.ru/region/primorsk?action=show&global=true&root=254017025&tvd=4254017212287&vrn=100100067795849&prver=0&pronetvd=0&region=25&sub_region=25&type=242&vibid=4254017212287
|
|
59
273
|
|
|
60
274
|
* `rus-Cyrl-2.txt`: Copied from the XLS output from http://www.yaroslavl.vybory.izbirkom.ru/region/yaroslavl?action=show&root=764013001&tvd=4764013188704&vrn=4764013188693&prver=0&pronetvd=0&region=76&sub_region=76&type=426&vibid=4764013188704
|
|
61
275
|
|
|
62
276
|
|
|
63
|
-
==
|
|
277
|
+
== References
|
|
278
|
+
|
|
279
|
+
Reference documents are located at the
|
|
280
|
+
https://github.com/interscript/interscript-references[interscript-references repository].
|
|
281
|
+
Some specifications that have distribution limitations may not be reproduced there.
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
== Links to system definitions
|
|
285
|
+
|
|
286
|
+
* https://www.iso.org/committee/48750.html[ISO/TC 46 (see standards published by WG 3)]
|
|
287
|
+
* http://geonames.nga.mil/gns/html/romanization.html[BGN/PCGN and BGN Romanization systems (BGN)]
|
|
288
|
+
* https://www.gov.uk/government/publications/romanization-systems[BGN/PCGN Romanization systems (PCGN)]
|
|
289
|
+
* https://www.loc.gov/catdir/cpso/roman.html[ALA-LC Romanization systems in current use]
|
|
290
|
+
* http://catdir.loc.gov/catdir/cpso/roman.html[ALA-LC Romanization systems from 1997]
|
|
291
|
+
* http://www.eki.ee/wgrs/[UN Romanization systems]
|
|
292
|
+
* http://www.eki.ee/knab/kblatyl2.htm[EKI KNAB systems]
|
|
293
|
+
|
|
294
|
+
== Copyright and license
|
|
295
|
+
|
|
296
|
+
This is a Ribose project. Copyright Ribose.
|
|
64
297
|
|
|
65
|
-
This is a Ribose project.
|
data/bin/interscript
CHANGED
|
@@ -1,20 +1,41 @@
|
|
|
1
1
|
#!/usr/bin/env ruby
|
|
2
|
+
|
|
2
3
|
require 'rubygems'
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
Interscript.transliterate_file(system_code, input, output_file)
|
|
17
|
-
else
|
|
18
|
-
puts Interscript.transliterate(system_code, IO.read(input))
|
|
19
|
-
end
|
|
4
|
+
|
|
5
|
+
# resolve bin path, ignoring symlinks
|
|
6
|
+
require 'pathname'
|
|
7
|
+
bin_file = Pathname.new(__FILE__).realpath
|
|
8
|
+
|
|
9
|
+
# add self to libpath
|
|
10
|
+
$LOAD_PATH.unshift File.expand_path("../../lib", bin_file)
|
|
11
|
+
|
|
12
|
+
# Fixes https://github.com/rubygems/rubygems/issues/1420
|
|
13
|
+
require 'rubygems/specification'
|
|
14
|
+
|
|
15
|
+
class Gem::Specification
|
|
16
|
+
def this; self; end
|
|
20
17
|
end
|
|
18
|
+
|
|
19
|
+
require 'interscript/command'
|
|
20
|
+
|
|
21
|
+
if ARGV.any? && !Interscript::Command.all_tasks.key?(ARGV.first)
|
|
22
|
+
ARGV.unshift :translit
|
|
23
|
+
end
|
|
24
|
+
Interscript::Command.start ARGV
|
|
25
|
+
|
|
26
|
+
# if ARGV.empty?
|
|
27
|
+
# puts "write source file, source format, and output file"
|
|
28
|
+
# else
|
|
29
|
+
# args = Hash[ARGV.flat_map { |s| s.scan(/--?([^=\s]+)(?:=(\S+))?/) }]
|
|
30
|
+
# input = ARGV[0]
|
|
31
|
+
# system_code = args["system"]
|
|
32
|
+
# output_file = args["output"]
|
|
33
|
+
|
|
34
|
+
# raise "Please enter the system code with --system={system_code}" unless system_code
|
|
35
|
+
|
|
36
|
+
# if output_file
|
|
37
|
+
# Interscript.transliterate_file(system_code, input, output_file)
|
|
38
|
+
# else
|
|
39
|
+
# puts Interscript.transliterate(system_code, IO.read(input))
|
|
40
|
+
# end
|
|
41
|
+
# end
|
data/bin/setup
ADDED
data/lib/g2pwrapper.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
import g2p, SequiturTool
|
|
2
|
+
import numpy
|
|
3
|
+
|
|
4
|
+
def transliterate(model, word):
    """Transcribe ``word`` with one of the bundled Sequitur G2P models.

    Args:
        model: key selecting the model, either ``'pythainlp_lexicon'`` or
            ``'wiktionary_phonemic'``. Any other key raises ``KeyError``.
        word: text to convert; it is handed to the translator as a tuple
            of its characters.

    Returns:
        The translator's output symbols joined with the connector
        configured for the chosen model (``''`` or ``'-'``), or the
        integer ``1`` when ``SequiturTool.procureModel`` yields no model.
    """
    import os  # local import: only needed to locate the model files

    class Struct:
        # Minimal attribute bag used as the options object.
        def __init__(self, **entries):
            self.__dict__.update(entries)

    # Resolve the model files next to this module instead of relative to
    # the current working directory, so the wrapper also works when the
    # caller's cwd is not the gem root.
    lib_dir = os.path.dirname(os.path.abspath(__file__))

    model_path = {
        'pythainlp_lexicon': os.path.join(lib_dir, 'model-7'),
        'wiktionary_phonemic': os.path.join(lib_dir, 'tha-pt-b-7'),
    }

    connector_dict = {
        'pythainlp_lexicon': '',
        'wiktionary_phonemic': '-',
    }

    modelFile = model_path[model]
    connector = connector_dict[model]

    # Every option is spelled out because the options object is passed to
    # SequiturTool as a whole; presumably it reads these attributes
    # unconditionally -- confirm against sequitur-g2p before trimming.
    options = Struct(
        profile=None,
        resource_usage=None,
        psyco=None,
        tempdir=None,
        trainSample=None,
        develSample=None,
        testSample=None,
        checkpoint=None,
        resume_from_checkpoint=None,
        shouldTranspose=None,
        modelFile=modelFile,
        newModelFile=None,
        shouldTestContinuously=None,
        shouldSelfTest=None,
        lengthConstraints=None,
        shouldSuppressNewMultigrams=None,
        viterbi=None,
        shouldRampUp=None,
        shouldWipeModel=None,
        shouldInitializeWithCounts=None,
        minIterations=20,
        maxIterations=100,
        eager_discount_adjustment=None,
        fixed_discount=None,
        encoding='UTF-8',
        phoneme_to_phoneme=None,
        test_segmental=None,
        testResult=None,
        applySample=None,
        applyWord=word,
        variants_mass=None,
        variants_number=None,
        fakeTranslator=None,
        stack_limit=None,
    )

    loadSample = g2p.loadG2PSample

    # Use a separate local name so the ``model`` argument (the selector
    # key) is not rebound to the loaded model object.
    g2p_model = SequiturTool.procureModel(options, loadSample)
    if not g2p_model:
        # Kept for backward compatibility: callers receive 1, not a string.
        return 1
    translator = g2p.Translator(g2p_model)
    del g2p_model

    return connector.join(translator(tuple(word)))
|
data/lib/interscript.rb
CHANGED
|
@@ -1,39 +1,163 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
require
|
|
3
|
+
require "yaml"
|
|
4
|
+
require "interscript/mapping"
|
|
4
5
|
|
|
5
6
|
# Transliteration
|
|
6
7
|
module Interscript
|
|
7
|
-
SYSTEM_DEFINITIONS_PATH = File.expand_path('../maps', __dir__)
|
|
8
8
|
|
|
9
9
|
class << self
|
|
10
|
-
def
|
|
10
|
+
# Returns the directory one level above this file's directory as a
# Pathname. The value is computed on first use and then reused.
def root_path
  return @root_path if @root_path

  @root_path = Pathname.new(File.dirname(__dir__))
end
|
|
13
|
+
|
|
14
|
+
def transliterate_file(system_code, input_file, output_file, maps)
|
|
11
15
|
input = File.read(input_file)
|
|
12
|
-
output = transliterate(system_code, input)
|
|
16
|
+
output = transliterate(system_code, input, maps)
|
|
13
17
|
|
|
14
|
-
File.open(output_file,
|
|
18
|
+
File.open(output_file, 'w') do |f|
|
|
15
19
|
f.puts(output)
|
|
16
20
|
end
|
|
17
21
|
puts "Output written to: #{output_file}"
|
|
18
22
|
end
|
|
19
23
|
|
|
20
|
-
def
|
|
21
|
-
|
|
24
|
+
# Makes the Python module `g2pwrapper` available through `pyimport`.
# If the first import raises, `<root_path>/lib/` is appended to Python's
# `sys.path` and the import is attempted a second time.
def import_python_modules
  pyimport :g2pwrapper
rescue
  pyimport :sys
  python_lib_dir = root_path.to_s + "/lib/"
  sys.path.append(python_lib_dir)
  pyimport :g2pwrapper
end
|
|
33
|
+
|
|
34
|
+
# Runs `string` through the external process called `process_name`.
#
# The Python modules are imported first, whatever the name is. The two
# recognised names are forwarded to `g2pwrapper.transliterate` with the
# matching model key, and that call's result is returned. Any other name
# prints "Invalid Process" and returns nil.
def external_process(process_name, string)
  import_python_modules

  model_key = {
    'sequitur.pythainlp_lexicon' => 'pythainlp_lexicon',
    'sequitur.wiktionary_phonemic' => 'wiktionary_phonemic'
  }[process_name]

  # `puts` returns nil, which becomes the method's result here.
  return puts("Invalid Process") unless model_key

  g2pwrapper.transliterate(model_key, string)
end
|
|
23
45
|
|
|
24
|
-
def transliterate(system_code, string)
|
|
25
|
-
|
|
46
|
+
def transliterate(system_code, string, maps={})
|
|
47
|
+
if (!maps.has_key?system_code)
|
|
48
|
+
maps[system_code] = Interscript::Mapping.for(system_code)
|
|
49
|
+
end
|
|
50
|
+
# mapping = Interscript::Mapping.for(system_code)
|
|
51
|
+
mapping = maps[system_code]
|
|
52
|
+
|
|
26
53
|
|
|
27
|
-
|
|
28
|
-
|
|
54
|
+
# First, apply chained transliteration as specified in the list `chain`
|
|
55
|
+
chain = mapping.chain.dup
|
|
56
|
+
while chain.length > 0
|
|
57
|
+
string = transliterate(chain.shift, string, maps)
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
# Then, apply the rest of the map
|
|
61
|
+
separator = mapping.character_separator || ""
|
|
62
|
+
word_separator = mapping.word_separator || ""
|
|
63
|
+
title_case = mapping.title_case
|
|
64
|
+
downcase = mapping.downcase
|
|
65
|
+
|
|
66
|
+
# charmap = mapping.characters&.sort_by { |k, _v| k.size }&.reverse&.to_h
|
|
67
|
+
# dictmap = mapping.dictionary&.sort_by { |k, _v| k.size }&.reverse&.to_h
|
|
68
|
+
charmap = mapping.characters_hash
|
|
69
|
+
dictmap = mapping.dictionary_hash
|
|
70
|
+
trie = mapping.dictionary_trie
|
|
71
|
+
|
|
72
|
+
# Segmentation
|
|
73
|
+
string = external_process(mapping.segmentation, string) if mapping.segmentation
|
|
29
74
|
|
|
30
|
-
|
|
31
|
-
|
|
75
|
+
# Transliteration/Transcription
|
|
76
|
+
string = external_process(mapping.transcription, string) if mapping.transcription
|
|
77
|
+
|
|
78
|
+
pos = 0
|
|
79
|
+
while pos < string.to_s.size
|
|
80
|
+
m = 0
|
|
81
|
+
wordmatch = ""
|
|
82
|
+
|
|
83
|
+
# Using Trie, find the longest matching substring
|
|
84
|
+
while (pos + m < string.to_s.size) && (trie.partial_word?string[pos..pos+m])
|
|
85
|
+
wordmatch = string[pos..pos+m] if trie.word?string[pos..pos+m]
|
|
86
|
+
m += 1
|
|
87
|
+
end
|
|
88
|
+
m = wordmatch.length
|
|
89
|
+
if m > 0
|
|
90
|
+
repl = dictmap[string[pos..pos+m-1]]
|
|
91
|
+
string[pos..pos+m-1] = repl
|
|
92
|
+
pos += repl.length
|
|
93
|
+
else
|
|
94
|
+
pos += 1
|
|
95
|
+
end
|
|
32
96
|
end
|
|
33
97
|
|
|
34
|
-
string.
|
|
35
|
-
|
|
36
|
-
|
|
98
|
+
output = string.clone
|
|
99
|
+
offsets = Array.new string.to_s.size, 1
|
|
100
|
+
|
|
101
|
+
# mapping.rules.each do |r|
|
|
102
|
+
# string.to_s.scan(/#{r['pattern']}/) do |matches|
|
|
103
|
+
# match = Regexp.last_match
|
|
104
|
+
# pos = match.offset(0).first
|
|
105
|
+
# result = r['result'].clone
|
|
106
|
+
# matches.each.with_index { |v, i| result.sub!(/\\#{i + 1}/, v) } if matches.is_a? Array
|
|
107
|
+
# result.upcase! if up_case_around?(string, pos)
|
|
108
|
+
# output[offsets[0...pos].sum, match[0].size] = result
|
|
109
|
+
# offsets[pos] += result.size - match[0].size
|
|
110
|
+
# end
|
|
111
|
+
# end
|
|
112
|
+
mapping.rules.each do |r|
|
|
113
|
+
output.gsub!(/#{r['pattern']}/, r['result'])
|
|
114
|
+
end
|
|
115
|
+
|
|
116
|
+
charmap.each do |k, v|
|
|
117
|
+
while (match = output&.match(/#{k}/))
|
|
118
|
+
pos = match.offset(0).first
|
|
119
|
+
result = !downcase && up_case_around?(output, pos) ? v.upcase : v
|
|
120
|
+
result = result[0] if result.is_a?(Array) # if more than one, choose the first one
|
|
121
|
+
output[pos, match[0].size] = add_separator(separator, pos, result)
|
|
122
|
+
end
|
|
123
|
+
end
|
|
124
|
+
|
|
125
|
+
mapping.postrules.each do |r|
|
|
126
|
+
output.gsub!(/#{r['pattern']}/, r['result'])
|
|
127
|
+
end
|
|
128
|
+
|
|
129
|
+
if output
|
|
130
|
+
output.sub!(/^(.)/, &:upcase) if title_case
|
|
131
|
+
if word_separator != ''
|
|
132
|
+
output.gsub!(/#{word_separator}#{separator}/,word_separator)
|
|
133
|
+
output.gsub!(/#{word_separator}(.)/, &:upcase) if title_case
|
|
134
|
+
end
|
|
135
|
+
end
|
|
136
|
+
|
|
137
|
+
output ? output.unicode_normalize : output
|
|
138
|
+
end
|
|
139
|
+
|
|
140
|
+
private
|
|
141
|
+
|
|
142
|
+
# Returns `result` unchanged when `pos` is 0; otherwise returns `result`
# with `separator` placed in front of it.
def add_separator(separator, pos, result)
  return result if pos == 0

  separator + result
end
|
|
145
|
+
|
|
146
|
+
# Tells whether the character at `pos` is an uppercase letter that has an
# uppercase neighbour, i.e. whether it appears to sit in an all-caps run.
#
# @param string [String] the text being inspected
# @param pos [Integer] index of the character under test
# @return [Boolean] false when `string[pos]` equals its own downcase;
#   otherwise true when the nearest alphabetic character before `pos` or
#   the nearest one after `pos` equals its own upcase
#
# Non-alphabetic characters (spaces, punctuation, digits) are skipped
# when looking for a neighbour and never count as uppercase themselves,
# including when they are the first or last character of the string.
def up_case_around?(string, pos)
  return false if string[pos] == string[pos].downcase

  letter = /[[:alpha:]]/

  # rindex with a negative start counts from the end of the string, so
  # the backward search is only attempted when something precedes pos.
  before_idx = pos.positive? ? string.rindex(letter, pos - 1) : nil
  after_idx = string.index(letter, pos + 1)

  [before_idx, after_idx].compact.any? do |i|
    string[i] == string[i].upcase
  end
end
|
|
38
162
|
end
|
|
39
163
|
end
|