interscript 0.1.6 → 2.1.0a9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +11 -0
- data/.rspec +3 -0
- data/Gemfile +29 -0
- data/LICENSE.adoc +31 -0
- data/README.md +3 -0
- data/Rakefile +53 -0
- data/bin/console +14 -0
- data/bin/interscript +3 -39
- data/bin/maps_analyze_staging +168 -0
- data/bin/maps_debug_compilers +58 -0
- data/bin/maps_debug_ordering +88 -0
- data/bin/maps_debug_ruby_compile +24 -0
- data/bin/maps_debug_step_by_step +44 -0
- data/bin/maps_optimize_order +112 -0
- data/bin/maps_v1_analyze_regexps +45 -0
- data/bin/maps_v1_to_v2 +426 -0
- data/exe/interscript +6 -0
- data/interscript.gemspec +31 -0
- data/lib/interscript.rb +81 -127
- data/lib/interscript/command.rb +5 -5
- data/lib/interscript/compiler.rb +22 -0
- data/lib/interscript/compiler/javascript.rb +292 -0
- data/lib/interscript/compiler/ruby.rb +262 -0
- data/lib/interscript/dsl.rb +67 -0
- data/lib/interscript/dsl/aliases.rb +23 -0
- data/lib/interscript/dsl/document.rb +46 -0
- data/lib/interscript/dsl/group.rb +45 -0
- data/lib/interscript/dsl/group/parallel.rb +6 -0
- data/lib/interscript/dsl/items.rb +89 -0
- data/lib/interscript/dsl/metadata.rb +26 -0
- data/lib/interscript/dsl/stage.rb +6 -0
- data/lib/interscript/dsl/symbol_mm.rb +11 -0
- data/lib/interscript/dsl/tests.rb +12 -0
- data/lib/interscript/interpreter.rb +251 -0
- data/lib/interscript/node.rb +25 -0
- data/lib/interscript/node/alias_def.rb +15 -0
- data/lib/interscript/node/dependency.rb +13 -0
- data/lib/interscript/node/document.rb +45 -0
- data/lib/interscript/node/group.rb +34 -0
- data/lib/interscript/node/group/parallel.rb +9 -0
- data/lib/interscript/node/group/sequential.rb +2 -0
- data/lib/interscript/node/item.rb +52 -0
- data/lib/interscript/node/item/alias.rb +42 -0
- data/lib/interscript/node/item/any.rb +61 -0
- data/lib/interscript/node/item/capture.rb +50 -0
- data/lib/interscript/node/item/group.rb +51 -0
- data/lib/interscript/node/item/repeat.rb +40 -0
- data/lib/interscript/node/item/stage.rb +23 -0
- data/lib/interscript/node/item/string.rb +51 -0
- data/lib/interscript/node/metadata.rb +18 -0
- data/lib/interscript/node/rule.rb +6 -0
- data/lib/interscript/node/rule/funcall.rb +18 -0
- data/lib/interscript/node/rule/run.rb +15 -0
- data/lib/interscript/node/rule/sub.rb +65 -0
- data/lib/interscript/node/stage.rb +19 -0
- data/lib/interscript/node/tests.rb +15 -0
- data/lib/interscript/stdlib.rb +211 -0
- data/lib/interscript/utils/regexp_converter.rb +283 -0
- data/lib/interscript/version.rb +1 -1
- data/requirements.txt +1 -0
- metadata +75 -339
- data/README.adoc +0 -298
- data/bin/rspec +0 -29
- data/lib/__pycache__/g2pwrapper.cpython-38.pyc +0 -0
- data/lib/g2pwrapper.py +0 -34
- data/lib/interscript-opal.rb +0 -2
- data/lib/interscript/fs.rb +0 -71
- data/lib/interscript/mapping.rb +0 -142
- data/lib/interscript/opal.rb +0 -27
- data/lib/interscript/opal/maps.js.erb +0 -10
- data/lib/interscript/opal_map_translate.rb +0 -12
- data/lib/model-7 +0 -0
- data/lib/tha-pt-b-7 +0 -0
- data/maps/acadsin-zho-Hani-Latn-2002.yaml +0 -38912
- data/maps/alalc-amh-Ethi-Latn-1997.yaml +0 -509
- data/maps/alalc-amh-Ethi-Latn-2011.yaml +0 -138
- data/maps/alalc-ara-Arab-Latn-1997.yaml +0 -1283
- data/maps/alalc-asm-Deva-Latn-1997.yaml +0 -159
- data/maps/alalc-aze-Cyrl-Latn-1997.yaml +0 -141
- data/maps/alalc-bel-Cyrl-Latn-1997.yaml +0 -125
- data/maps/alalc-ben-Beng-Latn-2017.yaml +0 -130
- data/maps/alalc-bul-Cyrl-Latn-1997.yaml +0 -94
- data/maps/alalc-ell-Grek-Latn-1997.yaml +0 -624
- data/maps/alalc-ell-Grek-Latn-2010.yaml +0 -627
- data/maps/alalc-hin-Deva-Latn-2020.yaml +0 -159
- data/maps/alalc-kat-Geok-Latn-1997.yaml +0 -111
- data/maps/alalc-kat-Geor-Latn-1997.yaml +0 -146
- data/maps/alalc-kor-Hang-Latn-1997.yaml +0 -94
- data/maps/alalc-mar-Deva-Latn-1997.yaml +0 -170
- data/maps/alalc-mkd-Cyrl-Latn-1997.yaml +0 -114
- data/maps/alalc-mkd-Cyrl-Latn-2013.yaml +0 -103
- data/maps/alalc-pan-Deva-Latn-1997.yaml +0 -237
- data/maps/alalc-rus-Cyrl-Latn-1997.yaml +0 -221
- data/maps/alalc-rus-Cyrl-Latn-2012.yaml +0 -162
- data/maps/alalc-srp-Cyrl-Latn-1997.yaml +0 -114
- data/maps/alalc-srp-Cyrl-Latn-2013.yaml +0 -135
- data/maps/alalc-ukr-Cyrl-Latn-1997.yaml +0 -141
- data/maps/alalc-ukr-Cyrl-Latn-2011.yaml +0 -16
- data/maps/apcbg-bul-Cyrl-Latn-1995.yaml +0 -283
- data/maps/bas-rus-Cyrl-Latn-2017-bss.yaml +0 -174
- data/maps/bas-rus-Cyrl-Latn-2017-oss.yaml +0 -169
- data/maps/bgn-jpn-Hrkt-Latn-1962.yaml +0 -292
- data/maps/bgn-kor-Hang-Latn-1943.yaml +0 -31
- data/maps/bgn-kor-Kore-Latn-1943.yaml +0 -31
- data/maps/bgna-bul-Cyrl-Latn-2006.yaml +0 -208
- data/maps/bgna-bul-Cyrl-Latn-2009.yaml +0 -208
- data/maps/bgnpcgn-amh-Ethi-Latn-1967.yaml +0 -528
- data/maps/bgnpcgn-ara-Arab-Latn-1956.yaml +0 -592
- data/maps/bgnpcgn-arm-Armn-Latn-1981.yaml +0 -108
- data/maps/bgnpcgn-aze-Cyrl-Latn-1993.yaml +0 -104
- data/maps/bgnpcgn-bak-Cyrl-Latn-2007.yaml +0 -184
- data/maps/bgnpcgn-bel-Cyrl-Latn-1979.yaml +0 -285
- data/maps/bgnpcgn-bul-Cyrl-Latn-1952.yaml +0 -115
- data/maps/bgnpcgn-bul-Cyrl-Latn-2013.yaml +0 -38
- data/maps/bgnpcgn-ell-Grek-Latn-1962.yaml +0 -701
- data/maps/bgnpcgn-ell-Grek-Latn-1996.yaml +0 -19
- data/maps/bgnpcgn-jpn-Hrkt-Latn-1976.yaml +0 -257
- data/maps/bgnpcgn-kat-Geor-Latn-1981.yaml +0 -127
- data/maps/bgnpcgn-kat-Geor-Latn-2009.yaml +0 -42
- data/maps/bgnpcgn-kor-Hang-Latn-kn-1945.yaml +0 -253
- data/maps/bgnpcgn-kor-Hang-Latn-rok-2011.yaml +0 -48
- data/maps/bgnpcgn-kor-Kore-Latn-rok-2011.yaml +0 -48
- data/maps/bgnpcgn-mkd-Cyrl-Latn-1981.yaml +0 -159
- data/maps/bgnpcgn-mkd-Cyrl-Latn-2013.yaml +0 -190
- data/maps/bgnpcgn-nep-Deva-Latn-2011.yaml +0 -200
- data/maps/bgnpcgn-per-Arab-Latn-1956.yaml +0 -92
- data/maps/bgnpcgn-rus-Cyrl-Latn-1947.yaml +0 -314
- data/maps/bgnpcgn-srp-Cyrl-Latn-2005.yaml +0 -166
- data/maps/bgnpcgn-ukr-Cyrl-Latn-1965.yaml +0 -162
- data/maps/bgnpcgn-ukr-Cyrl-Latn-2019.yaml +0 -208
- data/maps/bgnpcgn-zho-Hans-Latn-1979.yaml +0 -7456
- data/maps/bis-asm-Beng-Latn-13194-1991.yaml +0 -159
- data/maps/bis-ben-Beng-Latn-13194-1991.yaml +0 -156
- data/maps/bis-dev-Deva-Latn-13194-1991.yaml +0 -184
- data/maps/bis-gjr-Gujr-Latn-13194-1991.yaml +0 -166
- data/maps/bis-knd-Knda-Latn-13194-1991.yaml +0 -173
- data/maps/bis-mlm-Mlym-Latn-13194-1991.yaml +0 -176
- data/maps/bis-ori-Orya-Latn-13194-1991.yaml +0 -160
- data/maps/bis-pnj-Guru-Latn-13194-1991.yaml +0 -175
- data/maps/bis-tel-Telu-Latn-13194-1991.yaml +0 -170
- data/maps/bis-tml-Taml-Latn-13194-1991.yaml +0 -155
- data/maps/by-bel-Cyrl-Latn-1998.yaml +0 -168
- data/maps/by-bel-Cyrl-Latn-2007.yaml +0 -115
- data/maps/dos-nep-Deva-Latn-1997.yaml +0 -33
- data/maps/elot-ell-Grek-Latn-743-1982-tl.yaml +0 -684
- data/maps/elot-ell-Grek-Latn-743-1982-ts.yaml +0 -680
- data/maps/elot-ell-Grek-Latn-743-2001-tl.yaml +0 -19
- data/maps/elot-ell-Grek-Latn-743-2001-ts.yaml +0 -31
- data/maps/ggg-kat-Geor-Latn-2002.yaml +0 -88
- data/maps/gki-bel-Cyrl-Latn-1992.yaml +0 -33
- data/maps/gki-bel-Cyrl-Latn-2000.yaml +0 -201
- data/maps/gost-rus-Cyrl-Latn-16876-71-1983.yaml +0 -186
- data/maps/hk-yue-Hani-Latn-1888.yaml +0 -38497
- data/maps/icao-bel-Cyrl-Latn-9303.yaml +0 -136
- data/maps/icao-bul-Cyrl-Latn-9303.yaml +0 -118
- data/maps/icao-heb-Hebr-Latn-9303.yaml +0 -151
- data/maps/icao-mkd-Cyrl-Latn-9303.yaml +0 -117
- data/maps/icao-per-Arab-Latn-9303.yaml +0 -103
- data/maps/icao-rus-Cyrl-Latn-9303.yaml +0 -117
- data/maps/icao-srp-Cyrl-Latn-9303.yaml +0 -117
- data/maps/icao-ukr-Cyrl-Latn-9303.yaml +0 -119
- data/maps/iso-ara-Arab-Latn-233-1984.yaml +0 -323
- data/maps/iso-ell-Grek-Latn-843-1997-t1.yaml +0 -609
- data/maps/iso-ell-Grek-Latn-843-1997-t2.yaml +0 -40
- data/maps/iso-jpn-Hrkt-Latn-3602-1989.yaml +0 -62
- data/maps/iso-rus-Cyrl-Latn-9-1995.yaml +0 -271
- data/maps/iso-tha-Thai-Latn-11940-1998.yaml +0 -109
- data/maps/kp-kor-Hang-Latn-2002.yaml +0 -901
- data/maps/lshk-yue-Hani-Latn-jyutping-1993.yaml +0 -44820
- data/maps/mext-jpn-Hrkt-Latn-1954.yaml +0 -411
- data/maps/moct-kor-Hang-Latn-2000.yaml +0 -803
- data/maps/mofa-jpn-Hrkt-Latn-1989.yaml +0 -541
- data/maps/mvd-bel-Cyrl-Latn-2008.yaml +0 -225
- data/maps/mvd-bel-Cyrl-Latn-2010.yaml +0 -63
- data/maps/mvd-rus-Cyrl-Latn-2008.yaml +0 -109
- data/maps/mvd-rus-Cyrl-Latn-2010.yaml +0 -37
- data/maps/nil-kor-Hang-Hang-jamo.yaml +0 -11193
- data/maps/odni-aze-Cyrl-Latn-2015.yaml +0 -144
- data/maps/odni-bel-Cyrl-Latn-2015.yaml +0 -148
- data/maps/odni-bul-Cyrl-Latn-2015.yaml +0 -96
- data/maps/odni-hin-Deva-Latn-2015.yaml +0 -258
- data/maps/odni-kat-Geor-Latn-2015.yaml +0 -87
- data/maps/odni-kaz-Cyrl-Latn-2015.yaml +0 -148
- data/maps/odni-kir-Cyrl-Latn-2015.yaml +0 -136
- data/maps/odni-mkd-Cyrl-Latn-2015.yaml +0 -122
- data/maps/odni-rus-Cyrl-Latn-2015.yaml +0 -77
- data/maps/odni-srp-Cyrl-Latn-2015.yaml +0 -129
- data/maps/odni-tat-Cyrl-Latn-2015.yaml +0 -142
- data/maps/odni-tgk-Cyrl-Latn-2015.yaml +0 -148
- data/maps/odni-uig-Cyrl-Latn-2015.yaml +0 -138
- data/maps/odni-ukr-Cyrl-Latn-2015.yaml +0 -157
- data/maps/odni-urd-Arab-Latn-2015.yaml +0 -221
- data/maps/odni-uzb-Cyrl-Latn-2015.yaml +0 -166
- data/maps/royin-tha-Thai-Latn-1939-generic.yaml +0 -90
- data/maps/royin-tha-Thai-Latn-1968.yaml +0 -179
- data/maps/royin-tha-Thai-Latn-1999-chained.yaml +0 -180
- data/maps/royin-tha-Thai-Latn-1999.yaml +0 -76
- data/maps/sac-zho-Hans-Latn-1979.yaml +0 -24759
- data/maps/ses-ara-Arab-Latn-1930.yaml +0 -279
- data/maps/stategeocadastre-ukr-Cyrl-Latn-1993.yaml +0 -222
- data/maps/ua-ukr-Cyrl-Latn-1996.yaml +0 -193
- data/maps/un-ara-Arab-Latn-1971.yaml +0 -139
- data/maps/un-ara-Arab-Latn-1972.yaml +0 -159
- data/maps/un-ara-Arab-Latn-2017.yaml +0 -420
- data/maps/un-bel-Cyrl-Latn-2007.yaml +0 -114
- data/maps/un-ben-Beng-Latn-2016.yaml +0 -534
- data/maps/un-ell-Grek-Latn-1987-tl.yaml +0 -31
- data/maps/un-ell-Grek-Latn-1987-ts.yaml +0 -19
- data/maps/un-ell-Grek-Latn-phonetic-1987.yaml +0 -780
- data/maps/un-mon-Mong-Latn-2013.yaml +0 -99
- data/maps/un-nep-Deva-Latn-1972.yaml +0 -163
- data/maps/un-rus-Cyrl-Latn-1987.yaml +0 -166
- data/maps/un-ukr-Cyrl-Latn-1998.yaml +0 -30
- data/maps/ungegn-amh-Ethi-Latn-2016.yaml +0 -575
- data/maps/var-jpn-Hrkt-Latn-hepburn-1886.yaml +0 -406
- data/maps/var-jpn-Hrkt-Latn-hepburn-1954.yaml +0 -386
- data/maps/var-kor-Hang-Latn-mr-1939.yaml +0 -1054
- data/maps/var-kor-Kore-Hang-2013.yaml +0 -59754
- data/maps/var-kor-Kore-Latn-mr-1939.yaml +0 -36
- data/maps/var-tha-Thai-Thai-phonemic.yaml +0 -59
- data/maps/var-tha-Thai-Zsym-ipa.yaml +0 -301
- data/maps/var-zho-Hani-Latn-1979.yaml +0 -38908
- data/spec/interscript/mapping_spec.rb +0 -42
- data/spec/interscript_spec.rb +0 -26
- data/spec/spec_helper.rb +0 -3
@@ -0,0 +1,24 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
ENV["INTERSCRIPT_STAGING"] = "1"
|
3
|
+
require "bundler/setup"
|
4
|
+
require "interscript"
|
5
|
+
require "interscript/compiler/ruby"
|
6
|
+
|
7
|
+
# Compile a given map with the Ruby compiler for debugging purposes.
#
# Usage: <this script> [-b] MAP
#   -b   print the generated code Base64-encoded instead of verbatim

# A leading "-b" switches the output encoding and is removed from ARGV so the
# map name is always the first remaining argument.
if ARGV.first == '-b'
  require 'base64'
  $b64 = true
  ARGV.shift
end

map = ARGV.first
m = Interscript.parse(map)
cr = Interscript::Compiler::Ruby
mr = cr.call(map)

generated = mr.code
puts($b64 ? Base64.encode64(generated) : generated)
|
@@ -0,0 +1,44 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
ENV["INTERSCRIPT_STAGING"] = "1"
|
3
|
+
require "bundler/setup"
|
4
|
+
require "interscript"
|
5
|
+
require "interscript/compiler/ruby"
|
6
|
+
|
7
|
+
# This script exists because there are some differences between platforms
# (i.e. Windows vs Linux) that we wish to find out more about.
#
# Usage: <this script> [-b] MAP [IDX,IDX,...]
#   -b        collect all reports and print them once, Base64-encoded
#   IDX,...   test indices that are reported even when they pass

if ARGV.first == '-b'
  require 'base64'
  $b64 = []
  ARGV.shift
end

map = ARGV.first
m = Interscript.parse(map)
cr = Interscript::Compiler::Ruby
mr = cr.call(map, debug: true)

m.tests.data.each_with_index do |(from, expected), idx|
  r = mr.call(from)

  # Passing tests are skipped unless their index was requested explicitly.
  requested = ARGV[1] && ARGV[1].split(",").any? { |i| i.to_i == idx }
  if !requested && r == expected
    cr.reset_debug_data
    next
  end

  dr = cr.read_debug_data
  report = [idx, dr]

  if $b64
    $b64 << report
  else
    pp report
  end

  cr.reset_debug_data
end

puts Base64.encode64($b64.inspect) if $b64
|
@@ -0,0 +1,112 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
ENV["INTERSCRIPT_STAGING"] = "1"
|
3
|
+
require "bundler/setup"
|
4
|
+
require "interscript"
|
5
|
+
require "interscript/compiler/ruby"
|
6
|
+
|
7
|
+
# Optional first argument: name of a single staging map to work on.
$map_name = ARGV[0]

# Either that one map file, or every *.imp file in maps-staging (sorted).
filelist =
  if $map_name
    [ "#{__dir__}/../../maps/maps-staging/#{$map_name}.imp" ]
  else
    Dir["#{__dir__}/../../maps/maps-staging/*.imp"].sort
  end
|
14
|
+
|
15
|
+
|
16
|
+
# Levenshtein distance between +s+ and +t+, used to compare string similarity.
# Keeps only the previous and the current row of the edit-distance table.
#
# Returns the number of single-character insertions, deletions and
# substitutions needed to turn +s+ into +t+.
def ld(s, t)
  prev_row = (0..t.length).to_a
  t_chars = t.chars

  s.each_char.with_index(1) do |s_ch, row_no|
    # Column 0: cost of deleting the first row_no characters of s.
    row = [row_no]

    t_chars.each_with_index do |t_ch, j|
      substitution = prev_row[j] + (s_ch == t_ch ? 0 : 1)
      row << [row[j] + 1, prev_row[j + 1] + 1, substitution].min
    end

    prev_row = row
  end

  prev_row.last
end
|
35
|
+
|
36
|
+
|
37
|
+
# Score one candidate ordering of the first parallel group in the :main stage.
#
# Works on a Marshal round-trip copy of $interpreter, so the global
# interpreter itself is not reordered. Every test in system.tests.data is run
# through the copy and compared to its expected output with ld.
#
# system - object whose tests.data yields [from, expected] pairs
# order  - ordering handed to the parallel group's apply_order
#
# Returns [sum of edit distances, list of [expected, result] mismatches].
def score_order( system, order )
  interpreter = Marshal.load( Marshal.dump( $interpreter ))

  parallel = interpreter.map.stages[:main].children.find do |node|
    Interscript::Node::Group::Parallel === node
  end
  parallel.apply_order(order)
  interpreter.map.stages[:main].children[$parallel_idx].children = parallel.children.compact

  errors = []
  delta_sum = system.tests.data.sum do |from, expected|
    result = interpreter.(from)
    delta = ld(expected, result)
    errors << [expected, result] if delta != 0
    delta
  end

  [delta_sum, errors]
end
|
52
|
+
|
53
|
+
|
54
|
+
# Return a copy of +order+ with two distinct, randomly chosen positions
# swapped. +order+ itself is left untouched.
#
# The two positions are always different, so every call yields a real
# mutation instead of occasionally returning an unchanged copy. Orders with
# fewer than two entries have nothing to swap and are returned as a plain copy
# (picking indices with rand(0) would yield a Float and corrupt the array).
def mutate_order(order)
  order2 = order.dup
  return order2 if order2.size < 2

  a, b = order2.each_index.to_a.sample(2)
  order2[a], order2[b] = order2[b], order2[a]
  order2
end
|
61
|
+
|
62
|
+
# Random-search driver.
#
# For every map in filelist that has tests and a parallel group in its :main
# stage: score the original rule order, then repeatedly score a candidate
# order and adopt it whenever it is strictly better than the best one so far.
# The next candidate is always the best order with two entries swapped.
#
# The inner search loop never terminates on its own.
for i in filelist

  begin
    system_name = File.basename(i, ".imp")
    puts "\ndebugging #{system_name}"

    system = Interscript.parse(system_name)
    if system.tests && system.tests.data && system.tests.data.length > 0

      $interpreter = Interscript::Interpreter.new.compile(system)
      main_children = $interpreter.map.stages[:main].children
      $orig_parallel = main_children.find { |x| Interscript::Node::Group::Parallel === x }.dup
      $parallel_idx = main_children.index { |x| Interscript::Node::Group::Parallel === x }
      next if !$parallel_idx

      identity_order = $orig_parallel.children.size.times.to_a
      starting_score, starting_errors = score_order(system, identity_order.dup)

      puts "starting_score = #{starting_score}"

      # The best-so-far triple must describe ONE order: the starting score was
      # measured on the original (identity) order, so that is the best order
      # until a candidate beats it.
      best_score = starting_score
      best_errors = starting_errors
      best_order = identity_order.dup

      # First candidate is a random shuffle. To resume an earlier search,
      # assign a previously printed order to curr_order here instead.
      curr_order = identity_order.shuffle

      while true
        curr_score, curr_errors = score_order(system, curr_order)

        puts Time.now.inspect
        puts best_order.inspect
        puts curr_score
        puts best_errors.inspect
        puts best_score
        puts ''

        if curr_score < best_score
          puts ''
          best_score = curr_score
          best_order = curr_order.dup
          best_errors = curr_errors.dup
        end
        curr_order = mutate_order(best_order)
      end
    end
  # NOTE(review): rescuing Exception also catches Interrupt; since the search
  # loop above is endless, this is presumably how Ctrl-C moves on to the next
  # map -- confirm before narrowing the rescue.
  rescue Exception => e
    puts e
  end

end
|
110
|
+
# For this code to work, sorting by max_length in interpreter.rb (line 46) needs to be disabled:
|
111
|
+
# #r.children.each do |i|
|
112
|
+
# r.children.sort_by{ |rule| -rule.max_length }.each do |i|
|
@@ -0,0 +1,45 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'yaml'
|
4
|
+
require 'fileutils'
|
5
|
+
require 'regexp_parser'
|
6
|
+
|
7
|
+
# Collect statistics about the regular expressions used by the v1 YAML maps:
# how often each regexp_parser expression class occurs, and how often each
# quantifier text occurs. Patterns come from the 'postrules' patterns and the
# keys of 'characters' of every map.
old_maps = Dir["../../interscript/maps/*.yaml"]

$expr_classes = []
$quantifiers = []
old_maps.each do |old_map|
  puts old_map
  yaml = YAML.load(File.read(old_map))
  map_keys = yaml['map'].keys
  puts map_keys

  rs = yaml['map']['postrules']&.map { |h| h['pattern'] } || []
  rs += yaml['map']['characters']&.keys || []

  rs.each do |regexp|
    tree = Regexp::Parser.parse( regexp )

    # Repeatedly replace every expression object by its class followed by its
    # sub-expressions, until only classes are left (a fixed point).
    arr = tree.expressions
    loop do
      new_arr = arr.map do |elem|
        # Quantifiers are tallied separately from expression classes.
        if elem.respond_to?(:quantifier) && elem.quantifier
          $quantifiers << elem.quantifier
        end

        # Entries already reduced to a class stay as they are.
        el = elem.class == Class ? elem : elem.class
        if elem.respond_to?(:expressions)
          [el, elem.expressions]
        else
          el
        end
      end.flatten
      break if new_arr == arr
      arr = new_arr
    end
    $expr_classes += arr
  end
end

pp $expr_classes.tally.sort_by { |_k, v| -v }
pp $quantifiers.map { |q| q.text }.tally.sort_by { |_k, v| -v }
|
data/bin/maps_v1_to_v2
ADDED
@@ -0,0 +1,426 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# This is a helper script for porting Interscript v1 maps to v2 format. It won't
|
3
|
+
# ever be able to port them completely, but it should help bootstrap the process.
|
4
|
+
|
5
|
+
require 'bundler/setup'
|
6
|
+
|
7
|
+
require 'yaml'
|
8
|
+
require 'fileutils'
|
9
|
+
|
10
|
+
Dir.chdir(__dir__ + "/../")
|
11
|
+
FileUtils.rm_rf(Dir.glob("../maps/maps-staging/*"))
|
12
|
+
#FileUtils.mkdir_p("../maps/maps-staging/")
|
13
|
+
|
14
|
+
#old_maps = []
|
15
|
+
old_maps = Dir["../../interscript/maps/*.yaml"]
|
16
|
+
#old_maps = Dir["../../interscript/maps/alalc-aze-Arab-Latn-1997.yaml"]
|
17
|
+
#old_maps = Dir["../../interscript/maps/mofa-jpn-Hrkt-Latn-1989.yaml"]
|
18
|
+
|
19
|
+
|
20
|
+
ex_maps = Dir["../maps/maps/*.imp"]
|
21
|
+
ex_map_names = ex_maps.map { |i| File.basename(i, ".imp") }
|
22
|
+
|
23
|
+
|
24
|
+
require 'regexp_parser'
|
25
|
+
|
26
|
+
require 'interscript/utils/regexp_converter.rb'
|
27
|
+
|
28
|
+
|
29
|
+
# Turn one v1 mapping entry into its v2 source text.
#
# kkk    - source pattern; parsed as a regular expression ('ruby/2.1' syntax)
# vvv    - replacement: a String, an Array, or anything else (emitted as "")
# indent - forwarded unchanged to stringify_root
#
# Returns the value of stringify_root for the assembled rule hash.
def process_line( kkk, vvv, indent: 0)
  root_hash = process_root(process(Regexp::Parser.parse(kkk, 'ruby/2.1')))

  root_hash[:to] =
    if vvv.class == String
      if ['', '""'].include?(vvv) || vvv.include?('~')
        '""'
      elsif vvv == '"'
        '"\""'
      elsif vvv == "?" # alalc-ell-Grek-Latn-1997.imp un-ell-Grek-Latn-1987-phonetic have to "?"
        "?".inspect
      else
        string_vvv = stringify(process(Regexp::Parser.parse(vvv)))
        string_vvv.include?('upcase') ? 'upcase' : string_vvv
      end
    elsif vvv.class == Array
      "any(#{vvv.inspect})"
    else
      "\"\""
    end

  stringify_root(root_hash, indent: indent)
end
|
67
|
+
|
68
|
+
|
69
|
+
old_maps.sort.each do |old_map|
|
70
|
+
old_map_name = File.basename(old_map, ".yaml")
|
71
|
+
|
72
|
+
if ex_map_names.include? old_map_name
|
73
|
+
puts "* Skipping #{old_map_name} as it's already ported"
|
74
|
+
next
|
75
|
+
end
|
76
|
+
|
77
|
+
print "* Converting #{old_map_name}."
|
78
|
+
|
79
|
+
f = File.read(old_map)
|
80
|
+
fl = f.split("\n")
|
81
|
+
|
82
|
+
md = []
|
83
|
+
tests = []
|
84
|
+
map = []
|
85
|
+
chain = nil
|
86
|
+
|
87
|
+
cur = md
|
88
|
+
|
89
|
+
bugnotes = false
|
90
|
+
|
91
|
+
fl.each do |i|
|
92
|
+
if i == '---'
|
93
|
+
# skip the first line
|
94
|
+
elsif i =~ /\A\s+|\A\z/
|
95
|
+
# continuation
|
96
|
+
if bugnotes
|
97
|
+
i = "#{i}"
|
98
|
+
md << i
|
99
|
+
else
|
100
|
+
cur << i
|
101
|
+
end
|
102
|
+
else
|
103
|
+
cmt = nil
|
104
|
+
i = i.sub(/(#.*?)\z/) do |j|
|
105
|
+
cmt = j
|
106
|
+
""
|
107
|
+
end
|
108
|
+
|
109
|
+
# block begin or md
|
110
|
+
case i.strip
|
111
|
+
when "tests:"
|
112
|
+
cur = tests
|
113
|
+
bugnotes = false
|
114
|
+
when "map:"
|
115
|
+
cur = map
|
116
|
+
bugnotes = false
|
117
|
+
when "notes:"
|
118
|
+
md << "notes:"
|
119
|
+
bugnotes = true
|
120
|
+
when /\Achain:/
|
121
|
+
chain = i
|
122
|
+
else
|
123
|
+
cur << i
|
124
|
+
end
|
125
|
+
|
126
|
+
cur << cmt if cmt
|
127
|
+
end
|
128
|
+
end
|
129
|
+
|
130
|
+
print "."
|
131
|
+
|
132
|
+
newmd = []
|
133
|
+
aliasff = false
|
134
|
+
md.each do |i|
|
135
|
+
if i.strip == "alias:"
|
136
|
+
aliasff = true
|
137
|
+
elsif i !~ /\A\s+/
|
138
|
+
aliasff = false
|
139
|
+
end
|
140
|
+
newmd << i unless aliasff
|
141
|
+
end
|
142
|
+
md = newmd
|
143
|
+
|
144
|
+
print "."
|
145
|
+
|
146
|
+
newmd = md.map(&" ".method(:+))
|
147
|
+
.join("\n")
|
148
|
+
.gsub(" note:", " notes:")
|
149
|
+
.gsub("confirmation date:", "confirmation_date:")
|
150
|
+
#.gsub("special_rules:", ' - "special rules:"')
|
151
|
+
#.gsub("original_description:", " # original description:")
|
152
|
+
#.gsub("original_notes:", ' - "original notes:"')
|
153
|
+
#.gsub("implementation_notes:", ' - "implementation notes:"')
|
154
|
+
.rstrip
|
155
|
+
|
156
|
+
new = "metadata {\n"
|
157
|
+
new << newmd
|
158
|
+
new << "\n}\n\n"
|
159
|
+
|
160
|
+
class MultilineError < StandardError; end
|
161
|
+
|
162
|
+
# Emit the "tests { ... }" section from the raw YAML lines gathered in +tests+.
# Each "- source:" line opens a `test <source>, ` statement and the following
# "expected:" line completes it; trailing comments of both lines are appended.
if tests.length > 0
  new << "tests {\n"
  cmt = ""

  # Matchers for the two halves of a v1 test entry (loop-invariant).
  re_source = /\A(?: ){0,2}- source: (.*?)(\s*#.*?)?\z/m
  re_expect = /\A(?: ){0,3}expected:[ \t](.*?)(\s*#.*?)?\z/m

  iter = 0
  while iter < tests.length
    begin
      test = tests[iter]

      # Comment-only lines are copied through.
      if test =~ /\A\s*#/
        new << " " << test.strip << "\n"
        iter += 1
        next
      end

      # Block scalars ("|") and unterminated quoted strings continue on the
      # following lines; glue them on until the next source/expected line.
      if test.rstrip.end_with?("|") ||
         (test =~ /"/ && !test.rstrip.end_with?('"'))
        while iter < tests.length
          xtest = tests[iter+1]
          break if xtest =~ re_source || xtest =~ re_expect
          test << "\n" << (xtest||"")
          iter += 1
        end
      end

      case test
      when re_source
        new << " test #{YAML.load($1).inspect}, "
        cmt = $2 if $2
      when re_expect
        new << "#{YAML.load($1).inspect}".unicode_normalize
        new << cmt
        new << $2 if $2
        new << "\n"
        cmt = ""
      when /\A\s*\z/
        # empty line, ignore
      else
        new << "\n# BUG: #{test}\n"
      end

      iter += 1
    rescue Psych::SyntaxError
      p test
      # Step past the unparsable line; without this the same line would be
      # retried forever.
      iter += 1
    end
  end

  new << "}\n\n"
end
|
212
|
+
|
213
|
+
print "."
|
214
|
+
|
215
|
+
new << "# This map has been partially converted by the bin/maps_v1_to_v2 script\n"
|
216
|
+
new << "# The section below requires human attention. Remember to remove this\n"
|
217
|
+
new << "# comment and move the converted map to 'maps/' directory. Please also\n"
|
218
|
+
new << "# take note that the maps-staging directory will be cleaned up whenever\n"
|
219
|
+
new << "# you run the bin/maps_v1_to_v2 script. You should particularly be\n"
|
220
|
+
new << "# concerned about any regular expressions found in this file and about\n"
|
221
|
+
new << "# advanced expressions in parallel {} parts, and also about the order\n"
|
222
|
+
new << "# of particular parts of the stage.\n\n"
|
223
|
+
|
224
|
+
transcription = nil
|
225
|
+
title_case = nil
|
226
|
+
downcase = nil
|
227
|
+
inherit = nil
|
228
|
+
|
229
|
+
characters, rules, dictionary, postrules = [], [], [], []
|
230
|
+
|
231
|
+
cur = nil
|
232
|
+
indent = 0
|
233
|
+
stagedone = false
|
234
|
+
efini = proc do
|
235
|
+
if inherit
|
236
|
+
new << "\n"
|
237
|
+
inherit.each do |i|
|
238
|
+
new << " run map.#{i}.stage.main\n"
|
239
|
+
end
|
240
|
+
end
|
241
|
+
new << "\n"
|
242
|
+
efini=proc{}
|
243
|
+
end
|
244
|
+
fini = proc{}
|
245
|
+
begn = proc { new << "stage {\n"; stagedone = true; begn=proc{} }
|
246
|
+
|
247
|
+
iter = -1
|
248
|
+
while iter+1 < map.length
|
249
|
+
iter += 1
|
250
|
+
i = map[iter]
|
251
|
+
|
252
|
+
cmt = nil
|
253
|
+
if i =~ /\A [^\s#]/ || i =~ /\A inherit:/
|
254
|
+
i.sub(/(#.*?)\z/) { cmt = $1 }
|
255
|
+
if cmt
|
256
|
+
new << " "*indent << cmt << "\n"
|
257
|
+
end
|
258
|
+
|
259
|
+
maybe_val = YAML.load(i.split(":").last)
|
260
|
+
case i.split(":").first.strip
|
261
|
+
when "inherit"
|
262
|
+
inherit = Array(maybe_val)
|
263
|
+
inherit = inherit.map do |inh|
|
264
|
+
short = inh.split("-")[2..3].join.downcase
|
265
|
+
new << "dependency #{inh.inspect}, as: #{short}\n"
|
266
|
+
short
|
267
|
+
end
|
268
|
+
new << "\n"
|
269
|
+
raise "Duplicate items" unless inherit.length == inherit.uniq.length
|
270
|
+
when "dictionary"
|
271
|
+
begn.()
|
272
|
+
fini.()
|
273
|
+
new << " # DICTIONARY\n"
|
274
|
+
new << " parallel {\n"
|
275
|
+
indent = 4
|
276
|
+
cur = dictionary
|
277
|
+
fini = proc{new << " }\n\n";indent = 2}
|
278
|
+
when "rules"
|
279
|
+
begn.()
|
280
|
+
fini.()
|
281
|
+
efini.()
|
282
|
+
new << " # RULES\n"
|
283
|
+
indent = 2
|
284
|
+
cur = rules
|
285
|
+
fini = proc{new << "\n"}
|
286
|
+
when "characters"
|
287
|
+
begn.()
|
288
|
+
fini.()
|
289
|
+
efini.()
|
290
|
+
new << " # CHARACTERS\n"
|
291
|
+
new << " parallel {\n"
|
292
|
+
indent = 4
|
293
|
+
cur = characters
|
294
|
+
fini = proc{new << " }\n\n";indent = 2}
|
295
|
+
when "postrules"
|
296
|
+
begn.()
|
297
|
+
fini.()
|
298
|
+
efini.()
|
299
|
+
new << " # POSTRULES\n"
|
300
|
+
indent = 2
|
301
|
+
cur = postrules
|
302
|
+
fini = proc{new << "\n"}
|
303
|
+
when "downcase"
|
304
|
+
downcase = maybe_val
|
305
|
+
when "title_case"
|
306
|
+
title_case = maybe_val
|
307
|
+
when "transcription"
|
308
|
+
transcription = maybe_val
|
309
|
+
# Those we will ignore for now
|
310
|
+
when "word_separator", "segmentation", "character_separator", "map"
|
311
|
+
# Those are bugs
|
312
|
+
when "title-case"
|
313
|
+
else
|
314
|
+
p i
|
315
|
+
end
|
316
|
+
else
|
317
|
+
cmt = ""
|
318
|
+
i = i.sub(/(#.*?)\z/) { cmt << $1; "" }
|
319
|
+
#new << " "*indent << i.strip << "\n"
|
320
|
+
if i.strip == ""
|
321
|
+
new << " "*indent << cmt << "\n"
|
322
|
+
next
|
323
|
+
end
|
324
|
+
|
325
|
+
case cur.object_id
|
326
|
+
when nil.object_id
|
327
|
+
raise "Unexpected line #{i}"
|
328
|
+
when characters.object_id, dictionary.object_id
|
329
|
+
k,v = i.split(":", 2).map(&:strip)
|
330
|
+
if !v || v == ""
|
331
|
+
v = ""
|
332
|
+
# Load array
|
333
|
+
iter2 = iter + 1
|
334
|
+
while iter2 < map.length
|
335
|
+
i2 = map[iter2]
|
336
|
+
break unless i2.strip =~ /\A-/
|
337
|
+
i2 = i2.sub(/(#.*?)\z/) { cmt << $1; "" }
|
338
|
+
v << "\n" << i2
|
339
|
+
iter2 += 1
|
340
|
+
end
|
341
|
+
iter = iter2 - 1
|
342
|
+
end
|
343
|
+
|
344
|
+
kk,vv = YAML.load(k), YAML.load(v)
|
345
|
+
|
346
|
+
kkk,vvv = kk.inspect, vv.inspect
|
347
|
+
if vv.class == Array
|
348
|
+
if vv.all? { |z| z.length == 1 }
|
349
|
+
vvv = "any(" + vv.join.inspect + ")"
|
350
|
+
else
|
351
|
+
vvv = "any(" + vv.inspect + ")"
|
352
|
+
end
|
353
|
+
elsif vv.class == NilClass
|
354
|
+
vvv = "none"
|
355
|
+
end
|
356
|
+
kkk,vvv = kk.dup,vv.dup
|
357
|
+
# This worked due to use of regexps... it should remove 1 slash.
|
358
|
+
kkk = kkk.gsub("\\\\u", "\\\\u")
|
359
|
+
kkk = kkk.gsub("\\\\U", "\\\\u")
|
360
|
+
|
361
|
+
new << process_line( kkk,vvv, indent: indent )
|
362
|
+
when rules.object_id, postrules.object_id
|
363
|
+
if i.strip =~ /\A- pattern\s*:/
|
364
|
+
_, k = i.split(":", 2)
|
365
|
+
ii = map[iter+1]
|
366
|
+
ii = ii.sub(/(#.*?)\z/) { cmt << $1; "" }
|
367
|
+
if ii.strip == ""
|
368
|
+
iter += 1
|
369
|
+
ii = map[iter+1]
|
370
|
+
ii = ii.sub(/(#.*?)\z/) { cmt << $1; "" }
|
371
|
+
end
|
372
|
+
if ii.strip =~ /\Aresult\s*:/
|
373
|
+
_, v = ii.split(":", 2)
|
374
|
+
else
|
375
|
+
raise "Unexpected(2): #{ii.strip.inspect}"
|
376
|
+
end
|
377
|
+
iter += 1
|
378
|
+
else
|
379
|
+
raise "Unexpected(1): #{i.strip.inspect}"
|
380
|
+
end
|
381
|
+
|
382
|
+
kk,vv = YAML.load(k), YAML.load(v)
|
383
|
+
# kkk,vvv = kk.inspect, vv.inspect
|
384
|
+
kkk,vvv = kk.dup,vv.dup
|
385
|
+
|
386
|
+
new << process_line( kkk,vvv, indent: indent )
|
387
|
+
end
|
388
|
+
|
389
|
+
if cmt != ""
|
390
|
+
new << " " << cmt << "\n"
|
391
|
+
else
|
392
|
+
new << "\n"
|
393
|
+
end
|
394
|
+
end
|
395
|
+
end
|
396
|
+
|
397
|
+
fini.()
|
398
|
+
|
399
|
+
unless stagedone
|
400
|
+
new << "stage {\n"
|
401
|
+
efini.()
|
402
|
+
end
|
403
|
+
|
404
|
+
new << " title_case\n" if title_case
|
405
|
+
new << " downcase\n" if downcase
|
406
|
+
|
407
|
+
new << "}\n\n"
|
408
|
+
|
409
|
+
if chain
|
410
|
+
new << "# This map is chained and probably depends on seq2seq:\n"
|
411
|
+
new << "# #{chain}\n\n"
|
412
|
+
end
|
413
|
+
|
414
|
+
if transcription
|
415
|
+
new << "# This map contains transcription and probably depends on seq2seq:\n"
|
416
|
+
new << "# transcription: #{transcription}\n\n"
|
417
|
+
end
|
418
|
+
|
419
|
+
new = new.gsub(/ +$/, '') # Cleanup trailing whitespaces
|
420
|
+
|
421
|
+
#new << map.join("\n")
|
422
|
+
|
423
|
+
File.write("../maps/maps-staging/#{old_map_name}.imp", new)
|
424
|
+
|
425
|
+
puts " done!"
|
426
|
+
end
|