interscript 0.1.4 → 2.0.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (183) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +11 -0
  3. data/.rspec +3 -0
  4. data/Gemfile +29 -0
  5. data/LICENSE.adoc +31 -0
  6. data/README.md +3 -0
  7. data/Rakefile +53 -0
  8. data/bin/console +14 -0
  9. data/bin/interscript +3 -39
  10. data/bin/maps_analyze_staging +168 -0
  11. data/bin/maps_debug_compilers +58 -0
  12. data/bin/maps_debug_ordering +88 -0
  13. data/bin/maps_debug_ruby_compile +24 -0
  14. data/bin/maps_debug_step_by_step +44 -0
  15. data/bin/maps_optimize_order +112 -0
  16. data/bin/maps_v1_analyze_regexps +45 -0
  17. data/bin/maps_v1_to_v2 +426 -0
  18. data/exe/interscript +6 -0
  19. data/interscript.gemspec +31 -0
  20. data/lib/interscript.rb +76 -128
  21. data/lib/interscript/command.rb +6 -5
  22. data/lib/interscript/compiler.rb +22 -0
  23. data/lib/interscript/compiler/javascript.rb +292 -0
  24. data/lib/interscript/compiler/ruby.rb +262 -0
  25. data/lib/interscript/dsl.rb +67 -0
  26. data/lib/interscript/dsl/aliases.rb +23 -0
  27. data/lib/interscript/dsl/document.rb +46 -0
  28. data/lib/interscript/dsl/group.rb +45 -0
  29. data/lib/interscript/dsl/group/parallel.rb +6 -0
  30. data/lib/interscript/dsl/items.rb +89 -0
  31. data/lib/interscript/dsl/metadata.rb +26 -0
  32. data/lib/interscript/dsl/stage.rb +6 -0
  33. data/lib/interscript/dsl/symbol_mm.rb +11 -0
  34. data/lib/interscript/dsl/tests.rb +12 -0
  35. data/lib/interscript/interpreter.rb +251 -0
  36. data/lib/interscript/node.rb +25 -0
  37. data/lib/interscript/node/alias_def.rb +15 -0
  38. data/lib/interscript/node/dependency.rb +13 -0
  39. data/lib/interscript/node/document.rb +45 -0
  40. data/lib/interscript/node/group.rb +34 -0
  41. data/lib/interscript/node/group/parallel.rb +9 -0
  42. data/lib/interscript/node/group/sequential.rb +2 -0
  43. data/lib/interscript/node/item.rb +52 -0
  44. data/lib/interscript/node/item/alias.rb +42 -0
  45. data/lib/interscript/node/item/any.rb +61 -0
  46. data/lib/interscript/node/item/capture.rb +50 -0
  47. data/lib/interscript/node/item/group.rb +51 -0
  48. data/lib/interscript/node/item/repeat.rb +40 -0
  49. data/lib/interscript/node/item/stage.rb +23 -0
  50. data/lib/interscript/node/item/string.rb +51 -0
  51. data/lib/interscript/node/metadata.rb +18 -0
  52. data/lib/interscript/node/rule.rb +6 -0
  53. data/lib/interscript/node/rule/funcall.rb +18 -0
  54. data/lib/interscript/node/rule/run.rb +15 -0
  55. data/lib/interscript/node/rule/sub.rb +65 -0
  56. data/lib/interscript/node/stage.rb +19 -0
  57. data/lib/interscript/node/tests.rb +15 -0
  58. data/lib/interscript/stdlib.rb +211 -0
  59. data/lib/interscript/utils/regexp_converter.rb +283 -0
  60. data/lib/interscript/version.rb +1 -1
  61. data/requirements.txt +1 -0
  62. metadata +73 -223
  63. data/README.adoc +0 -297
  64. data/bin/rspec +0 -29
  65. data/lib/g2pwrapper.py +0 -34
  66. data/lib/interscript/mapping.rb +0 -125
  67. data/lib/model-7 +0 -0
  68. data/lib/tha-pt-b-7 +0 -0
  69. data/maps/acadsin-zho-Hani-Latn-2002.yaml +0 -38912
  70. data/maps/alalc-aze-Cyrl-Latn-1997.yaml +0 -141
  71. data/maps/alalc-bel-cyrl-latn-1997.yaml +0 -125
  72. data/maps/alalc-ben-Beng-Latn-2017.yaml +0 -130
  73. data/maps/alalc-bul-Cyrl-Latn-1997.yaml +0 -94
  74. data/maps/alalc-ell-Grek-Latn-1997.yaml +0 -625
  75. data/maps/alalc-ell-Grek-Latn-2010.yaml +0 -628
  76. data/maps/alalc-kat-Geok-Latn-1997.yaml +0 -112
  77. data/maps/alalc-kat-Geor-Latn-1997.yaml +0 -146
  78. data/maps/alalc-kor-Hang-Latn-1997.yaml +0 -94
  79. data/maps/alalc-mkd-Cyrl-Latn-2013.yaml +0 -103
  80. data/maps/alalc-mkd-cyrl-latn-1997.yaml +0 -114
  81. data/maps/alalc-rus-Cyrl-Latn-1997.yaml +0 -222
  82. data/maps/alalc-rus-Cyrl-Latn-2012.yaml +0 -162
  83. data/maps/alalc-srp-Cyrl-Latn-1997.yaml +0 -114
  84. data/maps/alalc-srp-cyrl-latn-2013.yaml +0 -135
  85. data/maps/alalc-ukr-Cyrl-Latn-1997.yaml +0 -141
  86. data/maps/alalc-ukr-Cyrl-Latn-2011.yaml +0 -16
  87. data/maps/apcbg-bul-Cyrl-Latn-1995.yaml +0 -283
  88. data/maps/bas-rus-Cyrl-Latn-2017-bss.yaml +0 -175
  89. data/maps/bas-rus-Cyrl-Latn-2017-oss.yaml +0 -169
  90. data/maps/bgn-jpn-Hrkt-Latn-1962.yaml +0 -294
  91. data/maps/bgn-kor-Hang-Latn-1943.yaml +0 -31
  92. data/maps/bgn-kor-Kore-Latn-1943.yaml +0 -31
  93. data/maps/bgna-bul-Cyrl-Latn-2006.yaml +0 -208
  94. data/maps/bgna-bul-Cyrl-Latn-2009.yaml +0 -208
  95. data/maps/bgnpcgn-arm-Armn-Latn-1981.yaml +0 -108
  96. data/maps/bgnpcgn-aze-Cyrl-Latn-1993.yaml +0 -104
  97. data/maps/bgnpcgn-bak-Cyrl-Latn-2007.yaml +0 -184
  98. data/maps/bgnpcgn-bel-cyrl-latn-1979.yaml +0 -285
  99. data/maps/bgnpcgn-bul-Cyrl-Latn-1952.yaml +0 -115
  100. data/maps/bgnpcgn-bul-Cyrl-Latn-2013.yaml +0 -38
  101. data/maps/bgnpcgn-chn-Hans-Latn-1979.yaml +0 -7456
  102. data/maps/bgnpcgn-ell-Grek-Latn-1962.yaml +0 -702
  103. data/maps/bgnpcgn-ell-Grek-Latn-1996.yaml +0 -20
  104. data/maps/bgnpcgn-jpn-Hrkt-Latn-1976.yaml +0 -257
  105. data/maps/bgnpcgn-kat-Geor-Latn-1981.yaml +0 -127
  106. data/maps/bgnpcgn-kat-Geor-Latn-2009.yaml +0 -43
  107. data/maps/bgnpcgn-kor-Hang-Latn-kn-1945.yaml +0 -253
  108. data/maps/bgnpcgn-kor-Hang-Latn-rok-2011.yaml +0 -48
  109. data/maps/bgnpcgn-kor-Kore-Latn-rok-2011.yaml +0 -48
  110. data/maps/bgnpcgn-mkd-Cyrl-Latn-1981.yaml +0 -159
  111. data/maps/bgnpcgn-mkd-Cyrl-Latn-2013.yaml +0 -190
  112. data/maps/bgnpcgn-per-Arab-Latn-1956.yaml +0 -93
  113. data/maps/bgnpcgn-rus-Cyrl-Latn-1947.yaml +0 -314
  114. data/maps/bgnpcgn-srp-Cyrl-Latn-2005.yaml +0 -166
  115. data/maps/bgnpcgn-ukr-Cyrl-Latn-1965.yaml +0 -163
  116. data/maps/bgnpcgn-ukr-Cyrl-Latn-2019.yaml +0 -208
  117. data/maps/by-bel-Cyrl-Latn-1998.yaml +0 -168
  118. data/maps/by-bel-Cyrl-Latn-2007.yaml +0 -115
  119. data/maps/elot-ell-Grek-Latn-743-1982-tl.yaml +0 -685
  120. data/maps/elot-ell-Grek-Latn-743-1982-ts.yaml +0 -681
  121. data/maps/elot-ell-Grek-Latn-743-2001-tl.yaml +0 -20
  122. data/maps/elot-ell-Grek-Latn-743-2001-ts.yaml +0 -32
  123. data/maps/ggg-kat-Geor-Latn-2002.yaml +0 -89
  124. data/maps/gki-bel-cyrl-latn-1992.yaml +0 -33
  125. data/maps/gki-bel-cyrl-latn-2000.yaml +0 -201
  126. data/maps/gost-rus-cyrl-latn-16876-71-1983.yaml +0 -186
  127. data/maps/hk-yue-Hani-Latn-1888.yaml +0 -38497
  128. data/maps/icao-bel-Cyrl-Latn-9303.yaml +0 -141
  129. data/maps/icao-bul-Cyrl-Latn-9303.yaml +0 -122
  130. data/maps/icao-heb-Hebr-Latn-9303.yaml +0 -151
  131. data/maps/icao-mkd-Cyrl-Latn-9303.yaml +0 -117
  132. data/maps/icao-per-Arab-Latn-9303.yaml +0 -104
  133. data/maps/icao-rus-Cyrl-Latn-9303.yaml +0 -118
  134. data/maps/icao-srp-Cyrl-Latn-9303.yaml +0 -117
  135. data/maps/icao-ukr-Cyrl-Latn-9303.yaml +0 -120
  136. data/maps/iso-ell-Grek-Latn-843-1997-t1.yaml +0 -610
  137. data/maps/iso-ell-Grek-Latn-843-1997-t2.yaml +0 -41
  138. data/maps/iso-jpn-Hrkt-Latn-3602-1989.yaml +0 -62
  139. data/maps/iso-rus-Cyrl-Latn-9-1995.yaml +0 -272
  140. data/maps/iso-tha-Thai-Latn-11940-1998.yaml +0 -109
  141. data/maps/kp-kor-Hang-Latn-2002.yaml +0 -901
  142. data/maps/lshk-yue-Hani-Latn-jyutping-1993.yaml +0 -44820
  143. data/maps/mext-jpn-Hrkt-Latn-1954.yaml +0 -411
  144. data/maps/moct-kor-Hang-Latn-2000.yaml +0 -803
  145. data/maps/mofa-jpn-Hrkt-Latn-1989.yaml +0 -541
  146. data/maps/mvd-bel-Cyrl-Latn-2008.yaml +0 -225
  147. data/maps/mvd-bel-Cyrl-Latn-2010.yaml +0 -63
  148. data/maps/mvd-rus-Cyrl-Latn-2008.yaml +0 -110
  149. data/maps/mvd-rus-Cyrl-Latn-2010.yaml +0 -37
  150. data/maps/nil-kor-Hang-Hang-jamo.yaml +0 -11193
  151. data/maps/odni-bel-Cyrl-Latn-2015.yaml +0 -148
  152. data/maps/odni-bul-Cyrl-Latn-2015.yaml +0 -96
  153. data/maps/odni-kat-Geor-Latn-2015.yaml +0 -88
  154. data/maps/odni-rus-Cyrl-Latn-2015.yaml +0 -77
  155. data/maps/odni-srp-Cyrl-Latn-2015.yaml +0 -129
  156. data/maps/odni-ukr-Cyrl-Latn-2015.yaml +0 -157
  157. data/maps/odni-uzb-Cyrl-Latn-2015.yaml +0 -167
  158. data/maps/royin-tha-Thai-Latn-1939-generic.yaml +0 -90
  159. data/maps/royin-tha-Thai-Latn-1968.yaml +0 -179
  160. data/maps/royin-tha-Thai-Latn-1999-chained.yaml +0 -180
  161. data/maps/royin-tha-Thai-Latn-1999.yaml +0 -76
  162. data/maps/sac-zho-Hans-Latn-1979.yaml +0 -24759
  163. data/maps/stategeocadastre-ukr-Cyrl-Latn-1993.yaml +0 -222
  164. data/maps/ua-ukr-Cyrl-Latn-1996.yaml +0 -193
  165. data/maps/un-bel-Cyrl-Latn-2007.yaml +0 -114
  166. data/maps/un-ben-Beng-Latn-2016.yaml +0 -534
  167. data/maps/un-ell-Grek-Latn-1987-tl.yaml +0 -32
  168. data/maps/un-ell-Grek-Latn-1987-ts.yaml +0 -20
  169. data/maps/un-ell-Grek-Latn-phonetic-1987.yaml +0 -780
  170. data/maps/un-mon-Mong-Latn-2013.yaml +0 -93
  171. data/maps/un-rus-Cyrl-Latn-1987.yaml +0 -166
  172. data/maps/un-ukr-cyrl-latn-1998.yaml +0 -30
  173. data/maps/var-jpn-Hrkt-Latn-hepburn-1886.yaml +0 -406
  174. data/maps/var-jpn-Hrkt-Latn-hepburn-1954.yaml +0 -386
  175. data/maps/var-kor-Hang-Latn-mr-1939.yaml +0 -1054
  176. data/maps/var-kor-Kore-Hang-2013.yaml +0 -59754
  177. data/maps/var-kor-Kore-Latn-mr-1939.yaml +0 -37
  178. data/maps/var-tha-Thai-Thai-phonemic.yaml +0 -59
  179. data/maps/var-tha-Thai-Zsym-ipa.yaml +0 -301
  180. data/maps/var-zho-Hani-Latn-1979.yaml +0 -38908
  181. data/spec/interscript/mapping_spec.rb +0 -42
  182. data/spec/interscript_spec.rb +0 -26
  183. data/spec/spec_helper.rb +0 -3
@@ -0,0 +1,24 @@
1
#!/usr/bin/env ruby
# Compile a given map with the Ruby compiler for debugging purposes and print
# the generated code.
#
# Usage: maps_debug_ruby_compile [-b] MAP_NAME
#   -b  print the generated code Base64-encoded instead of as plain text
ENV["INTERSCRIPT_STAGING"] = "1"
require "bundler/setup"
require "interscript"
require "interscript/compiler/ruby"

# Optional leading "-b" switch; base64 is only loaded when it is requested.
base64_output = ARGV[0] == '-b'
if base64_output
  require 'base64'
  ARGV.shift
end

map = ARGV[0]
abort "usage: #{File.basename($PROGRAM_NAME)} [-b] MAP_NAME" unless map

# The map is parsed before compiling, exactly as before; the parse result
# itself is not used by this script.
Interscript.parse(map)
mr = Interscript::Compiler::Ruby.(map)

puts(base64_output ? Base64.encode64(mr.code) : mr.code)
@@ -0,0 +1,44 @@
1
#!/usr/bin/env ruby
# This script has been written because there are some differences between platforms
# (ie. windows vs linux) that we wish to find out more about.
#
# It compiles a map with the Ruby compiler in debug mode, runs every test of
# the map through it and dumps the compiler's debug data for each test that
# fails (or that was explicitly selected by index).
#
# Usage: maps_debug_step_by_step [-b] MAP_NAME [IDX,IDX,...]
#   -b           collect all dumps and print them as one Base64 blob at the end
#   IDX,IDX,...  zero-based test indices to dump even when the test passes
ENV["INTERSCRIPT_STAGING"] = "1"
require "bundler/setup"
require "interscript"
require "interscript/compiler/ruby"

base64_output = ARGV[0] == '-b'
if base64_output
  require 'base64'
  ARGV.shift
end
collected = []

map = ARGV[0]
abort "usage: #{File.basename($PROGRAM_NAME)} [-b] MAP_NAME [IDX,IDX,...]" unless map

m = Interscript.parse(map)
cr = Interscript::Compiler::Ruby
mr = cr.(map, debug: true)

# Parse the index list once instead of re-splitting ARGV[1] for every test.
forced = ARGV[1] ? ARGV[1].split(",").map(&:to_i) : []

# A map without a tests section simply produces no output.
test_data = (m.tests && m.tests.data) || []

test_data.each_with_index do |(from, expected), idx|
  r = mr.(from)

  # Passing tests are skipped unless their index was explicitly requested.
  if r == expected && !forced.include?(idx)
    cr.reset_debug_data
    next
  end

  dr = cr.read_debug_data

  if base64_output
    collected << [idx, dr]
  else
    pp [idx, dr]
  end

  # Clear the debug data so the next test starts from a clean slate.
  cr.reset_debug_data
end

puts Base64.encode64(collected.inspect) if base64_output
@@ -0,0 +1,112 @@
1
+ #!/usr/bin/env ruby
2
+ ENV["INTERSCRIPT_STAGING"] = "1"
3
+ require "bundler/setup"
4
+ require "interscript"
5
+ require "interscript/compiler/ruby"
6
+
7
# Optional first argument: the name of a single staging map to optimize.
# Without it, every *.imp file in maps-staging is processed in sorted order.
$map_name = ARGV[0]

filelist =
  if $map_name
    [ __dir__+"/../../maps/maps-staging/#{$map_name}.imp" ]
  else
    Dir[__dir__+"/../../maps/maps-staging/*.imp"].sort
  end
14
+
15
+
16
# Levenshtein distance between two strings, used to compare how similar an
# actual transliteration result is to the expected one.
#
# Only two rows of the edit-distance matrix are kept at any time.
#
# @param s [String] first string
# @param t [String] second string
# @return [Integer] minimum number of single-character insertions, deletions
#   and substitutions needed to turn +s+ into +t+
def ld(s, t)
  # Split the target once; it is the same for every row.
  t_chars = t.chars
  prev_row = (0..t_chars.length).to_a

  s.each_char.with_index do |s_ch, i|
    # First column: deleting the first i + 1 characters of s.
    curr_row = [i + 1]

    t_chars.each_with_index do |t_ch, j|
      cost = s_ch == t_ch ? 0 : 1
      curr_row << [curr_row[j] + 1, prev_row[j + 1] + 1, prev_row[j] + cost].min
    end

    # The finished row becomes the "previous" row; no copy is needed because
    # a fresh array is built for every row.
    prev_row = curr_row
  end

  prev_row.last
end
35
+
36
+
37
# Score one candidate ordering of the rules in the map's first parallel group.
#
# Works on a Marshal round-trip copy of $interpreter, applies +order+ to the
# first Parallel node of the :main stage, then runs every test of +system+
# through that copy.
#
# @param system the parsed map; its tests.data yields (input, expected) pairs
# @param order [Array<Integer>] ordering handed to Parallel#apply_order
# @return [Array] [sum of Levenshtein distances over all tests,
#   list of [expected, result] pairs for the tests that did not match]
def score_order( system, order )
  interpreter = Marshal.load( Marshal.dump( $interpreter ))
  main_children = interpreter.map.stages[:main].children

  parallel = main_children.find { |node| Interscript::Node::Group::Parallel === node }
  parallel.apply_order(order)
  main_children[$parallel_idx].children = parallel.children.compact

  errors = []
  delta_sum = system.tests.data.sum do |from, expected|
    result = interpreter.(from)
    delta = ld(expected, result)
    errors << [expected, result] unless delta == 0
    delta
  end

  [delta_sum, errors]
end
52
+
53
+
54
# Return a copy of +order+ with two distinct, randomly chosen positions swapped.
#
# The two positions are always different, so every mutation is a real change
# and no scoring pass is spent on an order identical to its parent. Orders
# with fewer than two elements cannot be mutated and are returned as a plain
# copy.
#
# @param order [Array] the ordering to mutate; it is not modified
# @return [Array] a new array containing the mutated ordering
def mutate_order(order)
  mutated = order.dup
  size = mutated.size
  return mutated if size < 2

  a = rand(size)
  # Offset of 1..size-1 from a, wrapped around, guarantees b != a.
  b = (a + 1 + rand(size - 1)) % size
  mutated[a], mutated[b] = mutated[b], mutated[a]
  mutated
end
61
+
62
# For every staging map that has tests and a parallel group in its :main
# stage, run a randomised search over the order of that group's rules,
# printing the best order found so far after every evaluation.
#
# The search loop never ends on its own: interrupt it (Ctrl-C) to stop
# working on the current map and continue with the next one.
filelist.each do |path|
  begin
    system_name = File.basename(path, ".imp")
    puts "\ndebugging #{system_name}"

    system = Interscript.parse(system_name)
    next unless system.tests && system.tests.data && system.tests.data.length > 0

    # score_order reads these globals, so they must be set before it is called.
    $interpreter = Interscript::Interpreter.new.compile(system)
    main_children = $interpreter.map.stages[:main].children
    $orig_parallel = main_children.find { |x| Interscript::Node::Group::Parallel === x }.dup
    $parallel_idx = main_children.find_index { |x| Interscript::Node::Group::Parallel === x }
    next unless $parallel_idx

    original_order = $orig_parallel.children.size.times.to_a
    starting_score, starting_errors = score_order(system, original_order)
    puts "starting_score = #{starting_score}"

    # The best-so-far triple always describes the same order: it starts as the
    # map's own order, whose score was just measured.
    best_score = starting_score
    best_errors = starting_errors
    best_order = original_order

    # The first candidate is a random shuffle. To resume from a previously
    # found order instead, assign that order to curr_order here.
    curr_order = original_order.shuffle

    loop do
      curr_score, curr_errors = score_order(system, curr_order)
      #print "#{source} <-> #{target} = #{curr_score}; "
      puts Time.now.inspect
      puts best_order.inspect
      puts curr_score
      puts best_errors.inspect
      puts best_score
      puts ''

      if curr_score < best_score
        puts ''
        best_score = curr_score
        best_order = curr_order.dup
        best_errors = curr_errors.dup
      end
      curr_order = mutate_order(best_order)
    end
  rescue Interrupt, StandardError => e
    # Interrupt is rescued on purpose: it is the only way out of the search
    # loop above. Other non-StandardError exceptions (SystemExit,
    # NoMemoryError, ...) are no longer swallowed.
    puts e
  end
end
# for this code to work sorting by max_length in interpreter.rb line 46 needs to be disabled
# #r.children.each do |i|
# r.children.sort_by{ |rule| -rule.max_length }.each do |i|
@@ -0,0 +1,45 @@
1
#!/usr/bin/env ruby
# Analyze the regular expressions used by the v1 YAML maps: tally which
# regexp_parser expression classes and which quantifiers occur in the
# `postrules` patterns and `characters` keys, most frequent first.

require 'yaml'
require 'fileutils'
require 'regexp_parser'

old_maps = Dir["../../interscript/maps/*.yaml"]

$expr_classes = []
$quantifiers = []
old_maps.each do |old_map|
  puts old_map
  yaml = YAML.load(File.read(old_map))
  puts yaml['map'].keys

  rs = yaml['map']['postrules']&.map { |h| h['pattern'] } || []
  rs += yaml['map']['characters']&.keys || []
  rs.each do |regexp|
    tree = Regexp::Parser.parse(regexp)

    # Repeatedly replace every expression object by its class followed by its
    # sub-expressions until only classes are left (a fixed point is reached).
    arr = tree.expressions
    loop do
      new_arr = arr.map do |elem|
        # Each expression object is visited exactly once as an object, so its
        # quantifier is recorded exactly once.
        $quantifiers << elem.quantifier if elem.respond_to?(:quantifier) && elem.quantifier

        # Entries already reduced to a class are kept as they are.
        el = elem.class == Class ? elem : elem.class
        if elem.respond_to?(:expressions)
          [el, elem.expressions]
        else
          el
        end
      end.flatten
      break if new_arr == arr
      arr = new_arr
    end
    $expr_classes += arr
  end
end

pp $expr_classes.tally.sort_by { |_k, v| -v }
pp $quantifiers.map { |q| q.text }.tally.sort_by { |_k, v| -v }
data/bin/maps_v1_to_v2 ADDED
@@ -0,0 +1,426 @@
1
+ #!/usr/bin/env ruby
2
+ # This is a helper script for porting Interscript v1 maps to v2 format. It won't
3
+ # ever be able to port them completely, but it should help bootstrap the process.
4
+
5
+ require 'bundler/setup'
6
+
7
+ require 'yaml'
8
+ require 'fileutils'
9
+
10
# Switch to the parent of this script's directory; every relative path used
# from here on is resolved against it.
+ Dir.chdir(__dir__ + "/../")
11
# WARNING: destructive. Everything currently inside ../maps/maps-staging/ is
# deleted on every run, before any map is converted.
+ FileUtils.rm_rf(Dir.glob("../maps/maps-staging/*"))
12
+ #FileUtils.mkdir_p("../maps/maps-staging/")
13
+
14
+ #old_maps = []
15
# All v1 YAML maps to convert; the glob is relative to the current working directory.
+ old_maps = Dir["../../interscript/maps/*.yaml"]
16
+ #old_maps = Dir["../../interscript/maps/alalc-aze-Arab-Latn-1997.yaml"]
17
+ #old_maps = Dir["../../interscript/maps/mofa-jpn-Hrkt-Latn-1989.yaml"]
18
+
19
+
20
# Names (file basename without ".imp") of the maps that already exist in v2
# format; the conversion loop skips any v1 map whose name is in this list.
+ ex_maps = Dir["../maps/maps/*.imp"]
21
+ ex_map_names = ex_maps.map { |i| File.basename(i, ".imp") }
22
+
23
+
24
+ require 'regexp_parser'
25
+
26
+ require 'interscript/utils/regexp_converter.rb'
27
+
28
+
29
# Convert one v1 rule (a regexp source plus its replacement) into the text of
# a v2 rule.
#
# The pattern is parsed with regexp_parser and converted with the helpers
# loaded from interscript/utils/regexp_converter (process, process_root,
# stringify, stringify_root).
#
# @param kkk [String] v1 pattern, parsed as a 'ruby/2.1' regular expression
# @param vvv [String, Array, Object] v1 replacement value; anything that is
#   neither a String nor an Array is treated as an empty replacement
# @param indent [Integer] passed through to stringify_root
# @return whatever stringify_root returns for the converted rule
def process_line( kkk, vvv, indent: 0)
  root_hash = process_root(process(Regexp::Parser.parse(kkk, 'ruby/2.1')))

  root_hash[:to] =
    if vvv.instance_of?(String)
      if vvv == '' || vvv == '""' || vvv.include?('~')
        # Empty replacements, and any replacement containing "~", become "".
        '""'
      elsif vvv == '"'
        '"\""'
      elsif vvv == "?" #alalc-ell-Grek-Latn-1997.imp un-ell-Grek-Latn-1987-phonetic have to "?"
        "?".inspect
      else
        string_vvv = stringify(process(Regexp::Parser.parse(vvv)))
        # A replacement that mentions upcase collapses to the bare keyword.
        string_vvv.include?('upcase') ? 'upcase' : string_vvv
      end
    elsif vvv.instance_of?(Array)
      "any(#{vvv.inspect})"
    else
      "\"\""
    end

  stringify_root(root_hash, indent: indent)
end
67
+
68
+
69
# Convert every v1 YAML map that has not been ported yet into a v2 `.imp`
# skeleton written to ../maps/maps-staging/<name>.imp.
#
# Each v1 file is split line by line into three buffers (metadata, tests, map)
# which are then emitted as `metadata { }`, `tests { }` and `stage { }`
# sections. The result is only a starting point and needs human attention.
#
# NOTE(review): this listing appears to have had runs of spaces collapsed,
# including inside string and regexp literals (indentation prefixes such as
# " " and patterns such as /\A [^\s#]/). The literals below are reproduced as
# shown; confirm them against the released gem before relying on them.
old_maps.sort.each do |old_map|
  old_map_name = File.basename(old_map, ".yaml")

  if ex_map_names.include? old_map_name
    puts "* Skipping #{old_map_name} as it's already ported"
    next
  end

  print "* Converting #{old_map_name}."

  f = File.read(old_map)
  fl = f.split("\n")

  md = []
  tests = []
  map = []
  chain = nil

  # Buffer that continuation lines are currently appended to.
  cur = md

  # True while inside a top-level `notes:` block, whose lines go to metadata.
  bugnotes = false

  fl.each do |i|
    if i == '---'
      # skip the first line
    elsif i =~ /\A\s+|\A\z/
      # continuation
      if bugnotes
        i = "#{i}"
        md << i
      else
        cur << i
      end
    else
      cmt = nil
      i = i.sub(/(#.*?)\z/) do |j|
        cmt = j
        ""
      end

      # block begin or md
      case i.strip
      when "tests:"
        cur = tests
        bugnotes = false
      when "map:"
        cur = map
        bugnotes = false
      when "notes:"
        md << "notes:"
        bugnotes = true
      when /\Achain:/
        chain = i
      else
        cur << i
      end

      cur << cmt if cmt
    end
  end

  print "."

  # Drop the `alias:` entry together with its indented continuation lines.
  newmd = []
  aliasff = false
  md.each do |i|
    if i.strip == "alias:"
      aliasff = true
    elsif i !~ /\A\s+/
      aliasff = false
    end
    newmd << i unless aliasff
  end
  md = newmd

  print "."

  newmd = md.map(&" ".method(:+))
    .join("\n")
    .gsub(" note:", " notes:")
    .gsub("confirmation date:", "confirmation_date:")
    #.gsub("special_rules:", ' - "special rules:"')
    #.gsub("original_description:", " # original description:")
    #.gsub("original_notes:", ' - "original notes:"')
    #.gsub("implementation_notes:", ' - "implementation notes:"')
    .rstrip

  new = "metadata {\n"
  new << newmd
  new << "\n}\n\n"

  class MultilineError < StandardError; end

  if tests.length > 0
    new << "tests {\n"
    cmt = ""

    iter = 0
    while iter < tests.length
      begin
        test = tests[iter]

        if test =~ /\A\s*#/
          new << " " << test.strip << "\n"
          iter += 1
          next
        end

        re_source = /\A(?: ){0,2}- source: (.*?)(\s*#.*?)?\z/m
        re_expect = /\A(?: ){0,3}expected:[ \t](.*?)(\s*#.*?)?\z/m

        # A block scalar ("|") or an unterminated quoted string continues on
        # the following lines: glue them on until the next source/expected.
        if test.rstrip.end_with?("|") ||
           (test =~ /"/ && !test.rstrip.end_with?('"'))
          while iter < tests.length
            xtest = tests[iter+1]
            break if xtest =~ re_source || xtest =~ re_expect
            test << "\n" << (xtest||"")
            iter += 1
          end
        end

        case test
        when re_source
          new << " test #{YAML.load($1).inspect}, "
          cmt = $2 if $2
        when re_expect
          new << "#{YAML.load($1).inspect}".unicode_normalize
          new << cmt
          new << $2 if $2
          new << "\n"
          cmt = ""
        when /\A\s*\z/
          # empty line, ignore
        else
          new << "\n# BUG: #{test}\n"
        end

        iter += 1
      rescue Psych::SyntaxError
        p test
        # Move past the unparsable line; without this the same line would be
        # retried forever.
        iter += 1
      end
    end

    new << "}\n\n"
  end

  print "."

  new << "# This map has been partially converted by the bin/maps_v1_to_v2 script\n"
  new << "# The section below requires human attention. Remember to remove this\n"
  new << "# comment and move the converted map to 'maps/' directory. Please also\n"
  new << "# take note that the maps-staging directory will be cleaned up whenever\n"
  new << "# you run the bin/maps_v1_to_v2 script. You should particularly be\n"
  new << "# concerned about any regular expressions found in this file and about\n"
  new << "# advanced expressions in parallel {} parts, and also about the order\n"
  new << "# of particular parts of the stage.\n\n"

  transcription = nil
  title_case = nil
  downcase = nil
  inherit = nil

  characters, rules, dictionary, postrules = [], [], [], []

  cur = nil
  indent = 0
  stagedone = false
  # One-shot hooks: efini and begn replace themselves with an empty proc on
  # first use; fini closes whichever section is currently open.
  efini = proc do
    if inherit
      new << "\n"
      inherit.each do |i|
        new << " run map.#{i}.stage.main\n"
      end
    end
    new << "\n"
    efini=proc{}
  end
  fini = proc{}
  begn = proc { new << "stage {\n"; stagedone = true; begn=proc{} }

  iter = -1
  while iter+1 < map.length
    iter += 1
    i = map[iter]

    cmt = nil
    if i =~ /\A [^\s#]/ || i =~ /\A inherit:/
      i.sub(/(#.*?)\z/) { cmt = $1 }
      if cmt
        new << " "*indent << cmt << "\n"
      end

      maybe_val = YAML.load(i.split(":").last)
      case i.split(":").first.strip
      when "inherit"
        inherit = Array(maybe_val)
        inherit = inherit.map do |inh|
          short = inh.split("-")[2..3].join.downcase
          new << "dependency #{inh.inspect}, as: #{short}\n"
          short
        end
        new << "\n"
        raise "Duplicate items" unless inherit.length == inherit.uniq.length
      when "dictionary"
        begn.()
        fini.()
        new << " # DICTIONARY\n"
        new << " parallel {\n"
        indent = 4
        cur = dictionary
        fini = proc{new << " }\n\n";indent = 2}
      when "rules"
        begn.()
        fini.()
        efini.()
        new << " # RULES\n"
        indent = 2
        cur = rules
        fini = proc{new << "\n"}
      when "characters"
        begn.()
        fini.()
        efini.()
        new << " # CHARACTERS\n"
        new << " parallel {\n"
        indent = 4
        cur = characters
        fini = proc{new << " }\n\n";indent = 2}
      when "postrules"
        begn.()
        fini.()
        efini.()
        new << " # POSTRULES\n"
        indent = 2
        cur = postrules
        fini = proc{new << "\n"}
      when "downcase"
        downcase = maybe_val
      when "title_case"
        title_case = maybe_val
      when "transcription"
        transcription = maybe_val
      # Those we will ignore for now
      when "word_separator", "segmentation", "character_separator", "map"
      # Those are bugs
      when "title-case"
      else
        p i
      end
    else
      cmt = ""
      i = i.sub(/(#.*?)\z/) { cmt << $1; "" }
      #new << " "*indent << i.strip << "\n"
      if i.strip == ""
        new << " "*indent << cmt << "\n"
        next
      end

      # The section buffers are told apart by identity: they are all empty
      # arrays and therefore compare equal by value.
      case cur.object_id
      when nil.object_id
        raise "Unexpected line #{i}"
      when characters.object_id, dictionary.object_id
        k,v = i.split(":", 2).map(&:strip)
        if !v || v == ""
          v = ""
          # Load array
          iter2 = iter + 1
          while iter2 < map.length
            i2 = map[iter2]
            break unless i2.strip =~ /\A-/
            i2 = i2.sub(/(#.*?)\z/) { cmt << $1; "" }
            v << "\n" << i2
            iter2 += 1
          end
          iter = iter2 - 1
        end

        kk,vv = YAML.load(k), YAML.load(v)

        kkk,vvv = kk.dup,vv.dup
        # This worked due to use of regexps... it should remove 1 slash.
        kkk = kkk.gsub("\\\\u", "\\\\u")
        kkk = kkk.gsub("\\\\U", "\\\\u")

        new << process_line( kkk,vvv, indent: indent )
      when rules.object_id, postrules.object_id
        if i.strip =~ /\A- pattern\s*:/
          _, k = i.split(":", 2)
          ii = map[iter+1]
          ii = ii.sub(/(#.*?)\z/) { cmt << $1; "" }
          if ii.strip == ""
            iter += 1
            ii = map[iter+1]
            ii = ii.sub(/(#.*?)\z/) { cmt << $1; "" }
          end
          if ii.strip =~ /\Aresult\s*:/
            _, v = ii.split(":", 2)
          else
            raise "Unexpected(2): #{ii.strip.inspect}"
          end
          iter += 1
        else
          raise "Unexpected(1): #{i.strip.inspect}"
        end

        kk,vv = YAML.load(k), YAML.load(v)
        # kkk,vvv = kk.inspect, vv.inspect
        kkk,vvv = kk.dup,vv.dup

        new << process_line( kkk,vvv, indent: indent )
      end

      if cmt != ""
        new << " " << cmt << "\n"
      else
        new << "\n"
      end
    end
  end

  fini.()

  unless stagedone
    new << "stage {\n"
    efini.()
  end

  new << " title_case\n" if title_case
  new << " downcase\n" if downcase

  new << "}\n\n"

  if chain
    new << "# This map is chained and probably depends on seq2seq:\n"
    new << "# #{chain}\n\n"
  end

  if transcription
    new << "# This map contains transcription and probably depends on seq2seq:\n"
    new << "# transcription: #{transcription}\n\n"
  end

  new = new.gsub(/ +$/, '') # Cleanup trailing whitespaces

  #new << map.join("\n")

  File.write("../maps/maps-staging/#{old_map_name}.imp", new)

  puts " done!"
end