interscript 0.1.4 → 2.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (183) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +11 -0
  3. data/.rspec +3 -0
  4. data/Gemfile +29 -0
  5. data/LICENSE.adoc +31 -0
  6. data/README.md +3 -0
  7. data/Rakefile +53 -0
  8. data/bin/console +14 -0
  9. data/bin/interscript +3 -39
  10. data/bin/maps_analyze_staging +168 -0
  11. data/bin/maps_debug_compilers +58 -0
  12. data/bin/maps_debug_ordering +88 -0
  13. data/bin/maps_debug_ruby_compile +24 -0
  14. data/bin/maps_debug_step_by_step +44 -0
  15. data/bin/maps_optimize_order +112 -0
  16. data/bin/maps_v1_analyze_regexps +45 -0
  17. data/bin/maps_v1_to_v2 +426 -0
  18. data/exe/interscript +6 -0
  19. data/interscript.gemspec +31 -0
  20. data/lib/interscript.rb +76 -128
  21. data/lib/interscript/command.rb +6 -5
  22. data/lib/interscript/compiler.rb +22 -0
  23. data/lib/interscript/compiler/javascript.rb +292 -0
  24. data/lib/interscript/compiler/ruby.rb +262 -0
  25. data/lib/interscript/dsl.rb +67 -0
  26. data/lib/interscript/dsl/aliases.rb +23 -0
  27. data/lib/interscript/dsl/document.rb +46 -0
  28. data/lib/interscript/dsl/group.rb +45 -0
  29. data/lib/interscript/dsl/group/parallel.rb +6 -0
  30. data/lib/interscript/dsl/items.rb +89 -0
  31. data/lib/interscript/dsl/metadata.rb +26 -0
  32. data/lib/interscript/dsl/stage.rb +6 -0
  33. data/lib/interscript/dsl/symbol_mm.rb +11 -0
  34. data/lib/interscript/dsl/tests.rb +12 -0
  35. data/lib/interscript/interpreter.rb +251 -0
  36. data/lib/interscript/node.rb +25 -0
  37. data/lib/interscript/node/alias_def.rb +15 -0
  38. data/lib/interscript/node/dependency.rb +13 -0
  39. data/lib/interscript/node/document.rb +45 -0
  40. data/lib/interscript/node/group.rb +34 -0
  41. data/lib/interscript/node/group/parallel.rb +9 -0
  42. data/lib/interscript/node/group/sequential.rb +2 -0
  43. data/lib/interscript/node/item.rb +52 -0
  44. data/lib/interscript/node/item/alias.rb +42 -0
  45. data/lib/interscript/node/item/any.rb +61 -0
  46. data/lib/interscript/node/item/capture.rb +50 -0
  47. data/lib/interscript/node/item/group.rb +51 -0
  48. data/lib/interscript/node/item/repeat.rb +40 -0
  49. data/lib/interscript/node/item/stage.rb +23 -0
  50. data/lib/interscript/node/item/string.rb +51 -0
  51. data/lib/interscript/node/metadata.rb +18 -0
  52. data/lib/interscript/node/rule.rb +6 -0
  53. data/lib/interscript/node/rule/funcall.rb +18 -0
  54. data/lib/interscript/node/rule/run.rb +15 -0
  55. data/lib/interscript/node/rule/sub.rb +65 -0
  56. data/lib/interscript/node/stage.rb +19 -0
  57. data/lib/interscript/node/tests.rb +15 -0
  58. data/lib/interscript/stdlib.rb +211 -0
  59. data/lib/interscript/utils/regexp_converter.rb +283 -0
  60. data/lib/interscript/version.rb +1 -1
  61. data/requirements.txt +1 -0
  62. metadata +73 -223
  63. data/README.adoc +0 -297
  64. data/bin/rspec +0 -29
  65. data/lib/g2pwrapper.py +0 -34
  66. data/lib/interscript/mapping.rb +0 -125
  67. data/lib/model-7 +0 -0
  68. data/lib/tha-pt-b-7 +0 -0
  69. data/maps/acadsin-zho-Hani-Latn-2002.yaml +0 -38912
  70. data/maps/alalc-aze-Cyrl-Latn-1997.yaml +0 -141
  71. data/maps/alalc-bel-cyrl-latn-1997.yaml +0 -125
  72. data/maps/alalc-ben-Beng-Latn-2017.yaml +0 -130
  73. data/maps/alalc-bul-Cyrl-Latn-1997.yaml +0 -94
  74. data/maps/alalc-ell-Grek-Latn-1997.yaml +0 -625
  75. data/maps/alalc-ell-Grek-Latn-2010.yaml +0 -628
  76. data/maps/alalc-kat-Geok-Latn-1997.yaml +0 -112
  77. data/maps/alalc-kat-Geor-Latn-1997.yaml +0 -146
  78. data/maps/alalc-kor-Hang-Latn-1997.yaml +0 -94
  79. data/maps/alalc-mkd-Cyrl-Latn-2013.yaml +0 -103
  80. data/maps/alalc-mkd-cyrl-latn-1997.yaml +0 -114
  81. data/maps/alalc-rus-Cyrl-Latn-1997.yaml +0 -222
  82. data/maps/alalc-rus-Cyrl-Latn-2012.yaml +0 -162
  83. data/maps/alalc-srp-Cyrl-Latn-1997.yaml +0 -114
  84. data/maps/alalc-srp-cyrl-latn-2013.yaml +0 -135
  85. data/maps/alalc-ukr-Cyrl-Latn-1997.yaml +0 -141
  86. data/maps/alalc-ukr-Cyrl-Latn-2011.yaml +0 -16
  87. data/maps/apcbg-bul-Cyrl-Latn-1995.yaml +0 -283
  88. data/maps/bas-rus-Cyrl-Latn-2017-bss.yaml +0 -175
  89. data/maps/bas-rus-Cyrl-Latn-2017-oss.yaml +0 -169
  90. data/maps/bgn-jpn-Hrkt-Latn-1962.yaml +0 -294
  91. data/maps/bgn-kor-Hang-Latn-1943.yaml +0 -31
  92. data/maps/bgn-kor-Kore-Latn-1943.yaml +0 -31
  93. data/maps/bgna-bul-Cyrl-Latn-2006.yaml +0 -208
  94. data/maps/bgna-bul-Cyrl-Latn-2009.yaml +0 -208
  95. data/maps/bgnpcgn-arm-Armn-Latn-1981.yaml +0 -108
  96. data/maps/bgnpcgn-aze-Cyrl-Latn-1993.yaml +0 -104
  97. data/maps/bgnpcgn-bak-Cyrl-Latn-2007.yaml +0 -184
  98. data/maps/bgnpcgn-bel-cyrl-latn-1979.yaml +0 -285
  99. data/maps/bgnpcgn-bul-Cyrl-Latn-1952.yaml +0 -115
  100. data/maps/bgnpcgn-bul-Cyrl-Latn-2013.yaml +0 -38
  101. data/maps/bgnpcgn-chn-Hans-Latn-1979.yaml +0 -7456
  102. data/maps/bgnpcgn-ell-Grek-Latn-1962.yaml +0 -702
  103. data/maps/bgnpcgn-ell-Grek-Latn-1996.yaml +0 -20
  104. data/maps/bgnpcgn-jpn-Hrkt-Latn-1976.yaml +0 -257
  105. data/maps/bgnpcgn-kat-Geor-Latn-1981.yaml +0 -127
  106. data/maps/bgnpcgn-kat-Geor-Latn-2009.yaml +0 -43
  107. data/maps/bgnpcgn-kor-Hang-Latn-kn-1945.yaml +0 -253
  108. data/maps/bgnpcgn-kor-Hang-Latn-rok-2011.yaml +0 -48
  109. data/maps/bgnpcgn-kor-Kore-Latn-rok-2011.yaml +0 -48
  110. data/maps/bgnpcgn-mkd-Cyrl-Latn-1981.yaml +0 -159
  111. data/maps/bgnpcgn-mkd-Cyrl-Latn-2013.yaml +0 -190
  112. data/maps/bgnpcgn-per-Arab-Latn-1956.yaml +0 -93
  113. data/maps/bgnpcgn-rus-Cyrl-Latn-1947.yaml +0 -314
  114. data/maps/bgnpcgn-srp-Cyrl-Latn-2005.yaml +0 -166
  115. data/maps/bgnpcgn-ukr-Cyrl-Latn-1965.yaml +0 -163
  116. data/maps/bgnpcgn-ukr-Cyrl-Latn-2019.yaml +0 -208
  117. data/maps/by-bel-Cyrl-Latn-1998.yaml +0 -168
  118. data/maps/by-bel-Cyrl-Latn-2007.yaml +0 -115
  119. data/maps/elot-ell-Grek-Latn-743-1982-tl.yaml +0 -685
  120. data/maps/elot-ell-Grek-Latn-743-1982-ts.yaml +0 -681
  121. data/maps/elot-ell-Grek-Latn-743-2001-tl.yaml +0 -20
  122. data/maps/elot-ell-Grek-Latn-743-2001-ts.yaml +0 -32
  123. data/maps/ggg-kat-Geor-Latn-2002.yaml +0 -89
  124. data/maps/gki-bel-cyrl-latn-1992.yaml +0 -33
  125. data/maps/gki-bel-cyrl-latn-2000.yaml +0 -201
  126. data/maps/gost-rus-cyrl-latn-16876-71-1983.yaml +0 -186
  127. data/maps/hk-yue-Hani-Latn-1888.yaml +0 -38497
  128. data/maps/icao-bel-Cyrl-Latn-9303.yaml +0 -141
  129. data/maps/icao-bul-Cyrl-Latn-9303.yaml +0 -122
  130. data/maps/icao-heb-Hebr-Latn-9303.yaml +0 -151
  131. data/maps/icao-mkd-Cyrl-Latn-9303.yaml +0 -117
  132. data/maps/icao-per-Arab-Latn-9303.yaml +0 -104
  133. data/maps/icao-rus-Cyrl-Latn-9303.yaml +0 -118
  134. data/maps/icao-srp-Cyrl-Latn-9303.yaml +0 -117
  135. data/maps/icao-ukr-Cyrl-Latn-9303.yaml +0 -120
  136. data/maps/iso-ell-Grek-Latn-843-1997-t1.yaml +0 -610
  137. data/maps/iso-ell-Grek-Latn-843-1997-t2.yaml +0 -41
  138. data/maps/iso-jpn-Hrkt-Latn-3602-1989.yaml +0 -62
  139. data/maps/iso-rus-Cyrl-Latn-9-1995.yaml +0 -272
  140. data/maps/iso-tha-Thai-Latn-11940-1998.yaml +0 -109
  141. data/maps/kp-kor-Hang-Latn-2002.yaml +0 -901
  142. data/maps/lshk-yue-Hani-Latn-jyutping-1993.yaml +0 -44820
  143. data/maps/mext-jpn-Hrkt-Latn-1954.yaml +0 -411
  144. data/maps/moct-kor-Hang-Latn-2000.yaml +0 -803
  145. data/maps/mofa-jpn-Hrkt-Latn-1989.yaml +0 -541
  146. data/maps/mvd-bel-Cyrl-Latn-2008.yaml +0 -225
  147. data/maps/mvd-bel-Cyrl-Latn-2010.yaml +0 -63
  148. data/maps/mvd-rus-Cyrl-Latn-2008.yaml +0 -110
  149. data/maps/mvd-rus-Cyrl-Latn-2010.yaml +0 -37
  150. data/maps/nil-kor-Hang-Hang-jamo.yaml +0 -11193
  151. data/maps/odni-bel-Cyrl-Latn-2015.yaml +0 -148
  152. data/maps/odni-bul-Cyrl-Latn-2015.yaml +0 -96
  153. data/maps/odni-kat-Geor-Latn-2015.yaml +0 -88
  154. data/maps/odni-rus-Cyrl-Latn-2015.yaml +0 -77
  155. data/maps/odni-srp-Cyrl-Latn-2015.yaml +0 -129
  156. data/maps/odni-ukr-Cyrl-Latn-2015.yaml +0 -157
  157. data/maps/odni-uzb-Cyrl-Latn-2015.yaml +0 -167
  158. data/maps/royin-tha-Thai-Latn-1939-generic.yaml +0 -90
  159. data/maps/royin-tha-Thai-Latn-1968.yaml +0 -179
  160. data/maps/royin-tha-Thai-Latn-1999-chained.yaml +0 -180
  161. data/maps/royin-tha-Thai-Latn-1999.yaml +0 -76
  162. data/maps/sac-zho-Hans-Latn-1979.yaml +0 -24759
  163. data/maps/stategeocadastre-ukr-Cyrl-Latn-1993.yaml +0 -222
  164. data/maps/ua-ukr-Cyrl-Latn-1996.yaml +0 -193
  165. data/maps/un-bel-Cyrl-Latn-2007.yaml +0 -114
  166. data/maps/un-ben-Beng-Latn-2016.yaml +0 -534
  167. data/maps/un-ell-Grek-Latn-1987-tl.yaml +0 -32
  168. data/maps/un-ell-Grek-Latn-1987-ts.yaml +0 -20
  169. data/maps/un-ell-Grek-Latn-phonetic-1987.yaml +0 -780
  170. data/maps/un-mon-Mong-Latn-2013.yaml +0 -93
  171. data/maps/un-rus-Cyrl-Latn-1987.yaml +0 -166
  172. data/maps/un-ukr-cyrl-latn-1998.yaml +0 -30
  173. data/maps/var-jpn-Hrkt-Latn-hepburn-1886.yaml +0 -406
  174. data/maps/var-jpn-Hrkt-Latn-hepburn-1954.yaml +0 -386
  175. data/maps/var-kor-Hang-Latn-mr-1939.yaml +0 -1054
  176. data/maps/var-kor-Kore-Hang-2013.yaml +0 -59754
  177. data/maps/var-kor-Kore-Latn-mr-1939.yaml +0 -37
  178. data/maps/var-tha-Thai-Thai-phonemic.yaml +0 -59
  179. data/maps/var-tha-Thai-Zsym-ipa.yaml +0 -301
  180. data/maps/var-zho-Hani-Latn-1979.yaml +0 -38908
  181. data/spec/interscript/mapping_spec.rb +0 -42
  182. data/spec/interscript_spec.rb +0 -26
  183. data/spec/spec_helper.rb +0 -3
@@ -0,0 +1,24 @@
1
+ #!/usr/bin/env ruby
2
+ ENV["INTERSCRIPT_STAGING"] = "1"
3
+ require "bundler/setup"
4
+ require "interscript"
5
+ require "interscript/compiler/ruby"
6
+
7
# Debugging aid: compile the map named on the command line with the Ruby
# compiler and print the generated code.
#
# Usage: maps_debug_ruby_compile [-b] MAP_NAME
#   -b  print the generated code Base64-encoded

# A leading "-b" is consumed here so that the map name is ARGV[0] afterwards.
$b64 = true if ARGV.first == '-b'
if $b64
  require 'base64'
  ARGV.shift
end

map = ARGV.first
# The map is parsed up front; the parse result is not used further below.
m = Interscript.parse(map)
cr = Interscript::Compiler::Ruby
mr = cr.call(map)

generated = mr.code
puts($b64 ? Base64.encode64(generated) : generated)
@@ -0,0 +1,44 @@
1
+ #!/usr/bin/env ruby
2
+ ENV["INTERSCRIPT_STAGING"] = "1"
3
+ require "bundler/setup"
4
+ require "interscript"
5
+ require "interscript/compiler/ruby"
6
+
7
# Step-by-step debugging aid. It was written because there are some
# differences between platforms (e.g. Windows vs Linux) that we wish to find
# out more about.
#
# Usage: maps_debug_step_by_step [-b] MAP_NAME [IDX,IDX,...]
#   -b           collect every report and print them once, Base64-encoded
#   IDX,IDX,...  test indices that are reported even when they pass

if ARGV.first == '-b'
  require 'base64'
  $b64 = []
  ARGV.shift
end

map = ARGV.first
m = Interscript.parse(map)
cr = Interscript::Compiler::Ruby
mr = cr.call(map, debug: true)

# Test indices listed in the optional second argument are always reported.
forced = ARGV[1] ? ARGV[1].split(",").map(&:to_i) : []

m.tests.data.each_with_index do |(from, expected), idx|
  actual = mr.call(from)

  # A passing test is skipped (and its debug data discarded) unless forced.
  if !forced.include?(idx) && actual == expected
    cr.reset_debug_data
    next
  end

  report = [idx, cr.read_debug_data]
  if $b64
    $b64 << report
  else
    pp report
  end

  cr.reset_debug_data
end

puts Base64.encode64($b64.inspect) if $b64
@@ -0,0 +1,112 @@
1
+ #!/usr/bin/env ruby
2
+ ENV["INTERSCRIPT_STAGING"] = "1"
3
+ require "bundler/setup"
4
+ require "interscript"
5
+ require "interscript/compiler/ruby"
6
+
7
# Work on a single staging map when its name is given as the first argument,
# otherwise on every staging map, in sorted order.
$map_name = ARGV.first

filelist =
  if $map_name
    [__dir__ + "/../../maps/maps-staging/#{$map_name}.imp"]
  else
    Dir[__dir__ + "/../../maps/maps-staging/*.imp"].sort
  end
14
+
15
+
16
# Levenshtein distance between the strings +s+ and +t+, i.e. the number of
# single-character insertions, deletions and substitutions separating them.
# Only the previous and the current row of the distance table are kept.
#
# @param s [String]
# @param t [String]
# @return [Integer] 0 when both strings are equal
def ld(s, t)
  target = t.chars
  prev_row = (0..t.length).to_a

  s.each_char.with_index(1) do |s_ch, row_no|
    # Column 0: turning the first row_no characters of s into "".
    row = [row_no]

    target.each_with_index do |t_ch, j|
      substitution = prev_row[j] + (s_ch == t_ch ? 0 : 1)
      row << [row[j] + 1, prev_row[j + 1] + 1, substitution].min
    end

    prev_row = row
  end

  prev_row[t.length]
end
35
+
36
+
37
# Scores one ordering of the parallel group's rules against the map's tests.
#
# Works on a deep copy (Marshal round-trip) of the global $interpreter, applies
# +order+ to the first parallel group of its :main stage, stores the compacted
# children at $parallel_idx and then runs every test through the copy.
#
# @param system the parsed map; only +system.tests.data+ is read here
# @param order  passed unchanged to the parallel group's +apply_order+
# @return [Array] +[delta_sum, errors]+: the summed Levenshtein distance
#   between expected and actual results, and the +[expected, actual]+ pairs
#   that differed
def score_order(system, order)
  candidate = Marshal.load(Marshal.dump($interpreter))

  parallel = candidate.map.stages[:main].children.find do |node|
    Interscript::Node::Group::Parallel === node
  end
  parallel.apply_order(order)
  candidate.map.stages[:main].children[$parallel_idx].children = parallel.children.compact

  failures = []
  total = 0
  system.tests.data.each do |from, expected|
    actual = candidate.call(from)
    distance = ld(expected, actual)
    failures << [expected, actual] unless distance == 0
    total += distance
  end

  [total, failures]
end
52
+
53
+
54
# Returns a copy of +order+ with two distinct, randomly chosen positions
# swapped. +order+ itself is never modified.
#
# Orders with fewer than two elements are returned as plain copies: there is
# nothing to swap, and indexing an empty array with the Float that rand(0)
# returns would otherwise grow the copy to [nil].
#
# @param order [Array] the current ordering
# @return [Array] a new array holding the same elements
def mutate_order(order)
  order2 = order.dup
  return order2 if order2.size < 2

  # sample(2) yields two different indices, so every call really changes the
  # order instead of occasionally swapping a position with itself.
  a, b = (0...order2.size).to_a.sample(2)
  order2[a], order2[b] = order2[b], order2[a]
  order2
end
61
+
62
# Random-search driver. For every staging map that has tests and a parallel
# group in its :main stage, it keeps scoring a candidate rule order and then
# mutating the best order seen so far, printing its state on each iteration.
#
# The search loop has no exit condition of its own; it runs until an error or
# an interrupt occurs, after which the next map (if any) is processed.
filelist.each do |map_path|
  begin
    system_name = File.basename(map_path, ".imp")
    puts "\ndebugging #{system_name}"

    system = Interscript.parse(system_name)
    next unless system.tests && system.tests.data && system.tests.data.length > 0

    # score_order reads $interpreter and $parallel_idx.
    $interpreter = Interscript::Interpreter.new.compile(system)
    main_children = $interpreter.map.stages[:main].children
    $orig_parallel = main_children.find { |node| Interscript::Node::Group::Parallel === node }.dup
    $parallel_idx = main_children.index { |node| Interscript::Node::Group::Parallel === node }
    next unless $parallel_idx

    identity_order = $orig_parallel.children.size.times.to_a
    starting_score, starting_errors = score_order(system, identity_order)

    puts "starting_score = #{starting_score}"
    best_score = starting_score
    best_errors = starting_errors

    # NOTE(review): best_score/best_errors start from the unshuffled order
    # while best_order starts from a shuffled one, so they only describe the
    # same order after the first improvement - confirm this is intended.
    # To resume an earlier run, assign a previously printed order here instead.
    curr_order = identity_order.shuffle
    best_order = curr_order.dup

    loop do
      curr_score, curr_errors = score_order(system, curr_order)
      puts Time.now.inspect
      puts best_order.inspect
      puts curr_score
      puts best_errors.inspect
      puts best_score
      puts ''

      if curr_score < best_score
        puts ''
        best_score = curr_score
        best_order = curr_order.dup
        best_errors = curr_errors
      end
      curr_order = mutate_order(best_order)
    end
  rescue StandardError, Interrupt => e
    # Interrupt is rescued on purpose: the loop above never ends by itself, so
    # Ctrl-C is what moves the script on to the next map. Other non-standard
    # exceptions (SystemExit, NoMemoryError, ...) are no longer swallowed.
    puts e
  end
end
110
+ # for this code to work sorting by max_length in interpreter.rb line 46 needs to be disabled
111
+ # #r.children.each do |i|
112
+ # r.children.sort_by{ |rule| -rule.max_length }.each do |i|
@@ -0,0 +1,45 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'yaml'
4
+ require 'fileutils'
5
+ require 'regexp_parser'
6
+
7
# Surveys which regular-expression constructs the v1 YAML maps use.
#
# Every postrule pattern and every key of the `characters` section is parsed
# with Regexp::Parser. The expression tree is flattened into the classes of
# its nodes, and the quantifiers met on the way are collected separately.
# Two tallies are printed at the end, most frequent first.
old_maps = Dir["../../interscript/maps/*.yaml"]

$expr_classes = []
$quantifiers = []

old_maps.each do |old_map|
  puts old_map
  yaml = YAML.load(File.read(old_map))
  map_keys = yaml['map'].keys
  puts map_keys

  rs = yaml['map']['postrules']&.map { |h| h['pattern'] } || []
  rs += yaml['map']['characters']&.keys || []

  rs.each do |regexp|
    tree = Regexp::Parser.parse(regexp)

    # Replace expression objects by their class plus their children until
    # only classes are left, i.e. until a pass changes nothing.
    arr = tree.expressions
    loop do
      new_arr = arr.map do |elem|
        if elem.respond_to?(:quantifier) && elem.quantifier
          $quantifiers << elem.quantifier
        end
        # Entries that were already reduced to a class are kept as they are.
        el = elem.class == Class ? elem : elem.class
        if elem.respond_to?(:expressions)
          [el, elem.expressions]
        else
          el
        end
      end.flatten
      break if new_arr == arr
      arr = new_arr
    end
    $expr_classes += arr
  end
end

pp $expr_classes.tally.sort_by { |_k, v| -v }
pp $quantifiers.map { |q| q.text }.tally.sort_by { |_k, v| -v }
data/bin/maps_v1_to_v2 ADDED
@@ -0,0 +1,426 @@
1
+ #!/usr/bin/env ruby
2
+ # This is a helper script for porting Interscript v1 maps to v2 format. It won't
3
+ # ever be able to port them completely, but it should help bootstrap the process.
4
+
5
+ require 'bundler/setup'
6
+
7
+ require 'yaml'
8
+ require 'fileutils'
9
+
10
# All relative paths below are resolved from the directory above this script.
Dir.chdir(__dir__ + "/../")

# Start from an empty staging area: everything in it is removed on each run.
FileUtils.rm_rf(Dir.glob("../maps/maps-staging/*"))
#FileUtils.mkdir_p("../maps/maps-staging/")

# v1 maps to convert; swap in one of the commented lines to narrow the run.
#old_maps = []
old_maps = Dir["../../interscript/maps/*.yaml"]
#old_maps = Dir["../../interscript/maps/alalc-aze-Arab-Latn-1997.yaml"]
#old_maps = Dir["../../interscript/maps/mofa-jpn-Hrkt-Latn-1989.yaml"]

# Names (without extension) of the maps found under ../maps/maps.
ex_maps = Dir["../maps/maps/*.imp"]
ex_map_names = ex_maps.map { |path| File.basename(path, ".imp") }
22
+
23
+
24
+ require 'regexp_parser'
25
+
26
+ require 'interscript/utils/regexp_converter.rb'
27
+
28
+
29
# Converts one v1 mapping pair into a line of v2 map source.
#
# The pattern +kkk+ is parsed as a regular expression ('ruby/2.1' syntax) and
# handed to process / process_root; the replacement +vvv+ decides the :to
# entry of the resulting hash, which stringify_root then renders.
#
# @param kkk [String] v1 pattern
# @param vvv [String, Array, Object] v1 replacement
# @param indent [Integer] forwarded to stringify_root
# @return whatever stringify_root returns for the assembled hash
def process_line(kkk, vvv, indent: 0)
  root_hash = process_root(process(Regexp::Parser.parse(kkk, 'ruby/2.1')))

  root_hash[:to] =
    if vvv.instance_of?(String)
      if vvv.empty? || vvv == '""' || vvv.include?('~')
        '""'
      elsif vvv == '"'
        '"\""'
      elsif vvv == "?"
        # alalc-ell-Grek-Latn-1997.imp un-ell-Grek-Latn-1987-phonetic have to "?"
        "?".inspect
      else
        rendered = stringify(process(Regexp::Parser.parse(vvv)))
        # Any replacement whose rendering mentions upcase becomes plain upcase.
        rendered.include?('upcase') ? 'upcase' : rendered
      end
    elsif vvv.instance_of?(Array)
      "any(#{vvv.inspect})"
    else
      "\"\""
    end

  stringify_root(root_hash, indent: indent)
end
67
+
68
+
69
# Converts every v1 YAML map that has no v2 counterpart yet into a draft .imp
# file under ../maps/maps-staging/. Each source file is handled line by line:
#
#   1. split the lines into metadata, tests and map sections
#   2. emit a `metadata { }` block (dropping `alias:` entries)
#   3. emit a `tests { }` block
#   4. emit a `stage { }` block from the map section via process_line
#
# NOTE(review): string and regexp literals are reproduced exactly as they
# appear in the reviewed text; confirm the whitespace-sensitive ones (single
# space prefixes, `(?: )` groups) against the repository.
old_maps.sort.each do |old_map|
  old_map_name = File.basename(old_map, ".yaml")

  if ex_map_names.include? old_map_name
    puts "* Skipping #{old_map_name} as it's already ported"
    next
  end

  print "* Converting #{old_map_name}."

  f = File.read(old_map)
  fl = f.split("\n")

  md = []
  tests = []
  map = []
  chain = nil

  # Section that continuation lines are currently appended to.
  cur = md

  # True while inside `notes:`; its continuation lines go to the metadata.
  bugnotes = false

  fl.each do |i|
    if i == '---'
      # skip the first line
    elsif i =~ /\A\s+|\A\z/
      # continuation
      if bugnotes
        i = "#{i}"
        md << i
      else
        cur << i
      end
    else
      # Split a trailing comment off the line; it is re-appended below.
      cmt = nil
      i = i.sub(/(#.*?)\z/) do |j|
        cmt = j
        ""
      end

      # block begin or md
      case i.strip
      when "tests:"
        cur = tests
        bugnotes = false
      when "map:"
        cur = map
        bugnotes = false
      when "notes:"
        md << "notes:"
        bugnotes = true
      when /\Achain:/
        chain = i
      else
        cur << i
      end

      cur << cmt if cmt
    end
  end

  print "."

  # Drop `alias:` together with its indented continuation lines.
  newmd = []
  aliasff = false
  md.each do |i|
    if i.strip == "alias:"
      aliasff = true
    elsif i !~ /\A\s+/
      aliasff = false
    end
    newmd << i unless aliasff
  end
  md = newmd

  print "."

  newmd = md.map(&" ".method(:+))
            .join("\n")
            .gsub(" note:", " notes:")
            .gsub("confirmation date:", "confirmation_date:")
            .rstrip
  # Further rewrites that are currently disabled:
  #   .gsub("special_rules:", ' - "special rules:"')
  #   .gsub("original_description:", " # original description:")
  #   .gsub("original_notes:", ' - "original notes:"')
  #   .gsub("implementation_notes:", ' - "implementation notes:"')

  new = "metadata {\n"
  new << newmd
  new << "\n}\n\n"

  class MultilineError < StandardError; end

  if tests.length > 0
    new << "tests {\n"
    cmt = ""

    iter = 0
    while iter < tests.length
      begin
        test = tests[iter]

        if test =~ /\A\s*#/
          new << " " << test.strip << "\n"
          iter += 1
          next
        end

        re_source = /\A(?: ){0,2}- source: (.*?)(\s*#.*?)?\z/m
        re_expect = /\A(?: ){0,3}expected:[ \t](.*?)(\s*#.*?)?\z/m

        # A block scalar ("|") or an unterminated double-quoted value spans
        # several lines: join them until the next source/expected entry.
        if test.rstrip.end_with?("|") ||
           (test =~ /"/ && !test.rstrip.end_with?('"'))
          while iter < tests.length
            xtest = tests[iter + 1]
            break if xtest =~ re_source || xtest =~ re_expect
            test << "\n" << (xtest || "")
            iter += 1
          end
        end

        case test
        when re_source
          new << " test #{YAML.load($1).inspect}, "
          cmt = $2 if $2
        when re_expect
          new << "#{YAML.load($1).inspect}".unicode_normalize
          new << cmt
          new << $2 if $2
          new << "\n"
          cmt = ""
        when /\A\s*\z/
          # empty line, ignore
        else
          new << "\n# BUG: #{test}\n"
        end

        iter += 1
      rescue Psych::SyntaxError
        # Report the entry YAML could not parse and move on. Without advancing
        # `iter` the very same entry would be retried forever.
        p test
        iter += 1
      end
    end

    new << "}\n\n"
  end

  print "."

  new << "# This map has been partially converted by the bin/maps_v1_to_v2 script\n"
  new << "# The section below requires human attention. Remember to remove this\n"
  new << "# comment and move the converted map to 'maps/' directory. Please also\n"
  new << "# take note that the maps-staging directory will be cleaned up whenever\n"
  new << "# you run the bin/maps_v1_to_v2 script. You should particularly be\n"
  new << "# concerned about any regular expressions found in this file and about\n"
  new << "# advanced expressions in parallel {} parts, and also about the order\n"
  new << "# of particular parts of the stage.\n\n"

  transcription = nil
  title_case = nil
  downcase = nil
  inherit = nil

  # These arrays are never filled; `cur` points at one of them purely to mark
  # which section of the map is being read.
  characters, rules, dictionary, postrules = [], [], [], []

  cur = nil
  indent = 0
  stagedone = false

  # One-shot hooks: efini emits the `run` lines of inherited maps and begn
  # opens the stage; each replaces itself with a no-op after its first call.
  # fini closes whatever section is currently open.
  efini = proc do
    if inherit
      new << "\n"
      inherit.each do |i|
        new << " run map.#{i}.stage.main\n"
      end
    end
    new << "\n"
    efini = proc {}
  end
  fini = proc {}
  begn = proc { new << "stage {\n"; stagedone = true; begn = proc {} }

  iter = -1
  while iter + 1 < map.length
    iter += 1
    i = map[iter]

    if i =~ /\A [^\s#]/ || i =~ /\A inherit:/
      # Section header or top-level option of the map.
      cmt = i[/(#.*?)\z/, 1]
      if cmt
        new << " " * indent << cmt << "\n"
      end

      maybe_val = YAML.load(i.split(":").last)
      case i.split(":").first.strip
      when "inherit"
        inherit = Array(maybe_val)
        inherit = inherit.map do |inh|
          short = inh.split("-")[2..3].join.downcase
          new << "dependency #{inh.inspect}, as: #{short}\n"
          short
        end
        new << "\n"
        raise "Duplicate items" unless inherit.length == inherit.uniq.length
      when "dictionary"
        begn.()
        fini.()
        new << " # DICTIONARY\n"
        new << " parallel {\n"
        indent = 4
        cur = dictionary
        fini = proc { new << " }\n\n"; indent = 2 }
      when "rules"
        begn.()
        fini.()
        efini.()
        new << " # RULES\n"
        indent = 2
        cur = rules
        fini = proc { new << "\n" }
      when "characters"
        begn.()
        fini.()
        efini.()
        new << " # CHARACTERS\n"
        new << " parallel {\n"
        indent = 4
        cur = characters
        fini = proc { new << " }\n\n"; indent = 2 }
      when "postrules"
        begn.()
        fini.()
        efini.()
        new << " # POSTRULES\n"
        indent = 2
        cur = postrules
        fini = proc { new << "\n" }
      when "downcase"
        downcase = maybe_val
      when "title_case"
        title_case = maybe_val
      when "transcription"
        transcription = maybe_val
      when "word_separator", "segmentation", "character_separator", "map"
        # Those we will ignore for now
      when "title-case"
        # Those are bugs
      else
        p i
      end
    else
      # Entry inside the current section.
      cmt = ""
      i = i.sub(/(#.*?)\z/) { cmt << $1; "" }
      #new << " "*indent << i.strip << "\n"
      if i.strip == ""
        new << " " * indent << cmt << "\n"
        next
      end

      if cur.nil?
        raise "Unexpected line #{i}"
      elsif cur.equal?(characters) || cur.equal?(dictionary)
        k, v = i.split(":", 2).map(&:strip)
        if !v || v == ""
          v = ""
          # Load array: the value is the following run of "- item" lines.
          iter2 = iter + 1
          while iter2 < map.length
            i2 = map[iter2]
            break unless i2.strip =~ /\A-/
            i2 = i2.sub(/(#.*?)\z/) { cmt << $1; "" }
            v << "\n" << i2
            iter2 += 1
          end
          iter = iter2 - 1
        end

        kk, vv = YAML.load(k), YAML.load(v)

        kkk, vvv = kk.dup, vv.dup
        # This worked due to use of regexps... it should remove 1 slash.
        kkk = kkk.gsub("\\\\u", "\\\\u")
        kkk = kkk.gsub("\\\\U", "\\\\u")

        new << process_line(kkk, vvv, indent: indent)
      elsif cur.equal?(rules) || cur.equal?(postrules)
        if i.strip =~ /\A- pattern\s*:/
          _, k = i.split(":", 2)
          # A missing following line is treated as empty so that a dangling
          # pattern ends in the "Unexpected(2)" error below rather than in a
          # NoMethodError on nil.
          ii = (map[iter + 1] || "")
          ii = ii.sub(/(#.*?)\z/) { cmt << $1; "" }
          if ii.strip == ""
            # Allow one blank or comment-only line between pattern and result.
            iter += 1
            ii = (map[iter + 1] || "")
            ii = ii.sub(/(#.*?)\z/) { cmt << $1; "" }
          end
          if ii.strip =~ /\Aresult\s*:/
            _, v = ii.split(":", 2)
          else
            raise "Unexpected(2): #{ii.strip.inspect}"
          end
          iter += 1
        else
          raise "Unexpected(1): #{i.strip.inspect}"
        end

        kk, vv = YAML.load(k), YAML.load(v)
        kkk, vvv = kk.dup, vv.dup

        new << process_line(kkk, vvv, indent: indent)
      end

      if cmt != ""
        new << " " << cmt << "\n"
      else
        new << "\n"
      end
    end
  end

  fini.()

  unless stagedone
    new << "stage {\n"
    efini.()
  end

  new << " title_case\n" if title_case
  new << " downcase\n" if downcase

  new << "}\n\n"

  if chain
    new << "# This map is chained and probably depends on seq2seq:\n"
    new << "# #{chain}\n\n"
  end

  if transcription
    new << "# This map contains transcription and probably depends on seq2seq:\n"
    new << "# transcription: #{transcription}\n\n"
  end

  new = new.gsub(/ +$/, '') # Cleanup trailing whitespaces

  #new << map.join("\n")

  File.write("../maps/maps-staging/#{old_map_name}.imp", new)

  puts " done!"
end