interscript 0.1.4 → 2.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (183) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +11 -0
  3. data/.rspec +3 -0
  4. data/Gemfile +29 -0
  5. data/LICENSE.adoc +31 -0
  6. data/README.md +3 -0
  7. data/Rakefile +53 -0
  8. data/bin/console +14 -0
  9. data/bin/interscript +3 -39
  10. data/bin/maps_analyze_staging +168 -0
  11. data/bin/maps_debug_compilers +58 -0
  12. data/bin/maps_debug_ordering +88 -0
  13. data/bin/maps_debug_ruby_compile +24 -0
  14. data/bin/maps_debug_step_by_step +44 -0
  15. data/bin/maps_optimize_order +112 -0
  16. data/bin/maps_v1_analyze_regexps +45 -0
  17. data/bin/maps_v1_to_v2 +426 -0
  18. data/exe/interscript +6 -0
  19. data/interscript.gemspec +31 -0
  20. data/lib/interscript.rb +76 -128
  21. data/lib/interscript/command.rb +6 -5
  22. data/lib/interscript/compiler.rb +22 -0
  23. data/lib/interscript/compiler/javascript.rb +292 -0
  24. data/lib/interscript/compiler/ruby.rb +262 -0
  25. data/lib/interscript/dsl.rb +67 -0
  26. data/lib/interscript/dsl/aliases.rb +23 -0
  27. data/lib/interscript/dsl/document.rb +46 -0
  28. data/lib/interscript/dsl/group.rb +45 -0
  29. data/lib/interscript/dsl/group/parallel.rb +6 -0
  30. data/lib/interscript/dsl/items.rb +89 -0
  31. data/lib/interscript/dsl/metadata.rb +26 -0
  32. data/lib/interscript/dsl/stage.rb +6 -0
  33. data/lib/interscript/dsl/symbol_mm.rb +11 -0
  34. data/lib/interscript/dsl/tests.rb +12 -0
  35. data/lib/interscript/interpreter.rb +251 -0
  36. data/lib/interscript/node.rb +25 -0
  37. data/lib/interscript/node/alias_def.rb +15 -0
  38. data/lib/interscript/node/dependency.rb +13 -0
  39. data/lib/interscript/node/document.rb +45 -0
  40. data/lib/interscript/node/group.rb +34 -0
  41. data/lib/interscript/node/group/parallel.rb +9 -0
  42. data/lib/interscript/node/group/sequential.rb +2 -0
  43. data/lib/interscript/node/item.rb +52 -0
  44. data/lib/interscript/node/item/alias.rb +42 -0
  45. data/lib/interscript/node/item/any.rb +61 -0
  46. data/lib/interscript/node/item/capture.rb +50 -0
  47. data/lib/interscript/node/item/group.rb +51 -0
  48. data/lib/interscript/node/item/repeat.rb +40 -0
  49. data/lib/interscript/node/item/stage.rb +23 -0
  50. data/lib/interscript/node/item/string.rb +51 -0
  51. data/lib/interscript/node/metadata.rb +18 -0
  52. data/lib/interscript/node/rule.rb +6 -0
  53. data/lib/interscript/node/rule/funcall.rb +18 -0
  54. data/lib/interscript/node/rule/run.rb +15 -0
  55. data/lib/interscript/node/rule/sub.rb +65 -0
  56. data/lib/interscript/node/stage.rb +19 -0
  57. data/lib/interscript/node/tests.rb +15 -0
  58. data/lib/interscript/stdlib.rb +211 -0
  59. data/lib/interscript/utils/regexp_converter.rb +283 -0
  60. data/lib/interscript/version.rb +1 -1
  61. data/requirements.txt +1 -0
  62. metadata +73 -223
  63. data/README.adoc +0 -297
  64. data/bin/rspec +0 -29
  65. data/lib/g2pwrapper.py +0 -34
  66. data/lib/interscript/mapping.rb +0 -125
  67. data/lib/model-7 +0 -0
  68. data/lib/tha-pt-b-7 +0 -0
  69. data/maps/acadsin-zho-Hani-Latn-2002.yaml +0 -38912
  70. data/maps/alalc-aze-Cyrl-Latn-1997.yaml +0 -141
  71. data/maps/alalc-bel-cyrl-latn-1997.yaml +0 -125
  72. data/maps/alalc-ben-Beng-Latn-2017.yaml +0 -130
  73. data/maps/alalc-bul-Cyrl-Latn-1997.yaml +0 -94
  74. data/maps/alalc-ell-Grek-Latn-1997.yaml +0 -625
  75. data/maps/alalc-ell-Grek-Latn-2010.yaml +0 -628
  76. data/maps/alalc-kat-Geok-Latn-1997.yaml +0 -112
  77. data/maps/alalc-kat-Geor-Latn-1997.yaml +0 -146
  78. data/maps/alalc-kor-Hang-Latn-1997.yaml +0 -94
  79. data/maps/alalc-mkd-Cyrl-Latn-2013.yaml +0 -103
  80. data/maps/alalc-mkd-cyrl-latn-1997.yaml +0 -114
  81. data/maps/alalc-rus-Cyrl-Latn-1997.yaml +0 -222
  82. data/maps/alalc-rus-Cyrl-Latn-2012.yaml +0 -162
  83. data/maps/alalc-srp-Cyrl-Latn-1997.yaml +0 -114
  84. data/maps/alalc-srp-cyrl-latn-2013.yaml +0 -135
  85. data/maps/alalc-ukr-Cyrl-Latn-1997.yaml +0 -141
  86. data/maps/alalc-ukr-Cyrl-Latn-2011.yaml +0 -16
  87. data/maps/apcbg-bul-Cyrl-Latn-1995.yaml +0 -283
  88. data/maps/bas-rus-Cyrl-Latn-2017-bss.yaml +0 -175
  89. data/maps/bas-rus-Cyrl-Latn-2017-oss.yaml +0 -169
  90. data/maps/bgn-jpn-Hrkt-Latn-1962.yaml +0 -294
  91. data/maps/bgn-kor-Hang-Latn-1943.yaml +0 -31
  92. data/maps/bgn-kor-Kore-Latn-1943.yaml +0 -31
  93. data/maps/bgna-bul-Cyrl-Latn-2006.yaml +0 -208
  94. data/maps/bgna-bul-Cyrl-Latn-2009.yaml +0 -208
  95. data/maps/bgnpcgn-arm-Armn-Latn-1981.yaml +0 -108
  96. data/maps/bgnpcgn-aze-Cyrl-Latn-1993.yaml +0 -104
  97. data/maps/bgnpcgn-bak-Cyrl-Latn-2007.yaml +0 -184
  98. data/maps/bgnpcgn-bel-cyrl-latn-1979.yaml +0 -285
  99. data/maps/bgnpcgn-bul-Cyrl-Latn-1952.yaml +0 -115
  100. data/maps/bgnpcgn-bul-Cyrl-Latn-2013.yaml +0 -38
  101. data/maps/bgnpcgn-chn-Hans-Latn-1979.yaml +0 -7456
  102. data/maps/bgnpcgn-ell-Grek-Latn-1962.yaml +0 -702
  103. data/maps/bgnpcgn-ell-Grek-Latn-1996.yaml +0 -20
  104. data/maps/bgnpcgn-jpn-Hrkt-Latn-1976.yaml +0 -257
  105. data/maps/bgnpcgn-kat-Geor-Latn-1981.yaml +0 -127
  106. data/maps/bgnpcgn-kat-Geor-Latn-2009.yaml +0 -43
  107. data/maps/bgnpcgn-kor-Hang-Latn-kn-1945.yaml +0 -253
  108. data/maps/bgnpcgn-kor-Hang-Latn-rok-2011.yaml +0 -48
  109. data/maps/bgnpcgn-kor-Kore-Latn-rok-2011.yaml +0 -48
  110. data/maps/bgnpcgn-mkd-Cyrl-Latn-1981.yaml +0 -159
  111. data/maps/bgnpcgn-mkd-Cyrl-Latn-2013.yaml +0 -190
  112. data/maps/bgnpcgn-per-Arab-Latn-1956.yaml +0 -93
  113. data/maps/bgnpcgn-rus-Cyrl-Latn-1947.yaml +0 -314
  114. data/maps/bgnpcgn-srp-Cyrl-Latn-2005.yaml +0 -166
  115. data/maps/bgnpcgn-ukr-Cyrl-Latn-1965.yaml +0 -163
  116. data/maps/bgnpcgn-ukr-Cyrl-Latn-2019.yaml +0 -208
  117. data/maps/by-bel-Cyrl-Latn-1998.yaml +0 -168
  118. data/maps/by-bel-Cyrl-Latn-2007.yaml +0 -115
  119. data/maps/elot-ell-Grek-Latn-743-1982-tl.yaml +0 -685
  120. data/maps/elot-ell-Grek-Latn-743-1982-ts.yaml +0 -681
  121. data/maps/elot-ell-Grek-Latn-743-2001-tl.yaml +0 -20
  122. data/maps/elot-ell-Grek-Latn-743-2001-ts.yaml +0 -32
  123. data/maps/ggg-kat-Geor-Latn-2002.yaml +0 -89
  124. data/maps/gki-bel-cyrl-latn-1992.yaml +0 -33
  125. data/maps/gki-bel-cyrl-latn-2000.yaml +0 -201
  126. data/maps/gost-rus-cyrl-latn-16876-71-1983.yaml +0 -186
  127. data/maps/hk-yue-Hani-Latn-1888.yaml +0 -38497
  128. data/maps/icao-bel-Cyrl-Latn-9303.yaml +0 -141
  129. data/maps/icao-bul-Cyrl-Latn-9303.yaml +0 -122
  130. data/maps/icao-heb-Hebr-Latn-9303.yaml +0 -151
  131. data/maps/icao-mkd-Cyrl-Latn-9303.yaml +0 -117
  132. data/maps/icao-per-Arab-Latn-9303.yaml +0 -104
  133. data/maps/icao-rus-Cyrl-Latn-9303.yaml +0 -118
  134. data/maps/icao-srp-Cyrl-Latn-9303.yaml +0 -117
  135. data/maps/icao-ukr-Cyrl-Latn-9303.yaml +0 -120
  136. data/maps/iso-ell-Grek-Latn-843-1997-t1.yaml +0 -610
  137. data/maps/iso-ell-Grek-Latn-843-1997-t2.yaml +0 -41
  138. data/maps/iso-jpn-Hrkt-Latn-3602-1989.yaml +0 -62
  139. data/maps/iso-rus-Cyrl-Latn-9-1995.yaml +0 -272
  140. data/maps/iso-tha-Thai-Latn-11940-1998.yaml +0 -109
  141. data/maps/kp-kor-Hang-Latn-2002.yaml +0 -901
  142. data/maps/lshk-yue-Hani-Latn-jyutping-1993.yaml +0 -44820
  143. data/maps/mext-jpn-Hrkt-Latn-1954.yaml +0 -411
  144. data/maps/moct-kor-Hang-Latn-2000.yaml +0 -803
  145. data/maps/mofa-jpn-Hrkt-Latn-1989.yaml +0 -541
  146. data/maps/mvd-bel-Cyrl-Latn-2008.yaml +0 -225
  147. data/maps/mvd-bel-Cyrl-Latn-2010.yaml +0 -63
  148. data/maps/mvd-rus-Cyrl-Latn-2008.yaml +0 -110
  149. data/maps/mvd-rus-Cyrl-Latn-2010.yaml +0 -37
  150. data/maps/nil-kor-Hang-Hang-jamo.yaml +0 -11193
  151. data/maps/odni-bel-Cyrl-Latn-2015.yaml +0 -148
  152. data/maps/odni-bul-Cyrl-Latn-2015.yaml +0 -96
  153. data/maps/odni-kat-Geor-Latn-2015.yaml +0 -88
  154. data/maps/odni-rus-Cyrl-Latn-2015.yaml +0 -77
  155. data/maps/odni-srp-Cyrl-Latn-2015.yaml +0 -129
  156. data/maps/odni-ukr-Cyrl-Latn-2015.yaml +0 -157
  157. data/maps/odni-uzb-Cyrl-Latn-2015.yaml +0 -167
  158. data/maps/royin-tha-Thai-Latn-1939-generic.yaml +0 -90
  159. data/maps/royin-tha-Thai-Latn-1968.yaml +0 -179
  160. data/maps/royin-tha-Thai-Latn-1999-chained.yaml +0 -180
  161. data/maps/royin-tha-Thai-Latn-1999.yaml +0 -76
  162. data/maps/sac-zho-Hans-Latn-1979.yaml +0 -24759
  163. data/maps/stategeocadastre-ukr-Cyrl-Latn-1993.yaml +0 -222
  164. data/maps/ua-ukr-Cyrl-Latn-1996.yaml +0 -193
  165. data/maps/un-bel-Cyrl-Latn-2007.yaml +0 -114
  166. data/maps/un-ben-Beng-Latn-2016.yaml +0 -534
  167. data/maps/un-ell-Grek-Latn-1987-tl.yaml +0 -32
  168. data/maps/un-ell-Grek-Latn-1987-ts.yaml +0 -20
  169. data/maps/un-ell-Grek-Latn-phonetic-1987.yaml +0 -780
  170. data/maps/un-mon-Mong-Latn-2013.yaml +0 -93
  171. data/maps/un-rus-Cyrl-Latn-1987.yaml +0 -166
  172. data/maps/un-ukr-cyrl-latn-1998.yaml +0 -30
  173. data/maps/var-jpn-Hrkt-Latn-hepburn-1886.yaml +0 -406
  174. data/maps/var-jpn-Hrkt-Latn-hepburn-1954.yaml +0 -386
  175. data/maps/var-kor-Hang-Latn-mr-1939.yaml +0 -1054
  176. data/maps/var-kor-Kore-Hang-2013.yaml +0 -59754
  177. data/maps/var-kor-Kore-Latn-mr-1939.yaml +0 -37
  178. data/maps/var-tha-Thai-Thai-phonemic.yaml +0 -59
  179. data/maps/var-tha-Thai-Zsym-ipa.yaml +0 -301
  180. data/maps/var-zho-Hani-Latn-1979.yaml +0 -38908
  181. data/spec/interscript/mapping_spec.rb +0 -42
  182. data/spec/interscript_spec.rb +0 -26
  183. data/spec/spec_helper.rb +0 -3
@@ -0,0 +1,18 @@
1
# Node that wraps a `data` store (an empty Hash unless one is supplied) and
# exposes key-based read/write access to it.
class Interscript::Node::MetaData < Interscript::Node
  attr_accessor :data

  # data - backing store; a fresh Hash is created per call when omitted.
  def initialize(data = {})
    @data = data
  end

  # Stores `value` under `key` in the backing store.
  def []=(key, value)
    @data[key] = value
  end

  # Returns whatever the backing store holds under `key`.
  def [](key)
    @data[key]
  end

  # Serializes the node as its class name plus the raw backing store.
  def to_hash
    {
      class: self.class.to_s,
      data: @data
    }
  end
end
@@ -0,0 +1,6 @@
1
# Empty common superclass for rule nodes; the concrete rules (sub, run,
# funcall) are loaded by the requires that follow this definition.
class Interscript::Node::Rule < Interscript::Node; end
3
+
4
+ require "interscript/node/rule/sub"
5
+ require "interscript/node/rule/run"
6
+ require "interscript/node/rule/funcall"
@@ -0,0 +1,18 @@
1
# Rule node recording a function name together with the keyword arguments
# it was given.
class Interscript::Node::Rule::Funcall < Interscript::Node::Rule
  attr_accessor :name, :kwargs

  # name   - identifier stored as-is
  # kwargs - any keyword arguments, collected into a Hash
  def initialize(name, **kwargs)
    @name = name
    @kwargs = kwargs
  end

  # Serializes the node as class name, function name and keyword arguments.
  def to_hash
    {
      class: self.class.to_s,
      name: name,
      kwargs: kwargs
    }
  end

  # Renders "<name> <kwargs>" where the kwargs part is Hash#inspect with its
  # first and last character (the surrounding braces) removed.
  def inspect
    rendered_kwargs = kwargs.inspect[1..-2]
    "#{@name} #{rendered_kwargs}"
  end
end
@@ -0,0 +1,15 @@
1
# Rule node that holds a reference to a `stage` object.
class Interscript::Node::Rule::Run < Interscript::Node::Rule
  attr_accessor :stage

  # stage - object stored as-is; it must respond to #to_hash for #to_hash
  #         below to work.
  def initialize(stage)
    @stage = stage
  end

  # Serializes the node as class name plus the serialized stage.
  def to_hash
    {
      class: self.class.to_s,
      stage: stage.to_hash
    }
  end

  # Renders "run <stage.inspect>".
  def inspect
    stage_repr = @stage.inspect
    "run #{stage_repr}"
  end
end
@@ -0,0 +1,65 @@
1
# Substitution rule node: replaces `from` with `to`, optionally constrained by
# the `before` / `not_before` / `after` / `not_after` context items and
# carrying an optional `priority`.
class Interscript::Node::Rule::Sub < Interscript::Node::Rule
  attr_accessor :from, :to
  attr_accessor :before, :not_before, :after, :not_after
  attr_accessor :priority

  # from, to     - converted through Interscript::Node::Item.try_convert,
  #                except that `to` may be the symbol :upcase, which is kept
  #                verbatim.
  # before, not_before, after, not_after
  #              - optional context; converted through try_convert only when
  #                given (otherwise the attribute stays nil).
  # priority     - stored as-is; see #max_length for how it is used.
  def initialize(from, to, before: nil, not_before: nil, after: nil, not_after: nil, priority: nil)
    self.from = Interscript::Node::Item.try_convert(from)
    self.to = to == :upcase ? :upcase : Interscript::Node::Item.try_convert(to)

    self.priority = priority

    #raise TypeError, "Can't supply both before and not_before" if before && not_before
    #raise TypeError, "Can't supply both after and not_after" if after && not_after

    self.before = Interscript::Node::Item.try_convert(before) if before
    self.after = Interscript::Node::Item.try_convert(after) if after
    self.not_before = Interscript::Node::Item.try_convert(not_before) if not_before
    self.not_after = Interscript::Node::Item.try_convert(not_after) if not_after
  end

  # Sum of the max_length of `from` and of every context item that is set;
  # `priority`, when set, is added on top of that sum.
  def max_length
    len = from.max_length
    [before, after, not_before, not_after].each do |item|
      len += item.max_length if item
    end
    len += priority if priority
    len
  end

  # Serializes the rule. Unset context items serialize as nil; `to` is kept
  # as :upcase when it is that symbol.
  def to_hash
    if $DEBUG
      puts self.from.inspect
      # The original printed `params.inspect`, but `params` is only a local
      # of #inspect and is not defined in this method, so running with
      # $DEBUG raised NameError. Print the rule's own rendering instead.
      puts inspect
    end
    { :class => self.class.to_s,
      :from => self.from.to_hash,
      :to => self.to == :upcase ? :upcase : self.to.to_hash,
      :before => self.before&.to_hash,
      :not_before => self.not_before&.to_hash,
      :after => self.after&.to_hash,
      :not_after => self.not_after&.to_hash,
      :priority => self.priority
    }
  end

  # Renders the rule as "sub <from>, <to>[, before: …][, after: …]
  # [, not_before: …][, not_after: …][, priority: …]".
  def inspect
    params = []
    params << @from.inspect
    params << (@to == :upcase ? "upcase" : @to.inspect)
    params << "before: #{@before.inspect}" if @before
    params << "after: #{@after.inspect}" if @after
    params << "not_before: #{@not_before.inspect}" if @not_before
    params << "not_after: #{@not_after.inspect}" if @not_after
    params << "priority: #{@priority.inspect}" if @priority
    "sub " + params.join(", ")
  end
end
@@ -0,0 +1,19 @@
1
# A named sequential group; the name defaults to :main.
class Interscript::Node::Stage < Interscript::Node::Group::Sequential
  attr_accessor :name, :doc_name

  # name - stage identifier (defaults to :main). The superclass initializer
  #        is invoked with no arguments afterwards.
  def initialize(name = :main)
    @name = name
    super()
  end

  # Serializes the stage as class name, stage name and serialized children.
  def to_hash
    {
      class: self.class.to_s,
      name: name,
      children: @children.map { |child| child.to_hash }
    }
  end

  # Renders "stage {…}" for the :main stage and "stage(<name>) {…}" for any
  # other, wrapping the superclass rendering in braces.
  def inspect
    label = @name == :main ? nil : "(#{@name})"
    "stage#{label} {\n#{super}\n}"
  end
end
@@ -0,0 +1,15 @@
1
# Node that accumulates entries in a `data` list (an empty Array unless one
# is supplied).
class Interscript::Node::Tests < Interscript::Node
  attr_accessor :data

  # data - backing list; a fresh Array is created per call when omitted.
  def initialize(data = [])
    @data = data
  end

  # Appends `pair` to the backing list and returns that list.
  def <<(pair)
    @data << pair
  end

  # Serializes the node as its class name plus the raw backing list.
  def to_hash
    {
      class: self.class.to_s,
      data: @data
    }
  end
end
@@ -0,0 +1,211 @@
1
# Support routines: named aliases, several "parallel replace" strategies
# (regexp based, hash based, tree based) and the functions listed by
# .available_functions.
class Interscript::Stdlib
  # Alias name => the string it expands to. Most values are regexp source
  # fragments; :none and :space are plain text (see .re_only_alias?).
  ALIASES = {
    any_character: '.',
    none: "",
    space: " ",
    whitespace: "[\\b \\t\\0\\r\\n]",
    boundary: "\\b",
    non_word_boundary: "\\B",
    word: "\\w",
    not_word: "\\W",
    alpha: "[a-zA-Z]",
    not_alpha: "[^a-zA-Z]",
    digit: "\\d",
    not_digit: "\\D",
    line_start: "^",
    line_end: "$",
    string_start: "\\A",
    string_end: "\\z"
  }

  # True for every alias name except :none and :space.
  def self.re_only_alias?(a)
    !%i[none space].include?(a)
  end

  # Cache for .parallel_replace_compile_tree, keyed by the input's #hash.
  @treecache = {}

  # Builds one alternation regexp out of the first element of every entry of
  # `subs_hash`; the i-th alternative is wrapped in a group named "_<i>".
  def self.parallel_regexp_compile(subs_hash)
    alternatives = subs_hash.each_with_index.map do |pair, idx|
      "(?<_%d>%s)" % [idx, pair[0]]
    end
    Regexp.compile(alternatives.join("|"))
  end

  # Replaces every match of `subs_regexp` in `string` with `subs_hash[idx]`,
  # where idx is the number encoded in the name ("_<idx>") of the first named
  # group that participated in the match.
  def self.parallel_regexp_gsub(string, subs_regexp, subs_hash)
    string.gsub(subs_regexp) do |_match|
      lm = Regexp.last_match
      # Extract the match name
      idx = lm.named_captures.compact.keys.first[1..-1].to_i
      subs_hash[idx]
    end
  end

  # Same replacement as .parallel_regexp_gsub, but additionally records every
  # match (offsets, index, result) in the globals $subs_matches/$subs_regexp.
  def self.parallel_regexp_gsub_debug(string, subs_regexp, subs_array)
    # only gathering debug info, test data is available in maps_analyze_staging
    $subs_matches = []
    $subs_regexp = subs_regexp
    string.gsub(subs_regexp) do |_match|
      lm = Regexp.last_match
      # Extract the match name
      matched = lm.named_captures.compact.keys.first
      idx = matched[1..-1].to_i
      debug_info = {begin: lm.begin(matched), end: lm.end(matched), idx: idx, result: subs_array[idx]}
      $subs_matches << debug_info
      subs_array[idx]
    end
  end

  # Turns an enumerable of (from, to) pairs into a Hash; later pairs win.
  def self.parallel_replace_compile_hash(a)
    a.each_with_object({}) do |(from, to), h|
      h[from] = to
    end
  end

  # Scans `str` left to right and, at each position, replaces the longest
  # substring that is a key of `h` (with a truthy value) by that value;
  # characters that start no key are copied through unchanged.
  def self.parallel_replace_hash(str, h)
    newstr = +""
    len = str.length
    # `|| 0` covers an empty hash: the original called nil.downto and crashed.
    max_key_len = h.keys.map(&:length).max || 0
    i = 0
    while i < len
      replaced = false
      max_key_len.downto(1) do |checked_len|
        substr = str[i, checked_len]
        repl = h[substr]
        next unless repl

        newstr << repl
        i += substr.length
        replaced = true
        # Stop at the first (longest) hit. The original kept iterating over
        # the shorter lengths with the already-advanced index, which copied
        # or replaced characters belonging to the *next* position.
        break
      end
      next if replaced

      newstr << str[i, 1]
      i += 1
    end
    newstr
  end

  # hash can be either a hash or a hash-like array
  #
  # Compiles the (from, to) pairs into a nested Hash keyed by character
  # codepoint; the replacement is stored under the `nil` key of the node
  # reached by the last character. `from` may be a single string or an array
  # of strings. Results are cached by `hash.hash`.
  def self.parallel_replace_compile_tree(hash)
    hh = hash.hash
    cached = @treecache[hh]
    return cached if cached

    tree = {}
    hash.each do |from, to|
      Array(from).each do |f|
        branch = tree
        chars = f.split("")
        chars[0..-2].each do |c|
          branch[c.ord] ||= {}
          branch = branch[c.ord]
        end
        branch[chars.last.ord] ||= {}
        branch[chars.last.ord][nil] = to
      end
    end
    @treecache[hh] = tree
  end

  # Applies a tree built by .parallel_replace_compile_tree: at every position
  # the longest path through the tree that ends in a replacement wins;
  # otherwise the character is copied through.
  def self.parallel_replace_tree(str, tree)
    newstr = +""
    len = str.length
    i = 0
    while i < len
      branch = tree
      walked = +""
      match = nil
      repl = nil

      j = 0
      while j < len - i
        cc = str[i + j]
        break unless branch.include?(cc.ord)

        branch = branch[cc.ord]
        walked << cc
        if branch.include?(nil)
          # Remember the longest complete key seen so far.
          match = walked.dup
          repl = branch[nil]
        end
        j += 1
      end

      if match
        i += match.length
        newstr << repl
      else
        newstr << str[i]
        i += 1
      end
    end
    newstr
  end

  # Convenience wrapper: compile (or fetch the cached) tree, then apply it.
  def self.parallel_replace(str, hash)
    tree = parallel_replace_compile_tree(hash)
    parallel_replace_tree(str, tree)
  end

  # On Windows at least, sort_by is non-deterministic. Let's add some determinism
  # to our efforts.
  #
  # Sorts by descending max_length; ties keep their original relative order.
  def self.deterministic_sort_by_max_length(ary)
    # A composite [key, index] sort key gives the same order as the former
    # `-max_length*100000 + idx` arithmetic, without silently breaking once
    # the list holds 100000 or more entries.
    ary.each_with_index.sort_by { |rule, idx| [-rule.max_length, idx] }.map(&:first)
  end

  # Names of the functions implemented in the Functions module below.
  def self.available_functions
    %i[title_case downcase compose decompose separate secryst]
  end

  module Functions
    # Upcases the first character of every line and, unless `word_separator`
    # is empty, every "<separator><char>" occurrence. The separator is
    # interpolated into a regexp as-is.
    def self.title_case(output, word_separator: " ")
      output = output.gsub(/^(.)/, &:upcase)
      output = output.gsub(/#{word_separator}(.)/, &:upcase) unless word_separator == ''
      output
    end

    def self.downcase(output, _:nil)
      output.downcase
    end

    # Unicode NFC normalization.
    def self.compose(output, _:nil)
      output.unicode_normalize(:nfc)
    end

    # Unicode NFD normalization.
    def self.decompose(output, _:nil)
      output.unicode_normalize(:nfd)
    end

    # Inserts `separator` between every pair of adjacent characters.
    def self.separate(output, separator: " ")
      output.split("").join(separator)
    end

    # Translator instances, memoized per model.
    @secryst_models = {}

    # Translates `output` line by line with a Secryst translator for `model`.
    # Raises StandardError with a pointer to the docs when Secryst cannot be
    # loaded.
    def self.secryst(output, model:)
      begin
        require "secryst"
      rescue LoadError, StandardError
        # Try to load secryst, but don't fail hard if not possible.
        # LoadError must be named explicitly: it is a ScriptError, so the
        # former `require ... rescue nil` modifier never caught it and the
        # explanatory error below was unreachable when the gem was missing.
      end
      unless defined? Secryst
        raise StandardError, "Secryst is not loaded. Please read docs/Usage_with_Secryst.adoc"
      end
      Interscript.secryst_index_locations.each do |remote|
        Secryst::Provisioning.add_remote(remote)
      end
      @secryst_models[model] ||= Secryst::Translator.new(model_file: model)
      output.split("\n").map(&:chomp).map do |i|
        @secryst_models[model].translate(i)
      end.join("\n")
    end
  end
end
@@ -0,0 +1,283 @@
1
+ require 'regexp_parser'
2
+
3
+
4
# Recursively converts a parsed regexp node into a nested structure of
# marker symbols (e.g. :capture_start / :capture_stop), strings and arrays.
# Node classes without a dedicated branch become [:missing, <class>, ...].
# A quantifier on the node wraps (or, for a fixed-count interval, repeats)
# the converted result.
def process(node)
  children = nil
  if node.respond_to?(:expressions) && node.expressions
    children = node.expressions.map { |expr| process(expr) }
  end

  # Builds [:<name>_start, children, :<name>_stop].
  wrap = lambda do |name|
    ["#{name}_start".to_sym, children, "#{name}_stop".to_sym]
  end

  out =
    case node
    when Regexp::Expression::Root
      children
    when Regexp::Expression::Assertion::Lookbehind
      wrap.call(:lookbehind)
    when Regexp::Expression::Assertion::NegativeLookbehind
      wrap.call(:negative_lookbehind)
    when Regexp::Expression::Assertion::Lookahead
      wrap.call(:lookahead)
    when Regexp::Expression::Assertion::NegativeLookahead
      wrap.call(:negative_lookahead)
    when Regexp::Expression::Group::Capture
      wrap.call(:capture)
    when Regexp::Expression::CharacterSet
      # A set containing a range gets an extra array wrapper.
      if children.flatten.include?(:range_start)
        [:characterset_start, :array_start, children, :array_stop, :characterset_stop]
      else
        wrap.call(:characterset)
      end
    when Regexp::Expression::Alternation
      wrap.call(:alternation)
    when Regexp::Expression::Alternative
      wrap.call(:alternative)
    when Regexp::Expression::CharacterSet::Range
      range_from = node.expressions[0].text
      range_to = node.expressions[1].text
      [:range_start, range_from, :range_mid, range_to, :range_stop]
    when Regexp::Expression::Anchor::WordBoundary
      :boundary
    when Regexp::Expression::Anchor::NonWordBoundary
      :non_word_boundary
    when Regexp::Expression::EscapeSequence::Backspace
      :boundary # most probably boundary
    when Regexp::Expression::CharacterType::Space
      :space
    when Regexp::Expression::Anchor::BeginningOfLine
      :line_start
    when Regexp::Expression::Anchor::EndOfLine
      :line_end
    when Regexp::Expression::CharacterType::Any
      :any_character
    when Regexp::Expression::Literal,
         Regexp::Expression::EscapeSequence::Literal,
         Regexp::Expression::EscapeSequence::Codepoint
      node.text
    when Regexp::Expression::PosixClass
      '[' + node.text + ']'
    when Regexp::Expression::UnicodeProperty::Script
      node.text
    when Regexp::Expression::Backreference::Number # why is there a space before after node.number?
      [:backref_num_start, node.number, :backref_num_stop]
    else
      unhandled = [:missing, node.class]
      unhandled << children if node.respond_to?(:expressions)
      # TODO add quantifier support
      pp node if node.respond_to?(:quantifier) && node.quantifier
      unhandled
    end

  quantifier = node.respond_to?(:quantifier) ? node.quantifier : nil
  return out unless quantifier

  if quantifier.token.to_s == "interval" && quantifier.max == quantifier.min
    # Fixed repetition count: repeat the converted node that many times.
    [out] * quantifier.max
  else
    qname = quantifier.token.to_s
    ["#{qname}_start".to_sym, [out], "#{qname}_stop".to_sym]
  end
end
82
+
83
# Splits the top-level lookaround groups out of a converted root array.
#
# node - Array of converted elements (arrays, strings, symbols). Elements
#        whose first item is one of the lookaround start markers are
#        extracted:
#          :lookbehind_start          -> :before
#          :negative_lookbehind_start -> :not_before
#          :lookahead_start           -> :after
#          :negative_lookahead_start  -> :not_after
#
# Returns a Hash with an entry for each lookaround kind that occurred (a
# single occurrence stores its children; several occurrences are combined
# into an alternation structure) plus :from, a copy of `node` with the
# extracted elements removed. `node` itself is not modified.
def process_root(node)
  lookarounds = {
    before: :lookbehind_start,
    not_before: :negative_lookbehind_start,
    after: :lookahead_start,
    not_after: :negative_lookahead_start
  }

  remaining = node.dup
  root = {}

  lookarounds.each do |key, marker|
    found = node.select { |x| x[0] == marker }
    next if found.empty?

    root[key] =
      if found.size == 1
        # Element layout is [marker_start, children, marker_stop].
        found[0][1]
      else
        # Several lookarounds of the same kind: offer them as alternatives.
        combined = [:alternation_start]
        combined << found.map { |x| [:alternative_start, x[1], :alternative_stop] }
        combined << [:alternation_stop]
        combined
      end

    found.each { |n| remaining.delete(n) }
  end

  root[:from] = remaining
  root
end
162
+
163
# Renders a converted node structure as source text.
#
# node - nested Array of marker symbols and strings; it is flattened first.
#
# Known marker symbols are replaced through the `subs` table, strings are
# double-quoted (adjacent strings are merged into one literal), and any other
# token is emitted as " <token.inspect> ". " + " is inserted between adjacent
# operands and ", " between adjacent alternatives. Finally every "\\u" (two
# backslashes) in the result is collapsed to "\u".
#
# Returns the rendered String.
def stringify(node)
  tokens = node.flatten
  subs = {
    characterset_start: 'any(',
    characterset_stop: ')',
    array_start: '[',
    array_stop: ']',
    capture_start: 'capture(',
    capture_stop: ')',
    zero_or_one_start: 'maybe(',
    zero_or_one_stop: ')',
    zero_or_more_start: 'maybe_some(',
    zero_or_more_stop: ')',
    one_or_more_start: 'some(',
    one_or_more_stop: ')',
    alternation_start: 'any([',
    alternation_stop: '])',
    alternative_start: '',
    alternative_stop: '',
    boundary: 'boundary',
    non_word_boundary: 'non_word_boundary',
    space: 'space',
    line_start: 'line_start',
    line_end: 'line_end',
    any_character: 'any_character',
    range_start: 'any(',
    range_mid: '..',
    range_stop: ')',
    backref_num_start: 'ref(',
    backref_num_stop: ')'
  }

  # Tokens that end an operand (left_side) or begin one (right_side). These
  # tables do not depend on the token being visited, so they are built once
  # here rather than on every loop iteration.
  left_side = %i[characterset_stop capture_stop
                 zero_or_one_stop zero_or_more_stop one_or_more_stop
                 boundary non_word_boundary
                 line_start any_character range_stop space
                 backref_num_stop]
  right_side = %i[characterset_start capture_start
                  zero_or_one_start zero_or_more_start one_or_more_start
                  boundary non_word_boundary
                  line_end any_character range_start space
                  backref_num_start]

  str = []
  prev = nil # token visited in the previous iteration; nil for the first one
  tokens.each do |token|
    prev_is_string = prev.instance_of?(String)
    token_is_string = token.instance_of?(String)

    if (prev_is_string && right_side.include?(token)) ||
       (left_side.include?(prev) && (token_is_string || right_side.include?(token)))
      str << ' + '
    end
    str << ', ' if prev == :alternative_stop && token == :alternative_start

    if subs.include?(token)
      str << subs[token]
    elsif token_is_string
      if prev_is_string
        # Merge into the previous literal: drop its closing quote, append.
        str[-1] = "#{str[-1][0..-2]}#{token}\""
      else
        str << "\"#{token}\""
      end
    else
      str << " #{token.inspect} "
    end

    prev = token
  end
  str.join.gsub('\\\\u', '\\u')
end
235
+
236
# Renders a root Hash (:from, :to and the optional :before / :not_before /
# :after / :not_after entries) as one `sub ...` line, indented by `indent`
# spaces. When the rendered text contains constructs flagged below, the line
# is turned into a comment with a trailing warning; several warnings nest.
# An empty root[:from] is replaced in place by [""].
def stringify_root(root, indent: 0)
  pad = " " * indent
  root[:from] = [""] if root[:from] == []

  str = "#{pad}sub #{stringify(root[:from])}, #{root[:to]}"
  %i[before not_before after not_after].each do |look|
    context = root[look]
    next unless context

    rendered = context == [] ? "\"\"" : stringify(context)
    str << ", #{look}: #{rendered}"
  end

  # Re-wraps the current line as a comment followed by `note`.
  comment_out = ->(note) { str = "#{pad}# #{str} # #{note}" }

  # A ':' not preceded by '[' and not followed by ' ' or ']'.
  comment_out.call("warning: :") if str.match?(/[^\[]:[^ \]]/)

  comment_out.call("warning: :missing unimplemented") if str.include?(':missing')
  comment_out.call("warning: :interval unimplemented") if str.include?(':interval')
  comment_out.call("warning: :adding_ranges unimplemented") if str.include?(':adding_ranges')
  if str.include?('zero_or_one')
    comment_out.call("warning: zero_or_one")
    puts "str.includes 'zero_or_one'"
    pp root
  end
  comment_out.call("warning: :lookahead_start") if str.include?(':lookahead_start')
  str
end
267
+
268
# Script mode: convert every line of docs/utils/regexp_examples.txt and print
# the intermediate structures followed by the generated `sub` line.
if __FILE__ == $0
  examples_path = __dir__ + "/../../docs/utils/regexp_examples.txt"
  # File.read closes the file itself; the former File.open(...).read left the
  # handle open until garbage collection.
  rs = File.read(examples_path).gsub(/([^\\^])\\u/, '\\1\\\\u').gsub(/\\\\b/, '\b')
  rs.split("\n").each do |r|
    puts r
    tree = Regexp::Parser.parse(r, 'ruby/2.1')
    conv = process(tree)
    pp conv
    root = process_root(conv)
    pp root
    root[:to] = ['X'] # placeholder replacement for the generated rule
    puts stringify_root(root)
    puts "\n\n"
  end
end