interscript 0.1.5 → 2.1.0a8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (200) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +11 -0
  3. data/.rspec +3 -0
  4. data/Gemfile +29 -0
  5. data/LICENSE.adoc +31 -0
  6. data/README.md +3 -0
  7. data/Rakefile +53 -0
  8. data/bin/console +14 -0
  9. data/bin/interscript +3 -39
  10. data/bin/maps_analyze_staging +168 -0
  11. data/bin/maps_debug_compilers +58 -0
  12. data/bin/maps_debug_ordering +88 -0
  13. data/bin/maps_debug_ruby_compile +24 -0
  14. data/bin/maps_debug_step_by_step +44 -0
  15. data/bin/maps_optimize_order +112 -0
  16. data/bin/maps_v1_analyze_regexps +45 -0
  17. data/bin/maps_v1_to_v2 +426 -0
  18. data/exe/interscript +6 -0
  19. data/interscript.gemspec +31 -0
  20. data/lib/interscript.rb +81 -123
  21. data/lib/interscript/command.rb +5 -5
  22. data/lib/interscript/compiler.rb +22 -0
  23. data/lib/interscript/compiler/javascript.rb +292 -0
  24. data/lib/interscript/compiler/ruby.rb +262 -0
  25. data/lib/interscript/dsl.rb +67 -0
  26. data/lib/interscript/dsl/aliases.rb +23 -0
  27. data/lib/interscript/dsl/document.rb +46 -0
  28. data/lib/interscript/dsl/group.rb +45 -0
  29. data/lib/interscript/dsl/group/parallel.rb +6 -0
  30. data/lib/interscript/dsl/items.rb +89 -0
  31. data/lib/interscript/dsl/metadata.rb +26 -0
  32. data/lib/interscript/dsl/stage.rb +6 -0
  33. data/lib/interscript/dsl/symbol_mm.rb +11 -0
  34. data/lib/interscript/dsl/tests.rb +12 -0
  35. data/lib/interscript/interpreter.rb +251 -0
  36. data/lib/interscript/node.rb +25 -0
  37. data/lib/interscript/node/alias_def.rb +15 -0
  38. data/lib/interscript/node/dependency.rb +13 -0
  39. data/lib/interscript/node/document.rb +45 -0
  40. data/lib/interscript/node/group.rb +34 -0
  41. data/lib/interscript/node/group/parallel.rb +9 -0
  42. data/lib/interscript/node/group/sequential.rb +2 -0
  43. data/lib/interscript/node/item.rb +52 -0
  44. data/lib/interscript/node/item/alias.rb +42 -0
  45. data/lib/interscript/node/item/any.rb +61 -0
  46. data/lib/interscript/node/item/capture.rb +50 -0
  47. data/lib/interscript/node/item/group.rb +51 -0
  48. data/lib/interscript/node/item/repeat.rb +40 -0
  49. data/lib/interscript/node/item/stage.rb +23 -0
  50. data/lib/interscript/node/item/string.rb +51 -0
  51. data/lib/interscript/node/metadata.rb +18 -0
  52. data/lib/interscript/node/rule.rb +6 -0
  53. data/lib/interscript/node/rule/funcall.rb +18 -0
  54. data/lib/interscript/node/rule/run.rb +15 -0
  55. data/lib/interscript/node/rule/sub.rb +65 -0
  56. data/lib/interscript/node/stage.rb +19 -0
  57. data/lib/interscript/node/tests.rb +15 -0
  58. data/lib/interscript/stdlib.rb +211 -0
  59. data/lib/interscript/utils/regexp_converter.rb +283 -0
  60. data/lib/interscript/version.rb +1 -1
  61. data/requirements.txt +1 -0
  62. metadata +73 -311
  63. data/README.adoc +0 -298
  64. data/bin/rspec +0 -29
  65. data/lib/__pycache__/g2pwrapper.cpython-38.pyc +0 -0
  66. data/lib/g2pwrapper.py +0 -34
  67. data/lib/interscript-opal.rb +0 -2
  68. data/lib/interscript/fs.rb +0 -69
  69. data/lib/interscript/mapping.rb +0 -142
  70. data/lib/interscript/opal.rb +0 -23
  71. data/lib/interscript/opal/maps.js.erb +0 -7
  72. data/lib/interscript/opal_map_translate.rb +0 -12
  73. data/lib/model-7 +0 -0
  74. data/lib/tha-pt-b-7 +0 -0
  75. data/maps/acadsin-zho-Hani-Latn-2002.yaml +0 -38912
  76. data/maps/alalc-aze-Cyrl-Latn-1997.yaml +0 -141
  77. data/maps/alalc-bel-cyrl-latn-1997.yaml +0 -125
  78. data/maps/alalc-ben-Beng-Latn-2017.yaml +0 -130
  79. data/maps/alalc-bul-Cyrl-Latn-1997.yaml +0 -94
  80. data/maps/alalc-ell-Grek-Latn-1997.yaml +0 -625
  81. data/maps/alalc-ell-Grek-Latn-2010.yaml +0 -628
  82. data/maps/alalc-kat-Geok-Latn-1997.yaml +0 -112
  83. data/maps/alalc-kat-Geor-Latn-1997.yaml +0 -146
  84. data/maps/alalc-kor-Hang-Latn-1997.yaml +0 -94
  85. data/maps/alalc-mkd-Cyrl-Latn-2013.yaml +0 -103
  86. data/maps/alalc-mkd-cyrl-latn-1997.yaml +0 -114
  87. data/maps/alalc-rus-Cyrl-Latn-1997.yaml +0 -222
  88. data/maps/alalc-rus-Cyrl-Latn-2012.yaml +0 -162
  89. data/maps/alalc-srp-Cyrl-Latn-1997.yaml +0 -114
  90. data/maps/alalc-srp-cyrl-latn-2013.yaml +0 -135
  91. data/maps/alalc-ukr-Cyrl-Latn-1997.yaml +0 -141
  92. data/maps/alalc-ukr-Cyrl-Latn-2011.yaml +0 -16
  93. data/maps/apcbg-bul-Cyrl-Latn-1995.yaml +0 -283
  94. data/maps/bas-rus-Cyrl-Latn-2017-bss.yaml +0 -175
  95. data/maps/bas-rus-Cyrl-Latn-2017-oss.yaml +0 -169
  96. data/maps/bgn-jpn-Hrkt-Latn-1962.yaml +0 -294
  97. data/maps/bgn-kor-Hang-Latn-1943.yaml +0 -31
  98. data/maps/bgn-kor-Kore-Latn-1943.yaml +0 -31
  99. data/maps/bgna-bul-Cyrl-Latn-2006.yaml +0 -208
  100. data/maps/bgna-bul-Cyrl-Latn-2009.yaml +0 -208
  101. data/maps/bgnpcgn-arm-Armn-Latn-1981.yaml +0 -108
  102. data/maps/bgnpcgn-aze-Cyrl-Latn-1993.yaml +0 -104
  103. data/maps/bgnpcgn-bak-Cyrl-Latn-2007.yaml +0 -184
  104. data/maps/bgnpcgn-bel-cyrl-latn-1979.yaml +0 -285
  105. data/maps/bgnpcgn-bul-Cyrl-Latn-1952.yaml +0 -115
  106. data/maps/bgnpcgn-bul-Cyrl-Latn-2013.yaml +0 -38
  107. data/maps/bgnpcgn-ell-Grek-Latn-1962.yaml +0 -702
  108. data/maps/bgnpcgn-ell-Grek-Latn-1996.yaml +0 -20
  109. data/maps/bgnpcgn-jpn-Hrkt-Latn-1976.yaml +0 -257
  110. data/maps/bgnpcgn-kat-Geor-Latn-1981.yaml +0 -127
  111. data/maps/bgnpcgn-kat-Geor-Latn-2009.yaml +0 -43
  112. data/maps/bgnpcgn-kor-Hang-Latn-kn-1945.yaml +0 -253
  113. data/maps/bgnpcgn-kor-Hang-Latn-rok-2011.yaml +0 -48
  114. data/maps/bgnpcgn-kor-Kore-Latn-rok-2011.yaml +0 -48
  115. data/maps/bgnpcgn-mkd-Cyrl-Latn-1981.yaml +0 -159
  116. data/maps/bgnpcgn-mkd-Cyrl-Latn-2013.yaml +0 -190
  117. data/maps/bgnpcgn-per-Arab-Latn-1956.yaml +0 -93
  118. data/maps/bgnpcgn-rus-Cyrl-Latn-1947.yaml +0 -314
  119. data/maps/bgnpcgn-srp-Cyrl-Latn-2005.yaml +0 -166
  120. data/maps/bgnpcgn-ukr-Cyrl-Latn-1965.yaml +0 -163
  121. data/maps/bgnpcgn-ukr-Cyrl-Latn-2019.yaml +0 -208
  122. data/maps/bgnpcgn-zho-Hans-Latn-1979.yaml +0 -7456
  123. data/maps/by-bel-Cyrl-Latn-1998.yaml +0 -168
  124. data/maps/by-bel-Cyrl-Latn-2007.yaml +0 -115
  125. data/maps/elot-ell-Grek-Latn-743-1982-tl.yaml +0 -685
  126. data/maps/elot-ell-Grek-Latn-743-1982-ts.yaml +0 -681
  127. data/maps/elot-ell-Grek-Latn-743-2001-tl.yaml +0 -20
  128. data/maps/elot-ell-Grek-Latn-743-2001-ts.yaml +0 -32
  129. data/maps/ggg-kat-Geor-Latn-2002.yaml +0 -89
  130. data/maps/gki-bel-cyrl-latn-1992.yaml +0 -33
  131. data/maps/gki-bel-cyrl-latn-2000.yaml +0 -201
  132. data/maps/gost-rus-cyrl-latn-16876-71-1983.yaml +0 -186
  133. data/maps/hk-yue-Hani-Latn-1888.yaml +0 -38497
  134. data/maps/icao-bel-Cyrl-Latn-9303.yaml +0 -141
  135. data/maps/icao-bul-Cyrl-Latn-9303.yaml +0 -122
  136. data/maps/icao-heb-Hebr-Latn-9303.yaml +0 -151
  137. data/maps/icao-mkd-Cyrl-Latn-9303.yaml +0 -117
  138. data/maps/icao-per-Arab-Latn-9303.yaml +0 -104
  139. data/maps/icao-rus-Cyrl-Latn-9303.yaml +0 -118
  140. data/maps/icao-srp-Cyrl-Latn-9303.yaml +0 -117
  141. data/maps/icao-ukr-Cyrl-Latn-9303.yaml +0 -120
  142. data/maps/iso-ell-Grek-Latn-843-1997-t1.yaml +0 -610
  143. data/maps/iso-ell-Grek-Latn-843-1997-t2.yaml +0 -41
  144. data/maps/iso-jpn-Hrkt-Latn-3602-1989.yaml +0 -62
  145. data/maps/iso-rus-Cyrl-Latn-9-1995.yaml +0 -272
  146. data/maps/iso-tha-Thai-Latn-11940-1998.yaml +0 -109
  147. data/maps/kp-kor-Hang-Latn-2002.yaml +0 -901
  148. data/maps/lshk-yue-Hani-Latn-jyutping-1993.yaml +0 -44820
  149. data/maps/mext-jpn-Hrkt-Latn-1954.yaml +0 -411
  150. data/maps/moct-kor-Hang-Latn-2000.yaml +0 -803
  151. data/maps/mofa-jpn-Hrkt-Latn-1989.yaml +0 -541
  152. data/maps/mvd-bel-Cyrl-Latn-2008.yaml +0 -225
  153. data/maps/mvd-bel-Cyrl-Latn-2010.yaml +0 -63
  154. data/maps/mvd-rus-Cyrl-Latn-2008.yaml +0 -110
  155. data/maps/mvd-rus-Cyrl-Latn-2010.yaml +0 -37
  156. data/maps/nil-kor-Hang-Hang-jamo.yaml +0 -11193
  157. data/maps/odni-aze-Cyrl-Latn-2015.yaml +0 -144
  158. data/maps/odni-bel-Cyrl-Latn-2015.yaml +0 -148
  159. data/maps/odni-bul-Cyrl-Latn-2015.yaml +0 -96
  160. data/maps/odni-kat-Geor-Latn-2015.yaml +0 -88
  161. data/maps/odni-kaz-Cyrl-Latn-2015.yaml +0 -148
  162. data/maps/odni-kir-Cyrl-Latn-2015.yaml +0 -136
  163. data/maps/odni-mkd-cyrl-latn-2015.yaml +0 -122
  164. data/maps/odni-rus-Cyrl-Latn-2015.yaml +0 -77
  165. data/maps/odni-srp-Cyrl-Latn-2015.yaml +0 -129
  166. data/maps/odni-tat-Cyrl-Latn-2015.yaml +0 -142
  167. data/maps/odni-tgk-Cyrl-Latn-2015.yaml +0 -148
  168. data/maps/odni-uig-Cyrl-Latn-2015.yaml +0 -138
  169. data/maps/odni-ukr-Cyrl-Latn-2015.yaml +0 -157
  170. data/maps/odni-uzb-Cyrl-Latn-2015.yaml +0 -167
  171. data/maps/royin-tha-Thai-Latn-1939-generic.yaml +0 -90
  172. data/maps/royin-tha-Thai-Latn-1968.yaml +0 -179
  173. data/maps/royin-tha-Thai-Latn-1999-chained.yaml +0 -180
  174. data/maps/royin-tha-Thai-Latn-1999.yaml +0 -76
  175. data/maps/sac-zho-Hans-Latn-1979.yaml +0 -24759
  176. data/maps/ses-ara-arab-latn-1930.yaml +0 -275
  177. data/maps/stategeocadastre-ukr-Cyrl-Latn-1993.yaml +0 -222
  178. data/maps/ua-ukr-Cyrl-Latn-1996.yaml +0 -193
  179. data/maps/un-ara-Arab-Latn-1971.yaml +0 -127
  180. data/maps/un-ara-Arab-Latn-1972.yaml +0 -152
  181. data/maps/un-ara-Arab-Latn-2017.yaml +0 -383
  182. data/maps/un-bel-Cyrl-Latn-2007.yaml +0 -114
  183. data/maps/un-ben-Beng-Latn-2016.yaml +0 -534
  184. data/maps/un-ell-Grek-Latn-1987-tl.yaml +0 -32
  185. data/maps/un-ell-Grek-Latn-1987-ts.yaml +0 -20
  186. data/maps/un-ell-Grek-Latn-phonetic-1987.yaml +0 -780
  187. data/maps/un-mon-Mong-Latn-2013.yaml +0 -93
  188. data/maps/un-rus-Cyrl-Latn-1987.yaml +0 -166
  189. data/maps/un-ukr-cyrl-latn-1998.yaml +0 -30
  190. data/maps/var-jpn-Hrkt-Latn-hepburn-1886.yaml +0 -406
  191. data/maps/var-jpn-Hrkt-Latn-hepburn-1954.yaml +0 -386
  192. data/maps/var-kor-Hang-Latn-mr-1939.yaml +0 -1054
  193. data/maps/var-kor-Kore-Hang-2013.yaml +0 -59754
  194. data/maps/var-kor-Kore-Latn-mr-1939.yaml +0 -37
  195. data/maps/var-tha-Thai-Thai-phonemic.yaml +0 -59
  196. data/maps/var-tha-Thai-Zsym-ipa.yaml +0 -301
  197. data/maps/var-zho-Hani-Latn-1979.yaml +0 -38908
  198. data/spec/interscript/mapping_spec.rb +0 -42
  199. data/spec/interscript_spec.rb +0 -26
  200. data/spec/spec_helper.rb +0 -3
@@ -0,0 +1,18 @@
1
# Node that wraps a key/value collection (+data+) and exposes hash-style
# element access on it.
class Interscript::Node::MetaData < Interscript::Node
  attr_accessor :data

  # data - the backing collection; defaults to a fresh empty Hash.
  def initialize(data = {})
    @data = data
  end

  # Reads the entry stored under +key+.
  def [](key)
    @data[key]
  end

  # Stores +value+ under +key+.
  def []=(key, value)
    @data[key] = value
  end

  # Serializes the node as a Hash carrying the class name and the raw data.
  def to_hash
    { class: self.class.to_s, data: @data }
  end
end
@@ -0,0 +1,6 @@
1
# Common superclass of the rule nodes (Sub, Run and Funcall are required
# right below this definition). It adds no behaviour of its own.
class Interscript::Node::Rule < Interscript::Node; end
3
+
4
+ require "interscript/node/rule/sub"
5
+ require "interscript/node/rule/run"
6
+ require "interscript/node/rule/funcall"
@@ -0,0 +1,18 @@
1
# Rule node recording a function name together with the keyword arguments
# it was given.
class Interscript::Node::Rule::Funcall < Interscript::Node::Rule
  attr_accessor :name, :kwargs

  # name   - identifier of the function.
  # kwargs - keyword arguments, kept as a Hash.
  def initialize(name, **kwargs)
    @name = name
    @kwargs = kwargs
  end

  # Serializes the node as a Hash with the class name, function name and
  # keyword arguments.
  def to_hash
    {
      class: self.class.to_s,
      name: name,
      kwargs: kwargs
    }
  end

  # Renders "<name> <kwargs>", where the kwargs part is the Hash#inspect
  # output with its surrounding braces stripped.
  def inspect
    rendered_kwargs = kwargs.inspect[1..-2]
    "#{@name} #{rendered_kwargs}"
  end
end
@@ -0,0 +1,15 @@
1
# Rule node that refers to a stage object (+stage+).
class Interscript::Node::Rule::Run < Interscript::Node::Rule
  attr_accessor :stage

  # stage - the referenced stage; it must respond to #to_hash and #inspect.
  def initialize(stage)
    @stage = stage
  end

  # Serializes the node as a Hash with the class name and the serialized stage.
  def to_hash
    {
      class: self.class.to_s,
      stage: stage.to_hash
    }
  end

  # Renders the rule as "run <stage.inspect>".
  def inspect
    "run #{@stage.inspect}"
  end
end
@@ -0,0 +1,65 @@
1
# Substitution rule: replaces +from+ with +to+, optionally constrained by
# surrounding context (+before+, +not_before+, +after+, +not_after+) and
# carrying an optional +priority+.
class Interscript::Node::Rule::Sub < Interscript::Node::Rule
  attr_accessor :from, :to
  attr_accessor :before, :not_before, :after, :not_after
  attr_accessor :priority

  # from, to   - converted with Interscript::Node::Item.try_convert; the
  #              symbol :upcase is kept verbatim as a special +to+ value.
  # before, not_before, after, not_after
  #            - optional context items, converted only when given.
  # priority   - optional value; it is added to #max_length.
  def initialize from, to, before: nil, not_before: nil, after: nil, not_after: nil, priority: nil
    self.from = Interscript::Node::Item.try_convert from
    if to == :upcase
      self.to = :upcase
    else
      self.to = Interscript::Node::Item.try_convert to
    end

    self.priority = priority

    #raise TypeError, "Can't supply both before and not_before" if before && not_before
    #raise TypeError, "Can't supply both after and not_after" if after && not_after

    self.before = Interscript::Node::Item.try_convert(before) if before
    self.after = Interscript::Node::Item.try_convert(after) if after
    self.not_before = Interscript::Node::Item.try_convert(not_before) if not_before
    self.not_after = Interscript::Node::Item.try_convert(not_after) if not_after
  end

  # Sum of the max_length of +from+ and of every context item that is set,
  # plus +priority+ when one was given.
  def max_length
    len = self.from.max_length
    len += self.before.max_length if self.before
    len += self.after.max_length if self.after
    len += self.not_before.max_length if self.not_before
    len += self.not_after.max_length if self.not_after
    len += self.priority if self.priority
    len
  end

  # Serializes the rule as a Hash. Context entries that were not supplied
  # serialize as nil.
  def to_hash
    # Only +from+ is dumped in debug mode. The original also dumped `params`,
    # a name that is not defined in this method, so running Ruby with -d
    # raised NameError here.
    puts self.from.inspect if $DEBUG
    { :class => self.class.to_s,
      :from => self.from.to_hash,
      :to => self.to == :upcase ? :upcase : self.to.to_hash,
      :before => self.before&.to_hash,
      :not_before => self.not_before&.to_hash,
      :after => self.after&.to_hash,
      :not_after => self.not_after&.to_hash,
      :priority => self.priority
    }
  end

  # Renders the rule in DSL-like form:
  #   sub <from>, <to>[, before: ...][, after: ...][, not_before: ...]
  #       [, not_after: ...][, priority: ...]
  def inspect
    out = "sub "
    params = []
    params << @from.inspect
    if @to == :upcase
      params << "upcase"
    else
      params << @to.inspect
    end
    params << "before: #{@before.inspect}" if @before
    params << "after: #{@after.inspect}" if @after
    params << "not_before: #{@not_before.inspect}" if @not_before
    params << "not_after: #{@not_after.inspect}" if @not_after
    params << "priority: #{@priority.inspect}" if @priority
    out << params.join(", ")
  end
end
@@ -0,0 +1,19 @@
1
# A named sequential group. The name defaults to :main.
class Interscript::Node::Stage < Interscript::Node::Group::Sequential
  attr_accessor :name, :doc_name

  # name - identifier of the stage (:main unless given).
  def initialize(name = :main)
    @name = name
    super()
  end

  # Serializes the stage as a Hash with its class name, its name and the
  # serialized children.
  def to_hash
    {
      class: self.class.to_s,
      name: name,
      children: @children.map(&:to_hash)
    }
  end

  # Renders "stage {...}" for the :main stage and "stage(<name>) {...}" for
  # any other; the body comes from the superclass #inspect.
  def inspect
    label = @name == :main ? nil : "(#{@name})"
    "stage#{label} {\n#{super}\n}"
  end
end
@@ -0,0 +1,15 @@
1
# Node that accumulates entries in +data+ (an Array by default).
class Interscript::Node::Tests < Interscript::Node
  attr_accessor :data

  # data - the backing collection; defaults to a fresh empty Array.
  def initialize(data = [])
    @data = data
  end

  # Appends +pair+ to the collection and returns the result of the append.
  def <<(pair)
    @data << pair
  end

  # Serializes the node as a Hash carrying the class name and the raw data.
  def to_hash
    { class: self.class.to_s, data: @data }
  end
end
@@ -0,0 +1,211 @@
1
# Helper routines: alias definitions, several "parallel replace" strategies
# (regexp-, hash- and tree-based), a deterministic rule sort, and the
# functions listed by .available_functions.
class Interscript::Stdlib
  # Alias name => regexp source fragment (or literal text for :none/:space).
  ALIASES = {
    any_character: '.',
    none: "",
    space: " ",
    whitespace: "[\\b \\t\\0\\r\\n]",
    boundary: "\\b",
    non_word_boundary: "\\B",
    word: "\\w",
    not_word: "\\W",
    alpha: "[a-zA-Z]",
    not_alpha: "[^a-zA-Z]",
    digit: "\\d",
    not_digit: "\\D",
    line_start: "^",
    line_end: "$",
    string_start: "\\A",
    string_end: "\\z"
  }

  # True for every alias except :none and :space.
  def self.re_only_alias?(a)
    ! %i[none space].include?(a)
  end

  # Cache of compiled replacement trees, keyed by the #hash of their source.
  @treecache = {}

  # Builds one alternation Regexp out of the first element of every entry of
  # +subs_hash+; entry number i becomes the named group "_i".
  def self.parallel_regexp_compile(subs_hash)
    # puts subs_hash.inspect
    regexp = subs_hash.each_with_index.map do |p,i|
      "(?<_%d>%s)" % [i,p[0]]
    end.join("|")
    Regexp.compile(regexp)
    # puts subs_regexp.inspect
  end

  # Replaces every match of +subs_regexp+ in +string+ with subs_hash[idx],
  # where idx is the number encoded in the name of the group that matched.
  def self.parallel_regexp_gsub(string, subs_regexp, subs_hash)
    string.gsub(subs_regexp) do |_match|
      lm = Regexp.last_match
      # Extract the match name
      idx = lm.named_captures.compact.keys.first[1..-1].to_i
      subs_hash[idx]
    end
  end

  # Same replacement as .parallel_regexp_gsub, additionally recording every
  # match in the globals $subs_matches / $subs_regexp.
  def self.parallel_regexp_gsub_debug(string, subs_regexp, subs_array)
    # only gathering debug info, test data is available in maps_analyze_staging
    $subs_matches = []
    $subs_regexp = subs_regexp
    #$subs_array = subs_array
    string.gsub(subs_regexp) do |_match|
      lm = Regexp.last_match
      # puts lm.inspect
      # Extract the match name
      matched = lm.named_captures.compact.keys.first
      # puts matched.inspect
      # puts [lm.begin(matched), lm.end(matched)].inspect
      idx = matched[1..-1].to_i
      debug_info = {begin: lm.begin(matched), end: lm.end(matched), idx: idx, result: subs_array[idx]}
      $subs_matches << debug_info
      subs_array[idx]
    end
  end


  # Turns an enumerable of [from, to] pairs into a Hash (later pairs win).
  def self.parallel_replace_compile_hash(a)
    a.each_with_object({}) do |(from, to), h|
      h[from] = to
    end
  end

  # Scans +str+ left to right; at each position the longest key of +h+ that
  # matches is replaced by its value, otherwise one character is copied.
  #
  # Fixes relative to the previous version:
  # * the length scan stops at the first (longest) match; it used to keep
  #   going and consume further input at shorter lengths, so e.g. "abab"
  #   with {"ab" => "X"} produced "Xab" instead of "XX";
  # * an empty table, or one whose only key is empty, no longer raises or
  #   loops forever: the input is copied through unchanged.
  def self.parallel_replace_hash(str,h)
    newstr = +""
    len = str.length
    max_key_len = h.keys.map(&:length).max || 0
    i = 0
    while i < len
      replaced = false
      max_key_len.downto(1) do |checked_len|
        substr = str[i,checked_len]
        replacement = h[substr]
        next unless replacement

        newstr << replacement
        i += substr.length
        replaced = true
        break
      end
      unless replaced
        newstr << str[i,1]
        i += 1
      end
    end
    newstr
  end

  # hash can be either a hash or a hash-like array
  #
  # Builds (or fetches from @treecache) a trie keyed by character code
  # points. A node's +nil+ key holds the replacement for the string that
  # ends at that node. +from+ may be a single string or an array of strings.
  def self.parallel_replace_compile_tree(hash)
    hh = hash.hash
    if @treecache[hh]
      tree = @treecache[hh]
    else
      tree = {}
      hash.each do |from, to|
        from = Array(from)
        from.each do |f|
          branch = tree
          chars = f.split("")
          chars[0..-2].each do |c|
            branch[c.ord] ||= {}
            branch = branch[c.ord]
          end
          branch[chars.last.ord] ||= {}
          branch[chars.last.ord][nil] = to
        end
      end
      @treecache[hh] = tree
    end
  end

  # Walks +str+ and, at every position, follows +tree+ as far as possible,
  # remembering the longest prefix that carries a replacement. That prefix
  # is replaced; when there is none, one character is copied verbatim.
  def self.parallel_replace_tree(str, tree)
    newstr = +""
    len = str.length
    i = 0
    while i < len
      c = str[i]

      sub = +""
      branch = tree
      match, repl = nil, nil

      j = 0
      while j < len-i
        cc = str[i+j]
        if branch.include? cc.ord
          branch = branch[cc.ord]
          sub << cc
          if branch.include? nil
            match = sub.dup
            repl = branch[nil]
          end
          j += 1
        else
          break
        end
      end

      if match
        i += match.length
        newstr << repl
      else
        newstr << c
        i += 1
      end
    end
    newstr
  end

  # Compiles +hash+ into a tree (cached) and applies it to +str+.
  def self.parallel_replace(str, hash)
    tree = parallel_replace_compile_tree(hash)
    parallel_replace_tree(str, tree)
  end

  # On Windows at least, sort_by is non-deterministic. Let's add some determinism
  # to our efforts.
  def self.deterministic_sort_by_max_length(ary)
    # Deterministic on Linux:
    # ary.sort_by{ |rule| -rule.max_length }

    # The original index is folded into the sort key as a tie-breaker.
    ary.each_with_index.sort_by{ |rule,idx| -rule.max_length*100000 + idx }.map(&:first)
  end

  # Names of the functions implemented in the Functions module.
  def self.available_functions
    %i[title_case downcase compose decompose separate secryst]
  end

  module Functions
    # Upcases the first character of every line and the character that
    # follows each +word_separator+. The separator is interpolated into a
    # Regexp as-is; an empty separator disables the second pass.
    def self.title_case(output, word_separator: " ")
      output = output.gsub(/^(.)/, &:upcase)
      output = output.gsub(/#{word_separator}(.)/, &:upcase) unless word_separator == ''
      output
    end

    def self.downcase(output, _:nil)
      output.downcase
    end

    # Unicode NFC normalization.
    def self.compose(output, _:nil)
      output.unicode_normalize(:nfc)
    end

    # Unicode NFD normalization.
    def self.decompose(output, _:nil)
      output.unicode_normalize(:nfd)
    end

    # Inserts +separator+ between every pair of characters.
    def self.separate(output, separator: " ")
      output.split("").join(separator)
    end

    # Translator instances, memoized per model.
    @secryst_models = {}

    # Translates +output+ line by line with a Secryst model.
    #
    # Raises StandardError when the secryst library cannot be loaded.
    def self.secryst(output, model:)
      # Try to load secryst, but don't fail hard if not possible.
      # LoadError is a ScriptError, not a StandardError, so the previous
      # `require ... rescue nil` modifier could not catch it and the
      # explanatory error below was never reached.
      begin
        require "secryst"
      rescue LoadError, StandardError
        nil
      end
      unless defined? Secryst
        raise StandardError, "Secryst is not loaded. Please read docs/Usage_with_Secryst.adoc"
      end
      Interscript.secryst_index_locations.each do |remote|
        Secryst::Provisioning.add_remote(remote)
      end
      @secryst_models[model] ||= Secryst::Translator.new(model_file: model)
      output.split("\n").map(&:chomp).map do |i|
        @secryst_models[model].translate(i)
      end.join("\n")
    end
  end
end
@@ -0,0 +1,283 @@
1
+ require 'regexp_parser'
2
+
3
+
4
# Recursively converts a regexp_parser expression node into a nested array
# of marker symbols and literal strings.
#
# Known node classes map to fixed markers or to the node text; any other
# class yields [:missing, <class>, children?]. When the node carries a
# quantifier the result is wrapped: an interval with min == max repeats the
# result that many times, any other quantifier produces
# [:<token>_start, [result], :<token>_stop].
def process(node)
  children = nil
  if node.respond_to?(:expressions) && node.expressions
    children = node.expressions.map { |expr| process(expr) }
  end
  # Builds [:<tag>_start, children, :<tag>_stop].
  wrap = ->(tag) { [:"#{tag}_start", children, :"#{tag}_stop"] }

  # puts node.inspect
  out =
    case node
    when Regexp::Expression::Root
      children
    when Regexp::Expression::Assertion::Lookbehind
      wrap.call("lookbehind")
    when Regexp::Expression::Assertion::NegativeLookbehind
      wrap.call("negative_lookbehind")
    when Regexp::Expression::Assertion::Lookahead
      wrap.call("lookahead")
    when Regexp::Expression::Assertion::NegativeLookahead
      wrap.call("negative_lookahead")
    when Regexp::Expression::Group::Capture
      wrap.call("capture")
    when Regexp::Expression::CharacterSet
      # puts children.inspect
      # A set containing a range gets explicit array markers around its members.
      if children.flatten.include?(:range_start) #or children.size > 1
        [:characterset_start, :array_start, children, :array_stop, :characterset_stop]
      else
        wrap.call("characterset")
      end
    when Regexp::Expression::Alternation
      wrap.call("alternation")
    when Regexp::Expression::Alternative
      wrap.call("alternative")
    when Regexp::Expression::CharacterSet::Range
      lower = node.expressions[0].text
      upper = node.expressions[1].text
      [:range_start, lower, :range_mid, upper, :range_stop]
    when Regexp::Expression::Anchor::WordBoundary
      :boundary
    when Regexp::Expression::Anchor::NonWordBoundary
      :non_word_boundary
    when Regexp::Expression::EscapeSequence::Backspace
      :boundary # most probably boundary
    when Regexp::Expression::CharacterType::Space
      :space
    when Regexp::Expression::Anchor::BeginningOfLine
      :line_start
    when Regexp::Expression::Anchor::EndOfLine
      :line_end
    when Regexp::Expression::CharacterType::Any
      :any_character
    when Regexp::Expression::Literal,
         Regexp::Expression::EscapeSequence::Literal,
         Regexp::Expression::EscapeSequence::Codepoint
      node.text
    when Regexp::Expression::PosixClass
      '[' + node.text + ']'
    when Regexp::Expression::UnicodeProperty::Script
      node.text
    when Regexp::Expression::Backreference::Number # why is there a space before after node.number?
      [:backref_num_start, node.number, :backref_num_stop]
    else
      fallback = [:missing, node.class]
      fallback << children if node.respond_to?(:expressions)
      # TODO add quantifier support
      # (would be: fallback << process(node.quantifier))
      pp node if node.respond_to?(:quantifier) && node.quantifier
      fallback
    end

  quantifier = node.respond_to?(:quantifier) ? node.quantifier : nil
  return out unless quantifier

  qname = quantifier.token.to_s
  if qname == "interval" && quantifier.max == quantifier.min
    [out] * quantifier.max
  else
    [:"#{qname}_start", [out], :"#{qname}_stop"]
  end
end
82
+
83
# Splits the top-level element list produced by `process` into a rule hash.
#
# Every top-level look-around element is removed from the list and stored
# under a context key:
#   lookbehind          -> :before
#   negative lookbehind -> :not_before
#   lookahead           -> :after
#   negative lookahead  -> :not_after
# A single look-around contributes its content directly; several of the
# same kind are combined into an alternation structure. Whatever is left
# goes to :from. The +node+ argument itself is not modified.
def process_root(node)
  remaining = node.dup
  root = {}

  context_markers = {
    before: :lookbehind_start,
    not_before: :negative_lookbehind_start,
    after: :lookahead_start,
    not_after: :negative_lookahead_start
  }

  context_markers.each do |key, marker|
    found = node.select { |x| x[0] == marker }
    next if found.empty?

    root[key] =
      if found.size == 1
        found[0][1]
      else
        [
          :alternation_start,
          found.map { |x| [:alternative_start, x[1], :alternative_stop] },
          [:alternation_stop]
        ]
      end
    found.each { |n| remaining.delete(n) }
  end

  root[:from] = remaining
  root
end
162
+
163
# Renders a (possibly nested) token list as a DSL expression string.
#
# Known marker symbols are replaced through a lookup table, consecutive
# strings are merged into one double-quoted literal, and ' + ' is inserted
# between adjacent operands. Symbols without a table entry are emitted as
# " :symbol " so they stay visible in the output.
def stringify(node)
  tokens = node.flatten
  subs = {
    characterset_start: 'any(',
    characterset_stop: ')',
    array_start: '[',
    array_stop: ']',
    capture_start: 'capture(',
    capture_stop: ')',
    zero_or_one_start: 'maybe(',
    zero_or_one_stop: ')',
    zero_or_more_start: 'maybe_some(',
    zero_or_more_stop: ')',
    one_or_more_start: 'some(',
    one_or_more_stop: ')',
    alternation_start: 'any([',
    alternation_stop: '])',
    alternative_start: '',
    alternative_stop: '',
    boundary: 'boundary',
    non_word_boundary: 'non_word_boundary',
    space: 'space',
    line_start: 'line_start',
    line_end: 'line_end',
    any_character: 'any_character',
    range_start: 'any(',
    range_mid: '..',
    range_stop: ')',
    backref_num_start: 'ref(',
    backref_num_stop: ')'
  }
  # Tokens that end an operand (something may be concatenated after them) ...
  left_side = %i[characterset_stop capture_stop
                 zero_or_one_stop zero_or_more_stop one_or_more_stop
                 boundary non_word_boundary
                 line_start any_character range_stop space
                 backref_num_stop]
  # ... and tokens that begin one.
  right_side = %i[characterset_start capture_start
                  zero_or_one_start zero_or_more_start one_or_more_start
                  boundary non_word_boundary
                  line_end any_character range_start space
                  backref_num_start]

  pieces = []
  prev = nil
  tokens.each do |token|
    prev_is_text = prev.instance_of?(String)
    token_is_text = token.instance_of?(String)
    prev_closes = left_side.include?(prev)
    token_opens = right_side.include?(token)

    #if prev==:range_stop and token==:range_start
    #  pieces << ' :adding_ranges '
    #end
    if (prev_is_text && token_opens) || (prev_closes && (token_is_text || token_opens))
      pieces << ' + '
    end
    pieces << ', ' if prev == :alternative_stop && token == :alternative_start

    if subs.include?(token)
      pieces << subs[token]
    elsif token_is_text
      if prev_is_text
        # Extend the previous literal: drop its closing quote, append, re-close.
        pieces[-1] = "#{pieces[-1][0..-2]}#{token}\""
      else
        pieces << "\"#{token}\""
      end
    else
      pieces << " #{token.inspect} "
    end
    prev = token
  end
  pieces.join.gsub('\\\\u', '\\u')
end
235
+
236
# Renders a rule hash (as produced by process_root, plus a :to entry) as one
# line of DSL: `sub <from>, <to>[, before: ...]...`, indented by +indent+
# spaces.
#
# Lines containing constructs that the conversion cannot express are turned
# into comments with a trailing warning. An empty :from is replaced by [""]
# in +root+ itself.
def stringify_root(root, indent: 0)
  pad = " " * indent
  # Wraps a line into an indented comment followed by a note.
  commented = ->(text, note) { "#{pad}# #{text} # #{note}" }

  warning = ''
  root[:from] = [""] if root[:from] == []
  str = "#{pad}sub #{stringify(root[:from])}, #{root[:to]}"

  %i[before not_before after not_after].each do |look|
    # puts "#{look.inspect} = #{root[look]}"
    context = root[look]
    next unless context

    # An empty context is rendered as an empty string literal.
    #apparently it is empty sometimes. iso-mal-Mlym-Latn for example
    rendered = context == [] ? "\"\"" : stringify(context)
    str << ", #{look}: #{rendered}"
  end

  str = commented.call(str, "warning: :") if str =~ /[^\[]:[^ \]]/
  str = commented.call(str, warning) unless warning.empty?

  str = commented.call(str, "warning: :missing unimplemented") if str.include?(':missing')
  str = commented.call(str, "warning: :interval unimplemented") if str.include?(':interval')
  str = commented.call(str, "warning: :adding_ranges unimplemented") if str.include?(':adding_ranges')
  if str.include?('zero_or_one')
    str = commented.call(str, "warning: zero_or_one")
    puts "str.includes 'zero_or_one'"
    pp root
  end
  # str = commented.call(str, "warning: one_or_more") if str.include?('one_or_more')
  str = commented.call(str, "warning: :lookahead_start") if str.include?(':lookahead_start')
  # str += " # original: #{root[:from]}"
  str
end
267
+
268
# Manual smoke run: converts every regexp listed in
# docs/utils/regexp_examples.txt and prints each intermediate form.
if __FILE__ == $0
  examples_path = __dir__+"/../../docs/utils/regexp_examples.txt"
  # File.read closes the file itself; File.open(...).read left the handle
  # open until garbage collection.
  rs = File.read(examples_path).gsub(/([^\\^])\\u/, '\\1\\\\u').gsub(/\\\\b/, '\b')
  rs.split("\n").each do |r|
    puts r
    tree = Regexp::Parser.parse(r, 'ruby/2.1')
    conv = process(tree)
    pp conv
    root = process_root(conv)
    pp root
    root[:to] = ['X']
    str = stringify_root(root)
    puts str
    puts "\n\n"
  end
end