interscript 0.1.6 → 2.1.0a9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (226)
  1. checksums.yaml +4 -4
  2. data/.gitignore +11 -0
  3. data/.rspec +3 -0
  4. data/Gemfile +29 -0
  5. data/LICENSE.adoc +31 -0
  6. data/README.md +3 -0
  7. data/Rakefile +53 -0
  8. data/bin/console +14 -0
  9. data/bin/interscript +3 -39
  10. data/bin/maps_analyze_staging +168 -0
  11. data/bin/maps_debug_compilers +58 -0
  12. data/bin/maps_debug_ordering +88 -0
  13. data/bin/maps_debug_ruby_compile +24 -0
  14. data/bin/maps_debug_step_by_step +44 -0
  15. data/bin/maps_optimize_order +112 -0
  16. data/bin/maps_v1_analyze_regexps +45 -0
  17. data/bin/maps_v1_to_v2 +426 -0
  18. data/exe/interscript +6 -0
  19. data/interscript.gemspec +31 -0
  20. data/lib/interscript.rb +81 -127
  21. data/lib/interscript/command.rb +5 -5
  22. data/lib/interscript/compiler.rb +22 -0
  23. data/lib/interscript/compiler/javascript.rb +292 -0
  24. data/lib/interscript/compiler/ruby.rb +262 -0
  25. data/lib/interscript/dsl.rb +67 -0
  26. data/lib/interscript/dsl/aliases.rb +23 -0
  27. data/lib/interscript/dsl/document.rb +46 -0
  28. data/lib/interscript/dsl/group.rb +45 -0
  29. data/lib/interscript/dsl/group/parallel.rb +6 -0
  30. data/lib/interscript/dsl/items.rb +89 -0
  31. data/lib/interscript/dsl/metadata.rb +26 -0
  32. data/lib/interscript/dsl/stage.rb +6 -0
  33. data/lib/interscript/dsl/symbol_mm.rb +11 -0
  34. data/lib/interscript/dsl/tests.rb +12 -0
  35. data/lib/interscript/interpreter.rb +251 -0
  36. data/lib/interscript/node.rb +25 -0
  37. data/lib/interscript/node/alias_def.rb +15 -0
  38. data/lib/interscript/node/dependency.rb +13 -0
  39. data/lib/interscript/node/document.rb +45 -0
  40. data/lib/interscript/node/group.rb +34 -0
  41. data/lib/interscript/node/group/parallel.rb +9 -0
  42. data/lib/interscript/node/group/sequential.rb +2 -0
  43. data/lib/interscript/node/item.rb +52 -0
  44. data/lib/interscript/node/item/alias.rb +42 -0
  45. data/lib/interscript/node/item/any.rb +61 -0
  46. data/lib/interscript/node/item/capture.rb +50 -0
  47. data/lib/interscript/node/item/group.rb +51 -0
  48. data/lib/interscript/node/item/repeat.rb +40 -0
  49. data/lib/interscript/node/item/stage.rb +23 -0
  50. data/lib/interscript/node/item/string.rb +51 -0
  51. data/lib/interscript/node/metadata.rb +18 -0
  52. data/lib/interscript/node/rule.rb +6 -0
  53. data/lib/interscript/node/rule/funcall.rb +18 -0
  54. data/lib/interscript/node/rule/run.rb +15 -0
  55. data/lib/interscript/node/rule/sub.rb +65 -0
  56. data/lib/interscript/node/stage.rb +19 -0
  57. data/lib/interscript/node/tests.rb +15 -0
  58. data/lib/interscript/stdlib.rb +211 -0
  59. data/lib/interscript/utils/regexp_converter.rb +283 -0
  60. data/lib/interscript/version.rb +1 -1
  61. data/requirements.txt +1 -0
  62. metadata +75 -339
  63. data/README.adoc +0 -298
  64. data/bin/rspec +0 -29
  65. data/lib/__pycache__/g2pwrapper.cpython-38.pyc +0 -0
  66. data/lib/g2pwrapper.py +0 -34
  67. data/lib/interscript-opal.rb +0 -2
  68. data/lib/interscript/fs.rb +0 -71
  69. data/lib/interscript/mapping.rb +0 -142
  70. data/lib/interscript/opal.rb +0 -27
  71. data/lib/interscript/opal/maps.js.erb +0 -10
  72. data/lib/interscript/opal_map_translate.rb +0 -12
  73. data/lib/model-7 +0 -0
  74. data/lib/tha-pt-b-7 +0 -0
  75. data/maps/acadsin-zho-Hani-Latn-2002.yaml +0 -38912
  76. data/maps/alalc-amh-Ethi-Latn-1997.yaml +0 -509
  77. data/maps/alalc-amh-Ethi-Latn-2011.yaml +0 -138
  78. data/maps/alalc-ara-Arab-Latn-1997.yaml +0 -1283
  79. data/maps/alalc-asm-Deva-Latn-1997.yaml +0 -159
  80. data/maps/alalc-aze-Cyrl-Latn-1997.yaml +0 -141
  81. data/maps/alalc-bel-Cyrl-Latn-1997.yaml +0 -125
  82. data/maps/alalc-ben-Beng-Latn-2017.yaml +0 -130
  83. data/maps/alalc-bul-Cyrl-Latn-1997.yaml +0 -94
  84. data/maps/alalc-ell-Grek-Latn-1997.yaml +0 -624
  85. data/maps/alalc-ell-Grek-Latn-2010.yaml +0 -627
  86. data/maps/alalc-hin-Deva-Latn-2020.yaml +0 -159
  87. data/maps/alalc-kat-Geok-Latn-1997.yaml +0 -111
  88. data/maps/alalc-kat-Geor-Latn-1997.yaml +0 -146
  89. data/maps/alalc-kor-Hang-Latn-1997.yaml +0 -94
  90. data/maps/alalc-mar-Deva-Latn-1997.yaml +0 -170
  91. data/maps/alalc-mkd-Cyrl-Latn-1997.yaml +0 -114
  92. data/maps/alalc-mkd-Cyrl-Latn-2013.yaml +0 -103
  93. data/maps/alalc-pan-Deva-Latn-1997.yaml +0 -237
  94. data/maps/alalc-rus-Cyrl-Latn-1997.yaml +0 -221
  95. data/maps/alalc-rus-Cyrl-Latn-2012.yaml +0 -162
  96. data/maps/alalc-srp-Cyrl-Latn-1997.yaml +0 -114
  97. data/maps/alalc-srp-Cyrl-Latn-2013.yaml +0 -135
  98. data/maps/alalc-ukr-Cyrl-Latn-1997.yaml +0 -141
  99. data/maps/alalc-ukr-Cyrl-Latn-2011.yaml +0 -16
  100. data/maps/apcbg-bul-Cyrl-Latn-1995.yaml +0 -283
  101. data/maps/bas-rus-Cyrl-Latn-2017-bss.yaml +0 -174
  102. data/maps/bas-rus-Cyrl-Latn-2017-oss.yaml +0 -169
  103. data/maps/bgn-jpn-Hrkt-Latn-1962.yaml +0 -292
  104. data/maps/bgn-kor-Hang-Latn-1943.yaml +0 -31
  105. data/maps/bgn-kor-Kore-Latn-1943.yaml +0 -31
  106. data/maps/bgna-bul-Cyrl-Latn-2006.yaml +0 -208
  107. data/maps/bgna-bul-Cyrl-Latn-2009.yaml +0 -208
  108. data/maps/bgnpcgn-amh-Ethi-Latn-1967.yaml +0 -528
  109. data/maps/bgnpcgn-ara-Arab-Latn-1956.yaml +0 -592
  110. data/maps/bgnpcgn-arm-Armn-Latn-1981.yaml +0 -108
  111. data/maps/bgnpcgn-aze-Cyrl-Latn-1993.yaml +0 -104
  112. data/maps/bgnpcgn-bak-Cyrl-Latn-2007.yaml +0 -184
  113. data/maps/bgnpcgn-bel-Cyrl-Latn-1979.yaml +0 -285
  114. data/maps/bgnpcgn-bul-Cyrl-Latn-1952.yaml +0 -115
  115. data/maps/bgnpcgn-bul-Cyrl-Latn-2013.yaml +0 -38
  116. data/maps/bgnpcgn-ell-Grek-Latn-1962.yaml +0 -701
  117. data/maps/bgnpcgn-ell-Grek-Latn-1996.yaml +0 -19
  118. data/maps/bgnpcgn-jpn-Hrkt-Latn-1976.yaml +0 -257
  119. data/maps/bgnpcgn-kat-Geor-Latn-1981.yaml +0 -127
  120. data/maps/bgnpcgn-kat-Geor-Latn-2009.yaml +0 -42
  121. data/maps/bgnpcgn-kor-Hang-Latn-kn-1945.yaml +0 -253
  122. data/maps/bgnpcgn-kor-Hang-Latn-rok-2011.yaml +0 -48
  123. data/maps/bgnpcgn-kor-Kore-Latn-rok-2011.yaml +0 -48
  124. data/maps/bgnpcgn-mkd-Cyrl-Latn-1981.yaml +0 -159
  125. data/maps/bgnpcgn-mkd-Cyrl-Latn-2013.yaml +0 -190
  126. data/maps/bgnpcgn-nep-Deva-Latn-2011.yaml +0 -200
  127. data/maps/bgnpcgn-per-Arab-Latn-1956.yaml +0 -92
  128. data/maps/bgnpcgn-rus-Cyrl-Latn-1947.yaml +0 -314
  129. data/maps/bgnpcgn-srp-Cyrl-Latn-2005.yaml +0 -166
  130. data/maps/bgnpcgn-ukr-Cyrl-Latn-1965.yaml +0 -162
  131. data/maps/bgnpcgn-ukr-Cyrl-Latn-2019.yaml +0 -208
  132. data/maps/bgnpcgn-zho-Hans-Latn-1979.yaml +0 -7456
  133. data/maps/bis-asm-Beng-Latn-13194-1991.yaml +0 -159
  134. data/maps/bis-ben-Beng-Latn-13194-1991.yaml +0 -156
  135. data/maps/bis-dev-Deva-Latn-13194-1991.yaml +0 -184
  136. data/maps/bis-gjr-Gujr-Latn-13194-1991.yaml +0 -166
  137. data/maps/bis-knd-Knda-Latn-13194-1991.yaml +0 -173
  138. data/maps/bis-mlm-Mlym-Latn-13194-1991.yaml +0 -176
  139. data/maps/bis-ori-Orya-Latn-13194-1991.yaml +0 -160
  140. data/maps/bis-pnj-Guru-Latn-13194-1991.yaml +0 -175
  141. data/maps/bis-tel-Telu-Latn-13194-1991.yaml +0 -170
  142. data/maps/bis-tml-Taml-Latn-13194-1991.yaml +0 -155
  143. data/maps/by-bel-Cyrl-Latn-1998.yaml +0 -168
  144. data/maps/by-bel-Cyrl-Latn-2007.yaml +0 -115
  145. data/maps/dos-nep-Deva-Latn-1997.yaml +0 -33
  146. data/maps/elot-ell-Grek-Latn-743-1982-tl.yaml +0 -684
  147. data/maps/elot-ell-Grek-Latn-743-1982-ts.yaml +0 -680
  148. data/maps/elot-ell-Grek-Latn-743-2001-tl.yaml +0 -19
  149. data/maps/elot-ell-Grek-Latn-743-2001-ts.yaml +0 -31
  150. data/maps/ggg-kat-Geor-Latn-2002.yaml +0 -88
  151. data/maps/gki-bel-Cyrl-Latn-1992.yaml +0 -33
  152. data/maps/gki-bel-Cyrl-Latn-2000.yaml +0 -201
  153. data/maps/gost-rus-Cyrl-Latn-16876-71-1983.yaml +0 -186
  154. data/maps/hk-yue-Hani-Latn-1888.yaml +0 -38497
  155. data/maps/icao-bel-Cyrl-Latn-9303.yaml +0 -136
  156. data/maps/icao-bul-Cyrl-Latn-9303.yaml +0 -118
  157. data/maps/icao-heb-Hebr-Latn-9303.yaml +0 -151
  158. data/maps/icao-mkd-Cyrl-Latn-9303.yaml +0 -117
  159. data/maps/icao-per-Arab-Latn-9303.yaml +0 -103
  160. data/maps/icao-rus-Cyrl-Latn-9303.yaml +0 -117
  161. data/maps/icao-srp-Cyrl-Latn-9303.yaml +0 -117
  162. data/maps/icao-ukr-Cyrl-Latn-9303.yaml +0 -119
  163. data/maps/iso-ara-Arab-Latn-233-1984.yaml +0 -323
  164. data/maps/iso-ell-Grek-Latn-843-1997-t1.yaml +0 -609
  165. data/maps/iso-ell-Grek-Latn-843-1997-t2.yaml +0 -40
  166. data/maps/iso-jpn-Hrkt-Latn-3602-1989.yaml +0 -62
  167. data/maps/iso-rus-Cyrl-Latn-9-1995.yaml +0 -271
  168. data/maps/iso-tha-Thai-Latn-11940-1998.yaml +0 -109
  169. data/maps/kp-kor-Hang-Latn-2002.yaml +0 -901
  170. data/maps/lshk-yue-Hani-Latn-jyutping-1993.yaml +0 -44820
  171. data/maps/mext-jpn-Hrkt-Latn-1954.yaml +0 -411
  172. data/maps/moct-kor-Hang-Latn-2000.yaml +0 -803
  173. data/maps/mofa-jpn-Hrkt-Latn-1989.yaml +0 -541
  174. data/maps/mvd-bel-Cyrl-Latn-2008.yaml +0 -225
  175. data/maps/mvd-bel-Cyrl-Latn-2010.yaml +0 -63
  176. data/maps/mvd-rus-Cyrl-Latn-2008.yaml +0 -109
  177. data/maps/mvd-rus-Cyrl-Latn-2010.yaml +0 -37
  178. data/maps/nil-kor-Hang-Hang-jamo.yaml +0 -11193
  179. data/maps/odni-aze-Cyrl-Latn-2015.yaml +0 -144
  180. data/maps/odni-bel-Cyrl-Latn-2015.yaml +0 -148
  181. data/maps/odni-bul-Cyrl-Latn-2015.yaml +0 -96
  182. data/maps/odni-hin-Deva-Latn-2015.yaml +0 -258
  183. data/maps/odni-kat-Geor-Latn-2015.yaml +0 -87
  184. data/maps/odni-kaz-Cyrl-Latn-2015.yaml +0 -148
  185. data/maps/odni-kir-Cyrl-Latn-2015.yaml +0 -136
  186. data/maps/odni-mkd-Cyrl-Latn-2015.yaml +0 -122
  187. data/maps/odni-rus-Cyrl-Latn-2015.yaml +0 -77
  188. data/maps/odni-srp-Cyrl-Latn-2015.yaml +0 -129
  189. data/maps/odni-tat-Cyrl-Latn-2015.yaml +0 -142
  190. data/maps/odni-tgk-Cyrl-Latn-2015.yaml +0 -148
  191. data/maps/odni-uig-Cyrl-Latn-2015.yaml +0 -138
  192. data/maps/odni-ukr-Cyrl-Latn-2015.yaml +0 -157
  193. data/maps/odni-urd-Arab-Latn-2015.yaml +0 -221
  194. data/maps/odni-uzb-Cyrl-Latn-2015.yaml +0 -166
  195. data/maps/royin-tha-Thai-Latn-1939-generic.yaml +0 -90
  196. data/maps/royin-tha-Thai-Latn-1968.yaml +0 -179
  197. data/maps/royin-tha-Thai-Latn-1999-chained.yaml +0 -180
  198. data/maps/royin-tha-Thai-Latn-1999.yaml +0 -76
  199. data/maps/sac-zho-Hans-Latn-1979.yaml +0 -24759
  200. data/maps/ses-ara-Arab-Latn-1930.yaml +0 -279
  201. data/maps/stategeocadastre-ukr-Cyrl-Latn-1993.yaml +0 -222
  202. data/maps/ua-ukr-Cyrl-Latn-1996.yaml +0 -193
  203. data/maps/un-ara-Arab-Latn-1971.yaml +0 -139
  204. data/maps/un-ara-Arab-Latn-1972.yaml +0 -159
  205. data/maps/un-ara-Arab-Latn-2017.yaml +0 -420
  206. data/maps/un-bel-Cyrl-Latn-2007.yaml +0 -114
  207. data/maps/un-ben-Beng-Latn-2016.yaml +0 -534
  208. data/maps/un-ell-Grek-Latn-1987-tl.yaml +0 -31
  209. data/maps/un-ell-Grek-Latn-1987-ts.yaml +0 -19
  210. data/maps/un-ell-Grek-Latn-phonetic-1987.yaml +0 -780
  211. data/maps/un-mon-Mong-Latn-2013.yaml +0 -99
  212. data/maps/un-nep-Deva-Latn-1972.yaml +0 -163
  213. data/maps/un-rus-Cyrl-Latn-1987.yaml +0 -166
  214. data/maps/un-ukr-Cyrl-Latn-1998.yaml +0 -30
  215. data/maps/ungegn-amh-Ethi-Latn-2016.yaml +0 -575
  216. data/maps/var-jpn-Hrkt-Latn-hepburn-1886.yaml +0 -406
  217. data/maps/var-jpn-Hrkt-Latn-hepburn-1954.yaml +0 -386
  218. data/maps/var-kor-Hang-Latn-mr-1939.yaml +0 -1054
  219. data/maps/var-kor-Kore-Hang-2013.yaml +0 -59754
  220. data/maps/var-kor-Kore-Latn-mr-1939.yaml +0 -36
  221. data/maps/var-tha-Thai-Thai-phonemic.yaml +0 -59
  222. data/maps/var-tha-Thai-Zsym-ipa.yaml +0 -301
  223. data/maps/var-zho-Hani-Latn-1979.yaml +0 -38908
  224. data/spec/interscript/mapping_spec.rb +0 -42
  225. data/spec/interscript_spec.rb +0 -26
  226. data/spec/spec_helper.rb +0 -3
@@ -0,0 +1,18 @@
# Node that wraps a key/value store of map metadata.
class Interscript::Node::MetaData < Interscript::Node
  attr_accessor :data

  # data - backing store for the metadata; a new empty Hash when omitted.
  def initialize(data = {})
    @data = data
  end

  # Writes +value+ under +key+ in the backing store.
  def []=(key, value)
    @data[key] = value
  end

  # Reads the entry stored under +key+.
  def [](key)
    @data[key]
  end

  # Returns a Hash holding this node's class name and its raw data.
  def to_hash
    { class: self.class.to_s, data: @data }
  end
end
@@ -0,0 +1,6 @@
# Common superclass of the rule nodes required below.
class Interscript::Node::Rule < Interscript::Node; end

# Load the concrete rule kinds, in the same order as before.
%w[sub run funcall].each do |rule_kind|
  require "interscript/node/rule/#{rule_kind}"
end
@@ -0,0 +1,18 @@
# Rule node recording a function name together with its keyword arguments.
class Interscript::Node::Rule::Funcall < Interscript::Node::Rule
  attr_accessor :name, :kwargs

  # name   - identifier of the function.
  # kwargs - keyword arguments captured verbatim as a Hash.
  def initialize(name, **kwargs)
    @name = name
    @kwargs = kwargs
  end

  # Returns a Hash with the class name, function name and keyword arguments.
  def to_hash
    { class: self.class.to_s, name: name, kwargs: kwargs }
  end

  # Renders the name followed by the inspected kwargs with the first and
  # last character (the surrounding braces) removed.
  def inspect
    rendered_kwargs = kwargs.inspect
    "#{@name} #{rendered_kwargs[1..-2]}"
  end
end
@@ -0,0 +1,15 @@
# Rule node that refers to a stage.
class Interscript::Node::Rule::Run < Interscript::Node::Rule
  attr_accessor :stage

  # stage - the object this rule refers to; it must respond to #to_hash
  #         for #to_hash below to work.
  def initialize(stage)
    @stage = stage
  end

  # Returns a Hash with the class name and the stage's own hash form.
  def to_hash
    { class: self.class.to_s, stage: stage.to_hash }
  end

  # Renders as "run " followed by the stage's inspect output.
  def inspect
    stage_text = @stage.inspect
    "run #{stage_text}"
  end
end
@@ -0,0 +1,65 @@
# Substitution rule: replaces +from+ with +to+, optionally constrained by
# the surrounding context (before / not_before / after / not_after) and
# carrying an optional priority.
class Interscript::Node::Rule::Sub < Interscript::Node::Rule
  attr_accessor :from, :to
  attr_accessor :before, :not_before, :after, :not_after
  attr_accessor :priority

  # from, to   - converted through Interscript::Node::Item.try_convert,
  #              except that a +to+ of :upcase is stored as that symbol.
  # before, not_before, after, not_after
  #            - optional context items; converted only when given.
  # priority   - stored as passed; added to #max_length when set.
  def initialize(from, to, before: nil, not_before: nil, after: nil, not_after: nil, priority: nil)
    self.from = Interscript::Node::Item.try_convert(from)
    self.to = to == :upcase ? :upcase : Interscript::Node::Item.try_convert(to)

    self.priority = priority

    #raise TypeError, "Can't supply both before and not_before" if before && not_before
    #raise TypeError, "Can't supply both after and not_after" if after && not_after

    self.before = Interscript::Node::Item.try_convert(before) if before
    self.after = Interscript::Node::Item.try_convert(after) if after
    self.not_before = Interscript::Node::Item.try_convert(not_before) if not_before
    self.not_after = Interscript::Node::Item.try_convert(not_after) if not_after
  end

  # Sum of the max_length of +from+ and of every context item that is set,
  # plus the priority when one is set.
  def max_length
    len = from.max_length
    [before, after, not_before, not_after].each do |context|
      len += context.max_length if context
    end
    len += priority if priority
    len
  end

  # Returns a Hash describing the rule. Context entries that are unset
  # appear as nil.
  def to_hash
    # The former second debug line printed `params`, which is not defined in
    # this method and raised NameError whenever $DEBUG was enabled.
    puts from.inspect if $DEBUG
    { class: self.class.to_s,
      from: from.to_hash,
      to: to == :upcase ? :upcase : to.to_hash,
      before: before&.to_hash,
      not_before: not_before&.to_hash,
      after: after&.to_hash,
      not_after: not_after&.to_hash,
      priority: priority }
  end

  # Renders the rule as "sub <from>, <to>[, <option>: <value>...]".
  def inspect
    params = [@from.inspect]
    params << (@to == :upcase ? "upcase" : @to.inspect)
    params << "before: #{@before.inspect}" if @before
    params << "after: #{@after.inspect}" if @after
    params << "not_before: #{@not_before.inspect}" if @not_before
    params << "not_after: #{@not_after.inspect}" if @not_after
    params << "priority: #{@priority.inspect}" if @priority
    "sub " + params.join(", ")
  end
end
@@ -0,0 +1,19 @@
# Sequential group that carries a stage name (":main" unless told otherwise).
class Interscript::Node::Stage < Interscript::Node::Group::Sequential
  attr_accessor :name, :doc_name

  # name - stage identifier; the superclass initializer is invoked with
  #        no arguments after it is stored.
  def initialize(name = :main)
    @name = name
    super()
  end

  # Returns a Hash with the class name, the stage name and the hash form
  # of every child.
  def to_hash
    { class: self.class.to_s,
      name: name,
      children: @children.map(&:to_hash) }
  end

  # Renders "stage { ... }", adding "(name)" after "stage" for any stage
  # other than :main; the body comes from the superclass's inspect.
  def inspect
    label = @name == :main ? nil : "(#{@name})"
    "stage#{label} {\n#{super}\n}"
  end
end
@@ -0,0 +1,15 @@
# Node that accumulates test entries in an array-like store.
class Interscript::Node::Tests < Interscript::Node
  attr_accessor :data

  # data - initial collection of entries; a new empty Array when omitted.
  def initialize(data = [])
    @data = data
  end

  # Appends +pair+ to the collection and returns the collection's own
  # result of <<.
  def <<(pair)
    @data << pair
  end

  # Returns a Hash with the class name and the collected entries.
  def to_hash
    { class: self.class.to_s, data: @data }
  end
end
@@ -0,0 +1,211 @@
# Runtime helpers: alias table, parallel replacement strategies, a stable
# rule sort, and the post-processing functions listed in
# .available_functions.
class Interscript::Stdlib
  # Alias name => regexp source fragment (plain strings for :none/:space).
  ALIASES = {
    any_character: '.',
    none: "",
    space: " ",
    whitespace: "[\\b \\t\\0\\r\\n]",
    boundary: "\\b",
    non_word_boundary: "\\B",
    word: "\\w",
    not_word: "\\W",
    alpha: "[a-zA-Z]",
    not_alpha: "[^a-zA-Z]",
    digit: "\\d",
    not_digit: "\\D",
    line_start: "^",
    line_end: "$",
    string_start: "\\A",
    string_end: "\\z"
  }

  # True for every alias except :none and :space.
  def self.re_only_alias?(a)
    !%i[none space].include?(a)
  end

  # Cache of compiled replacement trees, keyed by the #hash of their source.
  @treecache = {}

  # Builds one alternation regexp in which the i-th entry's first element
  # becomes the named group "_i". Returns the compiled Regexp.
  def self.parallel_regexp_compile(subs_hash)
    source = subs_hash.each_with_index.map do |pair, i|
      "(?<_%d>%s)" % [i, pair[0]]
    end.join("|")
    Regexp.compile(source)
  end

  # Replaces each match of +subs_regexp+ with subs_hash[idx], where idx is
  # the number embedded in the name of the first group that matched.
  def self.parallel_regexp_gsub(string, subs_regexp, subs_hash)
    string.gsub(subs_regexp) do
      lm = Regexp.last_match
      # Group names look like "_12"; drop the underscore to get the index.
      idx = lm.named_captures.compact.keys.first[1..-1].to_i
      subs_hash[idx]
    end
  end

  # Same replacement as .parallel_regexp_gsub, additionally recording every
  # match in the globals $subs_matches / $subs_regexp.
  def self.parallel_regexp_gsub_debug(string, subs_regexp, subs_array)
    # only gathering debug info, test data is available in maps_analyze_staging
    $subs_matches = []
    $subs_regexp = subs_regexp
    string.gsub(subs_regexp) do
      lm = Regexp.last_match
      matched = lm.named_captures.compact.keys.first
      idx = matched[1..-1].to_i
      $subs_matches << { begin: lm.begin(matched), end: lm.end(matched),
                         idx: idx, result: subs_array[idx] }
      subs_array[idx]
    end
  end

  # Turns an enumerable of (from, to) pairs into a Hash; later pairs win.
  def self.parallel_replace_compile_hash(a)
    a.each_with_object({}) { |(from, to), table| table[from] = to }
  end

  # Scans +str+ left to right; at each position the longest key of +h+ that
  # matches is replaced, otherwise one character is copied through.
  #
  # Fixes over the previous version: the length scan now stops at the first
  # (longest) hit instead of continuing with shorter lengths at the already
  # advanced index; an empty table (or one whose longest key is "") returns
  # a copy of +str+ instead of raising or never terminating.
  def self.parallel_replace_hash(str, h)
    max_key_len = h.keys.map(&:length).max
    return str.dup if max_key_len.nil? || max_key_len < 1

    newstr = +""
    len = str.length
    i = 0
    while i < len
      hit = nil
      max_key_len.downto(1) do |checked_len|
        candidate = str[i, checked_len]
        if h[candidate]
          hit = candidate
          break
        end
      end

      if hit
        newstr << h[hit]
        i += hit.length
      else
        newstr << str[i, 1]
        i += 1
      end
    end
    newstr
  end

  # hash can be either a hash or a hash-like array
  #
  # Builds (and caches) a nested Hash keyed by character codepoints; the
  # replacement for a complete key is stored under the nil key of its last
  # node. A "from" may be a single string or an array of strings.
  def self.parallel_replace_compile_tree(hash)
    cache_key = hash.hash
    cached = @treecache[cache_key]
    return cached if cached

    tree = {}
    hash.each do |from, to|
      Array(from).each do |f|
        chars = f.split("")
        branch = tree
        chars[0..-2].each { |c| branch = (branch[c.ord] ||= {}) }
        leaf = (branch[chars.last.ord] ||= {})
        leaf[nil] = to
      end
    end
    @treecache[cache_key] = tree
  end

  # Walks +tree+ from every position of +str+, remembering the longest path
  # that ends in a replacement; unmatched characters are copied through.
  def self.parallel_replace_tree(str, tree)
    newstr = +""
    len = str.length
    i = 0
    while i < len
      branch = tree
      match_len = 0
      repl = nil

      j = 0
      while i + j < len
        code = str[i + j].ord
        break unless branch.include?(code)

        branch = branch[code]
        j += 1
        if branch.include?(nil)
          match_len = j
          repl = branch[nil]
        end
      end

      if match_len > 0
        i += match_len
        newstr << repl
      else
        newstr << str[i]
        i += 1
      end
    end
    newstr
  end

  # Compiles +hash+ into a tree (cached) and applies it to +str+.
  def self.parallel_replace(str, hash)
    tree = parallel_replace_compile_tree(hash)
    parallel_replace_tree(str, tree)
  end

  # On Windows at least, sort_by is non-deterministic. Let's add some determinism
  # to our efforts.
  #
  # Sorts by descending max_length, breaking ties by original position. The
  # [length, index] key keeps the tie-break exact for any collection size
  # (the previous `length * 100000 + idx` key mis-ordered once an index
  # reached 100000).
  def self.deterministic_sort_by_max_length(ary)
    ary.each_with_index.sort_by { |rule, idx| [-rule.max_length, idx] }.map(&:first)
  end

  # Names of the functions implemented in Functions.
  def self.available_functions
    %i[title_case downcase compose decompose separate secryst]
  end

  module Functions
    # Upcases the first character of every line and the character following
    # each occurrence of +word_separator+ (skipped when the separator is '').
    # A String separator is matched literally: Regexp.union escapes it, so
    # separators such as "." or "|" no longer act as regexp syntax. A Regexp
    # separator is used as given.
    def self.title_case(output, word_separator: " ")
      output = output.gsub(/^(.)/, &:upcase)
      unless word_separator == ''
        output = output.gsub(/#{Regexp.union(word_separator)}(.)/, &:upcase)
      end
      output
    end

    def self.downcase(output, _:nil)
      output.downcase
    end

    # Unicode NFC normalization.
    def self.compose(output, _:nil)
      output.unicode_normalize(:nfc)
    end

    # Unicode NFD normalization.
    def self.decompose(output, _:nil)
      output.unicode_normalize(:nfd)
    end

    # Inserts +separator+ between every pair of characters.
    def self.separate(output, separator: " ")
      output.split("").join(separator)
    end

    # Translator instances, one per model file.
    @secryst_models = {}

    # Translates +output+ line by line with a Secryst model.
    # Raises StandardError when the secryst library cannot be loaded.
    def self.secryst(output, model:)
      # Try to load secryst, but don't fail hard if not possible. LoadError
      # is not a StandardError, so the `rescue nil` modifier used previously
      # let it propagate; it has to be rescued by name.
      begin
        require "secryst"
      rescue LoadError
        nil
      end
      unless defined? Secryst
        raise StandardError, "Secryst is not loaded. Please read docs/Usage_with_Secryst.adoc"
      end
      Interscript.secryst_index_locations.each do |remote|
        Secryst::Provisioning.add_remote(remote)
      end
      @secryst_models[model] ||= Secryst::Translator.new(model_file: model)
      output.split("\n").map(&:chomp).map do |i|
        @secryst_models[model].translate(i)
      end.join("\n")
    end
  end
end
@@ -0,0 +1,283 @@
1
+ require 'regexp_parser'
2
+
3
+
# Recursively converts a regexp_parser expression node into a nested array
# of marker symbols and literal strings.
#
# Sub-expressions (when the node has any) are converted first. Node classes
# without a dedicated branch yield [:missing, <class>, ...]. A quantifier
# either repeats the result (interval with equal min and max) or wraps it
# in "<token>_start" / "<token>_stop" markers.
def process(node)
  children = nil
  if node.respond_to?(:expressions) && node.expressions
    children = node.expressions.map { |expr| process(expr) }
  end

  out =
    case node
    when Regexp::Expression::Root
      children
    when Regexp::Expression::Assertion::Lookbehind
      [:lookbehind_start, children, :lookbehind_stop]
    when Regexp::Expression::Assertion::NegativeLookbehind
      [:negative_lookbehind_start, children, :negative_lookbehind_stop]
    when Regexp::Expression::Assertion::Lookahead
      [:lookahead_start, children, :lookahead_stop]
    when Regexp::Expression::Assertion::NegativeLookahead
      [:negative_lookahead_start, children, :negative_lookahead_stop]
    when Regexp::Expression::Group::Capture
      [:capture_start, children, :capture_stop]
    when Regexp::Expression::CharacterSet
      # A set that contains a range gets an extra array wrapper.
      if children.flatten.include?(:range_start) #or children.size > 1
        [:characterset_start, :array_start, children, :array_stop, :characterset_stop]
      else
        [:characterset_start, children, :characterset_stop]
      end
    when Regexp::Expression::Alternation
      [:alternation_start, children, :alternation_stop]
    when Regexp::Expression::Alternative
      [:alternative_start, children, :alternative_stop]
    when Regexp::Expression::CharacterSet::Range
      range_from = node.expressions[0].text
      range_to = node.expressions[1].text
      [:range_start, range_from, :range_mid, range_to, :range_stop]
    when Regexp::Expression::Anchor::WordBoundary
      :boundary
    when Regexp::Expression::Anchor::NonWordBoundary
      :non_word_boundary
    when Regexp::Expression::EscapeSequence::Backspace
      :boundary # most probably boundary
    when Regexp::Expression::CharacterType::Space
      :space
    when Regexp::Expression::Anchor::BeginningOfLine
      :line_start
    when Regexp::Expression::Anchor::EndOfLine
      :line_end
    when Regexp::Expression::CharacterType::Any
      :any_character
    when Regexp::Expression::Literal,
         Regexp::Expression::EscapeSequence::Literal,
         Regexp::Expression::EscapeSequence::Codepoint
      node.text
    when Regexp::Expression::PosixClass
      '[' + node.text + ']'
    when Regexp::Expression::UnicodeProperty::Script
      node.text
    when Regexp::Expression::Backreference::Number # why is there a space before after node.number?
      [:backref_num_start, node.number, :backref_num_stop]
    else
      unknown = [:missing, node.class]
      unknown << children if node.respond_to?(:expressions)
      # TODO add quantifier support
      pp node if node.respond_to?(:quantifier) && node.quantifier
      unknown
    end

  quantifier = node.respond_to?(:quantifier) ? node.quantifier : nil
  return out unless quantifier

  qname = quantifier.token.to_s
  if qname == "interval" && quantifier.max == quantifier.min
    # Exact repetition count: unroll into that many copies.
    [out] * quantifier.max
  else
    [:"#{qname}_start", [out], :"#{qname}_stop"]
  end
end
82
+
# Splits the top-level token list into a rule description.
#
# Every top-level look-around group is removed from the token list and its
# contents stored under :before (lookbehind), :not_before (negative
# lookbehind), :after (lookahead) or :not_after (negative lookahead).
# When a kind occurs more than once, the occurrences are combined into an
# alternation structure. Whatever remains is stored under :from.
#
# node - Array of tokens (nested arrays, symbols, strings); not modified.
#
# Returns a Hash holding only the keys that apply, plus :from.
def process_root(node)
  remaining = node.dup
  root = {}

  {
    before: :lookbehind_start,
    not_before: :negative_lookbehind_start,
    after: :lookahead_start,
    not_after: :negative_lookahead_start
  }.each do |key, marker|
    # Only array elements can be look-around groups; the type check also
    # keeps nil elements from raising.
    found = node.select { |x| x.is_a?(Array) && x[0] == marker }
    next if found.empty?

    root[key] =
      if found.size == 1
        found[0][1]
      else
        [:alternation_start,
         found.map { |x| [:alternative_start, x[1], :alternative_stop] },
         [:alternation_stop]]
      end
    found.each { |group| remaining.delete(group) }
  end

  root[:from] = remaining
  root
end
162
+
# Renders a (possibly nested) token list as DSL source text.
#
# Marker symbols are replaced through a fixed table, strings are emitted
# double-quoted (adjacent strings are merged into one literal), and any
# other token is emitted as its inspect form padded with spaces. " + " is
# inserted between neighbouring operands and ", " between alternatives.
#
# node - nested Array of tokens; it is flattened before rendering.
#
# Returns the rendered String.
def stringify(node)
  tokens = node.flatten
  subs = {
    characterset_start: 'any(',
    characterset_stop: ')',
    array_start: '[',
    array_stop: ']',
    capture_start: 'capture(',
    capture_stop: ')',
    zero_or_one_start: 'maybe(',
    zero_or_one_stop: ')',
    zero_or_more_start: 'maybe_some(',
    zero_or_more_stop: ')',
    one_or_more_start: 'some(',
    one_or_more_stop: ')',
    alternation_start: 'any([',
    alternation_stop: '])',
    alternative_start: '',
    alternative_stop: '',
    boundary: 'boundary',
    non_word_boundary: 'non_word_boundary',
    space: 'space',
    line_start: 'line_start',
    line_end: 'line_end',
    any_character: 'any_character',
    range_start: 'any(',
    range_mid: '..',
    range_stop: ')',
    backref_num_start: 'ref(',
    backref_num_stop: ')'
  }

  # Tokens that end an operand (left_side) or begin one (right_side).
  # Built once here rather than once per token.
  left_side = %i[characterset_stop capture_stop
                 zero_or_one_stop zero_or_more_stop one_or_more_stop
                 boundary non_word_boundary
                 line_start any_character range_stop space
                 backref_num_stop]
  right_side = %i[characterset_start capture_start
                  zero_or_one_start zero_or_more_start one_or_more_start
                  boundary non_word_boundary
                  line_end any_character range_start space
                  backref_num_start]

  str = []
  tokens.each_with_index do |token, idx|
    prev = idx > 0 ? tokens[idx - 1] : nil
    prev_is_string = prev.instance_of?(String)
    token_is_string = token.instance_of?(String)
    prev_closes = left_side.include?(prev)
    token_opens = right_side.include?(token)

    if (prev_is_string && token_opens) ||
       (prev_closes && token_is_string) ||
       (prev_closes && token_opens)
      str << ' + '
    end
    str << ', ' if prev == :alternative_stop && token == :alternative_start

    if subs.include?(token)
      str << subs[token]
    elsif token_is_string
      if prev_is_string
        # Merge into the previous literal: drop its closing quote, append.
        str[-1] = "#{str[-1][0..-2]}#{token}\""
      else
        str << "\"#{token}\""
      end
    else
      str << " #{token.inspect} "
    end
  end
  str.join.gsub('\\\\u', '\\u')
end
235
+
# Renders a rule description as a single "sub ..." DSL line.
#
# root   - Hash with :from and :to plus optional :before, :not_before,
#          :after, :not_after. An empty :from is replaced in place by [""].
# indent - number of spaces to put in front of the line.
#
# When the rendered text still contains a construct matched by one of the
# checks below, the line is turned into a comment with a trailing warning;
# each check that fires wraps the line again.
def stringify_root(root, indent: 0)
  pad = " " * indent
  root[:from] = [""] if root[:from] == []

  str = pad + "sub #{stringify(root[:from])}, #{root[:to]}"
  %i[before not_before after not_after].each do |look|
    context = root[look]
    next unless context

    #apparently it is empty sometimes. iso-mal-Mlym-Latn for example
    rendered = context == [] ? "\"\"" : stringify(context)
    str << ", #{look}: #{rendered}"
  end

  # A leftover symbol literal (":" not preceded by "[" and not followed by
  # a space or "]").
  str = pad + "# #{str} # warning: :" if str =~ /[^\[]:[^ \]]/

  %w[:missing :interval :adding_ranges].each do |marker|
    str = pad + "# #{str} # warning: #{marker} unimplemented" if str.include?(marker)
  end

  if str.include?('zero_or_one')
    str = pad + "# #{str} # warning: zero_or_one"
    puts "str.includes 'zero_or_one'"
    pp root
  end

  str = pad + "# #{str} # warning: :lookahead_start" if str.include?(':lookahead_start')
  str
end
267
+
# When run directly: convert every regexp listed in
# docs/utils/regexp_examples.txt and print each intermediate form.
if __FILE__ == $0
  # File.read closes the file itself; the earlier File.open(...).read left
  # the handle open.
  examples = File.read(__dir__+"/../../docs/utils/regexp_examples.txt")
  examples = examples.gsub(/([^\\^])\\u/, '\\1\\\\u').gsub(/\\\\b/, '\b')

  examples.split("\n").each do |r|
    puts r
    tree = Regexp::Parser.parse(r, 'ruby/2.1')
    conv = process(tree)
    pp conv
    root = process_root(conv)
    pp root
    # Placeholder replacement so a complete rule line can be rendered.
    root[:to] = ['X']
    puts stringify_root(root)
    puts "\n\n"
  end
end