interscript 0.1.4 → 2.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (183) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +11 -0
  3. data/.rspec +3 -0
  4. data/Gemfile +29 -0
  5. data/LICENSE.adoc +31 -0
  6. data/README.md +3 -0
  7. data/Rakefile +53 -0
  8. data/bin/console +14 -0
  9. data/bin/interscript +3 -39
  10. data/bin/maps_analyze_staging +168 -0
  11. data/bin/maps_debug_compilers +58 -0
  12. data/bin/maps_debug_ordering +88 -0
  13. data/bin/maps_debug_ruby_compile +24 -0
  14. data/bin/maps_debug_step_by_step +44 -0
  15. data/bin/maps_optimize_order +112 -0
  16. data/bin/maps_v1_analyze_regexps +45 -0
  17. data/bin/maps_v1_to_v2 +426 -0
  18. data/exe/interscript +6 -0
  19. data/interscript.gemspec +31 -0
  20. data/lib/interscript.rb +76 -128
  21. data/lib/interscript/command.rb +6 -5
  22. data/lib/interscript/compiler.rb +22 -0
  23. data/lib/interscript/compiler/javascript.rb +292 -0
  24. data/lib/interscript/compiler/ruby.rb +262 -0
  25. data/lib/interscript/dsl.rb +67 -0
  26. data/lib/interscript/dsl/aliases.rb +23 -0
  27. data/lib/interscript/dsl/document.rb +46 -0
  28. data/lib/interscript/dsl/group.rb +45 -0
  29. data/lib/interscript/dsl/group/parallel.rb +6 -0
  30. data/lib/interscript/dsl/items.rb +89 -0
  31. data/lib/interscript/dsl/metadata.rb +26 -0
  32. data/lib/interscript/dsl/stage.rb +6 -0
  33. data/lib/interscript/dsl/symbol_mm.rb +11 -0
  34. data/lib/interscript/dsl/tests.rb +12 -0
  35. data/lib/interscript/interpreter.rb +251 -0
  36. data/lib/interscript/node.rb +25 -0
  37. data/lib/interscript/node/alias_def.rb +15 -0
  38. data/lib/interscript/node/dependency.rb +13 -0
  39. data/lib/interscript/node/document.rb +45 -0
  40. data/lib/interscript/node/group.rb +34 -0
  41. data/lib/interscript/node/group/parallel.rb +9 -0
  42. data/lib/interscript/node/group/sequential.rb +2 -0
  43. data/lib/interscript/node/item.rb +52 -0
  44. data/lib/interscript/node/item/alias.rb +42 -0
  45. data/lib/interscript/node/item/any.rb +61 -0
  46. data/lib/interscript/node/item/capture.rb +50 -0
  47. data/lib/interscript/node/item/group.rb +51 -0
  48. data/lib/interscript/node/item/repeat.rb +40 -0
  49. data/lib/interscript/node/item/stage.rb +23 -0
  50. data/lib/interscript/node/item/string.rb +51 -0
  51. data/lib/interscript/node/metadata.rb +18 -0
  52. data/lib/interscript/node/rule.rb +6 -0
  53. data/lib/interscript/node/rule/funcall.rb +18 -0
  54. data/lib/interscript/node/rule/run.rb +15 -0
  55. data/lib/interscript/node/rule/sub.rb +65 -0
  56. data/lib/interscript/node/stage.rb +19 -0
  57. data/lib/interscript/node/tests.rb +15 -0
  58. data/lib/interscript/stdlib.rb +211 -0
  59. data/lib/interscript/utils/regexp_converter.rb +283 -0
  60. data/lib/interscript/version.rb +1 -1
  61. data/requirements.txt +1 -0
  62. metadata +73 -223
  63. data/README.adoc +0 -297
  64. data/bin/rspec +0 -29
  65. data/lib/g2pwrapper.py +0 -34
  66. data/lib/interscript/mapping.rb +0 -125
  67. data/lib/model-7 +0 -0
  68. data/lib/tha-pt-b-7 +0 -0
  69. data/maps/acadsin-zho-Hani-Latn-2002.yaml +0 -38912
  70. data/maps/alalc-aze-Cyrl-Latn-1997.yaml +0 -141
  71. data/maps/alalc-bel-cyrl-latn-1997.yaml +0 -125
  72. data/maps/alalc-ben-Beng-Latn-2017.yaml +0 -130
  73. data/maps/alalc-bul-Cyrl-Latn-1997.yaml +0 -94
  74. data/maps/alalc-ell-Grek-Latn-1997.yaml +0 -625
  75. data/maps/alalc-ell-Grek-Latn-2010.yaml +0 -628
  76. data/maps/alalc-kat-Geok-Latn-1997.yaml +0 -112
  77. data/maps/alalc-kat-Geor-Latn-1997.yaml +0 -146
  78. data/maps/alalc-kor-Hang-Latn-1997.yaml +0 -94
  79. data/maps/alalc-mkd-Cyrl-Latn-2013.yaml +0 -103
  80. data/maps/alalc-mkd-cyrl-latn-1997.yaml +0 -114
  81. data/maps/alalc-rus-Cyrl-Latn-1997.yaml +0 -222
  82. data/maps/alalc-rus-Cyrl-Latn-2012.yaml +0 -162
  83. data/maps/alalc-srp-Cyrl-Latn-1997.yaml +0 -114
  84. data/maps/alalc-srp-cyrl-latn-2013.yaml +0 -135
  85. data/maps/alalc-ukr-Cyrl-Latn-1997.yaml +0 -141
  86. data/maps/alalc-ukr-Cyrl-Latn-2011.yaml +0 -16
  87. data/maps/apcbg-bul-Cyrl-Latn-1995.yaml +0 -283
  88. data/maps/bas-rus-Cyrl-Latn-2017-bss.yaml +0 -175
  89. data/maps/bas-rus-Cyrl-Latn-2017-oss.yaml +0 -169
  90. data/maps/bgn-jpn-Hrkt-Latn-1962.yaml +0 -294
  91. data/maps/bgn-kor-Hang-Latn-1943.yaml +0 -31
  92. data/maps/bgn-kor-Kore-Latn-1943.yaml +0 -31
  93. data/maps/bgna-bul-Cyrl-Latn-2006.yaml +0 -208
  94. data/maps/bgna-bul-Cyrl-Latn-2009.yaml +0 -208
  95. data/maps/bgnpcgn-arm-Armn-Latn-1981.yaml +0 -108
  96. data/maps/bgnpcgn-aze-Cyrl-Latn-1993.yaml +0 -104
  97. data/maps/bgnpcgn-bak-Cyrl-Latn-2007.yaml +0 -184
  98. data/maps/bgnpcgn-bel-cyrl-latn-1979.yaml +0 -285
  99. data/maps/bgnpcgn-bul-Cyrl-Latn-1952.yaml +0 -115
  100. data/maps/bgnpcgn-bul-Cyrl-Latn-2013.yaml +0 -38
  101. data/maps/bgnpcgn-chn-Hans-Latn-1979.yaml +0 -7456
  102. data/maps/bgnpcgn-ell-Grek-Latn-1962.yaml +0 -702
  103. data/maps/bgnpcgn-ell-Grek-Latn-1996.yaml +0 -20
  104. data/maps/bgnpcgn-jpn-Hrkt-Latn-1976.yaml +0 -257
  105. data/maps/bgnpcgn-kat-Geor-Latn-1981.yaml +0 -127
  106. data/maps/bgnpcgn-kat-Geor-Latn-2009.yaml +0 -43
  107. data/maps/bgnpcgn-kor-Hang-Latn-kn-1945.yaml +0 -253
  108. data/maps/bgnpcgn-kor-Hang-Latn-rok-2011.yaml +0 -48
  109. data/maps/bgnpcgn-kor-Kore-Latn-rok-2011.yaml +0 -48
  110. data/maps/bgnpcgn-mkd-Cyrl-Latn-1981.yaml +0 -159
  111. data/maps/bgnpcgn-mkd-Cyrl-Latn-2013.yaml +0 -190
  112. data/maps/bgnpcgn-per-Arab-Latn-1956.yaml +0 -93
  113. data/maps/bgnpcgn-rus-Cyrl-Latn-1947.yaml +0 -314
  114. data/maps/bgnpcgn-srp-Cyrl-Latn-2005.yaml +0 -166
  115. data/maps/bgnpcgn-ukr-Cyrl-Latn-1965.yaml +0 -163
  116. data/maps/bgnpcgn-ukr-Cyrl-Latn-2019.yaml +0 -208
  117. data/maps/by-bel-Cyrl-Latn-1998.yaml +0 -168
  118. data/maps/by-bel-Cyrl-Latn-2007.yaml +0 -115
  119. data/maps/elot-ell-Grek-Latn-743-1982-tl.yaml +0 -685
  120. data/maps/elot-ell-Grek-Latn-743-1982-ts.yaml +0 -681
  121. data/maps/elot-ell-Grek-Latn-743-2001-tl.yaml +0 -20
  122. data/maps/elot-ell-Grek-Latn-743-2001-ts.yaml +0 -32
  123. data/maps/ggg-kat-Geor-Latn-2002.yaml +0 -89
  124. data/maps/gki-bel-cyrl-latn-1992.yaml +0 -33
  125. data/maps/gki-bel-cyrl-latn-2000.yaml +0 -201
  126. data/maps/gost-rus-cyrl-latn-16876-71-1983.yaml +0 -186
  127. data/maps/hk-yue-Hani-Latn-1888.yaml +0 -38497
  128. data/maps/icao-bel-Cyrl-Latn-9303.yaml +0 -141
  129. data/maps/icao-bul-Cyrl-Latn-9303.yaml +0 -122
  130. data/maps/icao-heb-Hebr-Latn-9303.yaml +0 -151
  131. data/maps/icao-mkd-Cyrl-Latn-9303.yaml +0 -117
  132. data/maps/icao-per-Arab-Latn-9303.yaml +0 -104
  133. data/maps/icao-rus-Cyrl-Latn-9303.yaml +0 -118
  134. data/maps/icao-srp-Cyrl-Latn-9303.yaml +0 -117
  135. data/maps/icao-ukr-Cyrl-Latn-9303.yaml +0 -120
  136. data/maps/iso-ell-Grek-Latn-843-1997-t1.yaml +0 -610
  137. data/maps/iso-ell-Grek-Latn-843-1997-t2.yaml +0 -41
  138. data/maps/iso-jpn-Hrkt-Latn-3602-1989.yaml +0 -62
  139. data/maps/iso-rus-Cyrl-Latn-9-1995.yaml +0 -272
  140. data/maps/iso-tha-Thai-Latn-11940-1998.yaml +0 -109
  141. data/maps/kp-kor-Hang-Latn-2002.yaml +0 -901
  142. data/maps/lshk-yue-Hani-Latn-jyutping-1993.yaml +0 -44820
  143. data/maps/mext-jpn-Hrkt-Latn-1954.yaml +0 -411
  144. data/maps/moct-kor-Hang-Latn-2000.yaml +0 -803
  145. data/maps/mofa-jpn-Hrkt-Latn-1989.yaml +0 -541
  146. data/maps/mvd-bel-Cyrl-Latn-2008.yaml +0 -225
  147. data/maps/mvd-bel-Cyrl-Latn-2010.yaml +0 -63
  148. data/maps/mvd-rus-Cyrl-Latn-2008.yaml +0 -110
  149. data/maps/mvd-rus-Cyrl-Latn-2010.yaml +0 -37
  150. data/maps/nil-kor-Hang-Hang-jamo.yaml +0 -11193
  151. data/maps/odni-bel-Cyrl-Latn-2015.yaml +0 -148
  152. data/maps/odni-bul-Cyrl-Latn-2015.yaml +0 -96
  153. data/maps/odni-kat-Geor-Latn-2015.yaml +0 -88
  154. data/maps/odni-rus-Cyrl-Latn-2015.yaml +0 -77
  155. data/maps/odni-srp-Cyrl-Latn-2015.yaml +0 -129
  156. data/maps/odni-ukr-Cyrl-Latn-2015.yaml +0 -157
  157. data/maps/odni-uzb-Cyrl-Latn-2015.yaml +0 -167
  158. data/maps/royin-tha-Thai-Latn-1939-generic.yaml +0 -90
  159. data/maps/royin-tha-Thai-Latn-1968.yaml +0 -179
  160. data/maps/royin-tha-Thai-Latn-1999-chained.yaml +0 -180
  161. data/maps/royin-tha-Thai-Latn-1999.yaml +0 -76
  162. data/maps/sac-zho-Hans-Latn-1979.yaml +0 -24759
  163. data/maps/stategeocadastre-ukr-Cyrl-Latn-1993.yaml +0 -222
  164. data/maps/ua-ukr-Cyrl-Latn-1996.yaml +0 -193
  165. data/maps/un-bel-Cyrl-Latn-2007.yaml +0 -114
  166. data/maps/un-ben-Beng-Latn-2016.yaml +0 -534
  167. data/maps/un-ell-Grek-Latn-1987-tl.yaml +0 -32
  168. data/maps/un-ell-Grek-Latn-1987-ts.yaml +0 -20
  169. data/maps/un-ell-Grek-Latn-phonetic-1987.yaml +0 -780
  170. data/maps/un-mon-Mong-Latn-2013.yaml +0 -93
  171. data/maps/un-rus-Cyrl-Latn-1987.yaml +0 -166
  172. data/maps/un-ukr-cyrl-latn-1998.yaml +0 -30
  173. data/maps/var-jpn-Hrkt-Latn-hepburn-1886.yaml +0 -406
  174. data/maps/var-jpn-Hrkt-Latn-hepburn-1954.yaml +0 -386
  175. data/maps/var-kor-Hang-Latn-mr-1939.yaml +0 -1054
  176. data/maps/var-kor-Kore-Hang-2013.yaml +0 -59754
  177. data/maps/var-kor-Kore-Latn-mr-1939.yaml +0 -37
  178. data/maps/var-tha-Thai-Thai-phonemic.yaml +0 -59
  179. data/maps/var-tha-Thai-Zsym-ipa.yaml +0 -301
  180. data/maps/var-zho-Hani-Latn-1979.yaml +0 -38908
  181. data/spec/interscript/mapping_spec.rb +0 -42
  182. data/spec/interscript_spec.rb +0 -26
  183. data/spec/spec_helper.rb +0 -3
data/exe/interscript ADDED
@@ -0,0 +1,6 @@
# Command-line entry point: hands the arguments over to Interscript::Command.
require 'interscript/command'

# A non-empty argument list whose first entry is not a key of
# Interscript::Command.all_tasks gets :translit put in front of it, so that
# `translit` becomes the task that is run.
first_names_task = ARGV.empty? || Interscript::Command.all_tasks.key?(ARGV.first)
ARGV.unshift :translit unless first_names_task

Interscript::Command.start ARGV
data/interscript.gemspec ADDED
@@ -0,0 +1,31 @@
# Gem specification for interscript; the version comes from lib/interscript/version.
require_relative 'lib/interscript/version'

Gem::Specification.new do |spec|
  spec.name        = "interscript"
  spec.version     = Interscript::VERSION
  spec.summary     = "Interoperable script conversion systems"
  spec.description = "Interoperable script conversion systems"
  spec.authors     = ["Ribose Inc."]
  spec.email       = ["open.source@ribose.com"]

  spec.date     = "2019-11-17"
  spec.homepage = "https://www.interscript.com"
  spec.license  = "BSD-2-Clause"

  spec.required_ruby_version = Gem::Requirement.new(">= 2.3.0")

  spec.metadata["homepage_uri"]    = spec.homepage
  spec.metadata["source_code_uri"] = "https://github.com/interscript/interscript"

  # Package every file tracked by git (listed by `git ls-files -z`, run from
  # this file's directory), leaving out anything under test/, spec/ or features/.
  spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
    tracked = `git ls-files -z`.split("\x0")
    tracked.reject { |path| path.match(%r{^(test|spec|features)/}) }
  end

  # Every packaged file under exe/ is exposed as an executable.
  spec.bindir        = "exe"
  spec.executables   = spec.files.grep(%r{^exe/}).map { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  spec.add_dependency "thor"
  spec.add_dependency "interscript-maps"
end
data/lib/interscript.rb CHANGED
@@ -1,163 +1,111 @@
1
- # frozen_string_literal: true
2
-
1
+ require "interscript/version"
3
2
  require "yaml"
4
- require "interscript/mapping"
5
3
 
6
- # Transliteration
7
4
  module Interscript
5
+ class MapNotFoundError < StandardError; end
8
6
 
9
7
  class << self
10
- def root_path
11
- @root_path ||= Pathname.new(File.dirname(__dir__))
8
+ def load_path
9
+ @load_path ||= ['.', *Interscript.map_locations]
12
10
  end
13
11
 
14
- def transliterate_file(system_code, input_file, output_file, maps)
15
- input = File.read(input_file)
16
- output = transliterate(system_code, input, maps)
12
+ def locate map_name
13
+ map_name = map_aliases[map_name] if map_aliases.include? map_name
17
14
 
18
- File.open(output_file, 'w') do |f|
19
- f.puts(output)
15
+ load_path.each do |i|
16
+ # iml is an extension for a library, imp for a map
17
+ ["iml", "imp"].each do |ext|
18
+ f = File.expand_path("#{map_name}.#{ext}", i)
19
+ return f if File.exist?(f)
20
+ end
20
21
  end
21
- puts "Output written to: #{output_file}"
22
+ raise MapNotFoundError, "Couldn't locate #{map_name}"
22
23
  end
23
24
 
24
- def import_python_modules
25
- begin
26
- pyimport :g2pwrapper
27
- rescue
28
- pyimport :sys
29
- sys.path.append(root_path.to_s+"/lib/")
30
- pyimport :g2pwrapper
31
- end
25
+ def parse(map_name)
26
+ Interscript::DSL.parse(map_name)
32
27
  end
33
28
 
34
- def external_process(process_name, string)
35
- import_python_modules
36
- case process_name
37
- when 'sequitur.pythainlp_lexicon'
38
- return g2pwrapper.transliterate('pythainlp_lexicon', string)
39
- when 'sequitur.wiktionary_phonemic'
40
- return g2pwrapper.transliterate('wiktionary_phonemic', string)
41
- else
42
- puts "Invalid Process"
43
- end
29
+ def load(system_code, maps={}, compiler: Interscript::Interpreter)
30
+ maps[[system_code, compiler.name]] ||= compiler.(system_code)
44
31
  end
45
32
 
46
- def transliterate(system_code, string, maps={})
47
- if (!maps.has_key?system_code)
48
- maps[system_code] = Interscript::Mapping.for(system_code)
49
- end
50
- # mapping = Interscript::Mapping.for(system_code)
51
- mapping = maps[system_code]
33
+ # Transliterates the string.
34
+ def transliterate(system_code, string, maps={}, compiler: Interscript::Interpreter)
35
+ # The current best implementation is Interpreter
36
+ load(system_code, maps, compiler: compiler).(string)
37
+ end
52
38
 
39
+ # Gives each possible value of the transliteration.
40
+ def transliterate_each(system_code, string, maps={}, &block)
41
+ load(system_code, maps).(string, each: true, &block)
42
+ end
53
43
 
54
- # First, apply chained transliteration as specified in the list `chain`
55
- chain = mapping.chain.dup
56
- while chain.length > 0
57
- string = transliterate(chain.shift, string, maps)
58
- end
44
+ def transliterate_file(system_code, input_file, output_file, maps={})
45
+ input = File.read(input_file)
46
+ output = transliterate(system_code, input, maps)
59
47
 
60
- # Then, apply the rest of the map
61
- separator = mapping.character_separator || ""
62
- word_separator = mapping.word_separator || ""
63
- title_case = mapping.title_case
64
- downcase = mapping.downcase
65
-
66
- # charmap = mapping.characters&.sort_by { |k, _v| k.size }&.reverse&.to_h
67
- # dictmap = mapping.dictionary&.sort_by { |k, _v| k.size }&.reverse&.to_h
68
- charmap = mapping.characters_hash
69
- dictmap = mapping.dictionary_hash
70
- trie = mapping.dictionary_trie
71
-
72
- # Segmentation
73
- string = external_process(mapping.segmentation, string) if mapping.segmentation
74
-
75
- # Transliteration/Transcription
76
- string = external_process(mapping.transcription, string) if mapping.transcription
77
-
78
- pos = 0
79
- while pos < string.to_s.size
80
- m = 0
81
- wordmatch = ""
82
-
83
- # Using Trie, find the longest matching substring
84
- while (pos + m < string.to_s.size) && (trie.partial_word?string[pos..pos+m])
85
- wordmatch = string[pos..pos+m] if trie.word?string[pos..pos+m]
86
- m += 1
87
- end
88
- m = wordmatch.length
89
- if m > 0
90
- repl = dictmap[string[pos..pos+m-1]]
91
- string[pos..pos+m-1] = repl
92
- pos += repl.length
93
- else
94
- pos += 1
95
- end
48
+ File.open(output_file, 'w') do |f|
49
+ f.puts(output)
96
50
  end
97
51
 
98
- output = string.clone
99
- offsets = Array.new string.to_s.size, 1
100
-
101
- # mapping.rules.each do |r|
102
- # string.to_s.scan(/#{r['pattern']}/) do |matches|
103
- # match = Regexp.last_match
104
- # pos = match.offset(0).first
105
- # result = r['result'].clone
106
- # matches.each.with_index { |v, i| result.sub!(/\\#{i + 1}/, v) } if matches.is_a? Array
107
- # result.upcase! if up_case_around?(string, pos)
108
- # output[offsets[0...pos].sum, match[0].size] = result
109
- # offsets[pos] += result.size - match[0].size
110
- # end
111
- # end
112
- mapping.rules.each do |r|
113
- output.gsub!(/#{r['pattern']}/, r['result'])
114
- end
52
+ puts "Output written to: #{output_file}"
53
+ output_file
54
+ end
115
55
 
116
- charmap.each do |k, v|
117
- while (match = output&.match(/#{k}/))
118
- pos = match.offset(0).first
119
- result = !downcase && up_case_around?(output, pos) ? v.upcase : v
120
- result = result[0] if result.is_a?(Array) # if more than one, choose the first one
121
- output[pos, match[0].size] = add_separator(separator, pos, result)
122
- end
123
- end
56
+ def map_gems
57
+ @map_gems ||= Gem.find_latest_files('interscript-maps.yaml').map do |i|
58
+ [i, YAML.load_file(i)]
59
+ end.to_h
60
+ end
124
61
 
125
- mapping.postrules.each do |r|
126
- output.gsub!(/#{r['pattern']}/, r['result'])
127
- end
62
+ def map_locations
63
+ @map_locations ||= map_gems.map do |i,v|
64
+ paths = v["paths"].dup
65
+ paths += v["staging"] if ENV["INTERSCRIPT_STAGING"] && v["staging"]
128
66
 
129
- if output
130
- output.sub!(/^(.)/, &:upcase) if title_case
131
- if word_separator != ''
132
- output.gsub!(/#{word_separator}#{separator}/,word_separator)
133
- output.gsub!(/#{word_separator}(.)/, &:upcase) if title_case
67
+ paths.map do |j|
68
+ File.expand_path(j, File.dirname(i))
134
69
  end
135
- end
70
+ end.flatten
71
+ end
136
72
 
137
- output ? output.unicode_normalize : output
73
+ def secryst_index_locations
74
+ @secryst_index_locations ||= map_gems.map do |i,v|
75
+ v["secryst-models"]
76
+ end.compact.flatten
138
77
  end
139
78
 
140
- private
79
+ def map_aliases
80
+ return @map_aliases if @map_aliases
141
81
 
142
- def add_separator(separator, pos, result)
143
- pos == 0 ? result : separator + result
82
+ @map_aliases = {}
83
+ map_gems.each do |i,v|
84
+ (v["aliases"] || {}).each do |code, value|
85
+ value.each do |al, map|
86
+ @map_aliases[al] = map["alias_to"]
87
+ end
88
+ end
89
+ end
90
+ @map_aliases
144
91
  end
145
92
 
146
- def up_case_around?(string, pos)
147
- return false if string[pos] == string[pos].downcase
93
+ # List all possible maps to use
94
+ def maps(basename: true, load_path: false, select: "*", libraries: false)
95
+ paths = load_path ? Interscript.load_path : Interscript.map_locations
96
+ ext = libraries ? "iml" : "imp"
148
97
 
149
- i = pos - 1
150
- i -= 1 while i.positive? && string[i] !~ /[[:alpha:]]/
151
- before = i >= 0 && i < pos ? string[i].to_s.strip : ''
98
+ imps = paths.map { |i| Dir["#{i}/#{select}.#{ext}"] }.flatten
152
99
 
153
- i = pos + 1
154
- i += 1 while i < string.size - 1 && string[i] !~ /[[:alpha:]]/
155
- after = i > pos ? string[i].to_s.strip : ''
156
-
157
- before_uc = !before.empty? && before == before.upcase
158
- after_uc = !after.empty? && after == after.upcase
159
- # before_uc && (after.empty? || after_uc) || after_uc && (before.empty? || before_uc)
160
- before_uc || after_uc
100
+ basename ? imps.map { |j| File.basename(j, ".#{ext}") } : imps
161
101
  end
162
102
  end
163
103
  end
104
+
105
+ require 'interscript/stdlib'
106
+
107
+ require "interscript/compiler"
108
+ require "interscript/interpreter"
109
+
110
+ require 'interscript/dsl'
111
+ require 'interscript/node'
data/lib/interscript/command.rb CHANGED
@@ -1,16 +1,18 @@
1
1
  require 'thor'
2
2
  require 'interscript'
3
-
3
+ require 'json'
4
4
  module Interscript
5
5
  # Command line interface
6
6
  class Command < Thor
7
7
  desc '<file>', 'Transliterate text'
8
8
  option :system, aliases: '-s', required: true, desc: 'Transliteration system'
9
9
  option :output, aliases: '-o', required: false, desc: 'Output file'
10
+ # Was this option really well thought out? The last parameter is a cache, isn't it?
11
+ #option :map, aliases: '-m', required: false, default: "{}", desc: 'Transliteration mapping json'
10
12
 
11
13
  def translit(input)
12
14
  if options[:output]
13
- Interscript.transliterate_file(options[:system], input, options[:output])
15
+ Interscript.transliterate_file(options[:system], input, options[:output]) #, JSON.parse(options[:map]))
14
16
  else
15
17
  puts Interscript.transliterate(options[:system], IO.read(input))
16
18
  end
@@ -18,9 +20,8 @@ module Interscript
18
20
 
19
21
  desc 'list', 'Prints allowed transliteration systems'
20
22
  def list
21
- dir = File.expand_path '../../maps/*.yaml', __dir__
22
- Dir[dir].each do |path|
23
- puts File.basename path, '.yaml'
23
+ Interscript.maps(load_path: true).each do |path|
24
+ puts path
24
25
  end
25
26
  end
26
27
  end
data/lib/interscript/compiler.rb ADDED
@@ -0,0 +1,22 @@
# An Interscript compiler interface.
#
# Subclasses override #compile to build their output from a parsed map and
# #call to execute the compiled map. Both raise NotImplementedError here.
class Interscript::Compiler
  # Holds whatever the concrete compiler stores as its compiled output.
  attr_accessor :code

  # Creates a compiler instance and compiles +map+ with it.
  #
  # @param map a String is first parsed with Interscript::DSL.parse; any
  #   other object is passed to #compile unchanged
  # @param kwargs keyword options forwarded as-is to #compile
  # @return [Interscript::Compiler] the compiler instance, after compilation
  def self.call(map, **kwargs)
    map = Interscript::DSL.parse(map) if String === map

    new.tap { |compiler| compiler.compile(map, **kwargs) }
  end

  # Compiles a parsed map. Abstract.
  #
  # Accepts the same keyword options that .call forwards, so a subclass that
  # has not overridden this fails with NotImplementedError instead of an
  # ArgumentError about unexpected keywords.
  #
  # @raise [NotImplementedError] always
  def compile(map, **_kwargs)
    raise NotImplementedError, "Compile method on #{self.class} is not implemented"
  end

  # Execute a map. Abstract.
  #
  # Accepts any arguments so that calling it the way a concrete compiler is
  # called reports the missing implementation rather than an arity mismatch.
  #
  # @raise [NotImplementedError] always
  def call(*_args, **_kwargs, &_block)
    raise NotImplementedError, "Call method on #{self.class} is not implemented"
  end
end
data/lib/interscript/compiler/javascript.rb ADDED
@@ -0,0 +1,292 @@
1
+ begin
2
+ require 'mini_racer'
3
+ rescue LoadError
4
+ # Ignore loading error
5
+ end
6
+ require 'json'
7
+
8
+ class Interscript::Compiler::Javascript < Interscript::Compiler
# Compiles +map+ into JavaScript source, stores it in @code and returns it.
#
# The generated script wraps an Interscript.define_map call that records the
# map's dependencies, its aliases (string and regexp forms), one function per
# stage and the parallel-replacement caches collected while the stages were
# compiled. It ends with the module.exports / global Interscript dispatch.
#
# @param map the parsed map to compile
# @param debug [Boolean] stored in @debug, which compile_rule consults
# @return [String] the generated source
def compile(map, debug: false)
  @map = map
  @parallel_trees = {}
  @parallel_regexps = {}
  @debug = debug

  parts = [
    "var map = function(Interscript) {",
    "Interscript.define_map(#{map.name.inspect}, function(Interscript, map) {\n",
    "map.dependencies = #{map.dependencies.map(&:full_name).to_json};\n"
  ]

  map.aliases.each do |name, value|
    as_string = compile_item(value.data, map, :str)
    parts << "map.aliases.#{name} = #{as_string};\n"
    as_regexp = '"' + compile_item(value.data, map, :re) + '"'
    parts << "map.aliases_re.#{name} = #{as_regexp};\n"
  end

  # Stages must be compiled before the caches are written out: compile_rule
  # is what fills @parallel_trees and @parallel_regexps.
  map.stages.each do |_name, stage|
    parts << compile_rule(stage, @map, true)
  end

  @parallel_trees.each do |key, tree|
    parts << "map.cache.PTREE_#{key} = #{tree.to_json};\n"
  end

  @parallel_regexps.each do |key, entry|
    literal = "[\"#{entry[0]}\", #{entry[1].to_json}]"
    parts << "map.cache.PRE_#{key} = #{literal};\n"
  end

  parts << "});"
  parts << "};"
  parts << "if (typeof module !== 'undefined') { module.exports = map; }"
  parts << "else if (typeof Interscript !== 'undefined') { map(Interscript); }"
  parts << 'else { throw "We couldn\'t dispatch Interscript from a map!"; }'

  @code = parts.join
end
44
+
# Joins the patterns of +subs_hash+ into a single alternation.
#
# Each entry's first element is wrapped in a named group `_N`, where N is the
# entry's position, and the groups are separated by "|".
#
# @param subs_hash an enumerable of [pattern, replacement] pairs
# @return [String] the combined regexp source
def parallel_regexp_compile(subs_hash)
  named_groups = subs_hash.each_with_index.map do |pair, position|
    format("(?<_%d>%s)", position, pair[0])
  end
  named_groups.join("|")
end
53
+
# Translates one node of a map into JavaScript statements operating on `s`.
#
# Stages become a `map.stages.<name>` function, parallel groups become a
# cached tree or regexp replacement, and Sub / Funcall / Run rules become a
# single assignment to `s`.
#
# @param r the node to compile
# @param map the document used to resolve aliases and stages
# @param wrapper accepted for callers that pass it; not read here
# @return [String] JavaScript source
# @raise [ArgumentError] for a node class that is not handled
def compile_rule(r, map = @map, wrapper = false)
  case r
  when Interscript::Node::Stage
    compile_stage_js(r, map)
  when Interscript::Node::Group::Parallel
    begin
      # Try to build a tree
      compile_parallel_tree_js(r, map)
    rescue
      # Otherwise let's build a megaregexp
      compile_parallel_regexp_js(r, map)
    end
  when Interscript::Node::Rule::Sub
    pattern = %{"#{build_regexp(r, map).gsub("/", "\\\\/")}"}
    replacement = if r.to == :upcase
      'function(a){return a.toUpperCase();}'
    else
      compile_item(r.to, map, :str)
    end
    "s = Interscript.gsub(s, #{pattern}, #{replacement});\n"
  when Interscript::Node::Rule::Funcall
    "s = Interscript.functions.#{r.name}(s, #{r.kwargs.to_json});\n"
  when Interscript::Node::Rule::Run
    stage = if r.stage.map
      map.dep_aliases[r.stage.map].document.imported_stages[r.stage.name]
    else
      map.imported_stages[r.stage.name]
    end
    "s = Interscript.transliterate(#{stage.doc_name.to_json}, s, #{stage.name.to_json});\n"
  else
    raise ArgumentError, "Can't compile unhandled #{r.class}"
  end
end

# Emits the function for one stage: every child rule in order, each followed
# by a map_debug push when @debug is set.
private def compile_stage_js(stage, map)
  lines = ["map.stages.#{stage.name} = function(s) {\n"]
  lines << "globalThis.map_debug = globalThis.map_debug || [];\n" if @debug
  stage.children.each do |child|
    compiled = compile_rule(child, map)
    lines << compiled
    lines << %{globalThis.map_debug.push([s, #{@map.name.to_s.to_json}, #{stage.name.to_s.to_json}, #{child.inspect.to_json}, #{compiled.to_json}]);\n} if @debug
  end
  lines << "return s;\n"
  lines << "};\n"
  lines.join
end

# Compiles a parallel group into a replacement tree cached in @parallel_trees.
# Raises ArgumentError for children that are not plain Sub rules or that
# carry any before/after/not_before/not_after context.
private def compile_parallel_tree_js(group, map)
  pairs = group.children.map do |rule|
    raise ArgumentError, "Can't parallelize #{rule.class}" unless Interscript::Node::Rule::Sub === rule
    %i[before after not_before not_after].each do |context|
      raise ArgumentError, "Can't parallelize rules with :#{context}" if rule.public_send(context)
    end

    [compile_item(rule.from, map, :par), compile_item(rule.to, map, :parstr)]
  end

  key = pairs.hash.abs
  unless @parallel_trees.include?(key)
    @parallel_trees[key] = Interscript::Stdlib.parallel_replace_compile_tree(pairs)
  end
  "s = Interscript.parallel_replace_tree(s, map.cache.PTREE_#{key});\n"
end

# Compiles a parallel group into one combined regexp cached in
# @parallel_regexps, taking the children in the order returned by
# Interscript::Stdlib.deterministic_sort_by_max_length.
private def compile_parallel_regexp_js(group, map)
  ordered = Interscript::Stdlib.deterministic_sort_by_max_length(group.children)
  pairs = ordered.map do |rule|
    raise ArgumentError, "Can't parallelize #{rule.class}" unless Interscript::Node::Rule::Sub === rule

    [build_regexp(rule, map), compile_item(rule.to, map, :parstr)]
  end

  key = pairs.hash.abs
  unless @parallel_regexps.include?(key)
    @parallel_regexps[key] = [parallel_regexp_compile(pairs), pairs.map(&:last)]
  end
  "s = Interscript.parallel_regexp_gsub(s, map.cache.PRE_#{key});\n"
end
124
+
# Builds the regexp source for a substitution rule.
#
# The rule's `from` item is compiled for the :re target and surrounded by
# lookarounds for whichever context items are present: `before` as a
# lookbehind, `not_before` as a negative lookbehind, `not_after` as a
# negative lookahead and `after` as a lookahead.
#
# @param r the rule supplying from/before/after/not_before/not_after
# @param map the document passed on to compile_item
# @return [String] regexp source
def build_regexp(r, map=@map)
  core = compile_item(r.from, map, :re)

  # Compile the optional context items in the order before, after,
  # not_before, not_after; an absent item stays nil.
  before_re, after_re, not_before_re, not_after_re =
    %i[before after not_before not_after].map do |context|
      item = r.public_send(context)
      compile_item(item, map, :re) if item
    end

  wrap = ->(opening, body) { body ? "#{opening}#{body})" : "" }

  wrap.("(?<=", before_re) +
    wrap.("(?<!", not_before_re) +
    core +
    wrap.("(?!", not_after_re) +
    wrap.("(?=", after_re)
end
141
+
# Compiles a single item of a rule for the given target.
#
# Targets handled here:
#   :str    - a JavaScript expression producing the replacement string
#   :re     - a fragment of regexp source
#   :par    - plain data for parallel replacement (a String or an Array)
#   :parstr - same output as :par, but the item is first reduced with
#             #first_string, as it is for :str
#
# @param i the item; converted with Interscript::Node::Item.try_convert
# @param doc the document used to resolve aliases
# @param target [Symbol, nil] one of the targets above
# @return [String, Array, nil] the compiled form; nil when the item/target
#   combination has no branch below
# @raise [ArgumentError] for an unknown alias or an item that is not allowed
#   in the requested target
def compile_item(i, doc=@map, target=nil)
  i = i.first_string if %i[str parstr].include? target
  i = Interscript::Node::Item.try_convert(i)
  # Past this point :parstr is handled exactly like :par.
  target = :par if target == :parstr

  case i
  when Interscript::Node::Item::Alias
    astr = if i.map
      d = doc.dep_aliases[i.map].document
      a = d.imported_aliases[i.name]
      # The dependency map of an alias is i.map (used for the lookup above),
      # so that is what the message reports.
      raise ArgumentError, "Alias #{i.name} of #{i.map} not found" unless a
      "Interscript.get_alias_ALIASTYPE(#{a.doc_name.to_json}, #{a.name.to_json})"
    elsif Interscript::Stdlib::ALIASES.include?(i.name)
      if target != :re && Interscript::Stdlib.re_only_alias?(i.name)
        raise ArgumentError, "Can't use #{i.name} in a #{target} context"
      end
      "Interscript.aliases.#{i.name}"
    else
      a = doc.imported_aliases[i.name]
      raise ArgumentError, "Alias #{i.name} not found" unless a

      "Interscript.get_alias_ALIASTYPE(#{a.doc_name.to_json}, #{a.name.to_json})"
    end

    if target == :str
      astr.sub("_ALIASTYPE(", "(")
    elsif target == :re
      # Closes the surrounding JS string literal, splices the alias in, reopens it.
      %{"+#{astr.sub("_ALIASTYPE(", "_re(")}+"}
    elsif target == :par
      # Parallel mode only looks the name up in the stdlib alias table; the
      # JS expression built above is not used.
      # NOTE(review): map-defined aliases presumably yield nil here - confirm.
      Interscript::Stdlib::ALIASES[i.name]
    end
  when Interscript::Node::Item::String
    if target == :str
      # Replace $1 with \$1, this is weird, but it works!
      i.data.gsub("$", "\\\\$").to_json
    elsif target == :par
      i.data
    elsif target == :re
      Regexp.escape(i.data)
    end
  when Interscript::Node::Item::Group
    if target == :par
      # Every child may compile to several alternatives; build all combinations.
      i.children.map do |j|
        compile_item(j, doc, target)
      end.reduce([""]) do |j,k|
        Array(j).product(Array(k)).map(&:join)
      end
    elsif target == :str
      i.children.map { |j| compile_item(j, doc, target) }.join("+")
    elsif target == :re
      i.children.map { |j| compile_item(j, doc, target) }.join
    end
  when Interscript::Node::Item::CaptureGroup
    if target != :re
      raise ArgumentError, "Can't use a CaptureGroup in a #{target} context"
    end
    "(" + compile_item(i.data, doc, target) + ")"
  when Interscript::Node::Item::Maybe,
       Interscript::Node::Item::MaybeSome,
       Interscript::Node::Item::Some

    resuffix = { Interscript::Node::Item::Maybe     => "?",
                 Interscript::Node::Item::Some      => "+",
                 Interscript::Node::Item::MaybeSome => "*" }[i.class]

    if target == :par
      raise ArgumentError, "Can't use a MaybeSome in a #{target} context"
    end
    # A string that is not exactly one character long needs a non-capturing
    # group so the quantifier applies to all of it.
    if Interscript::Node::Item::String === i.data && i.data.data.length != 1
      "(?:" + compile_item(i.data, doc, target) + ")" + resuffix
    else
      compile_item(i.data, doc, target) + resuffix
    end
  when Interscript::Node::Item::CaptureRef
    if target == :par
      raise ArgumentError, "Can't use CaptureRef in parallel mode"
    elsif target == :re
      "\\\\#{i.id}"
    elsif target == :str
      "\"$#{i.id}\""
    end
  when Interscript::Node::Item::Any
    if target == :str
      raise ArgumentError, "Can't use Any in a string context" # A linter could find this!
    elsif target == :par
      i.data.map(&:data)
    elsif target == :re
      case i.value
      when Array
        data = i.data.map { |j| compile_item(j, doc, target) }
        "(?:"+data.join("|")+")"
      when String
        "[#{Regexp.escape(i.value)}]"
      when Range
        "[#{Regexp.escape(i.value.first)}-#{Regexp.escape(i.value.last)}]"
      end
    end
  end
end
248
+
# State kept on the class itself rather than on instances:
#   maps_loaded - Hash in which #load marks each evaluated map name as true
#   ctx         - the JavaScript context created by the first #load, nil before
class << self
  attr_accessor :maps_loaded, :ctx
end
@maps_loaded = {}
@ctx = nil
255
+
# Evaluates this map's compiled code in the class-wide JavaScript context.
#
# Does nothing when the map name is already marked in maps_loaded. Otherwise
# loads every dependency that is not yet marked, creates the shared context
# on first use (evaluating xregexp.js and stdlib.js into it), evaluates @code
# and marks the map as loaded.
def load
  return if self.class.maps_loaded[@map.name]

  @map.dependencies.each do |dependency|
    dep_name = dependency.full_name
    next if self.class.maps_loaded[dep_name]

    Interscript.load(dep_name, compiler: self.class).load
  end

  js = self.class.ctx || begin
    fresh = MiniRacer::Context.new
    fresh.eval File.read(__dir__+"/../../../../js/test-compiler/xregexp.js")
    # Compatibility with Safari: will come later
    #fresh.eval File.read(__dir__+"/../../../js/xregexp-oniguruma.js")
    fresh.eval File.read(__dir__+"/../../../../js/src/stdlib.js")
    self.class.ctx = fresh
  end

  js.eval @code
  self.class.maps_loaded[@map.name] = true
end
279
+
# Runs the compiled map on +str+ inside the shared JavaScript context.
#
# Makes sure the map is loaded, then evaluates an Interscript.transliterate
# call whose arguments (map name, input, stage) are JSON-encoded.
#
# @param str the text to transliterate
# @param stage the stage to run, :main by default
# @return whatever the context's eval returns for that call
def call(str, stage=:main)
  load
  js_args = [@map.name, str, stage].map(&:to_json).join(", ")
  self.class.ctx.eval "Interscript.transliterate(#{js_args})"
end
284
+
# Returns the debug records collected in the JavaScript context
# (`globalThis.map_debug`), or an empty Array when there are none.
#
# The context only exists once a map has been loaded; before that there is
# nothing to read, so an empty Array is returned instead of calling eval on nil.
def self.read_debug_data
  return [] unless ctx

  ctx.eval "globalThis.map_debug || []"
end

# Clears the debug records held in the JavaScript context.
#
# Without a context there is nothing to clear; an empty Array is returned,
# mirroring the value the reset assigns.
def self.reset_debug_data
  return [] unless ctx

  ctx.eval "globalThis.map_debug = []"
end
292
+ end