interscript 0.1.0 → 0.1.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.adoc +250 -17
- data/bin/interscript +36 -17
- data/bin/rspec +29 -0
- data/bin/setup +8 -0
- data/lib/__pycache__/g2pwrapper.cpython-38.pyc +0 -0
- data/lib/g2pwrapper.py +34 -0
- data/lib/interscript-opal.rb +2 -0
- data/lib/interscript.rb +138 -38
- data/lib/interscript/command.rb +28 -0
- data/lib/interscript/fs.rb +69 -0
- data/lib/interscript/mapping.rb +142 -0
- data/lib/interscript/opal.rb +23 -0
- data/lib/interscript/opal/maps.js.erb +7 -0
- data/lib/interscript/opal_map_translate.rb +12 -0
- data/lib/interscript/version.rb +1 -1
- data/lib/model-7 +0 -0
- data/lib/tha-pt-b-7 +0 -0
- data/maps/acadsin-zho-Hani-Latn-2002.yaml +38912 -0
- data/maps/alalc-aze-Cyrl-Latn-1997.yaml +141 -0
- data/maps/alalc-bel-cyrl-latn-1997.yaml +125 -0
- data/maps/alalc-ben-Beng-Latn-2017.yaml +130 -0
- data/maps/alalc-bul-Cyrl-Latn-1997.yaml +94 -0
- data/maps/alalc-ell-Grek-Latn-1997.yaml +625 -0
- data/maps/alalc-ell-Grek-Latn-2010.yaml +628 -0
- data/maps/alalc-kat-Geok-Latn-1997.yaml +112 -0
- data/maps/alalc-kat-Geor-Latn-1997.yaml +146 -0
- data/maps/alalc-kor-Hang-Latn-1997.yaml +94 -0
- data/maps/alalc-mkd-Cyrl-Latn-2013.yaml +103 -0
- data/maps/alalc-mkd-cyrl-latn-1997.yaml +114 -0
- data/maps/alalc-rus-Cyrl-Latn-1997.yaml +222 -0
- data/maps/alalc-rus-Cyrl-Latn-2012.yaml +162 -0
- data/maps/alalc-srp-Cyrl-Latn-1997.yaml +114 -0
- data/maps/alalc-srp-cyrl-latn-2013.yaml +135 -0
- data/maps/alalc-ukr-Cyrl-Latn-1997.yaml +141 -0
- data/maps/alalc-ukr-Cyrl-Latn-2011.yaml +16 -0
- data/maps/apcbg-bul-Cyrl-Latn-1995.yaml +283 -0
- data/maps/bas-rus-Cyrl-Latn-2017-bss.yaml +175 -0
- data/maps/bas-rus-Cyrl-Latn-2017-oss.yaml +169 -0
- data/maps/bgn-jpn-Hrkt-Latn-1962.yaml +294 -0
- data/maps/bgn-kor-Hang-Latn-1943.yaml +31 -0
- data/maps/bgn-kor-Kore-Latn-1943.yaml +31 -0
- data/maps/bgna-bul-Cyrl-Latn-2006.yaml +208 -0
- data/maps/bgna-bul-Cyrl-Latn-2009.yaml +208 -0
- data/maps/bgnpcgn-arm-Armn-Latn-1981.yaml +108 -0
- data/maps/bgnpcgn-aze-Cyrl-Latn-1993.yaml +104 -0
- data/maps/bgnpcgn-bak-Cyrl-Latn-2007.yaml +184 -0
- data/maps/bgnpcgn-bel-cyrl-latn-1979.yaml +285 -0
- data/maps/bgnpcgn-bul-Cyrl-Latn-1952.yaml +115 -0
- data/maps/bgnpcgn-bul-Cyrl-Latn-2013.yaml +38 -0
- data/maps/bgnpcgn-ell-Grek-Latn-1962.yaml +702 -0
- data/maps/bgnpcgn-ell-Grek-Latn-1996.yaml +20 -0
- data/maps/bgnpcgn-jpn-Hrkt-Latn-1976.yaml +257 -0
- data/maps/bgnpcgn-kat-Geor-Latn-1981.yaml +127 -0
- data/maps/bgnpcgn-kat-Geor-Latn-2009.yaml +43 -0
- data/maps/bgnpcgn-kor-Hang-Latn-kn-1945.yaml +253 -0
- data/maps/bgnpcgn-kor-Hang-Latn-rok-2011.yaml +48 -0
- data/maps/bgnpcgn-kor-Kore-Latn-rok-2011.yaml +48 -0
- data/maps/bgnpcgn-mkd-Cyrl-Latn-1981.yaml +159 -0
- data/maps/bgnpcgn-mkd-Cyrl-Latn-2013.yaml +190 -0
- data/maps/bgnpcgn-per-Arab-Latn-1956.yaml +93 -0
- data/maps/bgnpcgn-rus-Cyrl-Latn-1947.yaml +314 -0
- data/maps/bgnpcgn-srp-Cyrl-Latn-2005.yaml +166 -0
- data/maps/bgnpcgn-ukr-Cyrl-Latn-1965.yaml +163 -0
- data/maps/bgnpcgn-ukr-Cyrl-Latn-2019.yaml +208 -0
- data/maps/bgnpcgn-zho-Hans-Latn-1979.yaml +7456 -0
- data/maps/by-bel-Cyrl-Latn-1998.yaml +168 -0
- data/maps/by-bel-Cyrl-Latn-2007.yaml +115 -0
- data/maps/elot-ell-Grek-Latn-743-1982-tl.yaml +685 -0
- data/maps/elot-ell-Grek-Latn-743-1982-ts.yaml +681 -0
- data/maps/elot-ell-Grek-Latn-743-2001-tl.yaml +20 -0
- data/maps/elot-ell-Grek-Latn-743-2001-ts.yaml +32 -0
- data/maps/ggg-kat-Geor-Latn-2002.yaml +89 -0
- data/maps/gki-bel-cyrl-latn-1992.yaml +33 -0
- data/maps/gki-bel-cyrl-latn-2000.yaml +201 -0
- data/maps/gost-rus-cyrl-latn-16876-71-1983.yaml +186 -0
- data/maps/hk-yue-Hani-Latn-1888.yaml +38497 -0
- data/maps/icao-bel-Cyrl-Latn-9303.yaml +141 -0
- data/maps/icao-bul-Cyrl-Latn-9303.yaml +122 -0
- data/maps/icao-heb-Hebr-Latn-9303.yaml +151 -0
- data/maps/icao-mkd-Cyrl-Latn-9303.yaml +117 -0
- data/maps/icao-per-Arab-Latn-9303.yaml +104 -0
- data/maps/icao-rus-Cyrl-Latn-9303.yaml +118 -0
- data/maps/icao-srp-Cyrl-Latn-9303.yaml +117 -0
- data/maps/icao-ukr-Cyrl-Latn-9303.yaml +120 -0
- data/maps/iso-ell-Grek-Latn-843-1997-t1.yaml +610 -0
- data/maps/iso-ell-Grek-Latn-843-1997-t2.yaml +41 -0
- data/maps/iso-jpn-Hrkt-Latn-3602-1989.yaml +62 -0
- data/maps/iso-rus-Cyrl-Latn-9-1995.yaml +272 -0
- data/maps/iso-tha-Thai-Latn-11940-1998.yaml +109 -0
- data/maps/kp-kor-Hang-Latn-2002.yaml +901 -0
- data/maps/lshk-yue-Hani-Latn-jyutping-1993.yaml +44820 -0
- data/maps/mext-jpn-Hrkt-Latn-1954.yaml +411 -0
- data/maps/moct-kor-Hang-Latn-2000.yaml +803 -0
- data/maps/mofa-jpn-Hrkt-Latn-1989.yaml +541 -0
- data/maps/mvd-bel-Cyrl-Latn-2008.yaml +225 -0
- data/maps/mvd-bel-Cyrl-Latn-2010.yaml +63 -0
- data/maps/mvd-rus-Cyrl-Latn-2008.yaml +110 -0
- data/maps/mvd-rus-Cyrl-Latn-2010.yaml +37 -0
- data/maps/nil-kor-Hang-Hang-jamo.yaml +11193 -0
- data/maps/odni-aze-Cyrl-Latn-2015.yaml +144 -0
- data/maps/odni-bel-Cyrl-Latn-2015.yaml +148 -0
- data/maps/odni-bul-Cyrl-Latn-2015.yaml +96 -0
- data/maps/odni-kat-Geor-Latn-2015.yaml +88 -0
- data/maps/odni-kaz-Cyrl-Latn-2015.yaml +148 -0
- data/maps/odni-kir-Cyrl-Latn-2015.yaml +136 -0
- data/maps/odni-mkd-cyrl-latn-2015.yaml +122 -0
- data/maps/odni-rus-Cyrl-Latn-2015.yaml +77 -0
- data/maps/odni-srp-Cyrl-Latn-2015.yaml +129 -0
- data/maps/odni-tat-Cyrl-Latn-2015.yaml +142 -0
- data/maps/odni-tgk-Cyrl-Latn-2015.yaml +148 -0
- data/maps/odni-uig-Cyrl-Latn-2015.yaml +138 -0
- data/maps/odni-ukr-Cyrl-Latn-2015.yaml +157 -0
- data/maps/odni-uzb-Cyrl-Latn-2015.yaml +167 -0
- data/maps/royin-tha-Thai-Latn-1939-generic.yaml +90 -0
- data/maps/royin-tha-Thai-Latn-1968.yaml +179 -0
- data/maps/royin-tha-Thai-Latn-1999-chained.yaml +180 -0
- data/maps/royin-tha-Thai-Latn-1999.yaml +76 -0
- data/maps/sac-zho-Hans-Latn-1979.yaml +24759 -0
- data/maps/ses-ara-arab-latn-1930.yaml +275 -0
- data/maps/stategeocadastre-ukr-Cyrl-Latn-1993.yaml +222 -0
- data/maps/ua-ukr-Cyrl-Latn-1996.yaml +193 -0
- data/maps/un-ara-Arab-Latn-1971.yaml +127 -0
- data/maps/un-ara-Arab-Latn-1972.yaml +152 -0
- data/maps/un-ara-Arab-Latn-2017.yaml +383 -0
- data/maps/un-bel-Cyrl-Latn-2007.yaml +114 -0
- data/maps/un-ben-Beng-Latn-2016.yaml +534 -0
- data/maps/un-ell-Grek-Latn-1987-tl.yaml +32 -0
- data/maps/un-ell-Grek-Latn-1987-ts.yaml +20 -0
- data/maps/un-ell-Grek-Latn-phonetic-1987.yaml +780 -0
- data/maps/un-mon-Mong-Latn-2013.yaml +93 -0
- data/maps/un-rus-Cyrl-Latn-1987.yaml +166 -0
- data/maps/un-ukr-cyrl-latn-1998.yaml +30 -0
- data/maps/var-jpn-Hrkt-Latn-hepburn-1886.yaml +406 -0
- data/maps/var-jpn-Hrkt-Latn-hepburn-1954.yaml +386 -0
- data/maps/var-kor-Hang-Latn-mr-1939.yaml +1054 -0
- data/maps/var-kor-Kore-Hang-2013.yaml +59754 -0
- data/maps/var-kor-Kore-Latn-mr-1939.yaml +37 -0
- data/maps/var-tha-Thai-Thai-phonemic.yaml +59 -0
- data/maps/var-tha-Thai-Zsym-ipa.yaml +301 -0
- data/maps/var-zho-Hani-Latn-1979.yaml +38908 -0
- data/spec/interscript/mapping_spec.rb +42 -0
- data/spec/interscript_spec.rb +26 -0
- data/spec/spec_helper.rb +3 -0
- metadata +295 -11
data/lib/interscript.rb
CHANGED
@@ -1,53 +1,153 @@
|
|
1
|
-
|
2
|
-
require 'singleton'
|
1
|
+
# frozen_string_literal: true
|
3
2
|
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
SYSTEM_DEFINITIONS_PATH = File.expand_path('../../maps', __FILE__)
|
3
|
+
require "interscript/opal/maps" if RUBY_ENGINE == "opal"
|
4
|
+
require "interscript/mapping"
|
8
5
|
|
9
|
-
|
10
|
-
|
11
|
-
end
|
6
|
+
# Transliteration
|
7
|
+
module Interscript
|
12
8
|
|
13
|
-
|
14
|
-
|
15
|
-
|
9
|
+
# Signals that a requested transliteration system has no usable definition.
class InvalidSystemError < StandardError
end

# Signals an external process name that is not one of the known processes.
class ExternalProcessNotRecognizedError < StandardError
end

# Signals that an external process could not be run.
class ExternalProcessUnavailableError < StandardError
end
|
16
12
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
13
|
+
if RUBY_ENGINE == 'opal'
|
14
|
+
require "interscript/opal"
|
15
|
+
extend Opal
|
16
|
+
else
|
17
|
+
require "interscript/fs"
|
18
|
+
extend Fs
|
21
19
|
end
|
22
20
|
|
23
|
-
|
24
|
-
@systems[system_code] ||= YAML.load_file(File.join(SYSTEM_DEFINITIONS_PATH, "#{system_code}.yaml"))
|
25
|
-
end
|
21
|
+
class << self
|
26
22
|
|
27
|
-
|
28
|
-
|
29
|
-
|
23
|
+
# Transliterates +string+ using the mapping registered under +system_code+.
#
# Processing order: every system listed in the mapping's +chain+, external
# processing, longest-match dictionary replacement, +rules+, the character
# map, +postrules+, then optional title-casing and separator clean-up.
#
# @param system_code [String] identifier handed to Interscript::Mapping.for
# @param string [String, nil] text to convert; the caller's object is never
#   modified in place
# @param maps [Hash] cache of already loaded mappings keyed by system code;
#   a missing entry is loaded and stored in it
# @return [String, nil] the Unicode-normalized result, or nil when there is
#   no text to work on
def transliterate(system_code, string, maps={})
  maps[system_code] = Interscript::Mapping.for(system_code) unless maps.key?(system_code)
  mapping = maps[system_code]

  # First, apply chained transliteration as specified in the list `chain`
  mapping.chain.each do |chained_code|
    string = transliterate(chained_code, string, maps)
  end

  # Then, apply the rest of the map
  separator = mapping.character_separator || ""
  word_separator = mapping.word_separator || ""
  title_case = mapping.title_case
  downcase = mapping.downcase

  charmap = mapping.characters_hash
  dictmap = mapping.dictionary_hash
  trie = mapping.dictionary_trie

  string = external_processing(mapping, string)

  # sub_replace may edit its argument in place, so work on a private copy:
  # the caller's string must stay untouched (and may well be frozen).
  string = string.dup unless string.nil?

  pos = 0
  while pos < string.to_s.size
    wordmatch = ""
    m = 0

    # Using Trie, find the longest matching substring
    while pos + m < string.to_s.size
      candidate = string[pos..pos + m]
      break unless trie.partial_word?(candidate)

      wordmatch = candidate if trie.word?(candidate)
      m += 1
    end

    if wordmatch.empty?
      pos += 1
    else
      repl = dictmap[wordmatch]
      string = sub_replace(string, pos, wordmatch.length, repl)
      # Skip over the replacement so it is not matched against the dictionary again.
      pos += repl.length
    end
  end

  # `string` is already a private copy, so it can be used as the output buffer.
  output = string

  mapping.rules.each do |r|
    next unless output

    output = output.gsub(mkregexp(r["pattern"]), r["result"])
  end

  charmap.each do |k, v|
    # if more than one, choose the first one (before any upcasing, which
    # an Array does not support)
    replacement = v.is_a?(Array) ? v[0] : v
    pattern = /#{k}/

    while (match = output&.match(pattern))
      pos = match.offset(0).first
      result = !downcase && up_case_around?(output, pos) ? replacement.upcase : replacement

      output = sub_replace(
        output,
        pos,
        match[0].size,
        add_separator(separator, pos, result)
      )
    end
  end

  mapping.postrules.each do |r|
    next unless output

    output = output.gsub(mkregexp(r["pattern"]), r["result"])
  end

  return unless output

  output = output.sub(/^(.)/, &:upcase) if title_case
  if word_separator != ''
    # Drop the character separator that directly follows a word separator.
    output = output.gsub(/#{word_separator}#{separator}/u, word_separator)
    output = output.gsub(/#{word_separator}(.)/u, &:upcase) if title_case
  end

  output.unicode_normalize
end
|
128
|
+
|
129
|
+
private
|
130
|
+
|
131
|
+
# Prefixes +result+ with +separator+, except at the very start of the text
# (position 0), where +result+ is returned unchanged.
def add_separator(separator, pos, result)
  return result if pos == 0

  separator + result
end
|
134
|
+
|
135
|
+
# Decides whether the replacement for the character at +pos+ should be
# upcased as a whole.
#
# Returns false when the character at +pos+ equals its own downcased form.
# Otherwise it looks at one neighbour on each side: the scan skips characters
# that do not match ALPHA_REGEXP but stops at the first/last index of the
# string, so the boundary character is used even when it is not a letter.
# The answer is true when either neighbour is non-blank and equal to its own
# upcased form (which also holds for characters that have no case).
def up_case_around?(string, pos)
  current = string[pos]
  return false if current == current.downcase

  alpha = Regexp.new(ALPHA_REGEXP)

  left = pos - 1
  left -= 1 while left.positive? && string[left] !~ alpha
  before = left >= 0 ? string[left].to_s.strip : ''

  right = pos + 1
  right += 1 while right < string.size - 1 && string[right] !~ alpha
  after = string[right].to_s.strip

  return true if !before.empty? && before == before.upcase

  !after.empty? && after == after.upcase
end
|
151
|
+
|
152
|
+
end
|
153
|
+
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'thor'
|
2
|
+
require 'interscript'
|
3
|
+
require 'json'
|
4
|
+
module Interscript
  # Command line interface
  class Command < Thor
    desc '<file>', 'Transliterate text'
    option :system, aliases: '-s', required: true, desc: 'Transliteration system'
    option :output, aliases: '-o', required: false, desc: 'Output file'
    option :map, aliases: '-m', required: false, default: "{}", desc: 'Transliteration mapping json'

    # Transliterates the file at +input+ with the system given by --system.
    # With --output the result is written to that file, otherwise it is
    # printed to standard output.
    #
    # NOTE(review): --map is only forwarded on the --output path; when the
    # result goes to standard output the option is ignored — confirm intent.
    def translit(input)
      if options[:output]
        Interscript.transliterate_file(options[:system], input, options[:output], JSON.parse(options[:map]))
      else
        # File.read, not IO.read: IO.read would spawn a subprocess for an
        # argument that starts with "|".
        puts Interscript.transliterate(options[:system], File.read(input))
      end
    end

    desc 'list', 'Prints allowed transliteration systems'
    # Prints the base name (without ".yaml") of every map file found in the
    # maps directory two levels above this file's directory.
    def list
      pattern = File.expand_path '../../maps/*.yaml', __dir__
      Dir[pattern].each do |path|
        puts File.basename path, '.yaml'
      end
    end
  end
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
# root_path builds a Pathname; make sure the class is loaded.
require "pathname"

module Interscript
  # Helpers used when running on a regular Ruby (non-Opal) engine: string
  # surgery, file based transliteration and the Python-backed external
  # processes.
  module Fs
    ALPHA_REGEXP = '[[:alpha:]]'

    # Returns a copy of +string+ in which the +size+ characters starting at
    # +pos+ are replaced by +repl+. The argument itself is left untouched, so
    # frozen strings are accepted.
    #
    # @param string [String] source text
    # @param pos [Integer] index of the first character to replace
    # @param size [Integer] number of characters to replace
    # @param repl [String] replacement text
    # @return [String] a new string
    def sub_replace(string, pos, size, repl)
      replaced = string.dup
      replaced[pos, size] = repl
      replaced
    end

    # Memoized Pathname two directory levels above this file's directory.
    def root_path
      @root_path ||= Pathname.new(File.join(File.dirname(__dir__), ".."))
    end

    # Reads +input_file+, transliterates its content with +system_code+ and
    # writes the result to +output_file+, announcing the path on standard
    # output.
    #
    # @param maps [Hash] mapping cache forwarded to +transliterate+
    # @return [String] +output_file+
    def transliterate_file(system_code, input_file, output_file, maps={})
      input = File.read(input_file)
      output = transliterate(system_code, input, maps)

      File.open(output_file, 'w') do |f|
        f.puts(output)
      end

      puts "Output written to: #{output_file}"
      output_file
    end

    # Imports the Python module +g2pwrapper+. When the first attempt fails,
    # "<root_path>/lib/" is appended to Python's sys.path and the import is
    # retried once; a failure of the retry propagates to the caller.
    def import_python_modules
      pyimport :g2pwrapper
    rescue StandardError
      pyimport :sys
      sys.path.append(root_path.to_s + "/lib/")
      pyimport :g2pwrapper
    end

    # Runs +string+ through the external process called +process_name+.
    #
    # @raise [ExternalProcessNotRecognizedError] when +process_name+ is not
    #   one of the known process names
    # @raise [ExternalProcessUnavailableError] when loading or calling the
    #   Python wrapper fails
    def external_process(process_name, string)
      # Resolve the name before touching Python, so that an unknown name is
      # reported as such instead of being masked by the rescue below.
      model =
        case process_name
        when 'sequitur.pythainlp_lexicon' then 'pythainlp_lexicon'
        when 'sequitur.wiktionary_phonemic' then 'wiktionary_phonemic'
        else
          raise ExternalProcessNotRecognizedError, "Unknown external process: #{process_name.inspect}"
        end

      begin
        import_python_modules
        g2pwrapper.transliterate(model, string)
      rescue StandardError
        raise ExternalProcessUnavailableError, "External process unavailable: #{process_name}"
      end
    end

    # Applies the mapping's optional external steps, segmentation first and
    # transcription second, and returns the resulting string.
    def external_processing(mapping, string)
      # Segmentation
      string = external_process(mapping.segmentation, string) if mapping.segmentation

      # Transliteration/Transcription
      string = external_process(mapping.transcription, string) if mapping.transcription

      string
    end

    private

    # Compiles a pattern string into a Regexp with the +u+ flag.
    def mkregexp(regexpstring)
      /#{regexpstring}/u
    end
  end
end
|
@@ -0,0 +1,142 @@
|
|
1
|
+
require 'rambling-trie'
|
2
|
+
require 'yaml'
|
3
|
+
require 'json'
|
4
|
+
|
5
|
+
module Interscript

  # One transliteration system definition: metadata, rules, character and
  # dictionary tables, plus lookup structures derived from them.
  class Mapping
    # Definitions nested this many inheritance levels deep are not loaded.
    MAX_INHERIT_DEPTH = 5

    attr_reader(
      :id,
      :url,
      :name,
      :notes,
      :rules,
      :tests,
      :language,
      :postrules,
      :characters,
      :description,
      :authority_id,
      :creation_date,
      :source_script,
      :destination_script,
      :chain,
      :character_separator,
      :word_separator,
      :title_case,
      :downcase,
      :dictionary,
      :characters_hash,
      :dictionary_hash,
      :segmentation,
      :transcription,
      :dictionary_trie
    )

    # @param system_code [String] name of the system; the definition file is
    #   "<system_code>.yaml"
    # @param options [Hash] +:depth+ is the current inheritance depth
    #   (default 0)
    def initialize(system_code, options = {})
      @system_code = system_code
      @depth = options.fetch(:depth, 0).to_i

      unless RUBY_ENGINE == 'opal'
        # NOTE(review): the directory override is read from the :system_code
        # option, which looks like it was meant to be a path option — confirm
        # before renaming.
        @system_path = options.fetch(:system_code, default_path)
      end

      load_and_serialize_system_mappings
    end

    # Builds a mapping for +system_code+; takes the same options as +new+.
    def self.for(system_code, options = {})
      new(system_code, options)
    end

    # Loads the raw definition and fills in every attribute. Does nothing
    # once the inheritance depth limit is reached, which leaves all
    # attributes nil.
    def load_and_serialize_system_mappings
      return if depth >= MAX_INHERIT_DEPTH

      mappings = load_system_mappings
      serialize_system_mappings(mappings)
    end

    private

    attr_reader :depth, :system_code, :system_path

    def system_code_file
      [system_code, "yaml"].join(".")
    end

    def default_path
      @default_path ||= Interscript.root_path.join("maps")
    end

    def load_system_mappings
      if RUBY_ENGINE == 'opal'
        load_opal_mappings
      else
        load_fs_mappings
      end
    end

    # NOTE(review): only called on the Opal engine, where backticks are
    # presumably inline JavaScript reading the InterscriptMaps table —
    # confirm. On other Ruby engines backticks would run a shell command.
    def load_opal_mappings
      JSON.parse(`InterscriptMaps[#{system_code}]`)
    end

    # Reads the YAML definition from +system_path+.
    #
    # @raise [Interscript::InvalidSystemError] when the file does not exist
    def load_fs_mappings
      YAML.load_file(system_path.join(system_code_file))
    rescue Errno::ENOENT
      raise Interscript::InvalidSystemError, "No system mappings found"
    end

    # Copies the loaded definition into instance variables, merges inherited
    # systems and builds the derived hashes and trie.
    def serialize_system_mappings(mappings)
      # A definition without a "map" section is treated as an empty map.
      map = mappings["map"] || {}

      @id = mappings.fetch("id", nil)
      @url = mappings.fetch("url", nil)
      @name = mappings.fetch("name", nil)
      @notes = mappings.fetch("notes", nil)
      @tests = mappings.fetch("tests", [])
      @language = mappings.fetch("language", nil)
      @description = mappings.fetch("description", nil)
      @authority_id = mappings.fetch("authority_id", nil)
      @creation_date = mappings.fetch("creation_date", nil)
      @source_script = mappings.fetch("source_script", nil)
      @destination_script = mappings.fetch("destination_script", nil)
      @chain = mappings.fetch("chain", [])
      @character_separator = map["character_separator"] || nil
      @word_separator = map["word_separator"] || nil
      @title_case = map["title_case"] || false
      @downcase = map["downcase"] || false
      @rules = map["rules"] || []
      @postrules = map["postrules"] || []
      @characters = map["characters"] || {}
      @dictionary = map["dictionary"] || {}
      # "segementation" is the misspelt key read by earlier code; it is still
      # honoured so existing definitions keep working.
      @segmentation = map["segmentation"] || map["segementation"] || nil
      @transcription = map["transcription"] || nil

      include_inherited_mappings(mappings)
      build_hashes
      build_trie
    end

    # Merges every system named under map/inherit into this mapping.
    # Inherited rules and postrules run before this mapping's own; on
    # duplicate character or dictionary keys this mapping's entry wins.
    def include_inherited_mappings(mappings)
      inherit_systems = [(mappings["map"] || {})["inherit"]].flatten

      inherit_systems.each do |inherit_system|
        next unless inherit_system

        inherited_mapping = Mapping.for(inherit_system, depth: depth + 1)

        # A parent that hit the depth limit exposes nil collections; compact
        # keeps those nils out of the merged rule lists.
        @rules = [inherited_mapping.rules, rules].flatten.compact
        @postrules = [inherited_mapping.postrules, postrules].flatten.compact
        @characters = (inherited_mapping.characters || {}).merge(characters)
        @dictionary = (inherited_mapping.dictionary || {}).merge(dictionary)
      end
    end

    # Orders both tables by key length, longest first.
    def build_hashes
      @characters_hash = characters&.sort_by { |k, _v| k.size }&.reverse&.to_h
      @dictionary_hash = dictionary&.sort_by { |k, _v| k.size }&.reverse&.to_h
    end

    # Fills a trie with the dictionary keys.
    def build_trie
      @dictionary_trie = Rambling::Trie.create
      dictionary_trie.concat dictionary.keys
    end
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module Interscript
  # Counterparts of the string/regexp helpers for the Opal engine.
  module Opal
    ALPHA_REGEXP = '\p{L}'

    # Builds a Regexp from the text "/<pattern>/<flags>". When the pattern
    # contains "(?i)", every "(?i)" and "(?-i)" is removed from it and the
    # flags become "ui" instead of "u".
    def mkregexp(regexpstring)
      ignore_case = regexpstring.include?("(?i)")
      source = ignore_case ? regexpstring.gsub("(?i)", "").gsub("(?-i)", "") : regexpstring
      flags = ignore_case ? 'ui' : 'u'

      Regexp.new("/#{source}/#{flags}")
    end

    # Returns a new string in which the +size+ characters starting at +pos+
    # are replaced by +repl+.
    def sub_replace(string, pos, size, repl)
      head = string[0, pos]
      tail = string[pos + size..-1]

      head + repl + tail
    end

    # Returns +string+ unchanged; +mapping+ is not consulted.
    def external_processing(mapping, string)
      string
    end

  end
end
|