glaemscribe 1.2.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/glaemscribe +2 -2
- data/glaemresources/charsets/cirth_ds.cst +514 -179
- data/glaemresources/charsets/eldamar.cst +210 -0
- data/glaemresources/charsets/tengwar_ds_annatar.cst +2452 -130
- data/glaemresources/charsets/tengwar_ds_eldamar.cst +2319 -125
- data/glaemresources/charsets/tengwar_ds_elfica.cst +2317 -126
- data/glaemresources/charsets/tengwar_ds_parmaite.cst +2319 -127
- data/glaemresources/charsets/tengwar_ds_sindarin.cst +2318 -127
- data/glaemresources/charsets/tengwar_freemono.cst +1 -1
- data/glaemresources/charsets/tengwar_guni_annatar.cst +2451 -131
- data/glaemresources/charsets/tengwar_guni_eldamar.cst +2317 -126
- data/glaemresources/charsets/tengwar_guni_elfica.cst +2316 -127
- data/glaemresources/charsets/tengwar_guni_parmaite.cst +2319 -127
- data/glaemresources/charsets/tengwar_guni_sindarin.cst +2317 -126
- data/glaemresources/charsets/tengwar_telcontar.cst +7 -0
- data/glaemresources/modes/blackspeech-tengwar-general_use.glaem +1 -1
- data/glaemresources/modes/english-cirth-espeak.glaem +687 -0
- data/glaemresources/modes/english-tengwar-espeak.glaem +814 -0
- data/glaemresources/modes/japanese-tengwar.glaem +9 -4
- data/glaemresources/modes/lang_belta-tengwar-dadef.glaem +248 -0
- data/glaemresources/modes/raw-cirth.glaem +154 -0
- data/lib/api/charset_parser.rb +7 -1
- data/lib/api/mode.rb +35 -10
- data/lib/api/mode_parser.rb +21 -12
- data/lib/api/post_processor/outspace.rb +44 -0
- data/lib/api/rule_group.rb +1 -1
- data/lib/api/transcription_pre_post_processor.rb +8 -5
- data/lib/api/transcription_processor.rb +12 -9
- data/lib/glaemscribe.rb +2 -0
- data/lib_espeak/espeakng.for.glaemscribe.nowasm.sync.js +25 -11
- data/lib_espeak/glaemscribe_tts.js +363 -223
- metadata +12 -6
@@ -1,11 +1,11 @@
|
|
1
1
|
/*
|
2
2
|
|
3
3
|
Glǽmscribe (also written Glaemscribe) is a software dedicated to
|
4
|
-
the transcription of texts between writing systems, and more
|
5
|
-
specifically dedicated to the transcription of J.R.R. Tolkien's
|
4
|
+
the transcription of texts between writing systems, and more
|
5
|
+
specifically dedicated to the transcription of J.R.R. Tolkien's
|
6
6
|
invented languages to some of his devised writing systems.
|
7
7
|
|
8
|
-
Copyright (C) 2015 Benjamin Babut (Talagan).
|
8
|
+
Copyright (C) 2015-2020 Benjamin Babut (Talagan).
|
9
9
|
|
10
10
|
This program is free software: you can redistribute it and/or modify
|
11
11
|
it under the terms of the GNU Affero General Public License as published by
|
@@ -23,71 +23,44 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
23
23
|
*/
|
24
24
|
|
25
25
|
|
26
|
-
// A wrapper around espeak to perform
|
27
|
-
// and generate IPA and/or WAV while keeping
|
26
|
+
// A wrapper around espeak to perform various TTS tasks,
|
27
|
+
// and generate IPA and/or WAV while keeping punctuation signs or cleaning them up.
|
28
28
|
//
|
29
29
|
// Espeak does not have this feature, so this is a significantly dirty hack.
|
30
|
-
//
|
30
|
+
//
|
31
31
|
// Additionally we perform a few glaemscribe-specific tasks, such as preserving raw tengwar
|
32
|
-
|
32
|
+
// or numbers which are treated independently.
|
33
33
|
|
34
34
|
// For the ruby loader, define the Glaemscribe module.
|
35
35
|
Glaemscribe = (typeof(Glaemscribe) === 'undefined')?({}):(Glaemscribe);
|
36
36
|
|
37
|
-
Glaemscribe.TTS = function() {
|
38
|
-
|
37
|
+
Glaemscribe.TTS = function() {
|
38
|
+
|
39
39
|
var client = this;
|
40
40
|
client.proxy = new ESpeakNGGlue();
|
41
41
|
}
|
42
42
|
|
43
43
|
Glaemscribe.TTS.ipa_configurations = {
|
44
|
-
'en': {
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
special_token_ipa_ncn: '',
|
51
|
-
special_token_ipa_ncs: '',
|
52
|
-
special_token_ipa_scn: '',
|
53
|
-
special_token_ipa_scs: '',
|
54
|
-
// Replace by special token AND KEEP when calculating ipa
|
55
|
-
clauseaffecting_punctuation: "!.,;:!?–—",
|
56
|
-
// Replace by special token but do not keep when calculating ipa
|
57
|
-
// '’ : apostrophes should stay in the original text !!! Don't break liz's bag !!
|
58
|
-
// This is because apostrophes shouldn't trigger a pause in the prononciation (e.g. genitives)
|
59
|
-
clauseunaffecting_punctuation: "·“”«»-[](){}<>≤≥$|\""
|
60
|
-
},
|
61
|
-
'fr': {
|
62
|
-
special_token_ncn: '', // no space / sign / no space
|
63
|
-
special_token_ncs: '', // no space / sign / space
|
64
|
-
special_token_scn: '', // space / sign / no space
|
65
|
-
special_token_scs: '', // space / sign / space
|
66
|
-
|
67
|
-
special_token_ipa_ncn: '',
|
68
|
-
special_token_ipa_ncs: '',
|
69
|
-
special_token_ipa_scn: '',
|
70
|
-
special_token_ipa_scs: '',
|
44
|
+
'en-tengwar': {
|
45
|
+
|
46
|
+
punct_token: '', // Invariant, for punctuation
|
47
|
+
block_token: '', // Invariant, for special blocks (nums / raw tengwar)
|
48
|
+
|
71
49
|
// Replace by special token AND KEEP when calculating ipa
|
72
|
-
clauseaffecting_punctuation: "!.,;:!?–—",
|
50
|
+
clauseaffecting_punctuation: "!.,;:!?–—",
|
73
51
|
// Replace by special token but do not keep when calculating ipa
|
74
|
-
// '’ : apostrophes should stay in the original text
|
75
|
-
//
|
76
|
-
|
77
|
-
//
|
78
|
-
|
79
|
-
// Long vowel back replacement.
|
80
|
-
return text.replace(/-/g,"ː");
|
81
|
-
}
|
52
|
+
// For those signs : '’ : apostrophes should stay in the original text !!! Don't break liz's bag !!
|
53
|
+
// Apostrophes shouldn't trigger a pause in the pronunciation (e.g. genitives, I've, don't, etc.)
|
54
|
+
// But apostrophe and single quote are the same thing.
|
55
|
+
// It's necessary to document that single quotes should then be avoided.
|
56
|
+
clauseunaffecting_punctuation: "·“”«»-[](){}⟨⟩<>≤≥$|\""
|
82
57
|
}
|
83
58
|
}
|
84
59
|
|
85
|
-
Glaemscribe.TTS.ipa_configurations['en-
|
86
|
-
Glaemscribe.TTS.ipa_configurations['en-
|
87
|
-
Glaemscribe.TTS.ipa_configurations['en-tengwar-
|
88
|
-
Glaemscribe.TTS.ipa_configurations['en-tengwar']
|
89
|
-
Glaemscribe.TTS.ipa_configurations['en-tengwar-gb'] = Glaemscribe.TTS.ipa_configurations['en'];
|
90
|
-
Glaemscribe.TTS.ipa_configurations['en-tengwar-us'] = Glaemscribe.TTS.ipa_configurations['en'];
|
60
|
+
Glaemscribe.TTS.ipa_configurations['en-tengwar'] = Glaemscribe.TTS.ipa_configurations['en-tengwar'];
|
61
|
+
Glaemscribe.TTS.ipa_configurations['en-tengwar-rp'] = Glaemscribe.TTS.ipa_configurations['en-tengwar'];
|
62
|
+
Glaemscribe.TTS.ipa_configurations['en-tengwar-gb'] = Glaemscribe.TTS.ipa_configurations['en-tengwar'];
|
63
|
+
Glaemscribe.TTS.ipa_configurations['en-tengwar-us'] = Glaemscribe.TTS.ipa_configurations['en-tengwar'];
|
91
64
|
|
92
65
|
|
93
66
|
Glaemscribe.TTS.voice_list = function(voice) {
|
@@ -96,6 +69,10 @@ Glaemscribe.TTS.voice_list = function(voice) {
|
|
96
69
|
|
97
70
|
// Static helper. To be used in pure js (not ruby).
|
98
71
|
Glaemscribe.TTS.option_name_to_voice = function(oname) {
|
72
|
+
|
73
|
+
if(!oname)
|
74
|
+
return null;
|
75
|
+
|
99
76
|
return oname.toLowerCase().replace(/^espeak_voice_/,'').replace(/_/g,'-');
|
100
77
|
}
|
101
78
|
|
@@ -109,257 +86,420 @@ Glaemscribe.TTS.prototype.make_char_checker = function(string){
|
|
109
86
|
return cc;
|
110
87
|
}
|
111
88
|
|
112
|
-
Glaemscribe.TTS.prototype.
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
89
|
+
Glaemscribe.TTS.prototype.isSpace = function(a) {
|
90
|
+
return (a == ' ' || a == '\t');
|
91
|
+
}
|
92
|
+
|
93
|
+
Glaemscribe.TTS.prototype.read_cap_token = function(text, starti, cap_checker) {
|
94
|
+
|
95
|
+
var client = this
|
96
|
+
var i = starti;
|
97
|
+
var tok = ""
|
98
|
+
|
99
|
+
if(cap_checker[text[i]] == null)
|
100
|
+
return null;
|
101
|
+
|
102
|
+
i++;
|
103
|
+
|
104
|
+
// Advance the sequence
|
105
|
+
for(; i<text.length; i++) {
|
106
|
+
if( (cap_checker[text[i]] == null) && !client.isSpace(text[i])) {
|
107
|
+
break;
|
108
|
+
}
|
109
|
+
}
|
110
|
+
|
111
|
+
// Rewind trailing spaces
|
112
|
+
var toklen = i - starti;
|
113
|
+
|
114
|
+
for(i = starti + toklen - 1; i>=starti ; i--) {
|
115
|
+
if(client.isSpace(text[i]))
|
116
|
+
toklen--;
|
122
117
|
else
|
123
|
-
|
118
|
+
break;
|
119
|
+
}
|
120
|
+
|
121
|
+
return text.substring(starti,starti+toklen);
|
122
|
+
};
|
123
|
+
|
124
|
+
Glaemscribe.TTS.prototype.preceded_by_space = function(text,i) {
|
125
|
+
var client = this;
|
126
|
+
|
127
|
+
if(i <= 0)
|
128
|
+
return false;
|
129
|
+
else
|
130
|
+
return client.isSpace(text[i-1]);
|
131
|
+
}
|
132
|
+
|
133
|
+
Glaemscribe.TTS.prototype.succeeded_by_space = function(text,i) {
|
134
|
+
var client = this;
|
135
|
+
|
136
|
+
if(i >= text.length-1)
|
137
|
+
return false;
|
138
|
+
else
|
139
|
+
return client.isSpace(text[i+1]);
|
140
|
+
}
|
141
|
+
|
142
|
+
// Escapes raw mode AND numbers
|
143
|
+
Glaemscribe.TTS.prototype.escape_special_blocks = function(voice, entry, for_ipa) {
|
144
|
+
|
145
|
+
var config = Glaemscribe.TTS.ipa_configurations[voice];
|
146
|
+
|
147
|
+
// TODO : make this configurable
|
148
|
+
|
149
|
+
// Tokenize raw_mode escaping + numbers, we don't want them to be converted to IPA
|
150
|
+
// Also, keep numbers in the writing, to prevent espeak from pronouncing them
|
151
|
+
var ipaexpr = /(\s*)({{[\s\S]*?}}|\b[0-9][0-9\s]*\b)(\s*)/g;
|
152
|
+
var wavexpr = /(\s*)({{[\s\S]*?}})(\s*)/g;
|
153
|
+
var rawgexp = (for_ipa)?(ipaexpr):(wavexpr);
|
154
|
+
|
155
|
+
var captured = [];
|
156
|
+
|
157
|
+
var ret = entry.replace(rawgexp, function(match,p1,p2,p3) {
|
158
|
+
captured.push(match);
|
159
|
+
if(!for_ipa)
|
160
|
+
return ' '; // For wav, just replace by empty space and do not pronunce.
|
161
|
+
else {
|
162
|
+
return p1 + config['block_token'] + p3; // For IPA, replace by dummy token.
|
163
|
+
}
|
124
164
|
});
|
125
|
-
|
165
|
+
|
126
166
|
return [ret, captured];
|
127
167
|
}
|
128
168
|
|
129
|
-
|
130
|
-
|
169
|
+
|
170
|
+
Glaemscribe.TTS.prototype.ipa_instrument_punct = function(voice, text) {
|
171
|
+
|
131
172
|
var client = this;
|
132
173
|
var config = Glaemscribe.TTS.ipa_configurations[voice];
|
133
|
-
|
134
|
-
// Normalize all tabs by spaces
|
135
|
-
text = text.replace(/\t/g," ");
|
136
|
-
// Small hack to prevent espeak from pronouncing last dot
|
137
|
-
// since our tokenization may isolate it.
|
138
|
-
text += "\n";
|
139
|
-
|
174
|
+
|
140
175
|
var cap = client.make_char_checker(config['clauseaffecting_punctuation']);
|
141
176
|
var cup = client.make_char_checker(config['clauseunaffecting_punctuation']);
|
142
|
-
|
177
|
+
|
143
178
|
var accum = "";
|
144
179
|
var kept_signs = [];
|
145
|
-
|
146
|
-
var
|
147
|
-
|
148
|
-
|
180
|
+
|
181
|
+
var rescap = null;
|
182
|
+
|
183
|
+
for(var i=0;i<text.length;i++)
|
149
184
|
{
|
150
|
-
// Is precedent char a space ?
|
151
|
-
if(i == 0)
|
152
|
-
prec_is_space = false;
|
153
|
-
else
|
154
|
-
prec_is_space = (text[i-1] == " ");
|
155
|
-
|
156
|
-
// Is precedent char a space ?
|
157
|
-
if(i == text.length-1)
|
158
|
-
next_is_space = false;
|
159
|
-
else
|
160
|
-
next_is_space = (text[i+1] == " ");
|
161
|
-
|
162
185
|
if(text[i] == "\n")
|
163
186
|
{
|
164
|
-
accum += config['
|
187
|
+
accum += config['punct_token'];
|
165
188
|
kept_signs.push(text[i]);
|
166
189
|
}
|
167
|
-
else if(
|
190
|
+
else if(cup[text[i]] != null)
|
168
191
|
{
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
accum += " " + config['special_token_ncs'] + " " + text[i] + " ";
|
179
|
-
kept_signs.push(text[i] + " ");
|
180
|
-
}
|
181
|
-
if(prec_is_space && !next_is_space)
|
182
|
-
{
|
183
|
-
// Always insert spaces, but remember how they were placed
|
184
|
-
accum += " " + config['special_token_scn'] + " " + text[i] + " ";
|
185
|
-
kept_signs.push(" " + text[i]);
|
186
|
-
}
|
187
|
-
if(prec_is_space && next_is_space)
|
188
|
-
{
|
189
|
-
// Always insert spaces, but remember how they were placed
|
190
|
-
accum += " " + config['special_token_scs'] + " " + text[i] + " ";
|
191
|
-
kept_signs.push(" " + text[i] + " ");
|
192
|
-
}
|
192
|
+
// This sign does not affect clause analysis by espeak.
|
193
|
+
// Replace the sign by a special "word" / token AND REMOVE the sign
|
194
|
+
// We will restore it after IPA calculation.
|
195
|
+
accum += " " + config['punct_token'] + " " ;
|
196
|
+
kept_signs.push(
|
197
|
+
((client.preceded_by_space(text,i))?(" "):("")) +
|
198
|
+
text[i] +
|
199
|
+
((client.succeeded_by_space(text,i))?(" "):(""))
|
200
|
+
);
|
193
201
|
}
|
194
|
-
else if(
|
202
|
+
else if(rescap = client.read_cap_token(text,i,cap))
|
203
|
+
{
|
204
|
+
// This punctuation sign affects clause analysis.
|
205
|
+
// Replace the sign by a special "word" / token AND keep the sign
|
206
|
+
// Always insert spaces, but remember how they were placed
|
207
|
+
accum += " " + text[i] + " " + config['punct_token'] + " " ;
|
208
|
+
kept_signs.push(
|
209
|
+
((client.preceded_by_space(text, i))?(" "):("")) +
|
210
|
+
rescap +
|
211
|
+
((client.succeeded_by_space(text, i + rescap.length - 1))?(" "):(""))
|
212
|
+
);
|
213
|
+
i += rescap.length - 1;
|
214
|
+
}
|
215
|
+
else
|
216
|
+
{
|
217
|
+
accum += text[i];
|
218
|
+
}
|
219
|
+
}
|
220
|
+
|
221
|
+
return [accum, kept_signs];
|
222
|
+
}
|
223
|
+
|
224
|
+
Glaemscribe.TTS.prototype.wav_instrument_punct = function(voice, text) {
|
225
|
+
|
226
|
+
var client = this;
|
227
|
+
var config = Glaemscribe.TTS.ipa_configurations[voice];
|
228
|
+
var cap = client.make_char_checker(config['clauseaffecting_punctuation']);
|
229
|
+
var accum = "";
|
230
|
+
var rescap = null;
|
231
|
+
|
232
|
+
for(var i=0;i<text.length;i++)
|
233
|
+
{
|
234
|
+
if(rescap = client.read_cap_token(text,i,cap))
|
195
235
|
{
|
196
|
-
|
197
|
-
|
198
|
-
if(!prec_is_space && !next_is_space)
|
199
|
-
{
|
200
|
-
accum += " " + config['special_token_ncn'] + " " ;
|
201
|
-
kept_signs.push(text[i]);
|
202
|
-
}
|
203
|
-
if(!prec_is_space && next_is_space)
|
204
|
-
{
|
205
|
-
accum += " " + config['special_token_ncs'] + " " ;
|
206
|
-
kept_signs.push(text[i] + " ");
|
207
|
-
}
|
208
|
-
if(prec_is_space && !next_is_space)
|
209
|
-
{
|
210
|
-
accum += " " + config['special_token_scn'] + " " ;
|
211
|
-
kept_signs.push(" " + text[i]);
|
212
|
-
}
|
213
|
-
if(prec_is_space && next_is_space)
|
214
|
-
{
|
215
|
-
accum += " " + config['special_token_scs'] + " " ;
|
216
|
-
kept_signs.push(" " + text[i] + " ");
|
217
|
-
}
|
236
|
+
accum += text[i]; // Just keep the first sign, ignore the others
|
237
|
+
i += rescap.length - 1;
|
218
238
|
}
|
219
239
|
else
|
220
240
|
{
|
221
241
|
accum += text[i];
|
222
242
|
}
|
223
243
|
}
|
224
|
-
|
225
|
-
|
226
|
-
//console.log(kept_signs)
|
227
|
-
return [accum,kept_signs];
|
244
|
+
|
245
|
+
return accum;
|
228
246
|
}
|
229
247
|
|
230
|
-
Glaemscribe.TTS.prototype.
|
231
|
-
|
248
|
+
Glaemscribe.TTS.prototype.ipa_instrument_blocks = function(voice, text)
|
249
|
+
{
|
250
|
+
var client = this;
|
232
251
|
var config = Glaemscribe.TTS.ipa_configurations[voice];
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
var
|
240
|
-
|
241
|
-
|
242
|
-
// console.log("=====")
|
243
|
-
// console.log(ipa)
|
244
|
-
// console.log(config)
|
245
|
-
// console.log(ncsr)
|
246
|
-
|
247
|
-
// Tokens have been accumulated linearly
|
248
|
-
ipa = ipa.replace(ncnr, function(match, contents, offset, s) {return '∰∰'; });
|
249
|
-
ipa = ipa.replace(ncsr, function(match, contents, offset, s) {return '∰∰'; });
|
250
|
-
ipa = ipa.replace(scnr, function(match, contents, offset, s) {return '∰∰'; });
|
251
|
-
ipa = ipa.replace(scsr, function(match, contents, offset, s) {return '∰∰'; });
|
252
|
-
|
253
|
-
// console.log("=====")
|
254
|
-
// console.log(ipa)
|
252
|
+
|
253
|
+
return this.escape_special_blocks(voice, text, true);
|
254
|
+
}
|
255
|
+
|
256
|
+
Glaemscribe.TTS.prototype.ipa_restore_tokens = function(text, token, kept_tokens) {
|
257
|
+
|
258
|
+
var rx = new RegExp("\\s*(" + token + ")\\s*","g");
|
259
|
+
|
255
260
|
var nth = -1;
|
256
|
-
|
261
|
+
text = text.replace(rx,function(match, contents, offset, s) {
|
257
262
|
nth += 1;
|
258
263
|
return kept_tokens[nth];
|
259
264
|
});
|
260
|
-
|
265
|
+
|
266
|
+
return text;
|
267
|
+
}
|
268
|
+
|
269
|
+
Glaemscribe.TTS.prototype.post_ipa = function(voice, ipa, pre_ipa_res) {
|
270
|
+
|
271
|
+
var client = this;
|
272
|
+
var config = Glaemscribe.TTS.ipa_configurations[voice];
|
273
|
+
ipa = ipa.replace(/\n/g, " ");
|
274
|
+
|
275
|
+
ipa = client.ipa_restore_tokens(ipa, config.punct_token, pre_ipa_res.punct_tokens);
|
276
|
+
ipa = client.ipa_restore_tokens(ipa, config.block_token, pre_ipa_res.block_tokens);
|
277
|
+
|
261
278
|
// Post-treatment of anti 'dot' pronunciation hack
|
262
|
-
if(ipa[ipa.length-1] === "\n")
|
279
|
+
if(ipa[ipa.length-1] === "\n")
|
263
280
|
ipa = ipa.slice(0,-1);
|
264
|
-
|
265
|
-
// console.log("=====")
|
266
|
-
// console.log(ipa)
|
281
|
+
|
267
282
|
return ipa
|
268
283
|
}
|
269
284
|
|
270
285
|
|
271
|
-
Glaemscribe.TTS.prototype.
|
272
|
-
|
286
|
+
Glaemscribe.TTS.prototype.pre_ipa = function(args, voice, text) {
|
287
|
+
|
273
288
|
var client = this;
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
289
|
+
var config = Glaemscribe.TTS.ipa_configurations[voice];
|
290
|
+
|
291
|
+
if(!config)
|
292
|
+
throw "Trying to use unsupported voice '" + voice + "'!";
|
293
|
+
|
294
|
+
// Normalize all tabs by spaces
|
295
|
+
text = text.replace(/\t/g," ");
|
296
|
+
|
297
|
+
// Small hack to prevent espeak from pronouncing last dot
|
298
|
+
// since our tokenization may isolate it.
|
299
|
+
text += "\n";
|
300
|
+
|
301
|
+
// Instrument blocks first (they may contain punctuation)
|
302
|
+
var bi = client.ipa_instrument_blocks(voice,text);
|
303
|
+
text = bi[0];
|
304
|
+
|
305
|
+
// Instrument punctuation, then
|
306
|
+
var pi = client.ipa_instrument_punct(voice,text);
|
307
|
+
text = pi[0];
|
308
|
+
|
309
|
+
// Small hack to always have a capital after a dot.
|
310
|
+
// And prevent espeak from transcribing/pronouncing "dot"
|
311
|
+
text = text.replace(/(\.\s+.)/g, function(match,p1) {
|
312
|
+
return p1.toUpperCase()
|
313
|
+
});
|
314
|
+
|
315
|
+
return {
|
316
|
+
text: text,
|
317
|
+
block_tokens: bi[1],
|
318
|
+
punct_tokens: pi[1]
|
319
|
+
}
|
320
|
+
}
|
321
|
+
|
322
|
+
Glaemscribe.TTS.prototype.pre_wav = function(args, voice, text) {
|
323
|
+
var client = this;
|
324
|
+
var config = Glaemscribe.TTS.ipa_configurations[voice];
|
325
|
+
|
326
|
+
if(!config)
|
327
|
+
throw "Trying to use unsupported voice '" + voice + "'!";
|
328
|
+
|
329
|
+
// First, escape the special blocks. Just ignore them.
|
283
330
|
if(args.has_raw_mode) {
|
284
|
-
var pre_raw_res = this.
|
331
|
+
var pre_raw_res = this.escape_special_blocks(voice, text, false);
|
285
332
|
text = pre_raw_res[0];
|
286
|
-
pre_raw_tokens = pre_raw_res[1];
|
287
|
-
}
|
288
|
-
|
289
|
-
// Pre parse ipa
|
290
|
-
var pre_ipa_tokens = [];
|
291
|
-
var pre_ipa_res = client.pre_ipa(voice,text);
|
292
|
-
text = pre_ipa_res[0];
|
293
|
-
pre_ipa_tokens = pre_ipa_res[1];
|
294
|
-
|
295
|
-
// Restitute raw things
|
296
|
-
if(args.has_raw_mode) {
|
297
|
-
var nth = -1;
|
298
|
-
text = text.replace(/∰∰/g,function(match, contents, offset, s) {
|
299
|
-
nth += 1;
|
300
|
-
return pre_raw_tokens[nth];
|
301
|
-
});
|
302
333
|
}
|
303
|
-
|
304
|
-
|
305
|
-
|
334
|
+
|
335
|
+
// Now simplify the punctuation to avoid problems.
|
336
|
+
text = this.wav_instrument_punct(voice, text);
|
337
|
+
|
338
|
+
return {
|
339
|
+
text: text
|
340
|
+
}
|
341
|
+
}
|
342
|
+
|
343
|
+
//////////////////
|
344
|
+
// SYNTHESIZE //
|
345
|
+
//////////////////
|
346
|
+
|
347
|
+
|
348
|
+
Glaemscribe.TTS.prototype.synthesize_ipa = function(text, args, onended) {
|
349
|
+
|
350
|
+
var client = this;
|
351
|
+
args = args || {};
|
352
|
+
var voice = args.voice || 'en-tengwar'
|
353
|
+
|
354
|
+
// Pre parse text and find raw mode things {{ ... }}
|
355
|
+
// Cache them. This will also perform the pre-instrumentation
|
356
|
+
// To treat each block as one word
|
357
|
+
var pipa = client.pre_ipa(args, voice, text);
|
358
|
+
text = pipa['text'];
|
359
|
+
|
360
|
+
// Now the IPA is instrumented.
|
361
|
+
// Prepare client
|
362
|
+
client.proxy.set_voice(voice);
|
306
363
|
|
307
364
|
var ts = new Date();
|
308
365
|
var ret = {};
|
309
366
|
client.proxy.synthesize(text, false, true, true, function(result) {
|
310
|
-
|
311
367
|
// Post parse ipa
|
312
|
-
result.ipa = client.post_ipa(voice, result.pho,
|
368
|
+
result.ipa = client.post_ipa(voice, result.pho, pipa);
|
313
369
|
|
314
370
|
var te = new Date();
|
315
371
|
result.synthesis_time = (te - ts);
|
316
372
|
delete result.pho;
|
317
|
-
|
373
|
+
|
318
374
|
if(onended)
|
319
375
|
onended(result);
|
320
376
|
|
321
377
|
ret = result;
|
322
378
|
});
|
379
|
+
|
323
380
|
return ret;
|
324
381
|
}
|
325
382
|
|
326
383
|
// Should be kept separated from IPA, because we do not work on the same text
|
327
384
|
Glaemscribe.TTS.prototype.synthesize_wav = function(text, args, onended) {
|
328
|
-
|
329
|
-
var client
|
330
|
-
|
385
|
+
|
386
|
+
var client = this;
|
331
387
|
args = args || {}
|
332
|
-
var voice = args.voice || 'en'
|
388
|
+
var voice = args.voice || 'en-tengwar'
|
389
|
+
|
390
|
+
// Pre-transform text
|
391
|
+
var pwav = client.pre_wav(args, voice, text);
|
392
|
+
text = pwav['text'];
|
333
393
|
|
334
|
-
|
394
|
+
// Prepare client
|
335
395
|
client.proxy.set_rate(args.rate || 120);
|
336
396
|
client.proxy.set_pitch(args.pitch || 5);
|
337
|
-
client.proxy.set_voice(
|
338
|
-
|
339
|
-
if(args.has_raw_mode) {
|
340
|
-
var pre_raw_res = this.escape_raw_mode(text,true);
|
341
|
-
text = pre_raw_res[0];
|
342
|
-
}
|
397
|
+
client.proxy.set_voice(voice);
|
343
398
|
|
344
|
-
var ret = {};
|
345
399
|
var ts = new Date();
|
400
|
+
var ret = {};
|
346
401
|
client.proxy.synthesize(text, true, false, false, function(result) {
|
347
402
|
var te = new Date();
|
348
403
|
result.synthesis_time = (te - ts);
|
349
404
|
delete result.pho;
|
350
|
-
|
405
|
+
|
351
406
|
// Uint8Array > Array conversion, for ruby?
|
352
|
-
// ret.wav = [].slice.call(ret.wav);
|
353
|
-
|
407
|
+
// ret.wav = [].slice.call(ret.wav);
|
408
|
+
|
354
409
|
if(onended)
|
355
410
|
onended(result);
|
356
|
-
|
411
|
+
|
357
412
|
ret = result;
|
358
413
|
});
|
359
|
-
|
414
|
+
|
360
415
|
return ret;
|
361
416
|
}
|
362
417
|
|
418
|
+
|
419
|
+
// Below is an experiment of a parsing tool for orthographic modes.
|
420
|
+
// Not finished and probably not usable.
|
421
|
+
Glaemscribe.TTS.TokenType = {};
|
422
|
+
Glaemscribe.TTS.TokenType.WORD = 'WORD';
|
423
|
+
Glaemscribe.TTS.TokenType.NON_WORD = 'NON_WORD';
|
424
|
+
Glaemscribe.TTS.TokenType.NUM = 'NUM';
|
425
|
+
Glaemscribe.TTS.TokenType.SPACE = 'SPACE';
|
426
|
+
Glaemscribe.TTS.TokenType.PUNCT = 'PUNCT';
|
427
|
+
|
428
|
+
Glaemscribe.TTS.prototype.orthographic_disambiguator_en = function(text) {
|
429
|
+
|
430
|
+
var client = this;
|
431
|
+
|
432
|
+
var uwmatcher = /(\p{L}+)/u;
|
433
|
+
var spl = text.split(uwmatcher);
|
434
|
+
|
435
|
+
var tokens = spl.map(function(s) {
|
436
|
+
var t = {};
|
437
|
+
var is_word = s.match(uwmatcher)
|
438
|
+
|
439
|
+
t.type = (is_word)?(Glaemscribe.TTS.TokenType.WORD):(Glaemscribe.TTS.TokenType.NON_WORD);
|
440
|
+
t.content = s;
|
441
|
+
return t;
|
442
|
+
});
|
443
|
+
|
444
|
+
var tokens2 = [];
|
445
|
+
|
446
|
+
// Handle apostrophe
|
447
|
+
for(var i=0;i<tokens.length;i++) {
|
448
|
+
if( i == 0 || i == tokens.length-1 || tokens[i].type == Glaemscribe.TTS.TokenType.WORD ) {
|
449
|
+
tokens2.push(tokens[i]);
|
450
|
+
continue;
|
451
|
+
}
|
452
|
+
|
453
|
+
if(tokens[i].content == "'" &&
|
454
|
+
tokens[i-1].type == Glaemscribe.TTS.TokenType.WORD &&
|
455
|
+
tokens[i+1].type == Glaemscribe.TTS.TokenType.WORD )
|
456
|
+
{
|
457
|
+
tokens2.pop();
|
458
|
+
var tok = {};
|
459
|
+
tok.type = Glaemscribe.TTS.TokenType.WORD;
|
460
|
+
tok.content = tokens[i-1].content + tokens[i].content + tokens[i+1].content;
|
461
|
+
tokens2.push(tok);
|
462
|
+
i += 1;
|
463
|
+
}
|
464
|
+
else {
|
465
|
+
tokens2.push(tokens[i]);
|
466
|
+
}
|
467
|
+
}
|
468
|
+
tokens = tokens2;
|
469
|
+
|
470
|
+
// Numerize tokens
|
471
|
+
var i = 0;
|
472
|
+
tokens.forEach(function(t) {
|
473
|
+
t.num = i;
|
474
|
+
i += 1;
|
475
|
+
});
|
476
|
+
|
477
|
+
// Remove non-speechable tokens
|
478
|
+
var stokens = tokens.filter(function(t) {
|
479
|
+
return (t.type == Glaemscribe.TTS.TokenType.WORD);
|
480
|
+
});
|
481
|
+
|
482
|
+
// Join speakable tokens
|
483
|
+
var r = stokens.map(function(t) { return t.content}).join(' ');
|
484
|
+
|
485
|
+
var args = {};
|
486
|
+
var voice = args.voice || 'en-tengwar';
|
487
|
+
|
488
|
+
client.proxy.set_voice(voice);
|
489
|
+
client.proxy.synthesize(r, false, true, true, function(result) {
|
490
|
+
r = result.pho;
|
491
|
+
});
|
492
|
+
r = r.split('').map(function(t) { return t.trim() });
|
493
|
+
|
494
|
+
var j = 0;
|
495
|
+
r.forEach(function(w) {
|
496
|
+
tokens[stokens[j].num].ipa = r[j];
|
497
|
+
j += 1;
|
498
|
+
});
|
499
|
+
|
500
|
+
return tokens;
|
501
|
+
}
|
502
|
+
|
363
503
|
Glaemscribe.TTS.is_engine_loaded = function() {
|
364
504
|
return (typeof(ESpeakNGGlue) !== 'undefined');
|
365
505
|
};
|