glaemscribe 1.2.0 → 1.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (33) hide show
  1. checksums.yaml +4 -4
  2. data/bin/glaemscribe +2 -2
  3. data/glaemresources/charsets/cirth_ds.cst +514 -179
  4. data/glaemresources/charsets/eldamar.cst +210 -0
  5. data/glaemresources/charsets/tengwar_ds_annatar.cst +2452 -130
  6. data/glaemresources/charsets/tengwar_ds_eldamar.cst +2319 -125
  7. data/glaemresources/charsets/tengwar_ds_elfica.cst +2317 -126
  8. data/glaemresources/charsets/tengwar_ds_parmaite.cst +2319 -127
  9. data/glaemresources/charsets/tengwar_ds_sindarin.cst +2318 -127
  10. data/glaemresources/charsets/tengwar_freemono.cst +1 -1
  11. data/glaemresources/charsets/tengwar_guni_annatar.cst +2451 -131
  12. data/glaemresources/charsets/tengwar_guni_eldamar.cst +2317 -126
  13. data/glaemresources/charsets/tengwar_guni_elfica.cst +2316 -127
  14. data/glaemresources/charsets/tengwar_guni_parmaite.cst +2319 -127
  15. data/glaemresources/charsets/tengwar_guni_sindarin.cst +2317 -126
  16. data/glaemresources/charsets/tengwar_telcontar.cst +7 -0
  17. data/glaemresources/modes/blackspeech-tengwar-general_use.glaem +1 -1
  18. data/glaemresources/modes/english-cirth-espeak.glaem +687 -0
  19. data/glaemresources/modes/english-tengwar-espeak.glaem +814 -0
  20. data/glaemresources/modes/japanese-tengwar.glaem +9 -4
  21. data/glaemresources/modes/lang_belta-tengwar-dadef.glaem +248 -0
  22. data/glaemresources/modes/raw-cirth.glaem +154 -0
  23. data/lib/api/charset_parser.rb +7 -1
  24. data/lib/api/mode.rb +35 -10
  25. data/lib/api/mode_parser.rb +21 -12
  26. data/lib/api/post_processor/outspace.rb +44 -0
  27. data/lib/api/rule_group.rb +1 -1
  28. data/lib/api/transcription_pre_post_processor.rb +8 -5
  29. data/lib/api/transcription_processor.rb +12 -9
  30. data/lib/glaemscribe.rb +2 -0
  31. data/lib_espeak/espeakng.for.glaemscribe.nowasm.sync.js +25 -11
  32. data/lib_espeak/glaemscribe_tts.js +363 -223
  33. metadata +12 -6
@@ -1,11 +1,11 @@
1
1
  /*
2
2
 
3
3
  Glǽmscribe (also written Glaemscribe) is a software dedicated to
4
- the transcription of texts between writing systems, and more
5
- specifically dedicated to the transcription of J.R.R. Tolkien's
4
+ the transcription of texts between writing systems, and more
5
+ specifically dedicated to the transcription of J.R.R. Tolkien's
6
6
  invented languages to some of his devised writing systems.
7
7
 
8
- Copyright (C) 2015 Benjamin Babut (Talagan).
8
+ Copyright (C) 2015-2020 Benjamin Babut (Talagan).
9
9
 
10
10
  This program is free software: you can redistribute it and/or modify
11
11
  it under the terms of the GNU Affero General Public License as published by
@@ -23,71 +23,44 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
23
23
  */
24
24
 
25
25
 
26
- // A wrapper around espeak to perform handle various TTS tasks,
27
- // and generate IPA and/or WAV while keeping punctuations.
26
+ // A wrapper around espeak to perform various TTS tasks,
27
+ // and generate IPA and/or WAV while keeping punctuation signs or cleaning them up.
28
28
  //
29
29
  // Espeak does not have this feature, so this is a significantly dirty hack.
30
- //
30
+ //
31
31
  // Additionally we perform a few glaemscribe-specific tasks, such as preserving raw tengwar
32
-
32
+ // or numbers which are treated independently.
33
33
 
34
34
  // For the ruby loader, define the Glaemscribe module.
35
35
  Glaemscribe = (typeof(Glaemscribe) === 'undefined')?({}):(Glaemscribe);
36
36
 
37
- Glaemscribe.TTS = function() {
38
-
37
+ Glaemscribe.TTS = function() {
38
+
39
39
  var client = this;
40
40
  client.proxy = new ESpeakNGGlue();
41
41
  }
42
42
 
43
43
  Glaemscribe.TTS.ipa_configurations = {
44
- 'en': {
45
- special_token_ncn: '', // no space / sign / no space
46
- special_token_ncs: '', // no space / sign / space
47
- special_token_scn: '', // space / sign / no space
48
- special_token_scs: '', // space / sign / space
49
-
50
- special_token_ipa_ncn: '',
51
- special_token_ipa_ncs: '',
52
- special_token_ipa_scn: '',
53
- special_token_ipa_scs: '',
54
- // Replace by special token AND KEEP when calculating ipa
55
- clauseaffecting_punctuation: "!.,;:!?–—",
56
- // Replace by special token but do not keep when calculating ipa
57
- // '’ : apostrophes should stay in the original text !!! Don't break liz's bag !!
58
- // This is because apostrophes shouldn't trigger a pause in the prononciation (e.g. genitives)
59
- clauseunaffecting_punctuation: "·“”«»-[](){}<>≤≥$|\""
60
- },
61
- 'fr': {
62
- special_token_ncn: '', // no space / sign / no space
63
- special_token_ncs: '', // no space / sign / space
64
- special_token_scn: '', // space / sign / no space
65
- special_token_scs: '', // space / sign / space
66
-
67
- special_token_ipa_ncn: '',
68
- special_token_ipa_ncs: '',
69
- special_token_ipa_scn: '',
70
- special_token_ipa_scs: '',
44
+ 'en-tengwar': {
45
+
46
+ punct_token: '', // Invariant, for punctuation
47
+ block_token: '', // Invariant, for special blocks (nums / raw tengwar)
48
+
71
49
  // Replace by special token AND KEEP when calculating ipa
72
- clauseaffecting_punctuation: "!.,;:!?–—",
50
+ clauseaffecting_punctuation: "!.,;:!?–—",
73
51
  // Replace by special token but do not keep when calculating ipa
74
- // '’ : apostrophes should stay in the original text, let espeak eat them
75
- // This is because apostrophes shouldn't trigger a pause in the prononciation (e.g. genitives)
76
- clauseunaffecting_punctuation: "·“”«»-[](){}<>≤≥$|\"",
77
- // Callback before reconstituing markers.
78
- pre_reconsitute_markers_callback: function(text) {
79
- // Long vowel back replacement.
80
- return text.replace(/-/g,"ː");
81
- }
52
+ // For those signs : '’ : apostrophes should stay in the original text !!! Don't break liz's bag !!
53
+ // Apostrophes shouldn't trigger a pause in the pronunciation (e.g. genitives, I've, don't etc)
54
+ // But apostrophe and single quote are the same thing.
55
+ // It's necessary to document that single quotes should then be avoided.
56
+ clauseunaffecting_punctuation: "·“”«»-[](){}⟨⟩<>≤≥$|\""
82
57
  }
83
58
  }
84
59
 
85
- Glaemscribe.TTS.ipa_configurations['en-us'] = Glaemscribe.TTS.ipa_configurations['en'];
86
- Glaemscribe.TTS.ipa_configurations['en-gb'] = Glaemscribe.TTS.ipa_configurations['en'];
87
- Glaemscribe.TTS.ipa_configurations['en-tengwar-zlegacy'] = Glaemscribe.TTS.ipa_configurations['en'];
88
- Glaemscribe.TTS.ipa_configurations['en-tengwar'] = Glaemscribe.TTS.ipa_configurations['en'];
89
- Glaemscribe.TTS.ipa_configurations['en-tengwar-gb'] = Glaemscribe.TTS.ipa_configurations['en'];
90
- Glaemscribe.TTS.ipa_configurations['en-tengwar-us'] = Glaemscribe.TTS.ipa_configurations['en'];
60
+ Glaemscribe.TTS.ipa_configurations['en-tengwar'] = Glaemscribe.TTS.ipa_configurations['en-tengwar'];
61
+ Glaemscribe.TTS.ipa_configurations['en-tengwar-rp'] = Glaemscribe.TTS.ipa_configurations['en-tengwar'];
62
+ Glaemscribe.TTS.ipa_configurations['en-tengwar-gb'] = Glaemscribe.TTS.ipa_configurations['en-tengwar'];
63
+ Glaemscribe.TTS.ipa_configurations['en-tengwar-us'] = Glaemscribe.TTS.ipa_configurations['en-tengwar'];
91
64
 
92
65
 
93
66
  Glaemscribe.TTS.voice_list = function(voice) {
@@ -96,6 +69,10 @@ Glaemscribe.TTS.voice_list = function(voice) {
96
69
 
97
70
  // Static helper. To be used in pure js (not ruby).
98
71
  Glaemscribe.TTS.option_name_to_voice = function(oname) {
72
+
73
+ if(!oname)
74
+ return null;
75
+
99
76
  return oname.toLowerCase().replace(/^espeak_voice_/,'').replace(/_/g,'-');
100
77
  }
101
78
 
@@ -109,257 +86,420 @@ Glaemscribe.TTS.prototype.make_char_checker = function(string){
109
86
  return cc;
110
87
  }
111
88
 
112
- Glaemscribe.TTS.prototype.escape_raw_mode = function(entry,full_remove) {
113
-
114
- var rawgexp = /({{[\s\S]*?}})/g;
115
- var captured = [];
116
-
117
- var ret = entry.replace(rawgexp, function(match,p1) {
118
-
119
- captured.push(match);
120
- if(full_remove)
121
- return ' ';
89
+ Glaemscribe.TTS.prototype.isSpace = function(a) {
90
+ return (a == ' ' || a == '\t');
91
+ }
92
+
93
+ Glaemscribe.TTS.prototype.read_cap_token = function(text, starti, cap_checker) {
94
+
95
+ var client = this
96
+ var i = starti;
97
+ var tok = ""
98
+
99
+ if(cap_checker[text[i]] == null)
100
+ return null;
101
+
102
+ i++;
103
+
104
+ // Advance the sequence
105
+ for(; i<text.length; i++) {
106
+ if( (cap_checker[text[i]] == null) && !client.isSpace(text[i])) {
107
+ break;
108
+ }
109
+ }
110
+
111
+ // Rewind trailing spaces
112
+ var toklen = i - starti;
113
+
114
+ for(i = starti + toklen - 1; i>=starti ; i--) {
115
+ if(client.isSpace(text[i]))
116
+ toklen--;
122
117
  else
123
- return '∰∰';
118
+ break;
119
+ }
120
+
121
+ return text.substring(starti,starti+toklen);
122
+ };
123
+
124
+ Glaemscribe.TTS.prototype.preceded_by_space = function(text,i) {
125
+ var client = this;
126
+
127
+ if(i <= 0)
128
+ return false;
129
+ else
130
+ return client.isSpace(text[i-1]);
131
+ }
132
+
133
+ Glaemscribe.TTS.prototype.succeeded_by_space = function(text,i) {
134
+ var client = this;
135
+
136
+ if(i >= text.length-1)
137
+ return false;
138
+ else
139
+ return client.isSpace(text[i+1]);
140
+ }
141
+
142
+ // Escapes raw mode AND numbers
143
+ Glaemscribe.TTS.prototype.escape_special_blocks = function(voice, entry, for_ipa) {
144
+
145
+ var config = Glaemscribe.TTS.ipa_configurations[voice];
146
+
147
+ // TODO : make this configurable
148
+
149
+ // Tokenize raw_mode escaping + numbers, we don't want them to be converted to IPA
150
+ // Also, keep numbers in the writing, to prevent espeak from pronouncing them
151
+ var ipaexpr = /(\s*)({{[\s\S]*?}}|\b[0-9][0-9\s]*\b)(\s*)/g;
152
+ var wavexpr = /(\s*)({{[\s\S]*?}})(\s*)/g;
153
+ var rawgexp = (for_ipa)?(ipaexpr):(wavexpr);
154
+
155
+ var captured = [];
156
+
157
+ var ret = entry.replace(rawgexp, function(match,p1,p2,p3) {
158
+ captured.push(match);
159
+ if(!for_ipa)
160
+ return ' '; // For wav, just replace by empty space and do not pronunce.
161
+ else {
162
+ return p1 + config['block_token'] + p3; // For IPA, replace by dummy token.
163
+ }
124
164
  });
125
-
165
+
126
166
  return [ret, captured];
127
167
  }
128
168
 
129
- Glaemscribe.TTS.prototype.pre_ipa = function(voice,text) {
130
-
169
+
170
+ Glaemscribe.TTS.prototype.ipa_instrument_punct = function(voice, text) {
171
+
131
172
  var client = this;
132
173
  var config = Glaemscribe.TTS.ipa_configurations[voice];
133
-
134
- // Normalize all tabs by spaces
135
- text = text.replace(/\t/g," ");
136
- // Small hack to prevent espeak from pronouncing last dot
137
- // since our tokenization may isolate it.
138
- text += "\n";
139
-
174
+
140
175
  var cap = client.make_char_checker(config['clauseaffecting_punctuation']);
141
176
  var cup = client.make_char_checker(config['clauseunaffecting_punctuation']);
142
-
177
+
143
178
  var accum = "";
144
179
  var kept_signs = [];
145
-
146
- var prec_is_space = false;
147
- var next_is_space = false;
148
- for(var i=0;i<text.length;i++)
180
+
181
+ var rescap = null;
182
+
183
+ for(var i=0;i<text.length;i++)
149
184
  {
150
- // Is precedent char a space ?
151
- if(i == 0)
152
- prec_is_space = false;
153
- else
154
- prec_is_space = (text[i-1] == " ");
155
-
156
- // Is precedent char a space ?
157
- if(i == text.length-1)
158
- next_is_space = false;
159
- else
160
- next_is_space = (text[i+1] == " ");
161
-
162
185
  if(text[i] == "\n")
163
186
  {
164
- accum += config['special_token_ncn'];
187
+ accum += config['punct_token'];
165
188
  kept_signs.push(text[i]);
166
189
  }
167
- else if(cap[text[i]] != null)
190
+ else if(cup[text[i]] != null)
168
191
  {
169
- if(!prec_is_space && !next_is_space)
170
- {
171
- // Always insert spaces, but remember how they were placed
172
- accum += " " + config['special_token_ncn'] + " " + text[i] + " ";
173
- kept_signs.push(text[i]);
174
- }
175
- if(!prec_is_space && next_is_space)
176
- {
177
- // Always insert spaces, but remember how they were placed
178
- accum += " " + config['special_token_ncs'] + " " + text[i] + " ";
179
- kept_signs.push(text[i] + " ");
180
- }
181
- if(prec_is_space && !next_is_space)
182
- {
183
- // Always insert spaces, but remember how they were placed
184
- accum += " " + config['special_token_scn'] + " " + text[i] + " ";
185
- kept_signs.push(" " + text[i]);
186
- }
187
- if(prec_is_space && next_is_space)
188
- {
189
- // Always insert spaces, but remember how they were placed
190
- accum += " " + config['special_token_scs'] + " " + text[i] + " ";
191
- kept_signs.push(" " + text[i] + " ");
192
- }
192
+ // This sign does not affect clause analysis by espeak.
193
+ // Replace the sign by a special "word" / token AND REMOVE the sign
194
+ // We will restore it after IPA calculation.
195
+ accum += " " + config['punct_token'] + " " ;
196
+ kept_signs.push(
197
+ ((client.preceded_by_space(text,i))?(" "):("")) +
198
+ text[i] +
199
+ ((client.succeeded_by_space(text,i))?(" "):(""))
200
+ );
193
201
  }
194
- else if(cup[text[i]] != null)
202
+ else if(rescap = client.read_cap_token(text,i,cap))
203
+ {
204
+ // This punctuation sign affects clause analysis.
205
+ // Replace the sign by a special "word" / token AND keep the sign
206
+ // Always insert spaces, but remember how they were placed
207
+ accum += " " + text[i] + " " + config['punct_token'] + " " ;
208
+ kept_signs.push(
209
+ ((client.preceded_by_space(text, i))?(" "):("")) +
210
+ rescap +
211
+ ((client.succeeded_by_space(text, i + rescap.length - 1))?(" "):(""))
212
+ );
213
+ i += rescap.length - 1;
214
+ }
215
+ else
216
+ {
217
+ accum += text[i];
218
+ }
219
+ }
220
+
221
+ return [accum, kept_signs];
222
+ }
223
+
224
+ Glaemscribe.TTS.prototype.wav_instrument_punct = function(voice, text) {
225
+
226
+ var client = this;
227
+ var config = Glaemscribe.TTS.ipa_configurations[voice];
228
+ var cap = client.make_char_checker(config['clauseaffecting_punctuation']);
229
+ var accum = "";
230
+ var rescap = null;
231
+
232
+ for(var i=0;i<text.length;i++)
233
+ {
234
+ if(rescap = client.read_cap_token(text,i,cap))
195
235
  {
196
- // The difference is that we don't keep the sign before calculating ipa.
197
- // Just remove them to avoid espeak spell them
198
- if(!prec_is_space && !next_is_space)
199
- {
200
- accum += " " + config['special_token_ncn'] + " " ;
201
- kept_signs.push(text[i]);
202
- }
203
- if(!prec_is_space && next_is_space)
204
- {
205
- accum += " " + config['special_token_ncs'] + " " ;
206
- kept_signs.push(text[i] + " ");
207
- }
208
- if(prec_is_space && !next_is_space)
209
- {
210
- accum += " " + config['special_token_scn'] + " " ;
211
- kept_signs.push(" " + text[i]);
212
- }
213
- if(prec_is_space && next_is_space)
214
- {
215
- accum += " " + config['special_token_scs'] + " " ;
216
- kept_signs.push(" " + text[i] + " ");
217
- }
236
+ accum += text[i]; // Just keep the first sign, ignore the others
237
+ i += rescap.length - 1;
218
238
  }
219
239
  else
220
240
  {
221
241
  accum += text[i];
222
242
  }
223
243
  }
224
-
225
- //console.log(accum);
226
- //console.log(kept_signs)
227
- return [accum,kept_signs];
244
+
245
+ return accum;
228
246
  }
229
247
 
230
- Glaemscribe.TTS.prototype.post_ipa = function(voice, ipa, kept_tokens) {
231
-
248
+ Glaemscribe.TTS.prototype.ipa_instrument_blocks = function(voice, text)
249
+ {
250
+ var client = this;
232
251
  var config = Glaemscribe.TTS.ipa_configurations[voice];
233
- ipa = ipa.replace(/\n/g," ");
234
- if(config['pre_reconsitute_markers_callback'])
235
- ipa = config['pre_reconsitute_markers_callback'](ipa);
236
-
237
- var ncnr = new RegExp("\\s*(" + config['special_token_ipa_ncn'] + ")\\s*","g");
238
- var scnr = new RegExp("\\s*(" + config['special_token_ipa_scn'] + ")\\s*","g");
239
- var ncsr = new RegExp("\\s*(" + config['special_token_ipa_ncs'] + ")\\s*","g");
240
- var scsr = new RegExp("\\s*(" + config['special_token_ipa_scs'] + ")\\s*","g");
241
-
242
- // console.log("=====")
243
- // console.log(ipa)
244
- // console.log(config)
245
- // console.log(ncsr)
246
-
247
- // Tokens have been accumulated linearly
248
- ipa = ipa.replace(ncnr, function(match, contents, offset, s) {return '∰∰'; });
249
- ipa = ipa.replace(ncsr, function(match, contents, offset, s) {return '∰∰'; });
250
- ipa = ipa.replace(scnr, function(match, contents, offset, s) {return '∰∰'; });
251
- ipa = ipa.replace(scsr, function(match, contents, offset, s) {return '∰∰'; });
252
-
253
- // console.log("=====")
254
- // console.log(ipa)
252
+
253
+ return this.escape_special_blocks(voice, text, true);
254
+ }
255
+
256
+ Glaemscribe.TTS.prototype.ipa_restore_tokens = function(text, token, kept_tokens) {
257
+
258
+ var rx = new RegExp("\\s*(" + token + ")\\s*","g");
259
+
255
260
  var nth = -1;
256
- ipa = ipa.replace(/∰∰/g,function(match, contents, offset, s) {
261
+ text = text.replace(rx,function(match, contents, offset, s) {
257
262
  nth += 1;
258
263
  return kept_tokens[nth];
259
264
  });
260
-
265
+
266
+ return text;
267
+ }
268
+
269
+ Glaemscribe.TTS.prototype.post_ipa = function(voice, ipa, pre_ipa_res) {
270
+
271
+ var client = this;
272
+ var config = Glaemscribe.TTS.ipa_configurations[voice];
273
+ ipa = ipa.replace(/\n/g, " ");
274
+
275
+ ipa = client.ipa_restore_tokens(ipa, config.punct_token, pre_ipa_res.punct_tokens);
276
+ ipa = client.ipa_restore_tokens(ipa, config.block_token, pre_ipa_res.block_tokens);
277
+
261
278
  // Post-treatment of anti 'dot' pronunciation hack
262
- if(ipa[ipa.length-1] === "\n")
279
+ if(ipa[ipa.length-1] === "\n")
263
280
  ipa = ipa.slice(0,-1);
264
-
265
- // console.log("=====")
266
- // console.log(ipa)
281
+
267
282
  return ipa
268
283
  }
269
284
 
270
285
 
271
- Glaemscribe.TTS.prototype.synthesize_ipa = function(text, args, onended) {
272
-
286
+ Glaemscribe.TTS.prototype.pre_ipa = function(args, voice, text) {
287
+
273
288
  var client = this;
274
-
275
- args = args || {}
276
- var voice = args.voice || 'en'
277
-
278
- var ts = new Date();
279
- var tp = ts;
280
-
281
- // Cache raw things
282
- var pre_raw_tokens = [];
289
+ var config = Glaemscribe.TTS.ipa_configurations[voice];
290
+
291
+ if(!config)
292
+ throw "Trying to use unsupported voice '" + voice + "'!";
293
+
294
+ // Normalize all tabs by spaces
295
+ text = text.replace(/\t/g," ");
296
+
297
+ // Small hack to prevent espeak from pronouncing last dot
298
+ // since our tokenization may isolate it.
299
+ text += "\n";
300
+
301
+ // Instrument blocks first (they may contain punctuation)
302
+ var bi = client.ipa_instrument_blocks(voice,text);
303
+ text = bi[0];
304
+
305
+ // Instrument punctuation, then
306
+ var pi = client.ipa_instrument_punct(voice,text);
307
+ text = pi[0];
308
+
309
+ // Small hack to always have a capital after a dot.
310
+ // And prevent espeak from transcribing/pronouncing "dot"
311
+ text = text.replace(/(\.\s+.)/g, function(match,p1) {
312
+ return p1.toUpperCase()
313
+ });
314
+
315
+ return {
316
+ text: text,
317
+ block_tokens: bi[1],
318
+ punct_tokens: pi[1]
319
+ }
320
+ }
321
+
322
+ Glaemscribe.TTS.prototype.pre_wav = function(args, voice, text) {
323
+ var client = this;
324
+ var config = Glaemscribe.TTS.ipa_configurations[voice];
325
+
326
+ if(!config)
327
+ throw "Trying to use unsupported voice '" + voice + "'!";
328
+
329
+ // First, escape the special blocks. Just ignore them.
283
330
  if(args.has_raw_mode) {
284
- var pre_raw_res = this.escape_raw_mode(text,false);
331
+ var pre_raw_res = this.escape_special_blocks(voice, text, false);
285
332
  text = pre_raw_res[0];
286
- pre_raw_tokens = pre_raw_res[1];
287
- }
288
-
289
- // Pre parse ipa
290
- var pre_ipa_tokens = [];
291
- var pre_ipa_res = client.pre_ipa(voice,text);
292
- text = pre_ipa_res[0];
293
- pre_ipa_tokens = pre_ipa_res[1];
294
-
295
- // Restitute raw things
296
- if(args.has_raw_mode) {
297
- var nth = -1;
298
- text = text.replace(/∰∰/g,function(match, contents, offset, s) {
299
- nth += 1;
300
- return pre_raw_tokens[nth];
301
- });
302
333
  }
303
-
304
- args = args || {}
305
- client.proxy.set_voice(args.voice || 'en');
334
+
335
+ // Now simplify the punctuation to avoid problems.
336
+ text = this.wav_instrument_punct(voice, text);
337
+
338
+ return {
339
+ text: text
340
+ }
341
+ }
342
+
343
+ //////////////////
344
+ // SYNTHESIZE //
345
+ //////////////////
346
+
347
+
348
+ Glaemscribe.TTS.prototype.synthesize_ipa = function(text, args, onended) {
349
+
350
+ var client = this;
351
+ args = args || {};
352
+ var voice = args.voice || 'en-tengwar'
353
+
354
+ // Pre parse text and find raw mode things {{ ... }}
355
+ // Cache them. This will also perform the pre-instrumentation
356
+ // To treat each block as one word
357
+ var pipa = client.pre_ipa(args, voice, text);
358
+ text = pipa['text'];
359
+
360
+ // Now the IPA is instrumented.
361
+ // Prepare client
362
+ client.proxy.set_voice(voice);
306
363
 
307
364
  var ts = new Date();
308
365
  var ret = {};
309
366
  client.proxy.synthesize(text, false, true, true, function(result) {
310
-
311
367
  // Post parse ipa
312
- result.ipa = client.post_ipa(voice, result.pho, pre_ipa_tokens);
368
+ result.ipa = client.post_ipa(voice, result.pho, pipa);
313
369
 
314
370
  var te = new Date();
315
371
  result.synthesis_time = (te - ts);
316
372
  delete result.pho;
317
-
373
+
318
374
  if(onended)
319
375
  onended(result);
320
376
 
321
377
  ret = result;
322
378
  });
379
+
323
380
  return ret;
324
381
  }
325
382
 
326
383
  // Should be kept separated from IPA, because we do not work on the same text
327
384
  Glaemscribe.TTS.prototype.synthesize_wav = function(text, args, onended) {
328
-
329
- var client = this;
330
-
385
+
386
+ var client = this;
331
387
  args = args || {}
332
- var voice = args.voice || 'en'
388
+ var voice = args.voice || 'en-tengwar'
389
+
390
+ // Pre-transform text
391
+ var pwav = client.pre_wav(args, voice, text);
392
+ text = pwav['text'];
333
393
 
334
- args = args || {}
394
+ // Prepare client
335
395
  client.proxy.set_rate(args.rate || 120);
336
396
  client.proxy.set_pitch(args.pitch || 5);
337
- client.proxy.set_voice(args.voice || 'en');
338
-
339
- if(args.has_raw_mode) {
340
- var pre_raw_res = this.escape_raw_mode(text,true);
341
- text = pre_raw_res[0];
342
- }
397
+ client.proxy.set_voice(voice);
343
398
 
344
- var ret = {};
345
399
  var ts = new Date();
400
+ var ret = {};
346
401
  client.proxy.synthesize(text, true, false, false, function(result) {
347
402
  var te = new Date();
348
403
  result.synthesis_time = (te - ts);
349
404
  delete result.pho;
350
-
405
+
351
406
  // Uint8Array > Array conversion, for ruby?
352
- // ret.wav = [].slice.call(ret.wav);
353
-
407
+ // ret.wav = [].slice.call(ret.wav);
408
+
354
409
  if(onended)
355
410
  onended(result);
356
-
411
+
357
412
  ret = result;
358
413
  });
359
-
414
+
360
415
  return ret;
361
416
  }
362
417
 
418
+
419
+ // Below is an experiment with a parsing tool for orthographic modes.
420
+ // Not finished and probably not usable.
421
+ Glaemscribe.TTS.TokenType = {};
422
+ Glaemscribe.TTS.TokenType.WORD = 'WORD';
423
+ Glaemscribe.TTS.TokenType.NON_WORD = 'NON_WORD';
424
+ Glaemscribe.TTS.TokenType.NUM = 'NUM';
425
+ Glaemscribe.TTS.TokenType.SPACE = 'SPACE';
426
+ Glaemscribe.TTS.TokenType.PUNCT = 'PUNCT';
427
+
428
+ Glaemscribe.TTS.prototype.orthographic_disambiguator_en = function(text) {
429
+
430
+ var client = this;
431
+
432
+ var uwmatcher = /(\p{L}+)/u;
433
+ var spl = text.split(uwmatcher);
434
+
435
+ var tokens = spl.map(function(s) {
436
+ var t = {};
437
+ var is_word = s.match(uwmatcher)
438
+
439
+ t.type = (is_word)?(Glaemscribe.TTS.TokenType.WORD):(Glaemscribe.TTS.TokenType.NON_WORD);
440
+ t.content = s;
441
+ return t;
442
+ });
443
+
444
+ var tokens2 = [];
445
+
446
+ // Handle apostrophe
447
+ for(var i=0;i<tokens.length;i++) {
448
+ if( i == 0 || i == tokens.length-1 || tokens[i].type == Glaemscribe.TTS.TokenType.WORD ) {
449
+ tokens2.push(tokens[i]);
450
+ continue;
451
+ }
452
+
453
+ if(tokens[i].content == "'" &&
454
+ tokens[i-1].type == Glaemscribe.TTS.TokenType.WORD &&
455
+ tokens[i+1].type == Glaemscribe.TTS.TokenType.WORD )
456
+ {
457
+ tokens2.pop();
458
+ var tok = {};
459
+ tok.type = Glaemscribe.TTS.TokenType.WORD;
460
+ tok.content = tokens[i-1].content + tokens[i].content + tokens[i+1].content;
461
+ tokens2.push(tok);
462
+ i += 1;
463
+ }
464
+ else {
465
+ tokens2.push(tokens[i]);
466
+ }
467
+ }
468
+ tokens = tokens2;
469
+
470
+ // Numerize tokens
471
+ var i = 0;
472
+ tokens.forEach(function(t) {
473
+ t.num = i;
474
+ i += 1;
475
+ });
476
+
477
+ // Remove non-speakable tokens
478
+ var stokens = tokens.filter(function(t) {
479
+ return (t.type == Glaemscribe.TTS.TokenType.WORD);
480
+ });
481
+
482
+ // Join speakable tokens
483
+ var r = stokens.map(function(t) { return t.content}).join('  ');
484
+
485
+ var args = {};
486
+ var voice = args.voice || 'en-tengwar';
487
+
488
+ client.proxy.set_voice(voice);
489
+ client.proxy.synthesize(r, false, true, true, function(result) {
490
+ r = result.pho;
491
+ });
492
+ r = r.split('').map(function(t) { return t.trim() });
493
+
494
+ var j = 0;
495
+ r.forEach(function(w) {
496
+ tokens[stokens[j].num].ipa = r[j];
497
+ j += 1;
498
+ });
499
+
500
+ return tokens;
501
+ }
502
+
363
503
  Glaemscribe.TTS.is_engine_loaded = function() {
364
504
  return (typeof(ESpeakNGGlue) !== 'undefined');
365
505
  };