n8n-nodes-tts-bigboss 1.0.7 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/TTSBigBoss.node.js +281 -62
- package/nodes/TTSBigBoss/TTSBigBoss.node.ts +374 -97
- package/package.json +9 -4
package/dist/TTSBigBoss.node.js
CHANGED
|
@@ -48,6 +48,134 @@ const http = __importStar(require("http"));
|
|
|
48
48
|
const stream = __importStar(require("stream"));
|
|
49
49
|
const util_1 = require("util");
|
|
50
50
|
const pipeline = (0, util_1.promisify)(stream.pipeline);
|
|
51
|
+
const MAX_CHARS_PER_CHUNK = 300;
|
|
52
|
+
const SILENCE_DURATION_MS = 200;
|
|
53
|
+
/**
 * Splits text into trimmed chunks no longer than `maxChars` UTF-16 code units.
 *
 * Sentences (runs ending in `.`, `!` or `?`) are packed greedily into a chunk.
 * A single sentence that exceeds the limit is broken at the last space at or
 * before the limit, or hard-split when it contains no such space.
 *
 * @param text     Text to split.
 * @param maxChars Maximum chunk length; defaults to MAX_CHARS_PER_CHUNK.
 *                 Values below 1 (or NaN) are treated as 1 so the splitter
 *                 always makes progress.
 * @returns Non-empty, trimmed chunks in their original order.
 */
function splitTextIntoChunks(text, maxChars = MAX_CHARS_PER_CHUNK) {
    const limit = Math.max(1, Math.floor(maxChars) || 1);
    const chunks = [];
    const sentences = text.match(/[^.!?]+[.!?]*/g) || [text];
    let currentChunk = '';
    for (const sentence of sentences) {
        if (currentChunk.length + sentence.length <= limit) {
            currentChunk += sentence;
            continue;
        }
        if (currentChunk.length > 0) {
            chunks.push(currentChunk.trim());
        }
        currentChunk = sentence;
        // A single sentence may still be too long: break it down further.
        while (currentChunk.length > limit) {
            let splitPoint = currentChunk.lastIndexOf(' ', limit);
            if (splitPoint === -1) {
                splitPoint = limit;
                // Never cut between the two halves of a surrogate pair
                // (emoji, rare CJK); that would emit broken characters.
                const before = currentChunk.charCodeAt(splitPoint - 1);
                if (before >= 0xD800 && before <= 0xDBFF) {
                    splitPoint += splitPoint > 1 ? -1 : 1;
                }
            }
            chunks.push(currentChunk.substring(0, splitPoint).trim());
            currentChunk = currentChunk.substring(splitPoint).trim();
        }
    }
    if (currentChunk.length > 0) {
        chunks.push(currentChunk.trim());
    }
    // Whitespace-only pieces trim down to '' and are dropped here.
    return chunks.filter(chunk => chunk.length > 0);
}
|
|
81
|
+
/**
 * Wraps mono 16-bit PCM audio in a canonical 44-byte RIFF/WAVE header.
 *
 * @param audioData  Either a Float32Array of samples (clamped to [-1, 1] and
 *                   scaled to int16) or a Buffer that is treated as raw
 *                   little-endian 16-bit PCM. A trailing odd byte is ignored.
 * @param sampleRate Sample rate written to the header (default 24000).
 * @returns A new Buffer holding header + PCM data.
 */
function createWavBuffer(audioData, sampleRate = 24000) {
    const numChannels = 1;
    const bitsPerSample = 16;
    const bytesPerSample = bitsPerSample / 8;
    const headerSize = 44;
    const isFloat = audioData instanceof Float32Array;
    const sampleCount = isFloat
        ? audioData.length
        : Math.floor(audioData.byteLength / bytesPerSample);
    const dataSize = sampleCount * bytesPerSample;
    const buffer = Buffer.alloc(headerSize + dataSize);

    // RIFF container + 'fmt ' chunk (PCM, mono, 16 bit) + 'data' chunk header.
    buffer.write('RIFF', 0);
    buffer.writeUInt32LE(36 + dataSize, 4);
    buffer.write('WAVE', 8);
    buffer.write('fmt ', 12);
    buffer.writeUInt32LE(16, 16);
    buffer.writeUInt16LE(1, 20);
    buffer.writeUInt16LE(numChannels, 22);
    buffer.writeUInt32LE(sampleRate, 24);
    buffer.writeUInt32LE(sampleRate * numChannels * bytesPerSample, 28);
    buffer.writeUInt16LE(numChannels * bytesPerSample, 32);
    buffer.writeUInt16LE(bitsPerSample, 34);
    buffer.write('data', 36);
    buffer.writeUInt32LE(dataSize, 40);

    if (isFloat) {
        for (let i = 0; i < sampleCount; i++) {
            const s = Math.max(-1, Math.min(1, audioData[i]));
            // Truncate toward zero (what an Int16Array store does); NaN -> 0.
            const scaled = Math.trunc(s < 0 ? s * 0x8000 : s * 0x7FFF) || 0;
            buffer.writeInt16LE(scaled, headerSize + i * bytesPerSample);
        }
    }
    else {
        // Copy bytes directly. An Int16Array view over the Buffer's memory
        // throws RangeError when byteOffset is odd, which happens with pooled
        // or sliced Buffers.
        buffer.set(new Uint8Array(audioData.buffer, audioData.byteOffset, dataSize), headerSize);
    }
    return buffer;
}
|
|
116
|
+
/**
 * Joins audio chunks into one mono 16-bit WAV, inserting silence between them.
 *
 * @param audioChunks       Chunks of shape `{ audio, sampling_rate }`, where
 *                          `audio` is a Float32Array or a Buffer treated as
 *                          raw little-endian 16-bit PCM.
 * @param silenceDurationMs Gap inserted between consecutive chunks.
 * @param sampleRate        Rate used for the silence length and the output
 *                          header (default 24000).
 * @returns WAV Buffer built by createWavBuffer, except for the single-Buffer
 *          case described below.
 *
 * NOTE(review): with exactly one chunk whose audio is a Buffer, that Buffer is
 * returned unchanged (no header is added). With several chunks, the per-chunk
 * `sampling_rate` is not consulted; every chunk is assumed to be at `sampleRate`.
 */
function concatenateAudioBuffers(audioChunks, silenceDurationMs, sampleRate = 24000) {
    if (audioChunks.length === 0) {
        return createWavBuffer(new Float32Array(), sampleRate);
    }
    if (audioChunks.length === 1) {
        const only = audioChunks[0];
        return only.audio instanceof Buffer
            ? only.audio
            : createWavBuffer(only.audio, only.sampling_rate);
    }

    const silenceSamples = Math.round((silenceDurationMs / 1000) * sampleRate);

    // Normalise every chunk to Float32 samples.
    const float32Chunks = audioChunks.map(({ audio }) => {
        if (audio instanceof Float32Array) {
            return audio;
        }
        // DataView has no alignment requirement, unlike an Int16Array view,
        // which throws RangeError for Buffers with an odd byteOffset. It also
        // lets us state the byte order explicitly.
        const sampleCount = Math.floor(audio.byteLength / 2);
        const view = new DataView(audio.buffer, audio.byteOffset, sampleCount * 2);
        const samples = new Float32Array(sampleCount);
        for (let i = 0; i < sampleCount; i++) {
            const value = view.getInt16(i * 2, true);
            samples[i] = value / (value < 0 ? 0x8000 : 0x7FFF);
        }
        return samples;
    });

    const audioLength = float32Chunks.reduce((sum, samples) => sum + samples.length, 0);
    const combinedAudio = new Float32Array(audioLength + (float32Chunks.length - 1) * silenceSamples);

    let offset = 0;
    float32Chunks.forEach((samples, index) => {
        combinedAudio.set(samples, offset);
        offset += samples.length;
        // The array is zero-initialised, so skipping ahead leaves silence.
        if (index < float32Chunks.length - 1) {
            offset += silenceSamples;
        }
    });
    return createWavBuffer(combinedAudio, sampleRate);
}
|
|
156
|
+
/**
 * Builds SRT subtitle text with one cue per text chunk.
 *
 * @param textChunks        Cue texts, in playback order.
 * @param audioDurations    Duration of each chunk's audio in milliseconds;
 *                          missing or falsy entries count as 0.
 * @param silenceDurationMs Gap between consecutive cues in milliseconds.
 *                          Defaults to SILENCE_DURATION_MS; pass the same value
 *                          given to the audio concatenation so cues stay in sync.
 * @returns SRT text, or '' when there are no chunks.
 */
function generateSRTFromChunks(textChunks, audioDurations, silenceDurationMs = SILENCE_DURATION_MS) {
    if (textChunks.length === 0) {
        return '';
    }
    const pad = (value, width) => value.toString().padStart(width, '0');
    // Formats non-negative milliseconds as HH:MM:SS,mmm.
    const msToSrt = (ms) => {
        const whole = Math.floor(ms);
        const totalSec = Math.floor(whole / 1000);
        const h = Math.floor(totalSec / 3600);
        const m = Math.floor((totalSec % 3600) / 60);
        const s = totalSec % 60;
        return `${pad(h, 2)}:${pad(m, 2)}:${pad(s, 2)},${pad(whole % 1000, 3)}`;
    };
    let srt = '';
    let currentTime = 0;
    textChunks.forEach((chunkText, index) => {
        const duration = audioDurations[index] || 0;
        const endTime = currentTime + duration;
        srt += `${index + 1}\n${msToSrt(currentTime)} --> ${msToSrt(endTime)}\n${chunkText.trim()}\n\n`;
        // The next cue starts after the inserted silence.
        currentTime = endTime + silenceDurationMs;
    });
    return srt;
}
|
|
51
179
|
const PIPER_MODELS = [
|
|
52
180
|
{ name: 'Arabic (Jordan) - Kareem (Male) - Low', value: 'ar_JO-kareem-low' },
|
|
53
181
|
{ name: 'Arabic (Jordan) - Kareem (Male) - Medium', value: 'ar_JO-kareem-medium' },
|
|
@@ -73,6 +201,16 @@ const PIPER_MODELS = [
|
|
|
73
201
|
{ name: 'German - Thorsten (Male) - Low', value: 'de_DE-thorsten-low' },
|
|
74
202
|
];
|
|
75
203
|
const EDGE_URL = 'wss://speech.platform.bing.com/consumer/speech/synthesize/readaloud/edge/v1?TrustedClientToken=6A5AA1D4EAFF4E9FB37E23D68491D6F4';
|
|
204
|
+
// Fixed set of HTTP headers imitating a desktop Microsoft Edge 120 client on
// Windows. NOTE(review): presumably sent with requests to EDGE_URL; the call
// site is outside this excerpt.
const EDGE_HEADERS = {
    Authority: 'speech.platform.bing.com',
    'Sec-CH-UA': '"Not_A Brand";v="8", "Chromium";v="120", "Microsoft Edge";v="120"',
    'Sec-CH-UA-Mobile': '?0',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
    'Sec-CH-UA-Platform': '"Windows"',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9',
    Origin: 'chrome-extension://jdiccldimpdaibmpdkjnbmckianbfold',
};
|
|
76
214
|
const EDGE_VOICES = [
|
|
77
215
|
{ name: 'Arabic (Egypt) - Salma', value: 'ar-EG-SalmaNeural' },
|
|
78
216
|
{ name: 'Arabic (Egypt) - Shakir', value: 'ar-EG-ShakirNeural' },
|
|
@@ -127,6 +265,11 @@ class TTSBigBoss {
|
|
|
127
265
|
value: 'coqui',
|
|
128
266
|
description: 'Connect to a running Coqui TTS/XTTS server.',
|
|
129
267
|
},
|
|
268
|
+
{
|
|
269
|
+
name: 'Kokoro TTS (Local OpenAI API)',
|
|
270
|
+
value: 'kokoro',
|
|
271
|
+
description: 'Connect to a local Kokoro server compatible with OpenAI API (e.g. /v1/audio/speech).',
|
|
272
|
+
},
|
|
130
273
|
{
|
|
131
274
|
name: 'System Command (Custom)',
|
|
132
275
|
value: 'system',
|
|
@@ -276,12 +419,47 @@ class TTSBigBoss {
|
|
|
276
419
|
},
|
|
277
420
|
description: 'Name from Hugging Face (e.g. en_US-bryce-medium) or full URL to .onnx file.',
|
|
278
421
|
},
|
|
422
|
+
{
|
|
423
|
+
displayName: 'API URL',
|
|
424
|
+
name: 'kokoroUrl',
|
|
425
|
+
type: 'string',
|
|
426
|
+
default: 'http://localhost:8880/v1/audio/speech',
|
|
427
|
+
description: 'Endpoint URL for Kokoro generation (OpenAI compatible).',
|
|
428
|
+
displayOptions: {
|
|
429
|
+
show: {
|
|
430
|
+
engine: ['kokoro'],
|
|
431
|
+
},
|
|
432
|
+
},
|
|
433
|
+
},
|
|
434
|
+
{
|
|
435
|
+
displayName: 'Voice / Model',
|
|
436
|
+
name: 'kokoroVoice',
|
|
437
|
+
type: 'string',
|
|
438
|
+
default: 'af_bella',
|
|
439
|
+
description: 'Voice ID (e.g. af_bella, af_sarah, am_adam). Arabic might require specific model ID.',
|
|
440
|
+
displayOptions: {
|
|
441
|
+
show: {
|
|
442
|
+
engine: ['kokoro'],
|
|
443
|
+
},
|
|
444
|
+
},
|
|
445
|
+
},
|
|
446
|
+
{
|
|
447
|
+
displayName: 'Speed',
|
|
448
|
+
name: 'kokoroSpeed',
|
|
449
|
+
type: 'number',
|
|
450
|
+
default: 1.0,
|
|
451
|
+
displayOptions: {
|
|
452
|
+
show: {
|
|
453
|
+
engine: ['kokoro'],
|
|
454
|
+
},
|
|
455
|
+
},
|
|
456
|
+
},
|
|
279
457
|
{
|
|
280
458
|
displayName: 'Base Server URL',
|
|
281
459
|
name: 'coquiUrl',
|
|
282
460
|
type: 'string',
|
|
283
|
-
default: 'http://
|
|
284
|
-
description: 'Base URL of Coqui server (e.g. http://
|
|
461
|
+
default: 'http://localhost:5002',
|
|
462
|
+
description: 'Base URL of Coqui server (e.g. http://localhost:5002 or http://host.docker.internal:5002).',
|
|
285
463
|
displayOptions: {
|
|
286
464
|
show: {
|
|
287
465
|
engine: ['coqui'],
|
|
@@ -434,6 +612,21 @@ class TTSBigBoss {
|
|
|
434
612
|
srtBuffer = Buffer.from(result.srt, 'utf8');
|
|
435
613
|
}
|
|
436
614
|
}
|
|
615
|
+
else if (engine === 'kokoro') {
|
|
616
|
+
const url = this.getNodeParameter('kokoroUrl', i);
|
|
617
|
+
const voice = this.getNodeParameter('kokoroVoice', i);
|
|
618
|
+
const speed = this.getNodeParameter('kokoroSpeed', i);
|
|
619
|
+
const payload = {
|
|
620
|
+
model: 'kokoro',
|
|
621
|
+
input: text,
|
|
622
|
+
voice: voice,
|
|
623
|
+
speed: speed,
|
|
624
|
+
response_format: 'mp3'
|
|
625
|
+
};
|
|
626
|
+
audioBuffer = await httpRequest(url, 'POST', payload);
|
|
627
|
+
const duration = getAudioDuration(audioBuffer, 'mp3');
|
|
628
|
+
srtBuffer = Buffer.from(generateHeuristicSRT(text, duration), 'utf8');
|
|
629
|
+
}
|
|
437
630
|
else if (engine === 'piper_local') {
|
|
438
631
|
let piperModel = this.getNodeParameter('piperModel', i);
|
|
439
632
|
if (piperModel === 'custom') {
|
|
@@ -456,7 +649,7 @@ class TTSBigBoss {
|
|
|
456
649
|
if (code === 0)
|
|
457
650
|
resolve();
|
|
458
651
|
if (errData.includes('json.exception.parse_error')) {
|
|
459
|
-
reject(new Error(`Piper Config Error: The downloaded JSON configuration for model '${piperModel}' seems corrupted
|
|
652
|
+
reject(new Error(`Piper Config Error: The downloaded JSON configuration for model '${piperModel}' seems corrupted. Try deleting the file at ${configPath}.`));
|
|
460
653
|
}
|
|
461
654
|
else {
|
|
462
655
|
reject(new Error(`Piper failed (exit ${code}): ${errData}`));
|
|
@@ -467,7 +660,8 @@ class TTSBigBoss {
|
|
|
467
660
|
if (!fs.existsSync(outFile))
|
|
468
661
|
throw new Error('Piper did not produce output file');
|
|
469
662
|
audioBuffer = fs.readFileSync(outFile);
|
|
470
|
-
|
|
663
|
+
const duration = getAudioDuration(audioBuffer, 'wav');
|
|
664
|
+
srtBuffer = Buffer.from(generateHeuristicSRT(text, duration), 'utf8');
|
|
471
665
|
fs.unlinkSync(outFile);
|
|
472
666
|
}
|
|
473
667
|
else if (engine === 'coqui') {
|
|
@@ -488,7 +682,8 @@ class TTSBigBoss {
|
|
|
488
682
|
payload.speaker_id = speakerSelection;
|
|
489
683
|
}
|
|
490
684
|
audioBuffer = await httpRequest(url, 'POST', payload);
|
|
491
|
-
|
|
685
|
+
const duration = getAudioDuration(audioBuffer, 'wav');
|
|
686
|
+
srtBuffer = Buffer.from(generateHeuristicSRT(text, duration), 'utf8');
|
|
492
687
|
}
|
|
493
688
|
else {
|
|
494
689
|
const commandTpl = this.getNodeParameter('systemCommand', i);
|
|
@@ -522,7 +717,8 @@ class TTSBigBoss {
|
|
|
522
717
|
throw new Error('System command did not produce output file at expected path');
|
|
523
718
|
}
|
|
524
719
|
audioBuffer = fs.readFileSync(outFile);
|
|
525
|
-
|
|
720
|
+
const duration = getAudioDuration(audioBuffer);
|
|
721
|
+
srtBuffer = Buffer.from(generateHeuristicSRT(text, duration), 'utf8');
|
|
526
722
|
if (fs.existsSync(outFile))
|
|
527
723
|
fs.unlinkSync(outFile);
|
|
528
724
|
}
|
|
@@ -667,23 +863,41 @@ function ticksToTime(ticks) {
|
|
|
667
863
|
const mili = date.getMilliseconds().toString().padStart(3, '0');
|
|
668
864
|
return `${h}:${m}:${s},${mili}`;
|
|
669
865
|
}
|
|
670
|
-
function
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
const
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
|
|
866
|
+
/**
 * Downloads an HTTPS resource to `dest`, following redirects.
 *
 * @param url           HTTPS URL to fetch.
 * @param dest          Destination file path; removed again (best effort) on failure.
 * @param redirectsLeft Remaining redirects to follow (default 5). Callers
 *                      normally omit it; it is decremented on each hop.
 * @returns Promise that resolves once the file is written and closed.
 *          Rejects with an Error on a write error, network error, non-200
 *          status, or a bad or excessive redirect.
 */
async function downloadFile(url, dest, redirectsLeft = 5) {
    return new Promise((resolve, reject) => {
        const file = fs.createWriteStream(dest);
        let settled = false;

        // Single failure path: stop writing, remove the partial file, reject once.
        const fail = (error) => {
            if (settled) {
                return;
            }
            settled = true;
            file.destroy();
            fs.unlink(dest, () => { });
            reject(error);
        };

        file.on('error', (err) => fail(new Error(`File write error: ${err.message}`)));

        const onResponse = (response) => {
            const status = response.statusCode;

            if (status === 301 || status === 302 || status === 303 || status === 307 || status === 308) {
                // Drain the body so the socket is released.
                response.resume();
                const location = response.headers.location;
                if (!location) {
                    fail(new Error(`Redirect (${status}) without Location header for URL: ${url}`));
                    return;
                }
                if (redirectsLeft <= 0) {
                    fail(new Error(`Too many redirects for URL: ${url}`));
                    return;
                }
                let nextUrl;
                try {
                    // Location may be relative; resolve it against the current URL.
                    nextUrl = new URL(location, url).toString();
                }
                catch (e) {
                    fail(new Error(`Invalid redirect location '${location}' for URL: ${url}`));
                    return;
                }
                settled = true;
                // Wait until this stream is closed before the next hop reopens
                // the same path.
                file.close(() => {
                    downloadFile(nextUrl, dest, redirectsLeft - 1).then(resolve).catch(reject);
                });
                return;
            }

            if (status && status !== 200) {
                response.resume();
                fail(new Error(`Download failed with status code: ${status} for URL: ${url}`));
                return;
            }

            response.on('error', (err) => fail(new Error(`Network error: ${err.message}`)));
            response.pipe(file);
            file.on('finish', () => {
                file.close((err) => {
                    if (settled) {
                        return;
                    }
                    settled = true;
                    if (err)
                        reject(err);
                    else
                        resolve();
                });
            });
        };

        try {
            https.get(url, onResponse)
                .on('error', (err) => fail(new Error(`Network error: ${err.message}`)));
        }
        catch (err) {
            // https.get throws synchronously for malformed or non-https URLs.
            fail(new Error(`Invalid download URL '${url}': ${err instanceof Error ? err.message : String(err)}`));
        }
    });
}
|
|
688
902
|
async function ensurePiperBinary(binDir) {
|
|
689
903
|
const platform = os.platform();
|
|
@@ -735,15 +949,15 @@ async function ensurePiperModel(binDir, modelNameOrUrl) {
|
|
|
735
949
|
else {
|
|
736
950
|
const parts = modelNameOrUrl.split('-');
|
|
737
951
|
if (parts.length >= 3) {
|
|
738
|
-
const langRegion = parts[0]
|
|
739
|
-
const voice = parts[
|
|
740
|
-
const quality = parts[
|
|
741
|
-
const lang =
|
|
952
|
+
const langRegion = parts[0];
|
|
953
|
+
const voice = parts[1];
|
|
954
|
+
const quality = parts[2];
|
|
955
|
+
const lang = langRegion.split('_')[0];
|
|
742
956
|
modelFilename = modelNameOrUrl + '.onnx';
|
|
743
957
|
modelUrl = `https://huggingface.co/rhasspy/piper-voices/resolve/main/${lang}/${langRegion}/${voice}/${quality}/${modelFilename}?download=true`;
|
|
744
958
|
}
|
|
745
959
|
else {
|
|
746
|
-
throw new Error(`Invalid model name format: ${modelNameOrUrl}
|
|
960
|
+
throw new Error(`Invalid model name format: ${modelNameOrUrl}.`);
|
|
747
961
|
}
|
|
748
962
|
}
|
|
749
963
|
const modelPath = path.join(binDir, modelFilename);
|
|
@@ -772,41 +986,46 @@ async function ensurePiperModel(binDir, modelNameOrUrl) {
|
|
|
772
986
|
}
|
|
773
987
|
return { modelPath, configPath };
|
|
774
988
|
}
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
file.close();
|
|
785
|
-
downloadFile(response.headers.location, dest).then(resolve).catch(reject);
|
|
786
|
-
return;
|
|
787
|
-
}
|
|
788
|
-
if (response.statusCode && response.statusCode !== 200) {
|
|
789
|
-
file.close();
|
|
790
|
-
fs.unlink(dest, () => { });
|
|
791
|
-
reject(new Error(`Download failed with status code: ${response.statusCode} for URL: ${url}`));
|
|
792
|
-
return;
|
|
989
|
+
/**
 * Returns the duration in seconds of a RIFF/WAVE buffer, or -1 when it cannot
 * be determined.
 *
 * The RIFF chunk list is walked to find the `fmt ` chunk (byte rate) and the
 * `data` chunk (payload size), so files with extra chunks before the audio
 * (e.g. `LIST`) are measured correctly. A declared data size larger than the
 * bytes actually present is capped at what is available.
 *
 * @param buffer Audio file contents.
 * @param hint   Format hint from the caller. Kept for interface
 *               compatibility; detection relies on the RIFF/WAVE magic bytes,
 *               so a wrong hint can no longer produce a bogus duration.
 * @returns Duration in seconds (may be 0), or -1 for empty, non-WAV or
 *          malformed input.
 */
function getAudioDuration(buffer, hint = null) {
    if (!buffer || buffer.length === 0) {
        return -1;
    }
    const isRiffWave = buffer.length >= 12
        && buffer.toString('ascii', 0, 4) === 'RIFF'
        && buffer.toString('ascii', 8, 12) === 'WAVE';
    if (!isRiffWave) {
        return -1;
    }

    let byteRate = 0;
    let dataSize = -1;
    let pos = 12;
    // Each chunk: 4-byte id, 4-byte little-endian size, body padded to even length.
    while (pos + 8 <= buffer.length) {
        const id = buffer.toString('ascii', pos, pos + 4);
        const size = buffer.readUInt32LE(pos + 4);
        const body = pos + 8;
        if (id === 'fmt ' && body + 12 <= buffer.length) {
            byteRate = buffer.readUInt32LE(body + 8);
        }
        else if (id === 'data') {
            dataSize = Math.min(size, buffer.length - body);
            break;
        }
        pos = body + size + (size & 1);
    }

    if (byteRate > 0 && dataSize >= 0) {
        return dataSize / byteRate;
    }
    return -1;
}
|
|
1004
|
+
/**
 * Builds approximate SRT subtitles by giving each sentence a share of the
 * total duration proportional to its character count.
 *
 * @param text            Full spoken text.
 * @param durationSeconds Audio length in seconds. When not a positive number,
 *                        it is estimated as `text.length / 14`.
 * @returns SRT text with one cue per non-blank sentence; '' when the text has
 *          no non-blank content.
 */
function generateHeuristicSRT(text, durationSeconds) {
    // Blank fragments (e.g. trailing whitespace after the last full stop)
    // would otherwise become empty cues.
    const sentences = (text.match(/[^.!?]+[.!?]*/g) || [text])
        .filter((sentence) => sentence.trim().length > 0);
    if (sentences.length === 0) {
        return '';
    }
    if (!(durationSeconds > 0)) {
        durationSeconds = text.length / 14;
    }
    const totalMs = durationSeconds * 1000;
    const totalChars = sentences.reduce((sum, sentence) => sum + sentence.length, 0);

    const pad = (value, width) => value.toString().padStart(width, '0');
    // Formats non-negative integer milliseconds as HH:MM:SS,mmm.
    const msToSrt = (ms) => {
        const totalSec = Math.floor(ms / 1000);
        const h = Math.floor(totalSec / 3600);
        const m = Math.floor((totalSec % 3600) / 60);
        const s = totalSec % 60;
        return `${pad(h, 2)}:${pad(m, 2)}:${pad(s, 2)},${pad(ms % 1000, 3)}`;
    };

    let srt = '';
    let startMs = 0;
    let consumedChars = 0;
    sentences.forEach((sentence, index) => {
        consumedChars += sentence.length;
        // Derive each boundary from the running character count rather than
        // summing float durations, so rounding error cannot accumulate and
        // the last cue ends exactly at the total duration.
        const endMs = Math.round((consumedChars * totalMs) / totalChars);
        srt += `${index + 1}\n${msToSrt(startMs)} --> ${msToSrt(endMs)}\n${sentence.trim()}\n\n`;
        startMs = endMs;
    });
    return srt;
}
|
|
811
1030
|
async function httpRequest(url, method = 'GET', body = null) {
|
|
812
1031
|
const requestModule = url.startsWith('https') ? https : http;
|
|
@@ -5,6 +5,7 @@ import {
|
|
|
5
5
|
INodeTypeDescription,
|
|
6
6
|
ILoadOptionsFunctions,
|
|
7
7
|
INodePropertyOptions,
|
|
8
|
+
NodeOperationError,
|
|
8
9
|
} from 'n8n-workflow';
|
|
9
10
|
import { v4 as uuidv4 } from 'uuid';
|
|
10
11
|
import * as fs from 'fs';
|
|
@@ -20,6 +21,194 @@ import * as zlib from 'zlib'; // For extracting .tar.gz if needed, typically usa
|
|
|
20
21
|
|
|
21
22
|
const pipeline = promisify(stream.pipeline);
|
|
22
23
|
|
|
24
|
+
// =============================================================================
|
|
25
|
+
// CORE HELPER FUNCTIONS
|
|
26
|
+
// =============================================================================
|
|
27
|
+
|
|
28
|
+
const MAX_CHARS_PER_CHUNK = 300; // Estimated safe limit for ~20-25 seconds of audio
|
|
29
|
+
const SILENCE_DURATION_MS = 200; // 200ms pause between concatenated audio chunks
|
|
30
|
+
|
|
31
|
+
/** One synthesized piece of audio handed to the concatenation helpers. */
interface AudioChunk {
	/**
	 * Sample data: a Float32Array of float samples, or a Buffer that the
	 * helpers in this file treat as raw 16-bit PCM.
	 */
	audio: Float32Array | Buffer;
	/** Sample rate passed to createWavBuffer when a single chunk is wrapped. */
	sampling_rate: number;
}
|
|
35
|
+
|
|
36
|
+
/**
 * Splits text into trimmed chunks no longer than `maxChars` UTF-16 code units.
 *
 * Sentences (runs ending in `.`, `!` or `?`) are packed greedily into a chunk.
 * A single sentence that exceeds the limit is broken at the last space at or
 * before the limit, or hard-split when it contains no such space.
 *
 * @param text     Text to split.
 * @param maxChars Maximum chunk length; defaults to MAX_CHARS_PER_CHUNK.
 *                 Values below 1 (or NaN) are treated as 1 so the splitter
 *                 always makes progress.
 * @returns Non-empty, trimmed chunks in their original order.
 */
function splitTextIntoChunks(text: string, maxChars: number = MAX_CHARS_PER_CHUNK): string[] {
	const limit = Math.max(1, Math.floor(maxChars) || 1);
	const chunks: string[] = [];
	const sentences = text.match(/[^.!?]+[.!?]*/g) || [text];
	let currentChunk = '';

	for (const sentence of sentences) {
		if (currentChunk.length + sentence.length <= limit) {
			currentChunk += sentence;
			continue;
		}
		if (currentChunk.length > 0) {
			chunks.push(currentChunk.trim());
		}
		currentChunk = sentence;

		// A single sentence may still be too long: break it down further.
		while (currentChunk.length > limit) {
			let splitPoint = currentChunk.lastIndexOf(' ', limit);
			if (splitPoint === -1) {
				splitPoint = limit;
				// Never cut between the two halves of a surrogate pair
				// (emoji, rare CJK); that would emit broken characters.
				const before = currentChunk.charCodeAt(splitPoint - 1);
				if (before >= 0xD800 && before <= 0xDBFF) {
					splitPoint += splitPoint > 1 ? -1 : 1;
				}
			}
			chunks.push(currentChunk.substring(0, splitPoint).trim());
			currentChunk = currentChunk.substring(splitPoint).trim();
		}
	}

	if (currentChunk.length > 0) {
		chunks.push(currentChunk.trim());
	}

	// Whitespace-only pieces trim down to '' and are dropped here.
	return chunks.filter(chunk => chunk.length > 0);
}
|
|
72
|
+
|
|
73
|
+
/**
 * Wraps mono 16-bit PCM audio in a canonical 44-byte RIFF/WAVE header.
 *
 * @param audioData  Either a Float32Array of samples (clamped to [-1, 1] and
 *                   scaled to int16) or a Buffer that is treated as raw
 *                   little-endian 16-bit PCM. A trailing odd byte is ignored.
 * @param sampleRate Sample rate written to the header (default 24000).
 * @returns A new Buffer holding header + PCM data.
 */
function createWavBuffer(audioData: Float32Array | Buffer, sampleRate: number = 24000): Buffer {
	const numChannels = 1; // Mono
	const bitsPerSample = 16;
	const bytesPerSample = bitsPerSample / 8;
	const headerSize = 44;

	const sampleCount = audioData instanceof Float32Array
		? audioData.length
		: Math.floor(audioData.byteLength / bytesPerSample);
	const dataSize = sampleCount * bytesPerSample;
	const buffer = Buffer.alloc(headerSize + dataSize);

	// RIFF container
	buffer.write('RIFF', 0);
	buffer.writeUInt32LE(36 + dataSize, 4);
	buffer.write('WAVE', 8);
	// 'fmt ' chunk
	buffer.write('fmt ', 12);
	buffer.writeUInt32LE(16, 16); // PCM format chunk size
	buffer.writeUInt16LE(1, 20); // PCM format
	buffer.writeUInt16LE(numChannels, 22);
	buffer.writeUInt32LE(sampleRate, 24);
	buffer.writeUInt32LE(sampleRate * numChannels * bytesPerSample, 28); // Byte rate
	buffer.writeUInt16LE(numChannels * bytesPerSample, 32); // Block align
	buffer.writeUInt16LE(bitsPerSample, 34);
	// 'data' chunk header
	buffer.write('data', 36);
	buffer.writeUInt32LE(dataSize, 40);

	if (audioData instanceof Float32Array) {
		for (let i = 0; i < sampleCount; i++) {
			const s = Math.max(-1, Math.min(1, audioData[i]));
			// Truncate toward zero (what an Int16Array store does); NaN -> 0.
			const scaled = Math.trunc(s < 0 ? s * 0x8000 : s * 0x7FFF) || 0;
			buffer.writeInt16LE(scaled, headerSize + i * bytesPerSample);
		}
	} else {
		// Copy bytes directly. An Int16Array view over the Buffer's memory
		// throws RangeError when byteOffset is odd, which happens with pooled
		// or sliced Buffers.
		buffer.set(new Uint8Array(audioData.buffer, audioData.byteOffset, dataSize), headerSize);
	}

	return buffer;
}
|
|
119
|
+
|
|
120
|
+
/**
 * Joins audio chunks into one mono 16-bit WAV, inserting silence between them.
 *
 * @param audioChunks       Chunks whose `audio` is a Float32Array or a Buffer
 *                          treated as raw little-endian 16-bit PCM.
 * @param silenceDurationMs Gap inserted between consecutive chunks.
 * @param sampleRate        Rate used for the silence length and the output
 *                          header (default 24000).
 * @returns WAV Buffer built by createWavBuffer, except for the single-Buffer
 *          case described below.
 *
 * NOTE(review): with exactly one chunk whose audio is a Buffer, that Buffer is
 * returned unchanged (no header is added). With several chunks, the per-chunk
 * `sampling_rate` is not consulted; every chunk is assumed to be at `sampleRate`.
 */
function concatenateAudioBuffers(
	audioChunks: AudioChunk[],
	silenceDurationMs: number,
	sampleRate: number = 24000
): Buffer {
	if (audioChunks.length === 0) {
		return createWavBuffer(new Float32Array(), sampleRate);
	}

	if (audioChunks.length === 1) {
		const only = audioChunks[0];
		return only.audio instanceof Buffer
			? only.audio
			: createWavBuffer(only.audio, only.sampling_rate);
	}

	const silenceSamples = Math.round((silenceDurationMs / 1000) * sampleRate);

	// Normalise every chunk to Float32 samples.
	const float32Chunks: Float32Array[] = audioChunks.map(({ audio }) => {
		if (audio instanceof Float32Array) {
			return audio;
		}
		// DataView has no alignment requirement, unlike an Int16Array view,
		// which throws RangeError for Buffers with an odd byteOffset. It also
		// lets us state the byte order explicitly.
		const sampleCount = Math.floor(audio.byteLength / 2);
		const view = new DataView(audio.buffer, audio.byteOffset, sampleCount * 2);
		const samples = new Float32Array(sampleCount);
		for (let i = 0; i < sampleCount; i++) {
			const value = view.getInt16(i * 2, true);
			samples[i] = value / (value < 0 ? 0x8000 : 0x7FFF);
		}
		return samples;
	});

	const audioLength = float32Chunks.reduce((sum, samples) => sum + samples.length, 0);
	const combinedAudio = new Float32Array(audioLength + (float32Chunks.length - 1) * silenceSamples);

	let offset = 0;
	float32Chunks.forEach((samples, index) => {
		combinedAudio.set(samples, offset);
		offset += samples.length;
		// The array is zero-initialised, so skipping ahead leaves silence.
		if (index < float32Chunks.length - 1) {
			offset += silenceSamples;
		}
	});

	return createWavBuffer(combinedAudio, sampleRate);
}
|
|
179
|
+
|
|
180
|
+
/**
 * Builds SRT subtitle text with one cue per text chunk.
 *
 * @param textChunks        Cue texts, in playback order.
 * @param audioDurations    Duration of each chunk's audio in milliseconds;
 *                          missing or falsy entries count as 0.
 * @param silenceDurationMs Gap between consecutive cues in milliseconds.
 *                          Defaults to SILENCE_DURATION_MS; pass the same value
 *                          given to the audio concatenation so cues stay in sync.
 * @returns SRT text, or '' when there are no chunks.
 */
function generateSRTFromChunks(
	textChunks: string[],
	audioDurations: number[],
	silenceDurationMs: number = SILENCE_DURATION_MS
): string {
	if (textChunks.length === 0) return '';

	const pad = (value: number, width: number): string => value.toString().padStart(width, '0');

	// Formats non-negative milliseconds as HH:MM:SS,mmm.
	const msToSrt = (ms: number): string => {
		const whole = Math.floor(ms);
		const totalSec = Math.floor(whole / 1000);
		const h = Math.floor(totalSec / 3600);
		const m = Math.floor((totalSec % 3600) / 60);
		const s = totalSec % 60;
		return `${pad(h, 2)}:${pad(m, 2)}:${pad(s, 2)},${pad(whole % 1000, 3)}`;
	};

	let srt = '';
	let currentTime = 0;

	textChunks.forEach((chunkText, index) => {
		const duration = audioDurations[index] || 0;
		const endTime = currentTime + duration;

		srt += `${index + 1}\n${msToSrt(currentTime)} --> ${msToSrt(endTime)}\n${chunkText.trim()}\n\n`;
		// The next cue starts after the inserted silence.
		currentTime = endTime + silenceDurationMs;
	});

	return srt;
}
|
|
210
|
+
|
|
211
|
+
|
|
23
212
|
// Piper Models List (Curated High Quality)
|
|
24
213
|
// Note: Official Piper repo currently only has 'kareem' (Male) for Arabic.
|
|
25
214
|
// For Female Arabic voices, please use the 'Edge TTS' engine (Salma, Zariyah).
|
|
@@ -61,6 +250,16 @@ const PIPER_MODELS = [
|
|
|
61
250
|
|
|
62
251
|
// Edge TTS Constants
|
|
63
252
|
const EDGE_URL = 'wss://speech.platform.bing.com/consumer/speech/synthesize/readaloud/edge/v1?TrustedClientToken=6A5AA1D4EAFF4E9FB37E23D68491D6F4';
|
|
253
|
+
// Fixed set of HTTP headers imitating a desktop Microsoft Edge 120 client on
// Windows. NOTE(review): presumably sent with requests to EDGE_URL; the call
// site is outside this excerpt.
const EDGE_HEADERS = {
	Authority: 'speech.platform.bing.com',
	'Sec-CH-UA': '"Not_A Brand";v="8", "Chromium";v="120", "Microsoft Edge";v="120"',
	'Sec-CH-UA-Mobile': '?0',
	'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
	'Sec-CH-UA-Platform': '"Windows"',
	'Accept-Encoding': 'gzip, deflate, br',
	'Accept-Language': 'en-US,en;q=0.9',
	Origin: 'chrome-extension://jdiccldimpdaibmpdkjnbmckianbfold',
};
|
|
64
263
|
const EDGE_VOICES = [
|
|
65
264
|
// Arabic
|
|
66
265
|
{ name: 'Arabic (Egypt) - Salma', value: 'ar-EG-SalmaNeural' },
|
|
@@ -127,6 +326,11 @@ export class TTSBigBoss implements INodeType {
|
|
|
127
326
|
value: 'coqui',
|
|
128
327
|
description: 'Connect to a running Coqui TTS/XTTS server.',
|
|
129
328
|
},
|
|
329
|
+
{
|
|
330
|
+
name: 'Kokoro TTS (Local OpenAI API)',
|
|
331
|
+
value: 'kokoro',
|
|
332
|
+
description: 'Connect to a local Kokoro server compatible with OpenAI API (e.g. /v1/audio/speech).',
|
|
333
|
+
},
|
|
130
334
|
{
|
|
131
335
|
name: 'System Command (Custom)',
|
|
132
336
|
value: 'system',
|
|
@@ -289,14 +493,52 @@ export class TTSBigBoss implements INodeType {
|
|
|
289
493
|
description: 'Name from Hugging Face (e.g. en_US-bryce-medium) or full URL to .onnx file.',
|
|
290
494
|
},
|
|
291
495
|
// ----------------------------------
|
|
496
|
+
// Kokoro Settings
|
|
497
|
+
// ----------------------------------
|
|
498
|
+
{
|
|
499
|
+
displayName: 'API URL',
|
|
500
|
+
name: 'kokoroUrl',
|
|
501
|
+
type: 'string',
|
|
502
|
+
default: 'http://localhost:8880/v1/audio/speech',
|
|
503
|
+
description: 'Endpoint URL for Kokoro generation (OpenAI compatible).',
|
|
504
|
+
displayOptions: {
|
|
505
|
+
show: {
|
|
506
|
+
engine: ['kokoro'],
|
|
507
|
+
},
|
|
508
|
+
},
|
|
509
|
+
},
|
|
510
|
+
{
|
|
511
|
+
displayName: 'Voice / Model',
|
|
512
|
+
name: 'kokoroVoice',
|
|
513
|
+
type: 'string',
|
|
514
|
+
default: 'af_bella',
|
|
515
|
+
description: 'Voice ID (e.g. af_bella, af_sarah, am_adam). Arabic might require specific model ID.',
|
|
516
|
+
displayOptions: {
|
|
517
|
+
show: {
|
|
518
|
+
engine: ['kokoro'],
|
|
519
|
+
},
|
|
520
|
+
},
|
|
521
|
+
},
|
|
522
|
+
{
|
|
523
|
+
displayName: 'Speed',
|
|
524
|
+
name: 'kokoroSpeed',
|
|
525
|
+
type: 'number',
|
|
526
|
+
default: 1.0,
|
|
527
|
+
displayOptions: {
|
|
528
|
+
show: {
|
|
529
|
+
engine: ['kokoro'],
|
|
530
|
+
},
|
|
531
|
+
},
|
|
532
|
+
},
|
|
533
|
+
// ----------------------------------
|
|
292
534
|
// Coqui Server Settings
|
|
293
535
|
// ----------------------------------
|
|
294
536
|
{
|
|
295
537
|
displayName: 'Base Server URL',
|
|
296
538
|
name: 'coquiUrl',
|
|
297
539
|
type: 'string',
|
|
298
|
-
default: 'http://
|
|
299
|
-
description: 'Base URL of Coqui server (e.g. http://
|
|
540
|
+
default: 'http://localhost:5002',
|
|
541
|
+
description: 'Base URL of Coqui server (e.g. http://localhost:5002 or http://host.docker.internal:5002).',
|
|
300
542
|
displayOptions: {
|
|
301
543
|
show: {
|
|
302
544
|
engine: ['coqui'],
|
|
@@ -461,6 +703,28 @@ export class TTSBigBoss implements INodeType {
|
|
|
461
703
|
srtBuffer = Buffer.from(result.srt, 'utf8');
|
|
462
704
|
}
|
|
463
705
|
|
|
706
|
+
} else if (engine === 'kokoro') {
|
|
707
|
+
// ----------------------------------
|
|
708
|
+
// KOKORO EXECUTION
|
|
709
|
+
// ----------------------------------
|
|
710
|
+
const url = this.getNodeParameter('kokoroUrl', i) as string;
|
|
711
|
+
const voice = this.getNodeParameter('kokoroVoice', i) as string;
|
|
712
|
+
const speed = this.getNodeParameter('kokoroSpeed', i) as number;
|
|
713
|
+
|
|
714
|
+
// Standard OpenAI 'createSpeech' payload
|
|
715
|
+
const payload = {
|
|
716
|
+
model: 'kokoro', // or whatever the server expects
|
|
717
|
+
input: text,
|
|
718
|
+
voice: voice,
|
|
719
|
+
speed: speed,
|
|
720
|
+
response_format: 'mp3'
|
|
721
|
+
};
|
|
722
|
+
|
|
723
|
+
audioBuffer = await httpRequest(url, 'POST', payload);
|
|
724
|
+
|
|
725
|
+
const duration = getAudioDuration(audioBuffer, 'mp3');
|
|
726
|
+
srtBuffer = Buffer.from(generateHeuristicSRT(text, duration), 'utf8');
|
|
727
|
+
|
|
464
728
|
} else if (engine === 'piper_local') {
|
|
465
729
|
// ----------------------------------
|
|
466
730
|
// PIPER LOCAL AUTOMATION
|
|
@@ -478,8 +742,6 @@ export class TTSBigBoss implements INodeType {
|
|
|
478
742
|
|
|
479
743
|
// 3. Execute
|
|
480
744
|
const outFile = path.join(tempDir, `piper_out_${uuidv4()}.wav`);
|
|
481
|
-
// Piper command: echo "text" | piper --model model.onnx --output_file out.wav
|
|
482
|
-
// We use child_process.spawn to pipe text safely
|
|
483
745
|
|
|
484
746
|
await new Promise<void>((resolve, reject) => {
|
|
485
747
|
const piperProc = child_process.spawn(piperBinPath, [
|
|
@@ -496,9 +758,8 @@ export class TTSBigBoss implements INodeType {
|
|
|
496
758
|
|
|
497
759
|
piperProc.on('close', (code) => {
|
|
498
760
|
if (code === 0) resolve();
|
|
499
|
-
// Check for the specific JSON error in stderr
|
|
500
761
|
if (errData.includes('json.exception.parse_error')) {
|
|
501
|
-
reject(new Error(`Piper Config Error: The downloaded JSON configuration for model '${piperModel}' seems corrupted
|
|
762
|
+
reject(new Error(`Piper Config Error: The downloaded JSON configuration for model '${piperModel}' seems corrupted. Try deleting the file at ${configPath}.`));
|
|
502
763
|
} else {
|
|
503
764
|
reject(new Error(`Piper failed (exit ${code}): ${errData}`));
|
|
504
765
|
}
|
|
@@ -510,7 +771,8 @@ export class TTSBigBoss implements INodeType {
|
|
|
510
771
|
if (!fs.existsSync(outFile)) throw new Error('Piper did not produce output file');
|
|
511
772
|
|
|
512
773
|
audioBuffer = fs.readFileSync(outFile);
|
|
513
|
-
|
|
774
|
+
const duration = getAudioDuration(audioBuffer, 'wav');
|
|
775
|
+
srtBuffer = Buffer.from(generateHeuristicSRT(text, duration), 'utf8');
|
|
514
776
|
|
|
515
777
|
fs.unlinkSync(outFile);
|
|
516
778
|
|
|
@@ -526,7 +788,6 @@ export class TTSBigBoss implements INodeType {
|
|
|
526
788
|
const wavPath = this.getNodeParameter('coquiWavPath', i, '') as string;
|
|
527
789
|
const lang = this.getNodeParameter('coquiLang', i) as string;
|
|
528
790
|
|
|
529
|
-
// Construct Payload
|
|
530
791
|
const payload: any = {
|
|
531
792
|
text: text,
|
|
532
793
|
language_id: lang,
|
|
@@ -538,9 +799,9 @@ export class TTSBigBoss implements INodeType {
|
|
|
538
799
|
payload.speaker_id = speakerSelection;
|
|
539
800
|
}
|
|
540
801
|
|
|
541
|
-
// Execute Request
|
|
542
802
|
audioBuffer = await httpRequest(url, 'POST', payload);
|
|
543
|
-
|
|
803
|
+
const duration = getAudioDuration(audioBuffer, 'wav');
|
|
804
|
+
srtBuffer = Buffer.from(generateHeuristicSRT(text, duration), 'utf8');
|
|
544
805
|
|
|
545
806
|
} else {
|
|
546
807
|
// ----------------------------------
|
|
@@ -556,7 +817,6 @@ export class TTSBigBoss implements INodeType {
|
|
|
556
817
|
.replace(/"{text}"/g, `"${text.replace(/"/g, '\\"')}"`) // Basic escape
|
|
557
818
|
.replace(/{text}/g, `"${text.replace(/"/g, '\\"')}"`);
|
|
558
819
|
|
|
559
|
-
// Handle Clone Input
|
|
560
820
|
if (useClone) {
|
|
561
821
|
const cloneProp = this.getNodeParameter('cloneInputProperty', i) as string;
|
|
562
822
|
const cloneData = await this.helpers.getBinaryDataBuffer(i, cloneProp);
|
|
@@ -567,7 +827,6 @@ export class TTSBigBoss implements INodeType {
|
|
|
567
827
|
.replace(/{reference_audio}/g, `"${cloneFile}"`);
|
|
568
828
|
}
|
|
569
829
|
|
|
570
|
-
// Execute
|
|
571
830
|
await new Promise((resolve, reject) => {
|
|
572
831
|
child_process.exec(cmd, (error, stdout, stderr) => {
|
|
573
832
|
if (error) {
|
|
@@ -583,9 +842,8 @@ export class TTSBigBoss implements INodeType {
|
|
|
583
842
|
}
|
|
584
843
|
|
|
585
844
|
audioBuffer = fs.readFileSync(outFile);
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
srtBuffer = Buffer.from(generateHeuristicSRT(text, audioBuffer.length), 'utf8');
|
|
845
|
+
const duration = getAudioDuration(audioBuffer);
|
|
846
|
+
srtBuffer = Buffer.from(generateHeuristicSRT(text, duration), 'utf8');
|
|
589
847
|
|
|
590
848
|
// Cleanup
|
|
591
849
|
if (fs.existsSync(outFile)) fs.unlinkSync(outFile);
|
|
@@ -784,41 +1042,43 @@ function ticksToTime(ticks: number): string {
|
|
|
784
1042
|
return `${h}:${m}:${s},${mili}`;
|
|
785
1043
|
}
|
|
786
1044
|
|
|
787
|
-
//
|
|
788
|
-
// HEURISTIC
|
|
789
|
-
//
|
|
790
|
-
function generateHeuristicSRT(text: string, byteLength: number): string {
|
|
791
|
-
// Estimate duration assuming typical MP3/WAV bitrate.
|
|
792
|
-
// Actually, system command usually produces WAV (PCM).
|
|
793
|
-
// Wrapper might produce MP3. Let's assume user command output.
|
|
794
|
-
// It is safer to assume ~15 chars per second reading speed if we don't know duration.
|
|
795
|
-
// Or assume 16000 bytes/sec for mono 16khz? Too unreliable.
|
|
796
|
-
// Let's use text length heuristic: Avg reading speed 150 wpm ~ 2.5 words/sec ~ 15 chars/sec?
|
|
797
|
-
// Let's try 15 chars / second.
|
|
798
|
-
|
|
799
|
-
const totalDurationSec = text.length / 15;
|
|
800
|
-
// Ideally we'd use 'ffprobe' to get exact duration, but let's stick to pure TS for now.
|
|
801
|
-
// If we really wanted to be robust, we'd add 'ffprobe' execution here.
|
|
802
|
-
|
|
803
|
-
const sentences = text.match(/[^.!?]+[.!?]*/g) || [text];
|
|
804
|
-
let currentStartTime = 0;
|
|
805
|
-
let srt = '';
|
|
806
|
-
let counter = 1;
|
|
1045
|
+
// ----------------------------------
|
|
1046
|
+
// OLD HEURISTIC REMOVED
|
|
1047
|
+
// ----------------------------------
|
|
807
1048
|
|
|
808
|
-
|
|
809
|
-
|
|
810
|
-
|
|
811
|
-
|
|
812
|
-
|
|
813
|
-
|
|
814
|
-
|
|
815
|
-
const
|
|
816
|
-
|
|
817
|
-
|
|
818
|
-
|
|
819
|
-
|
|
820
|
-
|
|
821
|
-
|
|
1049
|
+
/**
 * Download `url` to the local file `dest`, following HTTP redirects.
 *
 * The destination file is only created once a `200` response has been
 * received, so failed requests and redirects never leave an empty or
 * truncated file behind. If the transfer fails after writing has started,
 * the partial file is removed.
 *
 * @param url Absolute `http://` or `https://` URL to fetch.
 * @param dest Path of the file to write.
 * @param maxRedirects How many redirects may still be followed before giving up.
 * @returns A promise that resolves once the file has been fully written and closed.
 * @throws Rejects with an `Error` on network failure, non-200 status, invalid or
 *         excessive redirects, or file write errors.
 */
async function downloadFile(url: string, dest: string, maxRedirects: number = 5): Promise<void> {
	return new Promise<void>((resolve, reject) => {
		const redirectCodes = [301, 302, 303, 307, 308];
		let output: fs.WriteStream | undefined;
		let settled = false;

		const succeed = (): void => {
			if (settled) return;
			settled = true;
			resolve();
		};

		// Reject once; if writing had started, discard the partial file.
		const fail = (err: Error): void => {
			if (settled) return;
			settled = true;
			if (output) {
				output.destroy();
				fs.unlink(dest, () => { });
			}
			reject(err);
		};

		const handleResponse = (response: http.IncomingMessage): void => {
			const status = response.statusCode ?? 0;

			if (redirectCodes.includes(status)) {
				// Drain the redirect body so the socket is released.
				response.resume();
				const location = response.headers.location;
				if (!location) {
					fail(new Error(`Download failed: redirect (${status}) without Location header for URL: ${url}`));
					return;
				}
				if (maxRedirects <= 0) {
					fail(new Error(`Download failed: too many redirects for URL: ${url}`));
					return;
				}
				let nextUrl: string;
				try {
					// Location may be relative; resolve it against the current URL.
					nextUrl = new URL(location, url).toString();
				} catch (e) {
					fail(new Error(`Download failed: invalid redirect location '${location}' for URL: ${url}`));
					return;
				}
				downloadFile(nextUrl, dest, maxRedirects - 1).then(succeed, fail);
				return;
			}

			if (status !== 200) {
				response.resume();
				fail(new Error(`Download failed with status code: ${response.statusCode} for URL: ${url}`));
				return;
			}

			const file = fs.createWriteStream(dest);
			output = file;
			file.on('error', (err) => {
				response.destroy();
				fail(new Error(`File write error: ${err.message}`));
			});
			// A connection dropped mid-body surfaces here, not on the request.
			response.on('error', (err) => {
				fail(new Error(`Network error: ${err.message}`));
			});
			file.on('finish', () => {
				file.close((err) => {
					if (err) fail(err);
					else succeed();
				});
			});
			response.pipe(file);
		};

		try {
			const client = url.toLowerCase().startsWith('http://') ? http : https;
			const request = client.get(url, handleResponse);
			request.on('error', (err) => {
				fail(new Error(`Network error: ${err.message}`));
			});
		} catch (e) {
			// `get` throws synchronously for malformed URLs or unsupported protocols.
			const message = e instanceof Error ? e.message : String(e);
			fail(new Error(`Network error: ${message}`));
		}
	});
}
|
|
823
1083
|
|
|
824
1084
|
// --------------------------------------------------------------------------
|
|
@@ -888,25 +1148,23 @@ async function ensurePiperModel(binDir: string, modelNameOrUrl: string): Promise
|
|
|
888
1148
|
modelUrl = modelNameOrUrl;
|
|
889
1149
|
modelFilename = path.basename(modelNameOrUrl);
|
|
890
1150
|
} else {
|
|
891
|
-
//
|
|
1151
|
+
// Correct Parsing for 'lang_REGION-voice-quality'
|
|
1152
|
+
// e.g. en_US-lessac-medium -> [en_US, lessac, medium]
|
|
1153
|
+
// e.g. ar_JO-kareem-medium -> [ar_JO, kareem, medium]
|
|
1154
|
+
|
|
892
1155
|
const parts = modelNameOrUrl.split('-');
|
|
893
1156
|
if (parts.length >= 3) {
|
|
894
|
-
const langRegion = parts[0]
|
|
895
|
-
const voice = parts[
|
|
896
|
-
const quality = parts[
|
|
897
|
-
const lang = parts[0]; // en
|
|
1157
|
+
const langRegion = parts[0]; // 'ar_JO' or 'en_US'
|
|
1158
|
+
const voice = parts[1]; // 'kareem'
|
|
1159
|
+
const quality = parts[2]; // 'medium'
|
|
898
1160
|
|
|
899
|
-
//
|
|
900
|
-
|
|
901
|
-
// url path: en/en_US/lessac/medium/en_US-lessac-medium.onnx
|
|
902
|
-
|
|
903
|
-
// Handle special case: ar_JO (no lang folder? check repo)
|
|
904
|
-
// Generally structure is: lang_short/lang_long/voice/quality/filename
|
|
1161
|
+
// Lang code is first part of langRegion (split by _)
|
|
1162
|
+
const lang = langRegion.split('_')[0]; // 'ar' from 'ar_JO'
|
|
905
1163
|
|
|
906
1164
|
modelFilename = modelNameOrUrl + '.onnx';
|
|
907
|
-
modelUrl = `https://huggingface.co/rhasspy/piper-voices/resolve/main/${lang}/${langRegion}/${voice}/${quality}/${modelFilename}?download=true`;
|
|
1165
|
+
modelUrl = `https://huggingface.co/rhasspy/piper-voices/resolve/main/${lang}/${langRegion}/${voice}/${quality}/${modelFilename}?download=true`;
|
|
908
1166
|
} else {
|
|
909
|
-
throw new Error(`Invalid model name format: ${modelNameOrUrl}
|
|
1167
|
+
throw new Error(`Invalid model name format: ${modelNameOrUrl}.`);
|
|
910
1168
|
}
|
|
911
1169
|
}
|
|
912
1170
|
|
|
@@ -942,47 +1200,66 @@ async function ensurePiperModel(binDir: string, modelNameOrUrl: string): Promise
|
|
|
942
1200
|
return { modelPath, configPath };
|
|
943
1201
|
}
|
|
944
1202
|
|
|
945
|
-
|
|
946
|
-
|
|
947
|
-
|
|
948
|
-
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
fs.unlink(dest, () => { }); // Cleanup
|
|
952
|
-
reject(new Error(`File write error: ${err.message}`));
|
|
953
|
-
});
|
|
1203
|
+
// --------------------------------------------------------------------------
|
|
1204
|
+
// HELPER: Determine Audio Duration for SRT
|
|
1205
|
+
// --------------------------------------------------------------------------
|
|
1206
|
+
/**
 * Determine the duration, in seconds, of an audio buffer.
 *
 * Only RIFF/WAVE data can be measured. The container is detected from the
 * magic bytes rather than trusted from `hint`, so a mislabelled payload is
 * never parsed as WAV. The RIFF chunk list is walked to find the `fmt `
 * chunk (byte rate) and the `data` chunk (audio size), which keeps the result
 * correct when extra chunks such as `LIST` are present.
 *
 * @param buffer Raw audio bytes.
 * @param hint Format the caller expects; advisory only.
 * @returns Duration in seconds, or `-1` when it cannot be determined
 *          (empty buffer, non-WAV data, or malformed header).
 */
function getAudioDuration(buffer: Buffer, hint: 'mp3' | 'wav' | null = null): number {
	// Nothing to measure: signal "unknown" to the caller.
	if (!buffer || buffer.length === 0) return -1;

	// RIFF....WAVE
	const isWav =
		buffer.length >= 12 &&
		buffer.toString('ascii', 0, 4) === 'RIFF' &&
		buffer.toString('ascii', 8, 12) === 'WAVE';
	if (!isWav) return -1;

	let byteRate = 0;
	let offset = 12;
	// Each chunk is: 4-byte id, 4-byte little-endian size, then the body.
	while (offset + 8 <= buffer.length) {
		const chunkId = buffer.toString('ascii', offset, offset + 4);
		const declaredSize = buffer.readUInt32LE(offset + 4);
		const bodyStart = offset + 8;
		const available = buffer.length - bodyStart;

		if (chunkId === 'fmt ') {
			// Byte rate sits 8 bytes into the fmt body (after format, channels, sample rate).
			if (declaredSize >= 16 && available >= 16) {
				byteRate = buffer.readUInt32LE(bodyStart + 8);
			}
		} else if (chunkId === 'data') {
			if (byteRate <= 0) return -1;
			// Streamed WAVs may declare 0 or an oversized length; use what is actually present.
			const dataSize = declaredSize === 0 || declaredSize > available ? available : declaredSize;
			return dataSize / byteRate;
		}

		// Chunk bodies are padded to an even number of bytes.
		offset = bodyStart + declaredSize + (declaredSize % 2);
	}

	// No usable data chunk found: caller falls back to a text-length estimate.
	return -1;
}
|
|
969
1227
|
|
|
970
|
-
|
|
1228
|
+
// --------------------------------------------------------------------------
|
|
1229
|
+
// HEURISTIC SRT IMPLEMENTATION
|
|
1230
|
+
// --------------------------------------------------------------------------
|
|
1231
|
+
/**
 * Build an SRT subtitle document for `text` by distributing the audio
 * duration across sentences in proportion to their character length.
 *
 * Sentences are split on `.`, `!` and `?`. Fragments that contain only
 * whitespace produce no cue and take no time. Cue boundaries are derived from
 * the cumulative character count, so rounding errors do not accumulate and
 * the last cue ends exactly at the total duration.
 *
 * @param text The spoken text.
 * @param durationSeconds Audio length in seconds; any non-positive or
 *        non-finite value means "unknown" and is estimated at 14 chars/sec.
 * @returns The SRT content, or an empty string when `text` has nothing to caption.
 */
function generateHeuristicSRT(text: string, durationSeconds: number): string {
	// Unknown duration (-1, 0, NaN, Infinity): estimate from text length.
	if (!Number.isFinite(durationSeconds) || durationSeconds <= 0) {
		durationSeconds = text.length / 14;
	}

	const fragments = text.match(/[^.!?]+[.!?]*/g) || [text];
	const sentences = fragments.filter((fragment) => fragment.trim().length > 0);
	if (sentences.length === 0) return '';

	// Only captioned characters share the duration, so the cues span the whole audio.
	const totalContentLen = sentences.reduce((sum, sentence) => sum + sentence.length, 0);

	const pad = (value: number, width: number): string => value.toString().padStart(width, '0');
	const msToSrt = (ms: number): string => {
		const wholeMs = Math.round(ms);
		const totalSec = Math.floor(wholeMs / 1000);
		const mili = wholeMs % 1000;
		const h = Math.floor(totalSec / 3600);
		const m = Math.floor((totalSec % 3600) / 60);
		const s = totalSec % 60;
		return `${pad(h, 2)}:${pad(m, 2)}:${pad(s, 2)},${pad(mili, 3)}`;
	};

	let srt = '';
	let counter = 1;
	let consumedLen = 0;
	let startMs = 0;

	for (const sentence of sentences) {
		// Proportion of time = proportion of length, measured cumulatively.
		consumedLen += sentence.length;
		const endMs = Math.round((consumedLen * durationSeconds * 1000) / totalContentLen);

		srt += `${counter++}\n${msToSrt(startMs)} --> ${msToSrt(endMs)}\n${sentence.trim()}\n\n`;
		startMs = endMs;
	}
	return srt;
}
|
|
987
1264
|
|
|
988
1265
|
async function httpRequest(url: string, method: string = 'GET', body: any = null): Promise<Buffer> {
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "n8n-nodes-tts-bigboss",
|
|
3
|
-
"version": "
|
|
4
|
-
"description": "
|
|
3
|
+
"version": "2.0.0",
|
|
4
|
+
"description": "Professional TTS node with multi-engine support, text chunking, and accurate SRT generation",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"n8n-community-node-package",
|
|
7
7
|
"n8n",
|
|
@@ -10,7 +10,8 @@
|
|
|
10
10
|
"srt",
|
|
11
11
|
"arabic",
|
|
12
12
|
"edge-tts",
|
|
13
|
-
"piper"
|
|
13
|
+
"piper",
|
|
14
|
+
"kokoro"
|
|
14
15
|
],
|
|
15
16
|
"license": "MIT",
|
|
16
17
|
"author": "isemo007",
|
|
@@ -35,7 +36,8 @@
|
|
|
35
36
|
"n8n-core": "^1.75.0",
|
|
36
37
|
"n8n-workflow": "^1.70.0",
|
|
37
38
|
"uuid": "^9.0.0",
|
|
38
|
-
"ws": "^8.13.0"
|
|
39
|
+
"ws": "^8.13.0",
|
|
40
|
+
"kokoro-js": "^1.2.1"
|
|
39
41
|
},
|
|
40
42
|
"devDependencies": {
|
|
41
43
|
"@types/lodash": "^4.14.195",
|
|
@@ -50,5 +52,8 @@
|
|
|
50
52
|
"typescript": "^5.0.0",
|
|
51
53
|
"webpack": "^5.88.0",
|
|
52
54
|
"webpack-cli": "^5.1.4"
|
|
55
|
+
},
|
|
56
|
+
"engines": {
|
|
57
|
+
"node": ">=18.0.0"
|
|
53
58
|
}
|
|
54
59
|
}
|