@decentchat/decentchat-plugin 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +170 -0
- package/index.ts +11 -0
- package/openclaw.plugin.json +32 -0
- package/package.json +53 -0
- package/setup-entry.ts +4 -0
- package/src/channel.ts +1059 -0
- package/src/huddle/AudioPipeline.ts +174 -0
- package/src/huddle/BotHuddleManager.ts +882 -0
- package/src/huddle/SpeechToText.ts +223 -0
- package/src/huddle/TextToSpeech.ts +260 -0
- package/src/huddle/index.ts +8 -0
- package/src/monitor.ts +1266 -0
- package/src/peer/DecentChatNodePeer.ts +4570 -0
- package/src/peer/FileStore.ts +59 -0
- package/src/peer/NodeMessageProtocol.ts +1057 -0
- package/src/peer/SyncProtocol.ts +701 -0
- package/src/peer/polyfill.ts +43 -0
- package/src/peer-registry.ts +32 -0
- package/src/runtime.ts +63 -0
- package/src/types.ts +136 -0
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
import { execFile } from 'child_process';
|
|
2
|
+
import { promisify } from 'util';
|
|
3
|
+
import { writeFile, readFile, unlink } from 'fs/promises';
|
|
4
|
+
import { randomBytes } from 'crypto';
|
|
5
|
+
import { join } from 'path';
|
|
6
|
+
import { tmpdir } from 'os';
|
|
7
|
+
|
|
8
|
+
// Promise-returning wrapper around child_process.execFile, used to spawn
// ffmpeg and whisper-cli with async/await.
const execFileAsync = promisify(execFile);
|
|
9
|
+
|
|
10
|
+
/**
 * Configuration for {@link SpeechToText}.
 */
export interface STTOptions {
  /** Transcription backend. Default: 'whisper-cpp' (local whisper-cli binary). */
  engine?: 'whisper-cpp' | 'whisper-python' | 'openai' | 'groq';
  /**
   * Model name. Local default: 'base.en'. For cloud engines the name is only
   * used when it contains 'whisper'; otherwise a provider default is chosen.
   */
  model?: string;
  /** Optional language code forwarded to the engine (e.g. 'en'). */
  language?: string;
  apiKey?: string; // For openai/groq engines
  /** Optional logger; warn is used for non-fatal API/key problems. */
  log?: { info: (s: string) => void; warn?: (s: string) => void };
}
|
|
17
|
+
|
|
18
|
+
// Default local whisper model (English-only "base" variant).
const DEFAULT_MODEL = 'base.en';
// NOTE(review): hard-coded Homebrew path — assumes a brew-installed whisper-cpp
// on macOS/Apple Silicon; confirm for other platforms.
const MODEL_DIR = '/opt/homebrew/share/whisper-cpp/models';
// whisper-cpp CLI binary expected to be on PATH.
const WHISPER_BIN = 'whisper-cli';
// Timeout (ms) applied to each spawned ffmpeg / whisper-cli process.
const EXEC_TIMEOUT = 30_000;
|
|
22
|
+
|
|
23
|
+
/**
|
|
24
|
+
* SpeechToText — converts PCM audio buffers to text.
|
|
25
|
+
* Supports local whisper-cpp or cloud APIs (OpenAI, Groq).
|
|
26
|
+
*/
|
|
27
|
+
export class SpeechToText {
|
|
28
|
+
private engine: string;
|
|
29
|
+
private modelPath: string;
|
|
30
|
+
private model: string;
|
|
31
|
+
private language?: string;
|
|
32
|
+
private apiKey?: string;
|
|
33
|
+
private log?: { info: (s: string) => void; warn?: (s: string) => void };
|
|
34
|
+
|
|
35
|
+
constructor(opts?: STTOptions) {
|
|
36
|
+
this.engine = opts?.engine ?? 'whisper-cpp';
|
|
37
|
+
this.model = opts?.model ?? DEFAULT_MODEL;
|
|
38
|
+
this.modelPath = join(MODEL_DIR, `ggml-${this.model}.bin`);
|
|
39
|
+
this.language = opts?.language;
|
|
40
|
+
this.apiKey = opts?.apiKey;
|
|
41
|
+
this.log = opts?.log;
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
/**
|
|
45
|
+
* Convert PCM buffer (16-bit signed LE, mono) to text.
|
|
46
|
+
*/
|
|
47
|
+
async transcribe(pcmBuffer: Buffer, sampleRate = 48000): Promise<string> {
|
|
48
|
+
if (this.engine === 'openai' || this.engine === 'groq') {
|
|
49
|
+
return this.transcribeCloud(pcmBuffer, sampleRate);
|
|
50
|
+
}
|
|
51
|
+
return this.transcribeLocal(pcmBuffer, sampleRate);
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
/**
|
|
55
|
+
* Cloud transcription via OpenAI or Groq Whisper API.
|
|
56
|
+
*/
|
|
57
|
+
private async transcribeCloud(pcmBuffer: Buffer, sampleRate: number): Promise<string> {
|
|
58
|
+
const wavBuffer = this.createWavBuffer(pcmBuffer, sampleRate);
|
|
59
|
+
const duration = (pcmBuffer.length / 2 / sampleRate).toFixed(1);
|
|
60
|
+
|
|
61
|
+
const isGroq = this.engine === 'groq';
|
|
62
|
+
const baseUrl = isGroq
|
|
63
|
+
? 'https://api.groq.com/openai/v1'
|
|
64
|
+
: 'https://api.openai.com/v1';
|
|
65
|
+
// Only use this.model for cloud if it looks like a cloud model name
|
|
66
|
+
// (contains 'whisper'). Otherwise use the provider default.
|
|
67
|
+
const isCloudModel = this.model.includes('whisper');
|
|
68
|
+
const model = isGroq
|
|
69
|
+
? (isCloudModel ? this.model : 'whisper-large-v3-turbo')
|
|
70
|
+
: (isCloudModel ? this.model : 'whisper-1');
|
|
71
|
+
const key = this.apiKey
|
|
72
|
+
?? (isGroq ? process.env.GROQ_API_KEY : process.env.OPENAI_API_KEY)
|
|
73
|
+
?? '';
|
|
74
|
+
|
|
75
|
+
if (!key) {
|
|
76
|
+
this.log?.warn?.(`[STT] No API key for ${this.engine} — set ${isGroq ? 'GROQ_API_KEY' : 'OPENAI_API_KEY'}`);
|
|
77
|
+
return '';
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
this.log?.info(`[STT] ${this.engine} transcribe: ${duration}s audio, model=${model}${this.language ? ', lang=' + this.language : ''}`);
|
|
81
|
+
const start = Date.now();
|
|
82
|
+
|
|
83
|
+
// Build multipart form data
|
|
84
|
+
const boundary = '----STTBoundary' + randomBytes(8).toString('hex');
|
|
85
|
+
const parts: Buffer[] = [];
|
|
86
|
+
|
|
87
|
+
// File part
|
|
88
|
+
parts.push(Buffer.from(
|
|
89
|
+
`--${boundary}\r\nContent-Disposition: form-data; name="file"; filename="audio.wav"\r\nContent-Type: audio/wav\r\n\r\n`
|
|
90
|
+
));
|
|
91
|
+
parts.push(wavBuffer);
|
|
92
|
+
parts.push(Buffer.from('\r\n'));
|
|
93
|
+
|
|
94
|
+
// Model part
|
|
95
|
+
parts.push(Buffer.from(
|
|
96
|
+
`--${boundary}\r\nContent-Disposition: form-data; name="model"\r\n\r\n${model}\r\n`
|
|
97
|
+
));
|
|
98
|
+
|
|
99
|
+
// Language part (optional)
|
|
100
|
+
if (this.language) {
|
|
101
|
+
parts.push(Buffer.from(
|
|
102
|
+
`--${boundary}\r\nContent-Disposition: form-data; name="language"\r\n\r\n${this.language}\r\n`
|
|
103
|
+
));
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
// Response format
|
|
107
|
+
parts.push(Buffer.from(
|
|
108
|
+
`--${boundary}\r\nContent-Disposition: form-data; name="response_format"\r\n\r\ntext\r\n`
|
|
109
|
+
));
|
|
110
|
+
|
|
111
|
+
parts.push(Buffer.from(`--${boundary}--\r\n`));
|
|
112
|
+
|
|
113
|
+
const body = Buffer.concat(parts);
|
|
114
|
+
|
|
115
|
+
const response = await fetch(`${baseUrl}/audio/transcriptions`, {
|
|
116
|
+
method: 'POST',
|
|
117
|
+
headers: {
|
|
118
|
+
'Authorization': `Bearer ${key}`,
|
|
119
|
+
'Content-Type': `multipart/form-data; boundary=${boundary}`,
|
|
120
|
+
},
|
|
121
|
+
body,
|
|
122
|
+
});
|
|
123
|
+
|
|
124
|
+
const elapsed = Date.now() - start;
|
|
125
|
+
|
|
126
|
+
if (!response.ok) {
|
|
127
|
+
const err = await response.text().catch(() => 'unknown');
|
|
128
|
+
this.log?.warn?.(`[STT] ${this.engine} error ${response.status}: ${err}`);
|
|
129
|
+
return '';
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
const text = (await response.text()).trim();
|
|
133
|
+
this.log?.info(`[STT] ${this.engine} transcribed in ${elapsed}ms: "${text.slice(0, 60)}${text.length > 60 ? '...' : ''}"`);
|
|
134
|
+
return text;
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
/**
|
|
138
|
+
* Local transcription via whisper-cli.
|
|
139
|
+
*/
|
|
140
|
+
private async transcribeLocal(pcmBuffer: Buffer, sampleRate: number): Promise<string> {
|
|
141
|
+
const id = randomBytes(6).toString('hex');
|
|
142
|
+
const tmp = tmpdir();
|
|
143
|
+
const inputWav = join(tmp, `stt-${id}.wav`);
|
|
144
|
+
const resampledWav = join(tmp, `stt-${id}-16k.wav`);
|
|
145
|
+
const outputBase = join(tmp, `stt-${id}-out`);
|
|
146
|
+
const outputTxt = `${outputBase}.txt`;
|
|
147
|
+
|
|
148
|
+
const tempFiles = [inputWav, resampledWav, outputTxt];
|
|
149
|
+
|
|
150
|
+
try {
|
|
151
|
+
// 1. Write PCM to WAV
|
|
152
|
+
const wavBuffer = this.createWavBuffer(pcmBuffer, sampleRate);
|
|
153
|
+
await writeFile(inputWav, wavBuffer);
|
|
154
|
+
this.log?.info(`[STT] Wrote ${wavBuffer.length} bytes WAV → ${inputWav}`);
|
|
155
|
+
|
|
156
|
+
// 2. Resample to 16 kHz mono via ffmpeg
|
|
157
|
+
await execFileAsync('ffmpeg', [
|
|
158
|
+
'-i', inputWav,
|
|
159
|
+
'-ar', '16000',
|
|
160
|
+
'-ac', '1',
|
|
161
|
+
'-y', resampledWav,
|
|
162
|
+
], { timeout: EXEC_TIMEOUT });
|
|
163
|
+
this.log?.info(`[STT] Resampled to 16 kHz → ${resampledWav}`);
|
|
164
|
+
|
|
165
|
+
// 3. Run whisper-cli
|
|
166
|
+
const args = [
|
|
167
|
+
'--model', this.modelPath,
|
|
168
|
+
'--output-txt',
|
|
169
|
+
'--output-file', outputBase,
|
|
170
|
+
'--no-timestamps',
|
|
171
|
+
];
|
|
172
|
+
if (this.language) {
|
|
173
|
+
args.push('--language', this.language);
|
|
174
|
+
}
|
|
175
|
+
args.push(resampledWav);
|
|
176
|
+
this.log?.info(`[STT] whisper-cli args: ${args.join(' ')}`);
|
|
177
|
+
await execFileAsync(WHISPER_BIN, args, { timeout: EXEC_TIMEOUT });
|
|
178
|
+
this.log?.info(`[STT] whisper-cli finished`);
|
|
179
|
+
|
|
180
|
+
// 4. Read the generated .txt
|
|
181
|
+
const text = await readFile(outputTxt, 'utf-8');
|
|
182
|
+
return text.trim();
|
|
183
|
+
} finally {
|
|
184
|
+
await Promise.all(
|
|
185
|
+
tempFiles.map(f => unlink(f).catch(() => {})),
|
|
186
|
+
);
|
|
187
|
+
this.log?.info(`[STT] Cleaned up temp files`);
|
|
188
|
+
}
|
|
189
|
+
}
|
|
190
|
+
|
|
191
|
+
/**
|
|
192
|
+
* Create a valid WAV (RIFF) buffer from raw 16-bit signed LE PCM data.
|
|
193
|
+
*/
|
|
194
|
+
private createWavBuffer(pcm: Buffer, sampleRate: number): Buffer {
|
|
195
|
+
const numChannels = 1;
|
|
196
|
+
const bitsPerSample = 16;
|
|
197
|
+
const byteRate = sampleRate * numChannels * (bitsPerSample / 8);
|
|
198
|
+
const blockAlign = numChannels * (bitsPerSample / 8);
|
|
199
|
+
const dataSize = pcm.length;
|
|
200
|
+
const headerSize = 44;
|
|
201
|
+
|
|
202
|
+
const header = Buffer.alloc(headerSize);
|
|
203
|
+
let offset = 0;
|
|
204
|
+
|
|
205
|
+
header.write('RIFF', offset); offset += 4;
|
|
206
|
+
header.writeUInt32LE(dataSize + headerSize - 8, offset); offset += 4;
|
|
207
|
+
header.write('WAVE', offset); offset += 4;
|
|
208
|
+
|
|
209
|
+
header.write('fmt ', offset); offset += 4;
|
|
210
|
+
header.writeUInt32LE(16, offset); offset += 4;
|
|
211
|
+
header.writeUInt16LE(1, offset); offset += 2;
|
|
212
|
+
header.writeUInt16LE(numChannels, offset); offset += 2;
|
|
213
|
+
header.writeUInt32LE(sampleRate, offset); offset += 4;
|
|
214
|
+
header.writeUInt32LE(byteRate, offset); offset += 4;
|
|
215
|
+
header.writeUInt16LE(blockAlign, offset); offset += 2;
|
|
216
|
+
header.writeUInt16LE(bitsPerSample, offset); offset += 2;
|
|
217
|
+
|
|
218
|
+
header.write('data', offset); offset += 4;
|
|
219
|
+
header.writeUInt32LE(dataSize, offset); offset += 4;
|
|
220
|
+
|
|
221
|
+
return Buffer.concat([header, pcm]);
|
|
222
|
+
}
|
|
223
|
+
}
|
|
@@ -0,0 +1,260 @@
|
|
|
1
|
+
import OpusScript from 'opusscript';
|
|
2
|
+
|
|
3
|
+
/**
 * Configuration for {@link TextToSpeech} (ElevenLabs streaming TTS → Opus).
 */
export interface TTSOptions {
  /** ElevenLabs API key, sent as the xi-api-key request header. */
  apiKey: string;
  voiceId?: string; // Default: 'EXAVITQu4vr4xnSDxMaL' (Rachel)
  model?: string; // Default: 'eleven_turbo_v2_5' (multilingual)
  language?: string; // Language code for multilingual models (e.g. 'sk', 'en')
  sampleRate?: number; // Output sample rate for Opus: 48000
  /** Optional logger for progress / diagnostic messages. */
  log?: { info: (s: string) => void };
}
|
|
11
|
+
|
|
12
|
+
// ElevenLabs voice used when none is supplied ("Rachel").
const DEFAULT_VOICE_ID = 'EXAVITQu4vr4xnSDxMaL';
// Default ElevenLabs TTS model (turbo, multilingual).
const DEFAULT_MODEL = 'eleven_turbo_v2_5';
// Output/Opus sample rate in Hz.
const DEFAULT_SAMPLE_RATE = 48000;
// Sample rate of the raw PCM ElevenLabs returns (pcm_24000 output format).
const ELEVENLABS_PCM_RATE = 24000;
// Opus frame duration (20 ms → 960 samples per frame at 48 kHz).
const FRAME_DURATION_MS = 20;
// RTP payload type for Opus used in generated packets.
const OPUS_PT = 111;
// Fixed RTP SSRC stamped on every generated packet.
const DEFAULT_SSRC = 1234;
|
|
19
|
+
|
|
20
|
+
export class TextToSpeech {
|
|
21
|
+
private apiKey: string;
|
|
22
|
+
private voiceId: string;
|
|
23
|
+
private model: string;
|
|
24
|
+
private language?: string;
|
|
25
|
+
private sampleRate: number;
|
|
26
|
+
private log?: { info: (s: string) => void };
|
|
27
|
+
private encoder: OpusScript;
|
|
28
|
+
|
|
29
|
+
constructor(opts: TTSOptions) {
|
|
30
|
+
this.apiKey = opts.apiKey;
|
|
31
|
+
this.voiceId = opts.voiceId ?? DEFAULT_VOICE_ID;
|
|
32
|
+
this.model = opts.model ?? DEFAULT_MODEL;
|
|
33
|
+
this.language = opts.language;
|
|
34
|
+
this.sampleRate = opts.sampleRate ?? DEFAULT_SAMPLE_RATE;
|
|
35
|
+
this.log = opts.log;
|
|
36
|
+
this.encoder = new OpusScript(this.sampleRate, 2, OpusScript.Application.AUDIO);
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
/**
|
|
40
|
+
* Convert text to a sequence of RTP packets containing Opus-encoded audio.
|
|
41
|
+
* Each packet represents a 20ms frame.
|
|
42
|
+
*/
|
|
43
|
+
async speak(text: string): Promise<Buffer[]> {
|
|
44
|
+
this.log?.info(`TTS: synthesizing "${text.slice(0, 60)}${text.length > 60 ? '...' : ''}"`);
|
|
45
|
+
|
|
46
|
+
// 1. Fetch PCM audio from ElevenLabs
|
|
47
|
+
const pcm24k = await this.fetchPcmFromElevenLabs(text);
|
|
48
|
+
this.log?.info(`TTS: received ${pcm24k.length} bytes of PCM @ ${ELEVENLABS_PCM_RATE}Hz`);
|
|
49
|
+
|
|
50
|
+
// 2. Resample 24kHz → 48kHz
|
|
51
|
+
// ElevenLabs occasionally returns odd byte counts; truncate to even
|
|
52
|
+
// since PCM 16-bit requires 2 bytes per sample.
|
|
53
|
+
const pcmEven = pcm24k.length % 2 !== 0 ? pcm24k.subarray(0, pcm24k.length - 1) : pcm24k;
|
|
54
|
+
const pcm48k = this.resample(pcmEven, ELEVENLABS_PCM_RATE, this.sampleRate);
|
|
55
|
+
this.log?.info(`TTS: resampled to ${pcm48k.length} bytes @ ${this.sampleRate}Hz`);
|
|
56
|
+
|
|
57
|
+
// 3. Chunk into 20ms frames and Opus encode
|
|
58
|
+
const samplesPerFrame = (this.sampleRate * FRAME_DURATION_MS) / 1000; // 960 at 48kHz
|
|
59
|
+
const bytesPerFrame = samplesPerFrame * 2; // 16-bit samples = 2 bytes each
|
|
60
|
+
const packets: Buffer[] = [];
|
|
61
|
+
let seq = 0;
|
|
62
|
+
let timestamp = 0;
|
|
63
|
+
|
|
64
|
+
for (let offset = 0; offset + bytesPerFrame <= pcm48k.length; offset += bytesPerFrame) {
|
|
65
|
+
const pcmFrame = pcm48k.subarray(offset, offset + bytesPerFrame);
|
|
66
|
+
// Convert mono PCM to stereo (interleaved L,R) for the 2-channel Opus encoder.
|
|
67
|
+
// The SDP advertises sprop-stereo=1, so the browser expects stereo frames.
|
|
68
|
+
const stereoPcm = Buffer.alloc(pcmFrame.length * 2);
|
|
69
|
+
for (let i = 0; i < samplesPerFrame; i++) {
|
|
70
|
+
const sample = pcmFrame.readInt16LE(i * 2);
|
|
71
|
+
stereoPcm.writeInt16LE(sample, i * 4); // L
|
|
72
|
+
stereoPcm.writeInt16LE(sample, i * 4 + 2); // R
|
|
73
|
+
}
|
|
74
|
+
const opusFrame = this.encoder.encode(stereoPcm, samplesPerFrame);
|
|
75
|
+
|
|
76
|
+
const rtpPacket = this.createRtpPacket(
|
|
77
|
+
Buffer.from(opusFrame),
|
|
78
|
+
seq,
|
|
79
|
+
timestamp,
|
|
80
|
+
DEFAULT_SSRC,
|
|
81
|
+
OPUS_PT,
|
|
82
|
+
seq === 0 // Marker bit on first packet (start of talkspurt)
|
|
83
|
+
);
|
|
84
|
+
packets.push(rtpPacket);
|
|
85
|
+
|
|
86
|
+
seq++;
|
|
87
|
+
timestamp += samplesPerFrame;
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
this.log?.info(`TTS: encoded ${packets.length} RTP packets (${(packets.length * FRAME_DURATION_MS / 1000).toFixed(1)}s)`);
|
|
91
|
+
return packets;
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
/**
|
|
95
|
+
* Like speak(), but returns raw Opus frames (no RTP headers).
|
|
96
|
+
* For use with node-datachannel's media handler which adds RTP headers itself.
|
|
97
|
+
*/
|
|
98
|
+
async speakRaw(text: string): Promise<Buffer[]> {
|
|
99
|
+
// 1. Get PCM from ElevenLabs
|
|
100
|
+
this.log?.info(`TTS: synthesizing (raw) "${text.slice(0, 60)}${text.length > 60 ? "..." : ""}"`);
|
|
101
|
+
const pcm = await this.fetchPcmFromElevenLabs(text);
|
|
102
|
+
this.log?.info(`TTS: received ${pcm.length} bytes of PCM @ 24000Hz`);
|
|
103
|
+
if (!pcm || pcm.length === 0) return [];
|
|
104
|
+
|
|
105
|
+
// 2. Resample to 48kHz
|
|
106
|
+
const pcmEven = pcm.length % 2 !== 0 ? pcm.subarray(0, pcm.length - 1) : pcm;
|
|
107
|
+
const pcm48k = this.resample(pcmEven, 24000 as number, this.sampleRate);
|
|
108
|
+
this.log?.info(`TTS: resampled to ${pcm48k.length} bytes @ 48000Hz (raw mode)`);
|
|
109
|
+
|
|
110
|
+
// 3. Chunk into 20ms frames and Opus encode (no RTP wrapping)
|
|
111
|
+
const samplesPerFrame = (this.sampleRate * FRAME_DURATION_MS) / 1000;
|
|
112
|
+
const bytesPerFrame = samplesPerFrame * 2;
|
|
113
|
+
const frames: Buffer[] = [];
|
|
114
|
+
|
|
115
|
+
// Use a FRESH encoder per call — persistent encoder state causes Chrome decode issues.
|
|
116
|
+
const OpusScript = (await import('opusscript')).default;
|
|
117
|
+
const freshEncoder = new OpusScript(this.sampleRate, 2, OpusScript.Application.AUDIO);
|
|
118
|
+
|
|
119
|
+
for (let offset = 0; offset + bytesPerFrame <= pcm48k.length; offset += bytesPerFrame) {
|
|
120
|
+
const pcmFrame = pcm48k.subarray(offset, offset + bytesPerFrame);
|
|
121
|
+
// Convert mono PCM to stereo (interleaved L,R) for the 2-channel Opus encoder.
|
|
122
|
+
// The SDP advertises sprop-stereo=1, so the browser expects stereo frames.
|
|
123
|
+
const stereoPcm = Buffer.alloc(pcmFrame.length * 2);
|
|
124
|
+
for (let i = 0; i < samplesPerFrame; i++) {
|
|
125
|
+
const sample = pcmFrame.readInt16LE(i * 2);
|
|
126
|
+
stereoPcm.writeInt16LE(sample, i * 4); // L
|
|
127
|
+
stereoPcm.writeInt16LE(sample, i * 4 + 2); // R
|
|
128
|
+
}
|
|
129
|
+
const opusFrame = freshEncoder.encode(stereoPcm, samplesPerFrame);
|
|
130
|
+
frames.push(Buffer.from(opusFrame));
|
|
131
|
+
}
|
|
132
|
+
freshEncoder.delete();
|
|
133
|
+
|
|
134
|
+
this.log?.info(`TTS: encoded ${frames.length} raw Opus frames (${(frames.length * FRAME_DURATION_MS / 1000).toFixed(1)}s)`);
|
|
135
|
+
|
|
136
|
+
// DIAGNOSTIC: dump resampled PCM and Opus frames for offline analysis
|
|
137
|
+
try {
|
|
138
|
+
const fs = await import('fs');
|
|
139
|
+
fs.writeFileSync('/tmp/tts_debug_pcm48k.raw', pcm48k);
|
|
140
|
+
fs.writeFileSync('/tmp/tts_debug_frames.json', JSON.stringify(frames.map(f => Buffer.from(f).toString('base64'))));
|
|
141
|
+
this.log?.info(`TTS: DIAG dumped ${pcm48k.length}b PCM + ${frames.length} frames to /tmp/tts_debug_*`);
|
|
142
|
+
} catch {}
|
|
143
|
+
|
|
144
|
+
return frames;
|
|
145
|
+
}
|
|
146
|
+
|
|
147
|
+
/**
|
|
148
|
+
* Fetch raw PCM 16-bit 24kHz mono audio from ElevenLabs streaming TTS API.
|
|
149
|
+
*/
|
|
150
|
+
private async fetchPcmFromElevenLabs(text: string): Promise<Buffer> {
|
|
151
|
+
// output_format is a QUERY parameter, not a body parameter.
|
|
152
|
+
// Without it, ElevenLabs returns MP3 (audio/mpeg) instead of raw PCM.
|
|
153
|
+
const url = `https://api.elevenlabs.io/v1/text-to-speech/${this.voiceId}/stream?output_format=pcm_24000`;
|
|
154
|
+
|
|
155
|
+
const response = await fetch(url, {
|
|
156
|
+
method: 'POST',
|
|
157
|
+
headers: {
|
|
158
|
+
'xi-api-key': this.apiKey,
|
|
159
|
+
'Content-Type': 'application/json',
|
|
160
|
+
},
|
|
161
|
+
body: JSON.stringify({
|
|
162
|
+
text,
|
|
163
|
+
model_id: this.model,
|
|
164
|
+
...(this.language ? { language_code: this.language } : {}),
|
|
165
|
+
}),
|
|
166
|
+
});
|
|
167
|
+
|
|
168
|
+
if (!response.ok) {
|
|
169
|
+
const errorText = await response.text().catch(() => 'unknown error');
|
|
170
|
+
throw new Error(`ElevenLabs API error ${response.status}: ${errorText}`);
|
|
171
|
+
}
|
|
172
|
+
|
|
173
|
+
// Accumulate response body into a single buffer
|
|
174
|
+
const chunks: Uint8Array[] = [];
|
|
175
|
+
const reader = response.body?.getReader();
|
|
176
|
+
if (!reader) throw new Error('No response body from ElevenLabs');
|
|
177
|
+
|
|
178
|
+
while (true) {
|
|
179
|
+
const { done, value } = await reader.read();
|
|
180
|
+
if (done) break;
|
|
181
|
+
if (value) chunks.push(value);
|
|
182
|
+
}
|
|
183
|
+
|
|
184
|
+
// Concatenate all chunks
|
|
185
|
+
const totalLength = chunks.reduce((sum, c) => sum + c.length, 0);
|
|
186
|
+
const result = Buffer.alloc(totalLength);
|
|
187
|
+
let pos = 0;
|
|
188
|
+
for (const chunk of chunks) {
|
|
189
|
+
result.set(chunk, pos);
|
|
190
|
+
pos += chunk.length;
|
|
191
|
+
}
|
|
192
|
+
|
|
193
|
+
return result;
|
|
194
|
+
}
|
|
195
|
+
|
|
196
|
+
/**
|
|
197
|
+
* Resample PCM 16-bit mono audio using linear interpolation.
|
|
198
|
+
*/
|
|
199
|
+
resample(input: Buffer, fromRate: number, toRate: number): Buffer {
|
|
200
|
+
if (fromRate === toRate) return input;
|
|
201
|
+
|
|
202
|
+
// Defensive: ensure even byte count for 16-bit PCM
|
|
203
|
+
const safeInput = input.length % 2 !== 0 ? input.subarray(0, input.length - 1) : input;
|
|
204
|
+
const inputSamples = safeInput.length / 2; // 16-bit = 2 bytes per sample
|
|
205
|
+
const ratio = fromRate / toRate;
|
|
206
|
+
const outputSamples = Math.floor(inputSamples / ratio);
|
|
207
|
+
const output = Buffer.alloc(outputSamples * 2);
|
|
208
|
+
|
|
209
|
+
for (let i = 0; i < outputSamples; i++) {
|
|
210
|
+
const srcPos = i * ratio;
|
|
211
|
+
const srcIndex = Math.floor(srcPos);
|
|
212
|
+
const frac = srcPos - srcIndex;
|
|
213
|
+
|
|
214
|
+
const s0 = safeInput.readInt16LE(srcIndex * 2);
|
|
215
|
+
const s1 = srcIndex + 1 < inputSamples
|
|
216
|
+
? safeInput.readInt16LE((srcIndex + 1) * 2)
|
|
217
|
+
: s0;
|
|
218
|
+
|
|
219
|
+
const interpolated = Math.round(s0 + frac * (s1 - s0));
|
|
220
|
+
// Clamp to Int16 range
|
|
221
|
+
const clamped = Math.max(-32768, Math.min(32767, interpolated));
|
|
222
|
+
output.writeInt16LE(clamped, i * 2);
|
|
223
|
+
}
|
|
224
|
+
|
|
225
|
+
return output;
|
|
226
|
+
}
|
|
227
|
+
|
|
228
|
+
/**
|
|
229
|
+
* Create an RTP packet with the given Opus payload.
|
|
230
|
+
*
|
|
231
|
+
* RTP Header (12 bytes):
|
|
232
|
+
* Byte 0: 0x80 (V=2, no padding, no extension, CC=0)
|
|
233
|
+
* Byte 1: payload type
|
|
234
|
+
* Bytes 2-3: sequence number (big-endian)
|
|
235
|
+
* Bytes 4-7: timestamp (big-endian, increments by 960 per 20ms frame)
|
|
236
|
+
* Bytes 8-11: SSRC (big-endian)
|
|
237
|
+
*/
|
|
238
|
+
createRtpPacket(payload: Buffer, seq: number, timestamp: number, ssrc: number, pt: number, marker = false): Buffer {
|
|
239
|
+
const header = Buffer.alloc(12);
|
|
240
|
+
|
|
241
|
+
header[0] = 0x80; // V=2
|
|
242
|
+
header[1] = (marker ? 0x80 : 0) | (pt & 0x7f); // Marker bit + payload type
|
|
243
|
+
header.writeUInt16BE(seq & 0xffff, 2);
|
|
244
|
+
header.writeUInt32BE(timestamp >>> 0, 4);
|
|
245
|
+
header.writeUInt32BE(ssrc >>> 0, 8);
|
|
246
|
+
|
|
247
|
+
return Buffer.concat([header, payload]);
|
|
248
|
+
}
|
|
249
|
+
|
|
250
|
+
/**
|
|
251
|
+
* Cleanup OpusScript encoder resources.
|
|
252
|
+
*/
|
|
253
|
+
destroy(): void {
|
|
254
|
+
try {
|
|
255
|
+
this.encoder.delete();
|
|
256
|
+
} catch {
|
|
257
|
+
// Already destroyed or not supported
|
|
258
|
+
}
|
|
259
|
+
}
|
|
260
|
+
}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
export { BotHuddleManager } from './BotHuddleManager.js';
|
|
2
|
+
export type { BotHuddleCallbacks, BotHuddleConfig, BotHuddleState, BotHuddleParticipant } from './BotHuddleManager.js';
|
|
3
|
+
export { AudioPipeline } from './AudioPipeline.js';
|
|
4
|
+
export type { AudioPipelineOptions } from './AudioPipeline.js';
|
|
5
|
+
export { SpeechToText } from './SpeechToText.js';
|
|
6
|
+
export type { STTOptions } from './SpeechToText.js';
|
|
7
|
+
export { TextToSpeech } from './TextToSpeech.js';
|
|
8
|
+
export type { TTSOptions } from './TextToSpeech.js';
|