@decentchat/decentchat-plugin 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,223 @@
1
+ import { execFile } from 'child_process';
2
+ import { promisify } from 'util';
3
+ import { writeFile, readFile, unlink } from 'fs/promises';
4
+ import { randomBytes } from 'crypto';
5
+ import { join } from 'path';
6
+ import { tmpdir } from 'os';
7
+
8
/** Promise-returning form of `execFile`, so external tools can be awaited. */
const execFileAsync = promisify(execFile);
9
+
10
/**
 * Settings accepted by the `SpeechToText` constructor. Every field is optional.
 */
export interface STTOptions {
  /** Transcription back-end; 'openai' and 'groq' are cloud APIs. */
  engine?: 'whisper-cpp' | 'whisper-python' | 'openai' | 'groq';

  /** Model name (local ggml model or cloud model id). */
  model?: string;

  /** Language code forwarded to the transcriber when set. */
  language?: string;

  /** API key for the openai/groq engines. */
  apiKey?: string;

  /** Optional logger; `warn` may be left out. */
  log?: {
    info: (s: string) => void;
    warn?: (s: string) => void;
  };
}
17
+
18
/** Whisper model name used when the caller does not pick one. */
const DEFAULT_MODEL = 'base.en';

/** Directory that holds the `ggml-<model>.bin` files for whisper-cpp. */
const MODEL_DIR = '/opt/homebrew/share/whisper-cpp/models';

/** Executable name of the whisper-cpp command-line tool. */
const WHISPER_BIN = 'whisper-cli';

/** Time limit, in milliseconds, applied to each spawned external process. */
const EXEC_TIMEOUT = 30 * 1000;
22
+
23
/**
 * SpeechToText — converts PCM audio buffers to text.
 *
 * Two back-ends are supported:
 *  - local (default): the audio is written to a temp WAV, resampled to 16 kHz
 *    with `ffmpeg`, and transcribed with `whisper-cli`;
 *  - cloud: the audio is uploaded to the OpenAI or Groq
 *    `/audio/transcriptions` endpoint (engines 'openai' and 'groq').
 */
export class SpeechToText {
  private engine: string;
  private modelPath: string;
  private model: string;
  private language?: string;
  private apiKey?: string;
  private log?: { info: (s: string) => void; warn?: (s: string) => void };

  /**
   * @param opts Optional settings. Missing values fall back to the
   *   'whisper-cpp' engine and the default model.
   */
  constructor(opts?: STTOptions) {
    this.engine = opts?.engine ?? 'whisper-cpp';
    this.model = opts?.model ?? DEFAULT_MODEL;
    this.modelPath = join(MODEL_DIR, `ggml-${this.model}.bin`);
    this.language = opts?.language;
    this.apiKey = opts?.apiKey;
    this.log = opts?.log;
  }

  /**
   * Convert a PCM buffer (16-bit signed LE, mono) to text.
   *
   * @param pcmBuffer Raw PCM samples.
   * @param sampleRate Sample rate of `pcmBuffer` in Hz (default 48000).
   * @returns The trimmed transcript. An empty string is returned when the
   *   buffer holds no complete sample, or when the cloud engine has no API
   *   key or answers with an HTTP error.
   * @throws When the local tools (`ffmpeg`, `whisper-cli`) fail or time out,
   *   or when the cloud request itself rejects (e.g. network failure).
   */
  async transcribe(pcmBuffer: Buffer, sampleRate = 48000): Promise<string> {
    // Fewer than two bytes means there is not even one 16-bit sample, so
    // skip the process spawns / HTTP round trip entirely.
    if (pcmBuffer.length < 2) {
      this.log?.warn?.('[STT] Empty audio buffer — nothing to transcribe');
      return '';
    }
    if (this.engine === 'openai' || this.engine === 'groq') {
      return this.transcribeCloud(pcmBuffer, sampleRate);
    }
    return this.transcribeLocal(pcmBuffer, sampleRate);
  }

  /**
   * Cloud transcription via the OpenAI or Groq Whisper API.
   */
  private async transcribeCloud(pcmBuffer: Buffer, sampleRate: number): Promise<string> {
    const wavBuffer = this.createWavBuffer(pcmBuffer, sampleRate);
    const duration = (pcmBuffer.length / 2 / sampleRate).toFixed(1);

    const isGroq = this.engine === 'groq';
    const baseUrl = isGroq
      ? 'https://api.groq.com/openai/v1'
      : 'https://api.openai.com/v1';

    // Only use this.model for cloud if it looks like a cloud model name
    // (contains 'whisper'). Otherwise use the provider default.
    const providerDefault = isGroq ? 'whisper-large-v3-turbo' : 'whisper-1';
    const model = this.model.includes('whisper') ? this.model : providerDefault;

    // `||` rather than `??`: an empty-string apiKey (e.g. an unset config
    // field) must not block the environment-variable fallback.
    const envVar = isGroq ? 'GROQ_API_KEY' : 'OPENAI_API_KEY';
    const key = this.apiKey || process.env[envVar] || '';

    if (!key) {
      this.log?.warn?.(`[STT] No API key for ${this.engine} — set ${envVar}`);
      return '';
    }

    this.log?.info(`[STT] ${this.engine} transcribe: ${duration}s audio, model=${model}${this.language ? ', lang=' + this.language : ''}`);
    const start = Date.now();

    // Field order matches the request layout: model, optional language,
    // then the response format.
    const fields: Array<[string, string]> = [['model', model]];
    if (this.language) {
      fields.push(['language', this.language]);
    }
    fields.push(['response_format', 'text']);

    const boundary = '----STTBoundary' + randomBytes(8).toString('hex');
    const body = this.buildMultipartBody(boundary, wavBuffer, fields);

    const response = await fetch(`${baseUrl}/audio/transcriptions`, {
      method: 'POST',
      headers: {
        'Authorization': `Bearer ${key}`,
        'Content-Type': `multipart/form-data; boundary=${boundary}`,
      },
      body,
    });

    const elapsed = Date.now() - start;

    if (!response.ok) {
      const err = await response.text().catch(() => 'unknown');
      this.log?.warn?.(`[STT] ${this.engine} error ${response.status}: ${err}`);
      return '';
    }

    const text = (await response.text()).trim();
    this.log?.info(`[STT] ${this.engine} transcribed in ${elapsed}ms: "${text.slice(0, 60)}${text.length > 60 ? '...' : ''}"`);
    return text;
  }

  /**
   * Assemble a multipart/form-data body: the WAV as the "file" part,
   * followed by the given plain-text fields in order.
   */
  private buildMultipartBody(
    boundary: string,
    wavBuffer: Buffer,
    fields: ReadonlyArray<readonly [string, string]>,
  ): Buffer {
    const parts: Buffer[] = [
      Buffer.from(
        `--${boundary}\r\nContent-Disposition: form-data; name="file"; filename="audio.wav"\r\nContent-Type: audio/wav\r\n\r\n`
      ),
      wavBuffer,
      Buffer.from('\r\n'),
    ];
    for (const [name, value] of fields) {
      parts.push(Buffer.from(
        `--${boundary}\r\nContent-Disposition: form-data; name="${name}"\r\n\r\n${value}\r\n`
      ));
    }
    parts.push(Buffer.from(`--${boundary}--\r\n`));
    return Buffer.concat(parts);
  }

  /**
   * Local transcription via whisper-cli.
   *
   * Temp files are created under the OS temp directory with a random id and
   * are always removed afterwards, whether or not transcription succeeded.
   */
  private async transcribeLocal(pcmBuffer: Buffer, sampleRate: number): Promise<string> {
    const id = randomBytes(6).toString('hex');
    const tmp = tmpdir();
    const inputWav = join(tmp, `stt-${id}.wav`);
    const resampledWav = join(tmp, `stt-${id}-16k.wav`);
    const outputBase = join(tmp, `stt-${id}-out`);
    const outputTxt = `${outputBase}.txt`;

    const tempFiles = [inputWav, resampledWav, outputTxt];

    try {
      // 1. Write PCM to WAV
      const wavBuffer = this.createWavBuffer(pcmBuffer, sampleRate);
      await writeFile(inputWav, wavBuffer);
      this.log?.info(`[STT] Wrote ${wavBuffer.length} bytes WAV → ${inputWav}`);

      // 2. Resample to 16 kHz mono via ffmpeg
      await execFileAsync('ffmpeg', [
        '-i', inputWav,
        '-ar', '16000',
        '-ac', '1',
        '-y', resampledWav,
      ], { timeout: EXEC_TIMEOUT });
      this.log?.info(`[STT] Resampled to 16 kHz → ${resampledWav}`);

      // 3. Run whisper-cli
      const args = [
        '--model', this.modelPath,
        '--output-txt',
        '--output-file', outputBase,
        '--no-timestamps',
      ];
      if (this.language) {
        args.push('--language', this.language);
      }
      args.push(resampledWav);
      this.log?.info(`[STT] whisper-cli args: ${args.join(' ')}`);
      await execFileAsync(WHISPER_BIN, args, { timeout: EXEC_TIMEOUT });
      this.log?.info(`[STT] whisper-cli finished`);

      // 4. Read the generated .txt
      const text = await readFile(outputTxt, 'utf-8');
      return text.trim();
    } finally {
      // Best-effort cleanup: a file that was never created is not an error.
      await Promise.all(
        tempFiles.map(f => unlink(f).catch(() => {})),
      );
      this.log?.info(`[STT] Cleaned up temp files`);
    }
  }

  /**
   * Create a valid WAV (RIFF) buffer from raw 16-bit signed LE mono PCM data.
   *
   * A trailing odd byte (half a sample) is dropped so the data chunk size is
   * always a whole number of 2-byte sample frames.
   *
   * @param pcm Raw PCM bytes.
   * @param sampleRate Sample rate in Hz, written into the header.
   * @returns A 44-byte header followed by the PCM payload.
   */
  private createWavBuffer(pcm: Buffer, sampleRate: number): Buffer {
    const numChannels = 1;
    const bitsPerSample = 16;
    const bytesPerSample = bitsPerSample / 8;
    const blockAlign = numChannels * bytesPerSample;
    const byteRate = sampleRate * blockAlign;
    const headerSize = 44;

    const samples = pcm.length % blockAlign === 0
      ? pcm
      : pcm.subarray(0, pcm.length - (pcm.length % blockAlign));
    const dataSize = samples.length;

    const header = Buffer.alloc(headerSize);
    let offset = 0;

    // RIFF chunk descriptor
    header.write('RIFF', offset); offset += 4;
    header.writeUInt32LE(dataSize + headerSize - 8, offset); offset += 4;
    header.write('WAVE', offset); offset += 4;

    // "fmt " sub-chunk: 16-byte body, format tag 1 = uncompressed PCM
    header.write('fmt ', offset); offset += 4;
    header.writeUInt32LE(16, offset); offset += 4;
    header.writeUInt16LE(1, offset); offset += 2;
    header.writeUInt16LE(numChannels, offset); offset += 2;
    header.writeUInt32LE(sampleRate, offset); offset += 4;
    header.writeUInt32LE(byteRate, offset); offset += 4;
    header.writeUInt16LE(blockAlign, offset); offset += 2;
    header.writeUInt16LE(bitsPerSample, offset); offset += 2;

    // "data" sub-chunk
    header.write('data', offset); offset += 4;
    header.writeUInt32LE(dataSize, offset);

    return Buffer.concat([header, samples]);
  }
}
@@ -0,0 +1,260 @@
1
+ import OpusScript from 'opusscript';
2
+
3
/**
 * Settings accepted by the `TextToSpeech` constructor.
 */
export interface TTSOptions {
  /** ElevenLabs API key (required). */
  apiKey: string;

  /** Voice to synthesize with. Default: 'EXAVITQu4vr4xnSDxMaL'. */
  voiceId?: string;

  /** Model id. Default: 'eleven_turbo_v2_5' (multilingual). */
  model?: string;

  /** Language code for multilingual models (e.g. 'sk', 'en'). */
  language?: string;

  /** Output sample rate for Opus. Default: 48000. */
  sampleRate?: number;

  /** Optional logger. */
  log?: {
    info: (s: string) => void;
  };
}
11
+
12
/** ElevenLabs voice used when no `voiceId` option is given. */
const DEFAULT_VOICE_ID = 'EXAVITQu4vr4xnSDxMaL';

/** ElevenLabs model used when no `model` option is given. */
const DEFAULT_MODEL = 'eleven_turbo_v2_5';

/** Encoder/output sample rate (Hz) used when no `sampleRate` option is given. */
const DEFAULT_SAMPLE_RATE = 48_000;

/** Sample rate (Hz) of the PCM stream requested from ElevenLabs. */
const ELEVENLABS_PCM_RATE = 24_000;

/** Duration of one audio frame, in milliseconds. */
const FRAME_DURATION_MS = 20;

/** RTP payload type written into outgoing Opus packets. */
const OPUS_PT = 111;

/** SSRC written into outgoing RTP packets. */
const DEFAULT_SSRC = 1234;
19
+
20
/**
 * TextToSpeech — synthesizes speech with the ElevenLabs streaming API and
 * returns it as Opus frames, optionally wrapped in RTP packets.
 *
 * Pipeline: fetch 24 kHz mono PCM → resample to the encoder rate →
 * split into 20 ms frames → duplicate mono to stereo → Opus encode.
 */
export class TextToSpeech {
  /**
   * RTP timestamp clock for Opus. RFC 7587 fixes it at 48 kHz for every
   * Opus mode and sampling rate, so it must not follow `sampleRate`.
   */
  private static readonly RTP_CLOCK_RATE = 48000;

  private apiKey: string;
  private voiceId: string;
  private model: string;
  private language?: string;
  private sampleRate: number;
  private log?: { info: (s: string) => void };
  private encoder: OpusScript;

  /**
   * @param opts API key plus optional voice, model, language, sample rate
   *   and logger. A 2-channel Opus encoder is created immediately; call
   *   `destroy()` to release it.
   */
  constructor(opts: TTSOptions) {
    this.apiKey = opts.apiKey;
    this.voiceId = opts.voiceId ?? DEFAULT_VOICE_ID;
    this.model = opts.model ?? DEFAULT_MODEL;
    this.language = opts.language;
    this.sampleRate = opts.sampleRate ?? DEFAULT_SAMPLE_RATE;
    this.log = opts.log;
    this.encoder = new OpusScript(this.sampleRate, 2, OpusScript.Application.AUDIO);
  }

  /**
   * Convert text to a sequence of RTP packets containing Opus-encoded audio.
   * Each packet represents a 20ms frame; the first packet carries the marker
   * bit (start of talkspurt).
   *
   * @param text Text to synthesize.
   * @returns One RTP packet per complete 20 ms frame (a trailing partial
   *   frame is discarded).
   * @throws When the ElevenLabs request fails.
   */
  async speak(text: string): Promise<Buffer[]> {
    this.log?.info(`TTS: synthesizing "${text.slice(0, 60)}${text.length > 60 ? '...' : ''}"`);

    // 1. Fetch PCM audio from ElevenLabs
    const pcm24k = await this.fetchPcmFromElevenLabs(text);
    this.log?.info(`TTS: received ${pcm24k.length} bytes of PCM @ ${ELEVENLABS_PCM_RATE}Hz`);

    // 2. Resample to the encoder rate
    const pcm48k = this.resample(this.dropPartialSample(pcm24k), ELEVENLABS_PCM_RATE, this.sampleRate);
    this.log?.info(`TTS: resampled to ${pcm48k.length} bytes @ ${this.sampleRate}Hz`);

    // 3. Chunk into 20ms frames, Opus encode, and wrap each frame in RTP.
    //    The timestamp advances on the fixed 48 kHz RTP clock (960 per
    //    20 ms frame), independent of the encoder's sample rate.
    const frames = this.encodeFrames(pcm48k, this.encoder);
    const timestampStep = (TextToSpeech.RTP_CLOCK_RATE * FRAME_DURATION_MS) / 1000;
    const packets = frames.map((frame, seq) =>
      this.createRtpPacket(
        frame,
        seq,
        seq * timestampStep,
        DEFAULT_SSRC,
        OPUS_PT,
        seq === 0, // Marker bit on first packet (start of talkspurt)
      ),
    );

    this.log?.info(`TTS: encoded ${packets.length} RTP packets (${(packets.length * FRAME_DURATION_MS / 1000).toFixed(1)}s)`);
    return packets;
  }

  /**
   * Like speak(), but returns raw Opus frames (no RTP headers).
   * For use with node-datachannel's media handler which adds RTP headers itself.
   *
   * Setting the environment variable `DECENTCHAT_TTS_DEBUG_DUMP=1` writes the
   * resampled PCM and the encoded frames to `/tmp/tts_debug_*` for offline
   * analysis; by default nothing is written to disk.
   *
   * @param text Text to synthesize.
   * @returns One Opus frame per complete 20 ms of audio; empty when the
   *   service returned no audio.
   * @throws When the ElevenLabs request fails.
   */
  async speakRaw(text: string): Promise<Buffer[]> {
    // 1. Get PCM from ElevenLabs
    this.log?.info(`TTS: synthesizing (raw) "${text.slice(0, 60)}${text.length > 60 ? '...' : ''}"`);
    const pcm = await this.fetchPcmFromElevenLabs(text);
    this.log?.info(`TTS: received ${pcm.length} bytes of PCM @ ${ELEVENLABS_PCM_RATE}Hz`);
    if (pcm.length === 0) return [];

    // 2. Resample to the encoder rate
    const pcm48k = this.resample(this.dropPartialSample(pcm), ELEVENLABS_PCM_RATE, this.sampleRate);
    this.log?.info(`TTS: resampled to ${pcm48k.length} bytes @ ${this.sampleRate}Hz (raw mode)`);

    // 3. Chunk into 20ms frames and Opus encode (no RTP wrapping).
    // Use a FRESH encoder per call — persistent encoder state causes Chrome decode issues.
    const freshEncoder = new OpusScript(this.sampleRate, 2, OpusScript.Application.AUDIO);
    let frames: Buffer[];
    try {
      frames = this.encodeFrames(pcm48k, freshEncoder);
    } finally {
      // Release the encoder even if encoding throws.
      freshEncoder.delete();
    }

    this.log?.info(`TTS: encoded ${frames.length} raw Opus frames (${(frames.length * FRAME_DURATION_MS / 1000).toFixed(1)}s)`);

    if (process.env.DECENTCHAT_TTS_DEBUG_DUMP === '1') {
      await this.dumpDiagnostics(pcm48k, frames);
    }

    return frames;
  }

  /**
   * Drop a trailing odd byte. ElevenLabs occasionally returns odd byte
   * counts, and 16-bit PCM needs 2 bytes per sample.
   */
  private dropPartialSample(pcm: Buffer): Buffer {
    return pcm.length % 2 !== 0 ? pcm.subarray(0, pcm.length - 1) : pcm;
  }

  /**
   * Split mono 16-bit PCM into 20 ms frames and Opus-encode each one.
   * A trailing partial frame is discarded.
   */
  private encodeFrames(pcm: Buffer, encoder: OpusScript): Buffer[] {
    const samplesPerFrame = (this.sampleRate * FRAME_DURATION_MS) / 1000; // 960 at 48kHz
    const bytesPerFrame = samplesPerFrame * 2; // 16-bit samples = 2 bytes each
    const frames: Buffer[] = [];

    for (let offset = 0; offset + bytesPerFrame <= pcm.length; offset += bytesPerFrame) {
      // Convert mono PCM to stereo (interleaved L,R) for the 2-channel Opus encoder.
      // The SDP advertises sprop-stereo=1, so the browser expects stereo frames.
      const stereoPcm = Buffer.alloc(bytesPerFrame * 2);
      for (let i = 0; i < samplesPerFrame; i++) {
        const sample = pcm.readInt16LE(offset + i * 2);
        stereoPcm.writeInt16LE(sample, i * 4); // L
        stereoPcm.writeInt16LE(sample, i * 4 + 2); // R
      }
      frames.push(Buffer.from(encoder.encode(stereoPcm, samplesPerFrame)));
    }

    return frames;
  }

  /**
   * Best-effort diagnostic dump of the resampled PCM and encoded frames.
   * Failures are ignored so diagnostics can never break synthesis.
   */
  private async dumpDiagnostics(pcm48k: Buffer, frames: Buffer[]): Promise<void> {
    try {
      const { promises: fsp } = await import('fs');
      await fsp.writeFile('/tmp/tts_debug_pcm48k.raw', pcm48k);
      await fsp.writeFile('/tmp/tts_debug_frames.json', JSON.stringify(frames.map(f => f.toString('base64'))));
      this.log?.info(`TTS: DIAG dumped ${pcm48k.length}b PCM + ${frames.length} frames to /tmp/tts_debug_*`);
    } catch {
      // Diagnostics only — ignore write failures.
    }
  }

  /**
   * Fetch raw PCM 16-bit 24kHz mono audio from ElevenLabs streaming TTS API.
   *
   * @throws On a non-2xx response or when the response has no body.
   */
  private async fetchPcmFromElevenLabs(text: string): Promise<Buffer> {
    // output_format is a QUERY parameter, not a body parameter.
    // Without it, ElevenLabs returns MP3 (audio/mpeg) instead of raw PCM.
    // The voice id is percent-encoded so it cannot alter the URL path.
    const url = `https://api.elevenlabs.io/v1/text-to-speech/${encodeURIComponent(this.voiceId)}/stream?output_format=pcm_24000`;

    const response = await fetch(url, {
      method: 'POST',
      headers: {
        'xi-api-key': this.apiKey,
        'Content-Type': 'application/json',
      },
      body: JSON.stringify({
        text,
        model_id: this.model,
        ...(this.language ? { language_code: this.language } : {}),
      }),
    });

    if (!response.ok) {
      const errorText = await response.text().catch(() => 'unknown error');
      throw new Error(`ElevenLabs API error ${response.status}: ${errorText}`);
    }

    const reader = response.body?.getReader();
    if (!reader) throw new Error('No response body from ElevenLabs');

    // Accumulate the streamed response body into a single buffer
    const chunks: Uint8Array[] = [];
    while (true) {
      const { done, value } = await reader.read();
      if (done) break;
      if (value) chunks.push(value);
    }

    return Buffer.concat(chunks);
  }

  /**
   * Resample PCM 16-bit mono audio using linear interpolation.
   *
   * @param input 16-bit signed LE mono PCM.
   * @param fromRate Sample rate of `input` in Hz.
   * @param toRate Desired sample rate in Hz.
   * @returns `input` itself when the rates are equal, otherwise a new buffer.
   */
  resample(input: Buffer, fromRate: number, toRate: number): Buffer {
    if (fromRate === toRate) return input;

    // Defensive: ensure even byte count for 16-bit PCM
    const safeInput = input.length % 2 !== 0 ? input.subarray(0, input.length - 1) : input;
    const inputSamples = safeInput.length / 2; // 16-bit = 2 bytes per sample
    const ratio = fromRate / toRate;
    const outputSamples = Math.floor(inputSamples / ratio);
    const output = Buffer.alloc(outputSamples * 2);

    for (let i = 0; i < outputSamples; i++) {
      const srcPos = i * ratio;
      const srcIndex = Math.floor(srcPos);
      const frac = srcPos - srcIndex;

      const s0 = safeInput.readInt16LE(srcIndex * 2);
      // Past the last input sample there is nothing to interpolate towards,
      // so hold the final value.
      const s1 = srcIndex + 1 < inputSamples
        ? safeInput.readInt16LE((srcIndex + 1) * 2)
        : s0;

      const interpolated = Math.round(s0 + frac * (s1 - s0));
      // Clamp to Int16 range
      const clamped = Math.max(-32768, Math.min(32767, interpolated));
      output.writeInt16LE(clamped, i * 2);
    }

    return output;
  }

  /**
   * Create an RTP packet with the given Opus payload.
   *
   * RTP Header (12 bytes):
   *   Byte 0:     0x80 (V=2, no padding, no extension, CC=0)
   *   Byte 1:     marker bit + payload type
   *   Bytes 2-3:  sequence number (big-endian, wrapped to 16 bits)
   *   Bytes 4-7:  timestamp (big-endian, increments by 960 per 20ms frame)
   *   Bytes 8-11: SSRC (big-endian)
   */
  createRtpPacket(payload: Buffer, seq: number, timestamp: number, ssrc: number, pt: number, marker = false): Buffer {
    const header = Buffer.alloc(12);

    header[0] = 0x80; // V=2
    header[1] = (marker ? 0x80 : 0) | (pt & 0x7f); // Marker bit + payload type
    header.writeUInt16BE(seq & 0xffff, 2);
    header.writeUInt32BE(timestamp >>> 0, 4);
    header.writeUInt32BE(ssrc >>> 0, 8);

    return Buffer.concat([header, payload]);
  }

  /**
   * Cleanup OpusScript encoder resources.
   */
  destroy(): void {
    try {
      this.encoder.delete();
    } catch {
      // Already destroyed or not supported
    }
  }
}
@@ -0,0 +1,8 @@
1
+ export { BotHuddleManager } from './BotHuddleManager.js';
2
+ export type { BotHuddleCallbacks, BotHuddleConfig, BotHuddleState, BotHuddleParticipant } from './BotHuddleManager.js';
3
+ export { AudioPipeline } from './AudioPipeline.js';
4
+ export type { AudioPipelineOptions } from './AudioPipeline.js';
5
+ export { SpeechToText } from './SpeechToText.js';
6
+ export type { STTOptions } from './SpeechToText.js';
7
+ export { TextToSpeech } from './TextToSpeech.js';
8
+ export type { TTSOptions } from './TextToSpeech.js';