discoclaw 1.3.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. package/.env.example +4 -6
  2. package/.env.example.full +13 -32
  3. package/README.md +1 -1
  4. package/dist/cli/dashboard.test.js +0 -4
  5. package/dist/cli/init-wizard.js +4 -8
  6. package/dist/cli/init-wizard.test.js +4 -10
  7. package/dist/config.js +2 -42
  8. package/dist/config.test.js +8 -72
  9. package/dist/dashboard/server.js +1 -5
  10. package/dist/dashboard/server.test.js +3 -6
  11. package/dist/discord/actions.js +112 -6
  12. package/dist/discord/actions.test.js +117 -1
  13. package/dist/discord/help-command.js +1 -1
  14. package/dist/discord/message-coordinator.js +3 -8
  15. package/dist/discord/models-command.js +1 -1
  16. package/dist/discord/reaction-handler.js +2 -2
  17. package/dist/discord/reaction-handler.test.js +55 -0
  18. package/dist/discord/verify-push.js +31 -36
  19. package/dist/discord/verify-push.test.js +34 -6
  20. package/dist/discord/voice-command.js +1 -31
  21. package/dist/discord/voice-command.test.js +21 -259
  22. package/dist/discord/voice-status-command.js +3 -22
  23. package/dist/discord/voice-status-command.test.js +16 -124
  24. package/dist/discord-followup.test.js +133 -0
  25. package/dist/health/config-doctor.js +5 -27
  26. package/dist/health/config-doctor.test.js +1 -4
  27. package/dist/index.js +1 -28
  28. package/dist/runtime-overrides.js +2 -3
  29. package/dist/runtime-overrides.test.js +27 -193
  30. package/dist/tasks/store.js +10 -6
  31. package/dist/tasks/store.test.js +44 -0
  32. package/dist/tasks/task-action-executor.test.js +162 -50
  33. package/dist/tasks/task-action-mutations.js +22 -2
  34. package/dist/tasks/task-action-read-ops.js +7 -1
  35. package/dist/tasks/task-action-runner-types.js +19 -1
  36. package/dist/voice/audio-pipeline.js +145 -298
  37. package/docs/configuration.md +4 -9
  38. package/docs/official-docs.md +6 -9
  39. package/docs/runtime-switching.md +1 -1
  40. package/package.json +1 -1
  41. package/dist/voice/audio-pipeline.test.js +0 -1100
  42. package/dist/voice/stt-deepgram.js +0 -154
  43. package/dist/voice/stt-deepgram.test.js +0 -275
  44. package/dist/voice/stt-factory.js +0 -42
  45. package/dist/voice/stt-factory.test.js +0 -45
  46. package/dist/voice/stt-openai.js +0 -156
  47. package/dist/voice/stt-openai.test.js +0 -281
  48. package/dist/voice/tts-cartesia.js +0 -169
  49. package/dist/voice/tts-cartesia.test.js +0 -228
  50. package/dist/voice/tts-deepgram.js +0 -84
  51. package/dist/voice/tts-deepgram.test.js +0 -220
  52. package/dist/voice/tts-factory.js +0 -52
  53. package/dist/voice/tts-factory.test.js +0 -53
  54. package/dist/voice/tts-openai.js +0 -70
  55. package/dist/voice/tts-openai.test.js +0 -138
  56. package/dist/voice/types.test.js +0 -90
@@ -1,154 +0,0 @@
1
- import WebSocket from 'ws';
2
const DEEPGRAM_STREAMING_URL = 'wss://api.deepgram.com/v1/listen';
const MAX_RETRIES = 3;
const BASE_BACKOFF_MS = 500;
// WebSocket readyState values. Compared numerically so sockets produced by an
// injected `wsFactory` do not need to share the statics of the `ws` class.
const WS_CONNECTING = 0;
const WS_OPEN = 1;
export const KEEPALIVE_INTERVAL_MS = 5_000;

/**
 * Streaming speech-to-text provider backed by the Deepgram `/v1/listen`
 * WebSocket endpoint.
 *
 * Lifecycle: `idle` -> `starting` -> `open` -> `stopped`. While open, a
 * `KeepAlive` text frame is sent every `KEEPALIVE_INTERVAL_MS`. An unexpected
 * close triggers a reconnect with exponential backoff (`BASE_BACKOFF_MS`,
 * doubling per attempt) for at most `MAX_RETRIES` attempts. `retryCount` is
 * only reset by `start()`, so the retry budget is per `start()` call, not per
 * outage.
 */
export class DeepgramSttProvider {
    apiKey;
    sampleRate;
    model;
    log;
    wsFactory;
    ws = null;
    callback = null;
    state = 'idle';
    retryCount = 0;
    feedCount = 0;
    keepAliveTimer = null;
    reconnectTimer = null;

    /**
     * @param {object} opts
     * @param {string} opts.apiKey - Sent as `Authorization: Token <apiKey>`.
     * @param {number} opts.sampleRate - Forwarded as the `sample_rate` query parameter.
     * @param {string} [opts.model] - Defaults to `'nova-3-general'`.
     * @param {object} opts.log - Logger exposing `info`, `warn` and `error`.
     * @param {Function} [opts.wsFactory] - `(url, headers) => socket`; defaults to a `ws` WebSocket.
     */
    constructor(opts) {
        this.apiKey = opts.apiKey;
        this.sampleRate = opts.sampleRate;
        this.model = opts.model ?? 'nova-3-general';
        this.log = opts.log;
        this.wsFactory =
            opts.wsFactory ?? ((url, headers) => new WebSocket(url, { headers }));
    }

    /**
     * Opens the connection. No-op while already `open` or `starting`.
     *
     * @returns {Promise<void>} Resolves once the socket is open.
     * @throws {Error} If the socket closes (or the provider is stopped) before it opens.
     */
    async start() {
        if (this.state === 'open' || this.state === 'starting')
            return;
        this.state = 'starting';
        this.retryCount = 0;
        try {
            await this.connect();
        }
        catch (err) {
            // Without this reset the provider would stay in 'starting' forever and
            // every later start() would return early without connecting.
            if (this.state === 'starting')
                this.state = 'idle';
            throw err;
        }
    }

    /**
     * Sends one audio frame (`frame.buffer`) to Deepgram as a binary message.
     *
     * @throws {Error} If the provider is not in the `open` state.
     */
    feedAudio(frame) {
        if (this.state !== 'open') {
            throw new Error('Cannot feedAudio before start() or after stop()');
        }
        this.feedCount++;
        // Log the first frame and then every 100th to keep the log volume bounded.
        if (this.feedCount === 1 || this.feedCount % 100 === 0) {
            this.log.info({ feedCount: this.feedCount, bufferSize: frame.buffer.length }, 'stt:feedAudio');
        }
        this.ws.send(frame.buffer);
    }

    /** Registers the single transcription callback (replaces any previous one). */
    onTranscription(callback) {
        this.callback = callback;
    }

    /**
     * Stops the provider: cancels keepalive and any scheduled reconnect, sends
     * `CloseStream` on an open socket and closes it. A socket that is still
     * connecting is closed as well, which makes the pending `start()` reject
     * instead of hanging. Idempotent.
     */
    async stop() {
        this.clearKeepAlive();
        this.clearReconnectTimer();
        if (this.state === 'stopped' || this.state === 'idle')
            return;
        this.state = 'stopped';
        const ws = this.ws;
        this.ws = null;
        if (!ws)
            return;
        if (ws.readyState === WS_OPEN) {
            ws.send(JSON.stringify({ type: 'CloseStream' }));
            ws.close();
        }
        else if (ws.readyState === WS_CONNECTING) {
            ws.close();
        }
    }

    /** Builds the streaming URL with model, encoding and sample-rate query parameters. */
    buildUrl() {
        const params = new URLSearchParams({
            model: this.model,
            encoding: 'linear16',
            sample_rate: String(this.sampleRate),
        });
        return `${DEEPGRAM_STREAMING_URL}?${params.toString()}`;
    }

    /**
     * Creates a socket and wires its handlers.
     *
     * @returns {Promise<void>} Resolves on `open`; rejects if the socket closes
     *   first or is superseded/stopped before it opens.
     */
    connect() {
        this.clearKeepAlive();
        this.clearReconnectTimer();
        return new Promise((resolve, reject) => {
            const ws = this.wsFactory(this.buildUrl(), {
                Authorization: `Token ${this.apiKey}`,
            });
            this.ws = ws;
            let opened = false;
            // Events from a socket that stop() or a newer connect() has replaced
            // must not mutate provider state or schedule reconnects.
            const isCurrent = () => this.ws === ws && this.state !== 'stopped';
            ws.on('open', () => {
                if (!isCurrent()) {
                    ws.close();
                    reject(new Error('Deepgram STT connection was stopped or superseded before it opened'));
                    return;
                }
                opened = true;
                this.state = 'open';
                this.log.info({ url: DEEPGRAM_STREAMING_URL }, 'Deepgram STT connected');
                this.keepAliveTimer = setInterval(() => {
                    if (this.ws && this.ws.readyState === WS_OPEN) {
                        this.ws.send(JSON.stringify({ type: 'KeepAlive' }));
                    }
                }, KEEPALIVE_INTERVAL_MS);
                resolve();
            });
            // Messages are deliberately not filtered by socket identity, so results
            // that arrive after stop() has sent CloseStream still reach the callback.
            ws.on('message', (data) => {
                this.handleMessage(data);
            });
            ws.on('error', (err) => {
                this.log.error({ err: err.message }, 'Deepgram STT WebSocket error');
            });
            ws.on('close', (code, reason) => {
                if (!opened) {
                    // Always settle the pending connect, even after stop(), so that
                    // an awaiting start() cannot hang.
                    reject(new Error(`WebSocket closed during connect: code=${code} reason=${String(reason ?? '')}`));
                    return;
                }
                if (!isCurrent())
                    return;
                this.handleUnexpectedClose();
            });
        });
    }

    /**
     * Parses one Deepgram message, logs a summary and forwards the first
     * alternative to the registered callback. `isFinal` is true only when both
     * `is_final` and `speech_final` are set.
     */
    handleMessage(data) {
        let result;
        try {
            const parsed = JSON.parse(String(data));
            // Log all Deepgram messages for debugging
            const alt = parsed?.channel?.alternatives?.[0];
            const transcript = alt?.transcript ?? '';
            this.log.info({
                type: parsed.type,
                isFinal: parsed.is_final,
                speechFinal: parsed.speech_final,
                transcript: transcript.slice(0, 80),
            }, 'stt:deepgram message');
            if (!this.callback || !alt)
                return;
            result = {
                text: transcript,
                confidence: alt.confidence,
                isFinal: Boolean(parsed.is_final && parsed.speech_final),
            };
        }
        catch (err) {
            this.log.error({ err }, 'Failed to parse Deepgram STT message');
            return;
        }
        // Kept outside the parse try/catch so a failing consumer is not
        // misreported as a malformed Deepgram message.
        try {
            this.callback(result);
        }
        catch (err) {
            this.log.error({ err }, 'Deepgram STT transcription callback threw');
        }
    }

    /** Cancels the keepalive interval, if any. */
    clearKeepAlive() {
        if (this.keepAliveTimer !== null) {
            clearInterval(this.keepAliveTimer);
            this.keepAliveTimer = null;
        }
    }

    /** Cancels a scheduled reconnect attempt, if any. */
    clearReconnectTimer() {
        if (this.reconnectTimer !== null) {
            clearTimeout(this.reconnectTimer);
            this.reconnectTimer = null;
        }
    }

    /**
     * Schedules a reconnect with exponential backoff, or gives up (state
     * becomes `stopped`) once `MAX_RETRIES` attempts have been used.
     */
    handleUnexpectedClose() {
        this.clearKeepAlive();
        if (this.retryCount >= MAX_RETRIES) {
            this.log.error({ retries: this.retryCount }, 'Deepgram STT exhausted reconnect retries');
            this.state = 'stopped';
            return;
        }
        this.retryCount++;
        const delay = BASE_BACKOFF_MS * 2 ** (this.retryCount - 1);
        this.log.warn({ attempt: this.retryCount, maxRetries: MAX_RETRIES, delayMs: delay }, 'Deepgram STT reconnecting after unexpected close');
        this.clearReconnectTimer();
        this.reconnectTimer = setTimeout(() => {
            this.reconnectTimer = null;
            if (this.state === 'stopped')
                return;
            this.state = 'starting';
            this.connect().catch((err) => {
                // A rejection caused by stop() is expected and must not retry.
                if (this.state === 'stopped')
                    return;
                this.log.error({ err }, 'Deepgram STT reconnect failed');
                this.handleUnexpectedClose();
            });
        }, delay);
    }
}
@@ -1,275 +0,0 @@
1
- import { describe, it, expect, vi, beforeEach } from 'vitest';
2
- import { EventEmitter } from 'node:events';
3
- import { DeepgramSttProvider, KEEPALIVE_INTERVAL_MS } from './stt-deepgram.js';
4
- // ---------------------------------------------------------------------------
5
- // Mock WebSocket (ws-library style: EventEmitter with readyState)
6
- // ---------------------------------------------------------------------------
7
/**
 * Minimal stand-in for a `ws`-style socket: an EventEmitter with a
 * `readyState`, a record of everything passed to `send()`, and helpers that
 * let a test simulate server-side events.
 */
class MockWebSocket extends EventEmitter {
    static OPEN = 1;
    static CLOSED = 3;
    url;
    headers;
    readyState = MockWebSocket.OPEN;
    sent = [];

    constructor(url, headers) {
        super();
        Object.assign(this, { url, headers });
        // Defer 'open' by one microtask so the caller can attach listeners first.
        queueMicrotask(() => {
            this.emit('open');
        });
    }

    /** Records the payload instead of transmitting it. */
    send(data) {
        this.sent.push(data);
    }

    /** Marks the socket closed; does not emit a 'close' event. */
    close() {
        this.readyState = MockWebSocket.CLOSED;
    }

    // --- Test helpers ------------------------------------------------------

    /** Emits a 'message' event carrying the JSON encoding of `data`. */
    _receiveMessage(data) {
        const payload = JSON.stringify(data);
        this.emit('message', payload);
    }

    /** Marks the socket closed and emits 'close' with an empty reason buffer. */
    _triggerClose(code = 1006) {
        this.close();
        this.emit('close', code, Buffer.alloc(0));
    }

    /** Emits an 'error' event wrapping `msg` in an Error. */
    _triggerError(msg = 'test error') {
        const failure = new Error(msg);
        this.emit('error', failure);
    }
}
39
- // ---------------------------------------------------------------------------
40
- // Helpers
41
- // ---------------------------------------------------------------------------
42
/** Builds a logger whose `info`, `warn` and `error` methods are fresh `vi.fn()` mocks. */
function createLogger() {
    const logger = {};
    for (const level of ['info', 'warn', 'error']) {
        logger[level] = vi.fn();
    }
    return logger;
}
45
// Most recent socket handed out by mockWsFactory; tests use it to drive events.
let lastCreatedWs = null;

/** Socket factory for the provider under test; remembers the socket it created. */
function mockWsFactory(url, headers) {
    lastCreatedWs = new MockWebSocket(url, headers);
    return lastCreatedWs;
}

// Alias under which the factory is passed to the provider.
const typedWsFactory = mockWsFactory;
53
/**
 * Creates a DeepgramSttProvider wired to the mock socket factory.
 * `overrides` may supply `apiKey`, `sampleRate` and `log`; nullish values fall
 * back to test defaults.
 */
function makeProvider(overrides = {}) {
    const apiKey = overrides.apiKey ?? 'test-key';
    const sampleRate = overrides.sampleRate ?? 16000;
    const log = overrides.log ?? createLogger();
    return new DeepgramSttProvider({ apiKey, sampleRate, log, wsFactory: typedWsFactory });
}
61
/** Builds a frame object for tests: `data` as a Buffer, tagged 16000 Hz, 1 channel. */
function makeFrame(data = [0, 1, 2, 3]) {
    const buffer = Buffer.from(data);
    return { buffer, sampleRate: 16000, channels: 1 };
}
64
- // Local copy of the (non-exported) BASE_BACKOFF_MS from stt-deepgram.js, used by the retry tests' backoff calculation; keep the two values in sync
65
const BASE_BACKOFF_MS = 500;

/** Returns the KeepAlive text frames recorded in a mock socket's `sent` list. */
function keepAliveFrames(ws) {
    return ws.sent.filter((m) => typeof m === 'string' && JSON.parse(m).type === 'KeepAlive');
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------
beforeEach(() => {
    // Safety net: a test that fails before its trailing vi.useRealTimers()
    // would otherwise leave fake timers installed for every later test.
    vi.useRealTimers();
    vi.clearAllMocks();
    lastCreatedWs = null;
});

describe('DeepgramSttProvider', () => {
    it('start opens connection with correct URL and auth header', async () => {
        const provider = makeProvider({ apiKey: 'my-key', sampleRate: 48000 });
        await provider.start();
        expect(lastCreatedWs).not.toBeNull();
        const url = new URL(lastCreatedWs.url);
        expect(url.protocol).toBe('wss:');
        expect(url.hostname).toBe('api.deepgram.com');
        expect(url.pathname).toBe('/v1/listen');
        expect(url.searchParams.get('model')).toBe('nova-3-general');
        expect(url.searchParams.get('encoding')).toBe('linear16');
        expect(url.searchParams.get('sample_rate')).toBe('48000');
        // Auth is via header, not query param
        expect(url.searchParams.get('token')).toBeNull();
        expect(lastCreatedWs.headers.Authorization).toBe('Token my-key');
    });

    it('feedAudio sends binary data', async () => {
        const provider = makeProvider();
        await provider.start();
        const frame = makeFrame([10, 20, 30]);
        provider.feedAudio(frame);
        expect(lastCreatedWs.sent).toHaveLength(1);
        expect(lastCreatedWs.sent[0]).toEqual(frame.buffer);
    });

    it('parses Deepgram JSON into TranscriptionResult for interim results', async () => {
        const provider = makeProvider();
        const results = [];
        provider.onTranscription((r) => results.push(r));
        await provider.start();
        lastCreatedWs._receiveMessage({
            is_final: false,
            speech_final: false,
            channel: { alternatives: [{ transcript: 'hello', confidence: 0.85 }] },
        });
        expect(results).toHaveLength(1);
        expect(results[0]).toEqual({ text: 'hello', confidence: 0.85, isFinal: false });
    });

    it('parses Deepgram JSON into TranscriptionResult for final results', async () => {
        const provider = makeProvider();
        const results = [];
        provider.onTranscription((r) => results.push(r));
        await provider.start();
        lastCreatedWs._receiveMessage({
            is_final: true,
            speech_final: true,
            channel: { alternatives: [{ transcript: 'hello world', confidence: 0.97 }] },
        });
        expect(results).toHaveLength(1);
        expect(results[0]).toEqual({ text: 'hello world', confidence: 0.97, isFinal: true });
    });

    it('isFinal requires both is_final and speech_final', async () => {
        const provider = makeProvider();
        const results = [];
        provider.onTranscription((r) => results.push(r));
        await provider.start();
        // is_final true but speech_final false → not final
        lastCreatedWs._receiveMessage({
            is_final: true,
            speech_final: false,
            channel: { alternatives: [{ transcript: 'partial', confidence: 0.9 }] },
        });
        expect(results[0].isFinal).toBe(false);
    });

    it('stop sends CloseStream message', async () => {
        const provider = makeProvider();
        await provider.start();
        const ws = lastCreatedWs;
        await provider.stop();
        const closeMsg = ws.sent.find((m) => typeof m === 'string' && JSON.parse(m).type === 'CloseStream');
        expect(closeMsg).toBeDefined();
    });

    it('double stop is idempotent', async () => {
        const provider = makeProvider();
        await provider.start();
        await provider.stop();
        // Should not throw
        await provider.stop();
    });

    it('feedAudio before start throws', () => {
        const provider = makeProvider();
        expect(() => provider.feedAudio(makeFrame())).toThrow('Cannot feedAudio before start() or after stop()');
    });

    it('reconnect fires on unexpected close up to retry limit', async () => {
        vi.useFakeTimers();
        const log = createLogger();
        const provider = makeProvider({ log });
        await provider.start();
        // Trigger unexpected close — should schedule reconnect
        lastCreatedWs._triggerClose(1006);
        expect(log.warn).toHaveBeenCalledTimes(1);
        expect(vi.mocked(log.warn).mock.calls[0][1]).toContain('reconnecting');
        // Advance past first retry (500ms)
        await vi.advanceTimersByTimeAsync(500);
        expect(lastCreatedWs).not.toBeNull();
        // Trigger another close
        lastCreatedWs._triggerClose(1006);
        expect(log.warn).toHaveBeenCalledTimes(2);
        // Advance past second retry (1000ms)
        await vi.advanceTimersByTimeAsync(1000);
        // Trigger third close
        lastCreatedWs._triggerClose(1006);
        expect(log.warn).toHaveBeenCalledTimes(3);
        // Advance past third retry (2000ms)
        await vi.advanceTimersByTimeAsync(2000);
        // Fourth close — retries exhausted
        lastCreatedWs._triggerClose(1006);
        expect(log.error).toHaveBeenCalled();
        expect(vi
            .mocked(log.error)
            .mock.calls.some((c) => typeof c[1] === 'string' && c[1].includes('exhausted'))).toBe(true);
        vi.useRealTimers();
    });

    it('error is logged after retries exhausted', async () => {
        vi.useFakeTimers();
        const log = createLogger();
        const provider = makeProvider({ log });
        await provider.start();
        // Exhaust all 3 retries
        for (let i = 0; i < 3; i++) {
            lastCreatedWs._triggerClose(1006);
            await vi.advanceTimersByTimeAsync(BASE_BACKOFF_MS * 2 ** i);
        }
        // Final close after all retries
        lastCreatedWs._triggerClose(1006);
        const errorCalls = vi.mocked(log.error).mock.calls;
        const exhaustedCall = errorCalls.find((c) => typeof c[1] === 'string' && c[1].includes('exhausted reconnect retries'));
        expect(exhaustedCall).toBeDefined();
        vi.useRealTimers();
    });

    // -------------------------------------------------------------------------
    // KeepAlive tests
    // -------------------------------------------------------------------------
    it('sends KeepAlive text frames on the interval after start', async () => {
        vi.useFakeTimers();
        const provider = makeProvider();
        await provider.start();
        const ws = lastCreatedWs;
        // No KeepAlive sent yet (only just connected)
        expect(keepAliveFrames(ws)).toHaveLength(0);
        // Advance one interval — should get one KeepAlive
        await vi.advanceTimersByTimeAsync(KEEPALIVE_INTERVAL_MS);
        expect(keepAliveFrames(ws)).toHaveLength(1);
        // Advance another interval — should get a second KeepAlive
        await vi.advanceTimersByTimeAsync(KEEPALIVE_INTERVAL_MS);
        expect(keepAliveFrames(ws)).toHaveLength(2);
        await provider.stop();
        vi.useRealTimers();
    });

    it('stops sending KeepAlive after stop()', async () => {
        vi.useFakeTimers();
        const provider = makeProvider();
        await provider.start();
        const ws = lastCreatedWs;
        await provider.stop();
        // Advance well past the interval — no KeepAlive should appear
        await vi.advanceTimersByTimeAsync(KEEPALIVE_INTERVAL_MS * 3);
        expect(keepAliveFrames(ws)).toHaveLength(0);
        vi.useRealTimers();
    });

    it('clears old keepalive timer and starts new one on reconnect', async () => {
        vi.useFakeTimers();
        const provider = makeProvider();
        await provider.start();
        const ws1 = lastCreatedWs;
        // Advance to get one KeepAlive on the first connection
        await vi.advanceTimersByTimeAsync(KEEPALIVE_INTERVAL_MS);
        expect(keepAliveFrames(ws1)).toHaveLength(1);
        // Trigger unexpected close → reconnect
        ws1._triggerClose(1006);
        await vi.advanceTimersByTimeAsync(BASE_BACKOFF_MS); // first retry backoff
        const ws2 = lastCreatedWs;
        expect(ws2).not.toBe(ws1);
        // Old timer should be cleared — no further KeepAlives on ws1
        await vi.advanceTimersByTimeAsync(KEEPALIVE_INTERVAL_MS);
        expect(keepAliveFrames(ws1)).toHaveLength(1); // still just the original one
        // New timer should fire on ws2
        expect(keepAliveFrames(ws2)).toHaveLength(1);
        await provider.stop();
        vi.useRealTimers();
    });

    it('KeepAlive messages are JSON text strings, not Buffers', async () => {
        vi.useFakeTimers();
        const provider = makeProvider();
        await provider.start();
        const ws = lastCreatedWs;
        await vi.advanceTimersByTimeAsync(KEEPALIVE_INTERVAL_MS);
        const keepAlives = keepAliveFrames(ws);
        expect(keepAlives).toHaveLength(1);
        // Must be a string (text frame), not a Buffer (binary frame)
        expect(typeof keepAlives[0]).toBe('string');
        expect(keepAlives[0]).not.toBeInstanceOf(Buffer);
        expect(JSON.parse(keepAlives[0])).toEqual({ type: 'KeepAlive' });
        await provider.stop();
        vi.useRealTimers();
    });
});
@@ -1,42 +0,0 @@
1
- import { DeepgramSttProvider } from './stt-deepgram.js';
2
- import { OpenaiSttProvider } from './stt-openai.js';
3
/**
 * Create an STT provider based on the voice config.
 *
 * Maintainers: start with `docs/official-docs.md` before changing provider
 * wiring, model defaults, endpoint assumptions, or request parameters here.
 *
 * Currently supported: `deepgram` (Nova-3 General streaming via WebSocket),
 * `openai` (Whisper API via REST).
 * Planned: `whisper` (local Whisper model, Phase 2b).
 *
 * Requires `DISCOCLAW_VOICE_ENABLED=1` and a provider-specific API key
 * (e.g. `DEEPGRAM_API_KEY`, `OPENAI_API_KEY`). See docs/voice.md for setup.
 *
 * @param {object} config - Voice config; reads `sttProvider`, `deepgramApiKey`,
 *   `deepgramSttModel` and `openaiApiKey`.
 * @param {object} log - Logger handed to the created provider.
 * @returns {DeepgramSttProvider | OpenaiSttProvider} A provider fixed at a 16 kHz sample rate.
 * @throws {Error} If the selected provider's API key is missing, the provider
 *   is `whisper` (not implemented), or `sttProvider` is not a known value.
 */
export function createSttProvider(config, log) {
    switch (config.sttProvider) {
        case 'deepgram': {
            if (!config.deepgramApiKey) {
                throw new Error('deepgramApiKey is required when sttProvider is "deepgram"');
            }
            return new DeepgramSttProvider({
                apiKey: config.deepgramApiKey,
                sampleRate: 16000,
                model: config.deepgramSttModel,
                log,
            });
        }
        case 'openai': {
            if (!config.openaiApiKey) {
                throw new Error('openaiApiKey is required when sttProvider is "openai"');
            }
            return new OpenaiSttProvider({
                apiKey: config.openaiApiKey,
                sampleRate: 16000,
                log,
            });
        }
        case 'whisper':
            throw new Error('Whisper STT adapter is not yet implemented (Phase 2b)');
        default:
            // Fail loudly instead of returning undefined, which would only surface
            // later as an opaque TypeError at the first provider method call.
            throw new Error(`Unsupported sttProvider: ${String(config.sttProvider)}`);
    }
}
@@ -1,45 +0,0 @@
1
- import { describe, it, expect, vi } from 'vitest';
2
- import { createSttProvider } from './stt-factory.js';
3
- import { DeepgramSttProvider } from './stt-deepgram.js';
4
- import { OpenaiSttProvider } from './stt-openai.js';
5
// Installs a do-nothing WebSocket on globalThis for the duration of this test file.
// NOTE(review): the original comment says this keeps the DeepgramSttProvider
// constructor from throwing; confirm that is still needed before removing it.
class StubWebSocket {
    onopen = null;

    constructor() {
        // Fire the browser-style `onopen` handler (if one was set) on the next microtask.
        queueMicrotask(() => {
            if (this.onopen != null) {
                this.onopen({ type: 'open' });
            }
        });
    }

    send() { }

    close() { }
}
globalThis.WebSocket = StubWebSocket;
15
/** Builds a logger whose `info`, `warn` and `error` methods are fresh `vi.fn()` mocks. */
function createLogger() {
    const logger = {};
    for (const level of ['info', 'warn', 'error']) {
        logger[level] = vi.fn();
    }
    return logger;
}
18
/**
 * Returns a voice config for the factory tests: enabled, Deepgram STT,
 * Cartesia TTS and a dummy Deepgram key, with `overrides` applied on top.
 */
function baseConfig(overrides = {}) {
    const defaults = {
        enabled: true,
        sttProvider: 'deepgram',
        ttsProvider: 'cartesia',
        deepgramApiKey: 'test-key',
    };
    return Object.assign(defaults, overrides);
}
27
describe('createSttProvider', () => {
    // Runs the factory against the base config with `overrides` applied and a fresh mock logger.
    const build = (overrides) => createSttProvider(baseConfig(overrides), createLogger());

    it('returns a DeepgramSttProvider for deepgram config', () => {
        expect(build()).toBeInstanceOf(DeepgramSttProvider);
    });

    it('throws when deepgramApiKey is missing for deepgram provider', () => {
        const attempt = () => build({ deepgramApiKey: undefined });
        expect(attempt).toThrow('deepgramApiKey is required');
    });

    it('returns an OpenaiSttProvider for openai config', () => {
        const provider = build({ sttProvider: 'openai', openaiApiKey: 'sk-test' });
        expect(provider).toBeInstanceOf(OpenaiSttProvider);
    });

    it('throws when openaiApiKey is missing for openai provider', () => {
        const attempt = () => build({ sttProvider: 'openai' });
        expect(attempt).toThrow('openaiApiKey is required');
    });

    it('throws not-implemented for whisper provider', () => {
        const attempt = () => build({ sttProvider: 'whisper' });
        expect(attempt).toThrow('not yet implemented');
    });
});
@@ -1,156 +0,0 @@
1
const OPENAI_TRANSCRIPTIONS_URL = 'https://api.openai.com/v1/audio/transcriptions';
const DEFAULT_SILENCE_THRESHOLD_MS = 1500;
const WHISPER_MODEL = 'whisper-1';
/**
 * OpenAI Whisper STT adapter.
 *
 * Whisper is a batch API — there is no streaming endpoint. This adapter
 * buffers incoming PCM frames and triggers transcription when silence is
 * detected (no new audio for `silenceThresholdMs`). On transcribe, it
 * constructs a minimal WAV header, POSTs to the OpenAI transcriptions
 * endpoint, and fires the `onTranscription` callback with `isFinal: true`.
 * Keep the multipart request shape aligned with the official OpenAI audio
 * transcription docs.
 */
export class OpenaiSttProvider {
    apiKey;
    sampleRate;
    log;
    silenceThresholdMs;
    fetchFn;
    state = 'idle';
    callback = null;
    audioBuffers = [];
    totalBytes = 0;
    silenceTimer = null;

    /**
     * @param {object} opts
     * @param {string} opts.apiKey - Sent as `Authorization: Bearer <apiKey>`.
     * @param {number} opts.sampleRate - Sample rate written into the WAV header.
     * @param {object} opts.log - Logger exposing `info` and `error`.
     * @param {number} [opts.silenceThresholdMs] - Defaults to 1500 ms.
     * @param {Function} [opts.fetchFn] - Fetch implementation; defaults to `globalThis.fetch`.
     */
    constructor(opts) {
        this.apiKey = opts.apiKey;
        this.sampleRate = opts.sampleRate;
        this.log = opts.log;
        this.silenceThresholdMs = opts.silenceThresholdMs ?? DEFAULT_SILENCE_THRESHOLD_MS;
        this.fetchFn = opts.fetchFn ?? globalThis.fetch;
    }

    /** Begins accepting audio with an empty buffer. No-op while already running. */
    async start() {
        if (this.state === 'running')
            return;
        this.state = 'running';
        this.audioBuffers = [];
        this.totalBytes = 0;
        this.log.info('OpenAI Whisper STT started');
    }

    /**
     * Buffers one frame (`frame.buffer`) and restarts the silence timer.
     *
     * @throws {Error} If the provider is not running.
     */
    feedAudio(frame) {
        if (this.state !== 'running') {
            throw new Error('Cannot feedAudio before start() or after stop()');
        }
        this.audioBuffers.push(frame.buffer);
        this.totalBytes += frame.buffer.length;
        // Reset silence timer on every audio frame
        this.resetSilenceTimer();
    }

    /** Registers the single transcription callback (replaces any previous one). */
    onTranscription(callback) {
        this.callback = callback;
    }

    /** Stops the provider and transcribes any audio still buffered. Idempotent. */
    async stop() {
        if (this.state === 'stopped' || this.state === 'idle')
            return;
        this.state = 'stopped';
        this.clearSilenceTimer();
        // Transcribe any remaining buffered audio
        if (this.totalBytes > 0) {
            await this.transcribeBuffer();
        }
        this.audioBuffers = [];
        this.totalBytes = 0;
    }

    /** (Re)arms the timer that fires after `silenceThresholdMs` without new audio. */
    resetSilenceTimer() {
        this.clearSilenceTimer();
        this.silenceTimer = setTimeout(() => {
            this.onSilenceDetected();
        }, this.silenceThresholdMs);
    }

    /** Cancels the silence timer, if armed. */
    clearSilenceTimer() {
        if (this.silenceTimer !== null) {
            clearTimeout(this.silenceTimer);
            this.silenceTimer = null;
        }
    }

    /** Silence-timer handler: transcribes the buffer if running and non-empty. */
    onSilenceDetected() {
        if (this.state !== 'running' || this.totalBytes === 0)
            return;
        this.transcribeBuffer().catch((err) => {
            this.log.error({ err }, 'OpenAI Whisper transcription failed');
        });
    }

    /**
     * Drains the buffered PCM, wraps it as mono WAV, POSTs it to the
     * transcriptions endpoint and delivers non-empty text to the callback.
     * API and network failures are logged, not thrown.
     */
    async transcribeBuffer() {
        // Take ownership of the buffered audio first so frames fed while the
        // request is in flight start a fresh buffer.
        const pcm = Buffer.concat(this.audioBuffers);
        this.audioBuffers = [];
        this.totalBytes = 0;
        const wav = buildWav(pcm, this.sampleRate, 1);
        this.log.info({ pcmBytes: pcm.length, wavBytes: wav.length }, 'OpenAI Whisper: sending audio for transcription');
        let text;
        try {
            const formData = new FormData();
            formData.append('file', new Blob([wav], { type: 'audio/wav' }), 'audio.wav');
            formData.append('model', WHISPER_MODEL);
            const response = await this.fetchFn(OPENAI_TRANSCRIPTIONS_URL, {
                method: 'POST',
                headers: {
                    Authorization: `Bearer ${this.apiKey}`,
                },
                body: formData,
            });
            if (!response.ok) {
                const body = await response.text();
                this.log.error({ status: response.status, body: body.slice(0, 200) }, 'OpenAI Whisper API error');
                return;
            }
            const data = await response.json();
            // A missing or non-string `text` is treated as an empty transcription.
            text = typeof data?.text === 'string' ? data.text.trim() : '';
        }
        catch (err) {
            this.log.error({ err }, 'OpenAI Whisper transcription request failed');
            return;
        }
        if (text.length === 0) {
            this.log.info('OpenAI Whisper: empty transcription, skipping callback');
            return;
        }
        this.log.info({ text: text.slice(0, 80) }, 'OpenAI Whisper transcription');
        if (!this.callback)
            return;
        // Kept outside the request try/catch so a failing consumer is not
        // misreported as a failed transcription request.
        try {
            this.callback({ text, isFinal: true });
        }
        catch (err) {
            this.log.error({ err }, 'OpenAI Whisper transcription callback threw');
        }
    }
}
125
- // ---------------------------------------------------------------------------
126
- // WAV header construction (PCM s16le mono)
127
- // ---------------------------------------------------------------------------
128
/**
 * Prepends a 44-byte canonical RIFF/WAVE header (PCM, 16 bits per sample) to
 * raw little-endian PCM data.
 *
 * @param {Buffer} pcm - Raw PCM s16le sample data.
 * @param {number} sampleRate - Samples per second.
 * @param {number} channels - Channel count.
 * @returns {Buffer} Header followed by the PCM data.
 */
export function buildWav(pcm, sampleRate, channels) {
    const bytesPerSample = 2; // 16-bit samples
    const header = Buffer.alloc(44);
    let cursor = 0;
    // Each writer advances the cursor so fields are laid out back to back.
    const tag = (fourCc) => {
        header.write(fourCc, cursor);
        cursor += 4;
    };
    const u32 = (value) => {
        cursor = header.writeUInt32LE(value, cursor);
    };
    const u16 = (value) => {
        cursor = header.writeUInt16LE(value, cursor);
    };

    // RIFF chunk descriptor
    tag('RIFF');
    u32(36 + pcm.length); // ChunkSize
    tag('WAVE');

    // fmt sub-chunk
    tag('fmt ');
    u32(16); // Subchunk1Size (PCM)
    u16(1); // AudioFormat (1 = PCM)
    u16(channels);
    u32(sampleRate);
    u32(sampleRate * channels * bytesPerSample); // ByteRate
    u16(channels * bytesPerSample); // BlockAlign
    u16(bytesPerSample * 8); // BitsPerSample

    // data sub-chunk
    tag('data');
    u32(pcm.length);

    return Buffer.concat([header, pcm]);
}