discoclaw 1.2.4 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. package/.context/voice.md +30 -2
  2. package/.env.example +7 -3
  3. package/.env.example.full +13 -32
  4. package/README.md +1 -1
  5. package/dist/cli/dashboard.js +7 -1
  6. package/dist/cli/dashboard.test.js +0 -4
  7. package/dist/cli/init-wizard.js +4 -8
  8. package/dist/cli/init-wizard.test.js +4 -10
  9. package/dist/config.js +5 -38
  10. package/dist/config.test.js +8 -72
  11. package/dist/cron/executor.js +72 -1
  12. package/dist/dashboard/api/metrics.js +7 -0
  13. package/dist/dashboard/api/metrics.test.js +16 -0
  14. package/dist/dashboard/api/traces.js +14 -0
  15. package/dist/dashboard/api/traces.test.js +40 -0
  16. package/dist/dashboard/page.js +187 -8
  17. package/dist/dashboard/server.js +82 -19
  18. package/dist/dashboard/server.test.js +123 -10
  19. package/dist/discord/actions.js +112 -6
  20. package/dist/discord/actions.test.js +117 -1
  21. package/dist/discord/deferred-runner.js +306 -219
  22. package/dist/discord/help-command.js +1 -1
  23. package/dist/discord/message-coordinator.js +4 -36
  24. package/dist/discord/models-command.js +1 -1
  25. package/dist/discord/reaction-handler.js +83 -5
  26. package/dist/discord/reaction-handler.test.js +55 -0
  27. package/dist/discord/verify-push.js +31 -36
  28. package/dist/discord/verify-push.test.js +34 -6
  29. package/dist/discord/voice-command.js +1 -31
  30. package/dist/discord/voice-command.test.js +21 -259
  31. package/dist/discord/voice-status-command.js +3 -22
  32. package/dist/discord/voice-status-command.test.js +16 -124
  33. package/dist/discord-followup.test.js +133 -0
  34. package/dist/health/config-doctor.js +5 -27
  35. package/dist/health/config-doctor.test.js +1 -4
  36. package/dist/index.js +15 -28
  37. package/dist/observability/trace-store.js +56 -0
  38. package/dist/observability/trace-utils.js +31 -0
  39. package/dist/runtime/codex-cli.js +3 -2
  40. package/dist/runtime/codex-cli.test.js +33 -0
  41. package/dist/runtime/model-tiers.js +1 -1
  42. package/dist/runtime/model-tiers.test.js +9 -0
  43. package/dist/runtime/openai-tool-schemas.js +17 -0
  44. package/dist/runtime-overrides.js +2 -3
  45. package/dist/runtime-overrides.test.js +27 -193
  46. package/dist/tasks/store.js +10 -6
  47. package/dist/tasks/store.test.js +44 -0
  48. package/dist/tasks/task-action-executor.test.js +162 -50
  49. package/dist/tasks/task-action-mutations.js +22 -2
  50. package/dist/tasks/task-action-read-ops.js +7 -1
  51. package/dist/tasks/task-action-runner-types.js +19 -1
  52. package/dist/voice/audio-pipeline.js +183 -96
  53. package/dist/voice/audio-receiver.js +8 -0
  54. package/dist/voice/audio-receiver.test.js +16 -0
  55. package/dist/voice/conversation-buffer.js +16 -6
  56. package/dist/voice/providers/gemini-live-provider.js +481 -0
  57. package/dist/voice/providers/gemini-live-provider.test.js +834 -0
  58. package/dist/voice/providers/gemini-live-responder.js +267 -0
  59. package/dist/voice/providers/gemini-live-responder.test.js +615 -0
  60. package/dist/voice/providers/gemini-live-token-estimator.js +100 -0
  61. package/dist/voice/providers/gemini-live-token-estimator.test.js +160 -0
  62. package/dist/voice/providers/gemini-live-types.js +32 -0
  63. package/dist/voice/providers/gemini-tool-mapper.js +91 -0
  64. package/dist/voice/providers/gemini-tool-mapper.test.js +253 -0
  65. package/dist/voice/providers/index.js +3 -0
  66. package/dist/voice/voice-prompt-builder.js +26 -17
  67. package/dist/voice/voice-prompt-builder.test.js +16 -1
  68. package/docs/configuration.md +4 -9
  69. package/docs/official-docs.md +6 -9
  70. package/docs/runtime-switching.md +1 -1
  71. package/package.json +1 -1
  72. package/dist/voice/audio-pipeline.test.js +0 -619
  73. package/dist/voice/stt-deepgram.js +0 -154
  74. package/dist/voice/stt-deepgram.test.js +0 -275
  75. package/dist/voice/stt-factory.js +0 -42
  76. package/dist/voice/stt-factory.test.js +0 -45
  77. package/dist/voice/stt-openai.js +0 -156
  78. package/dist/voice/stt-openai.test.js +0 -281
  79. package/dist/voice/tts-cartesia.js +0 -169
  80. package/dist/voice/tts-cartesia.test.js +0 -228
  81. package/dist/voice/tts-deepgram.js +0 -84
  82. package/dist/voice/tts-deepgram.test.js +0 -220
  83. package/dist/voice/tts-factory.js +0 -52
  84. package/dist/voice/tts-factory.test.js +0 -53
  85. package/dist/voice/tts-openai.js +0 -70
  86. package/dist/voice/tts-openai.test.js +0 -138
  87. package/dist/voice/types.test.js +0 -84
@@ -1,281 +0,0 @@
1
- import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
2
- import { OpenaiSttProvider, buildWav } from './stt-openai.js';
3
- // ---------------------------------------------------------------------------
4
- // Helpers
5
- // ---------------------------------------------------------------------------
6
- function createLogger() {
7
- return { info: vi.fn(), warn: vi.fn(), error: vi.fn() };
8
- }
9
- function makeFrame(data = [0, 1, 2, 3]) {
10
- return { buffer: Buffer.from(data), sampleRate: 16000, channels: 1 };
11
- }
12
- function mockFetch(text = 'hello world', ok = true, status = 200) {
13
- return vi.fn().mockResolvedValue({
14
- ok,
15
- status,
16
- json: async () => ({ text }),
17
- text: async () => (ok ? JSON.stringify({ text }) : 'API error'),
18
- });
19
- }
20
- function makeProvider(overrides = {}) {
21
- return new OpenaiSttProvider({
22
- apiKey: overrides.apiKey ?? 'test-key',
23
- sampleRate: overrides.sampleRate ?? 16000,
24
- log: overrides.log ?? createLogger(),
25
- silenceThresholdMs: overrides.silenceThresholdMs ?? 200,
26
- fetchFn: overrides.fetchFn ?? mockFetch(),
27
- });
28
- }
29
- // ---------------------------------------------------------------------------
30
- // Tests
31
- // ---------------------------------------------------------------------------
32
- beforeEach(() => {
33
- vi.useFakeTimers();
34
- });
35
- afterEach(() => {
36
- vi.useRealTimers();
37
- vi.restoreAllMocks();
38
- });
39
- describe('OpenaiSttProvider', () => {
40
- // -- Lifecycle --
41
- it('start transitions to running state', async () => {
42
- const provider = makeProvider();
43
- await provider.start();
44
- // Should not throw when feeding after start
45
- provider.feedAudio(makeFrame());
46
- });
47
- it('double start is idempotent', async () => {
48
- const log = createLogger();
49
- const provider = makeProvider({ log });
50
- await provider.start();
51
- await provider.start();
52
- // info called once for the first start only
53
- expect(vi.mocked(log.info).mock.calls.filter((c) => c[0] === 'OpenAI Whisper STT started')).toHaveLength(1);
54
- });
55
- it('feedAudio before start throws', () => {
56
- const provider = makeProvider();
57
- expect(() => provider.feedAudio(makeFrame())).toThrow('Cannot feedAudio before start() or after stop()');
58
- });
59
- it('feedAudio after stop throws', async () => {
60
- const provider = makeProvider();
61
- await provider.start();
62
- await provider.stop();
63
- expect(() => provider.feedAudio(makeFrame())).toThrow('Cannot feedAudio before start() or after stop()');
64
- });
65
- it('stop is idempotent', async () => {
66
- const provider = makeProvider();
67
- await provider.start();
68
- await provider.stop();
69
- await provider.stop(); // should not throw
70
- });
71
- // -- Silence detection --
72
- it('triggers transcription after silence threshold', async () => {
73
- const fetchFn = mockFetch('hello');
74
- const provider = makeProvider({ fetchFn, silenceThresholdMs: 200 });
75
- const results = [];
76
- provider.onTranscription((r) => results.push(r));
77
- await provider.start();
78
- provider.feedAudio(makeFrame([1, 2, 3, 4]));
79
- // Advance past silence threshold
80
- await vi.advanceTimersByTimeAsync(200);
81
- expect(fetchFn).toHaveBeenCalledTimes(1);
82
- expect(results).toHaveLength(1);
83
- expect(results[0]).toEqual({ text: 'hello', isFinal: true });
84
- });
85
- it('resets silence timer on new audio', async () => {
86
- const fetchFn = mockFetch('hello');
87
- const provider = makeProvider({ fetchFn, silenceThresholdMs: 200 });
88
- await provider.start();
89
- provider.feedAudio(makeFrame());
90
- // Advance partway (150ms < 200ms threshold)
91
- await vi.advanceTimersByTimeAsync(150);
92
- expect(fetchFn).not.toHaveBeenCalled();
93
- // Feed more audio — resets timer
94
- provider.feedAudio(makeFrame());
95
- // Advance another 150ms (total 300ms from start, but only 150ms from last audio)
96
- await vi.advanceTimersByTimeAsync(150);
97
- expect(fetchFn).not.toHaveBeenCalled();
98
- // Advance the remaining 50ms to hit threshold from last audio
99
- await vi.advanceTimersByTimeAsync(50);
100
- expect(fetchFn).toHaveBeenCalledTimes(1);
101
- });
102
- it('does not trigger transcription when buffer is empty', async () => {
103
- const fetchFn = mockFetch();
104
- const provider = makeProvider({ fetchFn, silenceThresholdMs: 100 });
105
- await provider.start();
106
- // Feed then trigger silence so buffer is consumed
107
- provider.feedAudio(makeFrame());
108
- await vi.advanceTimersByTimeAsync(100);
109
- expect(fetchFn).toHaveBeenCalledTimes(1);
110
- // Now wait again — no new audio, so no second call
111
- await vi.advanceTimersByTimeAsync(200);
112
- expect(fetchFn).toHaveBeenCalledTimes(1);
113
- });
114
- // -- Buffer cleanup --
115
- it('clears buffer after transcription', async () => {
116
- const fetchFn = mockFetch('first');
117
- const provider = makeProvider({ fetchFn, silenceThresholdMs: 100 });
118
- const results = [];
119
- provider.onTranscription((r) => results.push(r));
120
- await provider.start();
121
- provider.feedAudio(makeFrame([1, 2]));
122
- await vi.advanceTimersByTimeAsync(100);
123
- expect(results).toHaveLength(1);
124
- // Feed new audio — should only contain the new data
125
- vi.mocked(fetchFn).mockResolvedValue({
126
- ok: true,
127
- status: 200,
128
- json: async () => ({ text: 'second' }),
129
- text: async () => JSON.stringify({ text: 'second' }),
130
- });
131
- provider.feedAudio(makeFrame([3, 4]));
132
- await vi.advanceTimersByTimeAsync(100);
133
- expect(results).toHaveLength(2);
134
- expect(results[1].text).toBe('second');
135
- });
136
- it('stop transcribes remaining buffer', async () => {
137
- const fetchFn = mockFetch('final words');
138
- const provider = makeProvider({ fetchFn, silenceThresholdMs: 5000 });
139
- const results = [];
140
- provider.onTranscription((r) => results.push(r));
141
- await provider.start();
142
- provider.feedAudio(makeFrame([10, 20, 30]));
143
- // Stop before silence threshold — should flush remaining buffer
144
- await provider.stop();
145
- expect(fetchFn).toHaveBeenCalledTimes(1);
146
- expect(results).toHaveLength(1);
147
- expect(results[0].text).toBe('final words');
148
- });
149
- it('stop with empty buffer does not call API', async () => {
150
- const fetchFn = mockFetch();
151
- const provider = makeProvider({ fetchFn });
152
- await provider.start();
153
- await provider.stop();
154
- expect(fetchFn).not.toHaveBeenCalled();
155
- });
156
- // -- API request format --
157
- it('sends correct Authorization header and model', async () => {
158
- const fetchFn = mockFetch('test');
159
- const provider = makeProvider({ fetchFn, apiKey: 'sk-my-key', silenceThresholdMs: 100 });
160
- await provider.start();
161
- provider.feedAudio(makeFrame([1, 2, 3, 4]));
162
- await vi.advanceTimersByTimeAsync(100);
163
- expect(fetchFn).toHaveBeenCalledTimes(1);
164
- const [url, init] = vi.mocked(fetchFn).mock.calls[0];
165
- expect(url).toBe('https://api.openai.com/v1/audio/transcriptions');
166
- expect(init.headers.Authorization).toBe('Bearer sk-my-key');
167
- // Verify FormData contains model field
168
- const body = init.body;
169
- expect(body.get('model')).toBe('whisper-1');
170
- // File should be present
171
- expect(body.get('file')).toBeTruthy();
172
- });
173
- it('sends audio as WAV file in FormData', async () => {
174
- const fetchFn = mockFetch('test');
175
- const provider = makeProvider({ fetchFn, silenceThresholdMs: 100 });
176
- await provider.start();
177
- provider.feedAudio(makeFrame([10, 20, 30, 40]));
178
- await vi.advanceTimersByTimeAsync(100);
179
- const [, init] = vi.mocked(fetchFn).mock.calls[0];
180
- const body = init.body;
181
- const file = body.get('file');
182
- expect(file).toBeInstanceOf(Blob);
183
- expect(file.type).toBe('audio/wav');
184
- // Verify it's a valid WAV (starts with RIFF header)
185
- const arrayBuf = await file.arrayBuffer();
186
- const header = Buffer.from(arrayBuf).subarray(0, 4).toString('ascii');
187
- expect(header).toBe('RIFF');
188
- });
189
- // -- Callback behavior --
190
- it('skips callback for empty transcription', async () => {
191
- const fetchFn = mockFetch('');
192
- const provider = makeProvider({ fetchFn, silenceThresholdMs: 100 });
193
- const results = [];
194
- provider.onTranscription((r) => results.push(r));
195
- await provider.start();
196
- provider.feedAudio(makeFrame());
197
- await vi.advanceTimersByTimeAsync(100);
198
- expect(fetchFn).toHaveBeenCalledTimes(1);
199
- expect(results).toHaveLength(0);
200
- });
201
- it('skips callback for whitespace-only transcription', async () => {
202
- const fetchFn = mockFetch(' ');
203
- const provider = makeProvider({ fetchFn, silenceThresholdMs: 100 });
204
- const results = [];
205
- provider.onTranscription((r) => results.push(r));
206
- await provider.start();
207
- provider.feedAudio(makeFrame());
208
- await vi.advanceTimersByTimeAsync(100);
209
- expect(results).toHaveLength(0);
210
- });
211
- it('fires callback without onTranscription registered (no crash)', async () => {
212
- const fetchFn = mockFetch('hello');
213
- const provider = makeProvider({ fetchFn, silenceThresholdMs: 100 });
214
- // deliberately not calling onTranscription
215
- await provider.start();
216
- provider.feedAudio(makeFrame());
217
- await vi.advanceTimersByTimeAsync(100);
218
- // Should not throw
219
- });
220
- // -- Error handling --
221
- it('logs error on non-OK API response', async () => {
222
- const fetchFn = mockFetch('', false, 401);
223
- const log = createLogger();
224
- const provider = makeProvider({ fetchFn, log, silenceThresholdMs: 100 });
225
- const results = [];
226
- provider.onTranscription((r) => results.push(r));
227
- await provider.start();
228
- provider.feedAudio(makeFrame());
229
- await vi.advanceTimersByTimeAsync(100);
230
- expect(results).toHaveLength(0);
231
- expect(log.error).toHaveBeenCalledWith(expect.objectContaining({ status: 401 }), 'OpenAI Whisper API error');
232
- });
233
- it('logs error on fetch rejection', async () => {
234
- const fetchFn = vi.fn().mockRejectedValue(new Error('network down'));
235
- const log = createLogger();
236
- const provider = makeProvider({ fetchFn, log, silenceThresholdMs: 100 });
237
- const results = [];
238
- provider.onTranscription((r) => results.push(r));
239
- await provider.start();
240
- provider.feedAudio(makeFrame());
241
- await vi.advanceTimersByTimeAsync(100);
242
- expect(results).toHaveLength(0);
243
- expect(log.error).toHaveBeenCalledWith(expect.objectContaining({ err: expect.any(Error) }), 'OpenAI Whisper transcription request failed');
244
- });
245
- });
246
- // ---------------------------------------------------------------------------
247
- // WAV header construction
248
- // ---------------------------------------------------------------------------
249
- describe('buildWav', () => {
250
- it('produces a valid 44-byte header + PCM data', () => {
251
- const pcm = Buffer.from([0x01, 0x02, 0x03, 0x04]);
252
- const wav = buildWav(pcm, 16000, 1);
253
- expect(wav.length).toBe(44 + 4);
254
- // RIFF header
255
- expect(wav.subarray(0, 4).toString('ascii')).toBe('RIFF');
256
- expect(wav.readUInt32LE(4)).toBe(36 + 4); // ChunkSize
257
- expect(wav.subarray(8, 12).toString('ascii')).toBe('WAVE');
258
- // fmt sub-chunk
259
- expect(wav.subarray(12, 16).toString('ascii')).toBe('fmt ');
260
- expect(wav.readUInt32LE(16)).toBe(16); // Subchunk1Size
261
- expect(wav.readUInt16LE(20)).toBe(1); // AudioFormat (PCM)
262
- expect(wav.readUInt16LE(22)).toBe(1); // NumChannels
263
- expect(wav.readUInt32LE(24)).toBe(16000); // SampleRate
264
- expect(wav.readUInt32LE(28)).toBe(32000); // ByteRate (16000 * 1 * 2)
265
- expect(wav.readUInt16LE(32)).toBe(2); // BlockAlign (1 * 2)
266
- expect(wav.readUInt16LE(34)).toBe(16); // BitsPerSample
267
- // data sub-chunk
268
- expect(wav.subarray(36, 40).toString('ascii')).toBe('data');
269
- expect(wav.readUInt32LE(40)).toBe(4); // data size
270
- // PCM data follows
271
- expect(wav.subarray(44)).toEqual(pcm);
272
- });
273
- it('handles stereo at 48kHz', () => {
274
- const pcm = Buffer.alloc(960); // some audio data
275
- const wav = buildWav(pcm, 48000, 2);
276
- expect(wav.readUInt16LE(22)).toBe(2); // NumChannels
277
- expect(wav.readUInt32LE(24)).toBe(48000); // SampleRate
278
- expect(wav.readUInt32LE(28)).toBe(192000); // ByteRate (48000 * 2 * 2)
279
- expect(wav.readUInt16LE(32)).toBe(4); // BlockAlign (2 * 2)
280
- });
281
- });
@@ -1,169 +0,0 @@
1
- import crypto from 'node:crypto';
2
- const CARTESIA_WS_URL = 'wss://api.cartesia.ai/tts/websocket';
3
- const DEFAULT_MODEL_ID = 'sonic-3';
4
- const DEFAULT_SAMPLE_RATE = 24000;
5
- export class CartesiaTtsProvider {
6
- apiKey;
7
- voiceId;
8
- modelId;
9
- sampleRate;
10
- log;
11
- WsCtor;
12
- constructor(opts) {
13
- if (typeof globalThis.WebSocket === 'undefined' && !opts.wsConstructor) {
14
- throw new Error('globalThis.WebSocket is not available. ' +
15
- 'Node 22+ includes WebSocket natively. ' +
16
- 'Upgrade to Node 22+ or pass a wsConstructor option.');
17
- }
18
- this.apiKey = opts.apiKey;
19
- this.voiceId = opts.voiceId ?? 'a0e99841-438c-4a64-b679-ae501e7d6091';
20
- this.modelId = opts.modelId ?? DEFAULT_MODEL_ID;
21
- this.sampleRate = opts.sampleRate ?? DEFAULT_SAMPLE_RATE;
22
- this.log = opts.log;
23
- this.WsCtor = opts.wsConstructor ?? globalThis.WebSocket;
24
- }
25
- async *synthesize(text) {
26
- if (!text.trim())
27
- return;
28
- const url = this.buildUrl();
29
- const ws = new this.WsCtor(url);
30
- let hasYielded = false;
31
- try {
32
- await this.waitForOpen(ws);
33
- this.log.info({ model: this.modelId, textLength: text.length }, 'Cartesia TTS WebSocket connected, sending request');
34
- ws.send(JSON.stringify({
35
- context_id: crypto.randomUUID().replace(/-/g, ''),
36
- model_id: this.modelId,
37
- transcript: text,
38
- voice: { mode: 'id', id: this.voiceId },
39
- output_format: {
40
- container: 'raw',
41
- encoding: 'pcm_s16le',
42
- sample_rate: this.sampleRate,
43
- },
44
- }));
45
- yield* this.receiveFrames(ws, () => {
46
- hasYielded = true;
47
- });
48
- }
49
- catch (err) {
50
- if (hasYielded) {
51
- throw new Error('Cartesia TTS stream disconnected mid-stream', { cause: err });
52
- }
53
- throw err;
54
- }
55
- finally {
56
- if (ws.readyState === WebSocket.OPEN || ws.readyState === WebSocket.CONNECTING) {
57
- ws.close();
58
- }
59
- }
60
- }
61
- buildUrl() {
62
- const params = new URLSearchParams({
63
- api_key: this.apiKey,
64
- cartesia_version: '2024-06-10',
65
- });
66
- return `${CARTESIA_WS_URL}?${params.toString()}`;
67
- }
68
- waitForOpen(ws) {
69
- return new Promise((resolve, reject) => {
70
- ws.onopen = () => resolve();
71
- ws.onerror = (event) => {
72
- this.log.error({ error: event }, 'Cartesia TTS WebSocket error');
73
- };
74
- ws.onclose = (event) => {
75
- reject(new Error(`Cartesia TTS WebSocket closed before open: code=${event.code}`));
76
- };
77
- });
78
- }
79
- receiveFrames(ws, onYield) {
80
- const sampleRate = this.sampleRate;
81
- const log = this.log;
82
- // Buffer for frames received before the consumer pulls them
83
- const pending = [];
84
- let done = false;
85
- let error = null;
86
- let notify = null;
87
- function wake() {
88
- if (notify) {
89
- const fn = notify;
90
- notify = null;
91
- fn();
92
- }
93
- }
94
- ws.onmessage = (event) => {
95
- // Cartesia sends JSON messages with base64-encoded audio in msg.data
96
- try {
97
- const msg = JSON.parse(String(event.data));
98
- // Handle error responses from Cartesia
99
- // status_code 206 = partial content (normal streaming chunk) — only error on 4xx/5xx
100
- if (msg.error || (msg.status_code && msg.status_code >= 400)) {
101
- log.error({ cartesiaError: msg.error, statusCode: msg.status_code }, 'Cartesia TTS error response');
102
- error = new Error(`Cartesia TTS error: ${msg.error ?? `status ${msg.status_code}`}`);
103
- done = true;
104
- wake();
105
- return;
106
- }
107
- if (msg.data) {
108
- pending.push({
109
- buffer: Buffer.from(msg.data, 'base64'),
110
- sampleRate,
111
- channels: 1,
112
- });
113
- wake();
114
- }
115
- if (msg.done) {
116
- done = true;
117
- wake();
118
- }
119
- // Log unrecognized messages that have no data/done/error fields
120
- if (!msg.data && !msg.done && !msg.error && !(msg.status_code && msg.status_code < 400)) {
121
- log.warn({ msgType: msg.type, keys: Object.keys(msg).join(',') }, 'Cartesia TTS: unrecognized message');
122
- }
123
- }
124
- catch {
125
- // Fallback: raw binary frame (future-proofing)
126
- if (event.data instanceof ArrayBuffer) {
127
- pending.push({
128
- buffer: Buffer.from(event.data),
129
- sampleRate,
130
- channels: 1,
131
- });
132
- wake();
133
- }
134
- else {
135
- log.error('Unexpected Cartesia TTS message format');
136
- }
137
- }
138
- };
139
- ws.onclose = (event) => {
140
- if (!done) {
141
- error = new Error(`Cartesia TTS WebSocket closed unexpectedly: code=${event.code}`);
142
- }
143
- done = true;
144
- wake();
145
- };
146
- ws.onerror = (event) => {
147
- log.error({ error: event }, 'Cartesia TTS WebSocket error');
148
- };
149
- async function* generate() {
150
- while (true) {
151
- // Drain pending frames
152
- while (pending.length > 0) {
153
- onYield();
154
- yield pending.shift();
155
- }
156
- if (done) {
157
- if (error)
158
- throw error;
159
- return;
160
- }
161
- // Wait for new data
162
- await new Promise((resolve) => {
163
- notify = resolve;
164
- });
165
- }
166
- }
167
- return generate();
168
- }
169
- }
@@ -1,228 +0,0 @@
1
- import { describe, it, expect, vi, beforeEach } from 'vitest';
2
- import { CartesiaTtsProvider } from './tts-cartesia.js';
3
- class MockWebSocket {
4
- static CONNECTING = 0;
5
- static OPEN = 1;
6
- static CLOSING = 2;
7
- static CLOSED = 3;
8
- url;
9
- readyState = MockWebSocket.OPEN;
10
- onopen = null;
11
- onmessage = null;
12
- onerror = null;
13
- onclose = null;
14
- sent = [];
15
- constructor(url) {
16
- this.url = String(url);
17
- // Auto-open on next microtask so callers can attach handlers
18
- queueMicrotask(() => this.onopen?.({ type: 'open' }));
19
- }
20
- send(data) {
21
- this.sent.push(data);
22
- }
23
- close() {
24
- this.readyState = MockWebSocket.CLOSED;
25
- }
26
- // Test helpers — sends audio as JSON with base64 (matching real Cartesia API)
27
- _receiveAudio(data) {
28
- const b64 = Buffer.from(data).toString('base64');
29
- this.onmessage?.({ data: JSON.stringify({ type: 'chunk', data: b64 }) });
30
- }
31
- _receiveJson(obj) {
32
- this.onmessage?.({ data: JSON.stringify(obj) });
33
- }
34
- _triggerClose(code = 1006) {
35
- this.readyState = MockWebSocket.CLOSED;
36
- this.onclose?.({ code });
37
- }
38
- _triggerError() {
39
- this.onerror?.({ type: 'error' });
40
- }
41
- }
42
- // Make global WebSocket constants available for readyState checks
43
- globalThis.WebSocket = MockWebSocket;
44
- // ---------------------------------------------------------------------------
45
- // Helpers
46
- // ---------------------------------------------------------------------------
47
- function createLogger() {
48
- return { info: vi.fn(), warn: vi.fn(), error: vi.fn() };
49
- }
50
- let lastCreatedWs = null;
51
- function wsFactory(url) {
52
- const ws = new MockWebSocket(url);
53
- lastCreatedWs = ws;
54
- return ws;
55
- }
56
- const WsConstructor = wsFactory;
57
- function makeProvider(overrides = {}) {
58
- return new CartesiaTtsProvider({
59
- apiKey: overrides.apiKey ?? 'test-key',
60
- voiceId: overrides.voiceId,
61
- modelId: overrides.modelId,
62
- sampleRate: overrides.sampleRate,
63
- log: overrides.log ?? createLogger(),
64
- wsConstructor: WsConstructor,
65
- });
66
- }
67
- async function collectFrames(iter) {
68
- const frames = [];
69
- for await (const frame of iter) {
70
- frames.push(frame);
71
- }
72
- return frames;
73
- }
74
- // ---------------------------------------------------------------------------
75
- // Tests
76
- // ---------------------------------------------------------------------------
77
- beforeEach(() => {
78
- vi.clearAllMocks();
79
- lastCreatedWs = null;
80
- });
81
- describe('CartesiaTtsProvider', () => {
82
- it('constructs correct WebSocket URL with auth params', async () => {
83
- const provider = makeProvider({ apiKey: 'my-api-key' });
84
- const iter = provider.synthesize('hello');
85
- // Start consuming to trigger WebSocket creation
86
- const framePromise = collectFrames(iter);
87
- // Wait for microtask to open WS
88
- await new Promise((r) => setTimeout(r, 10));
89
- expect(lastCreatedWs).not.toBeNull();
90
- const url = new URL(lastCreatedWs.url);
91
- expect(url.protocol).toBe('wss:');
92
- expect(url.hostname).toBe('api.cartesia.ai');
93
- expect(url.pathname).toBe('/tts/websocket');
94
- expect(url.searchParams.get('api_key')).toBe('my-api-key');
95
- expect(url.searchParams.get('cartesia_version')).toBe('2024-06-10');
96
- // Verify the synthesis request JSON
97
- expect(lastCreatedWs.sent).toHaveLength(1);
98
- const req = JSON.parse(lastCreatedWs.sent[0]);
99
- expect(req.context_id).toMatch(/^[a-f0-9]{32}$/);
100
- expect(req.model_id).toBe('sonic-3');
101
- expect(req.transcript).toBe('hello');
102
- expect(req.output_format.container).toBe('raw');
103
- expect(req.output_format.encoding).toBe('pcm_s16le');
104
- // End the stream cleanly
105
- lastCreatedWs._receiveJson({ done: true });
106
- await framePromise;
107
- });
108
- it('streams multiple audio frames in correct order', async () => {
109
- const provider = makeProvider();
110
- const iter = provider.synthesize('hello world');
111
- const framePromise = collectFrames(iter);
112
- await new Promise((r) => setTimeout(r, 10));
113
- // Send 3 audio frames
114
- lastCreatedWs._receiveAudio([1, 2, 3]);
115
- lastCreatedWs._receiveAudio([4, 5, 6]);
116
- lastCreatedWs._receiveAudio([7, 8, 9]);
117
- lastCreatedWs._receiveJson({ done: true });
118
- const frames = await framePromise;
119
- expect(frames).toHaveLength(3);
120
- expect([...frames[0].buffer]).toEqual([1, 2, 3]);
121
- expect([...frames[1].buffer]).toEqual([4, 5, 6]);
122
- expect([...frames[2].buffer]).toEqual([7, 8, 9]);
123
- // Verify sample rate and channels on each frame
124
- for (const frame of frames) {
125
- expect(frame.sampleRate).toBe(24000);
126
- expect(frame.channels).toBe(1);
127
- }
128
- });
129
- it('connection failure before any frames throws', async () => {
130
- // Use a factory that triggers close instead of open (no auto-open)
131
- function failWsFactory(url) {
132
- const ws = {
133
- url: String(url),
134
- readyState: 0,
135
- onopen: null,
136
- onmessage: null,
137
- onerror: null,
138
- onclose: null,
139
- sent: [],
140
- send: vi.fn(),
141
- close: vi.fn(),
142
- };
143
- queueMicrotask(() => {
144
- ws.readyState = 3;
145
- ws.onclose?.({ code: 1006 });
146
- });
147
- lastCreatedWs = ws;
148
- return ws;
149
- }
150
- const provider = new CartesiaTtsProvider({
151
- apiKey: 'key',
152
- log: createLogger(),
153
- wsConstructor: failWsFactory,
154
- });
155
- await expect(collectFrames(provider.synthesize('test'))).rejects.toThrow('closed before open');
156
- });
157
- it('mid-stream disconnect throws without retrying', async () => {
158
- const provider = makeProvider();
159
- const iter = provider.synthesize('hello');
160
- const framePromise = collectFrames(iter);
161
- await new Promise((r) => setTimeout(r, 10));
162
- // Yield one frame, then disconnect
163
- lastCreatedWs._receiveAudio([1, 2, 3]);
164
- // Small delay to ensure the frame is consumed
165
- await new Promise((r) => setTimeout(r, 5));
166
- lastCreatedWs._triggerClose(1006);
167
- await expect(framePromise).rejects.toThrow('mid-stream');
168
- });
169
- it('empty text yields no frames', async () => {
170
- const provider = makeProvider();
171
- const frames = await collectFrames(provider.synthesize(''));
172
- expect(frames).toHaveLength(0);
173
- const frames2 = await collectFrames(provider.synthesize(' '));
174
- expect(frames2).toHaveLength(0);
175
- // No WebSocket should have been created
176
- expect(lastCreatedWs).toBeNull();
177
- });
178
- it('cleanup on early iterator break closes socket', async () => {
179
- const provider = makeProvider();
180
- const iter = provider.synthesize('hello')[Symbol.asyncIterator]();
181
- // Start pulling — triggers WS creation + waitForOpen
182
- const nextPromise = iter.next();
183
- // Let the microtask fire to open WS
184
- await new Promise((r) => setTimeout(r, 10));
185
- // Now send audio data so the first next() resolves
186
- lastCreatedWs._receiveAudio([1, 2, 3]);
187
- const first = await nextPromise;
188
- expect(first.done).toBe(false);
189
- expect([...first.value.buffer]).toEqual([1, 2, 3]);
190
- // Break early via return
191
- await iter.return(undefined);
192
- // WebSocket should be closed
193
- expect(lastCreatedWs.readyState).toBe(MockWebSocket.CLOSED);
194
- });
195
- it('constructor throws when globalThis.WebSocket unavailable and no wsConstructor', () => {
196
- const original = globalThis.WebSocket;
197
- try {
198
- // eslint-disable-next-line @typescript-eslint/no-explicit-any
199
- globalThis.WebSocket = undefined;
200
- expect(() => new CartesiaTtsProvider({
201
- apiKey: 'key',
202
- log: createLogger(),
203
- })).toThrow('Node 22+');
204
- }
205
- finally {
206
- globalThis.WebSocket = original;
207
- }
208
- });
209
- it('uses custom voiceId, modelId, and sampleRate', async () => {
210
- const provider = makeProvider({
211
- voiceId: 'custom-voice',
212
- modelId: 'sonic-4',
213
- sampleRate: 48000,
214
- });
215
- const iter = provider.synthesize('test');
216
- const framePromise = collectFrames(iter);
217
- await new Promise((r) => setTimeout(r, 10));
218
- const req = JSON.parse(lastCreatedWs.sent[0]);
219
- expect(req.model_id).toBe('sonic-4');
220
- expect(req.voice.id).toBe('custom-voice');
221
- expect(req.output_format.sample_rate).toBe(48000);
222
- // Send a frame and complete
223
- lastCreatedWs._receiveAudio([10, 20]);
224
- lastCreatedWs._receiveJson({ done: true });
225
- const frames = await framePromise;
226
- expect(frames[0].sampleRate).toBe(48000);
227
- });
228
- });