discoclaw 1.2.4 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. package/.context/voice.md +30 -2
  2. package/.env.example +7 -3
  3. package/.env.example.full +13 -32
  4. package/README.md +1 -1
  5. package/dist/cli/dashboard.js +7 -1
  6. package/dist/cli/dashboard.test.js +0 -4
  7. package/dist/cli/init-wizard.js +4 -8
  8. package/dist/cli/init-wizard.test.js +4 -10
  9. package/dist/config.js +5 -38
  10. package/dist/config.test.js +8 -72
  11. package/dist/cron/executor.js +72 -1
  12. package/dist/dashboard/api/metrics.js +7 -0
  13. package/dist/dashboard/api/metrics.test.js +16 -0
  14. package/dist/dashboard/api/traces.js +14 -0
  15. package/dist/dashboard/api/traces.test.js +40 -0
  16. package/dist/dashboard/page.js +187 -8
  17. package/dist/dashboard/server.js +82 -19
  18. package/dist/dashboard/server.test.js +123 -10
  19. package/dist/discord/actions.js +112 -6
  20. package/dist/discord/actions.test.js +117 -1
  21. package/dist/discord/deferred-runner.js +306 -219
  22. package/dist/discord/help-command.js +1 -1
  23. package/dist/discord/message-coordinator.js +4 -36
  24. package/dist/discord/models-command.js +1 -1
  25. package/dist/discord/reaction-handler.js +83 -5
  26. package/dist/discord/reaction-handler.test.js +55 -0
  27. package/dist/discord/verify-push.js +31 -36
  28. package/dist/discord/verify-push.test.js +34 -6
  29. package/dist/discord/voice-command.js +1 -31
  30. package/dist/discord/voice-command.test.js +21 -259
  31. package/dist/discord/voice-status-command.js +3 -22
  32. package/dist/discord/voice-status-command.test.js +16 -124
  33. package/dist/discord-followup.test.js +133 -0
  34. package/dist/health/config-doctor.js +5 -27
  35. package/dist/health/config-doctor.test.js +1 -4
  36. package/dist/index.js +15 -28
  37. package/dist/observability/trace-store.js +56 -0
  38. package/dist/observability/trace-utils.js +31 -0
  39. package/dist/runtime/codex-cli.js +3 -2
  40. package/dist/runtime/codex-cli.test.js +33 -0
  41. package/dist/runtime/model-tiers.js +1 -1
  42. package/dist/runtime/model-tiers.test.js +9 -0
  43. package/dist/runtime/openai-tool-schemas.js +17 -0
  44. package/dist/runtime-overrides.js +2 -3
  45. package/dist/runtime-overrides.test.js +27 -193
  46. package/dist/tasks/store.js +10 -6
  47. package/dist/tasks/store.test.js +44 -0
  48. package/dist/tasks/task-action-executor.test.js +162 -50
  49. package/dist/tasks/task-action-mutations.js +22 -2
  50. package/dist/tasks/task-action-read-ops.js +7 -1
  51. package/dist/tasks/task-action-runner-types.js +19 -1
  52. package/dist/voice/audio-pipeline.js +183 -96
  53. package/dist/voice/audio-receiver.js +8 -0
  54. package/dist/voice/audio-receiver.test.js +16 -0
  55. package/dist/voice/conversation-buffer.js +16 -6
  56. package/dist/voice/providers/gemini-live-provider.js +481 -0
  57. package/dist/voice/providers/gemini-live-provider.test.js +834 -0
  58. package/dist/voice/providers/gemini-live-responder.js +267 -0
  59. package/dist/voice/providers/gemini-live-responder.test.js +615 -0
  60. package/dist/voice/providers/gemini-live-token-estimator.js +100 -0
  61. package/dist/voice/providers/gemini-live-token-estimator.test.js +160 -0
  62. package/dist/voice/providers/gemini-live-types.js +32 -0
  63. package/dist/voice/providers/gemini-tool-mapper.js +91 -0
  64. package/dist/voice/providers/gemini-tool-mapper.test.js +253 -0
  65. package/dist/voice/providers/index.js +3 -0
  66. package/dist/voice/voice-prompt-builder.js +26 -17
  67. package/dist/voice/voice-prompt-builder.test.js +16 -1
  68. package/docs/configuration.md +4 -9
  69. package/docs/official-docs.md +6 -9
  70. package/docs/runtime-switching.md +1 -1
  71. package/package.json +1 -1
  72. package/dist/voice/audio-pipeline.test.js +0 -619
  73. package/dist/voice/stt-deepgram.js +0 -154
  74. package/dist/voice/stt-deepgram.test.js +0 -275
  75. package/dist/voice/stt-factory.js +0 -42
  76. package/dist/voice/stt-factory.test.js +0 -45
  77. package/dist/voice/stt-openai.js +0 -156
  78. package/dist/voice/stt-openai.test.js +0 -281
  79. package/dist/voice/tts-cartesia.js +0 -169
  80. package/dist/voice/tts-cartesia.test.js +0 -228
  81. package/dist/voice/tts-deepgram.js +0 -84
  82. package/dist/voice/tts-deepgram.test.js +0 -220
  83. package/dist/voice/tts-factory.js +0 -52
  84. package/dist/voice/tts-factory.test.js +0 -53
  85. package/dist/voice/tts-openai.js +0 -70
  86. package/dist/voice/tts-openai.test.js +0 -138
  87. package/dist/voice/types.test.js +0 -84
@@ -1,84 +0,0 @@
1
- const DEEPGRAM_SPEECH_URL = 'https://api.deepgram.com/v1/speak';
2
- const DEFAULT_MODEL = 'aura-2-asteria-en';
3
- const DEFAULT_SAMPLE_RATE = 24000;
4
- export const DEEPGRAM_MAX_CHARS = 2000;
5
- /**
6
- * Deepgram Aura TTS adapter.
7
- *
8
- * POSTs to `/v1/speak` requesting `linear16` encoding with `container=none`
9
- * (raw PCM s16le). Streams the response body and yields `AudioFrame` chunks.
10
- */
11
- export class DeepgramTtsProvider {
12
- apiKey;
13
- model;
14
- sampleRate;
15
- speed;
16
- log;
17
- fetchFn;
18
- constructor(opts) {
19
- if (opts.speed !== undefined && (opts.speed < 0.5 || opts.speed > 1.5)) {
20
- throw new RangeError(`DeepgramTtsProvider: speed must be in range [0.5, 1.5], got ${opts.speed}`);
21
- }
22
- this.apiKey = opts.apiKey;
23
- this.model = opts.model ?? DEFAULT_MODEL;
24
- this.sampleRate = opts.sampleRate ?? DEFAULT_SAMPLE_RATE;
25
- this.speed = opts.speed;
26
- this.log = opts.log;
27
- this.fetchFn = opts.fetchFn ?? globalThis.fetch;
28
- }
29
- async *synthesize(text) {
30
- if (!text.trim())
31
- return;
32
- if (text.length > DEEPGRAM_MAX_CHARS) {
33
- const originalLength = text.length;
34
- const slice = text.slice(0, DEEPGRAM_MAX_CHARS);
35
- const sentenceEnd = Math.max(slice.lastIndexOf('. '), slice.lastIndexOf('! '), slice.lastIndexOf('? '), slice.lastIndexOf('.\n'), slice.lastIndexOf('!\n'), slice.lastIndexOf('?\n'));
36
- text = sentenceEnd > 0 ? slice.slice(0, sentenceEnd + 1) : (slice.lastIndexOf(' ') > 0 ? slice.slice(0, slice.lastIndexOf(' ')) : slice);
37
- this.log.warn({ originalLength, truncatedLength: text.length }, 'Deepgram TTS: text truncated to prevent HTTP 413');
38
- }
39
- const params = new URLSearchParams({
40
- model: this.model,
41
- encoding: 'linear16',
42
- sample_rate: String(this.sampleRate),
43
- container: 'none',
44
- });
45
- if (this.speed !== undefined) {
46
- params.set('speed', String(this.speed));
47
- }
48
- const url = `${DEEPGRAM_SPEECH_URL}?${params.toString()}`;
49
- this.log.info({ model: this.model, textLength: text.length }, 'Deepgram TTS: sending synthesis request');
50
- const response = await this.fetchFn(url, {
51
- method: 'POST',
52
- headers: {
53
- Authorization: `Token ${this.apiKey}`,
54
- 'Content-Type': 'application/json',
55
- },
56
- body: JSON.stringify({ text }),
57
- });
58
- if (!response.ok) {
59
- const body = await response.text();
60
- throw new Error(`Deepgram TTS API error: ${response.status} — ${body.slice(0, 200)}`);
61
- }
62
- if (!response.body) {
63
- throw new Error('Deepgram TTS: response has no body stream');
64
- }
65
- const reader = response.body.getReader();
66
- try {
67
- for (;;) {
68
- const { done, value } = await reader.read();
69
- if (done)
70
- break;
71
- if (value && value.byteLength > 0) {
72
- yield {
73
- buffer: Buffer.from(value.buffer, value.byteOffset, value.byteLength),
74
- sampleRate: this.sampleRate,
75
- channels: 1,
76
- };
77
- }
78
- }
79
- }
80
- finally {
81
- reader.releaseLock();
82
- }
83
- }
84
- }
@@ -1,220 +0,0 @@
1
- import { describe, it, expect, vi, beforeEach } from 'vitest';
2
- import { DeepgramTtsProvider, DEEPGRAM_MAX_CHARS } from './tts-deepgram.js';
3
- // ---------------------------------------------------------------------------
4
- // Helpers
5
- // ---------------------------------------------------------------------------
6
- function createLogger() {
7
- return { info: vi.fn(), warn: vi.fn(), error: vi.fn() };
8
- }
9
- /** Build a mock ReadableStream that yields the given byte arrays, then closes. */
10
- function mockStream(chunks) {
11
- let i = 0;
12
- return new ReadableStream({
13
- pull(controller) {
14
- if (i < chunks.length) {
15
- controller.enqueue(chunks[i]);
16
- i++;
17
- }
18
- else {
19
- controller.close();
20
- }
21
- },
22
- });
23
- }
24
- function mockFetch(chunks = [new Uint8Array([1, 2, 3, 4])], ok = true, status = 200) {
25
- return vi.fn().mockResolvedValue({
26
- ok,
27
- status,
28
- body: ok ? mockStream(chunks) : null,
29
- text: async () => 'API error body',
30
- });
31
- }
32
- function makeProvider(overrides = {}) {
33
- return new DeepgramTtsProvider({
34
- apiKey: overrides.apiKey ?? 'test-key',
35
- model: overrides.model,
36
- sampleRate: overrides.sampleRate,
37
- speed: overrides.speed,
38
- log: overrides.log ?? createLogger(),
39
- fetchFn: overrides.fetchFn ?? mockFetch(),
40
- });
41
- }
42
- async function collectFrames(iter) {
43
- const frames = [];
44
- for await (const frame of iter) {
45
- frames.push(frame);
46
- }
47
- return frames;
48
- }
49
- // ---------------------------------------------------------------------------
50
- // Tests
51
- // ---------------------------------------------------------------------------
52
- beforeEach(() => {
53
- vi.clearAllMocks();
54
- });
55
- describe('DeepgramTtsProvider', () => {
56
- it('sends correct API request with default model and linear16 encoding', async () => {
57
- const fetchFn = mockFetch([new Uint8Array([10, 20])]);
58
- const provider = makeProvider({ fetchFn, apiKey: 'dg-my-key' });
59
- await collectFrames(provider.synthesize('hello'));
60
- expect(fetchFn).toHaveBeenCalledTimes(1);
61
- const [url, init] = vi.mocked(fetchFn).mock.calls[0];
62
- expect(url).toContain('https://api.deepgram.com/v1/speak');
63
- expect(url).toContain('encoding=linear16');
64
- expect(url).toContain('sample_rate=24000');
65
- expect(url).toContain('container=none');
66
- expect(url).toContain('model=aura-2-asteria-en');
67
- expect(init.headers.Authorization).toBe('Token dg-my-key');
68
- expect(init.headers['Content-Type']).toBe('application/json');
69
- const body = JSON.parse(init.body);
70
- expect(body.text).toBe('hello');
71
- });
72
- it('uses custom model and sampleRate', async () => {
73
- const fetchFn = mockFetch([new Uint8Array([1])]);
74
- const provider = makeProvider({
75
- fetchFn,
76
- model: 'aura-2-luna-en',
77
- sampleRate: 48000,
78
- });
79
- const frames = await collectFrames(provider.synthesize('test'));
80
- const [url] = vi.mocked(fetchFn).mock.calls[0];
81
- expect(url).toContain('model=aura-2-luna-en');
82
- expect(url).toContain('sample_rate=48000');
83
- expect(frames[0].sampleRate).toBe(48000);
84
- });
85
- it('streams multiple audio frames with correct metadata', async () => {
86
- const chunks = [
87
- new Uint8Array([1, 2, 3]),
88
- new Uint8Array([4, 5, 6]),
89
- new Uint8Array([7, 8, 9]),
90
- ];
91
- const fetchFn = mockFetch(chunks);
92
- const provider = makeProvider({ fetchFn });
93
- const frames = await collectFrames(provider.synthesize('hello world'));
94
- expect(frames).toHaveLength(3);
95
- expect([...frames[0].buffer]).toEqual([1, 2, 3]);
96
- expect([...frames[1].buffer]).toEqual([4, 5, 6]);
97
- expect([...frames[2].buffer]).toEqual([7, 8, 9]);
98
- for (const frame of frames) {
99
- expect(frame.sampleRate).toBe(24000);
100
- expect(frame.channels).toBe(1);
101
- }
102
- });
103
- it('empty text yields no frames and does not call API', async () => {
104
- const fetchFn = mockFetch();
105
- const provider = makeProvider({ fetchFn });
106
- const frames = await collectFrames(provider.synthesize(''));
107
- expect(frames).toHaveLength(0);
108
- const frames2 = await collectFrames(provider.synthesize(' '));
109
- expect(frames2).toHaveLength(0);
110
- expect(fetchFn).not.toHaveBeenCalled();
111
- });
112
- it('throws on non-OK HTTP response', async () => {
113
- const fetchFn = mockFetch([], false, 429);
114
- const provider = makeProvider({ fetchFn });
115
- await expect(collectFrames(provider.synthesize('test'))).rejects.toThrow('Deepgram TTS API error: 429');
116
- });
117
- it('throws when response has no body stream', async () => {
118
- const fetchFn = vi.fn().mockResolvedValue({
119
- ok: true,
120
- status: 200,
121
- body: null,
122
- text: async () => '',
123
- });
124
- const provider = makeProvider({ fetchFn });
125
- await expect(collectFrames(provider.synthesize('test'))).rejects.toThrow('response has no body stream');
126
- });
127
- describe('text truncation', () => {
128
- it('passes through text under the limit unchanged', async () => {
129
- const fetchFn = mockFetch();
130
- const provider = makeProvider({ fetchFn });
131
- const shortText = 'a'.repeat(DEEPGRAM_MAX_CHARS - 1);
132
- await collectFrames(provider.synthesize(shortText));
133
- const [, init] = vi.mocked(fetchFn).mock.calls[0];
134
- expect(JSON.parse(init.body).text).toBe(shortText);
135
- });
136
- it('passes through text exactly at the limit unchanged', async () => {
137
- const fetchFn = mockFetch();
138
- const provider = makeProvider({ fetchFn });
139
- const exactText = 'a'.repeat(DEEPGRAM_MAX_CHARS);
140
- await collectFrames(provider.synthesize(exactText));
141
- const [, init] = vi.mocked(fetchFn).mock.calls[0];
142
- expect(JSON.parse(init.body).text).toBe(exactText);
143
- });
144
- it('truncates text over the limit to at most DEEPGRAM_MAX_CHARS chars', async () => {
145
- const fetchFn = mockFetch();
146
- const provider = makeProvider({ fetchFn });
147
- const longText = 'a'.repeat(DEEPGRAM_MAX_CHARS + 500);
148
- await collectFrames(provider.synthesize(longText));
149
- const [, init] = vi.mocked(fetchFn).mock.calls[0];
150
- const sentText = JSON.parse(init.body).text;
151
- expect(sentText.length).toBeLessThanOrEqual(DEEPGRAM_MAX_CHARS);
152
- });
153
- it('cuts at the last sentence boundary when truncating', async () => {
154
- const fetchFn = mockFetch();
155
- const log = createLogger();
156
- const provider = makeProvider({ fetchFn, log });
157
- // Build text with a sentence boundary well before the limit
158
- const prefix = 'Hello world. ';
159
- const filler = 'x'.repeat(DEEPGRAM_MAX_CHARS - prefix.length + 100);
160
- const longText = prefix + filler;
161
- await collectFrames(provider.synthesize(longText));
162
- const [, init] = vi.mocked(fetchFn).mock.calls[0];
163
- const sentText = JSON.parse(init.body).text;
164
- expect(sentText).toBe('Hello world.');
165
- expect(sentText.length).toBeLessThanOrEqual(DEEPGRAM_MAX_CHARS);
166
- });
167
- it('logs a warning with original and truncated lengths when truncating', async () => {
168
- const fetchFn = mockFetch();
169
- const log = createLogger();
170
- const provider = makeProvider({ fetchFn, log });
171
- const longText = 'a'.repeat(DEEPGRAM_MAX_CHARS + 100);
172
- await collectFrames(provider.synthesize(longText));
173
- expect(log.warn).toHaveBeenCalledTimes(1);
174
- const [meta, msg] = vi.mocked(log.warn).mock.calls[0];
175
- expect(meta.originalLength).toBe(longText.length);
176
- expect(meta.truncatedLength).toBeLessThanOrEqual(DEEPGRAM_MAX_CHARS);
177
- expect(msg).toContain('truncated');
178
- });
179
- it('does not log a warning for text within the limit', async () => {
180
- const fetchFn = mockFetch();
181
- const log = createLogger();
182
- const provider = makeProvider({ fetchFn, log });
183
- await collectFrames(provider.synthesize('short text'));
184
- expect(log.warn).not.toHaveBeenCalled();
185
- });
186
- });
187
- describe('speed parameter', () => {
188
- it('includes speed in the URL when set', async () => {
189
- const fetchFn = mockFetch([new Uint8Array([1])]);
190
- const provider = makeProvider({ fetchFn, speed: 1.2 });
191
- await collectFrames(provider.synthesize('hello'));
192
- const [url] = vi.mocked(fetchFn).mock.calls[0];
193
- expect(url).toContain('speed=1.2');
194
- });
195
- it('omits speed from the URL when not set', async () => {
196
- const fetchFn = mockFetch([new Uint8Array([1])]);
197
- const provider = makeProvider({ fetchFn });
198
- await collectFrames(provider.synthesize('hello'));
199
- const [url] = vi.mocked(fetchFn).mock.calls[0];
200
- expect(url).not.toContain('speed=');
201
- });
202
- it('throws RangeError when speed is below 0.5', () => {
203
- expect(() => makeProvider({ speed: 0.4 })).toThrow(RangeError);
204
- });
205
- it('throws RangeError when speed is above 1.5', () => {
206
- expect(() => makeProvider({ speed: 1.6 })).toThrow(RangeError);
207
- });
208
- });
209
- it('single large chunk yields one frame', async () => {
210
- const big = new Uint8Array(16384);
211
- big.fill(42);
212
- const fetchFn = mockFetch([big]);
213
- const provider = makeProvider({ fetchFn });
214
- const frames = await collectFrames(provider.synthesize('long text'));
215
- expect(frames).toHaveLength(1);
216
- expect(frames[0].buffer.length).toBe(16384);
217
- expect(frames[0].sampleRate).toBe(24000);
218
- expect(frames[0].channels).toBe(1);
219
- });
220
- });
@@ -1,52 +0,0 @@
1
- import { CartesiaTtsProvider } from './tts-cartesia.js';
2
- import { DeepgramTtsProvider } from './tts-deepgram.js';
3
- import { OpenaiTtsProvider } from './tts-openai.js';
4
- /**
5
- * Create a TTS provider based on the voice config.
6
- *
7
- * Maintainers: start with `docs/official-docs.md` before changing provider
8
- * wiring, model defaults, endpoint assumptions, or request parameters here.
9
- *
10
- * Currently supported: `cartesia` (Sonic-3 streaming via WebSocket, 24 kHz PCM),
11
- * `deepgram` (Aura streaming via REST, 24 kHz PCM),
12
- * `openai` (TTS API via REST, 24 kHz PCM).
13
- * Planned: `kokoro` (local Kokoro model, Phase 3b).
14
- *
15
- * Requires `DISCOCLAW_VOICE_ENABLED=1` and a provider-specific API key
16
- * (e.g. `CARTESIA_API_KEY`, `DEEPGRAM_API_KEY`, `OPENAI_API_KEY`). See docs/voice.md for setup.
17
- */
18
- export function createTtsProvider(config, log) {
19
- switch (config.ttsProvider) {
20
- case 'cartesia': {
21
- if (!config.cartesiaApiKey) {
22
- throw new Error('cartesiaApiKey is required when ttsProvider is "cartesia"');
23
- }
24
- return new CartesiaTtsProvider({
25
- apiKey: config.cartesiaApiKey,
26
- log,
27
- });
28
- }
29
- case 'deepgram': {
30
- if (!config.deepgramApiKey) {
31
- throw new Error('deepgramApiKey is required when ttsProvider is "deepgram"');
32
- }
33
- return new DeepgramTtsProvider({
34
- apiKey: config.deepgramApiKey,
35
- model: config.deepgramTtsVoice,
36
- speed: config.deepgramTtsSpeed,
37
- log,
38
- });
39
- }
40
- case 'openai': {
41
- if (!config.openaiApiKey) {
42
- throw new Error('openaiApiKey is required when ttsProvider is "openai"');
43
- }
44
- return new OpenaiTtsProvider({
45
- apiKey: config.openaiApiKey,
46
- log,
47
- });
48
- }
49
- case 'kokoro':
50
- throw new Error('Kokoro TTS adapter is not yet implemented (Phase 3b)');
51
- }
52
- }
@@ -1,53 +0,0 @@
1
- import { describe, it, expect, vi } from 'vitest';
2
- import { createTtsProvider } from './tts-factory.js';
3
- import { CartesiaTtsProvider } from './tts-cartesia.js';
4
- import { DeepgramTtsProvider } from './tts-deepgram.js';
5
- import { OpenaiTtsProvider } from './tts-openai.js';
6
- // Stub globalThis.WebSocket so CartesiaTtsProvider constructor doesn't throw
7
- class StubWebSocket {
8
- onopen = null;
9
- constructor() {
10
- queueMicrotask(() => this.onopen?.({ type: 'open' }));
11
- }
12
- send() { }
13
- close() { }
14
- }
15
- globalThis.WebSocket = StubWebSocket;
16
- function createLogger() {
17
- return { info: vi.fn(), warn: vi.fn(), error: vi.fn() };
18
- }
19
- function baseConfig(overrides = {}) {
20
- return {
21
- enabled: true,
22
- sttProvider: 'deepgram',
23
- ttsProvider: 'cartesia',
24
- cartesiaApiKey: 'test-key',
25
- ...overrides,
26
- };
27
- }
28
- describe('createTtsProvider', () => {
29
- it('returns a CartesiaTtsProvider for cartesia config', () => {
30
- const provider = createTtsProvider(baseConfig(), createLogger());
31
- expect(provider).toBeInstanceOf(CartesiaTtsProvider);
32
- });
33
- it('throws when cartesiaApiKey is missing for cartesia provider', () => {
34
- expect(() => createTtsProvider(baseConfig({ cartesiaApiKey: undefined }), createLogger())).toThrow('cartesiaApiKey is required');
35
- });
36
- it('returns a DeepgramTtsProvider for deepgram config', () => {
37
- const provider = createTtsProvider(baseConfig({ ttsProvider: 'deepgram', deepgramApiKey: 'dg-test' }), createLogger());
38
- expect(provider).toBeInstanceOf(DeepgramTtsProvider);
39
- });
40
- it('throws when deepgramApiKey is missing for deepgram provider', () => {
41
- expect(() => createTtsProvider(baseConfig({ ttsProvider: 'deepgram' }), createLogger())).toThrow('deepgramApiKey is required');
42
- });
43
- it('returns an OpenaiTtsProvider for openai config', () => {
44
- const provider = createTtsProvider(baseConfig({ ttsProvider: 'openai', openaiApiKey: 'sk-test' }), createLogger());
45
- expect(provider).toBeInstanceOf(OpenaiTtsProvider);
46
- });
47
- it('throws when openaiApiKey is missing for openai provider', () => {
48
- expect(() => createTtsProvider(baseConfig({ ttsProvider: 'openai' }), createLogger())).toThrow('openaiApiKey is required');
49
- });
50
- it('throws not-implemented for kokoro provider', () => {
51
- expect(() => createTtsProvider(baseConfig({ ttsProvider: 'kokoro' }), createLogger())).toThrow('not yet implemented');
52
- });
53
- });
@@ -1,70 +0,0 @@
1
- const OPENAI_SPEECH_URL = 'https://api.openai.com/v1/audio/speech';
2
- const DEFAULT_MODEL = 'tts-1';
3
- const DEFAULT_VOICE = 'alloy';
4
- const DEFAULT_SAMPLE_RATE = 24000;
5
- /**
6
- * OpenAI TTS adapter.
7
- *
8
- * POSTs to `/v1/audio/speech` requesting `pcm` format (raw 24 kHz 16-bit mono).
9
- * Streams the response body and yields `AudioFrame` chunks.
10
- * Keep the request body aligned with the official OpenAI audio speech docs.
11
- */
12
- export class OpenaiTtsProvider {
13
- apiKey;
14
- model;
15
- voice;
16
- sampleRate;
17
- log;
18
- fetchFn;
19
- constructor(opts) {
20
- this.apiKey = opts.apiKey;
21
- this.model = opts.model ?? DEFAULT_MODEL;
22
- this.voice = opts.voice ?? DEFAULT_VOICE;
23
- this.sampleRate = opts.sampleRate ?? DEFAULT_SAMPLE_RATE;
24
- this.log = opts.log;
25
- this.fetchFn = opts.fetchFn ?? globalThis.fetch;
26
- }
27
- async *synthesize(text) {
28
- if (!text.trim())
29
- return;
30
- this.log.info({ model: this.model, textLength: text.length }, 'OpenAI TTS: sending synthesis request');
31
- const response = await this.fetchFn(OPENAI_SPEECH_URL, {
32
- method: 'POST',
33
- headers: {
34
- Authorization: `Bearer ${this.apiKey}`,
35
- 'Content-Type': 'application/json',
36
- },
37
- body: JSON.stringify({
38
- model: this.model,
39
- input: text,
40
- voice: this.voice,
41
- response_format: 'pcm',
42
- }),
43
- });
44
- if (!response.ok) {
45
- const body = await response.text();
46
- throw new Error(`OpenAI TTS API error: ${response.status} — ${body.slice(0, 200)}`);
47
- }
48
- if (!response.body) {
49
- throw new Error('OpenAI TTS: response has no body stream');
50
- }
51
- const reader = response.body.getReader();
52
- try {
53
- for (;;) {
54
- const { done, value } = await reader.read();
55
- if (done)
56
- break;
57
- if (value && value.byteLength > 0) {
58
- yield {
59
- buffer: Buffer.from(value.buffer, value.byteOffset, value.byteLength),
60
- sampleRate: this.sampleRate,
61
- channels: 1,
62
- };
63
- }
64
- }
65
- }
66
- finally {
67
- reader.releaseLock();
68
- }
69
- }
70
- }
@@ -1,138 +0,0 @@
1
- import { describe, it, expect, vi, beforeEach } from 'vitest';
2
- import { OpenaiTtsProvider } from './tts-openai.js';
3
- // ---------------------------------------------------------------------------
4
- // Helpers
5
- // ---------------------------------------------------------------------------
6
- function createLogger() {
7
- return { info: vi.fn(), warn: vi.fn(), error: vi.fn() };
8
- }
9
- /** Build a mock ReadableStream that yields the given byte arrays, then closes. */
10
- function mockStream(chunks) {
11
- let i = 0;
12
- return new ReadableStream({
13
- pull(controller) {
14
- if (i < chunks.length) {
15
- controller.enqueue(chunks[i]);
16
- i++;
17
- }
18
- else {
19
- controller.close();
20
- }
21
- },
22
- });
23
- }
24
- function mockFetch(chunks = [new Uint8Array([1, 2, 3, 4])], ok = true, status = 200) {
25
- return vi.fn().mockResolvedValue({
26
- ok,
27
- status,
28
- body: ok ? mockStream(chunks) : null,
29
- text: async () => 'API error body',
30
- });
31
- }
32
- function makeProvider(overrides = {}) {
33
- return new OpenaiTtsProvider({
34
- apiKey: overrides.apiKey ?? 'test-key',
35
- model: overrides.model,
36
- voice: overrides.voice,
37
- sampleRate: overrides.sampleRate,
38
- log: overrides.log ?? createLogger(),
39
- fetchFn: overrides.fetchFn ?? mockFetch(),
40
- });
41
- }
42
- async function collectFrames(iter) {
43
- const frames = [];
44
- for await (const frame of iter) {
45
- frames.push(frame);
46
- }
47
- return frames;
48
- }
49
- // ---------------------------------------------------------------------------
50
- // Tests
51
- // ---------------------------------------------------------------------------
52
- beforeEach(() => {
53
- vi.clearAllMocks();
54
- });
55
- describe('OpenaiTtsProvider', () => {
56
- it('sends correct API request with default model, voice, and pcm format', async () => {
57
- const fetchFn = mockFetch([new Uint8Array([10, 20])]);
58
- const provider = makeProvider({ fetchFn, apiKey: 'sk-my-key' });
59
- await collectFrames(provider.synthesize('hello'));
60
- expect(fetchFn).toHaveBeenCalledTimes(1);
61
- const [url, init] = vi.mocked(fetchFn).mock.calls[0];
62
- expect(url).toBe('https://api.openai.com/v1/audio/speech');
63
- expect(init.headers.Authorization).toBe('Bearer sk-my-key');
64
- expect(init.headers['Content-Type']).toBe('application/json');
65
- const body = JSON.parse(init.body);
66
- expect(body.model).toBe('tts-1');
67
- expect(body.voice).toBe('alloy');
68
- expect(body.input).toBe('hello');
69
- expect(body.response_format).toBe('pcm');
70
- });
71
- it('uses custom model, voice, and sampleRate', async () => {
72
- const fetchFn = mockFetch([new Uint8Array([1])]);
73
- const provider = makeProvider({
74
- fetchFn,
75
- model: 'tts-1-hd',
76
- voice: 'nova',
77
- sampleRate: 48000,
78
- });
79
- const frames = await collectFrames(provider.synthesize('test'));
80
- const body = JSON.parse(vi.mocked(fetchFn).mock.calls[0][1].body);
81
- expect(body.model).toBe('tts-1-hd');
82
- expect(body.voice).toBe('nova');
83
- expect(frames[0].sampleRate).toBe(48000);
84
- });
85
- it('streams multiple audio frames with correct metadata', async () => {
86
- const chunks = [
87
- new Uint8Array([1, 2, 3]),
88
- new Uint8Array([4, 5, 6]),
89
- new Uint8Array([7, 8, 9]),
90
- ];
91
- const fetchFn = mockFetch(chunks);
92
- const provider = makeProvider({ fetchFn });
93
- const frames = await collectFrames(provider.synthesize('hello world'));
94
- expect(frames).toHaveLength(3);
95
- expect([...frames[0].buffer]).toEqual([1, 2, 3]);
96
- expect([...frames[1].buffer]).toEqual([4, 5, 6]);
97
- expect([...frames[2].buffer]).toEqual([7, 8, 9]);
98
- for (const frame of frames) {
99
- expect(frame.sampleRate).toBe(24000);
100
- expect(frame.channels).toBe(1);
101
- }
102
- });
103
- it('empty text yields no frames and does not call API', async () => {
104
- const fetchFn = mockFetch();
105
- const provider = makeProvider({ fetchFn });
106
- const frames = await collectFrames(provider.synthesize(''));
107
- expect(frames).toHaveLength(0);
108
- const frames2 = await collectFrames(provider.synthesize(' '));
109
- expect(frames2).toHaveLength(0);
110
- expect(fetchFn).not.toHaveBeenCalled();
111
- });
112
- it('throws on non-OK HTTP response', async () => {
113
- const fetchFn = mockFetch([], false, 429);
114
- const provider = makeProvider({ fetchFn });
115
- await expect(collectFrames(provider.synthesize('test'))).rejects.toThrow('OpenAI TTS API error: 429');
116
- });
117
- it('throws when response has no body stream', async () => {
118
- const fetchFn = vi.fn().mockResolvedValue({
119
- ok: true,
120
- status: 200,
121
- body: null,
122
- text: async () => '',
123
- });
124
- const provider = makeProvider({ fetchFn });
125
- await expect(collectFrames(provider.synthesize('test'))).rejects.toThrow('response has no body stream');
126
- });
127
- it('single large chunk yields one frame', async () => {
128
- const big = new Uint8Array(16384);
129
- big.fill(42);
130
- const fetchFn = mockFetch([big]);
131
- const provider = makeProvider({ fetchFn });
132
- const frames = await collectFrames(provider.synthesize('long text'));
133
- expect(frames).toHaveLength(1);
134
- expect(frames[0].buffer.length).toBe(16384);
135
- expect(frames[0].sampleRate).toBe(24000);
136
- expect(frames[0].channels).toBe(1);
137
- });
138
- });