@sudocode-ai/types 0.1.17 → 0.1.18-dev.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@sudocode-ai/types",
3
- "version": "0.1.17",
3
+ "version": "0.1.18-dev.0",
4
4
  "description": "TypeScript type definitions for sudocode",
5
5
  "types": "src/index.d.ts",
6
6
  "type": "module",
@@ -25,6 +25,10 @@
25
25
  "types": "./src/integrations.d.ts",
26
26
  "default": "./src/integrations.d.ts"
27
27
  },
28
+ "./voice": {
29
+ "types": "./src/voice.d.ts",
30
+ "default": "./src/voice.d.ts"
31
+ },
28
32
  "./schema": {
29
33
  "types": "./dist/schema.d.ts",
30
34
  "default": "./dist/schema.js"
@@ -65,6 +69,7 @@
65
69
  "src/artifacts.d.ts",
66
70
  "src/workflows.d.ts",
67
71
  "src/integrations.d.ts",
72
+ "src/voice.d.ts",
68
73
  "dist"
69
74
  ],
70
75
  "devDependencies": {
package/src/index.d.ts CHANGED
@@ -3,6 +3,7 @@
3
3
  */
4
4
 
5
5
  import type { IntegrationsConfig } from "./integrations.js";
6
+ import type { VoiceSettingsConfig } from "./voice.js";
6
7
 
7
8
  // =============================================================================
8
9
  // Integration Types (External Links)
@@ -297,6 +298,8 @@ export interface Config {
297
298
  integrations?: IntegrationsConfig;
298
299
  /** Editor configuration (optional) */
299
300
  editor?: EditorConfig;
301
+ /** Voice configuration (optional) */
302
+ voice?: VoiceSettingsConfig;
300
303
  }
301
304
 
302
305
  /**
@@ -450,3 +453,37 @@ export type {
450
453
  SearchOptions,
451
454
  SearchResult,
452
455
  } from "./integrations.js";
456
+
457
+ /**
458
+ * Voice types for STT/TTS functionality
459
+ * See voice.d.ts for detailed voice types
460
+ */
461
+ export type {
462
+ // Provider types
463
+ STTProvider,
464
+ TTSProvider,
465
+ // STT types
466
+ TranscriptionResult,
467
+ STTOptions,
468
+ // TTS types
469
+ TTSOptions,
470
+ SynthesizeRequest,
471
+ SynthesizeResponse,
472
+ // Voice input state types
473
+ VoiceInputState,
474
+ VoiceInputErrorCode,
475
+ VoiceInputError,
476
+ // API request/response types
477
+ TranscribeRequest,
478
+ TranscribeResponse,
479
+ // Configuration types
480
+ STTConfig,
481
+ TTSConfig,
482
+ VoiceConfig,
483
+ // Narration event types
484
+ NarrationCategory,
485
+ NarrationPriority,
486
+ VoiceNarrationEvent,
487
+ // User preferences
488
+ VoicePreferences,
489
+ } from "./voice.js";
package/src/voice.d.ts ADDED
@@ -0,0 +1,461 @@
1
+ /**
2
+ * Voice functionality types for sudocode
3
+ * Covers Speech-to-Text (STT) and Text-to-Speech (TTS) capabilities
4
+ */
5
+
6
+ // =============================================================================
7
+ // Provider Types
8
+ // =============================================================================
9
+
10
+ /**
11
+ * Available STT (Speech-to-Text) providers
12
+ */
13
+ export type STTProvider = "whisper-local" | "openai";
14
+
15
+ /**
16
+ * Available TTS (Text-to-Speech) providers
17
+ */
18
+ export type TTSProvider = "browser" | "kokoro" | "openai";
19
+
20
+ // =============================================================================
21
+ // Project Settings (config.json)
22
+ // =============================================================================
23
+
24
+ /**
25
+ * Voice settings stored in .sudocode/config.json
26
+ *
27
+ * These settings configure how voice features work for a project.
28
+ *
29
+ * @example
30
+ * ```json
31
+ * {
32
+ * "voice": {
33
+ * "enabled": true,
34
+ * "stt": {
35
+ * "provider": "whisper-local",
36
+ * "whisperUrl": "http://localhost:2022/v1",
37
+ * "whisperModel": "base"
38
+ * },
39
+ * "tts": {
40
+ * "provider": "browser"
41
+ * }
42
+ * }
43
+ * }
44
+ * ```
45
+ */
46
+ export interface VoiceSettingsConfig {
47
+ /** Whether voice features are enabled (default: true) */
48
+ enabled?: boolean;
49
+ /** Speech-to-text settings */
50
+ stt?: {
51
+ /** Preferred STT provider */
52
+ provider?: STTProvider;
53
+ /** URL for local Whisper server (default: http://localhost:2022/v1) */
54
+ whisperUrl?: string;
55
+ /** Whisper model to use (default: base) */
56
+ whisperModel?: string;
57
+ };
58
+ /** Text-to-speech settings */
59
+ tts?: {
60
+ /** Preferred TTS provider */
61
+ provider?: TTSProvider;
62
+ /** URL for local Kokoro server (default: http://localhost:8880/v1) */
63
+ kokoroUrl?: string;
64
+ /** Default voice for TTS (default: nova) */
65
+ defaultVoice?: string;
66
+ /** Kokoro execution mode: 'browser' for WASM, 'server' for streaming via sidecar */
67
+ kokoroMode?: "browser" | "server";
68
+ };
69
+ /** Narration settings - controls voice narration playback */
70
+ narration?: {
71
+ /** Whether voice narration is enabled (default: false) */
72
+ enabled?: boolean;
73
+ /** Preferred voice name for TTS (default: system default) */
74
+ voice?: string;
75
+ /** Speech rate from 0.5 to 2.0 (default: 1.0) */
76
+ speed?: number;
77
+ /** Volume from 0 to 1 (default: 1.0) */
78
+ volume?: number;
79
+ /** Whether to narrate tool use events like Read, Write, Bash (default: true) */
80
+ narrateToolUse?: boolean;
81
+ /** Whether to narrate tool results/completion (default: false) */
82
+ narrateToolResults?: boolean;
83
+ /** Whether to narrate assistant messages (default: true) */
84
+ narrateAssistantMessages?: boolean;
85
+ };
86
+ }
87
+
88
+ // =============================================================================
89
+ // STT Types
90
+ // =============================================================================
91
+
92
+ /**
93
+ * Result from a speech-to-text transcription
94
+ */
95
+ export interface TranscriptionResult {
96
+ /** The transcribed text */
97
+ text: string;
98
+ /** Confidence score from 0 to 1 (optional, provider-dependent) */
99
+ confidence?: number;
100
+ /** Duration of the audio in milliseconds */
101
+ duration_ms?: number;
102
+ }
103
+
104
+ /**
105
+ * Options for speech-to-text transcription
106
+ */
107
+ export interface STTOptions {
108
+ /** Language code (e.g., "en", "es", "fr") - defaults to "en" */
109
+ language?: string;
110
+ /** Preferred STT provider */
111
+ provider?: STTProvider;
112
+ }
113
+
114
+ // =============================================================================
115
+ // TTS Types
116
+ // =============================================================================
117
+
118
+ /**
119
+ * Options for text-to-speech synthesis
120
+ */
121
+ export interface TTSOptions {
122
+ /** Voice identifier (provider-specific) */
123
+ voice?: string;
124
+ /** Preferred TTS provider */
125
+ provider?: TTSProvider;
126
+ /** Speech rate multiplier (0.5 to 2.0) */
127
+ rate?: number;
128
+ /** Volume level (0 to 1) */
129
+ volume?: number;
130
+ }
131
+
132
+ /**
133
+ * Request for text-to-speech synthesis
134
+ */
135
+ export interface SynthesizeRequest {
136
+ /** Text to synthesize */
137
+ text: string;
138
+ /** Voice identifier */
139
+ voice?: string;
140
+ /** TTS provider to use */
141
+ provider?: TTSProvider;
142
+ }
143
+
144
+ /**
145
+ * Response from text-to-speech synthesis
146
+ * For browser provider: returns text for Web Speech API
147
+ * For kokoro/openai: audio is returned as a stream (audio/mpeg)
148
+ */
149
+ export interface SynthesizeResponse {
150
+ /** Text to speak (for browser provider) */
151
+ text?: string;
152
+ /** Audio content type (for kokoro/openai) */
153
+ contentType?: string;
154
+ }
155
+
156
+ // =============================================================================
157
+ // Voice Input States
158
+ // =============================================================================
159
+
160
+ /**
161
+ * State of the voice input UI
162
+ */
163
+ export type VoiceInputState = "idle" | "recording" | "transcribing" | "error";
164
+
165
+ /**
166
+ * Error codes for voice input failures
167
+ */
168
+ export type VoiceInputErrorCode =
169
+ | "permission_denied"
170
+ | "not_supported"
171
+ | "transcription_failed"
172
+ | "network_error";
173
+
174
+ /**
175
+ * Error object for voice input failures
176
+ */
177
+ export interface VoiceInputError {
178
+ /** Error code for programmatic handling */
179
+ code: VoiceInputErrorCode;
180
+ /** Human-readable error message */
181
+ message: string;
182
+ }
183
+
184
+ // =============================================================================
185
+ // API Request/Response Types
186
+ // =============================================================================
187
+
188
+ /**
189
+ * Request for POST /api/voice/transcribe
190
+ * Note: Actual request is multipart/form-data with audio blob
191
+ */
192
+ export interface TranscribeRequest {
193
+ /** Audio blob (audio/webm, audio/mp3, audio/wav) */
194
+ audio: Blob;
195
+ /** Language code (optional, defaults to "en") */
196
+ language?: string;
197
+ }
198
+
199
+ /**
200
+ * Response from POST /api/voice/transcribe
201
+ */
202
+ export interface TranscribeResponse {
203
+ /** Transcribed text */
204
+ text: string;
205
+ /** Confidence score from 0 to 1 */
206
+ confidence?: number;
207
+ /** Duration of the audio in milliseconds */
208
+ duration_ms?: number;
209
+ }
210
+
211
+ // =============================================================================
212
+ // Voice Configuration
213
+ // =============================================================================
214
+
215
+ /**
216
+ * STT configuration from GET /api/voice/config
217
+ */
218
+ export interface STTConfig {
219
+ /** Available STT providers */
220
+ providers: STTProvider[];
221
+ /** Default provider */
222
+ default: STTProvider;
223
+ /** Whether local Whisper is available */
224
+ whisperAvailable: boolean;
225
+ }
226
+
227
+ /**
228
+ * TTS configuration from GET /api/voice/config
229
+ */
230
+ export interface TTSConfig {
231
+ /** Available TTS providers */
232
+ providers: TTSProvider[];
233
+ /** Default provider */
234
+ default: TTSProvider;
235
+ /** Whether Kokoro is available */
236
+ kokoroAvailable: boolean;
237
+ /** Available voices per provider */
238
+ voices: Record<TTSProvider, string[]>;
239
+ }
240
+
241
+ /**
242
+ * Full voice configuration from GET /api/voice/config
243
+ *
244
+ * Combines runtime capabilities (provider availability) with
245
+ * user settings from config.json.
246
+ */
247
+ export interface VoiceConfig {
248
+ /** Whether voice features are enabled for this project */
249
+ enabled: boolean;
250
+ /** Speech-to-text configuration (runtime capabilities) */
251
+ stt: STTConfig;
252
+ /** Text-to-speech configuration (runtime capabilities) */
253
+ tts: TTSConfig;
254
+ /** User settings from config.json */
255
+ settings: VoiceSettingsConfig;
256
+ }
257
+
258
+ // =============================================================================
259
+ // Voice Narration Events (WebSocket)
260
+ // =============================================================================
261
+
262
+ /**
263
+ * Category of narration content
264
+ */
265
+ export type NarrationCategory = "status" | "progress" | "result" | "error";
266
+
267
+ /**
268
+ * Priority level for narration
269
+ */
270
+ export type NarrationPriority = "low" | "normal" | "high";
271
+
272
+ /**
273
+ * WebSocket event for voice narration
274
+ */
275
+ export interface VoiceNarrationEvent {
276
+ /** Event type identifier */
277
+ type: "voice_narration";
278
+ /** Associated execution ID */
279
+ executionId: string;
280
+ /** Text to be narrated */
281
+ text: string;
282
+ /** Category of the narration */
283
+ category: NarrationCategory;
284
+ /** Priority level for queue ordering */
285
+ priority: NarrationPriority;
286
+ }
287
+
288
+ // =============================================================================
289
+ // User Preferences
290
+ // =============================================================================
291
+
292
+ /**
293
+ * User voice preferences stored in localStorage
294
+ */
295
+ export interface VoicePreferences {
296
+ /** Whether voice narration is enabled */
297
+ narrationEnabled: boolean;
298
+ /** Preferred TTS provider */
299
+ ttsProvider: TTSProviderType;
300
+ /** Preferred voice for TTS */
301
+ ttsVoice: string;
302
+ /** Narration playback speed (0.5 to 2.0) */
303
+ narrationSpeed: number;
304
+ /** Narration volume (0 to 1) */
305
+ narrationVolume: number;
306
+ }
307
+
308
+ // =============================================================================
309
+ // TTS Provider Interface (Service-side)
310
+ // =============================================================================
311
+
312
+ /**
313
+ * TTS provider type identifier
314
+ * Used to distinguish between different provider implementations
315
+ */
316
+ export type TTSProviderType = TTSProvider;
317
+
318
+ /**
319
+ * Options passed to TTS providers for synthesis
320
+ */
321
+ export interface TTSProviderOptions {
322
+ /** Voice identifier (provider-specific) */
323
+ voice?: string;
324
+ /** Speech speed multiplier (0.5 to 2.0, default: 1.0) */
325
+ speed?: number;
326
+ /** Speech pitch multiplier (0.5 to 2.0, default: 1.0) */
327
+ pitch?: number;
328
+ }
329
+
330
+ /**
331
+ * Result from TTS synthesis
332
+ *
333
+ * Different providers return results in different forms:
334
+ * - Server-side TTS (Kokoro, OpenAI): Returns audio buffer
335
+ * - Browser TTS: Returns text for client-side Web Speech API synthesis
336
+ */
337
+ export interface TTSProviderResult {
338
+ /**
339
+ * Audio buffer for server-side TTS providers.
340
+ * Present when audio is synthesized server-side.
341
+ */
342
+ audio?: Buffer;
343
+
344
+ /**
345
+ * MIME type of the audio (e.g., "audio/mpeg", "audio/wav")
346
+ * Present when audio is returned
347
+ */
348
+ mimeType?: string;
349
+
350
+ /**
351
+ * Text to synthesize client-side.
352
+ * Present when using browser TTS (client does actual synthesis).
353
+ */
354
+ text?: string;
355
+
356
+ /**
357
+ * SSML markup for enhanced synthesis.
358
+ * Optional, used for providers that support SSML.
359
+ */
360
+ ssml?: string;
361
+ }
362
+
363
+ /**
364
+ * Information about a TTS voice
365
+ */
366
+ export interface TTSVoice {
367
+ /** Unique voice identifier (provider-specific) */
368
+ id: string;
369
+ /** Human-readable voice name */
370
+ name: string;
371
+ /** Language code (e.g., "en-US", "en-GB", "es-ES") */
372
+ language: string;
373
+ /** Provider that offers this voice */
374
+ provider: TTSProviderType;
375
+ }
376
+
377
+ // =============================================================================
378
+ // Streaming TTS WebSocket Messages
379
+ // =============================================================================
380
+
381
+ /**
382
+ * Client request to start TTS streaming
383
+ *
384
+ * Sent by the client to request text-to-speech synthesis.
385
+ * Server will respond with TTSAudioChunk messages followed by TTSStreamEnd.
386
+ */
387
+ export interface TTSStreamRequest {
388
+ /** Message type identifier */
389
+ type: "tts_request";
390
+ /** Unique request ID for correlating responses */
391
+ request_id: string;
392
+ /** Text to synthesize */
393
+ text: string;
394
+ /** Voice identifier (optional, uses default if not specified) */
395
+ voice?: string;
396
+ /** Speech speed multiplier (0.5 to 2.0, default: 1.0) */
397
+ speed?: number;
398
+ }
399
+
400
+ /**
401
+ * Server response containing an audio chunk
402
+ *
403
+ * Streamed from server to client during TTS synthesis.
404
+ * Audio is base64-encoded PCM (mono, 24kHz, float32).
405
+ */
406
+ export interface TTSAudioChunk {
407
+ /** Message type identifier */
408
+ type: "tts_audio";
409
+ /** Request ID this chunk belongs to */
410
+ request_id: string;
411
+ /** Base64-encoded PCM audio data (mono, 24kHz, float32) */
412
+ chunk: string;
413
+ /** Zero-based index of this chunk in the stream */
414
+ index: number;
415
+ /** Whether this is the final audio chunk */
416
+ is_final: boolean;
417
+ }
418
+
419
+ /**
420
+ * Server notification that TTS streaming has completed
421
+ *
422
+ * Sent after all audio chunks have been transmitted.
423
+ */
424
+ export interface TTSStreamEnd {
425
+ /** Message type identifier */
426
+ type: "tts_end";
427
+ /** Request ID this end message belongs to */
428
+ request_id: string;
429
+ /** Total number of audio chunks sent */
430
+ total_chunks: number;
431
+ /** Total duration of synthesis in milliseconds */
432
+ duration_ms: number;
433
+ }
434
+
435
+ /**
436
+ * Server notification of a TTS error
437
+ *
438
+ * Sent when TTS synthesis fails or encounters an error.
439
+ */
440
+ export interface TTSStreamError {
441
+ /** Message type identifier */
442
+ type: "tts_error";
443
+ /** Request ID this error belongs to */
444
+ request_id: string;
445
+ /** Human-readable error message */
446
+ error: string;
447
+ /** Whether the client can retry the request */
448
+ recoverable: boolean;
449
+ /** Whether the client should fall back to browser TTS */
450
+ fallback: boolean;
451
+ }
452
+
453
+ /**
454
+ * Union type for all TTS client messages
455
+ */
456
+ export type TTSClientMessage = TTSStreamRequest;
457
+
458
+ /**
459
+ * Union type for all TTS server messages
460
+ */
461
+ export type TTSServerMessage = TTSAudioChunk | TTSStreamEnd | TTSStreamError;