@framers/agentos 0.1.110 → 0.1.112
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/api/agency.d.ts.map +1 -1
- package/dist/api/agency.js +38 -2
- package/dist/api/agency.js.map +1 -1
- package/dist/api/agent.js +1 -1
- package/dist/api/agent.js.map +1 -1
- package/dist/api/strategies/debate.d.ts.map +1 -1
- package/dist/api/strategies/debate.js.map +1 -1
- package/dist/api/strategies/graph.d.ts.map +1 -1
- package/dist/api/strategies/graph.js +1 -2
- package/dist/api/strategies/graph.js.map +1 -1
- package/dist/api/strategies/hierarchical.d.ts.map +1 -1
- package/dist/api/strategies/hierarchical.js +1 -2
- package/dist/api/strategies/hierarchical.js.map +1 -1
- package/dist/api/strategies/index.d.ts +1 -9
- package/dist/api/strategies/index.d.ts.map +1 -1
- package/dist/api/strategies/index.js +1 -11
- package/dist/api/strategies/index.js.map +1 -1
- package/dist/api/strategies/parallel.d.ts.map +1 -1
- package/dist/api/strategies/parallel.js +23 -4
- package/dist/api/strategies/parallel.js.map +1 -1
- package/dist/api/strategies/review-loop.d.ts.map +1 -1
- package/dist/api/strategies/review-loop.js.map +1 -1
- package/dist/api/strategies/sequential.d.ts.map +1 -1
- package/dist/api/strategies/sequential.js +1 -2
- package/dist/api/strategies/sequential.js.map +1 -1
- package/dist/api/strategies/shared.d.ts +8 -0
- package/dist/api/strategies/shared.d.ts.map +1 -1
- package/dist/api/strategies/shared.js +10 -1
- package/dist/api/strategies/shared.js.map +1 -1
- package/dist/api/types.d.ts +6 -0
- package/dist/api/types.d.ts.map +1 -1
- package/dist/api/types.js.map +1 -1
- package/dist/memory/AgentMemory.d.ts +2 -1
- package/dist/memory/AgentMemory.d.ts.map +1 -1
- package/dist/memory/AgentMemory.js +1 -1
- package/dist/memory/AgentMemory.js.map +1 -1
- package/dist/memory/CognitiveMemoryManager.d.ts.map +1 -1
- package/dist/memory/CognitiveMemoryManager.js +7 -2
- package/dist/memory/CognitiveMemoryManager.js.map +1 -1
- package/dist/memory/facade/Memory.d.ts.map +1 -1
- package/dist/memory/facade/Memory.js +6 -9
- package/dist/memory/facade/Memory.js.map +1 -1
- package/dist/memory/store/MemoryStore.d.ts +9 -0
- package/dist/memory/store/MemoryStore.d.ts.map +1 -1
- package/dist/memory/store/MemoryStore.js +66 -6
- package/dist/memory/store/MemoryStore.js.map +1 -1
- package/dist/memory/store/SqliteMemoryGraph.d.ts.map +1 -1
- package/dist/memory/store/SqliteMemoryGraph.js +27 -13
- package/dist/memory/store/SqliteMemoryGraph.js.map +1 -1
- package/dist/speech/FallbackProxy.d.ts +194 -41
- package/dist/speech/FallbackProxy.d.ts.map +1 -1
- package/dist/speech/FallbackProxy.js +155 -32
- package/dist/speech/FallbackProxy.js.map +1 -1
- package/dist/speech/SpeechProviderResolver.d.ts +278 -36
- package/dist/speech/SpeechProviderResolver.d.ts.map +1 -1
- package/dist/speech/SpeechProviderResolver.js +306 -40
- package/dist/speech/SpeechProviderResolver.js.map +1 -1
- package/dist/speech/providers/AssemblyAISTTProvider.d.ts +119 -19
- package/dist/speech/providers/AssemblyAISTTProvider.d.ts.map +1 -1
- package/dist/speech/providers/AssemblyAISTTProvider.js +153 -25
- package/dist/speech/providers/AssemblyAISTTProvider.js.map +1 -1
- package/dist/speech/providers/AzureSpeechSTTProvider.d.ts +121 -17
- package/dist/speech/providers/AzureSpeechSTTProvider.d.ts.map +1 -1
- package/dist/speech/providers/AzureSpeechSTTProvider.js +122 -14
- package/dist/speech/providers/AzureSpeechSTTProvider.js.map +1 -1
- package/dist/speech/providers/AzureSpeechTTSProvider.d.ts +130 -15
- package/dist/speech/providers/AzureSpeechTTSProvider.d.ts.map +1 -1
- package/dist/speech/providers/AzureSpeechTTSProvider.js +163 -18
- package/dist/speech/providers/AzureSpeechTTSProvider.js.map +1 -1
- package/dist/speech/providers/BuiltInAdaptiveVadProvider.d.ts +159 -0
- package/dist/speech/providers/BuiltInAdaptiveVadProvider.d.ts.map +1 -1
- package/dist/speech/providers/BuiltInAdaptiveVadProvider.js +119 -0
- package/dist/speech/providers/BuiltInAdaptiveVadProvider.js.map +1 -1
- package/dist/speech/providers/DeepgramBatchSTTProvider.d.ts +102 -16
- package/dist/speech/providers/DeepgramBatchSTTProvider.d.ts.map +1 -1
- package/dist/speech/providers/DeepgramBatchSTTProvider.js +108 -13
- package/dist/speech/providers/DeepgramBatchSTTProvider.js.map +1 -1
- package/dist/speech/providers/ElevenLabsTextToSpeechProvider.d.ts +149 -0
- package/dist/speech/providers/ElevenLabsTextToSpeechProvider.d.ts.map +1 -1
- package/dist/speech/providers/ElevenLabsTextToSpeechProvider.js +137 -2
- package/dist/speech/providers/ElevenLabsTextToSpeechProvider.js.map +1 -1
- package/dist/speech/providers/OpenAITextToSpeechProvider.d.ts +125 -0
- package/dist/speech/providers/OpenAITextToSpeechProvider.d.ts.map +1 -1
- package/dist/speech/providers/OpenAITextToSpeechProvider.js +128 -4
- package/dist/speech/providers/OpenAITextToSpeechProvider.js.map +1 -1
- package/dist/speech/providers/OpenAIWhisperSpeechToTextProvider.d.ts +110 -0
- package/dist/speech/providers/OpenAIWhisperSpeechToTextProvider.d.ts.map +1 -1
- package/dist/speech/providers/OpenAIWhisperSpeechToTextProvider.js +115 -0
- package/dist/speech/providers/OpenAIWhisperSpeechToTextProvider.js.map +1 -1
- package/dist/voice/CallManager.d.ts.map +1 -1
- package/dist/voice/CallManager.js +9 -1
- package/dist/voice/CallManager.js.map +1 -1
- package/dist/voice/MediaStreamParser.d.ts +115 -6
- package/dist/voice/MediaStreamParser.d.ts.map +1 -1
- package/dist/voice/MediaStreamParser.js +44 -0
- package/dist/voice/MediaStreamParser.js.map +1 -1
- package/dist/voice/TelephonyStreamTransport.d.ts +112 -20
- package/dist/voice/TelephonyStreamTransport.d.ts.map +1 -1
- package/dist/voice/TelephonyStreamTransport.js +136 -30
- package/dist/voice/TelephonyStreamTransport.js.map +1 -1
- package/dist/voice/parsers/PlivoMediaStreamParser.d.ts +64 -6
- package/dist/voice/parsers/PlivoMediaStreamParser.d.ts.map +1 -1
- package/dist/voice/parsers/PlivoMediaStreamParser.js +67 -6
- package/dist/voice/parsers/PlivoMediaStreamParser.js.map +1 -1
- package/dist/voice/parsers/TelnyxMediaStreamParser.d.ts +55 -8
- package/dist/voice/parsers/TelnyxMediaStreamParser.d.ts.map +1 -1
- package/dist/voice/parsers/TelnyxMediaStreamParser.js +60 -9
- package/dist/voice/parsers/TelnyxMediaStreamParser.js.map +1 -1
- package/dist/voice/parsers/TwilioMediaStreamParser.d.ts +73 -11
- package/dist/voice/parsers/TwilioMediaStreamParser.d.ts.map +1 -1
- package/dist/voice/parsers/TwilioMediaStreamParser.js +81 -12
- package/dist/voice/parsers/TwilioMediaStreamParser.js.map +1 -1
- package/dist/voice/providers/plivo.d.ts +108 -12
- package/dist/voice/providers/plivo.d.ts.map +1 -1
- package/dist/voice/providers/plivo.js +106 -9
- package/dist/voice/providers/plivo.js.map +1 -1
- package/dist/voice/providers/telnyx.d.ts +110 -20
- package/dist/voice/providers/telnyx.d.ts.map +1 -1
- package/dist/voice/providers/telnyx.js +111 -20
- package/dist/voice/providers/telnyx.js.map +1 -1
- package/dist/voice/providers/twilio.d.ts +91 -13
- package/dist/voice/providers/twilio.d.ts.map +1 -1
- package/dist/voice/providers/twilio.js +94 -14
- package/dist/voice/providers/twilio.js.map +1 -1
- package/dist/voice/twiml.d.ts +70 -12
- package/dist/voice/twiml.d.ts.map +1 -1
- package/dist/voice/twiml.js +70 -12
- package/dist/voice/twiml.js.map +1 -1
- package/dist/voice/types.d.ts +142 -15
- package/dist/voice/types.d.ts.map +1 -1
- package/dist/voice/types.js +34 -3
- package/dist/voice/types.js.map +1 -1
- package/package.json +1 -1
|
@@ -1,20 +1,145 @@
|
|
|
1
1
|
import type { SpeechSynthesisOptions, SpeechSynthesisResult, SpeechVoice, TextToSpeechProvider } from '../types.js';
|
|
2
|
+
/**
|
|
3
|
+
* Configuration for the {@link OpenAITextToSpeechProvider}.
|
|
4
|
+
*
|
|
5
|
+
* @see {@link OpenAITextToSpeechProvider} for usage examples
|
|
6
|
+
* @see https://platform.openai.com/docs/api-reference/audio/createSpeech
|
|
7
|
+
*/
|
|
2
8
|
export interface OpenAITextToSpeechProviderConfig {
|
|
9
|
+
/**
|
|
10
|
+
* OpenAI API key used for authentication.
|
|
11
|
+
* Sent as `Authorization: Bearer <apiKey>`.
|
|
12
|
+
*/
|
|
3
13
|
apiKey: string;
|
|
14
|
+
/**
|
|
15
|
+
* Base URL for the OpenAI API. Override for proxies, Azure OpenAI, or
|
|
16
|
+
* compatible third-party endpoints.
|
|
17
|
+
* @default 'https://api.openai.com/v1'
|
|
18
|
+
*/
|
|
4
19
|
baseUrl?: string;
|
|
20
|
+
/**
|
|
21
|
+
* Default TTS model. `tts-1` is optimized for real-time, `tts-1-hd` for quality.
|
|
22
|
+
* @default 'tts-1'
|
|
23
|
+
*/
|
|
5
24
|
model?: string;
|
|
25
|
+
/**
|
|
26
|
+
* Default voice identifier. See {@link OPENAI_VOICES} for available options.
|
|
27
|
+
* @default 'nova'
|
|
28
|
+
*/
|
|
6
29
|
voice?: string;
|
|
30
|
+
/**
|
|
31
|
+
* Custom fetch implementation for dependency injection in tests.
|
|
32
|
+
* @default globalThis.fetch
|
|
33
|
+
*/
|
|
7
34
|
fetchImpl?: typeof fetch;
|
|
8
35
|
}
|
|
36
|
+
/**
|
|
37
|
+
* Text-to-speech provider that uses the OpenAI TTS API.
|
|
38
|
+
*
|
|
39
|
+
* ## API Contract
|
|
40
|
+
*
|
|
41
|
+
* - **Endpoint:** `POST {baseUrl}/audio/speech`
|
|
42
|
+
* - **Authentication:** `Authorization: Bearer <apiKey>`
|
|
43
|
+
* - **Content-Type:** `application/json`
|
|
44
|
+
* - **Request body:** `{ model, voice, input, response_format, speed }`
|
|
45
|
+
* - **Response:** Raw audio bytes in the requested format
|
|
46
|
+
*
|
|
47
|
+
* ## Models
|
|
48
|
+
*
|
|
49
|
+
* - `tts-1` — Optimized for real-time, lower latency, slightly lower quality
|
|
50
|
+
* - `tts-1-hd` — Higher quality at the cost of additional latency
|
|
51
|
+
*
|
|
52
|
+
* ## Voice Listing
|
|
53
|
+
*
|
|
54
|
+
* OpenAI's voice catalog is static (6 voices), so {@link listAvailableVoices}
|
|
55
|
+
* returns a hardcoded list from {@link OPENAI_VOICES} without making an API call.
|
|
56
|
+
*
|
|
57
|
+
* @see {@link OpenAITextToSpeechProviderConfig} for configuration options
|
|
58
|
+
* @see {@link OpenAIWhisperSpeechToTextProvider} for the corresponding STT provider
|
|
59
|
+
*
|
|
60
|
+
* @example
|
|
61
|
+
* ```ts
|
|
62
|
+
* const provider = new OpenAITextToSpeechProvider({
|
|
63
|
+
* apiKey: process.env.OPENAI_API_KEY!,
|
|
64
|
+
* model: 'tts-1',
|
|
65
|
+
* voice: 'nova',
|
|
66
|
+
* });
|
|
67
|
+
* const result = await provider.synthesize('Hello!', { speed: 1.1 });
|
|
68
|
+
* ```
|
|
69
|
+
*/
|
|
9
70
|
export declare class OpenAITextToSpeechProvider implements TextToSpeechProvider {
|
|
10
71
|
private readonly config;
|
|
72
|
+
/** Unique provider identifier used for registration and resolution. */
|
|
11
73
|
readonly id = "openai-tts";
|
|
74
|
+
/** Human-readable display name for UI and logging. */
|
|
12
75
|
readonly displayName = "OpenAI TTS";
|
|
76
|
+
/**
|
|
77
|
+
* Streaming is supported — the OpenAI API streams audio bytes as they
|
|
78
|
+
* are generated, enabling low-latency playback pipelines.
|
|
79
|
+
*/
|
|
13
80
|
readonly supportsStreaming = true;
|
|
81
|
+
/** Fetch implementation — injected for testability, defaults to global fetch. */
|
|
14
82
|
private readonly fetchImpl;
|
|
83
|
+
/**
|
|
84
|
+
* Creates a new OpenAITextToSpeechProvider.
|
|
85
|
+
*
|
|
86
|
+
* @param config - Provider configuration including API key and optional defaults.
|
|
87
|
+
*
|
|
88
|
+
* @example
|
|
89
|
+
* ```ts
|
|
90
|
+
* const provider = new OpenAITextToSpeechProvider({
|
|
91
|
+
* apiKey: 'sk-xxxx',
|
|
92
|
+
* voice: 'shimmer',
|
|
93
|
+
* });
|
|
94
|
+
* ```
|
|
95
|
+
*/
|
|
15
96
|
constructor(config: OpenAITextToSpeechProviderConfig);
|
|
97
|
+
/**
|
|
98
|
+
* Returns the human-readable provider name.
|
|
99
|
+
*
|
|
100
|
+
* @returns The display name string `'OpenAI TTS'`.
|
|
101
|
+
*
|
|
102
|
+
* @example
|
|
103
|
+
* ```ts
|
|
104
|
+
* provider.getProviderName(); // 'OpenAI TTS'
|
|
105
|
+
* ```
|
|
106
|
+
*/
|
|
16
107
|
getProviderName(): string;
|
|
108
|
+
/**
|
|
109
|
+
* Synthesizes speech from text using the OpenAI TTS API.
|
|
110
|
+
*
|
|
111
|
+
* @param text - The text to convert to audio. Maximum 4096 characters.
|
|
112
|
+
* @param options - Optional synthesis settings including voice, model,
|
|
113
|
+
* output format, and speed (0.25–4.0 range).
|
|
114
|
+
* @returns A promise resolving to the audio buffer and metadata.
|
|
115
|
+
* @throws {Error} When the OpenAI API returns a non-2xx status code.
|
|
116
|
+
* Common causes: invalid API key (401), rate limit (429), text too long (400).
|
|
117
|
+
*
|
|
118
|
+
* @example
|
|
119
|
+
* ```ts
|
|
120
|
+
* const result = await provider.synthesize('Hello world', {
|
|
121
|
+
* voice: 'alloy',
|
|
122
|
+
* speed: 1.2,
|
|
123
|
+
* outputFormat: 'opus',
|
|
124
|
+
* });
|
|
125
|
+
* ```
|
|
126
|
+
*/
|
|
17
127
|
synthesize(text: string, options?: SpeechSynthesisOptions): Promise<SpeechSynthesisResult>;
|
|
128
|
+
/**
|
|
129
|
+
* Returns the static list of available OpenAI TTS voices.
|
|
130
|
+
*
|
|
131
|
+
* Unlike other providers (ElevenLabs, Azure) that require an API call to
|
|
132
|
+
* list voices, OpenAI's voice catalog is fixed and hardcoded. This method
|
|
133
|
+
* returns a shallow copy to prevent external mutation.
|
|
134
|
+
*
|
|
135
|
+
* @returns A promise resolving to the 6 built-in OpenAI voice options.
|
|
136
|
+
*
|
|
137
|
+
* @example
|
|
138
|
+
* ```ts
|
|
139
|
+
* const voices = await provider.listAvailableVoices();
|
|
140
|
+
* const defaultVoice = voices.find(v => v.isDefault); // the voice entry whose id is 'nova'
|
|
141
|
+
* ```
|
|
142
|
+
*/
|
|
18
143
|
listAvailableVoices(): Promise<SpeechVoice[]>;
|
|
19
144
|
}
|
|
20
145
|
//# sourceMappingURL=OpenAITextToSpeechProvider.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"OpenAITextToSpeechProvider.d.ts","sourceRoot":"","sources":["../../../src/speech/providers/OpenAITextToSpeechProvider.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACV,sBAAsB,EACtB,qBAAqB,EACrB,WAAW,EACX,oBAAoB,EACrB,MAAM,aAAa,CAAC;AAErB,MAAM,WAAW,gCAAgC;IAC/C,MAAM,EAAE,MAAM,CAAC;
|
|
1
|
+
{"version":3,"file":"OpenAITextToSpeechProvider.d.ts","sourceRoot":"","sources":["../../../src/speech/providers/OpenAITextToSpeechProvider.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACV,sBAAsB,EACtB,qBAAqB,EACrB,WAAW,EACX,oBAAoB,EACrB,MAAM,aAAa,CAAC;AAErB;;;;;GAKG;AACH,MAAM,WAAW,gCAAgC;IAC/C;;;OAGG;IACH,MAAM,EAAE,MAAM,CAAC;IAEf;;;;OAIG;IACH,OAAO,CAAC,EAAE,MAAM,CAAC;IAEjB;;;OAGG;IACH,KAAK,CAAC,EAAE,MAAM,CAAC;IAEf;;;OAGG;IACH,KAAK,CAAC,EAAE,MAAM,CAAC;IAEf;;;OAGG;IACH,SAAS,CAAC,EAAE,OAAO,KAAK,CAAC;CAC1B;AAqDD;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAiCG;AACH,qBAAa,0BAA2B,YAAW,oBAAoB;IA6BzD,OAAO,CAAC,QAAQ,CAAC,MAAM;IA5BnC,uEAAuE;IACvE,SAAgB,EAAE,gBAAgB;IAElC,sDAAsD;IACtD,SAAgB,WAAW,gBAAgB;IAE3C;;;OAGG;IACH,SAAgB,iBAAiB,QAAQ;IAEzC,iFAAiF;IACjF,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAe;IAEzC;;;;;;;;;;;;OAYG;gBAC0B,MAAM,EAAE,gCAAgC;IAIrE;;;;;;;;;OASG;IACH,eAAe,IAAI,MAAM;IAIzB;;;;;;;;;;;;;;;;;;OAkBG;IACG,UAAU,CACd,IAAI,EAAE,MAAM,EACZ,OAAO,GAAE,sBAA2B,GACnC,OAAO,CAAC,qBAAqB,CAAC;IA2CjC;;;;;;;;;;;;;;OAcG;IACG,mBAAmB,IAAI,OAAO,CAAC,WAAW,EAAE,CAAC;CAIpD"}
|
|
@@ -1,3 +1,12 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Static catalog of built-in OpenAI TTS voices.
|
|
3
|
+
*
|
|
4
|
+
* These voices are available for both `tts-1` and `tts-1-hd` models.
|
|
5
|
+
* `'nova'` is marked as default because it provides a good balance of
|
|
6
|
+
* naturalness and clarity across languages.
|
|
7
|
+
*
|
|
8
|
+
* @see https://platform.openai.com/docs/guides/text-to-speech/voice-options
|
|
9
|
+
*/
|
|
1
10
|
const OPENAI_VOICES = [
|
|
2
11
|
{ id: 'alloy', name: 'Alloy', provider: 'openai-tts', lang: 'various', isDefault: false },
|
|
3
12
|
{ id: 'echo', name: 'Echo', provider: 'openai-tts', lang: 'various', isDefault: false },
|
|
@@ -6,6 +15,22 @@ const OPENAI_VOICES = [
|
|
|
6
15
|
{ id: 'nova', name: 'Nova', provider: 'openai-tts', lang: 'various', isDefault: true },
|
|
7
16
|
{ id: 'shimmer', name: 'Shimmer', provider: 'openai-tts', lang: 'various', isDefault: false },
|
|
8
17
|
];
|
|
18
|
+
/**
|
|
19
|
+
* Maps an OpenAI output format identifier to its corresponding MIME type.
|
|
20
|
+
*
|
|
21
|
+
* OpenAI TTS supports multiple output formats. The default is MP3, which
|
|
22
|
+
* provides good quality at reasonable file sizes. PCM returns raw 24kHz
|
|
23
|
+
* 16-bit little-endian audio (labelled `audio/L16` here, although RFC 2586 defines L16 as big-endian).
|
|
24
|
+
*
|
|
25
|
+
* @param format - The OpenAI output format string (e.g. `'mp3'`, `'opus'`).
|
|
26
|
+
* @returns The corresponding MIME type string.
|
|
27
|
+
*
|
|
28
|
+
* @example
|
|
29
|
+
* ```ts
|
|
30
|
+
* mimeTypeForOutput('opus'); // 'audio/opus'
|
|
31
|
+
* mimeTypeForOutput(undefined); // 'audio/mpeg' (default)
|
|
32
|
+
* ```
|
|
33
|
+
*/
|
|
9
34
|
function mimeTypeForOutput(format) {
|
|
10
35
|
switch (format) {
|
|
11
36
|
case 'opus':
|
|
@@ -17,23 +42,106 @@ function mimeTypeForOutput(format) {
|
|
|
17
42
|
case 'wav':
|
|
18
43
|
return 'audio/wav';
|
|
19
44
|
case 'pcm':
|
|
20
|
-
return 'audio/L16';
|
|
45
|
+
return 'audio/L16'; // Raw 24kHz 16-bit little-endian mono
|
|
21
46
|
default:
|
|
22
|
-
return 'audio/mpeg';
|
|
47
|
+
return 'audio/mpeg'; // MP3 is the default format
|
|
23
48
|
}
|
|
24
49
|
}
|
|
50
|
+
/**
|
|
51
|
+
* Text-to-speech provider that uses the OpenAI TTS API.
|
|
52
|
+
*
|
|
53
|
+
* ## API Contract
|
|
54
|
+
*
|
|
55
|
+
* - **Endpoint:** `POST {baseUrl}/audio/speech`
|
|
56
|
+
* - **Authentication:** `Authorization: Bearer <apiKey>`
|
|
57
|
+
* - **Content-Type:** `application/json`
|
|
58
|
+
* - **Request body:** `{ model, voice, input, response_format, speed }`
|
|
59
|
+
* - **Response:** Raw audio bytes in the requested format
|
|
60
|
+
*
|
|
61
|
+
* ## Models
|
|
62
|
+
*
|
|
63
|
+
* - `tts-1` — Optimized for real-time, lower latency, slightly lower quality
|
|
64
|
+
* - `tts-1-hd` — Higher quality at the cost of additional latency
|
|
65
|
+
*
|
|
66
|
+
* ## Voice Listing
|
|
67
|
+
*
|
|
68
|
+
* OpenAI's voice catalog is static (6 voices), so {@link listAvailableVoices}
|
|
69
|
+
* returns a hardcoded list from {@link OPENAI_VOICES} without making an API call.
|
|
70
|
+
*
|
|
71
|
+
* @see {@link OpenAITextToSpeechProviderConfig} for configuration options
|
|
72
|
+
* @see {@link OpenAIWhisperSpeechToTextProvider} for the corresponding STT provider
|
|
73
|
+
*
|
|
74
|
+
* @example
|
|
75
|
+
* ```ts
|
|
76
|
+
* const provider = new OpenAITextToSpeechProvider({
|
|
77
|
+
* apiKey: process.env.OPENAI_API_KEY!,
|
|
78
|
+
* model: 'tts-1',
|
|
79
|
+
* voice: 'nova',
|
|
80
|
+
* });
|
|
81
|
+
* const result = await provider.synthesize('Hello!', { speed: 1.1 });
|
|
82
|
+
* ```
|
|
83
|
+
*/
|
|
25
84
|
export class OpenAITextToSpeechProvider {
|
|
85
|
+
/**
|
|
86
|
+
* Creates a new OpenAITextToSpeechProvider.
|
|
87
|
+
*
|
|
88
|
+
* @param config - Provider configuration including API key and optional defaults.
|
|
89
|
+
*
|
|
90
|
+
* @example
|
|
91
|
+
* ```ts
|
|
92
|
+
* const provider = new OpenAITextToSpeechProvider({
|
|
93
|
+
* apiKey: 'sk-xxxx',
|
|
94
|
+
* voice: 'shimmer',
|
|
95
|
+
* });
|
|
96
|
+
* ```
|
|
97
|
+
*/
|
|
26
98
|
constructor(config) {
|
|
27
99
|
this.config = config;
|
|
100
|
+
/** Unique provider identifier used for registration and resolution. */
|
|
28
101
|
this.id = 'openai-tts';
|
|
102
|
+
/** Human-readable display name for UI and logging. */
|
|
29
103
|
this.displayName = 'OpenAI TTS';
|
|
104
|
+
/**
|
|
105
|
+
* Streaming is supported — the OpenAI API streams audio bytes as they
|
|
106
|
+
* are generated, enabling low-latency playback pipelines.
|
|
107
|
+
*/
|
|
30
108
|
this.supportsStreaming = true;
|
|
31
109
|
this.fetchImpl = config.fetchImpl ?? fetch;
|
|
32
110
|
}
|
|
111
|
+
/**
|
|
112
|
+
* Returns the human-readable provider name.
|
|
113
|
+
*
|
|
114
|
+
* @returns The display name string `'OpenAI TTS'`.
|
|
115
|
+
*
|
|
116
|
+
* @example
|
|
117
|
+
* ```ts
|
|
118
|
+
* provider.getProviderName(); // 'OpenAI TTS'
|
|
119
|
+
* ```
|
|
120
|
+
*/
|
|
33
121
|
getProviderName() {
|
|
34
122
|
return this.displayName;
|
|
35
123
|
}
|
|
124
|
+
/**
|
|
125
|
+
* Synthesizes speech from text using the OpenAI TTS API.
|
|
126
|
+
*
|
|
127
|
+
* @param text - The text to convert to audio. Maximum 4096 characters.
|
|
128
|
+
* @param options - Optional synthesis settings including voice, model,
|
|
129
|
+
* output format, and speed (0.25–4.0 range).
|
|
130
|
+
* @returns A promise resolving to the audio buffer and metadata.
|
|
131
|
+
* @throws {Error} When the OpenAI API returns a non-2xx status code.
|
|
132
|
+
* Common causes: invalid API key (401), rate limit (429), text too long (400).
|
|
133
|
+
*
|
|
134
|
+
* @example
|
|
135
|
+
* ```ts
|
|
136
|
+
* const result = await provider.synthesize('Hello world', {
|
|
137
|
+
* voice: 'alloy',
|
|
138
|
+
* speed: 1.2,
|
|
139
|
+
* outputFormat: 'opus',
|
|
140
|
+
* });
|
|
141
|
+
* ```
|
|
142
|
+
*/
|
|
36
143
|
async synthesize(text, options = {}) {
|
|
144
|
+
// Resolve model and voice with fallback chain: per-call options > config > defaults (outputFormat has no config-level step)
|
|
37
145
|
const model = options.model ?? this.config.model ?? 'tts-1';
|
|
38
146
|
const voice = options.voice ?? this.config.voice ?? 'nova';
|
|
39
147
|
const outputFormat = options.outputFormat ?? 'mp3';
|
|
@@ -48,7 +156,7 @@ export class OpenAITextToSpeechProvider {
|
|
|
48
156
|
voice,
|
|
49
157
|
input: text,
|
|
50
158
|
response_format: outputFormat,
|
|
51
|
-
speed: options.speed,
|
|
159
|
+
speed: options.speed, // undefined is omitted by JSON.stringify
|
|
52
160
|
}),
|
|
53
161
|
});
|
|
54
162
|
if (!response.ok) {
|
|
@@ -59,7 +167,7 @@ export class OpenAITextToSpeechProvider {
|
|
|
59
167
|
return {
|
|
60
168
|
audioBuffer,
|
|
61
169
|
mimeType: mimeTypeForOutput(outputFormat),
|
|
62
|
-
cost: 0,
|
|
170
|
+
cost: 0, // Cost tracking is handled at a higher layer
|
|
63
171
|
voiceUsed: voice,
|
|
64
172
|
providerName: this.displayName,
|
|
65
173
|
usage: {
|
|
@@ -68,7 +176,23 @@ export class OpenAITextToSpeechProvider {
|
|
|
68
176
|
},
|
|
69
177
|
};
|
|
70
178
|
}
|
|
179
|
+
/**
|
|
180
|
+
* Returns the static list of available OpenAI TTS voices.
|
|
181
|
+
*
|
|
182
|
+
* Unlike other providers (ElevenLabs, Azure) that require an API call to
|
|
183
|
+
* list voices, OpenAI's voice catalog is fixed and hardcoded. This method
|
|
184
|
+
* returns a shallow copy to prevent external mutation.
|
|
185
|
+
*
|
|
186
|
+
* @returns A promise resolving to the 6 built-in OpenAI voice options.
|
|
187
|
+
*
|
|
188
|
+
* @example
|
|
189
|
+
* ```ts
|
|
190
|
+
* const voices = await provider.listAvailableVoices();
|
|
191
|
+
* const defaultVoice = voices.find(v => v.isDefault); // the voice entry whose id is 'nova'
|
|
192
|
+
* ```
|
|
193
|
+
*/
|
|
71
194
|
async listAvailableVoices() {
|
|
195
|
+
// Return a shallow copy to prevent external mutation of the static catalog
|
|
72
196
|
return [...OPENAI_VOICES];
|
|
73
197
|
}
|
|
74
198
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"OpenAITextToSpeechProvider.js","sourceRoot":"","sources":["../../../src/speech/providers/OpenAITextToSpeechProvider.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"OpenAITextToSpeechProvider.js","sourceRoot":"","sources":["../../../src/speech/providers/OpenAITextToSpeechProvider.ts"],"names":[],"mappings":"AA8CA;;;;;;;;GAQG;AACH,MAAM,aAAa,GAA2B;IAC5C,EAAE,EAAE,EAAE,OAAO,EAAE,IAAI,EAAE,OAAO,EAAE,QAAQ,EAAE,YAAY,EAAE,IAAI,EAAE,SAAS,EAAE,SAAS,EAAE,KAAK,EAAE;IACzF,EAAE,EAAE,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,QAAQ,EAAE,YAAY,EAAE,IAAI,EAAE,SAAS,EAAE,SAAS,EAAE,KAAK,EAAE;IACvF,EAAE,EAAE,EAAE,OAAO,EAAE,IAAI,EAAE,OAAO,EAAE,QAAQ,EAAE,YAAY,EAAE,IAAI,EAAE,SAAS,EAAE,SAAS,EAAE,KAAK,EAAE;IACzF,EAAE,EAAE,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,QAAQ,EAAE,YAAY,EAAE,IAAI,EAAE,SAAS,EAAE,SAAS,EAAE,KAAK,EAAE;IACvF,EAAE,EAAE,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,QAAQ,EAAE,YAAY,EAAE,IAAI,EAAE,SAAS,EAAE,SAAS,EAAE,IAAI,EAAE;IACtF,EAAE,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,SAAS,EAAE,QAAQ,EAAE,YAAY,EAAE,IAAI,EAAE,SAAS,EAAE,SAAS,EAAE,KAAK,EAAE;CAC9F,CAAC;AAEF;;;;;;;;;;;;;;;GAeG;AACH,SAAS,iBAAiB,CAAC,MAA0B;IACnD,QAAQ,MAAM,EAAE,CAAC;QACf,KAAK,MAAM;YACT,OAAO,YAAY,CAAC;QACtB,KAAK,KAAK;YACR,OAAO,WAAW,CAAC;QACrB,KAAK,MAAM;YACT,OAAO,YAAY,CAAC;QACtB,KAAK,KAAK;YACR,OAAO,WAAW,CAAC;QACrB,KAAK,KAAK;YACR,OAAO,WAAW,CAAC,CAAC,sCAAsC;QAC5D;YACE,OAAO,YAAY,CAAC,CAAC,4BAA4B;IACrD,CAAC;AACH,CAAC;AAED;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAiCG;AACH,MAAM,OAAO,0BAA0B;IAgBrC;;;;;;;;;;;;OAYG;IACH,YAA6B,MAAwC;QAAxC,WAAM,GAAN,MAAM,CAAkC;QA5BrE,uEAAuE;QACvD,OAAE,GAAG,YAAY,CAAC;QAElC,sDAAsD;QACtC,gBAAW,GAAG,YAAY,CAAC;QAE3C;;;WAGG;QACa,sBAAiB,GAAG,IAAI,CAAC;QAmBvC,IAAI,CAAC,SAAS,GAAG,MAAM,CAAC,SAAS,IAAI,KAAK,CAAC;IAC7C,CAAC;IAED;;;;;;;;;OASG;IACH,eAAe;QACb,OAAO,IAAI,CAAC,WAAW,CAAC;IAC1B,CAAC;IAED;;;;;;;;;;;;;;;;;;OAkBG;IACH,KAAK,CAAC,UAAU,CACd,IAAY,EACZ,UAAkC,EAAE;QAEpC,4EAA4E;QAC5E,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,IAAI,IAAI,CAAC,MAAM,CAAC,KAAK,IAAI,OAAO,CAAC;QAC5D,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,IAAI,IAAI,CAAC,MAAM,CAAC,KAAK,IAAI,MAAM,CAAC;QAC3D,MAAM,YAAY,GAAG,OAAO,CAAC,YAAY,IAAI,KAAK,CAAC;QAEnD,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,SAAS,CACnC,GAAG,IAAI,CAAC,MAAM,CAAC,OAAO,IAAI,2BAA2B,eAAe,EACpE;YA
CE,MAAM,EAAE,MAAM;YACd,OAAO,EAAE;gBACP,aAAa,EAAE,UAAU,IAAI,CAAC,MAAM,CAAC,MAAM,EAAE;gBAC7C,cAAc,EAAE,kBAAkB;aACnC;YACD,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC;gBACnB,KAAK;gBACL,KAAK;gBACL,KAAK,EAAE,IAAI;gBACX,eAAe,EAAE,YAAY;gBAC7B,KAAK,EAAE,OAAO,CAAC,KAAK,EAAE,yCAAyC;aAChE,CAAC;SACH,CACF,CAAC;QAEF,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;YACjB,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;YACtC,MAAM,IAAI,KAAK,CAAC,gCAAgC,QAAQ,CAAC,MAAM,MAAM,OAAO,EAAE,CAAC,CAAC;QAClF,CAAC;QAED,MAAM,WAAW,GAAG,MAAM,CAAC,IAAI,CAAC,MAAM,QAAQ,CAAC,WAAW,EAAE,CAAC,CAAC;QAC9D,OAAO;YACL,WAAW;YACX,QAAQ,EAAE,iBAAiB,CAAC,YAAY,CAAC;YACzC,IAAI,EAAE,CAAC,EAAE,6CAA6C;YACtD,SAAS,EAAE,KAAK;YAChB,YAAY,EAAE,IAAI,CAAC,WAAW;YAC9B,KAAK,EAAE;gBACL,UAAU,EAAE,IAAI,CAAC,MAAM;gBACvB,SAAS,EAAE,KAAK;aACjB;SACF,CAAC;IACJ,CAAC;IAED;;;;;;;;;;;;;;OAcG;IACH,KAAK,CAAC,mBAAmB;QACvB,2EAA2E;QAC3E,OAAO,CAAC,GAAG,aAAa,CAAC,CAAC;IAC5B,CAAC;CACF"}
|
|
@@ -1,18 +1,128 @@
|
|
|
1
1
|
import type { SpeechAudioInput, SpeechToTextProvider, SpeechTranscriptionOptions, SpeechTranscriptionResult } from '../types.js';
|
|
2
|
+
/**
|
|
3
|
+
* Configuration for the {@link OpenAIWhisperSpeechToTextProvider}.
|
|
4
|
+
*
|
|
5
|
+
* @see {@link OpenAIWhisperSpeechToTextProvider} for usage examples
|
|
6
|
+
* @see https://platform.openai.com/docs/api-reference/audio/createTranscription
|
|
7
|
+
*/
|
|
2
8
|
export interface OpenAIWhisperSpeechToTextProviderConfig {
|
|
9
|
+
/**
|
|
10
|
+
* OpenAI API key used for authentication.
|
|
11
|
+
* Sent as `Authorization: Bearer <apiKey>` in the request header.
|
|
12
|
+
*/
|
|
3
13
|
apiKey: string;
|
|
14
|
+
/**
|
|
15
|
+
* Base URL for the OpenAI API. Override for proxies, Azure OpenAI, or
|
|
16
|
+
* compatible third-party endpoints.
|
|
17
|
+
* @default 'https://api.openai.com/v1'
|
|
18
|
+
*/
|
|
4
19
|
baseUrl?: string;
|
|
20
|
+
/**
|
|
21
|
+
* Default Whisper model to use for transcription.
|
|
22
|
+
* @default 'whisper-1'
|
|
23
|
+
*/
|
|
5
24
|
model?: string;
|
|
25
|
+
/**
|
|
26
|
+
* Custom fetch implementation for dependency injection in tests.
|
|
27
|
+
* @default globalThis.fetch
|
|
28
|
+
*/
|
|
6
29
|
fetchImpl?: typeof fetch;
|
|
7
30
|
}
|
|
31
|
+
/**
|
|
32
|
+
* Speech-to-text provider that uses the OpenAI Whisper transcription API.
|
|
33
|
+
*
|
|
34
|
+
* ## API Contract
|
|
35
|
+
*
|
|
36
|
+
* - **Endpoint:** `POST {baseUrl}/audio/transcriptions`
|
|
37
|
+
* - **Authentication:** `Authorization: Bearer <apiKey>`
|
|
38
|
+
* - **Content-Type:** `multipart/form-data` (FormData with file blob)
|
|
39
|
+
* - **Response format:** Controlled by the `response_format` field; defaults
|
|
40
|
+
* to `verbose_json` which includes segments, language detection, and duration.
|
|
41
|
+
*
|
|
42
|
+
* ## Supported Response Formats
|
|
43
|
+
*
|
|
44
|
+
* - `verbose_json` — Full JSON with segments, duration, and language (default)
|
|
45
|
+
* - `json` — Minimal JSON with just the text
|
|
46
|
+
* - `text` — Plain text response (no JSON)
|
|
47
|
+
* - `srt` — SubRip subtitle format
|
|
48
|
+
* - `vtt` — WebVTT subtitle format
|
|
49
|
+
*
|
|
50
|
+
* When `text`, `srt`, or `vtt` format is used, the response is returned as
|
|
51
|
+
* plain text and segments are not available.
|
|
52
|
+
*
|
|
53
|
+
* @see {@link OpenAIWhisperSpeechToTextProviderConfig} for configuration options
|
|
54
|
+
* @see {@link normalizeSegments} for the segment normalization logic
|
|
55
|
+
*
|
|
56
|
+
* @example
|
|
57
|
+
* ```ts
|
|
58
|
+
* const provider = new OpenAIWhisperSpeechToTextProvider({
|
|
59
|
+
* apiKey: process.env.OPENAI_API_KEY!,
|
|
60
|
+
* model: 'whisper-1',
|
|
61
|
+
* });
|
|
62
|
+
* const result = await provider.transcribe(
|
|
63
|
+
* { data: audioBuffer, mimeType: 'audio/wav', fileName: 'recording.wav' },
|
|
64
|
+
* { language: 'en', responseFormat: 'verbose_json' },
|
|
65
|
+
* );
|
|
66
|
+
* ```
|
|
67
|
+
*/
|
|
8
68
|
export declare class OpenAIWhisperSpeechToTextProvider implements SpeechToTextProvider {
|
|
9
69
|
private readonly config;
|
|
70
|
+
/** Unique provider identifier used for registration and resolution. */
|
|
10
71
|
readonly id = "openai-whisper";
|
|
72
|
+
/** Human-readable display name for UI and logging. */
|
|
11
73
|
readonly displayName = "OpenAI Whisper";
|
|
74
|
+
/** Whisper API is batch-only; streaming requires a WebSocket adapter. */
|
|
12
75
|
readonly supportsStreaming = false;
|
|
76
|
+
/** Fetch implementation — injected for testability, defaults to global fetch. */
|
|
13
77
|
private readonly fetchImpl;
|
|
78
|
+
/**
|
|
79
|
+
* Creates a new OpenAIWhisperSpeechToTextProvider.
|
|
80
|
+
*
|
|
81
|
+
* @param config - Provider configuration including API key and optional defaults.
|
|
82
|
+
*
|
|
83
|
+
* @example
|
|
84
|
+
* ```ts
|
|
85
|
+
* const provider = new OpenAIWhisperSpeechToTextProvider({
|
|
86
|
+
* apiKey: 'sk-xxxx',
|
|
87
|
+
* baseUrl: 'https://api.openai.com/v1', // default
|
|
88
|
+
* model: 'whisper-1', // default
|
|
89
|
+
* });
|
|
90
|
+
* ```
|
|
91
|
+
*/
|
|
14
92
|
constructor(config: OpenAIWhisperSpeechToTextProviderConfig);
|
|
93
|
+
/**
|
|
94
|
+
* Returns the human-readable provider name.
|
|
95
|
+
*
|
|
96
|
+
* @returns The display name string `'OpenAI Whisper'`.
|
|
97
|
+
*
|
|
98
|
+
* @example
|
|
99
|
+
* ```ts
|
|
100
|
+
* provider.getProviderName(); // 'OpenAI Whisper'
|
|
101
|
+
* ```
|
|
102
|
+
*/
|
|
15
103
|
getProviderName(): string;
|
|
104
|
+
/**
|
|
105
|
+
* Transcribes an audio buffer using the OpenAI Whisper API.
|
|
106
|
+
*
|
|
107
|
+
* The audio is sent as a multipart form upload with the file, model, and
|
|
108
|
+
* optional parameters (language, prompt, temperature, response_format).
|
|
109
|
+
*
|
|
110
|
+
* @param audio - Raw audio data and metadata. The `data` buffer is wrapped
|
|
111
|
+
* in a Blob and sent as a form file field. If `fileName` is not provided,
|
|
112
|
+
* a default name is generated from the `format` field.
|
|
113
|
+
* @param options - Optional transcription settings including language hint,
|
|
114
|
+
* context prompt, temperature for sampling, and response format.
|
|
115
|
+
* @returns A promise resolving to the normalized transcription result.
|
|
116
|
+
* @throws {Error} When the OpenAI API returns a non-2xx status code.
|
|
117
|
+
*
|
|
118
|
+
* @example
|
|
119
|
+
* ```ts
|
|
120
|
+
* const result = await provider.transcribe(
|
|
121
|
+
* { data: mp3Buffer, mimeType: 'audio/mpeg', fileName: 'voice.mp3' },
|
|
122
|
+
* { language: 'fr', prompt: 'Discussion about AI' },
|
|
123
|
+
* );
|
|
124
|
+
* ```
|
|
125
|
+
*/
|
|
16
126
|
transcribe(audio: SpeechAudioInput, options?: SpeechTranscriptionOptions): Promise<SpeechTranscriptionResult>;
|
|
17
127
|
}
|
|
18
128
|
//# sourceMappingURL=OpenAIWhisperSpeechToTextProvider.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"OpenAIWhisperSpeechToTextProvider.d.ts","sourceRoot":"","sources":["../../../src/speech/providers/OpenAIWhisperSpeechToTextProvider.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACV,gBAAgB,EAEhB,oBAAoB,EACpB,0BAA0B,EAC1B,yBAAyB,EAE1B,MAAM,aAAa,CAAC;AAErB,MAAM,WAAW,uCAAuC;IACtD,MAAM,EAAE,MAAM,CAAC;
|
|
1
|
+
{"version":3,"file":"OpenAIWhisperSpeechToTextProvider.d.ts","sourceRoot":"","sources":["../../../src/speech/providers/OpenAIWhisperSpeechToTextProvider.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACV,gBAAgB,EAEhB,oBAAoB,EACpB,0BAA0B,EAC1B,yBAAyB,EAE1B,MAAM,aAAa,CAAC;AAErB;;;;;GAKG;AACH,MAAM,WAAW,uCAAuC;IACtD;;;OAGG;IACH,MAAM,EAAE,MAAM,CAAC;IAEf;;;;OAIG;IACH,OAAO,CAAC,EAAE,MAAM,CAAC;IAEjB;;;OAGG;IACH,KAAK,CAAC,EAAE,MAAM,CAAC;IAEf;;;OAGG;IACH,SAAS,CAAC,EAAE,OAAO,KAAK,CAAC;CAC1B;AAqED;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAoCG;AACH,qBAAa,iCAAkC,YAAW,oBAAoB;IA2BhE,OAAO,CAAC,QAAQ,CAAC,MAAM;IA1BnC,uEAAuE;IACvE,SAAgB,EAAE,oBAAoB;IAEtC,sDAAsD;IACtD,SAAgB,WAAW,oBAAoB;IAE/C,yEAAyE;IACzE,SAAgB,iBAAiB,SAAS;IAE1C,iFAAiF;IACjF,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAe;IAEzC;;;;;;;;;;;;;OAaG;gBAC0B,MAAM,EAAE,uCAAuC;IAI5E;;;;;;;;;OASG;IACH,eAAe,IAAI,MAAM;IAIzB;;;;;;;;;;;;;;;;;;;;;OAqBG;IACG,UAAU,CACd,KAAK,EAAE,gBAAgB,EACvB,OAAO,GAAE,0BAA+B,GACvC,OAAO,CAAC,yBAAyB,CAAC;CA2EtC"}
|
|
@@ -1,9 +1,29 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Normalizes raw segment data from the OpenAI Whisper `verbose_json` response
|
|
3
|
+
* into strongly-typed {@link SpeechTranscriptionSegment} objects.
|
|
4
|
+
*
|
|
5
|
+
* This function performs defensive runtime type checking on every field because
|
|
6
|
+
* the Whisper API response shape is only partially documented and may include
|
|
7
|
+
* additional or differently-typed fields depending on the model version.
|
|
8
|
+
*
|
|
9
|
+
* The segment fields handled include standard ones (text, start, end, confidence)
|
|
10
|
+
* as well as Whisper-specific fields (id, seek, tokens, temperature, avg_logprob,
|
|
11
|
+
* compression_ratio, no_speech_prob) that are preserved for advanced consumers.
|
|
12
|
+
*
|
|
13
|
+
* @param input - The raw `segments` array from the Whisper JSON response.
|
|
14
|
+
* Expected to be an array of objects, but handles non-array gracefully.
|
|
15
|
+
* @returns An array of normalized segments, or `undefined` if the input
|
|
16
|
+
* is not a valid array.
|
|
17
|
+
*
|
|
18
|
+
* @see {@link SpeechTranscriptionSegment} for the output shape
|
|
19
|
+
*/
|
|
1
20
|
function normalizeSegments(input) {
|
|
2
21
|
if (!Array.isArray(input))
|
|
3
22
|
return undefined;
|
|
4
23
|
return input
|
|
5
24
|
.filter((segment) => typeof segment === 'object' && segment !== null)
|
|
6
25
|
.map((segment) => {
|
|
26
|
+
// Use Record<string, unknown> for safe property access on untyped API data
|
|
7
27
|
const item = segment;
|
|
8
28
|
return {
|
|
9
29
|
text: typeof item.text === 'string' ? item.text : '',
|
|
@@ -13,6 +33,7 @@ function normalizeSegments(input) {
|
|
|
13
33
|
speaker: typeof item.speaker === 'string' || typeof item.speaker === 'number'
|
|
14
34
|
? item.speaker
|
|
15
35
|
: undefined,
|
|
36
|
+
// Normalize nested word-level data with the same defensive approach
|
|
16
37
|
words: Array.isArray(item.words)
|
|
17
38
|
? item.words
|
|
18
39
|
.filter((word) => typeof word === 'object' && word !== null)
|
|
@@ -26,6 +47,7 @@ function normalizeSegments(input) {
|
|
|
26
47
|
};
|
|
27
48
|
})
|
|
28
49
|
: undefined,
|
|
50
|
+
// Whisper-specific metadata fields — preserved for advanced consumers
|
|
29
51
|
id: typeof item.id === 'number' ? item.id : undefined,
|
|
30
52
|
seek: typeof item.seek === 'number' ? item.seek : undefined,
|
|
31
53
|
tokens: Array.isArray(item.tokens)
|
|
@@ -38,25 +60,114 @@ function normalizeSegments(input) {
|
|
|
38
60
|
};
|
|
39
61
|
});
|
|
40
62
|
}
|
|
63
|
+
/**
|
|
64
|
+
* Speech-to-text provider that uses the OpenAI Whisper transcription API.
|
|
65
|
+
*
|
|
66
|
+
* ## API Contract
|
|
67
|
+
*
|
|
68
|
+
* - **Endpoint:** `POST {baseUrl}/audio/transcriptions`
|
|
69
|
+
* - **Authentication:** `Authorization: Bearer <apiKey>`
|
|
70
|
+
* - **Content-Type:** `multipart/form-data` (FormData with file blob)
|
|
71
|
+
* - **Response format:** Controlled by the `response_format` field; defaults
|
|
72
|
+
* to `verbose_json` which includes segments, language detection, and duration.
|
|
73
|
+
*
|
|
74
|
+
* ## Supported Response Formats
|
|
75
|
+
*
|
|
76
|
+
* - `verbose_json` — Full JSON with segments, duration, and language (default)
|
|
77
|
+
* - `json` — Minimal JSON with just the text
|
|
78
|
+
* - `text` — Plain text response (no JSON)
|
|
79
|
+
* - `srt` — SubRip subtitle format
|
|
80
|
+
* - `vtt` — WebVTT subtitle format
|
|
81
|
+
*
|
|
82
|
+
* When `text`, `srt`, or `vtt` format is used, the response is returned as
|
|
83
|
+
* plain text and segments are not available.
|
|
84
|
+
*
|
|
85
|
+
* @see {@link OpenAIWhisperSpeechToTextProviderConfig} for configuration options
|
|
86
|
+
* @see {@link normalizeSegments} for the segment normalization logic
|
|
87
|
+
*
|
|
88
|
+
* @example
|
|
89
|
+
* ```ts
|
|
90
|
+
* const provider = new OpenAIWhisperSpeechToTextProvider({
|
|
91
|
+
* apiKey: process.env.OPENAI_API_KEY!,
|
|
92
|
+
* model: 'whisper-1',
|
|
93
|
+
* });
|
|
94
|
+
* const result = await provider.transcribe(
|
|
95
|
+
* { data: audioBuffer, mimeType: 'audio/wav', fileName: 'recording.wav' },
|
|
96
|
+
* { language: 'en', responseFormat: 'verbose_json' },
|
|
97
|
+
* );
|
|
98
|
+
* ```
|
|
99
|
+
*/
|
|
41
100
|
export class OpenAIWhisperSpeechToTextProvider {
|
|
101
|
+
/**
|
|
102
|
+
* Creates a new OpenAIWhisperSpeechToTextProvider.
|
|
103
|
+
*
|
|
104
|
+
* @param config - Provider configuration including API key and optional defaults.
|
|
105
|
+
*
|
|
106
|
+
* @example
|
|
107
|
+
* ```ts
|
|
108
|
+
* const provider = new OpenAIWhisperSpeechToTextProvider({
|
|
109
|
+
* apiKey: 'sk-xxxx',
|
|
110
|
+
* baseUrl: 'https://api.openai.com/v1', // default
|
|
111
|
+
* model: 'whisper-1', // default
|
|
112
|
+
* });
|
|
113
|
+
* ```
|
|
114
|
+
*/
|
|
42
115
|
constructor(config) {
|
|
43
116
|
this.config = config;
|
|
117
|
+
/** Unique provider identifier used for registration and resolution. */
|
|
44
118
|
this.id = 'openai-whisper';
|
|
119
|
+
/** Human-readable display name for UI and logging. */
|
|
45
120
|
this.displayName = 'OpenAI Whisper';
|
|
121
|
+
/** Whisper API is batch-only; streaming requires a WebSocket adapter. */
|
|
46
122
|
this.supportsStreaming = false;
|
|
47
123
|
this.fetchImpl = config.fetchImpl ?? fetch;
|
|
48
124
|
}
|
|
125
|
+
/**
|
|
126
|
+
* Returns the human-readable provider name.
|
|
127
|
+
*
|
|
128
|
+
* @returns The display name string `'OpenAI Whisper'`.
|
|
129
|
+
*
|
|
130
|
+
* @example
|
|
131
|
+
* ```ts
|
|
132
|
+
* provider.getProviderName(); // 'OpenAI Whisper'
|
|
133
|
+
* ```
|
|
134
|
+
*/
|
|
49
135
|
getProviderName() {
|
|
50
136
|
return this.displayName;
|
|
51
137
|
}
|
|
138
|
+
/**
|
|
139
|
+
* Transcribes an audio buffer using the OpenAI Whisper API.
|
|
140
|
+
*
|
|
141
|
+
* The audio is sent as a multipart form upload with the file, model, and
|
|
142
|
+
* optional parameters (language, prompt, temperature, response_format).
|
|
143
|
+
*
|
|
144
|
+
* @param audio - Raw audio data and metadata. The `data` buffer is wrapped
|
|
145
|
+
* in a Blob and sent as a form file field. If `fileName` is not provided,
|
|
146
|
+
* a default name is generated from the `format` field.
|
|
147
|
+
* @param options - Optional transcription settings including language hint,
|
|
148
|
+
* context prompt, temperature for sampling, and response format.
|
|
149
|
+
* @returns A promise resolving to the normalized transcription result.
|
|
150
|
+
* @throws {Error} When the OpenAI API returns a non-2xx status code.
|
|
151
|
+
*
|
|
152
|
+
* @example
|
|
153
|
+
* ```ts
|
|
154
|
+
* const result = await provider.transcribe(
|
|
155
|
+
* { data: mp3Buffer, mimeType: 'audio/mpeg', fileName: 'voice.mp3' },
|
|
156
|
+
* { language: 'fr', prompt: 'Discussion about AI' },
|
|
157
|
+
* );
|
|
158
|
+
* ```
|
|
159
|
+
*/
|
|
52
160
|
async transcribe(audio, options = {}) {
|
|
53
161
|
const form = new FormData();
|
|
54
162
|
const responseFormat = (options.responseFormat ?? 'verbose_json');
|
|
55
163
|
const model = options.model ?? this.config.model ?? 'whisper-1';
|
|
164
|
+
// Generate a filename with the correct extension for Whisper's format detection
|
|
56
165
|
const fileName = audio.fileName ?? `speech.${audio.format ?? 'wav'}`;
|
|
166
|
+
// Build the multipart form payload — Whisper requires a file upload
|
|
57
167
|
form.append('file', new Blob([Uint8Array.from(audio.data)], { type: audio.mimeType ?? 'audio/wav' }), fileName);
|
|
58
168
|
form.append('model', model);
|
|
59
169
|
form.append('response_format', responseFormat);
|
|
170
|
+
// Optional fields — only include when explicitly set to avoid API warnings
|
|
60
171
|
if (options.language)
|
|
61
172
|
form.append('language', options.language);
|
|
62
173
|
if (options.prompt)
|
|
@@ -68,6 +179,7 @@ export class OpenAIWhisperSpeechToTextProvider {
|
|
|
68
179
|
method: 'POST',
|
|
69
180
|
headers: {
|
|
70
181
|
Authorization: `Bearer ${this.config.apiKey}`,
|
|
182
|
+
// Content-Type is NOT set — FormData sets it automatically with boundary
|
|
71
183
|
},
|
|
72
184
|
body: form,
|
|
73
185
|
});
|
|
@@ -75,6 +187,8 @@ export class OpenAIWhisperSpeechToTextProvider {
|
|
|
75
187
|
const message = await response.text();
|
|
76
188
|
throw new Error(`OpenAI Whisper transcription failed (${response.status}): ${message}`);
|
|
77
189
|
}
|
|
190
|
+
// Plain text responses (format=text, or server returning text/plain)
|
|
191
|
+
// don't have structured data — return minimal result with just the text.
|
|
78
192
|
if (responseFormat === 'text' || response.headers.get('content-type')?.includes('text/plain')) {
|
|
79
193
|
const text = await response.text();
|
|
80
194
|
return {
|
|
@@ -89,6 +203,7 @@ export class OpenAIWhisperSpeechToTextProvider {
|
|
|
89
203
|
},
|
|
90
204
|
};
|
|
91
205
|
}
|
|
206
|
+
// JSON responses (verbose_json or json) — parse and normalize
|
|
92
207
|
const payload = (await response.json());
|
|
93
208
|
const durationSeconds = typeof payload.duration === 'number' ? payload.duration : audio.durationSeconds;
|
|
94
209
|
return {
|