@framers/agentos 0.1.109 → 0.1.111

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. package/dist/api/agency.d.ts.map +1 -1
  2. package/dist/api/agency.js +38 -2
  3. package/dist/api/agency.js.map +1 -1
  4. package/dist/api/agent.js +1 -1
  5. package/dist/api/agent.js.map +1 -1
  6. package/dist/api/strategies/debate.d.ts.map +1 -1
  7. package/dist/api/strategies/debate.js.map +1 -1
  8. package/dist/api/strategies/graph.d.ts.map +1 -1
  9. package/dist/api/strategies/graph.js +1 -2
  10. package/dist/api/strategies/graph.js.map +1 -1
  11. package/dist/api/strategies/hierarchical.d.ts.map +1 -1
  12. package/dist/api/strategies/hierarchical.js +1 -2
  13. package/dist/api/strategies/hierarchical.js.map +1 -1
  14. package/dist/api/strategies/index.d.ts +1 -9
  15. package/dist/api/strategies/index.d.ts.map +1 -1
  16. package/dist/api/strategies/index.js +1 -11
  17. package/dist/api/strategies/index.js.map +1 -1
  18. package/dist/api/strategies/parallel.d.ts.map +1 -1
  19. package/dist/api/strategies/parallel.js +23 -4
  20. package/dist/api/strategies/parallel.js.map +1 -1
  21. package/dist/api/strategies/review-loop.d.ts.map +1 -1
  22. package/dist/api/strategies/review-loop.js.map +1 -1
  23. package/dist/api/strategies/sequential.d.ts.map +1 -1
  24. package/dist/api/strategies/sequential.js +1 -2
  25. package/dist/api/strategies/sequential.js.map +1 -1
  26. package/dist/api/strategies/shared.d.ts +8 -0
  27. package/dist/api/strategies/shared.d.ts.map +1 -1
  28. package/dist/api/strategies/shared.js +10 -1
  29. package/dist/api/strategies/shared.js.map +1 -1
  30. package/dist/api/types.d.ts +6 -0
  31. package/dist/api/types.d.ts.map +1 -1
  32. package/dist/api/types.js.map +1 -1
  33. package/dist/memory/AgentMemory.d.ts +2 -1
  34. package/dist/memory/AgentMemory.d.ts.map +1 -1
  35. package/dist/memory/AgentMemory.js +1 -1
  36. package/dist/memory/AgentMemory.js.map +1 -1
  37. package/dist/memory/CognitiveMemoryManager.d.ts.map +1 -1
  38. package/dist/memory/CognitiveMemoryManager.js +7 -2
  39. package/dist/memory/CognitiveMemoryManager.js.map +1 -1
  40. package/dist/memory/facade/Memory.d.ts.map +1 -1
  41. package/dist/memory/facade/Memory.js +6 -9
  42. package/dist/memory/facade/Memory.js.map +1 -1
  43. package/dist/memory/store/MemoryStore.d.ts +9 -0
  44. package/dist/memory/store/MemoryStore.d.ts.map +1 -1
  45. package/dist/memory/store/MemoryStore.js +66 -6
  46. package/dist/memory/store/MemoryStore.js.map +1 -1
  47. package/dist/memory/store/SqliteMemoryGraph.d.ts.map +1 -1
  48. package/dist/memory/store/SqliteMemoryGraph.js +27 -13
  49. package/dist/memory/store/SqliteMemoryGraph.js.map +1 -1
  50. package/dist/orchestration/runtime/GraphRuntime.d.ts.map +1 -1
  51. package/dist/orchestration/runtime/GraphRuntime.js +11 -4
  52. package/dist/orchestration/runtime/GraphRuntime.js.map +1 -1
  53. package/dist/orchestration/runtime/safeExpressionEvaluator.d.ts.map +1 -1
  54. package/dist/orchestration/runtime/safeExpressionEvaluator.js +35 -16
  55. package/dist/orchestration/runtime/safeExpressionEvaluator.js.map +1 -1
  56. package/dist/speech/FallbackProxy.d.ts +194 -41
  57. package/dist/speech/FallbackProxy.d.ts.map +1 -1
  58. package/dist/speech/FallbackProxy.js +155 -32
  59. package/dist/speech/FallbackProxy.js.map +1 -1
  60. package/dist/speech/SpeechProviderResolver.d.ts +278 -36
  61. package/dist/speech/SpeechProviderResolver.d.ts.map +1 -1
  62. package/dist/speech/SpeechProviderResolver.js +306 -40
  63. package/dist/speech/SpeechProviderResolver.js.map +1 -1
  64. package/dist/speech/providers/AssemblyAISTTProvider.d.ts +119 -19
  65. package/dist/speech/providers/AssemblyAISTTProvider.d.ts.map +1 -1
  66. package/dist/speech/providers/AssemblyAISTTProvider.js +153 -25
  67. package/dist/speech/providers/AssemblyAISTTProvider.js.map +1 -1
  68. package/dist/speech/providers/AzureSpeechSTTProvider.d.ts +121 -17
  69. package/dist/speech/providers/AzureSpeechSTTProvider.d.ts.map +1 -1
  70. package/dist/speech/providers/AzureSpeechSTTProvider.js +122 -14
  71. package/dist/speech/providers/AzureSpeechSTTProvider.js.map +1 -1
  72. package/dist/speech/providers/AzureSpeechTTSProvider.d.ts +130 -15
  73. package/dist/speech/providers/AzureSpeechTTSProvider.d.ts.map +1 -1
  74. package/dist/speech/providers/AzureSpeechTTSProvider.js +163 -18
  75. package/dist/speech/providers/AzureSpeechTTSProvider.js.map +1 -1
  76. package/dist/speech/providers/BuiltInAdaptiveVadProvider.d.ts +159 -0
  77. package/dist/speech/providers/BuiltInAdaptiveVadProvider.d.ts.map +1 -1
  78. package/dist/speech/providers/BuiltInAdaptiveVadProvider.js +119 -0
  79. package/dist/speech/providers/BuiltInAdaptiveVadProvider.js.map +1 -1
  80. package/dist/speech/providers/DeepgramBatchSTTProvider.d.ts +102 -16
  81. package/dist/speech/providers/DeepgramBatchSTTProvider.d.ts.map +1 -1
  82. package/dist/speech/providers/DeepgramBatchSTTProvider.js +108 -13
  83. package/dist/speech/providers/DeepgramBatchSTTProvider.js.map +1 -1
  84. package/dist/speech/providers/ElevenLabsTextToSpeechProvider.d.ts +149 -0
  85. package/dist/speech/providers/ElevenLabsTextToSpeechProvider.d.ts.map +1 -1
  86. package/dist/speech/providers/ElevenLabsTextToSpeechProvider.js +137 -2
  87. package/dist/speech/providers/ElevenLabsTextToSpeechProvider.js.map +1 -1
  88. package/dist/speech/providers/OpenAITextToSpeechProvider.d.ts +125 -0
  89. package/dist/speech/providers/OpenAITextToSpeechProvider.d.ts.map +1 -1
  90. package/dist/speech/providers/OpenAITextToSpeechProvider.js +128 -4
  91. package/dist/speech/providers/OpenAITextToSpeechProvider.js.map +1 -1
  92. package/dist/speech/providers/OpenAIWhisperSpeechToTextProvider.d.ts +110 -0
  93. package/dist/speech/providers/OpenAIWhisperSpeechToTextProvider.d.ts.map +1 -1
  94. package/dist/speech/providers/OpenAIWhisperSpeechToTextProvider.js +115 -0
  95. package/dist/speech/providers/OpenAIWhisperSpeechToTextProvider.js.map +1 -1
  96. package/package.json +1 -1
@@ -1,20 +1,145 @@
1
1
  import type { SpeechSynthesisOptions, SpeechSynthesisResult, SpeechVoice, TextToSpeechProvider } from '../types.js';
2
+ /**
3
+ * Configuration for the {@link OpenAITextToSpeechProvider}.
4
+ *
5
+ * @see {@link OpenAITextToSpeechProvider} for usage examples
6
+ * @see https://platform.openai.com/docs/api-reference/audio/createSpeech
7
+ */
2
8
  export interface OpenAITextToSpeechProviderConfig {
9
+ /**
10
+ * OpenAI API key used for authentication.
11
+ * Sent as `Authorization: Bearer <apiKey>`.
12
+ */
3
13
  apiKey: string;
14
+ /**
15
+ * Base URL for the OpenAI API. Override for proxies, Azure OpenAI, or
16
+ * compatible third-party endpoints.
17
+ * @default 'https://api.openai.com/v1'
18
+ */
4
19
  baseUrl?: string;
20
+ /**
21
+ * Default TTS model. `tts-1` is optimized for real-time, `tts-1-hd` for quality.
22
+ * @default 'tts-1'
23
+ */
5
24
  model?: string;
25
+ /**
26
+ * Default voice identifier. See {@link OPENAI_VOICES} for available options.
27
+ * @default 'nova'
28
+ */
6
29
  voice?: string;
30
+ /**
31
+ * Custom fetch implementation for dependency injection in tests.
32
+ * @default globalThis.fetch
33
+ */
7
34
  fetchImpl?: typeof fetch;
8
35
  }
36
+ /**
37
+ * Text-to-speech provider that uses the OpenAI TTS API.
38
+ *
39
+ * ## API Contract
40
+ *
41
+ * - **Endpoint:** `POST {baseUrl}/audio/speech`
42
+ * - **Authentication:** `Authorization: Bearer <apiKey>`
43
+ * - **Content-Type:** `application/json`
44
+ * - **Request body:** `{ model, voice, input, response_format, speed }`
45
+ * - **Response:** Raw audio bytes in the requested format
46
+ *
47
+ * ## Models
48
+ *
49
+ * - `tts-1` — Optimized for real-time, lower latency, slightly lower quality
50
+ * - `tts-1-hd` — Higher quality at the cost of additional latency
51
+ *
52
+ * ## Voice Listing
53
+ *
54
+ * OpenAI's voice catalog is static (6 voices), so {@link listAvailableVoices}
55
+ * returns a hardcoded list from {@link OPENAI_VOICES} without making an API call.
56
+ *
57
+ * @see {@link OpenAITextToSpeechProviderConfig} for configuration options
58
+ * @see {@link OpenAIWhisperSpeechToTextProvider} for the corresponding STT provider
59
+ *
60
+ * @example
61
+ * ```ts
62
+ * const provider = new OpenAITextToSpeechProvider({
63
+ * apiKey: process.env.OPENAI_API_KEY!,
64
+ * model: 'tts-1',
65
+ * voice: 'nova',
66
+ * });
67
+ * const result = await provider.synthesize('Hello!', { speed: 1.1 });
68
+ * ```
69
+ */
9
70
  export declare class OpenAITextToSpeechProvider implements TextToSpeechProvider {
10
71
  private readonly config;
72
+ /** Unique provider identifier used for registration and resolution. */
11
73
  readonly id = "openai-tts";
74
+ /** Human-readable display name for UI and logging. */
12
75
  readonly displayName = "OpenAI TTS";
76
+ /**
77
+ * Streaming is supported — the OpenAI API streams audio bytes as they
78
+ * are generated, enabling low-latency playback pipelines.
79
+ */
13
80
  readonly supportsStreaming = true;
81
+ /** Fetch implementation — injected for testability, defaults to global fetch. */
14
82
  private readonly fetchImpl;
83
+ /**
84
+ * Creates a new OpenAITextToSpeechProvider.
85
+ *
86
+ * @param config - Provider configuration including API key and optional defaults.
87
+ *
88
+ * @example
89
+ * ```ts
90
+ * const provider = new OpenAITextToSpeechProvider({
91
+ * apiKey: 'sk-xxxx',
92
+ * voice: 'shimmer',
93
+ * });
94
+ * ```
95
+ */
15
96
  constructor(config: OpenAITextToSpeechProviderConfig);
97
+ /**
98
+ * Returns the human-readable provider name.
99
+ *
100
+ * @returns The display name string `'OpenAI TTS'`.
101
+ *
102
+ * @example
103
+ * ```ts
104
+ * provider.getProviderName(); // 'OpenAI TTS'
105
+ * ```
106
+ */
16
107
  getProviderName(): string;
108
+ /**
109
+ * Synthesizes speech from text using the OpenAI TTS API.
110
+ *
111
+ * @param text - The text to convert to audio. Maximum 4096 characters.
112
+ * @param options - Optional synthesis settings including voice, model,
113
+ * output format, and speed (0.25–4.0 range).
114
+ * @returns A promise resolving to the audio buffer and metadata.
115
+ * @throws {Error} When the OpenAI API returns a non-2xx status code.
116
+ * Common causes: invalid API key (401), rate limit (429), text too long (400).
117
+ *
118
+ * @example
119
+ * ```ts
120
+ * const result = await provider.synthesize('Hello world', {
121
+ * voice: 'alloy',
122
+ * speed: 1.2,
123
+ * outputFormat: 'opus',
124
+ * });
125
+ * ```
126
+ */
17
127
  synthesize(text: string, options?: SpeechSynthesisOptions): Promise<SpeechSynthesisResult>;
128
+ /**
129
+ * Returns the static list of available OpenAI TTS voices.
130
+ *
131
+ * Unlike other providers (ElevenLabs, Azure) that require an API call to
132
+ * list voices, OpenAI's voice catalog is fixed and hardcoded. This method
133
+ * returns a shallow copy to prevent external mutation.
134
+ *
135
+ * @returns A promise resolving to the 6 built-in OpenAI voice options.
136
+ *
137
+ * @example
138
+ * ```ts
139
+ * const voices = await provider.listAvailableVoices();
140
+ * const defaultVoice = voices.find(v => v.isDefault); // the voice object whose id is 'nova'
141
+ * ```
142
+ */
18
143
  listAvailableVoices(): Promise<SpeechVoice[]>;
19
144
  }
20
145
  //# sourceMappingURL=OpenAITextToSpeechProvider.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"OpenAITextToSpeechProvider.d.ts","sourceRoot":"","sources":["../../../src/speech/providers/OpenAITextToSpeechProvider.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACV,sBAAsB,EACtB,qBAAqB,EACrB,WAAW,EACX,oBAAoB,EACrB,MAAM,aAAa,CAAC;AAErB,MAAM,WAAW,gCAAgC;IAC/C,MAAM,EAAE,MAAM,CAAC;IACf,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,SAAS,CAAC,EAAE,OAAO,KAAK,CAAC;CAC1B;AA4BD,qBAAa,0BAA2B,YAAW,oBAAoB;IAMzD,OAAO,CAAC,QAAQ,CAAC,MAAM;IALnC,SAAgB,EAAE,gBAAgB;IAClC,SAAgB,WAAW,gBAAgB;IAC3C,SAAgB,iBAAiB,QAAQ;IACzC,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAe;gBAEZ,MAAM,EAAE,gCAAgC;IAIrE,eAAe,IAAI,MAAM;IAInB,UAAU,CACd,IAAI,EAAE,MAAM,EACZ,OAAO,GAAE,sBAA2B,GACnC,OAAO,CAAC,qBAAqB,CAAC;IAyC3B,mBAAmB,IAAI,OAAO,CAAC,WAAW,EAAE,CAAC;CAGpD"}
1
+ {"version":3,"file":"OpenAITextToSpeechProvider.d.ts","sourceRoot":"","sources":["../../../src/speech/providers/OpenAITextToSpeechProvider.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACV,sBAAsB,EACtB,qBAAqB,EACrB,WAAW,EACX,oBAAoB,EACrB,MAAM,aAAa,CAAC;AAErB;;;;;GAKG;AACH,MAAM,WAAW,gCAAgC;IAC/C;;;OAGG;IACH,MAAM,EAAE,MAAM,CAAC;IAEf;;;;OAIG;IACH,OAAO,CAAC,EAAE,MAAM,CAAC;IAEjB;;;OAGG;IACH,KAAK,CAAC,EAAE,MAAM,CAAC;IAEf;;;OAGG;IACH,KAAK,CAAC,EAAE,MAAM,CAAC;IAEf;;;OAGG;IACH,SAAS,CAAC,EAAE,OAAO,KAAK,CAAC;CAC1B;AAqDD;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAiCG;AACH,qBAAa,0BAA2B,YAAW,oBAAoB;IA6BzD,OAAO,CAAC,QAAQ,CAAC,MAAM;IA5BnC,uEAAuE;IACvE,SAAgB,EAAE,gBAAgB;IAElC,sDAAsD;IACtD,SAAgB,WAAW,gBAAgB;IAE3C;;;OAGG;IACH,SAAgB,iBAAiB,QAAQ;IAEzC,iFAAiF;IACjF,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAe;IAEzC;;;;;;;;;;;;OAYG;gBAC0B,MAAM,EAAE,gCAAgC;IAIrE;;;;;;;;;OASG;IACH,eAAe,IAAI,MAAM;IAIzB;;;;;;;;;;;;;;;;;;OAkBG;IACG,UAAU,CACd,IAAI,EAAE,MAAM,EACZ,OAAO,GAAE,sBAA2B,GACnC,OAAO,CAAC,qBAAqB,CAAC;IA2CjC;;;;;;;;;;;;;;OAcG;IACG,mBAAmB,IAAI,OAAO,CAAC,WAAW,EAAE,CAAC;CAIpD"}
@@ -1,3 +1,12 @@
1
+ /**
2
+ * Static catalog of built-in OpenAI TTS voices.
3
+ *
4
+ * These voices are available for both `tts-1` and `tts-1-hd` models.
5
+ * `'nova'` is marked as default because it provides a good balance of
6
+ * naturalness and clarity across languages.
7
+ *
8
+ * @see https://platform.openai.com/docs/guides/text-to-speech/voice-options
9
+ */
1
10
  const OPENAI_VOICES = [
2
11
  { id: 'alloy', name: 'Alloy', provider: 'openai-tts', lang: 'various', isDefault: false },
3
12
  { id: 'echo', name: 'Echo', provider: 'openai-tts', lang: 'various', isDefault: false },
@@ -6,6 +15,22 @@ const OPENAI_VOICES = [
6
15
  { id: 'nova', name: 'Nova', provider: 'openai-tts', lang: 'various', isDefault: true },
7
16
  { id: 'shimmer', name: 'Shimmer', provider: 'openai-tts', lang: 'various', isDefault: false },
8
17
  ];
18
+ /**
19
+ * Maps an OpenAI output format identifier to its corresponding MIME type.
20
+ *
21
+ * OpenAI TTS supports multiple output formats. The default is MP3, which
22
+ * provides good quality at reasonable file sizes. PCM returns raw 24kHz
23
+ * 16-bit little-endian audio (MIME type `audio/L16`).
24
+ *
25
+ * @param format - The OpenAI output format string (e.g. `'mp3'`, `'opus'`).
26
+ * @returns The corresponding MIME type string.
27
+ *
28
+ * @example
29
+ * ```ts
30
+ * mimeTypeForOutput('opus'); // 'audio/opus'
31
+ * mimeTypeForOutput(undefined); // 'audio/mpeg' (default)
32
+ * ```
33
+ */
9
34
  function mimeTypeForOutput(format) {
10
35
  switch (format) {
11
36
  case 'opus':
@@ -17,23 +42,106 @@ function mimeTypeForOutput(format) {
17
42
  case 'wav':
18
43
  return 'audio/wav';
19
44
  case 'pcm':
20
- return 'audio/L16';
45
+ return 'audio/L16'; // Raw 24kHz 16-bit little-endian mono
21
46
  default:
22
- return 'audio/mpeg';
47
+ return 'audio/mpeg'; // MP3 is the default format
23
48
  }
24
49
  }
50
+ /**
51
+ * Text-to-speech provider that uses the OpenAI TTS API.
52
+ *
53
+ * ## API Contract
54
+ *
55
+ * - **Endpoint:** `POST {baseUrl}/audio/speech`
56
+ * - **Authentication:** `Authorization: Bearer <apiKey>`
57
+ * - **Content-Type:** `application/json`
58
+ * - **Request body:** `{ model, voice, input, response_format, speed }`
59
+ * - **Response:** Raw audio bytes in the requested format
60
+ *
61
+ * ## Models
62
+ *
63
+ * - `tts-1` — Optimized for real-time, lower latency, slightly lower quality
64
+ * - `tts-1-hd` — Higher quality at the cost of additional latency
65
+ *
66
+ * ## Voice Listing
67
+ *
68
+ * OpenAI's voice catalog is static (6 voices), so {@link listAvailableVoices}
69
+ * returns a hardcoded list from {@link OPENAI_VOICES} without making an API call.
70
+ *
71
+ * @see {@link OpenAITextToSpeechProviderConfig} for configuration options
72
+ * @see {@link OpenAIWhisperSpeechToTextProvider} for the corresponding STT provider
73
+ *
74
+ * @example
75
+ * ```ts
76
+ * const provider = new OpenAITextToSpeechProvider({
77
+ * apiKey: process.env.OPENAI_API_KEY!,
78
+ * model: 'tts-1',
79
+ * voice: 'nova',
80
+ * });
81
+ * const result = await provider.synthesize('Hello!', { speed: 1.1 });
82
+ * ```
83
+ */
25
84
  export class OpenAITextToSpeechProvider {
85
+ /**
86
+ * Creates a new OpenAITextToSpeechProvider.
87
+ *
88
+ * @param config - Provider configuration including API key and optional defaults.
89
+ *
90
+ * @example
91
+ * ```ts
92
+ * const provider = new OpenAITextToSpeechProvider({
93
+ * apiKey: 'sk-xxxx',
94
+ * voice: 'shimmer',
95
+ * });
96
+ * ```
97
+ */
26
98
  constructor(config) {
27
99
  this.config = config;
100
+ /** Unique provider identifier used for registration and resolution. */
28
101
  this.id = 'openai-tts';
102
+ /** Human-readable display name for UI and logging. */
29
103
  this.displayName = 'OpenAI TTS';
104
+ /**
105
+ * Streaming is supported — the OpenAI API streams audio bytes as they
106
+ * are generated, enabling low-latency playback pipelines.
107
+ */
30
108
  this.supportsStreaming = true;
31
109
  this.fetchImpl = config.fetchImpl ?? fetch;
32
110
  }
111
+ /**
112
+ * Returns the human-readable provider name.
113
+ *
114
+ * @returns The display name string `'OpenAI TTS'`.
115
+ *
116
+ * @example
117
+ * ```ts
118
+ * provider.getProviderName(); // 'OpenAI TTS'
119
+ * ```
120
+ */
33
121
  getProviderName() {
34
122
  return this.displayName;
35
123
  }
124
+ /**
125
+ * Synthesizes speech from text using the OpenAI TTS API.
126
+ *
127
+ * @param text - The text to convert to audio. Maximum 4096 characters.
128
+ * @param options - Optional synthesis settings including voice, model,
129
+ * output format, and speed (0.25–4.0 range).
130
+ * @returns A promise resolving to the audio buffer and metadata.
131
+ * @throws {Error} When the OpenAI API returns a non-2xx status code.
132
+ * Common causes: invalid API key (401), rate limit (429), text too long (400).
133
+ *
134
+ * @example
135
+ * ```ts
136
+ * const result = await provider.synthesize('Hello world', {
137
+ * voice: 'alloy',
138
+ * speed: 1.2,
139
+ * outputFormat: 'opus',
140
+ * });
141
+ * ```
142
+ */
36
143
  async synthesize(text, options = {}) {
144
+ // Resolve options with fallback chain: per-call options > config > defaults
37
145
  const model = options.model ?? this.config.model ?? 'tts-1';
38
146
  const voice = options.voice ?? this.config.voice ?? 'nova';
39
147
  const outputFormat = options.outputFormat ?? 'mp3';
@@ -48,7 +156,7 @@ export class OpenAITextToSpeechProvider {
48
156
  voice,
49
157
  input: text,
50
158
  response_format: outputFormat,
51
- speed: options.speed,
159
+ speed: options.speed, // undefined is omitted by JSON.stringify
52
160
  }),
53
161
  });
54
162
  if (!response.ok) {
@@ -59,7 +167,7 @@ export class OpenAITextToSpeechProvider {
59
167
  return {
60
168
  audioBuffer,
61
169
  mimeType: mimeTypeForOutput(outputFormat),
62
- cost: 0,
170
+ cost: 0, // Cost tracking is handled at a higher layer
63
171
  voiceUsed: voice,
64
172
  providerName: this.displayName,
65
173
  usage: {
@@ -68,7 +176,23 @@ export class OpenAITextToSpeechProvider {
68
176
  },
69
177
  };
70
178
  }
179
+ /**
180
+ * Returns the static list of available OpenAI TTS voices.
181
+ *
182
+ * Unlike other providers (ElevenLabs, Azure) that require an API call to
183
+ * list voices, OpenAI's voice catalog is fixed and hardcoded. This method
184
+ * returns a shallow copy to prevent external mutation.
185
+ *
186
+ * @returns A promise resolving to the 6 built-in OpenAI voice options.
187
+ *
188
+ * @example
189
+ * ```ts
190
+ * const voices = await provider.listAvailableVoices();
191
+ * const defaultVoice = voices.find(v => v.isDefault); // the voice object whose id is 'nova'
192
+ * ```
193
+ */
71
194
  async listAvailableVoices() {
195
+ // Return a shallow copy to prevent external mutation of the static catalog
72
196
  return [...OPENAI_VOICES];
73
197
  }
74
198
  }
@@ -1 +1 @@
1
- {"version":3,"file":"OpenAITextToSpeechProvider.js","sourceRoot":"","sources":["../../../src/speech/providers/OpenAITextToSpeechProvider.ts"],"names":[],"mappings":"AAeA,MAAM,aAAa,GAA2B;IAC5C,EAAE,EAAE,EAAE,OAAO,EAAE,IAAI,EAAE,OAAO,EAAE,QAAQ,EAAE,YAAY,EAAE,IAAI,EAAE,SAAS,EAAE,SAAS,EAAE,KAAK,EAAE;IACzF,EAAE,EAAE,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,QAAQ,EAAE,YAAY,EAAE,IAAI,EAAE,SAAS,EAAE,SAAS,EAAE,KAAK,EAAE;IACvF,EAAE,EAAE,EAAE,OAAO,EAAE,IAAI,EAAE,OAAO,EAAE,QAAQ,EAAE,YAAY,EAAE,IAAI,EAAE,SAAS,EAAE,SAAS,EAAE,KAAK,EAAE;IACzF,EAAE,EAAE,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,QAAQ,EAAE,YAAY,EAAE,IAAI,EAAE,SAAS,EAAE,SAAS,EAAE,KAAK,EAAE;IACvF,EAAE,EAAE,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,QAAQ,EAAE,YAAY,EAAE,IAAI,EAAE,SAAS,EAAE,SAAS,EAAE,IAAI,EAAE;IACtF,EAAE,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,SAAS,EAAE,QAAQ,EAAE,YAAY,EAAE,IAAI,EAAE,SAAS,EAAE,SAAS,EAAE,KAAK,EAAE;CAC9F,CAAC;AAEF,SAAS,iBAAiB,CAAC,MAA0B;IACnD,QAAQ,MAAM,EAAE,CAAC;QACf,KAAK,MAAM;YACT,OAAO,YAAY,CAAC;QACtB,KAAK,KAAK;YACR,OAAO,WAAW,CAAC;QACrB,KAAK,MAAM;YACT,OAAO,YAAY,CAAC;QACtB,KAAK,KAAK;YACR,OAAO,WAAW,CAAC;QACrB,KAAK,KAAK;YACR,OAAO,WAAW,CAAC;QACrB;YACE,OAAO,YAAY,CAAC;IACxB,CAAC;AACH,CAAC;AAED,MAAM,OAAO,0BAA0B;IAMrC,YAA6B,MAAwC;QAAxC,WAAM,GAAN,MAAM,CAAkC;QALrD,OAAE,GAAG,YAAY,CAAC;QAClB,gBAAW,GAAG,YAAY,CAAC;QAC3B,sBAAiB,GAAG,IAAI,CAAC;QAIvC,IAAI,CAAC,SAAS,GAAG,MAAM,CAAC,SAAS,IAAI,KAAK,CAAC;IAC7C,CAAC;IAED,eAAe;QACb,OAAO,IAAI,CAAC,WAAW,CAAC;IAC1B,CAAC;IAED,KAAK,CAAC,UAAU,CACd,IAAY,EACZ,UAAkC,EAAE;QAEpC,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,IAAI,IAAI,CAAC,MAAM,CAAC,KAAK,IAAI,OAAO,CAAC;QAC5D,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,IAAI,IAAI,CAAC,MAAM,CAAC,KAAK,IAAI,MAAM,CAAC;QAC3D,MAAM,YAAY,GAAG,OAAO,CAAC,YAAY,IAAI,KAAK,CAAC;QACnD,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,SAAS,CACnC,GAAG,IAAI,CAAC,MAAM,CAAC,OAAO,IAAI,2BAA2B,eAAe,EACpE;YACE,MAAM,EAAE,MAAM;YACd,OAAO,EAAE;gBACP,aAAa,EAAE,UAAU,IAAI,CAAC,MAAM,CAAC,MAAM,EAAE;gBAC7C,cAAc,EAAE,kBAAkB;aACnC;YACD,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC;gBACnB,KAAK;gBACL,KAAK;gBACL,KAAK,EAAE,IAAI;gBACX,eAAe,EAAE,YAAY;gBAC7B,KAAK,EA
AE,OAAO,CAAC,KAAK;aACrB,CAAC;SACH,CACF,CAAC;QAEF,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;YACjB,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;YACtC,MAAM,IAAI,KAAK,CAAC,gCAAgC,QAAQ,CAAC,MAAM,MAAM,OAAO,EAAE,CAAC,CAAC;QAClF,CAAC;QAED,MAAM,WAAW,GAAG,MAAM,CAAC,IAAI,CAAC,MAAM,QAAQ,CAAC,WAAW,EAAE,CAAC,CAAC;QAC9D,OAAO;YACL,WAAW;YACX,QAAQ,EAAE,iBAAiB,CAAC,YAAY,CAAC;YACzC,IAAI,EAAE,CAAC;YACP,SAAS,EAAE,KAAK;YAChB,YAAY,EAAE,IAAI,CAAC,WAAW;YAC9B,KAAK,EAAE;gBACL,UAAU,EAAE,IAAI,CAAC,MAAM;gBACvB,SAAS,EAAE,KAAK;aACjB;SACF,CAAC;IACJ,CAAC;IAED,KAAK,CAAC,mBAAmB;QACvB,OAAO,CAAC,GAAG,aAAa,CAAC,CAAC;IAC5B,CAAC;CACF"}
1
+ {"version":3,"file":"OpenAITextToSpeechProvider.js","sourceRoot":"","sources":["../../../src/speech/providers/OpenAITextToSpeechProvider.ts"],"names":[],"mappings":"AA8CA;;;;;;;;GAQG;AACH,MAAM,aAAa,GAA2B;IAC5C,EAAE,EAAE,EAAE,OAAO,EAAE,IAAI,EAAE,OAAO,EAAE,QAAQ,EAAE,YAAY,EAAE,IAAI,EAAE,SAAS,EAAE,SAAS,EAAE,KAAK,EAAE;IACzF,EAAE,EAAE,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,QAAQ,EAAE,YAAY,EAAE,IAAI,EAAE,SAAS,EAAE,SAAS,EAAE,KAAK,EAAE;IACvF,EAAE,EAAE,EAAE,OAAO,EAAE,IAAI,EAAE,OAAO,EAAE,QAAQ,EAAE,YAAY,EAAE,IAAI,EAAE,SAAS,EAAE,SAAS,EAAE,KAAK,EAAE;IACzF,EAAE,EAAE,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,QAAQ,EAAE,YAAY,EAAE,IAAI,EAAE,SAAS,EAAE,SAAS,EAAE,KAAK,EAAE;IACvF,EAAE,EAAE,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,QAAQ,EAAE,YAAY,EAAE,IAAI,EAAE,SAAS,EAAE,SAAS,EAAE,IAAI,EAAE;IACtF,EAAE,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,SAAS,EAAE,QAAQ,EAAE,YAAY,EAAE,IAAI,EAAE,SAAS,EAAE,SAAS,EAAE,KAAK,EAAE;CAC9F,CAAC;AAEF;;;;;;;;;;;;;;;GAeG;AACH,SAAS,iBAAiB,CAAC,MAA0B;IACnD,QAAQ,MAAM,EAAE,CAAC;QACf,KAAK,MAAM;YACT,OAAO,YAAY,CAAC;QACtB,KAAK,KAAK;YACR,OAAO,WAAW,CAAC;QACrB,KAAK,MAAM;YACT,OAAO,YAAY,CAAC;QACtB,KAAK,KAAK;YACR,OAAO,WAAW,CAAC;QACrB,KAAK,KAAK;YACR,OAAO,WAAW,CAAC,CAAC,sCAAsC;QAC5D;YACE,OAAO,YAAY,CAAC,CAAC,4BAA4B;IACrD,CAAC;AACH,CAAC;AAED;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAiCG;AACH,MAAM,OAAO,0BAA0B;IAgBrC;;;;;;;;;;;;OAYG;IACH,YAA6B,MAAwC;QAAxC,WAAM,GAAN,MAAM,CAAkC;QA5BrE,uEAAuE;QACvD,OAAE,GAAG,YAAY,CAAC;QAElC,sDAAsD;QACtC,gBAAW,GAAG,YAAY,CAAC;QAE3C;;;WAGG;QACa,sBAAiB,GAAG,IAAI,CAAC;QAmBvC,IAAI,CAAC,SAAS,GAAG,MAAM,CAAC,SAAS,IAAI,KAAK,CAAC;IAC7C,CAAC;IAED;;;;;;;;;OASG;IACH,eAAe;QACb,OAAO,IAAI,CAAC,WAAW,CAAC;IAC1B,CAAC;IAED;;;;;;;;;;;;;;;;;;OAkBG;IACH,KAAK,CAAC,UAAU,CACd,IAAY,EACZ,UAAkC,EAAE;QAEpC,4EAA4E;QAC5E,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,IAAI,IAAI,CAAC,MAAM,CAAC,KAAK,IAAI,OAAO,CAAC;QAC5D,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,IAAI,IAAI,CAAC,MAAM,CAAC,KAAK,IAAI,MAAM,CAAC;QAC3D,MAAM,YAAY,GAAG,OAAO,CAAC,YAAY,IAAI,KAAK,CAAC;QAEnD,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,SAAS,CACnC,GAAG,IAAI,CAAC,MAAM,CAAC,OAAO,IAAI,2BAA2B,eAAe,EACpE;
YACE,MAAM,EAAE,MAAM;YACd,OAAO,EAAE;gBACP,aAAa,EAAE,UAAU,IAAI,CAAC,MAAM,CAAC,MAAM,EAAE;gBAC7C,cAAc,EAAE,kBAAkB;aACnC;YACD,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC;gBACnB,KAAK;gBACL,KAAK;gBACL,KAAK,EAAE,IAAI;gBACX,eAAe,EAAE,YAAY;gBAC7B,KAAK,EAAE,OAAO,CAAC,KAAK,EAAE,yCAAyC;aAChE,CAAC;SACH,CACF,CAAC;QAEF,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;YACjB,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;YACtC,MAAM,IAAI,KAAK,CAAC,gCAAgC,QAAQ,CAAC,MAAM,MAAM,OAAO,EAAE,CAAC,CAAC;QAClF,CAAC;QAED,MAAM,WAAW,GAAG,MAAM,CAAC,IAAI,CAAC,MAAM,QAAQ,CAAC,WAAW,EAAE,CAAC,CAAC;QAC9D,OAAO;YACL,WAAW;YACX,QAAQ,EAAE,iBAAiB,CAAC,YAAY,CAAC;YACzC,IAAI,EAAE,CAAC,EAAE,6CAA6C;YACtD,SAAS,EAAE,KAAK;YAChB,YAAY,EAAE,IAAI,CAAC,WAAW;YAC9B,KAAK,EAAE;gBACL,UAAU,EAAE,IAAI,CAAC,MAAM;gBACvB,SAAS,EAAE,KAAK;aACjB;SACF,CAAC;IACJ,CAAC;IAED;;;;;;;;;;;;;;OAcG;IACH,KAAK,CAAC,mBAAmB;QACvB,2EAA2E;QAC3E,OAAO,CAAC,GAAG,aAAa,CAAC,CAAC;IAC5B,CAAC;CACF"}
@@ -1,18 +1,128 @@
1
1
  import type { SpeechAudioInput, SpeechToTextProvider, SpeechTranscriptionOptions, SpeechTranscriptionResult } from '../types.js';
2
+ /**
3
+ * Configuration for the {@link OpenAIWhisperSpeechToTextProvider}.
4
+ *
5
+ * @see {@link OpenAIWhisperSpeechToTextProvider} for usage examples
6
+ * @see https://platform.openai.com/docs/api-reference/audio/createTranscription
7
+ */
2
8
  export interface OpenAIWhisperSpeechToTextProviderConfig {
9
+ /**
10
+ * OpenAI API key used for authentication.
11
+ * Sent as `Authorization: Bearer <apiKey>` in the request header.
12
+ */
3
13
  apiKey: string;
14
+ /**
15
+ * Base URL for the OpenAI API. Override for proxies, Azure OpenAI, or
16
+ * compatible third-party endpoints.
17
+ * @default 'https://api.openai.com/v1'
18
+ */
4
19
  baseUrl?: string;
20
+ /**
21
+ * Default Whisper model to use for transcription.
22
+ * @default 'whisper-1'
23
+ */
5
24
  model?: string;
25
+ /**
26
+ * Custom fetch implementation for dependency injection in tests.
27
+ * @default globalThis.fetch
28
+ */
6
29
  fetchImpl?: typeof fetch;
7
30
  }
31
+ /**
32
+ * Speech-to-text provider that uses the OpenAI Whisper transcription API.
33
+ *
34
+ * ## API Contract
35
+ *
36
+ * - **Endpoint:** `POST {baseUrl}/audio/transcriptions`
37
+ * - **Authentication:** `Authorization: Bearer <apiKey>`
38
+ * - **Content-Type:** `multipart/form-data` (FormData with file blob)
39
+ * - **Response format:** Controlled by the `response_format` field; defaults
40
+ * to `verbose_json` which includes segments, language detection, and duration.
41
+ *
42
+ * ## Supported Response Formats
43
+ *
44
+ * - `verbose_json` — Full JSON with segments, duration, and language (default)
45
+ * - `json` — Minimal JSON with just the text
46
+ * - `text` — Plain text response (no JSON)
47
+ * - `srt` — SubRip subtitle format
48
+ * - `vtt` — WebVTT subtitle format
49
+ *
50
+ * When `text`, `srt`, or `vtt` format is used, the response is returned as
51
+ * plain text and segments are not available.
52
+ *
53
+ * @see {@link OpenAIWhisperSpeechToTextProviderConfig} for configuration options
54
+ * @see {@link normalizeSegments} for the segment normalization logic
55
+ *
56
+ * @example
57
+ * ```ts
58
+ * const provider = new OpenAIWhisperSpeechToTextProvider({
59
+ * apiKey: process.env.OPENAI_API_KEY!,
60
+ * model: 'whisper-1',
61
+ * });
62
+ * const result = await provider.transcribe(
63
+ * { data: audioBuffer, mimeType: 'audio/wav', fileName: 'recording.wav' },
64
+ * { language: 'en', responseFormat: 'verbose_json' },
65
+ * );
66
+ * ```
67
+ */
8
68
  export declare class OpenAIWhisperSpeechToTextProvider implements SpeechToTextProvider {
9
69
  private readonly config;
70
+ /** Unique provider identifier used for registration and resolution. */
10
71
  readonly id = "openai-whisper";
72
+ /** Human-readable display name for UI and logging. */
11
73
  readonly displayName = "OpenAI Whisper";
74
+ /** Whisper API is batch-only; streaming requires a WebSocket adapter. */
12
75
  readonly supportsStreaming = false;
76
+ /** Fetch implementation — injected for testability, defaults to global fetch. */
13
77
  private readonly fetchImpl;
78
+ /**
79
+ * Creates a new OpenAIWhisperSpeechToTextProvider.
80
+ *
81
+ * @param config - Provider configuration including API key and optional defaults.
82
+ *
83
+ * @example
84
+ * ```ts
85
+ * const provider = new OpenAIWhisperSpeechToTextProvider({
86
+ * apiKey: 'sk-xxxx',
87
+ * baseUrl: 'https://api.openai.com/v1', // default
88
+ * model: 'whisper-1', // default
89
+ * });
90
+ * ```
91
+ */
14
92
  constructor(config: OpenAIWhisperSpeechToTextProviderConfig);
93
+ /**
94
+ * Returns the human-readable provider name.
95
+ *
96
+ * @returns The display name string `'OpenAI Whisper'`.
97
+ *
98
+ * @example
99
+ * ```ts
100
+ * provider.getProviderName(); // 'OpenAI Whisper'
101
+ * ```
102
+ */
15
103
  getProviderName(): string;
104
+ /**
105
+ * Transcribes an audio buffer using the OpenAI Whisper API.
106
+ *
107
+ * The audio is sent as a multipart form upload with the file, model, and
108
+ * optional parameters (language, prompt, temperature, response_format).
109
+ *
110
+ * @param audio - Raw audio data and metadata. The `data` buffer is wrapped
111
+ * in a Blob and sent as a form file field. If `fileName` is not provided,
112
+ * a default name is generated from the `format` field.
113
+ * @param options - Optional transcription settings including language hint,
114
+ * context prompt, temperature for sampling, and response format.
115
+ * @returns A promise resolving to the normalized transcription result.
116
+ * @throws {Error} When the OpenAI API returns a non-2xx status code.
117
+ *
118
+ * @example
119
+ * ```ts
120
+ * const result = await provider.transcribe(
121
+ * { data: mp3Buffer, mimeType: 'audio/mpeg', fileName: 'voice.mp3' },
122
+ * { language: 'fr', prompt: 'Discussion about AI' },
123
+ * );
124
+ * ```
125
+ */
16
126
  transcribe(audio: SpeechAudioInput, options?: SpeechTranscriptionOptions): Promise<SpeechTranscriptionResult>;
17
127
  }
18
128
  //# sourceMappingURL=OpenAIWhisperSpeechToTextProvider.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"OpenAIWhisperSpeechToTextProvider.d.ts","sourceRoot":"","sources":["../../../src/speech/providers/OpenAIWhisperSpeechToTextProvider.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACV,gBAAgB,EAEhB,oBAAoB,EACpB,0BAA0B,EAC1B,yBAAyB,EAE1B,MAAM,aAAa,CAAC;AAErB,MAAM,WAAW,uCAAuC;IACtD,MAAM,EAAE,MAAM,CAAC;IACf,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,SAAS,CAAC,EAAE,OAAO,KAAK,CAAC;CAC1B;AA8CD,qBAAa,iCAAkC,YAAW,oBAAoB;IAMhE,OAAO,CAAC,QAAQ,CAAC,MAAM;IALnC,SAAgB,EAAE,oBAAoB;IACtC,SAAgB,WAAW,oBAAoB;IAC/C,SAAgB,iBAAiB,SAAS;IAC1C,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAe;gBAEZ,MAAM,EAAE,uCAAuC;IAI5E,eAAe,IAAI,MAAM;IAInB,UAAU,CACd,KAAK,EAAE,gBAAgB,EACvB,OAAO,GAAE,0BAA+B,GACvC,OAAO,CAAC,yBAAyB,CAAC;CAoEtC"}
1
+ {"version":3,"file":"OpenAIWhisperSpeechToTextProvider.d.ts","sourceRoot":"","sources":["../../../src/speech/providers/OpenAIWhisperSpeechToTextProvider.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACV,gBAAgB,EAEhB,oBAAoB,EACpB,0BAA0B,EAC1B,yBAAyB,EAE1B,MAAM,aAAa,CAAC;AAErB;;;;;GAKG;AACH,MAAM,WAAW,uCAAuC;IACtD;;;OAGG;IACH,MAAM,EAAE,MAAM,CAAC;IAEf;;;;OAIG;IACH,OAAO,CAAC,EAAE,MAAM,CAAC;IAEjB;;;OAGG;IACH,KAAK,CAAC,EAAE,MAAM,CAAC;IAEf;;;OAGG;IACH,SAAS,CAAC,EAAE,OAAO,KAAK,CAAC;CAC1B;AAqED;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAoCG;AACH,qBAAa,iCAAkC,YAAW,oBAAoB;IA2BhE,OAAO,CAAC,QAAQ,CAAC,MAAM;IA1BnC,uEAAuE;IACvE,SAAgB,EAAE,oBAAoB;IAEtC,sDAAsD;IACtD,SAAgB,WAAW,oBAAoB;IAE/C,yEAAyE;IACzE,SAAgB,iBAAiB,SAAS;IAE1C,iFAAiF;IACjF,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAe;IAEzC;;;;;;;;;;;;;OAaG;gBAC0B,MAAM,EAAE,uCAAuC;IAI5E;;;;;;;;;OASG;IACH,eAAe,IAAI,MAAM;IAIzB;;;;;;;;;;;;;;;;;;;;;OAqBG;IACG,UAAU,CACd,KAAK,EAAE,gBAAgB,EACvB,OAAO,GAAE,0BAA+B,GACvC,OAAO,CAAC,yBAAyB,CAAC;CA2EtC"}
@@ -1,9 +1,29 @@
1
+ /**
2
+ * Normalizes raw segment data from the OpenAI Whisper `verbose_json` response
3
+ * into strongly-typed {@link SpeechTranscriptionSegment} objects.
4
+ *
5
+ * This function performs defensive runtime type checking on every field because
6
+ * the Whisper API response shape is only partially documented and may include
7
+ * additional or differently-typed fields depending on the model version.
8
+ *
9
+ * The segment fields handled include standard ones (text, start, end, confidence)
10
+ * as well as Whisper-specific fields (id, seek, tokens, temperature, avg_logprob,
11
+ * compression_ratio, no_speech_prob) that are preserved for advanced consumers.
12
+ *
13
+ * @param input - The raw `segments` array from the Whisper JSON response.
14
+ * Expected to be an array of objects, but handles non-array gracefully.
15
+ * @returns An array of normalized segments, or `undefined` if the input
16
+ * is not a valid array.
17
+ *
18
+ * @see {@link SpeechTranscriptionSegment} for the output shape
19
+ */
1
20
  function normalizeSegments(input) {
2
21
  if (!Array.isArray(input))
3
22
  return undefined;
4
23
  return input
5
24
  .filter((segment) => typeof segment === 'object' && segment !== null)
6
25
  .map((segment) => {
26
+ // Alias the untyped API segment (Record<string, unknown> in the typed source); every field read below is guarded by a runtime type check
7
27
  const item = segment;
8
28
  return {
9
29
  text: typeof item.text === 'string' ? item.text : '',
@@ -13,6 +33,7 @@ function normalizeSegments(input) {
13
33
  speaker: typeof item.speaker === 'string' || typeof item.speaker === 'number'
14
34
  ? item.speaker
15
35
  : undefined,
36
+ // Normalize nested word-level data with the same defensive approach
16
37
  words: Array.isArray(item.words)
17
38
  ? item.words
18
39
  .filter((word) => typeof word === 'object' && word !== null)
@@ -26,6 +47,7 @@ function normalizeSegments(input) {
26
47
  };
27
48
  })
28
49
  : undefined,
50
+ // Whisper-specific metadata fields — preserved for advanced consumers
29
51
  id: typeof item.id === 'number' ? item.id : undefined,
30
52
  seek: typeof item.seek === 'number' ? item.seek : undefined,
31
53
  tokens: Array.isArray(item.tokens)
@@ -38,25 +60,114 @@ function normalizeSegments(input) {
38
60
  };
39
61
  });
40
62
  }
63
+ /**
64
+ * Speech-to-text provider that uses the OpenAI Whisper transcription API.
65
+ *
66
+ * ## API Contract
67
+ *
68
+ * - **Endpoint:** `POST {baseUrl}/audio/transcriptions`
69
+ * - **Authentication:** `Authorization: Bearer <apiKey>`
70
+ * - **Content-Type:** `multipart/form-data` (FormData with file blob)
71
+ * - **Response format:** Controlled by the `response_format` field; defaults
72
+ * to `verbose_json` which includes segments, language detection, and duration.
73
+ *
74
+ * ## Supported Response Formats
75
+ *
76
+ * - `verbose_json` — Full JSON with segments, duration, and language (default)
77
+ * - `json` — Minimal JSON with just the text
78
+ * - `text` — Plain text response (no JSON)
79
+ * - `srt` — SubRip subtitle format
80
+ * - `vtt` — WebVTT subtitle format
81
+ *
82
+ * When `text`, `srt`, or `vtt` format is used, the response is returned as
83
+ * plain text and segments are not available.
84
+ *
85
+ * @see {@link OpenAIWhisperSpeechToTextProviderConfig} for configuration options
86
+ * @see {@link normalizeSegments} for the segment normalization logic
87
+ *
88
+ * @example
89
+ * ```ts
90
+ * const provider = new OpenAIWhisperSpeechToTextProvider({
91
+ * apiKey: process.env.OPENAI_API_KEY!,
92
+ * model: 'whisper-1',
93
+ * });
94
+ * const result = await provider.transcribe(
95
+ * { data: audioBuffer, mimeType: 'audio/wav', fileName: 'recording.wav' },
96
+ * { language: 'en', responseFormat: 'verbose_json' },
97
+ * );
98
+ * ```
99
+ */
41
100
  export class OpenAIWhisperSpeechToTextProvider {
101
+ /**
102
+ * Creates a new OpenAIWhisperSpeechToTextProvider.
103
+ *
104
+ * @param config - Provider configuration including API key and optional defaults.
105
+ *
106
+ * @example
107
+ * ```ts
108
+ * const provider = new OpenAIWhisperSpeechToTextProvider({
109
+ * apiKey: 'sk-xxxx',
110
+ * baseUrl: 'https://api.openai.com/v1', // default
111
+ * model: 'whisper-1', // default
112
+ * });
113
+ * ```
114
+ */
42
115
  constructor(config) {
43
116
  this.config = config;
117
+ /** Unique provider identifier used for registration and resolution. */
44
118
  this.id = 'openai-whisper';
119
+ /** Human-readable display name for UI and logging. */
45
120
  this.displayName = 'OpenAI Whisper';
121
+ /** Whisper API is batch-only; streaming requires a WebSocket adapter. */
46
122
  this.supportsStreaming = false;
47
123
  this.fetchImpl = config.fetchImpl ?? fetch;
48
124
  }
125
+ /**
126
+ * Returns the human-readable provider name.
127
+ *
128
+ * @returns The display name string `'OpenAI Whisper'`.
129
+ *
130
+ * @example
131
+ * ```ts
132
+ * provider.getProviderName(); // 'OpenAI Whisper'
133
+ * ```
134
+ */
49
135
  getProviderName() {
50
136
  return this.displayName;
51
137
  }
138
+ /**
139
+ * Transcribes an audio buffer using the OpenAI Whisper API.
140
+ *
141
+ * The audio is sent as a multipart form upload with the file, model, and
142
+ * optional parameters (language, prompt, temperature, response_format).
143
+ *
144
+ * @param audio - Raw audio data and metadata. The `data` buffer is wrapped
145
+ * in a Blob and sent as a form file field. If `fileName` is not provided,
146
+ * a default name `speech.<format>` is generated (`format` falls back to `wav`).
147
+ * @param options - Optional transcription settings including language hint,
148
+ * context prompt, temperature for sampling, and response format.
149
+ * @returns A promise resolving to the normalized transcription result.
150
+ * @throws {Error} When the OpenAI API returns a non-2xx status code.
151
+ *
152
+ * @example
153
+ * ```ts
154
+ * const result = await provider.transcribe(
155
+ * { data: mp3Buffer, mimeType: 'audio/mpeg', fileName: 'voice.mp3' },
156
+ * { language: 'fr', prompt: 'Discussion about AI' },
157
+ * );
158
+ * ```
159
+ */
52
160
  async transcribe(audio, options = {}) {
53
161
  const form = new FormData();
54
162
  const responseFormat = (options.responseFormat ?? 'verbose_json');
55
163
  const model = options.model ?? this.config.model ?? 'whisper-1';
164
+ // Generate a filename with the correct extension for Whisper's format detection
56
165
  const fileName = audio.fileName ?? `speech.${audio.format ?? 'wav'}`;
166
+ // Build the multipart form payload — Whisper requires a file upload
57
167
  form.append('file', new Blob([Uint8Array.from(audio.data)], { type: audio.mimeType ?? 'audio/wav' }), fileName);
58
168
  form.append('model', model);
59
169
  form.append('response_format', responseFormat);
170
+ // Optional fields — appended only when truthy, so unset or empty language/prompt values are never sent
60
171
  if (options.language)
61
172
  form.append('language', options.language);
62
173
  if (options.prompt)
@@ -68,6 +179,7 @@ export class OpenAIWhisperSpeechToTextProvider {
68
179
  method: 'POST',
69
180
  headers: {
70
181
  Authorization: `Bearer ${this.config.apiKey}`,
182
+ // Content-Type is NOT set — FormData sets it automatically with boundary
71
183
  },
72
184
  body: form,
73
185
  });
@@ -75,6 +187,8 @@ export class OpenAIWhisperSpeechToTextProvider {
75
187
  const message = await response.text();
76
188
  throw new Error(`OpenAI Whisper transcription failed (${response.status}): ${message}`);
77
189
  }
190
+ // Plain text responses (format=text, or server returning text/plain)
191
+ // don't have structured data — return minimal result with just the text.
78
192
  if (responseFormat === 'text' || response.headers.get('content-type')?.includes('text/plain')) {
79
193
  const text = await response.text();
80
194
  return {
@@ -89,6 +203,7 @@ export class OpenAIWhisperSpeechToTextProvider {
89
203
  },
90
204
  };
91
205
  }
206
+ // JSON responses (verbose_json or json) — parse and normalize
92
207
  const payload = (await response.json());
93
208
  const durationSeconds = typeof payload.duration === 'number' ? payload.duration : audio.durationSeconds;
94
209
  return {