@framers/agentos 0.1.110 → 0.1.111
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/api/agency.d.ts.map +1 -1
- package/dist/api/agency.js +38 -2
- package/dist/api/agency.js.map +1 -1
- package/dist/api/agent.js +1 -1
- package/dist/api/agent.js.map +1 -1
- package/dist/api/strategies/debate.d.ts.map +1 -1
- package/dist/api/strategies/debate.js.map +1 -1
- package/dist/api/strategies/graph.d.ts.map +1 -1
- package/dist/api/strategies/graph.js +1 -2
- package/dist/api/strategies/graph.js.map +1 -1
- package/dist/api/strategies/hierarchical.d.ts.map +1 -1
- package/dist/api/strategies/hierarchical.js +1 -2
- package/dist/api/strategies/hierarchical.js.map +1 -1
- package/dist/api/strategies/index.d.ts +1 -9
- package/dist/api/strategies/index.d.ts.map +1 -1
- package/dist/api/strategies/index.js +1 -11
- package/dist/api/strategies/index.js.map +1 -1
- package/dist/api/strategies/parallel.d.ts.map +1 -1
- package/dist/api/strategies/parallel.js +23 -4
- package/dist/api/strategies/parallel.js.map +1 -1
- package/dist/api/strategies/review-loop.d.ts.map +1 -1
- package/dist/api/strategies/review-loop.js.map +1 -1
- package/dist/api/strategies/sequential.d.ts.map +1 -1
- package/dist/api/strategies/sequential.js +1 -2
- package/dist/api/strategies/sequential.js.map +1 -1
- package/dist/api/strategies/shared.d.ts +8 -0
- package/dist/api/strategies/shared.d.ts.map +1 -1
- package/dist/api/strategies/shared.js +10 -1
- package/dist/api/strategies/shared.js.map +1 -1
- package/dist/api/types.d.ts +6 -0
- package/dist/api/types.d.ts.map +1 -1
- package/dist/api/types.js.map +1 -1
- package/dist/memory/AgentMemory.d.ts +2 -1
- package/dist/memory/AgentMemory.d.ts.map +1 -1
- package/dist/memory/AgentMemory.js +1 -1
- package/dist/memory/AgentMemory.js.map +1 -1
- package/dist/memory/CognitiveMemoryManager.d.ts.map +1 -1
- package/dist/memory/CognitiveMemoryManager.js +7 -2
- package/dist/memory/CognitiveMemoryManager.js.map +1 -1
- package/dist/memory/facade/Memory.d.ts.map +1 -1
- package/dist/memory/facade/Memory.js +6 -9
- package/dist/memory/facade/Memory.js.map +1 -1
- package/dist/memory/store/MemoryStore.d.ts +9 -0
- package/dist/memory/store/MemoryStore.d.ts.map +1 -1
- package/dist/memory/store/MemoryStore.js +66 -6
- package/dist/memory/store/MemoryStore.js.map +1 -1
- package/dist/memory/store/SqliteMemoryGraph.d.ts.map +1 -1
- package/dist/memory/store/SqliteMemoryGraph.js +27 -13
- package/dist/memory/store/SqliteMemoryGraph.js.map +1 -1
- package/dist/speech/FallbackProxy.d.ts +194 -41
- package/dist/speech/FallbackProxy.d.ts.map +1 -1
- package/dist/speech/FallbackProxy.js +155 -32
- package/dist/speech/FallbackProxy.js.map +1 -1
- package/dist/speech/SpeechProviderResolver.d.ts +278 -36
- package/dist/speech/SpeechProviderResolver.d.ts.map +1 -1
- package/dist/speech/SpeechProviderResolver.js +306 -40
- package/dist/speech/SpeechProviderResolver.js.map +1 -1
- package/dist/speech/providers/AssemblyAISTTProvider.d.ts +119 -19
- package/dist/speech/providers/AssemblyAISTTProvider.d.ts.map +1 -1
- package/dist/speech/providers/AssemblyAISTTProvider.js +153 -25
- package/dist/speech/providers/AssemblyAISTTProvider.js.map +1 -1
- package/dist/speech/providers/AzureSpeechSTTProvider.d.ts +121 -17
- package/dist/speech/providers/AzureSpeechSTTProvider.d.ts.map +1 -1
- package/dist/speech/providers/AzureSpeechSTTProvider.js +122 -14
- package/dist/speech/providers/AzureSpeechSTTProvider.js.map +1 -1
- package/dist/speech/providers/AzureSpeechTTSProvider.d.ts +130 -15
- package/dist/speech/providers/AzureSpeechTTSProvider.d.ts.map +1 -1
- package/dist/speech/providers/AzureSpeechTTSProvider.js +163 -18
- package/dist/speech/providers/AzureSpeechTTSProvider.js.map +1 -1
- package/dist/speech/providers/BuiltInAdaptiveVadProvider.d.ts +159 -0
- package/dist/speech/providers/BuiltInAdaptiveVadProvider.d.ts.map +1 -1
- package/dist/speech/providers/BuiltInAdaptiveVadProvider.js +119 -0
- package/dist/speech/providers/BuiltInAdaptiveVadProvider.js.map +1 -1
- package/dist/speech/providers/DeepgramBatchSTTProvider.d.ts +102 -16
- package/dist/speech/providers/DeepgramBatchSTTProvider.d.ts.map +1 -1
- package/dist/speech/providers/DeepgramBatchSTTProvider.js +108 -13
- package/dist/speech/providers/DeepgramBatchSTTProvider.js.map +1 -1
- package/dist/speech/providers/ElevenLabsTextToSpeechProvider.d.ts +149 -0
- package/dist/speech/providers/ElevenLabsTextToSpeechProvider.d.ts.map +1 -1
- package/dist/speech/providers/ElevenLabsTextToSpeechProvider.js +137 -2
- package/dist/speech/providers/ElevenLabsTextToSpeechProvider.js.map +1 -1
- package/dist/speech/providers/OpenAITextToSpeechProvider.d.ts +125 -0
- package/dist/speech/providers/OpenAITextToSpeechProvider.d.ts.map +1 -1
- package/dist/speech/providers/OpenAITextToSpeechProvider.js +128 -4
- package/dist/speech/providers/OpenAITextToSpeechProvider.js.map +1 -1
- package/dist/speech/providers/OpenAIWhisperSpeechToTextProvider.d.ts +110 -0
- package/dist/speech/providers/OpenAIWhisperSpeechToTextProvider.d.ts.map +1 -1
- package/dist/speech/providers/OpenAIWhisperSpeechToTextProvider.js +115 -0
- package/dist/speech/providers/OpenAIWhisperSpeechToTextProvider.js.map +1 -1
- package/package.json +1 -1
|
@@ -1,48 +1,148 @@
|
|
|
1
1
|
import type { SpeechAudioInput, SpeechToTextProvider, SpeechTranscriptionOptions, SpeechTranscriptionResult } from '../types.js';
|
|
2
|
-
/**
|
|
2
|
+
/**
|
|
3
|
+
* Configuration for the {@link AssemblyAISTTProvider}.
|
|
4
|
+
*
|
|
5
|
+
* @see {@link AssemblyAISTTProvider} for usage examples
|
|
6
|
+
*/
|
|
3
7
|
export interface AssemblyAISTTProviderConfig {
|
|
4
|
-
/**
|
|
8
|
+
/**
|
|
9
|
+
* AssemblyAI API key used for authentication.
|
|
10
|
+
* Sent as the `Authorization` header value (without a prefix like "Bearer").
|
|
11
|
+
* Obtain from https://www.assemblyai.com/dashboard/account
|
|
12
|
+
*/
|
|
5
13
|
apiKey: string;
|
|
6
14
|
/**
|
|
7
|
-
* Custom fetch implementation
|
|
8
|
-
*
|
|
15
|
+
* Custom fetch implementation for dependency injection in tests.
|
|
16
|
+
* When omitted, the global `fetch` is used.
|
|
17
|
+
* @default globalThis.fetch
|
|
9
18
|
*/
|
|
10
19
|
fetchImpl?: typeof fetch;
|
|
11
20
|
}
|
|
12
21
|
/**
|
|
13
22
|
* Speech-to-text provider that uses the AssemblyAI async transcription API.
|
|
14
23
|
*
|
|
15
|
-
*
|
|
16
|
-
*
|
|
17
|
-
*
|
|
18
|
-
*
|
|
19
|
-
*
|
|
24
|
+
* ## Three-Step Workflow
|
|
25
|
+
*
|
|
26
|
+
* AssemblyAI uses an asynchronous transcription pipeline that requires three
|
|
27
|
+
* sequential HTTP requests:
|
|
28
|
+
*
|
|
29
|
+
* 1. **Upload** — `POST /v2/upload` sends the raw audio bytes to AssemblyAI's
|
|
30
|
+
* CDN and returns an `upload_url`. This step is necessary because the
|
|
31
|
+
* transcript endpoint accepts URLs, not raw audio.
|
|
32
|
+
*
|
|
33
|
+
* 2. **Submit** — `POST /v2/transcript` creates a transcription job referencing
|
|
34
|
+
* the upload URL. Returns a transcript `id` used for polling. Optional
|
|
35
|
+
* features like `speaker_labels` are enabled in this request's JSON body.
|
|
36
|
+
*
|
|
37
|
+
* 3. **Poll** — `GET /v2/transcript/:id` is called every {@link POLL_INTERVAL_MS}
|
|
38
|
+
* (1 second) until the transcript `status` transitions to `'completed'` or
|
|
39
|
+
* `'error'`. The polling loop is bounded by {@link DEFAULT_TIMEOUT_MS}
|
|
40
|
+
* (120 seconds) to prevent indefinite waiting.
|
|
41
|
+
*
|
|
42
|
+
* ## AbortController Usage
|
|
43
|
+
*
|
|
44
|
+
* An optional `AbortSignal` can be passed via
|
|
45
|
+
* `options.providerSpecificOptions.signal` to cancel the transcription at any
|
|
46
|
+
* point. The signal is forwarded to all three fetch calls and also checked at
|
|
47
|
+
* the top of each polling iteration. When aborted, an error is thrown
|
|
48
|
+
* immediately without waiting for the current fetch to complete.
|
|
49
|
+
*
|
|
50
|
+
* ## Error Handling
|
|
51
|
+
*
|
|
52
|
+
* - Non-2xx responses at any step throw an `Error` with the HTTP status and body.
|
|
53
|
+
* - `status === 'error'` on the transcript throws with AssemblyAI's error message.
|
|
54
|
+
* - Timeout expiry throws with the transcript ID for manual inspection.
|
|
55
|
+
* - Aborted signals throw with a descriptive cancellation message.
|
|
56
|
+
*
|
|
57
|
+
* @see {@link AssemblyAISTTProviderConfig} for configuration options
|
|
58
|
+
* @see {@link AssemblyAITranscript} for the polling response shape
|
|
20
59
|
*
|
|
21
60
|
* @example
|
|
22
61
|
* ```ts
|
|
23
|
-
* const provider = new AssemblyAISTTProvider({
|
|
24
|
-
*
|
|
25
|
-
*
|
|
62
|
+
* const provider = new AssemblyAISTTProvider({
|
|
63
|
+
* apiKey: process.env.ASSEMBLYAI_API_KEY!,
|
|
64
|
+
* });
|
|
65
|
+
*
|
|
66
|
+
* // Basic transcription
|
|
67
|
+
* const result = await provider.transcribe({ data: audioBuffer });
|
|
68
|
+
*
|
|
69
|
+
* // With diarization and cancellation support
|
|
70
|
+
* const controller = new AbortController();
|
|
71
|
+
* const result = await provider.transcribe(
|
|
72
|
+
* { data: audioBuffer },
|
|
73
|
+
* {
|
|
74
|
+
* enableSpeakerDiarization: true,
|
|
75
|
+
* providerSpecificOptions: { signal: controller.signal },
|
|
76
|
+
* },
|
|
77
|
+
* );
|
|
26
78
|
* ```
|
|
27
79
|
*/
|
|
28
80
|
export declare class AssemblyAISTTProvider implements SpeechToTextProvider {
|
|
29
81
|
private readonly config;
|
|
82
|
+
/** Unique provider identifier used for registration and resolution. */
|
|
30
83
|
readonly id = "assemblyai";
|
|
84
|
+
/** Human-readable display name for UI and logging. */
|
|
31
85
|
readonly displayName = "AssemblyAI";
|
|
86
|
+
/**
|
|
87
|
+
* Streaming is not supported by this provider's async pipeline.
|
|
88
|
+
* AssemblyAI does offer a separate real-time streaming API via WebSocket,
|
|
89
|
+
* but that would be a different provider implementation.
|
|
90
|
+
*/
|
|
32
91
|
readonly supportsStreaming = false;
|
|
92
|
+
/** Fetch implementation — injected for testability, defaults to global fetch. */
|
|
33
93
|
private readonly fetchImpl;
|
|
94
|
+
/**
|
|
95
|
+
* Creates a new AssemblyAISTTProvider.
|
|
96
|
+
*
|
|
97
|
+
* @param config - Provider configuration including the API key.
|
|
98
|
+
*
|
|
99
|
+
* @example
|
|
100
|
+
* ```ts
|
|
101
|
+
* const provider = new AssemblyAISTTProvider({
|
|
102
|
+
* apiKey: 'your-assemblyai-api-key',
|
|
103
|
+
* });
|
|
104
|
+
* ```
|
|
105
|
+
*/
|
|
34
106
|
constructor(config: AssemblyAISTTProviderConfig);
|
|
35
|
-
/**
|
|
107
|
+
/**
|
|
108
|
+
* Returns the human-readable provider name.
|
|
109
|
+
*
|
|
110
|
+
* @returns The display name string `'AssemblyAI'`.
|
|
111
|
+
*
|
|
112
|
+
* @example
|
|
113
|
+
* ```ts
|
|
114
|
+
* provider.getProviderName(); // 'AssemblyAI'
|
|
115
|
+
* ```
|
|
116
|
+
*/
|
|
36
117
|
getProviderName(): string;
|
|
37
118
|
/**
|
|
38
|
-
* Transcribes an audio buffer via the AssemblyAI async pipeline
|
|
119
|
+
* Transcribes an audio buffer via the AssemblyAI three-step async pipeline:
|
|
120
|
+
* upload, submit, and poll.
|
|
39
121
|
*
|
|
40
|
-
* @param audio - Raw audio data and associated metadata.
|
|
122
|
+
* @param audio - Raw audio data and associated metadata. The `data` buffer
|
|
123
|
+
* is uploaded to AssemblyAI's CDN in step 1.
|
|
41
124
|
* @param options - Optional transcription settings. Pass
|
|
42
|
-
* `providerSpecificOptions.signal` (an {@link AbortSignal}) to cancel
|
|
43
|
-
*
|
|
44
|
-
* @
|
|
45
|
-
*
|
|
125
|
+
* `providerSpecificOptions.signal` (an {@link AbortSignal}) to cancel
|
|
126
|
+
* at any point in the pipeline.
|
|
127
|
+
* @returns A promise resolving to the normalized transcription result.
|
|
128
|
+
* @throws {Error} When the upload API returns a non-2xx status.
|
|
129
|
+
* @throws {Error} When the transcript submit API returns a non-2xx status.
|
|
130
|
+
* @throws {Error} When the polling API returns a non-2xx status.
|
|
131
|
+
* @throws {Error} When the transcript status becomes `'error'` (includes
|
|
132
|
+
* AssemblyAI's error message, e.g. "Audio file could not be decoded").
|
|
133
|
+
* @throws {Error} When the 120-second timeout is exceeded (includes the
|
|
134
|
+
* transcript ID for manual inspection via the AssemblyAI dashboard).
|
|
135
|
+
* @throws {Error} When the caller's AbortSignal is triggered.
|
|
136
|
+
*
|
|
137
|
+
* @example
|
|
138
|
+
* ```ts
|
|
139
|
+
* const result = await provider.transcribe(
|
|
140
|
+
* { data: wavBuffer, mimeType: 'audio/wav' },
|
|
141
|
+
* { enableSpeakerDiarization: true, language: 'en' },
|
|
142
|
+
* );
|
|
143
|
+
* console.log(result.text);
|
|
144
|
+
* console.log(result.segments?.map(s => `[${s.speaker}] ${s.text}`));
|
|
145
|
+
* ```
|
|
46
146
|
*/
|
|
47
147
|
transcribe(audio: SpeechAudioInput, options?: SpeechTranscriptionOptions): Promise<SpeechTranscriptionResult>;
|
|
48
148
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"AssemblyAISTTProvider.d.ts","sourceRoot":"","sources":["../../../src/speech/providers/AssemblyAISTTProvider.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACV,gBAAgB,EAChB,oBAAoB,EACpB,0BAA0B,EAC1B,yBAAyB,EAE1B,MAAM,aAAa,CAAC;AAErB
|
|
1
|
+
{"version":3,"file":"AssemblyAISTTProvider.d.ts","sourceRoot":"","sources":["../../../src/speech/providers/AssemblyAISTTProvider.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACV,gBAAgB,EAChB,oBAAoB,EACpB,0BAA0B,EAC1B,yBAAyB,EAE1B,MAAM,aAAa,CAAC;AAErB;;;;GAIG;AACH,MAAM,WAAW,2BAA2B;IAC1C;;;;OAIG;IACH,MAAM,EAAE,MAAM,CAAC;IAEf;;;;OAIG;IACH,SAAS,CAAC,EAAE,OAAO,KAAK,CAAC;CAC1B;AAmHD;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA0DG;AACH,qBAAa,qBAAsB,YAAW,oBAAoB;IA6BpD,OAAO,CAAC,QAAQ,CAAC,MAAM;IA5BnC,uEAAuE;IACvE,SAAgB,EAAE,gBAAgB;IAElC,sDAAsD;IACtD,SAAgB,WAAW,gBAAgB;IAE3C;;;;OAIG;IACH,SAAgB,iBAAiB,SAAS;IAE1C,iFAAiF;IACjF,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAe;IAEzC;;;;;;;;;;;OAWG;gBAC0B,MAAM,EAAE,2BAA2B;IAIhE;;;;;;;;;OASG;IACH,eAAe,IAAI,MAAM;IAIzB;;;;;;;;;;;;;;;;;;;;;;;;;;;;OA4BG;IACG,UAAU,CACd,KAAK,EAAE,gBAAgB,EACvB,OAAO,GAAE,0BAA+B,GACvC,OAAO,CAAC,yBAAyB,CAAC;CAkHtC"}
|
|
@@ -1,20 +1,45 @@
|
|
|
1
|
+
/** Base URL for all AssemblyAI API v2 endpoints. */
|
|
1
2
|
const ASSEMBLYAI_BASE = 'https://api.assemblyai.com/v2';
|
|
2
|
-
/**
|
|
3
|
+
/**
|
|
4
|
+
* Maximum time (in milliseconds) to wait for a transcript to complete
|
|
5
|
+
* before throwing a timeout error.
|
|
6
|
+
*
|
|
7
|
+
* 120 seconds is generous — most transcripts complete within 10–30 seconds.
|
|
8
|
+
* The timeout exists to prevent indefinite polling in case of AssemblyAI
|
|
9
|
+
* service degradation or stuck transcripts.
|
|
10
|
+
*/
|
|
3
11
|
const DEFAULT_TIMEOUT_MS = 120000;
|
|
4
|
-
/**
|
|
12
|
+
/**
|
|
13
|
+
* Polling interval (in milliseconds) between transcript status checks.
|
|
14
|
+
*
|
|
15
|
+
* 1 second strikes a balance between responsiveness and API rate limiting.
|
|
16
|
+
* AssemblyAI does not document a rate limit for polling, but 1-second
|
|
17
|
+
* intervals are considered polite and are used in their official examples.
|
|
18
|
+
*/
|
|
5
19
|
const POLL_INTERVAL_MS = 1000;
|
|
6
20
|
/**
|
|
7
21
|
* Maps AssemblyAI word objects to {@link SpeechTranscriptionSegment} entries.
|
|
8
22
|
*
|
|
9
23
|
* Each word becomes its own segment so that per-word timing and speaker
|
|
10
|
-
* attribution are preserved in the
|
|
24
|
+
* attribution are preserved in the normalized result.
|
|
25
|
+
*
|
|
26
|
+
* **Important:** AssemblyAI returns word timings in milliseconds, so we
|
|
27
|
+
* divide by 1000 to convert to seconds for consistency with our normalized
|
|
28
|
+
* {@link SpeechTranscriptionSegment} interface (which uses seconds).
|
|
29
|
+
*
|
|
30
|
+
* @param words - Array of AssemblyAI word objects with millisecond timings.
|
|
31
|
+
* @returns An array of normalized transcription segments with second-based timings.
|
|
32
|
+
*
|
|
33
|
+
* @see {@link AssemblyAIWord} for the input shape
|
|
34
|
+
* @see {@link SpeechTranscriptionSegment} for the output shape
|
|
11
35
|
*/
|
|
12
36
|
function wordsToSegments(words) {
|
|
13
37
|
return words.map((w) => ({
|
|
14
38
|
text: w.text,
|
|
15
|
-
startTime: w.start / 1000, // AssemblyAI returns milliseconds
|
|
39
|
+
startTime: w.start / 1000, // AssemblyAI returns milliseconds -> convert to seconds
|
|
16
40
|
endTime: w.end / 1000,
|
|
17
41
|
confidence: w.confidence,
|
|
42
|
+
// Convert null speaker labels to undefined for type consistency
|
|
18
43
|
speaker: w.speaker ?? undefined,
|
|
19
44
|
words: [
|
|
20
45
|
{
|
|
@@ -29,45 +54,139 @@ function wordsToSegments(words) {
|
|
|
29
54
|
/**
|
|
30
55
|
* Speech-to-text provider that uses the AssemblyAI async transcription API.
|
|
31
56
|
*
|
|
32
|
-
*
|
|
33
|
-
*
|
|
34
|
-
*
|
|
35
|
-
*
|
|
36
|
-
*
|
|
57
|
+
* ## Three-Step Workflow
|
|
58
|
+
*
|
|
59
|
+
* AssemblyAI uses an asynchronous transcription pipeline that requires three
|
|
60
|
+
* sequential HTTP requests:
|
|
61
|
+
*
|
|
62
|
+
* 1. **Upload** — `POST /v2/upload` sends the raw audio bytes to AssemblyAI's
|
|
63
|
+
* CDN and returns an `upload_url`. This step is necessary because the
|
|
64
|
+
* transcript endpoint accepts URLs, not raw audio.
|
|
65
|
+
*
|
|
66
|
+
* 2. **Submit** — `POST /v2/transcript` creates a transcription job referencing
|
|
67
|
+
* the upload URL. Returns a transcript `id` used for polling. Optional
|
|
68
|
+
* features like `speaker_labels` are enabled in this request's JSON body.
|
|
69
|
+
*
|
|
70
|
+
* 3. **Poll** — `GET /v2/transcript/:id` is called every {@link POLL_INTERVAL_MS}
|
|
71
|
+
* (1 second) until the transcript `status` transitions to `'completed'` or
|
|
72
|
+
* `'error'`. The polling loop is bounded by {@link DEFAULT_TIMEOUT_MS}
|
|
73
|
+
* (120 seconds) to prevent indefinite waiting.
|
|
74
|
+
*
|
|
75
|
+
* ## AbortController Usage
|
|
76
|
+
*
|
|
77
|
+
* An optional `AbortSignal` can be passed via
|
|
78
|
+
* `options.providerSpecificOptions.signal` to cancel the transcription at any
|
|
79
|
+
* point. The signal is forwarded to all three fetch calls and also checked at
|
|
80
|
+
* the top of each polling iteration. When aborted, an error is thrown
|
|
81
|
+
* immediately without waiting for the current fetch to complete.
|
|
82
|
+
*
|
|
83
|
+
* ## Error Handling
|
|
84
|
+
*
|
|
85
|
+
* - Non-2xx responses at any step throw an `Error` with the HTTP status and body.
|
|
86
|
+
* - `status === 'error'` on the transcript throws with AssemblyAI's error message.
|
|
87
|
+
* - Timeout expiry throws with the transcript ID for manual inspection.
|
|
88
|
+
* - Aborted signals throw with a descriptive cancellation message.
|
|
89
|
+
*
|
|
90
|
+
* @see {@link AssemblyAISTTProviderConfig} for configuration options
|
|
91
|
+
* @see {@link AssemblyAITranscript} for the polling response shape
|
|
37
92
|
*
|
|
38
93
|
* @example
|
|
39
94
|
* ```ts
|
|
40
|
-
* const provider = new AssemblyAISTTProvider({
|
|
41
|
-
*
|
|
42
|
-
*
|
|
95
|
+
* const provider = new AssemblyAISTTProvider({
|
|
96
|
+
* apiKey: process.env.ASSEMBLYAI_API_KEY!,
|
|
97
|
+
* });
|
|
98
|
+
*
|
|
99
|
+
* // Basic transcription
|
|
100
|
+
* const result = await provider.transcribe({ data: audioBuffer });
|
|
101
|
+
*
|
|
102
|
+
* // With diarization and cancellation support
|
|
103
|
+
* const controller = new AbortController();
|
|
104
|
+
* const result = await provider.transcribe(
|
|
105
|
+
* { data: audioBuffer },
|
|
106
|
+
* {
|
|
107
|
+
* enableSpeakerDiarization: true,
|
|
108
|
+
* providerSpecificOptions: { signal: controller.signal },
|
|
109
|
+
* },
|
|
110
|
+
* );
|
|
43
111
|
* ```
|
|
44
112
|
*/
|
|
45
113
|
export class AssemblyAISTTProvider {
|
|
114
|
+
/**
|
|
115
|
+
* Creates a new AssemblyAISTTProvider.
|
|
116
|
+
*
|
|
117
|
+
* @param config - Provider configuration including the API key.
|
|
118
|
+
*
|
|
119
|
+
* @example
|
|
120
|
+
* ```ts
|
|
121
|
+
* const provider = new AssemblyAISTTProvider({
|
|
122
|
+
* apiKey: 'your-assemblyai-api-key',
|
|
123
|
+
* });
|
|
124
|
+
* ```
|
|
125
|
+
*/
|
|
46
126
|
constructor(config) {
|
|
47
127
|
this.config = config;
|
|
128
|
+
/** Unique provider identifier used for registration and resolution. */
|
|
48
129
|
this.id = 'assemblyai';
|
|
130
|
+
/** Human-readable display name for UI and logging. */
|
|
49
131
|
this.displayName = 'AssemblyAI';
|
|
132
|
+
/**
|
|
133
|
+
* Streaming is not supported by this provider's async pipeline.
|
|
134
|
+
* AssemblyAI does offer a separate real-time streaming API via WebSocket,
|
|
135
|
+
* but that would be a different provider implementation.
|
|
136
|
+
*/
|
|
50
137
|
this.supportsStreaming = false;
|
|
51
138
|
this.fetchImpl = config.fetchImpl ?? fetch;
|
|
52
139
|
}
|
|
53
|
-
/**
|
|
140
|
+
/**
|
|
141
|
+
* Returns the human-readable provider name.
|
|
142
|
+
*
|
|
143
|
+
* @returns The display name string `'AssemblyAI'`.
|
|
144
|
+
*
|
|
145
|
+
* @example
|
|
146
|
+
* ```ts
|
|
147
|
+
* provider.getProviderName(); // 'AssemblyAI'
|
|
148
|
+
* ```
|
|
149
|
+
*/
|
|
54
150
|
getProviderName() {
|
|
55
151
|
return this.displayName;
|
|
56
152
|
}
|
|
57
153
|
/**
|
|
58
|
-
* Transcribes an audio buffer via the AssemblyAI async pipeline
|
|
154
|
+
* Transcribes an audio buffer via the AssemblyAI three-step async pipeline:
|
|
155
|
+
* upload, submit, and poll.
|
|
59
156
|
*
|
|
60
|
-
* @param audio - Raw audio data and associated metadata.
|
|
157
|
+
* @param audio - Raw audio data and associated metadata. The `data` buffer
|
|
158
|
+
* is uploaded to AssemblyAI's CDN in step 1.
|
|
61
159
|
* @param options - Optional transcription settings. Pass
|
|
62
|
-
* `providerSpecificOptions.signal` (an {@link AbortSignal}) to cancel
|
|
63
|
-
*
|
|
64
|
-
* @
|
|
65
|
-
*
|
|
160
|
+
* `providerSpecificOptions.signal` (an {@link AbortSignal}) to cancel
|
|
161
|
+
* at any point in the pipeline.
|
|
162
|
+
* @returns A promise resolving to the normalized transcription result.
|
|
163
|
+
* @throws {Error} When the upload API returns a non-2xx status.
|
|
164
|
+
* @throws {Error} When the transcript submit API returns a non-2xx status.
|
|
165
|
+
* @throws {Error} When the polling API returns a non-2xx status.
|
|
166
|
+
* @throws {Error} When the transcript status becomes `'error'` (includes
|
|
167
|
+
* AssemblyAI's error message, e.g. "Audio file could not be decoded").
|
|
168
|
+
* @throws {Error} When the 120-second timeout is exceeded (includes the
|
|
169
|
+
* transcript ID for manual inspection via the AssemblyAI dashboard).
|
|
170
|
+
* @throws {Error} When the caller's AbortSignal is triggered.
|
|
171
|
+
*
|
|
172
|
+
* @example
|
|
173
|
+
* ```ts
|
|
174
|
+
* const result = await provider.transcribe(
|
|
175
|
+
* { data: wavBuffer, mimeType: 'audio/wav' },
|
|
176
|
+
* { enableSpeakerDiarization: true, language: 'en' },
|
|
177
|
+
* );
|
|
178
|
+
* console.log(result.text);
|
|
179
|
+
* console.log(result.segments?.map(s => `[${s.speaker}] ${s.text}`));
|
|
180
|
+
* ```
|
|
66
181
|
*/
|
|
67
182
|
async transcribe(audio, options = {}) {
|
|
183
|
+
// Extract the optional AbortSignal for cancellation support.
|
|
184
|
+
// Cast is safe because we document the expected type in the JSDoc.
|
|
68
185
|
const signal = options.providerSpecificOptions?.signal;
|
|
69
186
|
const timeoutMs = DEFAULT_TIMEOUT_MS;
|
|
70
|
-
// ── Step 1: Upload audio
|
|
187
|
+
// ── Step 1: Upload audio to AssemblyAI's CDN ──────────────────────────
|
|
188
|
+
// The upload endpoint returns an `upload_url` that the transcript
|
|
189
|
+
// endpoint can reference. This avoids sending raw bytes to /transcript.
|
|
71
190
|
const uploadResponse = await this.fetchImpl(`${ASSEMBLYAI_BASE}/upload`, {
|
|
72
191
|
method: 'POST',
|
|
73
192
|
headers: {
|
|
@@ -82,7 +201,9 @@ export class AssemblyAISTTProvider {
|
|
|
82
201
|
throw new Error(`AssemblyAI upload failed (${uploadResponse.status}): ${msg}`);
|
|
83
202
|
}
|
|
84
203
|
const { upload_url } = (await uploadResponse.json());
|
|
85
|
-
// ── Step 2: Submit transcript request
|
|
204
|
+
// ── Step 2: Submit transcript request ─────────────────────────────────
|
|
205
|
+
// Create a transcription job with the uploaded audio URL and any
|
|
206
|
+
// optional features like speaker diarization.
|
|
86
207
|
const submitPayload = {
|
|
87
208
|
audio_url: upload_url,
|
|
88
209
|
speaker_labels: options.enableSpeakerDiarization ?? false,
|
|
@@ -103,12 +224,16 @@ export class AssemblyAISTTProvider {
|
|
|
103
224
|
throw new Error(`AssemblyAI transcript submit failed (${submitResponse.status}): ${msg}`);
|
|
104
225
|
}
|
|
105
226
|
const { id } = (await submitResponse.json());
|
|
106
|
-
// ── Step 3: Poll until completed
|
|
227
|
+
// ── Step 3: Poll until completed or error ─────────────────────────────
|
|
228
|
+
// Check the transcript status every POLL_INTERVAL_MS until it reaches
|
|
229
|
+
// a terminal state or the timeout is exceeded.
|
|
107
230
|
const deadline = Date.now() + timeoutMs;
|
|
108
231
|
while (true) {
|
|
232
|
+
// Check for caller-initiated cancellation before each poll
|
|
109
233
|
if (signal?.aborted) {
|
|
110
234
|
throw new Error('AssemblyAI transcription aborted by caller signal');
|
|
111
235
|
}
|
|
236
|
+
// Check for timeout before each poll to avoid one extra unnecessary request
|
|
112
237
|
if (Date.now() >= deadline) {
|
|
113
238
|
throw new Error(`AssemblyAI transcription timed out after ${timeoutMs / 1000}s (transcript id: ${id})`);
|
|
114
239
|
}
|
|
@@ -121,9 +246,11 @@ export class AssemblyAISTTProvider {
|
|
|
121
246
|
throw new Error(`AssemblyAI poll failed (${pollResponse.status}): ${msg}`);
|
|
122
247
|
}
|
|
123
248
|
const transcript = (await pollResponse.json());
|
|
249
|
+
// Terminal state: transcription failed on AssemblyAI's side
|
|
124
250
|
if (transcript.status === 'error') {
|
|
125
251
|
throw new Error(`AssemblyAI transcription error: ${transcript.error ?? 'unknown error'}`);
|
|
126
252
|
}
|
|
253
|
+
// Terminal state: transcription succeeded — normalize and return
|
|
127
254
|
if (transcript.status === 'completed') {
|
|
128
255
|
const text = transcript.text ?? '';
|
|
129
256
|
const durationSeconds = transcript.audio_duration ?? audio.durationSeconds;
|
|
@@ -133,17 +260,18 @@ export class AssemblyAISTTProvider {
|
|
|
133
260
|
language: transcript.language_code ?? options.language,
|
|
134
261
|
durationSeconds,
|
|
135
262
|
confidence: transcript.confidence ?? undefined,
|
|
136
|
-
cost: 0,
|
|
263
|
+
cost: 0, // Cost tracking is handled at a higher layer
|
|
137
264
|
segments: words.length > 0 ? wordsToSegments(words) : undefined,
|
|
138
265
|
providerResponse: transcript,
|
|
139
|
-
isFinal: true,
|
|
266
|
+
isFinal: true, // Async API always returns final results
|
|
140
267
|
usage: {
|
|
141
268
|
durationMinutes: (durationSeconds ?? 0) / 60,
|
|
142
269
|
modelUsed: 'assemblyai',
|
|
143
270
|
},
|
|
144
271
|
};
|
|
145
272
|
}
|
|
146
|
-
//
|
|
273
|
+
// Non-terminal state ('queued' or 'processing') — wait before polling again.
|
|
274
|
+
// Using setTimeout instead of a busy loop to yield the event loop.
|
|
147
275
|
await new Promise((resolve) => setTimeout(resolve, POLL_INTERVAL_MS));
|
|
148
276
|
}
|
|
149
277
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"AssemblyAISTTProvider.js","sourceRoot":"","sources":["../../../src/speech/providers/AssemblyAISTTProvider.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"AssemblyAISTTProvider.js","sourceRoot":"","sources":["../../../src/speech/providers/AssemblyAISTTProvider.ts"],"names":[],"mappings":"AAqFA,oDAAoD;AACpD,MAAM,eAAe,GAAG,+BAA+B,CAAC;AAExD;;;;;;;GAOG;AACH,MAAM,kBAAkB,GAAG,MAAO,CAAC;AAEnC;;;;;;GAMG;AACH,MAAM,gBAAgB,GAAG,IAAK,CAAC;AAE/B;;;;;;;;;;;;;;;GAeG;AACH,SAAS,eAAe,CAAC,KAAuB;IAC9C,OAAO,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;QACvB,IAAI,EAAE,CAAC,CAAC,IAAI;QACZ,SAAS,EAAE,CAAC,CAAC,KAAK,GAAG,IAAI,EAAE,wDAAwD;QACnF,OAAO,EAAE,CAAC,CAAC,GAAG,GAAG,IAAI;QACrB,UAAU,EAAE,CAAC,CAAC,UAAU;QACxB,gEAAgE;QAChE,OAAO,EAAE,CAAC,CAAC,OAAO,IAAI,SAAS;QAC/B,KAAK,EAAE;YACL;gBACE,IAAI,EAAE,CAAC,CAAC,IAAI;gBACZ,KAAK,EAAE,CAAC,CAAC,KAAK,GAAG,IAAI;gBACrB,GAAG,EAAE,CAAC,CAAC,GAAG,GAAG,IAAI;gBACjB,UAAU,EAAE,CAAC,CAAC,UAAU;aACzB;SACF;KACF,CAAC,CAAC,CAAC;AACN,CAAC;AAED;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA0DG;AACH,MAAM,OAAO,qBAAqB;IAiBhC;;;;;;;;;;;OAWG;IACH,YAA6B,MAAmC;QAAnC,WAAM,GAAN,MAAM,CAA6B;QA5BhE,uEAAuE;QACvD,OAAE,GAAG,YAAY,CAAC;QAElC,sDAAsD;QACtC,gBAAW,GAAG,YAAY,CAAC;QAE3C;;;;WAIG;QACa,sBAAiB,GAAG,KAAK,CAAC;QAkBxC,IAAI,CAAC,SAAS,GAAG,MAAM,CAAC,SAAS,IAAI,KAAK,CAAC;IAC7C,CAAC;IAED;;;;;;;;;OASG;IACH,eAAe;QACb,OAAO,IAAI,CAAC,WAAW,CAAC;IAC1B,CAAC;IAED;;;;;;;;;;;;;;;;;;;;;;;;;;;;OA4BG;IACH,KAAK,CAAC,UAAU,CACd,KAAuB,EACvB,UAAsC,EAAE;QAExC,6DAA6D;QAC7D,mEAAmE;QACnE,MAAM,MAAM,GAAG,OAAO,CAAC,uBAAuB,EAAE,MAAiC,CAAC;QAClF,MAAM,SAAS,GAAG,kBAAkB,CAAC;QAErC,yEAAyE;QACzE,kEAAkE;QAClE,wEAAwE;QACxE,MAAM,cAAc,GAAG,MAAM,IAAI,CAAC,SAAS,CAAC,GAAG,eAAe,SAAS,EAAE;YACvE,MAAM,EAAE,MAAM;YACd,OAAO,EAAE;gBACP,aAAa,EAAE,IAAI,CAAC,MAAM,CAAC,MAAM;gBACjC,cAAc,EAAE,KAAK,CAAC,QAAQ,IAAI,WAAW;aAC9C;YACD,IAAI,EAAE,KAAK,CAAC,IAA2B;YACvC,MAAM;SACP,CAAC,CAAC;QAEH,IAAI,CAAC,cAAc,CAAC,EAAE,EAAE,CAAC;YACvB,MAAM,GAAG,GAAG,MAAM,cAAc,CAAC,IAAI,EAAE,CAAC;YACxC,MAAM,IAAI,KAAK,CAAC,6BAA6B,cAAc,CAAC,MAAM,MAAM,GAAG,EAAE,CAAC,CAAC;QACjF,CAAC;QAED,MAAM,EAAE,UAAU,EAAE,GAAG,CAAC,MAAM,cAAc,CAAC,IAAI,EAAE,CAA2B,CAAC;QAE/E,yEAAyE;QACzE,iEAAiE;QACjE,8CAA8C;QAC9C,MAAM,aAAa,GAA4B;YAC7C,SAAS,EAAE,UAAU;YACrB,cAAc,EAAE,OAAO,CAAC,wBAAwB,IAAI,KAAK;SAC1D,CAAC;QACF,IAAI,OAAO,CAAC,QAAQ;YAAE,aAAa,CAAC,aAAa,GAAG,OAAO,CAAC,QAAQ,CAAC;QAErE,MAAM,cAAc,GAAG,MAAM,IAAI,CAAC,SAAS,CAAC,GAAG,eAAe,aAAa,EAAE;YAC3E,MAAM,EAAE,MAAM;YACd,OAAO,EAAE;gBACP,aAAa,EAAE,IAAI,CAAC,MAAM,CAAC,MAAM;gBACjC,cAAc,EAAE,kBAAkB;aACnC;YACD,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC,aAAa,CAAC;YACnC,MAAM;SACP,CAAC,CAAC;QAEH,IAAI,CAAC,cAAc,CAAC,EAAE,EAAE,CAAC;YACvB,MAAM,GAAG,GAAG,MAAM,cAAc,CAAC,IAAI,EAAE,CAAC;YACxC,MAAM,IAAI,KAAK,CAAC,wCAAwC,cAAc,CAAC,MAAM,MAAM,GAAG,EAAE,CAAC,CAAC;QAC5F,CAAC;QAED,MAAM,EAAE,EAAE,EAAE,GAAG,CAAC,MAAM,cAAc,CAAC,IAAI,EAAE,CAAmB,CAAC;QAE/D,yEAAyE;QACzE,sEAAsE;QACtE,+CAA+C;QAC/C,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;QAExC,OAAO,IAAI,EAAE,CAAC;YACZ,2DAA2D;YAC3D,IAAI,MAAM,EAAE,OAAO,EAAE,CAAC;gBACpB,MAAM,IAAI,KAAK,CAAC,mDAAmD,CAAC,CAAC;YACvE,CAAC;YAED,4EAA4E;YAC5E,IAAI,IAAI,CAAC,GAAG,EAAE,IAAI,QAAQ,EAAE,CAAC;gBAC3B,MAAM,IAAI,KAAK,CACb,4CAA4C,SAAS,GAAG,IAAI,qBAAqB,EAAE,GAAG,CACvF,CAAC;YACJ,CAAC;YAED,MAAM,YAAY,GAAG,MAAM,IAAI,CAAC,SAAS,CAAC,GAAG,eAAe,eAAe,EAAE,EAAE,EAAE;gBAC/E,OAAO,EAAE,EAAE,aAAa,EAAE,IAAI,CAAC,MAAM,CAAC,MAAM,EAAE;gBAC9C,MAAM;aACP,CAAC,CAAC;YAEH,IAAI,CAAC,YAAY,CAAC,EAAE,EAAE,CAAC;gBACrB,MAAM,GAAG,GAAG,MAAM,YAAY,CAAC,IAAI,EAAE,CAAC;gBACtC,MAAM,IAAI,KAAK,CAAC,2BAA2B,YAAY,CAAC,MAAM,MAAM,GAAG,EAAE,CAAC,CAAC;YAC7E,CAAC;YAED,MAAM,UAAU,GAAG,CAAC,MAAM,YAAY,CAAC,IAAI,EAAE,CAAyB,CAAC;YAEvE,4DAA4D;YAC5D,IAAI,UAAU,CAAC,MAAM,KAAK,OAAO,EAAE,CAAC;gBAClC,MAAM,IAAI,KAAK,CAAC,mCAAmC,UAAU,CAAC,KAAK,IAAI,eAAe,EAAE,CAAC,CAAC;YAC5F,CAAC;YAED,iEAAiE;YACjE,IAAI,UAAU,CAAC,MAAM,KAAK,WAAW,EAAE,CAAC;gBACtC,MAAM,IAAI,GAAG,UAAU,CAAC,IAAI,IAAI,EAAE,CAAC;gBACnC,MAAM,eAAe,GAAG,UAAU,CAAC,cAAc,IAAI,KAAK,CAAC,eAAe,CAAC;gBAC3E,MAAM,KAAK,GAAG,UAAU,CAAC,KAAK,IAAI,EAAE,CAAC;gBAErC,OAAO;oBACL,IAAI;oBACJ,QAAQ,EAAE,UAAU,CAAC,aAAa,IAAI,OAAO,CAAC,QAAQ;oBACtD,eAAe;oBACf,UAAU,EAAE,UAAU,CAAC,UAAU,IAAI,SAAS;oBAC9C,IAAI,EAAE,CAAC,EAAE,6CAA6C;oBACtD,QAAQ,EAAE,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,eAAe,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,SAAS;oBAC/D,gBAAgB,EAAE,UAAU;oBAC5B,OAAO,EAAE,IAAI,EAAE,yCAAyC;oBACxD,KAAK,EAAE;wBACL,eAAe,EAAE,CAAC,eAAe,IAAI,CAAC,CAAC,GAAG,EAAE;wBAC5C,SAAS,EAAE,YAAY;qBACxB;iBACF,CAAC;YACJ,CAAC;YAED,6EAA6E;YAC7E,mEAAmE;YACnE,MAAM,IAAI,OAAO,CAAO,CAAC,OAAO,EAAE,EAAE,CAAC,UAAU,CAAC,OAAO,EAAE,gBAAgB,CAAC,CAAC,CAAC;QAC9E,CAAC;IACH,CAAC;CACF"}
|
|
@@ -1,47 +1,151 @@
|
|
|
1
1
|
import type { SpeechAudioInput, SpeechToTextProvider, SpeechTranscriptionOptions, SpeechTranscriptionResult } from '../types.js';
|
|
2
|
-
/**
|
|
2
|
+
/**
|
|
3
|
+
* Configuration for the {@link AzureSpeechSTTProvider}.
|
|
4
|
+
*
|
|
5
|
+
* @see {@link AzureSpeechSTTProvider} for usage examples
|
|
6
|
+
* @see https://learn.microsoft.com/azure/ai-services/speech-service/rest-speech-to-text
|
|
7
|
+
*/
|
|
3
8
|
export interface AzureSpeechSTTProviderConfig {
|
|
4
|
-
/**
|
|
9
|
+
/**
|
|
10
|
+
* Azure Cognitive Services subscription key.
|
|
11
|
+
* Sent as the `Ocp-Apim-Subscription-Key` header — this is Azure's
|
|
12
|
+
* standard authentication mechanism for Cognitive Services REST APIs.
|
|
13
|
+
* Obtain from the Azure portal under your Speech resource's "Keys and Endpoint".
|
|
14
|
+
*/
|
|
5
15
|
key: string;
|
|
6
|
-
/**
|
|
16
|
+
/**
|
|
17
|
+
* Azure region where the Speech resource is deployed, e.g. `'eastus'`,
|
|
18
|
+
* `'westeurope'`, `'southeastasia'`.
|
|
19
|
+
*
|
|
20
|
+
* The region determines the REST endpoint hostname:
|
|
21
|
+
* `https://{region}.stt.speech.microsoft.com`
|
|
22
|
+
*
|
|
23
|
+
* @see https://learn.microsoft.com/azure/ai-services/speech-service/regions
|
|
24
|
+
*/
|
|
7
25
|
region: string;
|
|
8
26
|
/**
|
|
9
|
-
* Custom fetch implementation
|
|
10
|
-
*
|
|
27
|
+
* Custom fetch implementation for dependency injection in tests.
|
|
28
|
+
* @default globalThis.fetch
|
|
11
29
|
*/
|
|
12
30
|
fetchImpl?: typeof fetch;
|
|
13
31
|
}
|
|
14
32
|
/**
|
|
15
33
|
* Speech-to-text provider that uses the Azure Cognitive Services Speech REST API.
|
|
16
34
|
*
|
|
17
|
-
*
|
|
18
|
-
*
|
|
19
|
-
*
|
|
20
|
-
*
|
|
35
|
+
* ## Azure REST Endpoint Format
|
|
36
|
+
*
|
|
37
|
+
* The endpoint URL follows this pattern:
|
|
38
|
+
* ```
|
|
39
|
+
* https://{region}.stt.speech.microsoft.com/speech/recognition/conversation/cognitiveservices/v1?language={lang}
|
|
40
|
+
* ```
|
|
41
|
+
*
|
|
42
|
+
* - `{region}` — The Azure region from config (e.g. `eastus`, `westeurope`).
|
|
43
|
+
* - `{lang}` — BCP-47 language code from options or `'en-US'` default.
|
|
44
|
+
* - The `/conversation/` path segment selects the conversation recognition mode
|
|
45
|
+
* (as opposed to `/interactive/` or `/dictation/`).
|
|
46
|
+
*
|
|
47
|
+
* ## Authentication: `Ocp-Apim-Subscription-Key`
|
|
48
|
+
*
|
|
49
|
+
* Azure Cognitive Services uses the `Ocp-Apim-Subscription-Key` HTTP header
|
|
50
|
+
* for authentication, which differs from the typical `Authorization: Bearer`
|
|
51
|
+
* pattern. The subscription key is sent as a plain-text header value — no
|
|
52
|
+
* "Bearer" or "Token" prefix.
|
|
53
|
+
*
|
|
54
|
+
* An alternative is to use a short-lived token from the token endpoint, but
|
|
55
|
+
* this provider uses the simpler key-based approach for reliability.
|
|
56
|
+
*
|
|
57
|
+
* ## NoMatch Handling
|
|
58
|
+
*
|
|
59
|
+
* When Azure's recognizer detects audio but cannot identify any speech, it
|
|
60
|
+
* returns `RecognitionStatus: 'NoMatch'` instead of raising an HTTP error.
|
|
61
|
+
* This provider maps `NoMatch` to an empty-text result (`text: ''`) with
|
|
62
|
+
* `isFinal: true`, matching the Azure Speech SDK's behaviour. This prevents
|
|
63
|
+
* the fallback proxy from unnecessarily trying another provider when the
|
|
64
|
+
* audio genuinely contains no speech.
|
|
65
|
+
*
|
|
66
|
+
* ## Limitations
|
|
67
|
+
*
|
|
68
|
+
* - Audio must be PCM WAV format. The `Content-Type` is hardcoded to
|
|
69
|
+
* `audio/wav` regardless of the `audio.mimeType` value.
|
|
70
|
+
* - Streaming is not supported — use the Azure Speech SDK for real-time STT.
|
|
71
|
+
* - Speaker diarization is not available via the REST API.
|
|
72
|
+
*
|
|
73
|
+
* @see {@link AzureSpeechSTTProviderConfig} for configuration options
|
|
74
|
+
* @see {@link AzureSpeechTTSProvider} for the corresponding TTS provider
|
|
21
75
|
*
|
|
22
76
|
* @example
|
|
23
77
|
* ```ts
|
|
24
|
-
* const provider = new AzureSpeechSTTProvider({
|
|
25
|
-
*
|
|
26
|
-
*
|
|
78
|
+
* const provider = new AzureSpeechSTTProvider({
|
|
79
|
+
* key: process.env.AZURE_SPEECH_KEY!,
|
|
80
|
+
* region: 'eastus',
|
|
81
|
+
* });
|
|
82
|
+
* const result = await provider.transcribe(
|
|
83
|
+
* { data: wavBuffer, mimeType: 'audio/wav' },
|
|
84
|
+
* { language: 'de-DE' },
|
|
85
|
+
* );
|
|
86
|
+
* console.log(result.text); // '' if no speech detected
|
|
27
87
|
* ```
|
|
28
88
|
*/
|
|
29
89
|
export declare class AzureSpeechSTTProvider implements SpeechToTextProvider {
|
|
30
90
|
private readonly config;
|
|
91
|
+
/** Unique provider identifier used for registration and resolution. */
|
|
31
92
|
readonly id = "azure-speech-stt";
|
|
93
|
+
/** Human-readable display name for UI and logging. */
|
|
32
94
|
readonly displayName = "Azure Speech (STT)";
|
|
95
|
+
/** This provider uses synchronous HTTP requests, not WebSocket streaming. */
|
|
33
96
|
readonly supportsStreaming = false;
|
|
97
|
+
/** Fetch implementation — injected for testability, defaults to global fetch. */
|
|
34
98
|
private readonly fetchImpl;
|
|
99
|
+
/**
|
|
100
|
+
* Creates a new AzureSpeechSTTProvider.
|
|
101
|
+
*
|
|
102
|
+
* @param config - Provider configuration including the subscription key and region.
|
|
103
|
+
*
|
|
104
|
+
* @example
|
|
105
|
+
* ```ts
|
|
106
|
+
* const provider = new AzureSpeechSTTProvider({
|
|
107
|
+
* key: 'your-azure-subscription-key',
|
|
108
|
+
* region: 'eastus',
|
|
109
|
+
* });
|
|
110
|
+
* ```
|
|
111
|
+
*/
|
|
35
112
|
constructor(config: AzureSpeechSTTProviderConfig);
|
|
36
|
-
/**
|
|
113
|
+
/**
|
|
114
|
+
* Returns the human-readable provider name.
|
|
115
|
+
*
|
|
116
|
+
* @returns The display name string `'Azure Speech (STT)'`.
|
|
117
|
+
*
|
|
118
|
+
* @example
|
|
119
|
+
* ```ts
|
|
120
|
+
* provider.getProviderName(); // 'Azure Speech (STT)'
|
|
121
|
+
* ```
|
|
122
|
+
*/
|
|
37
123
|
getProviderName(): string;
|
|
38
124
|
/**
|
|
39
125
|
* Transcribes an audio buffer using the Azure Speech recognition REST endpoint.
|
|
40
126
|
*
|
|
41
|
-
*
|
|
42
|
-
*
|
|
43
|
-
*
|
|
44
|
-
* @
|
|
127
|
+
* Sends the raw audio as PCM WAV and returns a normalized result. Azure's
|
|
128
|
+
* `NoMatch` status is treated as an empty transcript (not an error).
|
|
129
|
+
*
|
|
130
|
+
* @param audio - Raw audio data. Azure expects PCM WAV format; the
|
|
131
|
+
* Content-Type header is always set to `'audio/wav'` regardless of
|
|
132
|
+
* `audio.mimeType`.
|
|
133
|
+
* @param options - Optional transcription settings. Only `language` is
|
|
134
|
+
* supported by the Azure REST endpoint.
|
|
135
|
+
* @returns A promise resolving to the normalized transcription result.
|
|
136
|
+
* @throws {Error} When the Azure API returns a non-2xx HTTP status code.
|
|
137
|
+
* The error message includes the status and response body text.
|
|
138
|
+
*
|
|
139
|
+
* @example
|
|
140
|
+
* ```ts
|
|
141
|
+
* const result = await provider.transcribe(
|
|
142
|
+
* { data: wavBuffer, durationSeconds: 5 },
|
|
143
|
+
* { language: 'fr-FR' },
|
|
144
|
+
* );
|
|
145
|
+
* if (result.text === '') {
|
|
146
|
+
* console.log('No speech detected in the audio');
|
|
147
|
+
* }
|
|
148
|
+
* ```
|
|
45
149
|
*/
|
|
46
150
|
transcribe(audio: SpeechAudioInput, options?: SpeechTranscriptionOptions): Promise<SpeechTranscriptionResult>;
|
|
47
151
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"AzureSpeechSTTProvider.d.ts","sourceRoot":"","sources":["../../../src/speech/providers/AzureSpeechSTTProvider.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACV,gBAAgB,EAChB,oBAAoB,EACpB,0BAA0B,EAC1B,yBAAyB,EAC1B,MAAM,aAAa,CAAC;AAErB
|
|
1
|
+
{"version":3,"file":"AzureSpeechSTTProvider.d.ts","sourceRoot":"","sources":["../../../src/speech/providers/AzureSpeechSTTProvider.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACV,gBAAgB,EAChB,oBAAoB,EACpB,0BAA0B,EAC1B,yBAAyB,EAC1B,MAAM,aAAa,CAAC;AAErB;;;;;GAKG;AACH,MAAM,WAAW,4BAA4B;IAC3C;;;;;OAKG;IACH,GAAG,EAAE,MAAM,CAAC;IAEZ;;;;;;;;OAQG;IACH,MAAM,EAAE,MAAM,CAAC;IAEf;;;OAGG;IACH,SAAS,CAAC,EAAE,OAAO,KAAK,CAAC;CAC1B;AA4DD;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAwDG;AACH,qBAAa,sBAAuB,YAAW,oBAAoB;IA0BrD,OAAO,CAAC,QAAQ,CAAC,MAAM;IAzBnC,uEAAuE;IACvE,SAAgB,EAAE,sBAAsB;IAExC,sDAAsD;IACtD,SAAgB,WAAW,wBAAwB;IAEnD,6EAA6E;IAC7E,SAAgB,iBAAiB,SAAS;IAE1C,iFAAiF;IACjF,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAe;IAEzC;;;;;;;;;;;;OAYG;gBAC0B,MAAM,EAAE,4BAA4B;IAIjE;;;;;;;;;OASG;IACH,eAAe,IAAI,MAAM;IAIzB;;;;;;;;;;;;;;;;;;;;;;;;;OAyBG;IACG,UAAU,CACd,KAAK,EAAE,gBAAgB,EACvB,OAAO,GAAE,0BAA+B,GACvC,OAAO,CAAC,yBAAyB,CAAC;CAmEtC"}
|