assemblyai 4.33.3 → 4.34.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +22 -0
- package/dist/assemblyai.streaming.umd.js +1291 -3
- package/dist/assemblyai.streaming.umd.min.js +1 -1
- package/dist/assemblyai.umd.js +802 -7
- package/dist/assemblyai.umd.min.js +1 -1
- package/dist/browser.mjs +775 -5
- package/dist/bun.mjs +775 -5
- package/dist/deno.mjs +775 -5
- package/dist/exports/streaming.d.ts +7 -0
- package/dist/index.cjs +802 -7
- package/dist/index.mjs +794 -8
- package/dist/node.cjs +783 -4
- package/dist/node.mjs +775 -5
- package/dist/services/index.d.ts +2 -2
- package/dist/services/streaming/browser/dual-channel-capture.d.ts +66 -0
- package/dist/services/streaming/browser/worklets/pcm16-encoder.d.ts +19 -0
- package/dist/services/streaming/energy-vad.d.ts +35 -0
- package/dist/services/streaming/index.d.ts +4 -0
- package/dist/services/streaming/label-mapper.d.ts +44 -0
- package/dist/services/streaming/resampler.d.ts +22 -0
- package/dist/services/streaming/service.d.ts +71 -2
- package/dist/streaming.browser.mjs +1247 -4
- package/dist/streaming.cjs +1287 -3
- package/dist/streaming.mjs +1276 -4
- package/dist/types/streaming/dual-channel.d.ts +48 -0
- package/dist/types/streaming/index.d.ts +140 -4
- package/dist/workerd.mjs +775 -5
- package/package.json +1 -1
- package/src/exports/streaming.ts +7 -0
- package/src/services/index.ts +20 -1
- package/src/services/streaming/browser/dual-channel-capture.ts +177 -0
- package/src/services/streaming/browser/worklets/pcm16-encoder.ts +70 -0
- package/src/services/streaming/energy-vad.ts +75 -0
- package/src/services/streaming/index.ts +4 -0
- package/src/services/streaming/label-mapper.ts +128 -0
- package/src/services/streaming/resampler.ts +69 -0
- package/src/services/streaming/service.ts +405 -3
- package/src/types/streaming/dual-channel.ts +57 -0
- package/src/types/streaming/index.ts +144 -1
|
@@ -1,4 +1,65 @@
|
|
|
1
1
|
import { AudioEncoding } from "..";
|
|
2
|
+
import type { Channel, VadDetector, VadFrame } from "./dual-channel";
|
|
3
|
+
|
|
4
|
+
export * from "./dual-channel";
|
|
5
|
+
|
|
6
|
+
/**
|
|
7
|
+
* Per-channel attribution tuning for dual-channel mode. All fields optional;
|
|
8
|
+
* ignored when `StreamingTranscriberParams.channels` is not set.
|
|
9
|
+
*/
|
|
10
|
+
export type ChannelAttributionParams = {
|
|
11
|
+
/** Energy ratio above which a channel is declared dominant for a word. Default 4. */
|
|
12
|
+
dominanceRatio?: number;
|
|
13
|
+
/** Rolling VAD timeline window in ms. Default 30_000. */
|
|
14
|
+
timelineWindowMs?: number;
|
|
15
|
+
/**
|
|
16
|
+
* Factory for the per-channel VAD detector. Called once per declared channel
|
|
17
|
+
* at transcriber construction time. The channel name is passed so factories
|
|
18
|
+
* that wrap higher-level VAD libraries (which manage their own audio source)
|
|
19
|
+
* can map each `VadDetector` instance to its corresponding channel.
|
|
20
|
+
*/
|
|
21
|
+
createVad?: (channelName: string) => VadDetector;
|
|
22
|
+
/** Mix flush interval in ms — how often per-channel buffers are summed and sent. Default 50. */
|
|
23
|
+
flushIntervalMs?: number;
|
|
24
|
+
/**
|
|
25
|
+
* Strategy used to fill words whose per-word VAD attribution resolved to
|
|
26
|
+
* `"unknown"`. Confident per-word VAD decisions (e.g. `"mic"` / `"system"`) are
|
|
27
|
+
* never modified by any strategy.
|
|
28
|
+
*
|
|
29
|
+
* - `"window"` (default): look at the dominant non-`"unknown"` channel
|
|
30
|
+
* among ±`resolutionWindowWords` neighboring words in the same turn.
|
|
31
|
+
* Ignores `speaker_label`, so it works even when AAI re-uses a label for
|
|
32
|
+
* two physically distinct voices.
|
|
33
|
+
* - `"speaker-history"`: accumulate per-`speaker_label` per-channel active
|
|
34
|
+
* VAD energy across the session, then fill `"unknown"` words with the
|
|
35
|
+
* speaker's dominant channel when it clears
|
|
36
|
+
* `speakerHistoryMinRmsEvidence` and beats runner-up by
|
|
37
|
+
* `speakerHistoryDominanceRatio`. Robust for stable speaker labels but
|
|
38
|
+
* does nothing when a speaker has split evidence.
|
|
39
|
+
* - `"none"`: disable resolution; `"unknown"` words remain `"unknown"` in
|
|
40
|
+
* the output.
|
|
41
|
+
*/
|
|
42
|
+
resolveUnknownChannelsMethod?: "none" | "window" | "speaker-history";
|
|
43
|
+
/**
|
|
44
|
+
* Half-window (in words) on each side of an `"unknown"` word for the
|
|
45
|
+
* `"window"` method. Default 2 — so the full window is up to 5 words
|
|
46
|
+
* (2 before + the unknown + 2 after).
|
|
47
|
+
*/
|
|
48
|
+
resolutionWindowWords?: number;
|
|
49
|
+
/**
|
|
50
|
+
* Minimum cumulative active-RMS evidence (sum across all the speaker's
|
|
51
|
+
* frames to date) before a speaker can be resolved via the
|
|
52
|
+
* `"speaker-history"` method. Default 0.5 — roughly a few seconds of
|
|
53
|
+
* sustained speech.
|
|
54
|
+
*/
|
|
55
|
+
speakerHistoryMinRmsEvidence?: number;
|
|
56
|
+
/**
|
|
57
|
+
* For the `"speaker-history"` method, the top channel's evidence must
|
|
58
|
+
* exceed the runner-up's by at least this factor for the speaker to be
|
|
59
|
+
* considered pinned to that channel. Default 3.
|
|
60
|
+
*/
|
|
61
|
+
speakerHistoryDominanceRatio?: number;
|
|
62
|
+
};
|
|
2
63
|
|
|
3
64
|
export type LLMGatewayMessage = {
|
|
4
65
|
role: string;
|
|
@@ -30,7 +91,8 @@ export type StreamingTranscriberParams = {
|
|
|
30
91
|
keyterms?: string[];
|
|
31
92
|
keytermsPrompt?: string[];
|
|
32
93
|
prompt?: string;
|
|
33
|
-
|
|
94
|
+
agentContext?: string;
|
|
95
|
+
speechModel?: StreamingSpeechModel;
|
|
34
96
|
languageDetection?: boolean;
|
|
35
97
|
domain?: StreamingDomain;
|
|
36
98
|
inactivityTimeout?: number;
|
|
@@ -46,10 +108,38 @@ export type StreamingTranscriberParams = {
|
|
|
46
108
|
redactPii?: boolean;
|
|
47
109
|
redactPiiPolicies?: StreamingPiiPolicy[];
|
|
48
110
|
redactPiiSub?: StreamingPiiSubstitution;
|
|
111
|
+
mode?: StreamingMode;
|
|
49
112
|
llmGateway?: LLMGatewayConfig;
|
|
50
113
|
webhookUrl?: string;
|
|
51
114
|
webhookAuthHeaderName?: string;
|
|
52
115
|
webhookAuthHeaderValue?: string;
|
|
116
|
+
/**
|
|
117
|
+
* Enable dual-channel (or N-channel) mode. Presence of `channels` switches the
|
|
118
|
+
* transcriber into channel-tagged mode: `sendAudio(audio, { channel })` is required,
|
|
119
|
+
* per-channel VAD runs on the raw PCM, the streams are mixed to mono before being
|
|
120
|
+
* sent to the server, and emitted `TurnEvent`s are enriched with `channel` and
|
|
121
|
+
* per-word `channel` attribution.
|
|
122
|
+
*
|
|
123
|
+
* Must contain exactly 2 entries with unique names. The names are echoed back in
|
|
124
|
+
* `TurnEvent.channel` / `words[i].channel`.
|
|
125
|
+
*
|
|
126
|
+
* **Acoustic-leak caveat.** Per-word channel attribution uses energy-based
|
|
127
|
+
* VAD on each channel. If your capture setup lets one channel's audio bleed
|
|
128
|
+
* into another at similar amplitude — typically system audio playing
|
|
129
|
+
* through speakers and being picked up by an open mic — attribution can
|
|
130
|
+
* misfire (mic-tagged words that were actually system). Transcription
|
|
131
|
+
* quality is unaffected; only the `channel` field is. To preserve
|
|
132
|
+
* attribution in speaker-leak setups, apply echo cancellation at capture
|
|
133
|
+
* before feeding audio to the SDK. In browsers, that's
|
|
134
|
+
* `getUserMedia({ audio: { echoCancellation: true } })`. On macOS native,
|
|
135
|
+
* call `setVoiceProcessingEnabled(true)` on the `AVAudioEngine` input node. If
|
|
136
|
+
* platform-level AEC isn't available, swap in a DNN VAD (e.g. Silero) via
|
|
137
|
+
* `channelAttribution.createVad`. See the dual-channel sample app's
|
|
138
|
+
* README for worked examples.
|
|
139
|
+
*/
|
|
140
|
+
channels?: Array<{ name: string }>;
|
|
141
|
+
/** Tuning for dual-channel attribution. Ignored when `channels` is unset. */
|
|
142
|
+
channelAttribution?: ChannelAttributionParams;
|
|
53
143
|
};
|
|
54
144
|
|
|
55
145
|
export type StreamingEvents =
|
|
@@ -58,7 +148,9 @@ export type StreamingEvents =
|
|
|
58
148
|
| "turn"
|
|
59
149
|
| "speechStarted"
|
|
60
150
|
| "llmGatewayResponse"
|
|
151
|
+
| "speakerRevision"
|
|
61
152
|
| "warning"
|
|
153
|
+
| "vad"
|
|
62
154
|
| "error";
|
|
63
155
|
|
|
64
156
|
export type StreamingListeners = {
|
|
@@ -67,7 +159,9 @@ export type StreamingListeners = {
|
|
|
67
159
|
turn?: (event: TurnEvent) => void;
|
|
68
160
|
speechStarted?: (event: SpeechStartedEvent) => void;
|
|
69
161
|
llmGatewayResponse?: (event: LLMGatewayResponseEvent) => void;
|
|
162
|
+
speakerRevision?: (event: SpeakerRevisionEvent) => void;
|
|
70
163
|
warning?: (event: WarningEvent) => void;
|
|
164
|
+
vad?: (event: VadFrame) => void;
|
|
71
165
|
error?: (error: Error) => void;
|
|
72
166
|
};
|
|
73
167
|
|
|
@@ -75,11 +169,14 @@ export type StreamingSpeechModel =
|
|
|
75
169
|
| "universal-streaming-english"
|
|
76
170
|
| "universal-streaming-multilingual"
|
|
77
171
|
| "u3-rt-pro"
|
|
172
|
+
| "u3-rt-pro-beta-1"
|
|
78
173
|
| "whisper-rt"
|
|
79
174
|
| "u3-pro";
|
|
80
175
|
|
|
81
176
|
export type StreamingDomain = "medical-v1";
|
|
82
177
|
|
|
178
|
+
export type StreamingMode = "max_accuracy" | "min_latency" | "balanced";
|
|
179
|
+
|
|
83
180
|
export type VoiceFocusModel = "near-field" | "far-field";
|
|
84
181
|
|
|
85
182
|
export type StreamingPiiSubstitution = "hash" | "entity_name";
|
|
@@ -186,6 +283,12 @@ export type TurnEvent = {
|
|
|
186
283
|
language_code?: string;
|
|
187
284
|
language_confidence?: number;
|
|
188
285
|
speaker_label?: string;
|
|
286
|
+
/**
|
|
287
|
+
* Duration-weighted majority channel across `words[i].channel`. Populated only
|
|
288
|
+
* when the transcriber is configured with `channels`. Independent from
|
|
289
|
+
* `speaker_label`.
|
|
290
|
+
*/
|
|
291
|
+
channel?: Channel;
|
|
189
292
|
};
|
|
190
293
|
|
|
191
294
|
export type StreamingWord = {
|
|
@@ -195,6 +298,20 @@ export type StreamingWord = {
|
|
|
195
298
|
text: string;
|
|
196
299
|
word_is_final: boolean;
|
|
197
300
|
speaker?: string;
|
|
301
|
+
/**
|
|
302
|
+
* Physical input channel attributed by client-side VAD during this word's
|
|
303
|
+
* time window. Populated only when the transcriber is configured with
|
|
304
|
+
* `channels`. Independent from `speaker`.
|
|
305
|
+
*/
|
|
306
|
+
channel?: Channel;
|
|
307
|
+
/**
|
|
308
|
+
* True if `channel` was filled in by `channelAttribution.resolveUnknownChannelsMethod`
|
|
309
|
+
* rather than by the per-word VAD. Only set on words whose per-word VAD
|
|
310
|
+
* attribution was `"unknown"` and whose resolution method produced a
|
|
311
|
+
* confident channel. Useful for debugging or rendering an indicator that a
|
|
312
|
+
* word's channel came from context, not direct VAD evidence.
|
|
313
|
+
*/
|
|
314
|
+
channelResolved?: boolean;
|
|
198
315
|
};
|
|
199
316
|
|
|
200
317
|
export type TerminationEvent = {
|
|
@@ -220,6 +337,7 @@ export type StreamingUpdateConfiguration = {
|
|
|
220
337
|
format_turns?: boolean;
|
|
221
338
|
keyterms_prompt?: string[];
|
|
222
339
|
prompt?: string;
|
|
340
|
+
agent_context?: string;
|
|
223
341
|
filter_profanity?: boolean;
|
|
224
342
|
interruption_delay?: number;
|
|
225
343
|
turn_left_pad_ms?: number;
|
|
@@ -248,12 +366,37 @@ export type LLMGatewayResponseEvent = {
|
|
|
248
366
|
data: unknown;
|
|
249
367
|
};
|
|
250
368
|
|
|
369
|
+
/**
|
|
370
|
+
* A single earlier Turn whose speaker labels were revised by reclustering.
|
|
371
|
+
* Match by `turn_order` against the original Turn; replace its per-word
|
|
372
|
+
* `speaker` assignments (and the turn-level `speaker_label`) with these. Text
|
|
373
|
+
* and word timestamps are unchanged from the original Turn.
|
|
374
|
+
*/
|
|
375
|
+
export type SpeakerRevisionItem = {
|
|
376
|
+
turn_order: number;
|
|
377
|
+
speaker_label?: string;
|
|
378
|
+
words: StreamingWord[];
|
|
379
|
+
};
|
|
380
|
+
|
|
381
|
+
/**
|
|
382
|
+
* Server-side correction to previously-emitted Turns' speaker labels.
|
|
383
|
+
* Diarization-only (emitted only when `speakerLabels` is enabled). Sent once
|
|
384
|
+
* each time an offline recluster resolves; `revisions` carries one entry per earlier
|
|
385
|
+
* Turn whose label actually changed (unchanged turns are omitted). Apply each
|
|
386
|
+
* entry by matching its `turn_order`.
|
|
387
|
+
*/
|
|
388
|
+
export type SpeakerRevisionEvent = {
|
|
389
|
+
type: "SpeakerRevision";
|
|
390
|
+
revisions: SpeakerRevisionItem[];
|
|
391
|
+
};
|
|
392
|
+
|
|
251
393
|
export type StreamingEventMessage =
|
|
252
394
|
| BeginEvent
|
|
253
395
|
| TurnEvent
|
|
254
396
|
| SpeechStartedEvent
|
|
255
397
|
| TerminationEvent
|
|
256
398
|
| LLMGatewayResponseEvent
|
|
399
|
+
| SpeakerRevisionEvent
|
|
257
400
|
| ErrorEvent
|
|
258
401
|
| WarningEvent;
|
|
259
402
|
|