assemblyai 4.33.3 → 4.34.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. package/README.md +22 -0
  2. package/dist/assemblyai.streaming.umd.js +1291 -3
  3. package/dist/assemblyai.streaming.umd.min.js +1 -1
  4. package/dist/assemblyai.umd.js +802 -7
  5. package/dist/assemblyai.umd.min.js +1 -1
  6. package/dist/browser.mjs +775 -5
  7. package/dist/bun.mjs +775 -5
  8. package/dist/deno.mjs +775 -5
  9. package/dist/exports/streaming.d.ts +7 -0
  10. package/dist/index.cjs +802 -7
  11. package/dist/index.mjs +794 -8
  12. package/dist/node.cjs +783 -4
  13. package/dist/node.mjs +775 -5
  14. package/dist/services/index.d.ts +2 -2
  15. package/dist/services/streaming/browser/dual-channel-capture.d.ts +66 -0
  16. package/dist/services/streaming/browser/worklets/pcm16-encoder.d.ts +19 -0
  17. package/dist/services/streaming/energy-vad.d.ts +35 -0
  18. package/dist/services/streaming/index.d.ts +4 -0
  19. package/dist/services/streaming/label-mapper.d.ts +44 -0
  20. package/dist/services/streaming/resampler.d.ts +22 -0
  21. package/dist/services/streaming/service.d.ts +71 -2
  22. package/dist/streaming.browser.mjs +1247 -4
  23. package/dist/streaming.cjs +1287 -3
  24. package/dist/streaming.mjs +1276 -4
  25. package/dist/types/streaming/dual-channel.d.ts +48 -0
  26. package/dist/types/streaming/index.d.ts +140 -4
  27. package/dist/workerd.mjs +775 -5
  28. package/package.json +1 -1
  29. package/src/exports/streaming.ts +7 -0
  30. package/src/services/index.ts +20 -1
  31. package/src/services/streaming/browser/dual-channel-capture.ts +177 -0
  32. package/src/services/streaming/browser/worklets/pcm16-encoder.ts +70 -0
  33. package/src/services/streaming/energy-vad.ts +75 -0
  34. package/src/services/streaming/index.ts +4 -0
  35. package/src/services/streaming/label-mapper.ts +128 -0
  36. package/src/services/streaming/resampler.ts +69 -0
  37. package/src/services/streaming/service.ts +405 -3
  38. package/src/types/streaming/dual-channel.ts +57 -0
  39. package/src/types/streaming/index.ts +144 -1
@@ -1,4 +1,65 @@
1
1
  import { AudioEncoding } from "..";
2
+ import type { Channel, VadDetector, VadFrame } from "./dual-channel";
3
+
4
+ export * from "./dual-channel";
5
+
6
+ /**
7
+ * Per-channel attribution tuning for dual-channel mode. All fields optional;
8
+ * ignored when `StreamingTranscriberParams.channels` is not set.
9
+ */
10
+ export type ChannelAttributionParams = {
11
+ /** Energy ratio above which a channel is declared dominant for a word. Default 4. */
12
+ dominanceRatio?: number;
13
+ /** Rolling VAD timeline window in ms. Default 30_000. */
14
+ timelineWindowMs?: number;
15
+ /**
16
+ * Factory for the per-channel VAD detector. Called once per declared channel
17
+ * at transcriber construction time. The channel name is passed so factories
18
+ * that wrap higher-level VAD libraries (which manage their own audio source)
19
+ * can map each `VadDetector` instance to its corresponding channel.
20
+ */
21
+ createVad?: (channelName: string) => VadDetector;
22
+ /** Mix flush interval in ms — how often per-channel buffers are summed and sent. Default 50. */
23
+ flushIntervalMs?: number;
24
+ /**
25
+ * Strategy used to fill words whose per-word VAD attribution resolved to
26
+ * `"unknown"`. Confident per-word VAD decisions (`"mic"` / `"system"`) are
27
+ * never modified by any strategy.
28
+ *
29
+ * - `"window"` (default): look at the dominant non-`"unknown"` channel
30
+ * among ±`resolutionWindowWords` neighboring words in the same turn.
31
+ * Ignores `speaker_label`, so it works even when AAI re-uses a label for
32
+ * two physically distinct voices.
33
+ * - `"speaker-history"`: accumulate per-`speaker_label` per-channel active
34
+ * VAD energy across the session, then fill `"unknown"` words with the
35
+ * speaker's dominant channel when it clears
36
+ * `speakerHistoryMinRmsEvidence` and beats runner-up by
37
+ * `speakerHistoryDominanceRatio`. Robust for stable speaker labels but
38
+ * does nothing when a speaker has split evidence.
39
+ * - `"none"`: disable resolution; `"unknown"` words remain `"unknown"` in
40
+ * the output.
41
+ */
42
+ resolveUnknownChannelsMethod?: "none" | "window" | "speaker-history";
43
+ /**
44
+ * Half-window (in words) on each side of an `"unknown"` word for the
45
+ * `"window"` method. Default 2 — so the full window is up to 5 words
46
+ * (2 before + the unknown + 2 after).
47
+ */
48
+ resolutionWindowWords?: number;
49
+ /**
50
+ * Minimum cumulative active-RMS evidence (sum across all the speaker's
51
+ * frames to date) before a speaker can be resolved via the
52
+ * `"speaker-history"` method. Default 0.5 — roughly a few seconds of
53
+ * sustained speech.
54
+ */
55
+ speakerHistoryMinRmsEvidence?: number;
56
+ /**
57
+ * For the `"speaker-history"` method, the top channel's evidence must
58
+ * exceed the runner-up's by at least this factor for the speaker to be
59
+ * considered pinned to that channel. Default 3.
60
+ */
61
+ speakerHistoryDominanceRatio?: number;
62
+ };
2
63
 
3
64
  export type LLMGatewayMessage = {
4
65
  role: string;
@@ -30,7 +91,8 @@ export type StreamingTranscriberParams = {
30
91
  keyterms?: string[];
31
92
  keytermsPrompt?: string[];
32
93
  prompt?: string;
33
- speechModel: StreamingSpeechModel;
94
+ agentContext?: string;
95
+ speechModel?: StreamingSpeechModel;
34
96
  languageDetection?: boolean;
35
97
  domain?: StreamingDomain;
36
98
  inactivityTimeout?: number;
@@ -46,10 +108,38 @@ export type StreamingTranscriberParams = {
46
108
  redactPii?: boolean;
47
109
  redactPiiPolicies?: StreamingPiiPolicy[];
48
110
  redactPiiSub?: StreamingPiiSubstitution;
111
+ mode?: StreamingMode;
49
112
  llmGateway?: LLMGatewayConfig;
50
113
  webhookUrl?: string;
51
114
  webhookAuthHeaderName?: string;
52
115
  webhookAuthHeaderValue?: string;
116
+ /**
117
+ * Enable dual-channel (or N-channel) mode. Presence of `channels` switches the
118
+ * transcriber into channel-tagged mode: `sendAudio(audio, { channel })` is required,
119
+ * per-channel VAD runs on the raw PCM, the streams are mixed to mono before being
120
+ * sent to the server, and emitted `TurnEvent`s are enriched with `channel` and
121
+ * per-word `channel` attribution.
122
+ *
123
+ * Must contain exactly 2 entries with unique names. The names are echoed back in
124
+ * `TurnEvent.channel` / `words[i].channel`.
125
+ *
126
+ * **Acoustic-leak caveat.** Per-word channel attribution uses energy-based
127
+ * VAD on each channel. If your capture setup lets one channel's audio bleed
128
+ * into another at similar amplitude — typically system audio playing
129
+ * through speakers and being picked up by an open mic — attribution can
130
+ * misfire (mic-tagged words that were actually system). Transcription
131
+ * quality is unaffected; only the `channel` field is. To preserve
132
+ * attribution in speaker-leak setups, apply echo cancellation at capture
133
+ * before feeding audio to the SDK. In browsers, that's
134
+ * `getUserMedia({ audio: { echoCancellation: true } })`. On macOS native,
135
+ * `AVAudioEngine.setVoiceProcessingEnabled(true)` on the input node. If
136
+ * platform-level AEC isn't available, swap in a DNN VAD (e.g. Silero) via
137
+ * `channelAttribution.createVad`. See the dual-channel sample app's
138
+ * README for worked examples.
139
+ */
140
+ channels?: Array<{ name: string }>;
141
+ /** Tuning for dual-channel attribution. Ignored when `channels` is unset. */
142
+ channelAttribution?: ChannelAttributionParams;
53
143
  };
54
144
 
55
145
  export type StreamingEvents =
@@ -58,7 +148,9 @@ export type StreamingEvents =
58
148
  | "turn"
59
149
  | "speechStarted"
60
150
  | "llmGatewayResponse"
151
+ | "speakerRevision"
61
152
  | "warning"
153
+ | "vad"
62
154
  | "error";
63
155
 
64
156
  export type StreamingListeners = {
@@ -67,7 +159,9 @@ export type StreamingListeners = {
67
159
  turn?: (event: TurnEvent) => void;
68
160
  speechStarted?: (event: SpeechStartedEvent) => void;
69
161
  llmGatewayResponse?: (event: LLMGatewayResponseEvent) => void;
162
+ speakerRevision?: (event: SpeakerRevisionEvent) => void;
70
163
  warning?: (event: WarningEvent) => void;
164
+ vad?: (event: VadFrame) => void;
71
165
  error?: (error: Error) => void;
72
166
  };
73
167
 
@@ -75,11 +169,14 @@ export type StreamingSpeechModel =
75
169
  | "universal-streaming-english"
76
170
  | "universal-streaming-multilingual"
77
171
  | "u3-rt-pro"
172
+ | "u3-rt-pro-beta-1"
78
173
  | "whisper-rt"
79
174
  | "u3-pro";
80
175
 
81
176
  export type StreamingDomain = "medical-v1";
82
177
 
178
+ export type StreamingMode = "max_accuracy" | "min_latency" | "balanced";
179
+
83
180
  export type VoiceFocusModel = "near-field" | "far-field";
84
181
 
85
182
  export type StreamingPiiSubstitution = "hash" | "entity_name";
@@ -186,6 +283,12 @@ export type TurnEvent = {
186
283
  language_code?: string;
187
284
  language_confidence?: number;
188
285
  speaker_label?: string;
286
+ /**
287
+ * Duration-weighted majority channel across `words[i].channel`. Populated only
288
+ * when the transcriber is configured with `channels`. Independent from
289
+ * `speaker_label`.
290
+ */
291
+ channel?: Channel;
189
292
  };
190
293
 
191
294
  export type StreamingWord = {
@@ -195,6 +298,20 @@ export type StreamingWord = {
195
298
  text: string;
196
299
  word_is_final: boolean;
197
300
  speaker?: string;
301
+ /**
302
+ * Physical input channel attributed by client-side VAD during this word's
303
+ * time window. Populated only when the transcriber is configured with
304
+ * `channels`. Independent from `speaker`.
305
+ */
306
+ channel?: Channel;
307
+ /**
308
+ * True if `channel` was filled in by `channelAttribution.resolveUnknownChannelsMethod`
309
+ * rather than by the per-word VAD. Only set on words whose per-word VAD
310
+ * attribution was `"unknown"` and whose resolution method produced a
311
+ * confident channel. Useful for debugging or rendering an indicator that a
312
+ * word's channel came from context, not direct VAD evidence.
313
+ */
314
+ channelResolved?: boolean;
198
315
  };
199
316
 
200
317
  export type TerminationEvent = {
@@ -220,6 +337,7 @@ export type StreamingUpdateConfiguration = {
220
337
  format_turns?: boolean;
221
338
  keyterms_prompt?: string[];
222
339
  prompt?: string;
340
+ agent_context?: string;
223
341
  filter_profanity?: boolean;
224
342
  interruption_delay?: number;
225
343
  turn_left_pad_ms?: number;
@@ -248,12 +366,37 @@ export type LLMGatewayResponseEvent = {
248
366
  data: unknown;
249
367
  };
250
368
 
369
+ /**
370
+ * A single earlier Turn whose speaker labels were revised by reclustering.
371
+ * Match by `turn_order` against the original Turn; replace its per-word
372
+ * `speaker` assignments (and the turn-level `speaker_label`) with these. Text
373
+ * and word timestamps are unchanged from the original Turn.
374
+ */
375
+ export type SpeakerRevisionItem = {
376
+ turn_order: number;
377
+ speaker_label?: string;
378
+ words: StreamingWord[];
379
+ };
380
+
381
+ /**
382
+ * Server-side correction to previously-emitted Turns' speaker labels.
383
+ * Diarization-only (emitted only when `speakerLabels` is enabled). Sent once
384
+ * per offline-recluster resolve; `revisions` carries one entry per earlier
385
+ * Turn whose label actually changed (unchanged turns are omitted). Apply each
386
+ * entry by matching its `turn_order`.
387
+ */
388
+ export type SpeakerRevisionEvent = {
389
+ type: "SpeakerRevision";
390
+ revisions: SpeakerRevisionItem[];
391
+ };
392
+
251
393
  export type StreamingEventMessage =
252
394
  | BeginEvent
253
395
  | TurnEvent
254
396
  | SpeechStartedEvent
255
397
  | TerminationEvent
256
398
  | LLMGatewayResponseEvent
399
+ | SpeakerRevisionEvent
257
400
  | ErrorEvent
258
401
  | WarningEvent;
259
402