assemblyai 4.33.3 → 4.34.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. package/README.md +22 -0
  2. package/dist/assemblyai.streaming.umd.js +1291 -3
  3. package/dist/assemblyai.streaming.umd.min.js +1 -1
  4. package/dist/assemblyai.umd.js +802 -7
  5. package/dist/assemblyai.umd.min.js +1 -1
  6. package/dist/browser.mjs +775 -5
  7. package/dist/bun.mjs +775 -5
  8. package/dist/deno.mjs +775 -5
  9. package/dist/exports/streaming.d.ts +7 -0
  10. package/dist/index.cjs +802 -7
  11. package/dist/index.mjs +794 -8
  12. package/dist/node.cjs +783 -4
  13. package/dist/node.mjs +775 -5
  14. package/dist/services/index.d.ts +2 -2
  15. package/dist/services/streaming/browser/dual-channel-capture.d.ts +66 -0
  16. package/dist/services/streaming/browser/worklets/pcm16-encoder.d.ts +19 -0
  17. package/dist/services/streaming/energy-vad.d.ts +35 -0
  18. package/dist/services/streaming/index.d.ts +4 -0
  19. package/dist/services/streaming/label-mapper.d.ts +44 -0
  20. package/dist/services/streaming/resampler.d.ts +22 -0
  21. package/dist/services/streaming/service.d.ts +71 -2
  22. package/dist/streaming.browser.mjs +1247 -4
  23. package/dist/streaming.cjs +1287 -3
  24. package/dist/streaming.mjs +1276 -4
  25. package/dist/types/streaming/dual-channel.d.ts +48 -0
  26. package/dist/types/streaming/index.d.ts +140 -4
  27. package/dist/workerd.mjs +775 -5
  28. package/package.json +1 -1
  29. package/src/exports/streaming.ts +7 -0
  30. package/src/services/index.ts +20 -1
  31. package/src/services/streaming/browser/dual-channel-capture.ts +177 -0
  32. package/src/services/streaming/browser/worklets/pcm16-encoder.ts +70 -0
  33. package/src/services/streaming/energy-vad.ts +75 -0
  34. package/src/services/streaming/index.ts +4 -0
  35. package/src/services/streaming/label-mapper.ts +128 -0
  36. package/src/services/streaming/resampler.ts +69 -0
  37. package/src/services/streaming/service.ts +405 -3
  38. package/src/types/streaming/dual-channel.ts +57 -0
  39. package/src/types/streaming/index.ts +144 -1
@@ -0,0 +1,48 @@
1
+ /**
2
+ * Physical input channel that a word/turn was attributed to.
3
+ * - A channel name declared in `StreamingTranscriberParams.channels` (e.g. `"mic"`, `"system"`).
4
+ * - `"unknown"`: no channel was clearly dominant during the word's time window (silent
5
+ * or all channels evenly active under our threshold).
6
+ *
7
+ * This is independent of AssemblyAI's diarization `speaker_label` / `words[i].speaker`,
8
+ * which identifies voices by acoustic characteristics. A given speaker_label can map
9
+ * to any physical channel; the two dimensions can disagree.
10
+ */
11
+ export type Channel = string | "unknown";
12
+ /**
13
+ * Per-channel, per-frame VAD observation emitted by `StreamingTranscriber` when running
14
+ * in dual-channel mode. `ts` is stream-relative milliseconds, derived from the
15
+ * per-channel sample counter — the same reference frame as `StreamingWord.start` /
16
+ * `.end`, so per-word lookups need no conversion.
17
+ */
18
+ export type VadFrame = {
19
+ ts: number;
20
+ channel: string;
21
+ active: boolean;
22
+ rms: number;
23
+ };
24
+ export type VadDetectorResult = {
25
+ active: boolean;
26
+ energy: number;
27
+ };
28
+ /**
29
+ * Pluggable per-channel voice-activity detector. The default `EnergyVad` is energy-based
30
+ * with an adaptive noise-floor threshold; callers can drop in a DNN-backed detector
31
+ * (e.g. Silero via `@ricky0123/vad-web`) for noisier environments.
32
+ *
33
+ * A separate `VadDetector` instance is held per channel; do not assume cross-channel
34
+ * state. Frames are fixed-size at the transcriber's target sample rate.
35
+ */
36
+ export interface VadDetector {
37
+ process(frame: Float32Array): VadDetectorResult;
38
+ reset(): void;
39
+ }
40
+ /**
41
+ * Thrown when `DualChannelCapture` is constructed in a non-browser environment
42
+ * (no `globalThis.AudioContext`). The helper is intentionally surfaced from the
43
+ * main entrypoint so the import path is uniform across runtimes; the runtime
44
+ * guard moves to construction time.
45
+ */
46
+ export declare class BrowserOnlyError extends Error {
47
+ constructor(message?: string);
48
+ }
@@ -1,4 +1,63 @@
1
1
  import { AudioEncoding } from "..";
2
+ import type { Channel, VadDetector, VadFrame } from "./dual-channel";
3
+ export * from "./dual-channel";
4
+ /**
5
+ * Per-channel attribution tuning for dual-channel mode. All fields optional;
6
+ * ignored when `StreamingTranscriberParams.channels` is not set.
7
+ */
8
+ export type ChannelAttributionParams = {
9
+ /** Energy ratio above which a channel is declared dominant for a word. Default 4. */
10
+ dominanceRatio?: number;
11
+ /** Rolling VAD timeline window in ms. Default 30_000. */
12
+ timelineWindowMs?: number;
13
+ /**
14
+ * Factory for the per-channel VAD detector. Called once per declared channel
15
+ * at transcriber construction time. The channel name is passed so factories
16
+ * that wrap higher-level VAD libraries (which manage their own audio source)
17
+ * can map each `VadDetector` instance to its corresponding channel.
18
+ */
19
+ createVad?: (channelName: string) => VadDetector;
20
+ /** Mix flush interval in ms — how often per-channel buffers are summed and sent. Default 50. */
21
+ flushIntervalMs?: number;
22
+ /**
23
+ * Strategy used to fill words whose per-word VAD attribution resolved to
24
+ * `"unknown"`. Confident per-word VAD decisions (e.g. `"mic"` / `"system"`) are
25
+ * never modified by any strategy.
26
+ *
27
+ * - `"window"` (default): look at the dominant non-`"unknown"` channel
28
+ * among ±`resolutionWindowWords` neighboring words in the same turn.
29
+ * Ignores `speaker_label`, so it works even when AAI re-uses a label for
30
+ * two physically distinct voices.
31
+ * - `"speaker-history"`: accumulate per-`speaker_label` per-channel active
32
+ * VAD energy across the session, then fill `"unknown"` words with the
33
+ * speaker's dominant channel when it clears
34
+ * `speakerHistoryMinRmsEvidence` and beats runner-up by
35
+ * `speakerHistoryDominanceRatio`. Robust for stable speaker labels but
36
+ * does nothing when a speaker has split evidence.
37
+ * - `"none"`: disable resolution; `"unknown"` words remain `"unknown"` in
38
+ * the output.
39
+ */
40
+ resolveUnknownChannelsMethod?: "none" | "window" | "speaker-history";
41
+ /**
42
+ * Half-window (in words) on each side of an `"unknown"` word for the
43
+ * `"window"` method. Default 2 — so the full window is up to 5 words
44
+ * (2 before + the unknown + 2 after).
45
+ */
46
+ resolutionWindowWords?: number;
47
+ /**
48
+ * Minimum cumulative active-RMS evidence (sum across all the speaker's
49
+ * frames to date) before a speaker can be resolved via the
50
+ * `"speaker-history"` method. Default 0.5 — roughly a few seconds of
51
+ * sustained speech.
52
+ */
53
+ speakerHistoryMinRmsEvidence?: number;
54
+ /**
55
+ * For the `"speaker-history"` method, the top channel's evidence must
56
+ * exceed the runner-up's by at least this factor for the speaker to be
57
+ * considered pinned to that channel. Default 3.
58
+ */
59
+ speakerHistoryDominanceRatio?: number;
60
+ };
2
61
  export type LLMGatewayMessage = {
3
62
  role: string;
4
63
  content: string;
@@ -27,7 +86,8 @@ export type StreamingTranscriberParams = {
27
86
  keyterms?: string[];
28
87
  keytermsPrompt?: string[];
29
88
  prompt?: string;
30
- speechModel: StreamingSpeechModel;
89
+ agentContext?: string;
90
+ speechModel?: StreamingSpeechModel;
31
91
  languageDetection?: boolean;
32
92
  domain?: StreamingDomain;
33
93
  inactivityTimeout?: number;
@@ -43,23 +103,56 @@ export type StreamingTranscriberParams = {
43
103
  redactPii?: boolean;
44
104
  redactPiiPolicies?: StreamingPiiPolicy[];
45
105
  redactPiiSub?: StreamingPiiSubstitution;
106
+ mode?: StreamingMode;
46
107
  llmGateway?: LLMGatewayConfig;
47
108
  webhookUrl?: string;
48
109
  webhookAuthHeaderName?: string;
49
110
  webhookAuthHeaderValue?: string;
111
+ /**
112
+ * Enable dual-channel (or N-channel) mode. Presence of `channels` switches the
113
+ * transcriber into channel-tagged mode: `sendAudio(audio, { channel })` is required,
114
+ * per-channel VAD runs on the raw PCM, the streams are mixed to mono before being
115
+ * sent to the server, and emitted `TurnEvent`s are enriched with `channel` and
116
+ * per-word `channel` attribution.
117
+ *
118
+ * Must contain exactly 2 entries with unique names. The names are echoed back in
119
+ * `TurnEvent.channel` / `words[i].channel`.
120
+ *
121
+ * **Acoustic-leak caveat.** Per-word channel attribution uses energy-based
122
+ * VAD on each channel. If your capture setup lets one channel's audio bleed
123
+ * into another at similar amplitude — typically system audio playing
124
+ * through speakers and being picked up by an open mic — attribution can
125
+ * misfire (mic-tagged words that were actually system). Transcription
126
+ * quality is unaffected; only the `channel` field is. To preserve
127
+ * attribution in speaker-leak setups, apply echo cancellation at capture
128
+ * before feeding audio to the SDK. In browsers, that's
129
+ * `getUserMedia({ audio: { echoCancellation: true } })`. On macOS native,
130
+ * `AVAudioEngine.setVoiceProcessingEnabled(true)` on the input node. If
131
+ * platform-level AEC isn't available, swap in a DNN VAD (e.g. Silero) via
132
+ * `channelAttribution.createVad`. See the dual-channel sample app's
133
+ * README for worked examples.
134
+ */
135
+ channels?: Array<{
136
+ name: string;
137
+ }>;
138
+ /** Tuning for dual-channel attribution. Ignored when `channels` is unset. */
139
+ channelAttribution?: ChannelAttributionParams;
50
140
  };
51
- export type StreamingEvents = "open" | "close" | "turn" | "speechStarted" | "llmGatewayResponse" | "warning" | "error";
141
+ export type StreamingEvents = "open" | "close" | "turn" | "speechStarted" | "llmGatewayResponse" | "speakerRevision" | "warning" | "vad" | "error";
52
142
  export type StreamingListeners = {
53
143
  open?: (event: BeginEvent) => void;
54
144
  close?: (code: number, reason: string) => void;
55
145
  turn?: (event: TurnEvent) => void;
56
146
  speechStarted?: (event: SpeechStartedEvent) => void;
57
147
  llmGatewayResponse?: (event: LLMGatewayResponseEvent) => void;
148
+ speakerRevision?: (event: SpeakerRevisionEvent) => void;
58
149
  warning?: (event: WarningEvent) => void;
150
+ vad?: (event: VadFrame) => void;
59
151
  error?: (error: Error) => void;
60
152
  };
61
- export type StreamingSpeechModel = "universal-streaming-english" | "universal-streaming-multilingual" | "u3-rt-pro" | "whisper-rt" | "u3-pro";
153
+ export type StreamingSpeechModel = "universal-streaming-english" | "universal-streaming-multilingual" | "u3-rt-pro" | "u3-rt-pro-beta-1" | "whisper-rt" | "u3-pro";
62
154
  export type StreamingDomain = "medical-v1";
155
+ export type StreamingMode = "max_accuracy" | "min_latency" | "balanced";
63
156
  export type VoiceFocusModel = "near-field" | "far-field";
64
157
  export type StreamingPiiSubstitution = "hash" | "entity_name";
65
158
  export type StreamingPiiPolicy = "account_number" | "banking_information" | "blood_type" | "corporate_action" | "credit_card_cvv" | "credit_card_expiration" | "credit_card_number" | "date" | "date_interval" | "date_of_birth" | "day" | "drivers_license" | "drug" | "duration" | "effect" | "email_address" | "event" | "filename" | "financial_metric" | "gender" | "gender_sexuality" | "healthcare_number" | "injury" | "ip_address" | "language" | "location" | "location_address" | "location_address_street" | "location_city" | "location_coordinate" | "location_country" | "location_state" | "location_zip" | "marital_status" | "medical_code" | "medical_condition" | "medical_process" | "money_amount" | "month" | "nationality" | "number_sequence" | "occupation" | "organization" | "organization_id" | "organization_medical_facility" | "passport_number" | "password" | "person_age" | "person_name" | "phone_number" | "physical_attribute" | "political_affiliation" | "product" | "project" | "religion" | "sexuality" | "statistics" | "time" | "trend" | "url" | "us_social_security_number" | "username" | "vehicle_id" | "year" | "zodiac_sign";
@@ -91,6 +184,12 @@ export type TurnEvent = {
91
184
  language_code?: string;
92
185
  language_confidence?: number;
93
186
  speaker_label?: string;
187
+ /**
188
+ * Duration-weighted majority channel across `words[i].channel`. Populated only
189
+ * when the transcriber is configured with `channels`. Independent from
190
+ * `speaker_label`.
191
+ */
192
+ channel?: Channel;
94
193
  };
95
194
  export type StreamingWord = {
96
195
  start: number;
@@ -99,6 +198,20 @@ export type StreamingWord = {
99
198
  text: string;
100
199
  word_is_final: boolean;
101
200
  speaker?: string;
201
+ /**
202
+ * Physical input channel attributed by client-side VAD during this word's
203
+ * time window. Populated only when the transcriber is configured with
204
+ * `channels`. Independent from `speaker`.
205
+ */
206
+ channel?: Channel;
207
+ /**
208
+ * True if `channel` was filled in by `channelAttribution.resolveUnknownChannelsMethod`
209
+ * rather than by the per-word VAD. Only set on words whose per-word VAD
210
+ * attribution was `"unknown"` and whose resolution method produced a
211
+ * confident channel. Useful for debugging or rendering an indicator that a
212
+ * word's channel came from context, not direct VAD evidence.
213
+ */
214
+ channelResolved?: boolean;
102
215
  };
103
216
  export type TerminationEvent = {
104
217
  type: "Termination";
@@ -121,6 +234,7 @@ export type StreamingUpdateConfiguration = {
121
234
  format_turns?: boolean;
122
235
  keyterms_prompt?: string[];
123
236
  prompt?: string;
237
+ agent_context?: string;
124
238
  filter_profanity?: boolean;
125
239
  interruption_delay?: number;
126
240
  turn_left_pad_ms?: number;
@@ -144,5 +258,27 @@ export type LLMGatewayResponseEvent = {
144
258
  transcript: string;
145
259
  data: unknown;
146
260
  };
147
- export type StreamingEventMessage = BeginEvent | TurnEvent | SpeechStartedEvent | TerminationEvent | LLMGatewayResponseEvent | ErrorEvent | WarningEvent;
261
+ /**
262
+ * A single earlier Turn whose speaker labels were revised by reclustering.
263
+ * Match by `turn_order` against the original Turn; replace its per-word
264
+ * `speaker` assignments (and the turn-level `speaker_label`) with these. Text
265
+ * and word timestamps are unchanged from the original Turn.
266
+ */
267
+ export type SpeakerRevisionItem = {
268
+ turn_order: number;
269
+ speaker_label?: string;
270
+ words: StreamingWord[];
271
+ };
272
+ /**
273
+ * Server-side correction to previously-emitted Turns' speaker labels.
274
+ * Diarization-only (emitted only when `speakerLabels` is enabled). Sent once
275
+ * per offline-recluster resolve; `revisions` carries one entry per earlier
276
+ * Turn whose label actually changed (unchanged turns are omitted). Apply each
277
+ * entry by matching its `turn_order`.
278
+ */
279
+ export type SpeakerRevisionEvent = {
280
+ type: "SpeakerRevision";
281
+ revisions: SpeakerRevisionItem[];
282
+ };
283
+ export type StreamingEventMessage = BeginEvent | TurnEvent | SpeechStartedEvent | TerminationEvent | LLMGatewayResponseEvent | SpeakerRevisionEvent | ErrorEvent | WarningEvent;
148
284
  export type StreamingOperationMessage = StreamingUpdateConfiguration | StreamingForceEndpoint | StreamingTerminateSession;