kugelaudio 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,3 +1,9 @@
1
+ ## [kugelaudio-v0.6.0](https://github.com/Kugelaudio/KugelAudio/compare/js-sdk-v0.5.0...js-sdk-v0.6.0) (2026-06-01)
2
+
3
+ ### Features
4
+
5
+ * streaming barge-in (cancelCurrent) across server + JS/Python/Java SDKs ([#1210](https://github.com/Kugelaudio/KugelAudio/issues/1210)) ([341e54f](https://github.com/Kugelaudio/KugelAudio/commit/341e54f169b4dd9242272b249fca30f005bfc3b8))
6
+
1
7
  ## [kugelaudio-v0.5.0](https://github.com/Kugelaudio/KugelAudio/compare/js-sdk-v0.4.0...js-sdk-v0.5.0) (2026-05-21)
2
8
 
3
9
  ### Features
package/dist/index.d.mts CHANGED
@@ -383,6 +383,13 @@ interface StreamingSessionCallbacks {
383
383
  onGenerationStarted?: (chunkId: number, text: string) => void;
384
384
  /** Called when word-level timestamps arrive (requires `wordTimestamps: true`). */
385
385
  onWordTimestamps?: (timestamps: WordTimestamp[]) => void;
386
+ /**
387
+ * Called when the server acknowledges a barge-in
388
+ * ({@link StreamingSession.cancelCurrent}). After this fires, no further
389
+ * audio chunks from the cancelled turn will arrive and the session is
390
+ * ready for the next `send()`.
391
+ */
392
+ onInterrupted?: () => void;
386
393
  /** Called on any error. */
387
394
  onError?: (error: Error) => void;
388
395
  }
@@ -938,8 +945,14 @@ declare class MultiContextSession {
938
945
  flush(contextId: string): void;
939
946
  /**
940
947
  * Close a specific context.
948
+ *
949
+ * @param contextId - The context to close.
950
+ * @param immediate - When `true`, **barge-in**: the server cancels the
951
+ * context's in-flight generation immediately and discards any buffered or
952
+ * queued text instead of draining it. Use this when the end user speaks
953
+ * over the agent. When `false` (default), queued sentences finish first.
941
954
  */
942
- closeContext(contextId: string): void;
955
+ closeContext(contextId: string, immediate?: boolean): void;
943
956
  /**
944
957
  * Send keep-alive to reset a context's inactivity timeout.
945
958
  */
@@ -1016,6 +1029,34 @@ declare class StreamingSession {
1016
1029
  * handle chunking via `chunkLengthSchedule` / `autoMode` instead.
1017
1030
  */
1018
1031
  send(text: string, flush?: boolean): void;
1032
+ /**
1033
+ * Interrupt (barge-in) the current generation without closing the socket.
1034
+ *
1035
+ * Use this when the end user starts speaking over the agent: it tells the
1036
+ * server to **stop generating audio for the current turn immediately** and
1037
+ * drop any text that was buffered or queued but not yet spoken. Unlike
1038
+ * {@link endSession}, no remaining text is flushed — the turn is abandoned.
1039
+ *
1040
+ * The WebSocket stays open and a fresh session is ready, so you can call
1041
+ * {@link send} for the next user turn right away (config is re-sent
1042
+ * automatically on that first `send`).
1043
+ *
1044
+ * The returned promise resolves once the server acknowledges with an
1045
+ * `interrupted` frame (which also fires
1046
+ * {@link StreamingSessionCallbacks.onInterrupted}), or after a 5 s **quiet**
1047
+ * timeout — i.e. 5 s elapse without any server message arriving. The timer
1048
+ * resets on every incoming frame, so a few in-flight audio chunks still
1049
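+ * draining at the moment of cancellation do not trip it prematurely. The promise also resolves immediately if the socket is not open when called, and as soon as the socket closes.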
1050
+ *
1051
+ * @example
1052
+ * ```typescript
1053
+ * // VAD detected the user speaking over the agent:
1054
+ * await session.cancelCurrent();
1055
+ * // Socket is still open — start the next turn immediately:
1056
+ * session.send(nextLlmToken);
1057
+ * ```
1058
+ */
1059
+ cancelCurrent(): Promise<void>;
1019
1060
  /**
1020
1061
  * End the current session but keep the WebSocket connection open.
1021
1062
  *
package/dist/index.d.ts CHANGED
@@ -383,6 +383,13 @@ interface StreamingSessionCallbacks {
383
383
  onGenerationStarted?: (chunkId: number, text: string) => void;
384
384
  /** Called when word-level timestamps arrive (requires `wordTimestamps: true`). */
385
385
  onWordTimestamps?: (timestamps: WordTimestamp[]) => void;
386
+ /**
387
+ * Called when the server acknowledges a barge-in
388
+ * ({@link StreamingSession.cancelCurrent}). After this fires, no further
389
+ * audio chunks from the cancelled turn will arrive and the session is
390
+ * ready for the next `send()`.
391
+ */
392
+ onInterrupted?: () => void;
386
393
  /** Called on any error. */
387
394
  onError?: (error: Error) => void;
388
395
  }
@@ -938,8 +945,14 @@ declare class MultiContextSession {
938
945
  flush(contextId: string): void;
939
946
  /**
940
947
  * Close a specific context.
948
+ *
949
+ * @param contextId - The context to close.
950
+ * @param immediate - When `true`, **barge-in**: the server cancels the
951
+ * context's in-flight generation immediately and discards any buffered or
952
+ * queued text instead of draining it. Use this when the end user speaks
953
+ * over the agent. When `false` (default), queued sentences finish first.
941
954
  */
942
- closeContext(contextId: string): void;
955
+ closeContext(contextId: string, immediate?: boolean): void;
943
956
  /**
944
957
  * Send keep-alive to reset a context's inactivity timeout.
945
958
  */
@@ -1016,6 +1029,34 @@ declare class StreamingSession {
1016
1029
  * handle chunking via `chunkLengthSchedule` / `autoMode` instead.
1017
1030
  */
1018
1031
  send(text: string, flush?: boolean): void;
1032
+ /**
1033
+ * Interrupt (barge-in) the current generation without closing the socket.
1034
+ *
1035
+ * Use this when the end user starts speaking over the agent: it tells the
1036
+ * server to **stop generating audio for the current turn immediately** and
1037
+ * drop any text that was buffered or queued but not yet spoken. Unlike
1038
+ * {@link endSession}, no remaining text is flushed — the turn is abandoned.
1039
+ *
1040
+ * The WebSocket stays open and a fresh session is ready, so you can call
1041
+ * {@link send} for the next user turn right away (config is re-sent
1042
+ * automatically on that first `send`).
1043
+ *
1044
+ * The returned promise resolves once the server acknowledges with an
1045
+ * `interrupted` frame (which also fires
1046
+ * {@link StreamingSessionCallbacks.onInterrupted}), or after a 5 s **quiet**
1047
+ * timeout — i.e. 5 s elapse without any server message arriving. The timer
1048
+ * resets on every incoming frame, so a few in-flight audio chunks still
1049
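+ * draining at the moment of cancellation do not trip it prematurely. The promise also resolves immediately if the socket is not open when called, and as soon as the socket closes.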
1050
+ *
1051
+ * @example
1052
+ * ```typescript
1053
+ * // VAD detected the user speaking over the agent:
1054
+ * await session.cancelCurrent();
1055
+ * // Socket is still open — start the next turn immediately:
1056
+ * session.send(nextLlmToken);
1057
+ * ```
1058
+ */
1059
+ cancelCurrent(): Promise<void>;
1019
1060
  /**
1020
1061
  * End the current session but keep the WebSocket connection open.
1021
1062
  *
package/dist/index.js CHANGED
@@ -1438,13 +1438,21 @@ var MultiContextSession = class {
1438
1438
  }
1439
1439
  /**
1440
1440
  * Close a specific context.
1441
+ *
1442
+ * @param contextId - The context to close.
1443
+ * @param immediate - When `true`, **barge-in**: the server cancels the
1444
+ * context's in-flight generation immediately and discards any buffered or
1445
+ * queued text instead of draining it. Use this when the end user speaks
1446
+ * over the agent. When `false` (default), queued sentences finish first.
1441
1447
  */
1442
- closeContext(contextId) {
1448
+ closeContext(contextId, immediate = false) {
1443
1449
  if (!this.ws || this.ws.readyState !== WS_OPEN) return;
1444
- this.ws.send(JSON.stringify({
1450
+ const msg = {
1445
1451
  close_context: true,
1446
1452
  context_id: contextId
1447
- }));
1453
+ };
1454
+ if (immediate) msg.immediate = true;
1455
+ this.ws.send(JSON.stringify(msg));
1448
1456
  }
1449
1457
  /**
1450
1458
  * Send keep-alive to reset a context's inactivity timeout.
@@ -1549,6 +1557,9 @@ var StreamingSession = class {
1549
1557
  if (data.generation_started) {
1550
1558
  this.callbacks.onGenerationStarted?.(data.chunk_id ?? 0, data.text ?? "");
1551
1559
  }
1560
+ if (data.interrupted) {
1561
+ this.callbacks.onInterrupted?.();
1562
+ }
1552
1563
  if (data.session_closed) {
1553
1564
  this.callbacks.onSessionClosed?.(
1554
1565
  data.total_audio_seconds ?? 0,
@@ -1629,6 +1640,73 @@ var StreamingSession = class {
1629
1640
  }
1630
1641
  this.ws.send(JSON.stringify(msg));
1631
1642
  }
1643
+ /**
1644
+ * Interrupt (barge-in) the current generation without closing the socket.
1645
+ *
1646
+ * Use this when the end user starts speaking over the agent: it tells the
1647
+ * server to **stop generating audio for the current turn immediately** and
1648
+ * drop any text that was buffered or queued but not yet spoken. Unlike
1649
+ * {@link endSession}, no remaining text is flushed — the turn is abandoned.
1650
+ *
1651
+ * The WebSocket stays open and a fresh session is ready, so you can call
1652
+ * {@link send} for the next user turn right away (config is re-sent
1653
+ * automatically on that first `send`).
1654
+ *
1655
+ * The returned promise resolves once the server acknowledges with an
1656
+ * `interrupted` frame (which also fires
1657
+ * {@link StreamingSessionCallbacks.onInterrupted}), or after a 5 s **quiet**
1658
+ * timeout — i.e. 5 s elapse without any server message arriving. The timer
1659
+ * resets on every incoming frame, so a few in-flight audio chunks still
1660
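+ * draining at the moment of cancellation do not trip it prematurely. The promise also resolves immediately if the socket is not open when called, and as soon as the socket closes.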
1661
+ *
1662
+ * @example
1663
+ * ```typescript
1664
+ * // VAD detected the user speaking over the agent:
1665
+ * await session.cancelCurrent();
1666
+ * // Socket is still open — start the next turn immediately:
1667
+ * session.send(nextLlmToken);
1668
+ * ```
1669
+ */
1670
+ cancelCurrent() {
1671
+ if (!this.ws || this.ws.readyState !== WS_OPEN) return Promise.resolve();
1672
+ const ws = this.ws;
1673
+ const QUIET_TIMEOUT_MS = 5e3;
1674
+ return new Promise((resolve) => {
1675
+ let settled = false;
1676
+ let timer;
1677
+ const prevMessage = ws.onmessage;
1678
+ const prevClose = ws.onclose;
1679
+ const done = () => {
1680
+ if (settled) return;
1681
+ settled = true;
1682
+ clearTimeout(timer);
1683
+ ws.onmessage = prevMessage;
1684
+ ws.onclose = prevClose;
1685
+ this.configSent = false;
1686
+ resolve();
1687
+ };
1688
+ const armQuietTimer = () => {
1689
+ clearTimeout(timer);
1690
+ timer = setTimeout(done, QUIET_TIMEOUT_MS);
1691
+ };
1692
+ armQuietTimer();
1693
+ ws.onmessage = (event) => {
1694
+ armQuietTimer();
1695
+ if (prevMessage) prevMessage.call(ws, event);
1696
+ try {
1697
+ const raw = typeof event.data === "string" ? event.data : event.data instanceof Buffer ? event.data.toString() : String(event.data);
1698
+ if (JSON.parse(raw).interrupted) done();
1699
+ } catch {
1700
+ }
1701
+ };
1702
+ ws.onclose = (event) => {
1703
+ this.ws = null;
1704
+ if (prevClose) prevClose.call(ws, event);
1705
+ done();
1706
+ };
1707
+ ws.send(JSON.stringify({ cancel: true }));
1708
+ });
1709
+ }
1632
1710
  /**
1633
1711
  * End the current session but keep the WebSocket connection open.
1634
1712
  *
package/dist/index.mjs CHANGED
@@ -1400,13 +1400,21 @@ var MultiContextSession = class {
1400
1400
  }
1401
1401
  /**
1402
1402
  * Close a specific context.
1403
+ *
1404
+ * @param contextId - The context to close.
1405
+ * @param immediate - When `true`, **barge-in**: the server cancels the
1406
+ * context's in-flight generation immediately and discards any buffered or
1407
+ * queued text instead of draining it. Use this when the end user speaks
1408
+ * over the agent. When `false` (default), queued sentences finish first.
1403
1409
  */
1404
- closeContext(contextId) {
1410
+ closeContext(contextId, immediate = false) {
1405
1411
  if (!this.ws || this.ws.readyState !== WS_OPEN) return;
1406
- this.ws.send(JSON.stringify({
1412
+ const msg = {
1407
1413
  close_context: true,
1408
1414
  context_id: contextId
1409
- }));
1415
+ };
1416
+ if (immediate) msg.immediate = true;
1417
+ this.ws.send(JSON.stringify(msg));
1410
1418
  }
1411
1419
  /**
1412
1420
  * Send keep-alive to reset a context's inactivity timeout.
@@ -1511,6 +1519,9 @@ var StreamingSession = class {
1511
1519
  if (data.generation_started) {
1512
1520
  this.callbacks.onGenerationStarted?.(data.chunk_id ?? 0, data.text ?? "");
1513
1521
  }
1522
+ if (data.interrupted) {
1523
+ this.callbacks.onInterrupted?.();
1524
+ }
1514
1525
  if (data.session_closed) {
1515
1526
  this.callbacks.onSessionClosed?.(
1516
1527
  data.total_audio_seconds ?? 0,
@@ -1591,6 +1602,73 @@ var StreamingSession = class {
1591
1602
  }
1592
1603
  this.ws.send(JSON.stringify(msg));
1593
1604
  }
1605
+ /**
1606
+ * Interrupt (barge-in) the current generation without closing the socket.
1607
+ *
1608
+ * Use this when the end user starts speaking over the agent: it tells the
1609
+ * server to **stop generating audio for the current turn immediately** and
1610
+ * drop any text that was buffered or queued but not yet spoken. Unlike
1611
+ * {@link endSession}, no remaining text is flushed — the turn is abandoned.
1612
+ *
1613
+ * The WebSocket stays open and a fresh session is ready, so you can call
1614
+ * {@link send} for the next user turn right away (config is re-sent
1615
+ * automatically on that first `send`).
1616
+ *
1617
+ * The returned promise resolves once the server acknowledges with an
1618
+ * `interrupted` frame (which also fires
1619
+ * {@link StreamingSessionCallbacks.onInterrupted}), or after a 5 s **quiet**
1620
+ * timeout — i.e. 5 s elapse without any server message arriving. The timer
1621
+ * resets on every incoming frame, so a few in-flight audio chunks still
1622
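+ * draining at the moment of cancellation do not trip it prematurely. The promise also resolves immediately if the socket is not open when called, and as soon as the socket closes.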
1623
+ *
1624
+ * @example
1625
+ * ```typescript
1626
+ * // VAD detected the user speaking over the agent:
1627
+ * await session.cancelCurrent();
1628
+ * // Socket is still open — start the next turn immediately:
1629
+ * session.send(nextLlmToken);
1630
+ * ```
1631
+ */
1632
+ cancelCurrent() {
1633
+ if (!this.ws || this.ws.readyState !== WS_OPEN) return Promise.resolve();
1634
+ const ws = this.ws;
1635
+ const QUIET_TIMEOUT_MS = 5e3;
1636
+ return new Promise((resolve) => {
1637
+ let settled = false;
1638
+ let timer;
1639
+ const prevMessage = ws.onmessage;
1640
+ const prevClose = ws.onclose;
1641
+ const done = () => {
1642
+ if (settled) return;
1643
+ settled = true;
1644
+ clearTimeout(timer);
1645
+ ws.onmessage = prevMessage;
1646
+ ws.onclose = prevClose;
1647
+ this.configSent = false;
1648
+ resolve();
1649
+ };
1650
+ const armQuietTimer = () => {
1651
+ clearTimeout(timer);
1652
+ timer = setTimeout(done, QUIET_TIMEOUT_MS);
1653
+ };
1654
+ armQuietTimer();
1655
+ ws.onmessage = (event) => {
1656
+ armQuietTimer();
1657
+ if (prevMessage) prevMessage.call(ws, event);
1658
+ try {
1659
+ const raw = typeof event.data === "string" ? event.data : event.data instanceof Buffer ? event.data.toString() : String(event.data);
1660
+ if (JSON.parse(raw).interrupted) done();
1661
+ } catch {
1662
+ }
1663
+ };
1664
+ ws.onclose = (event) => {
1665
+ this.ws = null;
1666
+ if (prevClose) prevClose.call(ws, event);
1667
+ done();
1668
+ };
1669
+ ws.send(JSON.stringify({ cancel: true }));
1670
+ });
1671
+ }
1594
1672
  /**
1595
1673
  * End the current session but keep the WebSocket connection open.
1596
1674
  *
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "kugelaudio",
3
- "version": "0.5.0",
3
+ "version": "0.6.0",
4
4
  "description": "Official JavaScript/TypeScript SDK for KugelAudio TTS API",
5
5
  "main": "dist/index.js",
6
6
  "module": "dist/index.mjs",
@@ -333,6 +333,10 @@ function makeGenerationStartedMsg(chunkId: number, text: string): string {
333
333
  });
334
334
  }
335
335
 
336
+ function makeInterruptedMsg(): string {
337
+ return JSON.stringify({ interrupted: true });
338
+ }
339
+
336
340
  describe('StreamingSession', () => {
337
341
  let client: KugelAudio;
338
342
 
@@ -545,4 +549,115 @@ describe('StreamingSession', () => {
545
549
  expect(session.isConnected).toBe(true);
546
550
  expect(() => session.send('Hello.', true)).not.toThrow();
547
551
  });
552
+
553
+ // -------------------------------------------------------------------------
554
+ // cancelCurrent() — barge-in (KUG-1050)
555
+ // -------------------------------------------------------------------------
556
+
557
+ it('cancelCurrent() sends {cancel:true}, fires onInterrupted, keeps socket open', async () => {
558
+ const interruptedCalls: number[] = [];
559
+
560
+ const session = client.tts.streamingSession(
561
+ { voiceId: 1 },
562
+ { onInterrupted: () => interruptedCalls.push(1) },
563
+ );
564
+
565
+ session.connect();
566
+ await new Promise<void>((r) => setTimeout(r, 10));
567
+
568
+ session.send('A very long sentence the user is about to talk over.');
569
+ mockWs.onmessage?.({ data: makeAudioMsg(0, 100) });
570
+
571
+ const cancelPromise = session.cancelCurrent();
572
+
573
+ // The barge-in frame was sent to the server.
574
+ const lastSent = JSON.parse(mockWs.send.mock.calls[mockWs.send.mock.calls.length - 1][0] as string);
575
+ expect(lastSent.cancel).toBe(true);
576
+
577
+ // Server acks the barge-in.
578
+ mockWs.onmessage?.({ data: makeInterruptedMsg() });
579
+ await cancelPromise;
580
+
581
+ // onInterrupted fired and the socket stayed open for the next turn.
582
+ expect(interruptedCalls).toHaveLength(1);
583
+ expect(session.isConnected).toBe(true);
584
+ expect(mockWs.close).not.toHaveBeenCalled();
585
+ });
586
+
587
+ it('cancelCurrent() re-sends config on the next send (fresh server session)', async () => {
588
+ const session = client.tts.streamingSession({ voiceId: 42 }, {});
589
+
590
+ session.connect();
591
+ await new Promise<void>((r) => setTimeout(r, 10));
592
+
593
+ // First send carries config (voice_id).
594
+ session.send('Hello.');
595
+ expect(JSON.parse(mockWs.send.mock.calls[mockWs.send.mock.calls.length - 1][0] as string).voice_id).toBe(42);
596
+
597
+ const cancelPromise = session.cancelCurrent();
598
+ mockWs.onmessage?.({ data: makeInterruptedMsg() });
599
+ await cancelPromise;
600
+
601
+ // The server started a fresh session, so the next send must re-send config.
602
+ session.send('Next turn.');
603
+ expect(JSON.parse(mockWs.send.mock.calls[mockWs.send.mock.calls.length - 1][0] as string).voice_id).toBe(42);
604
+ });
605
+
606
+ it('cancelCurrent() resolves on quiet timeout if server never acks', async () => {
607
+ const session = client.tts.streamingSession({ voiceId: 1 }, {});
608
+
609
+ session.connect();
610
+ await new Promise<void>((r) => setTimeout(r, 10));
611
+ session.send('Hello.');
612
+
613
+ vi.useFakeTimers();
614
+ const cancelPromise = session.cancelCurrent();
615
+
616
+ // No interrupted ack — the 5 s quiet timeout resolves it.
617
+ await vi.advanceTimersByTimeAsync(6_000);
618
+ await cancelPromise;
619
+
620
+ vi.useRealTimers();
621
+ // Socket was never closed; still reusable.
622
+ expect(session.isConnected).toBe(true);
623
+ });
624
+ });
625
+
626
+ // ---------------------------------------------------------------------------
627
+ // MultiContextSession barge-in — closeContext immediate (KUG-1050)
628
+ // ---------------------------------------------------------------------------
629
+
630
+ describe('MultiContextSession closeContext', () => {
631
+ let client: KugelAudio;
632
+
633
+ beforeEach(() => {
634
+ client = new KugelAudio({ apiKey: 'test-key-xxx' });
635
+ });
636
+
637
+ it('closeContext(id, true) sends the immediate barge-in flag', async () => {
638
+ const session = client.tts.createMultiContextSession({ defaultVoiceId: 1 });
639
+ await session.connect({});
640
+
641
+ session.closeContext('ctx1', true);
642
+
643
+ const sent = JSON.parse(
644
+ mockWs.send.mock.calls[mockWs.send.mock.calls.length - 1][0] as string
645
+ );
646
+ expect(sent.close_context).toBe(true);
647
+ expect(sent.context_id).toBe('ctx1');
648
+ expect(sent.immediate).toBe(true);
649
+ });
650
+
651
+ it('closeContext(id) omits immediate (graceful drain)', async () => {
652
+ const session = client.tts.createMultiContextSession({ defaultVoiceId: 1 });
653
+ await session.connect({});
654
+
655
+ session.closeContext('ctx1');
656
+
657
+ const sent = JSON.parse(
658
+ mockWs.send.mock.calls[mockWs.send.mock.calls.length - 1][0] as string
659
+ );
660
+ expect(sent.close_context).toBe(true);
661
+ expect(sent.immediate).toBeUndefined();
662
+ });
548
663
  });
package/src/client.ts CHANGED
@@ -1195,14 +1195,22 @@ class MultiContextSession {
1195
1195
 
1196
1196
  /**
1197
1197
  * Close a specific context.
1198
+ *
1199
+ * @param contextId - The context to close.
1200
+ * @param immediate - When `true`, **barge-in**: the server cancels the
1201
+ * context's in-flight generation immediately and discards any buffered or
1202
+ * queued text instead of draining it. Use this when the end user speaks
1203
+ * over the agent. When `false` (default), queued sentences finish first.
1198
1204
  */
1199
- closeContext(contextId: string): void {
1205
+ closeContext(contextId: string, immediate = false): void {
1200
1206
  if (!this.ws || this.ws.readyState !== WS_OPEN) return;
1201
1207
 
1202
- this.ws.send(JSON.stringify({
1208
+ const msg: Record<string, unknown> = {
1203
1209
  close_context: true,
1204
1210
  context_id: contextId,
1205
- }));
1211
+ };
1212
+ if (immediate) msg.immediate = true;
1213
+ this.ws.send(JSON.stringify(msg));
1206
1214
  }
1207
1215
 
1208
1216
  /**
@@ -1362,6 +1370,10 @@ class StreamingSession {
1362
1370
  this.callbacks.onGenerationStarted?.(data.chunk_id ?? 0, data.text ?? '');
1363
1371
  }
1364
1372
 
1373
+ if (data.interrupted) {
1374
+ this.callbacks.onInterrupted?.();
1375
+ }
1376
+
1365
1377
  if (data.session_closed) {
1366
1378
  this.callbacks.onSessionClosed?.(
1367
1379
  data.total_audio_seconds ?? 0,
@@ -1461,6 +1473,97 @@ class StreamingSession {
1461
1473
  this.ws.send(JSON.stringify(msg));
1462
1474
  }
1463
1475
 
1476
+ /**
1477
+ * Interrupt (barge-in) the current generation without closing the socket.
1478
+ *
1479
+ * Use this when the end user starts speaking over the agent: it tells the
1480
+ * server to **stop generating audio for the current turn immediately** and
1481
+ * drop any text that was buffered or queued but not yet spoken. Unlike
1482
+ * {@link endSession}, no remaining text is flushed — the turn is abandoned.
1483
+ *
1484
+ * The WebSocket stays open and a fresh session is ready, so you can call
1485
+ * {@link send} for the next user turn right away (config is re-sent
1486
+ * automatically on that first `send`).
1487
+ *
1488
+ * The returned promise resolves once the server acknowledges with an
1489
+ * `interrupted` frame (which also fires
1490
+ * {@link StreamingSessionCallbacks.onInterrupted}), or after a 5 s **quiet**
1491
+ * timeout — i.e. 5 s elapse without any server message arriving. The timer
1492
+ * resets on every incoming frame, so a few in-flight audio chunks still
1493
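+ * draining at the moment of cancellation do not trip it prematurely. The promise also resolves immediately if the socket is not open when called, and as soon as the socket closes.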
1494
+ *
1495
+ * @example
1496
+ * ```typescript
1497
+ * // VAD detected the user speaking over the agent:
1498
+ * await session.cancelCurrent();
1499
+ * // Socket is still open — start the next turn immediately:
1500
+ * session.send(nextLlmToken);
1501
+ * ```
1502
+ */
1503
+ cancelCurrent(): Promise<void> {
1504
+ if (!this.ws || this.ws.readyState !== WS_OPEN) return Promise.resolve();
1505
+
1506
+ const ws = this.ws;
1507
+ // Quiet timeout: resets on every incoming server message. Trips only
1508
+ // when the server has been silent for this long. A short window is fine
1509
+ // here because the server cancels in-flight generation promptly; we only
1510
+ // need to outlast a handful of already-emitted audio frames in transit.
1511
+ const QUIET_TIMEOUT_MS = 5_000;
1512
+
1513
+ return new Promise<void>((resolve) => {
1514
+ let settled = false;
1515
+ let timer: ReturnType<typeof setTimeout>;
1516
+
1517
+ const prevMessage = ws.onmessage;
1518
+ const prevClose = ws.onclose;
1519
+
1520
+ const done = () => {
1521
+ if (settled) return;
1522
+ settled = true;
1523
+ clearTimeout(timer);
1524
+ // Restore the original handlers so subsequent calls don't stack
1525
+ // wrappers and the typed-error onclose installed by connect() stays
1526
+ // in effect for the next turn.
1527
+ ws.onmessage = prevMessage;
1528
+ ws.onclose = prevClose;
1529
+ // The server starts a fresh session after a cancel, so the next
1530
+ // send() must re-send config.
1531
+ this.configSent = false;
1532
+ resolve();
1533
+ };
1534
+
1535
+ const armQuietTimer = () => {
1536
+ clearTimeout(timer);
1537
+ timer = setTimeout(done, QUIET_TIMEOUT_MS);
1538
+ };
1539
+
1540
+ armQuietTimer();
1541
+
1542
+ ws.onmessage = (event: MessageEvent) => {
1543
+ // Reset the quiet timer on EVERY incoming frame — late audio chunks
1544
+ // from the cancelled turn count as liveness, not just the ack.
1545
+ armQuietTimer();
1546
+ if (prevMessage) prevMessage.call(ws, event);
1547
+ try {
1548
+ const raw = typeof event.data === 'string'
1549
+ ? event.data
1550
+ : event.data instanceof Buffer
1551
+ ? event.data.toString()
1552
+ : String(event.data);
1553
+ if (JSON.parse(raw).interrupted) done();
1554
+ } catch { /* ignore parse errors */ }
1555
+ };
1556
+
1557
+ ws.onclose = (event: CloseEvent) => {
1558
+ this.ws = null;
1559
+ if (prevClose) prevClose.call(ws, event);
1560
+ done();
1561
+ };
1562
+
1563
+ ws.send(JSON.stringify({ cancel: true }));
1564
+ });
1565
+ }
1566
+
1464
1567
  /**
1465
1568
  * End the current session but keep the WebSocket connection open.
1466
1569
  *
package/src/types.ts CHANGED
@@ -408,6 +408,13 @@ export interface StreamingSessionCallbacks {
408
408
  onGenerationStarted?: (chunkId: number, text: string) => void;
409
409
  /** Called when word-level timestamps arrive (requires `wordTimestamps: true`). */
410
410
  onWordTimestamps?: (timestamps: WordTimestamp[]) => void;
411
+ /**
412
+ * Called when the server acknowledges a barge-in
413
+ * ({@link StreamingSession.cancelCurrent}). After this fires, no further
414
+ * audio chunks from the cancelled turn will arrive and the session is
415
+ * ready for the next `send()`.
416
+ */
417
+ onInterrupted?: () => void;
411
418
  /** Called on any error. */
412
419
  onError?: (error: Error) => void;
413
420
  }