kugelaudio 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +6 -0
- package/dist/index.d.mts +42 -1
- package/dist/index.d.ts +42 -1
- package/dist/index.js +81 -3
- package/dist/index.mjs +81 -3
- package/package.json +1 -1
- package/src/client.test.ts +115 -0
- package/src/client.ts +106 -3
- package/src/types.ts +7 -0
package/CHANGELOG.md
CHANGED
|
@@ -1,3 +1,9 @@
|
|
|
1
|
+
## [kugelaudio-v0.6.0](https://github.com/Kugelaudio/KugelAudio/compare/js-sdk-v0.5.0...js-sdk-v0.6.0) (2026-06-01)
|
|
2
|
+
|
|
3
|
+
### Features
|
|
4
|
+
|
|
5
|
+
* streaming barge-in (cancelCurrent) across server + JS/Python/Java SDKs ([#1210](https://github.com/Kugelaudio/KugelAudio/issues/1210)) ([341e54f](https://github.com/Kugelaudio/KugelAudio/commit/341e54f169b4dd9242272b249fca30f005bfc3b8))
|
|
6
|
+
|
|
1
7
|
## [kugelaudio-v0.5.0](https://github.com/Kugelaudio/KugelAudio/compare/js-sdk-v0.4.0...js-sdk-v0.5.0) (2026-05-21)
|
|
2
8
|
|
|
3
9
|
### Features
|
package/dist/index.d.mts
CHANGED
|
@@ -383,6 +383,13 @@ interface StreamingSessionCallbacks {
|
|
|
383
383
|
onGenerationStarted?: (chunkId: number, text: string) => void;
|
|
384
384
|
/** Called when word-level timestamps arrive (requires `wordTimestamps: true`). */
|
|
385
385
|
onWordTimestamps?: (timestamps: WordTimestamp[]) => void;
|
|
386
|
+
/**
|
|
387
|
+
* Called when the server acknowledges a barge-in
|
|
388
|
+
* ({@link StreamingSession.cancelCurrent}). After this fires, no further
|
|
389
|
+
* audio chunks from the cancelled turn will arrive and the session is
|
|
390
|
+
* ready for the next `send()`.
|
|
391
|
+
*/
|
|
392
|
+
onInterrupted?: () => void;
|
|
386
393
|
/** Called on any error. */
|
|
387
394
|
onError?: (error: Error) => void;
|
|
388
395
|
}
|
|
@@ -938,8 +945,14 @@ declare class MultiContextSession {
|
|
|
938
945
|
flush(contextId: string): void;
|
|
939
946
|
/**
|
|
940
947
|
* Close a specific context.
|
|
948
|
+
*
|
|
949
|
+
* @param contextId - The context to close.
|
|
950
|
+
* @param immediate - When `true`, **barge-in**: the server cancels the
|
|
951
|
+
* context's in-flight generation immediately and discards any buffered or
|
|
952
|
+
* queued text instead of draining it. Use this when the end user speaks
|
|
953
|
+
* over the agent. When `false` (default), queued sentences finish first.
|
|
941
954
|
*/
|
|
942
|
-
closeContext(contextId: string): void;
|
|
955
|
+
closeContext(contextId: string, immediate?: boolean): void;
|
|
943
956
|
/**
|
|
944
957
|
* Send keep-alive to reset a context's inactivity timeout.
|
|
945
958
|
*/
|
|
@@ -1016,6 +1029,34 @@ declare class StreamingSession {
|
|
|
1016
1029
|
* handle chunking via `chunkLengthSchedule` / `autoMode` instead.
|
|
1017
1030
|
*/
|
|
1018
1031
|
send(text: string, flush?: boolean): void;
|
|
1032
|
+
/**
|
|
1033
|
+
* Interrupt (barge-in) the current generation without closing the socket.
|
|
1034
|
+
*
|
|
1035
|
+
* Use this when the end user starts speaking over the agent: it tells the
|
|
1036
|
+
* server to **stop generating audio for the current turn immediately** and
|
|
1037
|
+
* drop any text that was buffered or queued but not yet spoken. Unlike
|
|
1038
|
+
* {@link endSession}, no remaining text is flushed — the turn is abandoned.
|
|
1039
|
+
*
|
|
1040
|
+
* The WebSocket stays open and a fresh session is ready, so you can call
|
|
1041
|
+
* {@link send} for the next user turn right away (config is re-sent
|
|
1042
|
+
* automatically on that first `send`).
|
|
1043
|
+
*
|
|
1044
|
+
* The returned promise resolves once the server acknowledges with an
|
|
1045
|
+
* `interrupted` frame (which also fires
|
|
1046
|
+
* {@link StreamingSessionCallbacks.onInterrupted}), or after a 5 s **quiet**
|
|
1047
|
+
* timeout — i.e. 5 s elapse without any server message arriving. The timer
|
|
1048
|
+
* resets on every incoming frame, so a few in-flight audio chunks still
|
|
1049
|
+
* draining at the moment of cancellation do not trip it prematurely.
|
|
1050
|
+
*
|
|
1051
|
+
* @example
|
|
1052
|
+
* ```typescript
|
|
1053
|
+
* // VAD detected the user speaking over the agent:
|
|
1054
|
+
* await session.cancelCurrent();
|
|
1055
|
+
* // Socket is still open — start the next turn immediately:
|
|
1056
|
+
* session.send(nextLlmToken);
|
|
1057
|
+
* ```
|
|
1058
|
+
*/
|
|
1059
|
+
cancelCurrent(): Promise<void>;
|
|
1019
1060
|
/**
|
|
1020
1061
|
* End the current session but keep the WebSocket connection open.
|
|
1021
1062
|
*
|
package/dist/index.d.ts
CHANGED
|
@@ -383,6 +383,13 @@ interface StreamingSessionCallbacks {
|
|
|
383
383
|
onGenerationStarted?: (chunkId: number, text: string) => void;
|
|
384
384
|
/** Called when word-level timestamps arrive (requires `wordTimestamps: true`). */
|
|
385
385
|
onWordTimestamps?: (timestamps: WordTimestamp[]) => void;
|
|
386
|
+
/**
|
|
387
|
+
* Called when the server acknowledges a barge-in
|
|
388
|
+
* ({@link StreamingSession.cancelCurrent}). After this fires, no further
|
|
389
|
+
* audio chunks from the cancelled turn will arrive and the session is
|
|
390
|
+
* ready for the next `send()`.
|
|
391
|
+
*/
|
|
392
|
+
onInterrupted?: () => void;
|
|
386
393
|
/** Called on any error. */
|
|
387
394
|
onError?: (error: Error) => void;
|
|
388
395
|
}
|
|
@@ -938,8 +945,14 @@ declare class MultiContextSession {
|
|
|
938
945
|
flush(contextId: string): void;
|
|
939
946
|
/**
|
|
940
947
|
* Close a specific context.
|
|
948
|
+
*
|
|
949
|
+
* @param contextId - The context to close.
|
|
950
|
+
* @param immediate - When `true`, **barge-in**: the server cancels the
|
|
951
|
+
* context's in-flight generation immediately and discards any buffered or
|
|
952
|
+
* queued text instead of draining it. Use this when the end user speaks
|
|
953
|
+
* over the agent. When `false` (default), queued sentences finish first.
|
|
941
954
|
*/
|
|
942
|
-
closeContext(contextId: string): void;
|
|
955
|
+
closeContext(contextId: string, immediate?: boolean): void;
|
|
943
956
|
/**
|
|
944
957
|
* Send keep-alive to reset a context's inactivity timeout.
|
|
945
958
|
*/
|
|
@@ -1016,6 +1029,34 @@ declare class StreamingSession {
|
|
|
1016
1029
|
* handle chunking via `chunkLengthSchedule` / `autoMode` instead.
|
|
1017
1030
|
*/
|
|
1018
1031
|
send(text: string, flush?: boolean): void;
|
|
1032
|
+
/**
|
|
1033
|
+
* Interrupt (barge-in) the current generation without closing the socket.
|
|
1034
|
+
*
|
|
1035
|
+
* Use this when the end user starts speaking over the agent: it tells the
|
|
1036
|
+
* server to **stop generating audio for the current turn immediately** and
|
|
1037
|
+
* drop any text that was buffered or queued but not yet spoken. Unlike
|
|
1038
|
+
* {@link endSession}, no remaining text is flushed — the turn is abandoned.
|
|
1039
|
+
*
|
|
1040
|
+
* The WebSocket stays open and a fresh session is ready, so you can call
|
|
1041
|
+
* {@link send} for the next user turn right away (config is re-sent
|
|
1042
|
+
* automatically on that first `send`).
|
|
1043
|
+
*
|
|
1044
|
+
* The returned promise resolves once the server acknowledges with an
|
|
1045
|
+
* `interrupted` frame (which also fires
|
|
1046
|
+
* {@link StreamingSessionCallbacks.onInterrupted}), or after a 5 s **quiet**
|
|
1047
|
+
* timeout — i.e. 5 s elapse without any server message arriving. The timer
|
|
1048
|
+
* resets on every incoming frame, so a few in-flight audio chunks still
|
|
1049
|
+
* draining at the moment of cancellation do not trip it prematurely.
|
|
1050
|
+
*
|
|
1051
|
+
* @example
|
|
1052
|
+
* ```typescript
|
|
1053
|
+
* // VAD detected the user speaking over the agent:
|
|
1054
|
+
* await session.cancelCurrent();
|
|
1055
|
+
* // Socket is still open — start the next turn immediately:
|
|
1056
|
+
* session.send(nextLlmToken);
|
|
1057
|
+
* ```
|
|
1058
|
+
*/
|
|
1059
|
+
cancelCurrent(): Promise<void>;
|
|
1019
1060
|
/**
|
|
1020
1061
|
* End the current session but keep the WebSocket connection open.
|
|
1021
1062
|
*
|
package/dist/index.js
CHANGED
|
@@ -1438,13 +1438,21 @@ var MultiContextSession = class {
|
|
|
1438
1438
|
}
|
|
1439
1439
|
/**
|
|
1440
1440
|
* Close a specific context.
|
|
1441
|
+
*
|
|
1442
|
+
* @param contextId - The context to close.
|
|
1443
|
+
* @param immediate - When `true`, **barge-in**: the server cancels the
|
|
1444
|
+
* context's in-flight generation immediately and discards any buffered or
|
|
1445
|
+
* queued text instead of draining it. Use this when the end user speaks
|
|
1446
|
+
* over the agent. When `false` (default), queued sentences finish first.
|
|
1441
1447
|
*/
|
|
1442
|
-
closeContext(contextId) {
|
|
1448
|
+
closeContext(contextId, immediate = false) {
|
|
1443
1449
|
if (!this.ws || this.ws.readyState !== WS_OPEN) return;
|
|
1444
|
-
|
|
1450
|
+
const msg = {
|
|
1445
1451
|
close_context: true,
|
|
1446
1452
|
context_id: contextId
|
|
1447
|
-
}
|
|
1453
|
+
};
|
|
1454
|
+
if (immediate) msg.immediate = true;
|
|
1455
|
+
this.ws.send(JSON.stringify(msg));
|
|
1448
1456
|
}
|
|
1449
1457
|
/**
|
|
1450
1458
|
* Send keep-alive to reset a context's inactivity timeout.
|
|
@@ -1549,6 +1557,9 @@ var StreamingSession = class {
|
|
|
1549
1557
|
if (data.generation_started) {
|
|
1550
1558
|
this.callbacks.onGenerationStarted?.(data.chunk_id ?? 0, data.text ?? "");
|
|
1551
1559
|
}
|
|
1560
|
+
if (data.interrupted) {
|
|
1561
|
+
this.callbacks.onInterrupted?.();
|
|
1562
|
+
}
|
|
1552
1563
|
if (data.session_closed) {
|
|
1553
1564
|
this.callbacks.onSessionClosed?.(
|
|
1554
1565
|
data.total_audio_seconds ?? 0,
|
|
@@ -1629,6 +1640,73 @@ var StreamingSession = class {
|
|
|
1629
1640
|
}
|
|
1630
1641
|
this.ws.send(JSON.stringify(msg));
|
|
1631
1642
|
}
|
|
1643
|
+
/**
|
|
1644
|
+
* Interrupt (barge-in) the current generation without closing the socket.
|
|
1645
|
+
*
|
|
1646
|
+
* Use this when the end user starts speaking over the agent: it tells the
|
|
1647
|
+
* server to **stop generating audio for the current turn immediately** and
|
|
1648
|
+
* drop any text that was buffered or queued but not yet spoken. Unlike
|
|
1649
|
+
* {@link endSession}, no remaining text is flushed — the turn is abandoned.
|
|
1650
|
+
*
|
|
1651
|
+
* The WebSocket stays open and a fresh session is ready, so you can call
|
|
1652
|
+
* {@link send} for the next user turn right away (config is re-sent
|
|
1653
|
+
* automatically on that first `send`).
|
|
1654
|
+
*
|
|
1655
|
+
* The returned promise resolves once the server acknowledges with an
|
|
1656
|
+
* `interrupted` frame (which also fires
|
|
1657
|
+
* {@link StreamingSessionCallbacks.onInterrupted}), or after a 5 s **quiet**
|
|
1658
|
+
* timeout — i.e. 5 s elapse without any server message arriving. The timer
|
|
1659
|
+
* resets on every incoming frame, so a few in-flight audio chunks still
|
|
1660
|
+
* draining at the moment of cancellation do not trip it prematurely.
|
|
1661
|
+
*
|
|
1662
|
+
* @example
|
|
1663
|
+
* ```typescript
|
|
1664
|
+
* // VAD detected the user speaking over the agent:
|
|
1665
|
+
* await session.cancelCurrent();
|
|
1666
|
+
* // Socket is still open — start the next turn immediately:
|
|
1667
|
+
* session.send(nextLlmToken);
|
|
1668
|
+
* ```
|
|
1669
|
+
*/
|
|
1670
|
+
cancelCurrent() {
|
|
1671
|
+
if (!this.ws || this.ws.readyState !== WS_OPEN) return Promise.resolve();
|
|
1672
|
+
const ws = this.ws;
|
|
1673
|
+
const QUIET_TIMEOUT_MS = 5e3;
|
|
1674
|
+
return new Promise((resolve) => {
|
|
1675
|
+
let settled = false;
|
|
1676
|
+
let timer;
|
|
1677
|
+
const prevMessage = ws.onmessage;
|
|
1678
|
+
const prevClose = ws.onclose;
|
|
1679
|
+
const done = () => {
|
|
1680
|
+
if (settled) return;
|
|
1681
|
+
settled = true;
|
|
1682
|
+
clearTimeout(timer);
|
|
1683
|
+
ws.onmessage = prevMessage;
|
|
1684
|
+
ws.onclose = prevClose;
|
|
1685
|
+
this.configSent = false;
|
|
1686
|
+
resolve();
|
|
1687
|
+
};
|
|
1688
|
+
const armQuietTimer = () => {
|
|
1689
|
+
clearTimeout(timer);
|
|
1690
|
+
timer = setTimeout(done, QUIET_TIMEOUT_MS);
|
|
1691
|
+
};
|
|
1692
|
+
armQuietTimer();
|
|
1693
|
+
ws.onmessage = (event) => {
|
|
1694
|
+
armQuietTimer();
|
|
1695
|
+
if (prevMessage) prevMessage.call(ws, event);
|
|
1696
|
+
try {
|
|
1697
|
+
const raw = typeof event.data === "string" ? event.data : event.data instanceof Buffer ? event.data.toString() : String(event.data);
|
|
1698
|
+
if (JSON.parse(raw).interrupted) done();
|
|
1699
|
+
} catch {
|
|
1700
|
+
}
|
|
1701
|
+
};
|
|
1702
|
+
ws.onclose = (event) => {
|
|
1703
|
+
this.ws = null;
|
|
1704
|
+
if (prevClose) prevClose.call(ws, event);
|
|
1705
|
+
done();
|
|
1706
|
+
};
|
|
1707
|
+
ws.send(JSON.stringify({ cancel: true }));
|
|
1708
|
+
});
|
|
1709
|
+
}
|
|
1632
1710
|
/**
|
|
1633
1711
|
* End the current session but keep the WebSocket connection open.
|
|
1634
1712
|
*
|
package/dist/index.mjs
CHANGED
|
@@ -1400,13 +1400,21 @@ var MultiContextSession = class {
|
|
|
1400
1400
|
}
|
|
1401
1401
|
/**
|
|
1402
1402
|
* Close a specific context.
|
|
1403
|
+
*
|
|
1404
|
+
* @param contextId - The context to close.
|
|
1405
|
+
* @param immediate - When `true`, **barge-in**: the server cancels the
|
|
1406
|
+
* context's in-flight generation immediately and discards any buffered or
|
|
1407
|
+
* queued text instead of draining it. Use this when the end user speaks
|
|
1408
|
+
* over the agent. When `false` (default), queued sentences finish first.
|
|
1403
1409
|
*/
|
|
1404
|
-
closeContext(contextId) {
|
|
1410
|
+
closeContext(contextId, immediate = false) {
|
|
1405
1411
|
if (!this.ws || this.ws.readyState !== WS_OPEN) return;
|
|
1406
|
-
|
|
1412
|
+
const msg = {
|
|
1407
1413
|
close_context: true,
|
|
1408
1414
|
context_id: contextId
|
|
1409
|
-
}
|
|
1415
|
+
};
|
|
1416
|
+
if (immediate) msg.immediate = true;
|
|
1417
|
+
this.ws.send(JSON.stringify(msg));
|
|
1410
1418
|
}
|
|
1411
1419
|
/**
|
|
1412
1420
|
* Send keep-alive to reset a context's inactivity timeout.
|
|
@@ -1511,6 +1519,9 @@ var StreamingSession = class {
|
|
|
1511
1519
|
if (data.generation_started) {
|
|
1512
1520
|
this.callbacks.onGenerationStarted?.(data.chunk_id ?? 0, data.text ?? "");
|
|
1513
1521
|
}
|
|
1522
|
+
if (data.interrupted) {
|
|
1523
|
+
this.callbacks.onInterrupted?.();
|
|
1524
|
+
}
|
|
1514
1525
|
if (data.session_closed) {
|
|
1515
1526
|
this.callbacks.onSessionClosed?.(
|
|
1516
1527
|
data.total_audio_seconds ?? 0,
|
|
@@ -1591,6 +1602,73 @@ var StreamingSession = class {
|
|
|
1591
1602
|
}
|
|
1592
1603
|
this.ws.send(JSON.stringify(msg));
|
|
1593
1604
|
}
|
|
1605
|
+
/**
|
|
1606
|
+
* Interrupt (barge-in) the current generation without closing the socket.
|
|
1607
|
+
*
|
|
1608
|
+
* Use this when the end user starts speaking over the agent: it tells the
|
|
1609
|
+
* server to **stop generating audio for the current turn immediately** and
|
|
1610
|
+
* drop any text that was buffered or queued but not yet spoken. Unlike
|
|
1611
|
+
* {@link endSession}, no remaining text is flushed — the turn is abandoned.
|
|
1612
|
+
*
|
|
1613
|
+
* The WebSocket stays open and a fresh session is ready, so you can call
|
|
1614
|
+
* {@link send} for the next user turn right away (config is re-sent
|
|
1615
|
+
* automatically on that first `send`).
|
|
1616
|
+
*
|
|
1617
|
+
* The returned promise resolves once the server acknowledges with an
|
|
1618
|
+
* `interrupted` frame (which also fires
|
|
1619
|
+
* {@link StreamingSessionCallbacks.onInterrupted}), or after a 5 s **quiet**
|
|
1620
|
+
* timeout — i.e. 5 s elapse without any server message arriving. The timer
|
|
1621
|
+
* resets on every incoming frame, so a few in-flight audio chunks still
|
|
1622
|
+
* draining at the moment of cancellation do not trip it prematurely.
|
|
1623
|
+
*
|
|
1624
|
+
* @example
|
|
1625
|
+
* ```typescript
|
|
1626
|
+
* // VAD detected the user speaking over the agent:
|
|
1627
|
+
* await session.cancelCurrent();
|
|
1628
|
+
* // Socket is still open — start the next turn immediately:
|
|
1629
|
+
* session.send(nextLlmToken);
|
|
1630
|
+
* ```
|
|
1631
|
+
*/
|
|
1632
|
+
cancelCurrent() {
|
|
1633
|
+
if (!this.ws || this.ws.readyState !== WS_OPEN) return Promise.resolve();
|
|
1634
|
+
const ws = this.ws;
|
|
1635
|
+
const QUIET_TIMEOUT_MS = 5e3;
|
|
1636
|
+
return new Promise((resolve) => {
|
|
1637
|
+
let settled = false;
|
|
1638
|
+
let timer;
|
|
1639
|
+
const prevMessage = ws.onmessage;
|
|
1640
|
+
const prevClose = ws.onclose;
|
|
1641
|
+
const done = () => {
|
|
1642
|
+
if (settled) return;
|
|
1643
|
+
settled = true;
|
|
1644
|
+
clearTimeout(timer);
|
|
1645
|
+
ws.onmessage = prevMessage;
|
|
1646
|
+
ws.onclose = prevClose;
|
|
1647
|
+
this.configSent = false;
|
|
1648
|
+
resolve();
|
|
1649
|
+
};
|
|
1650
|
+
const armQuietTimer = () => {
|
|
1651
|
+
clearTimeout(timer);
|
|
1652
|
+
timer = setTimeout(done, QUIET_TIMEOUT_MS);
|
|
1653
|
+
};
|
|
1654
|
+
armQuietTimer();
|
|
1655
|
+
ws.onmessage = (event) => {
|
|
1656
|
+
armQuietTimer();
|
|
1657
|
+
if (prevMessage) prevMessage.call(ws, event);
|
|
1658
|
+
try {
|
|
1659
|
+
const raw = typeof event.data === "string" ? event.data : event.data instanceof Buffer ? event.data.toString() : String(event.data);
|
|
1660
|
+
if (JSON.parse(raw).interrupted) done();
|
|
1661
|
+
} catch {
|
|
1662
|
+
}
|
|
1663
|
+
};
|
|
1664
|
+
ws.onclose = (event) => {
|
|
1665
|
+
this.ws = null;
|
|
1666
|
+
if (prevClose) prevClose.call(ws, event);
|
|
1667
|
+
done();
|
|
1668
|
+
};
|
|
1669
|
+
ws.send(JSON.stringify({ cancel: true }));
|
|
1670
|
+
});
|
|
1671
|
+
}
|
|
1594
1672
|
/**
|
|
1595
1673
|
* End the current session but keep the WebSocket connection open.
|
|
1596
1674
|
*
|
package/package.json
CHANGED
package/src/client.test.ts
CHANGED
|
@@ -333,6 +333,10 @@ function makeGenerationStartedMsg(chunkId: number, text: string): string {
|
|
|
333
333
|
});
|
|
334
334
|
}
|
|
335
335
|
|
|
336
|
+
function makeInterruptedMsg(): string {
|
|
337
|
+
return JSON.stringify({ interrupted: true });
|
|
338
|
+
}
|
|
339
|
+
|
|
336
340
|
describe('StreamingSession', () => {
|
|
337
341
|
let client: KugelAudio;
|
|
338
342
|
|
|
@@ -545,4 +549,115 @@ describe('StreamingSession', () => {
|
|
|
545
549
|
expect(session.isConnected).toBe(true);
|
|
546
550
|
expect(() => session.send('Hello.', true)).not.toThrow();
|
|
547
551
|
});
|
|
552
|
+
|
|
553
|
+
// -------------------------------------------------------------------------
|
|
554
|
+
// cancelCurrent() — barge-in (KUG-1050)
|
|
555
|
+
// -------------------------------------------------------------------------
|
|
556
|
+
|
|
557
|
+
it('cancelCurrent() sends {cancel:true}, fires onInterrupted, keeps socket open', async () => {
|
|
558
|
+
const interruptedCalls: number[] = [];
|
|
559
|
+
|
|
560
|
+
const session = client.tts.streamingSession(
|
|
561
|
+
{ voiceId: 1 },
|
|
562
|
+
{ onInterrupted: () => interruptedCalls.push(1) },
|
|
563
|
+
);
|
|
564
|
+
|
|
565
|
+
session.connect();
|
|
566
|
+
await new Promise<void>((r) => setTimeout(r, 10));
|
|
567
|
+
|
|
568
|
+
session.send('A very long sentence the user is about to talk over.');
|
|
569
|
+
mockWs.onmessage?.({ data: makeAudioMsg(0, 100) });
|
|
570
|
+
|
|
571
|
+
const cancelPromise = session.cancelCurrent();
|
|
572
|
+
|
|
573
|
+
// The barge-in frame was sent to the server.
|
|
574
|
+
const lastSent = JSON.parse(mockWs.send.mock.calls[mockWs.send.mock.calls.length - 1][0] as string);
|
|
575
|
+
expect(lastSent.cancel).toBe(true);
|
|
576
|
+
|
|
577
|
+
// Server acks the barge-in.
|
|
578
|
+
mockWs.onmessage?.({ data: makeInterruptedMsg() });
|
|
579
|
+
await cancelPromise;
|
|
580
|
+
|
|
581
|
+
// onInterrupted fired and the socket stayed open for the next turn.
|
|
582
|
+
expect(interruptedCalls).toHaveLength(1);
|
|
583
|
+
expect(session.isConnected).toBe(true);
|
|
584
|
+
expect(mockWs.close).not.toHaveBeenCalled();
|
|
585
|
+
});
|
|
586
|
+
|
|
587
|
+
it('cancelCurrent() re-sends config on the next send (fresh server session)', async () => {
|
|
588
|
+
const session = client.tts.streamingSession({ voiceId: 42 }, {});
|
|
589
|
+
|
|
590
|
+
session.connect();
|
|
591
|
+
await new Promise<void>((r) => setTimeout(r, 10));
|
|
592
|
+
|
|
593
|
+
// First send carries config (voice_id).
|
|
594
|
+
session.send('Hello.');
|
|
595
|
+
expect(JSON.parse(mockWs.send.mock.calls[mockWs.send.mock.calls.length - 1][0] as string).voice_id).toBe(42);
|
|
596
|
+
|
|
597
|
+
const cancelPromise = session.cancelCurrent();
|
|
598
|
+
mockWs.onmessage?.({ data: makeInterruptedMsg() });
|
|
599
|
+
await cancelPromise;
|
|
600
|
+
|
|
601
|
+
// The server started a fresh session, so the next send must re-send config.
|
|
602
|
+
session.send('Next turn.');
|
|
603
|
+
expect(JSON.parse(mockWs.send.mock.calls[mockWs.send.mock.calls.length - 1][0] as string).voice_id).toBe(42);
|
|
604
|
+
});
|
|
605
|
+
|
|
606
|
+
it('cancelCurrent() resolves on quiet timeout if server never acks', async () => {
|
|
607
|
+
const session = client.tts.streamingSession({ voiceId: 1 }, {});
|
|
608
|
+
|
|
609
|
+
session.connect();
|
|
610
|
+
await new Promise<void>((r) => setTimeout(r, 10));
|
|
611
|
+
session.send('Hello.');
|
|
612
|
+
|
|
613
|
+
vi.useFakeTimers();
|
|
614
|
+
const cancelPromise = session.cancelCurrent();
|
|
615
|
+
|
|
616
|
+
// No interrupted ack — the 5 s quiet timeout resolves it.
|
|
617
|
+
await vi.advanceTimersByTimeAsync(6_000);
|
|
618
|
+
await cancelPromise;
|
|
619
|
+
|
|
620
|
+
vi.useRealTimers();
|
|
621
|
+
// Socket was never closed; still reusable.
|
|
622
|
+
expect(session.isConnected).toBe(true);
|
|
623
|
+
});
|
|
624
|
+
});
|
|
625
|
+
|
|
626
|
+
// ---------------------------------------------------------------------------
|
|
627
|
+
// MultiContextSession barge-in — closeContext immediate (KUG-1050)
|
|
628
|
+
// ---------------------------------------------------------------------------
|
|
629
|
+
|
|
630
|
+
describe('MultiContextSession closeContext', () => {
|
|
631
|
+
let client: KugelAudio;
|
|
632
|
+
|
|
633
|
+
beforeEach(() => {
|
|
634
|
+
client = new KugelAudio({ apiKey: 'test-key-xxx' });
|
|
635
|
+
});
|
|
636
|
+
|
|
637
|
+
it('closeContext(id, true) sends the immediate barge-in flag', async () => {
|
|
638
|
+
const session = client.tts.createMultiContextSession({ defaultVoiceId: 1 });
|
|
639
|
+
await session.connect({});
|
|
640
|
+
|
|
641
|
+
session.closeContext('ctx1', true);
|
|
642
|
+
|
|
643
|
+
const sent = JSON.parse(
|
|
644
|
+
mockWs.send.mock.calls[mockWs.send.mock.calls.length - 1][0] as string
|
|
645
|
+
);
|
|
646
|
+
expect(sent.close_context).toBe(true);
|
|
647
|
+
expect(sent.context_id).toBe('ctx1');
|
|
648
|
+
expect(sent.immediate).toBe(true);
|
|
649
|
+
});
|
|
650
|
+
|
|
651
|
+
it('closeContext(id) omits immediate (graceful drain)', async () => {
|
|
652
|
+
const session = client.tts.createMultiContextSession({ defaultVoiceId: 1 });
|
|
653
|
+
await session.connect({});
|
|
654
|
+
|
|
655
|
+
session.closeContext('ctx1');
|
|
656
|
+
|
|
657
|
+
const sent = JSON.parse(
|
|
658
|
+
mockWs.send.mock.calls[mockWs.send.mock.calls.length - 1][0] as string
|
|
659
|
+
);
|
|
660
|
+
expect(sent.close_context).toBe(true);
|
|
661
|
+
expect(sent.immediate).toBeUndefined();
|
|
662
|
+
});
|
|
548
663
|
});
|
package/src/client.ts
CHANGED
|
@@ -1195,14 +1195,22 @@ class MultiContextSession {
|
|
|
1195
1195
|
|
|
1196
1196
|
/**
|
|
1197
1197
|
* Close a specific context.
|
|
1198
|
+
*
|
|
1199
|
+
* @param contextId - The context to close.
|
|
1200
|
+
* @param immediate - When `true`, **barge-in**: the server cancels the
|
|
1201
|
+
* context's in-flight generation immediately and discards any buffered or
|
|
1202
|
+
* queued text instead of draining it. Use this when the end user speaks
|
|
1203
|
+
* over the agent. When `false` (default), queued sentences finish first.
|
|
1198
1204
|
*/
|
|
1199
|
-
closeContext(contextId: string): void {
|
|
1205
|
+
closeContext(contextId: string, immediate = false): void {
|
|
1200
1206
|
if (!this.ws || this.ws.readyState !== WS_OPEN) return;
|
|
1201
1207
|
|
|
1202
|
-
|
|
1208
|
+
const msg: Record<string, unknown> = {
|
|
1203
1209
|
close_context: true,
|
|
1204
1210
|
context_id: contextId,
|
|
1205
|
-
}
|
|
1211
|
+
};
|
|
1212
|
+
if (immediate) msg.immediate = true;
|
|
1213
|
+
this.ws.send(JSON.stringify(msg));
|
|
1206
1214
|
}
|
|
1207
1215
|
|
|
1208
1216
|
/**
|
|
@@ -1362,6 +1370,10 @@ class StreamingSession {
|
|
|
1362
1370
|
this.callbacks.onGenerationStarted?.(data.chunk_id ?? 0, data.text ?? '');
|
|
1363
1371
|
}
|
|
1364
1372
|
|
|
1373
|
+
if (data.interrupted) {
|
|
1374
|
+
this.callbacks.onInterrupted?.();
|
|
1375
|
+
}
|
|
1376
|
+
|
|
1365
1377
|
if (data.session_closed) {
|
|
1366
1378
|
this.callbacks.onSessionClosed?.(
|
|
1367
1379
|
data.total_audio_seconds ?? 0,
|
|
@@ -1461,6 +1473,97 @@ class StreamingSession {
|
|
|
1461
1473
|
this.ws.send(JSON.stringify(msg));
|
|
1462
1474
|
}
|
|
1463
1475
|
|
|
1476
|
+
/**
|
|
1477
|
+
* Interrupt (barge-in) the current generation without closing the socket.
|
|
1478
|
+
*
|
|
1479
|
+
* Use this when the end user starts speaking over the agent: it tells the
|
|
1480
|
+
* server to **stop generating audio for the current turn immediately** and
|
|
1481
|
+
* drop any text that was buffered or queued but not yet spoken. Unlike
|
|
1482
|
+
* {@link endSession}, no remaining text is flushed — the turn is abandoned.
|
|
1483
|
+
*
|
|
1484
|
+
* The WebSocket stays open and a fresh session is ready, so you can call
|
|
1485
|
+
* {@link send} for the next user turn right away (config is re-sent
|
|
1486
|
+
* automatically on that first `send`).
|
|
1487
|
+
*
|
|
1488
|
+
* The returned promise resolves once the server acknowledges with an
|
|
1489
|
+
* `interrupted` frame (which also fires
|
|
1490
|
+
* {@link StreamingSessionCallbacks.onInterrupted}), or after a 5 s **quiet**
|
|
1491
|
+
* timeout — i.e. 5 s elapse without any server message arriving. The timer
|
|
1492
|
+
* resets on every incoming frame, so a few in-flight audio chunks still
|
|
1493
|
+
* draining at the moment of cancellation do not trip it prematurely.
|
|
1494
|
+
*
|
|
1495
|
+
* @example
|
|
1496
|
+
* ```typescript
|
|
1497
|
+
* // VAD detected the user speaking over the agent:
|
|
1498
|
+
* await session.cancelCurrent();
|
|
1499
|
+
* // Socket is still open — start the next turn immediately:
|
|
1500
|
+
* session.send(nextLlmToken);
|
|
1501
|
+
* ```
|
|
1502
|
+
*/
|
|
1503
|
+
cancelCurrent(): Promise<void> {
|
|
1504
|
+
if (!this.ws || this.ws.readyState !== WS_OPEN) return Promise.resolve();
|
|
1505
|
+
|
|
1506
|
+
const ws = this.ws;
|
|
1507
|
+
// Quiet timeout: resets on every incoming server message. Trips only
|
|
1508
|
+
// when the server has been silent for this long. A short window is fine
|
|
1509
|
+
// here because the server cancels in-flight generation promptly; we only
|
|
1510
|
+
// need to outlast a handful of already-emitted audio frames in transit.
|
|
1511
|
+
const QUIET_TIMEOUT_MS = 5_000;
|
|
1512
|
+
|
|
1513
|
+
return new Promise<void>((resolve) => {
|
|
1514
|
+
let settled = false;
|
|
1515
|
+
let timer: ReturnType<typeof setTimeout>;
|
|
1516
|
+
|
|
1517
|
+
const prevMessage = ws.onmessage;
|
|
1518
|
+
const prevClose = ws.onclose;
|
|
1519
|
+
|
|
1520
|
+
const done = () => {
|
|
1521
|
+
if (settled) return;
|
|
1522
|
+
settled = true;
|
|
1523
|
+
clearTimeout(timer);
|
|
1524
|
+
// Restore the original handlers so subsequent calls don't stack
|
|
1525
|
+
// wrappers and the typed-error onclose installed by connect() stays
|
|
1526
|
+
// in effect for the next turn.
|
|
1527
|
+
ws.onmessage = prevMessage;
|
|
1528
|
+
ws.onclose = prevClose;
|
|
1529
|
+
// The server starts a fresh session after a cancel, so the next
|
|
1530
|
+
// send() must re-send config.
|
|
1531
|
+
this.configSent = false;
|
|
1532
|
+
resolve();
|
|
1533
|
+
};
|
|
1534
|
+
|
|
1535
|
+
const armQuietTimer = () => {
|
|
1536
|
+
clearTimeout(timer);
|
|
1537
|
+
timer = setTimeout(done, QUIET_TIMEOUT_MS);
|
|
1538
|
+
};
|
|
1539
|
+
|
|
1540
|
+
armQuietTimer();
|
|
1541
|
+
|
|
1542
|
+
ws.onmessage = (event: MessageEvent) => {
|
|
1543
|
+
// Reset the quiet timer on EVERY incoming frame — late audio chunks
|
|
1544
|
+
// from the cancelled turn count as liveness, not just the ack.
|
|
1545
|
+
armQuietTimer();
|
|
1546
|
+
if (prevMessage) prevMessage.call(ws, event);
|
|
1547
|
+
try {
|
|
1548
|
+
const raw = typeof event.data === 'string'
|
|
1549
|
+
? event.data
|
|
1550
|
+
: event.data instanceof Buffer
|
|
1551
|
+
? event.data.toString()
|
|
1552
|
+
: String(event.data);
|
|
1553
|
+
if (JSON.parse(raw).interrupted) done();
|
|
1554
|
+
} catch { /* ignore parse errors */ }
|
|
1555
|
+
};
|
|
1556
|
+
|
|
1557
|
+
ws.onclose = (event: CloseEvent) => {
|
|
1558
|
+
this.ws = null;
|
|
1559
|
+
if (prevClose) prevClose.call(ws, event);
|
|
1560
|
+
done();
|
|
1561
|
+
};
|
|
1562
|
+
|
|
1563
|
+
ws.send(JSON.stringify({ cancel: true }));
|
|
1564
|
+
});
|
|
1565
|
+
}
|
|
1566
|
+
|
|
1464
1567
|
/**
|
|
1465
1568
|
* End the current session but keep the WebSocket connection open.
|
|
1466
1569
|
*
|
package/src/types.ts
CHANGED
|
@@ -408,6 +408,13 @@ export interface StreamingSessionCallbacks {
|
|
|
408
408
|
onGenerationStarted?: (chunkId: number, text: string) => void;
|
|
409
409
|
/** Called when word-level timestamps arrive (requires `wordTimestamps: true`). */
|
|
410
410
|
onWordTimestamps?: (timestamps: WordTimestamp[]) => void;
|
|
411
|
+
/**
|
|
412
|
+
* Called when the server acknowledges a barge-in
|
|
413
|
+
* ({@link StreamingSession.cancelCurrent}). After this fires, no further
|
|
414
|
+
* audio chunks from the cancelled turn will arrive and the session is
|
|
415
|
+
* ready for the next `send()`.
|
|
416
|
+
*/
|
|
417
|
+
onInterrupted?: () => void;
|
|
411
418
|
/** Called on any error. */
|
|
412
419
|
onError?: (error: Error) => void;
|
|
413
420
|
}
|