@livekit/agents 1.0.16 → 1.0.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/inference/llm.cjs +35 -13
- package/dist/inference/llm.cjs.map +1 -1
- package/dist/inference/llm.d.cts +10 -5
- package/dist/inference/llm.d.ts +10 -5
- package/dist/inference/llm.d.ts.map +1 -1
- package/dist/inference/llm.js +35 -13
- package/dist/inference/llm.js.map +1 -1
- package/dist/llm/chat_context.d.cts +1 -1
- package/dist/llm/chat_context.d.ts +1 -1
- package/dist/llm/llm.cjs.map +1 -1
- package/dist/llm/llm.d.cts +1 -1
- package/dist/llm/llm.d.ts +1 -1
- package/dist/llm/llm.d.ts.map +1 -1
- package/dist/llm/llm.js.map +1 -1
- package/dist/llm/provider_format/google.cjs.map +1 -1
- package/dist/llm/provider_format/google.d.cts +1 -1
- package/dist/llm/provider_format/google.d.ts +1 -1
- package/dist/llm/provider_format/google.d.ts.map +1 -1
- package/dist/llm/provider_format/google.js.map +1 -1
- package/dist/llm/provider_format/index.d.cts +1 -1
- package/dist/llm/provider_format/index.d.ts +1 -1
- package/dist/llm/provider_format/index.d.ts.map +1 -1
- package/dist/llm/realtime.cjs.map +1 -1
- package/dist/llm/realtime.d.cts +4 -0
- package/dist/llm/realtime.d.ts +4 -0
- package/dist/llm/realtime.d.ts.map +1 -1
- package/dist/llm/realtime.js.map +1 -1
- package/dist/llm/utils.cjs +2 -2
- package/dist/llm/utils.cjs.map +1 -1
- package/dist/llm/utils.d.cts +1 -1
- package/dist/llm/utils.d.ts +1 -1
- package/dist/llm/utils.d.ts.map +1 -1
- package/dist/llm/utils.js +2 -2
- package/dist/llm/utils.js.map +1 -1
- package/dist/llm/zod-utils.cjs +6 -3
- package/dist/llm/zod-utils.cjs.map +1 -1
- package/dist/llm/zod-utils.d.cts +1 -1
- package/dist/llm/zod-utils.d.ts +1 -1
- package/dist/llm/zod-utils.d.ts.map +1 -1
- package/dist/llm/zod-utils.js +6 -3
- package/dist/llm/zod-utils.js.map +1 -1
- package/dist/llm/zod-utils.test.cjs +83 -0
- package/dist/llm/zod-utils.test.cjs.map +1 -1
- package/dist/llm/zod-utils.test.js +83 -0
- package/dist/llm/zod-utils.test.js.map +1 -1
- package/dist/stt/stt.cjs +0 -1
- package/dist/stt/stt.cjs.map +1 -1
- package/dist/stt/stt.d.ts.map +1 -1
- package/dist/stt/stt.js +0 -1
- package/dist/stt/stt.js.map +1 -1
- package/dist/tts/tts.cjs +2 -4
- package/dist/tts/tts.cjs.map +1 -1
- package/dist/tts/tts.d.ts.map +1 -1
- package/dist/tts/tts.js +3 -5
- package/dist/tts/tts.js.map +1 -1
- package/dist/utils.cjs.map +1 -1
- package/dist/utils.d.cts +7 -0
- package/dist/utils.d.ts +7 -0
- package/dist/utils.d.ts.map +1 -1
- package/dist/utils.js.map +1 -1
- package/dist/voice/agent_activity.cjs +69 -20
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +69 -20
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_session.cjs +40 -1
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +5 -0
- package/dist/voice/agent_session.d.ts +5 -0
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +40 -1
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/interruption_detection.test.cjs +114 -0
- package/dist/voice/interruption_detection.test.cjs.map +1 -0
- package/dist/voice/interruption_detection.test.js +113 -0
- package/dist/voice/interruption_detection.test.js.map +1 -0
- package/dist/voice/room_io/room_io.cjs +3 -0
- package/dist/voice/room_io/room_io.cjs.map +1 -1
- package/dist/voice/room_io/room_io.d.cts +1 -0
- package/dist/voice/room_io/room_io.d.ts +1 -0
- package/dist/voice/room_io/room_io.d.ts.map +1 -1
- package/dist/voice/room_io/room_io.js +3 -0
- package/dist/voice/room_io/room_io.js.map +1 -1
- package/package.json +3 -3
- package/src/inference/llm.ts +53 -21
- package/src/llm/__snapshots__/zod-utils.test.ts.snap +218 -0
- package/src/llm/llm.ts +1 -1
- package/src/llm/provider_format/google.ts +4 -4
- package/src/llm/realtime.ts +8 -1
- package/src/llm/utils.ts +7 -2
- package/src/llm/zod-utils.test.ts +101 -0
- package/src/llm/zod-utils.ts +12 -3
- package/src/stt/stt.ts +2 -1
- package/src/tts/tts.ts +7 -5
- package/src/utils.ts +17 -0
- package/src/voice/agent_activity.ts +96 -24
- package/src/voice/agent_session.ts +54 -0
- package/src/voice/interruption_detection.test.ts +151 -0
- package/src/voice/room_io/room_io.ts +4 -0
|
@@ -235,6 +235,14 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
235
235
|
} catch (error) {
|
|
236
236
|
this.logger.error(error, 'failed to update the tools');
|
|
237
237
|
}
|
|
238
|
+
|
|
239
|
+
if (!this.llm.capabilities.audioOutput && !this.tts && this.agentSession.output.audio) {
|
|
240
|
+
this.logger.error(
|
|
241
|
+
'audio output is enabled but RealtimeModel has no audio modality ' +
|
|
242
|
+
'and no TTS is set. Either enable audio modality in the RealtimeModel ' +
|
|
243
|
+
'or set a TTS model.',
|
|
244
|
+
);
|
|
245
|
+
}
|
|
238
246
|
} else if (this.llm instanceof LLM) {
|
|
239
247
|
try {
|
|
240
248
|
updateInstructions({
|
|
@@ -625,11 +633,21 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
625
633
|
return;
|
|
626
634
|
}
|
|
627
635
|
|
|
636
|
+
// Refactored interruption word count check:
|
|
637
|
+
// - Always apply minInterruptionWords filtering when STT is available and minInterruptionWords > 0
|
|
638
|
+
// - Apply check to all STT results: empty string, undefined, or any length
|
|
639
|
+
// - This ensures consistent behavior across all interruption scenarios
|
|
628
640
|
if (this.stt && this.agentSession.options.minInterruptionWords > 0 && this.audioRecognition) {
|
|
629
641
|
const text = this.audioRecognition.currentTranscript;
|
|
630
|
-
|
|
631
642
|
// TODO(shubhra): better word splitting for multi-language
|
|
632
|
-
|
|
643
|
+
|
|
644
|
+
// Normalize text: convert undefined/null to empty string for consistent word counting
|
|
645
|
+
const normalizedText = text ?? '';
|
|
646
|
+
const wordCount = splitWords(normalizedText, true).length;
|
|
647
|
+
|
|
648
|
+
// Only allow interruption if word count meets or exceeds minInterruptionWords
|
|
649
|
+
// This applies to all cases: empty strings, partial speech, and full speech
|
|
650
|
+
if (wordCount < this.agentSession.options.minInterruptionWords) {
|
|
633
651
|
return;
|
|
634
652
|
}
|
|
635
653
|
}
|
|
@@ -767,19 +785,30 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
767
785
|
return true;
|
|
768
786
|
}
|
|
769
787
|
|
|
788
|
+
// Refactored interruption word count check for consistency with onVADInferenceDone:
|
|
789
|
+
// - Always apply minInterruptionWords filtering when STT is available and minInterruptionWords > 0
|
|
790
|
+
// - Use consistent word splitting logic with splitWords (matching onVADInferenceDone pattern)
|
|
770
791
|
if (
|
|
771
792
|
this.stt &&
|
|
772
793
|
this.turnDetection !== 'manual' &&
|
|
773
794
|
this._currentSpeech &&
|
|
774
795
|
this._currentSpeech.allowInterruptions &&
|
|
775
796
|
!this._currentSpeech.interrupted &&
|
|
776
|
-
this.agentSession.options.minInterruptionWords > 0 &&
|
|
777
|
-
info.newTranscript.split(' ').length < this.agentSession.options.minInterruptionWords
|
|
797
|
+
this.agentSession.options.minInterruptionWords > 0
|
|
778
798
|
) {
|
|
779
|
-
|
|
780
|
-
this.
|
|
781
|
-
|
|
782
|
-
|
|
799
|
+
const wordCount = splitWords(info.newTranscript, true).length;
|
|
800
|
+
if (wordCount < this.agentSession.options.minInterruptionWords) {
|
|
801
|
+
// avoid interruption if the new_transcript contains fewer words than minInterruptionWords
|
|
802
|
+
this.cancelPreemptiveGeneration();
|
|
803
|
+
this.logger.info(
|
|
804
|
+
{
|
|
805
|
+
wordCount,
|
|
806
|
+
minInterruptionWords: this.agentSession.options.minInterruptionWords,
|
|
807
|
+
},
|
|
808
|
+
'skipping user input, word count below minimum interruption threshold',
|
|
809
|
+
);
|
|
810
|
+
return false;
|
|
811
|
+
}
|
|
783
812
|
}
|
|
784
813
|
|
|
785
814
|
const oldTask = this._userTurnCompletedTask;
|
|
@@ -1612,7 +1641,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1612
1641
|
|
|
1613
1642
|
const readMessages = async (
|
|
1614
1643
|
abortController: AbortController,
|
|
1615
|
-
outputs: Array<[string, _TextOut | null, _AudioOut | null]>,
|
|
1644
|
+
outputs: Array<[string, _TextOut | null, _AudioOut | null, ('text' | 'audio')[] | undefined]>,
|
|
1616
1645
|
) => {
|
|
1617
1646
|
replyAbortController.signal.addEventListener('abort', () => abortController.abort(), {
|
|
1618
1647
|
once: true,
|
|
@@ -1627,7 +1656,25 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1627
1656
|
);
|
|
1628
1657
|
break;
|
|
1629
1658
|
}
|
|
1630
|
-
|
|
1659
|
+
|
|
1660
|
+
const msgModalities = msg.modalities ? await msg.modalities : undefined;
|
|
1661
|
+
let ttsTextInput: ReadableStream<string> | null = null;
|
|
1662
|
+
let trTextInput: ReadableStream<string>;
|
|
1663
|
+
|
|
1664
|
+
if (msgModalities && !msgModalities.includes('audio') && this.tts) {
|
|
1665
|
+
if (this.llm instanceof RealtimeModel && this.llm.capabilities.audioOutput) {
|
|
1666
|
+
this.logger.warn(
|
|
1667
|
+
'text response received from realtime API, falling back to use a TTS model.',
|
|
1668
|
+
);
|
|
1669
|
+
}
|
|
1670
|
+
const [_ttsTextInput, _trTextInput] = msg.textStream.tee();
|
|
1671
|
+
ttsTextInput = _ttsTextInput;
|
|
1672
|
+
trTextInput = _trTextInput;
|
|
1673
|
+
} else {
|
|
1674
|
+
trTextInput = msg.textStream;
|
|
1675
|
+
}
|
|
1676
|
+
|
|
1677
|
+
const trNodeResult = await this.agent.transcriptionNode(trTextInput, modelSettings);
|
|
1631
1678
|
let textOut: _TextOut | null = null;
|
|
1632
1679
|
if (trNodeResult) {
|
|
1633
1680
|
const [textForwardTask, _textOut] = performTextForwarding(
|
|
@@ -1638,30 +1685,51 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1638
1685
|
forwardTasks.push(textForwardTask);
|
|
1639
1686
|
textOut = _textOut;
|
|
1640
1687
|
}
|
|
1688
|
+
|
|
1641
1689
|
let audioOut: _AudioOut | null = null;
|
|
1642
1690
|
if (audioOutput) {
|
|
1643
|
-
|
|
1644
|
-
|
|
1645
|
-
|
|
1646
|
-
|
|
1647
|
-
|
|
1691
|
+
let realtimeAudioResult: ReadableStream<AudioFrame> | null = null;
|
|
1692
|
+
|
|
1693
|
+
if (ttsTextInput) {
|
|
1694
|
+
const [ttsTask, ttsStream] = performTTSInference(
|
|
1695
|
+
(...args) => this.agent.ttsNode(...args),
|
|
1696
|
+
ttsTextInput,
|
|
1697
|
+
modelSettings,
|
|
1698
|
+
abortController,
|
|
1699
|
+
);
|
|
1700
|
+
tasks.push(ttsTask);
|
|
1701
|
+
realtimeAudioResult = ttsStream;
|
|
1702
|
+
} else if (msgModalities && msgModalities.includes('audio')) {
|
|
1703
|
+
realtimeAudioResult = await this.agent.realtimeAudioOutputNode(
|
|
1704
|
+
msg.audioStream,
|
|
1705
|
+
modelSettings,
|
|
1706
|
+
);
|
|
1707
|
+
} else if (this.llm instanceof RealtimeModel && this.llm.capabilities.audioOutput) {
|
|
1708
|
+
this.logger.error(
|
|
1709
|
+
'Text message received from Realtime API with audio modality. ' +
|
|
1710
|
+
'This usually happens when text chat context is synced to the API. ' +
|
|
1711
|
+
'Try to add a TTS model as fallback or use text modality with TTS instead.',
|
|
1712
|
+
);
|
|
1713
|
+
} else {
|
|
1714
|
+
this.logger.warn(
|
|
1715
|
+
'audio output is enabled but neither tts nor realtime audio is available',
|
|
1716
|
+
);
|
|
1717
|
+
}
|
|
1718
|
+
|
|
1719
|
+
if (realtimeAudioResult) {
|
|
1648
1720
|
const [forwardTask, _audioOut] = performAudioForwarding(
|
|
1649
|
-
|
|
1721
|
+
realtimeAudioResult,
|
|
1650
1722
|
audioOutput,
|
|
1651
1723
|
abortController,
|
|
1652
1724
|
);
|
|
1653
1725
|
forwardTasks.push(forwardTask);
|
|
1654
1726
|
audioOut = _audioOut;
|
|
1655
1727
|
audioOut.firstFrameFut.await.finally(onFirstFrame);
|
|
1656
|
-
} else {
|
|
1657
|
-
this.logger.warn(
|
|
1658
|
-
'audio output is enabled but neither tts nor realtime audio is available',
|
|
1659
|
-
);
|
|
1660
1728
|
}
|
|
1661
1729
|
} else if (textOut) {
|
|
1662
1730
|
textOut.firstTextFut.await.finally(onFirstFrame);
|
|
1663
1731
|
}
|
|
1664
|
-
outputs.push([msg.messageId, textOut, audioOut]);
|
|
1732
|
+
outputs.push([msg.messageId, textOut, audioOut, msgModalities]);
|
|
1665
1733
|
}
|
|
1666
1734
|
await waitFor(forwardTasks);
|
|
1667
1735
|
} catch (error) {
|
|
@@ -1671,7 +1739,9 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1671
1739
|
}
|
|
1672
1740
|
};
|
|
1673
1741
|
|
|
1674
|
-
const messageOutputs: Array<[string, _TextOut | null, _AudioOut | null]> = [];
|
|
1742
|
+
const messageOutputs: Array<
|
|
1743
|
+
[string, _TextOut | null, _AudioOut | null, ('text' | 'audio')[] | undefined]
|
|
1744
|
+
> = [];
|
|
1675
1745
|
const tasks = [
|
|
1676
1746
|
Task.from(
|
|
1677
1747
|
(controller) => readMessages(controller, messageOutputs),
|
|
@@ -1750,7 +1820,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1750
1820
|
|
|
1751
1821
|
if (messageOutputs.length > 0) {
|
|
1752
1822
|
// there should be only one message
|
|
1753
|
-
const [msgId, textOut, audioOut] = messageOutputs[0]!;
|
|
1823
|
+
const [msgId, textOut, audioOut, msgModalities] = messageOutputs[0]!;
|
|
1754
1824
|
let forwardedText = textOut?.text || '';
|
|
1755
1825
|
|
|
1756
1826
|
if (audioOutput) {
|
|
@@ -1775,6 +1845,8 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1775
1845
|
this.realtimeSession.truncate({
|
|
1776
1846
|
messageId: msgId,
|
|
1777
1847
|
audioEndMs: Math.floor(playbackPosition),
|
|
1848
|
+
modalities: msgModalities,
|
|
1849
|
+
audioTranscript: forwardedText,
|
|
1778
1850
|
});
|
|
1779
1851
|
}
|
|
1780
1852
|
|
|
@@ -1805,7 +1877,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1805
1877
|
|
|
1806
1878
|
if (messageOutputs.length > 0) {
|
|
1807
1879
|
// there should be only one message
|
|
1808
|
-
const [msgId, textOut, _] = messageOutputs[0]!;
|
|
1880
|
+
const [msgId, textOut, _, __] = messageOutputs[0]!;
|
|
1809
1881
|
const message = ChatMessage.create({
|
|
1810
1882
|
role: 'assistant',
|
|
1811
1883
|
content: textOut?.text || '',
|
|
@@ -58,6 +58,7 @@ export interface VoiceOptions {
|
|
|
58
58
|
maxEndpointingDelay: number;
|
|
59
59
|
maxToolSteps: number;
|
|
60
60
|
preemptiveGeneration: boolean;
|
|
61
|
+
userAwayTimeout?: number | null;
|
|
61
62
|
}
|
|
62
63
|
|
|
63
64
|
const defaultVoiceOptions: VoiceOptions = {
|
|
@@ -69,6 +70,7 @@ const defaultVoiceOptions: VoiceOptions = {
|
|
|
69
70
|
maxEndpointingDelay: 6000,
|
|
70
71
|
maxToolSteps: 3,
|
|
71
72
|
preemptiveGeneration: false,
|
|
73
|
+
userAwayTimeout: 15.0,
|
|
72
74
|
} as const;
|
|
73
75
|
|
|
74
76
|
export type TurnDetectionMode = 'stt' | 'vad' | 'realtime_llm' | 'manual' | _TurnDetector;
|
|
@@ -123,6 +125,7 @@ export class AgentSession<
|
|
|
123
125
|
private _output: AgentOutput;
|
|
124
126
|
|
|
125
127
|
private closingTask: Promise<void> | null = null;
|
|
128
|
+
private userAwayTimer: NodeJS.Timeout | null = null;
|
|
126
129
|
|
|
127
130
|
constructor(opts: AgentSessionOptions<UserData>) {
|
|
128
131
|
super();
|
|
@@ -167,6 +170,8 @@ export class AgentSession<
|
|
|
167
170
|
// This is the "global" chat context, it holds the entire conversation history
|
|
168
171
|
this._chatCtx = ChatContext.empty();
|
|
169
172
|
this.options = { ...defaultVoiceOptions, ...voiceOptions };
|
|
173
|
+
|
|
174
|
+
this.on(AgentSessionEventTypes.UserInputTranscribed, this._onUserInputTranscribed.bind(this));
|
|
170
175
|
}
|
|
171
176
|
|
|
172
177
|
get input(): AgentInput {
|
|
@@ -416,6 +421,14 @@ export class AgentSession<
|
|
|
416
421
|
|
|
417
422
|
const oldState = this._agentState;
|
|
418
423
|
this._agentState = state;
|
|
424
|
+
|
|
425
|
+
// Handle user away timer based on state changes
|
|
426
|
+
if (state === 'listening' && this.userState === 'listening') {
|
|
427
|
+
this._setUserAwayTimer();
|
|
428
|
+
} else {
|
|
429
|
+
this._cancelUserAwayTimer();
|
|
430
|
+
}
|
|
431
|
+
|
|
419
432
|
this.emit(
|
|
420
433
|
AgentSessionEventTypes.AgentStateChanged,
|
|
421
434
|
createAgentStateChangedEvent(oldState, state),
|
|
@@ -430,6 +443,14 @@ export class AgentSession<
|
|
|
430
443
|
|
|
431
444
|
const oldState = this.userState;
|
|
432
445
|
this.userState = state;
|
|
446
|
+
|
|
447
|
+
// Handle user away timer based on state changes
|
|
448
|
+
if (state === 'listening' && this._agentState === 'listening') {
|
|
449
|
+
this._setUserAwayTimer();
|
|
450
|
+
} else {
|
|
451
|
+
this._cancelUserAwayTimer();
|
|
452
|
+
}
|
|
453
|
+
|
|
433
454
|
this.emit(
|
|
434
455
|
AgentSessionEventTypes.UserStateChanged,
|
|
435
456
|
createUserStateChangedEvent(oldState, state),
|
|
@@ -451,6 +472,37 @@ export class AgentSession<
|
|
|
451
472
|
|
|
452
473
|
private onTextOutputChanged(): void {}
|
|
453
474
|
|
|
475
|
+
private _setUserAwayTimer(): void {
|
|
476
|
+
this._cancelUserAwayTimer();
|
|
477
|
+
|
|
478
|
+
if (this.options.userAwayTimeout === null || this.options.userAwayTimeout === undefined) {
|
|
479
|
+
return;
|
|
480
|
+
}
|
|
481
|
+
|
|
482
|
+
if (this.roomIO && !this.roomIO.isParticipantAvailable) {
|
|
483
|
+
return;
|
|
484
|
+
}
|
|
485
|
+
|
|
486
|
+
this.userAwayTimer = setTimeout(() => {
|
|
487
|
+
this.logger.debug('User away timeout triggered');
|
|
488
|
+
this._updateUserState('away');
|
|
489
|
+
}, this.options.userAwayTimeout * 1000);
|
|
490
|
+
}
|
|
491
|
+
|
|
492
|
+
private _cancelUserAwayTimer(): void {
|
|
493
|
+
if (this.userAwayTimer !== null) {
|
|
494
|
+
clearTimeout(this.userAwayTimer);
|
|
495
|
+
this.userAwayTimer = null;
|
|
496
|
+
}
|
|
497
|
+
}
|
|
498
|
+
|
|
499
|
+
private _onUserInputTranscribed(ev: UserInputTranscribedEvent): void {
|
|
500
|
+
if (this.userState === 'away' && ev.isFinal) {
|
|
501
|
+
this.logger.debug('User returned from away state due to speech input');
|
|
502
|
+
this._updateUserState('listening');
|
|
503
|
+
}
|
|
504
|
+
}
|
|
505
|
+
|
|
454
506
|
private async closeImpl(
|
|
455
507
|
reason: CloseReason,
|
|
456
508
|
error: RealtimeModelError | LLMError | TTSError | STTError | null = null,
|
|
@@ -460,6 +512,8 @@ export class AgentSession<
|
|
|
460
512
|
return;
|
|
461
513
|
}
|
|
462
514
|
|
|
515
|
+
this._cancelUserAwayTimer();
|
|
516
|
+
|
|
463
517
|
if (this.activity) {
|
|
464
518
|
if (!drain) {
|
|
465
519
|
try {
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
// SPDX-FileCopyrightText: 2025 LiveKit, Inc.
|
|
2
|
+
//
|
|
3
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
/**
|
|
6
|
+
* Unit tests for interruption detection logic in AgentActivity.
|
|
7
|
+
*
|
|
8
|
+
* Tests the refactored minInterruptionWords check which ensures:
|
|
9
|
+
* - Consistent word count filtering across all speech scenarios
|
|
10
|
+
* - Proper handling of empty strings, undefined, and short speech
|
|
11
|
+
* - Interruptions allowed only when word count meets or exceeds minInterruptionWords threshold
|
|
12
|
+
*/
|
|
13
|
+
import { describe, expect, it } from 'vitest';
|
|
14
|
+
import { splitWords } from '../tokenize/basic/word.js';
|
|
15
|
+
|
|
16
|
+
describe('Interruption Detection - Word Counting', () => {
|
|
17
|
+
describe('Word Splitting Behavior', () => {
|
|
18
|
+
it('should count empty string as 0 words', () => {
|
|
19
|
+
const text = '';
|
|
20
|
+
const wordCount = splitWords(text, true).length;
|
|
21
|
+
expect(wordCount).toBe(0);
|
|
22
|
+
});
|
|
23
|
+
|
|
24
|
+
it('should count single word correctly', () => {
|
|
25
|
+
const text = 'hello';
|
|
26
|
+
const wordCount = splitWords(text, true).length;
|
|
27
|
+
expect(wordCount).toBe(1);
|
|
28
|
+
});
|
|
29
|
+
|
|
30
|
+
it('should count two words correctly', () => {
|
|
31
|
+
const text = 'hello world';
|
|
32
|
+
const wordCount = splitWords(text, true).length;
|
|
33
|
+
expect(wordCount).toBe(2);
|
|
34
|
+
});
|
|
35
|
+
|
|
36
|
+
it('should count multiple words correctly', () => {
|
|
37
|
+
const text = 'hello this is a full sentence';
|
|
38
|
+
const wordCount = splitWords(text, true).length;
|
|
39
|
+
expect(wordCount).toBe(6);
|
|
40
|
+
});
|
|
41
|
+
|
|
42
|
+
it('should handle punctuation correctly', () => {
|
|
43
|
+
const text = 'hello, world!';
|
|
44
|
+
const wordCount = splitWords(text, true).length;
|
|
45
|
+
expect(wordCount).toBe(2);
|
|
46
|
+
});
|
|
47
|
+
|
|
48
|
+
it('should handle multiple spaces between words', () => {
|
|
49
|
+
const text = 'hello   world';
|
|
50
|
+
const wordCount = splitWords(text, true).length;
|
|
51
|
+
expect(wordCount).toBe(2);
|
|
52
|
+
});
|
|
53
|
+
|
|
54
|
+
it('should count whitespace-only string as 0 words', () => {
|
|
55
|
+
const text = ' ';
|
|
56
|
+
const wordCount = splitWords(text, true).length;
|
|
57
|
+
expect(wordCount).toBe(0);
|
|
58
|
+
});
|
|
59
|
+
|
|
60
|
+
it('should handle leading and trailing whitespace', () => {
|
|
61
|
+
const text = ' hello world ';
|
|
62
|
+
const wordCount = splitWords(text, true).length;
|
|
63
|
+
expect(wordCount).toBe(2);
|
|
64
|
+
});
|
|
65
|
+
});
|
|
66
|
+
|
|
67
|
+
describe('Integration: Full Interruption Check Logic', () => {
|
|
68
|
+
it('should block interruption for empty transcript with threshold 2', () => {
|
|
69
|
+
const text = '';
|
|
70
|
+
const minInterruptionWords = 2;
|
|
71
|
+
|
|
72
|
+
const normalizedText = text ?? '';
|
|
73
|
+
const wordCount = splitWords(normalizedText, true).length;
|
|
74
|
+
const shouldBlock = wordCount < minInterruptionWords;
|
|
75
|
+
|
|
76
|
+
expect(normalizedText).toBe('');
|
|
77
|
+
expect(wordCount).toBe(0);
|
|
78
|
+
expect(shouldBlock).toBe(true);
|
|
79
|
+
});
|
|
80
|
+
|
|
81
|
+
it('should block interruption for undefined transcript with threshold 2', () => {
|
|
82
|
+
const text: string | undefined = undefined;
|
|
83
|
+
const minInterruptionWords = 2;
|
|
84
|
+
|
|
85
|
+
const normalizedText = text ?? '';
|
|
86
|
+
const wordCount = splitWords(normalizedText, true).length;
|
|
87
|
+
const shouldBlock = wordCount < minInterruptionWords;
|
|
88
|
+
|
|
89
|
+
expect(normalizedText).toBe('');
|
|
90
|
+
expect(wordCount).toBe(0);
|
|
91
|
+
expect(shouldBlock).toBe(true);
|
|
92
|
+
});
|
|
93
|
+
|
|
94
|
+
it('should block interruption for single word with threshold 2', () => {
|
|
95
|
+
const text = 'hello';
|
|
96
|
+
const minInterruptionWords = 2;
|
|
97
|
+
|
|
98
|
+
const normalizedText = text ?? '';
|
|
99
|
+
const wordCount = splitWords(normalizedText, true).length;
|
|
100
|
+
const shouldBlock = wordCount < minInterruptionWords;
|
|
101
|
+
|
|
102
|
+
expect(normalizedText).toBe('hello');
|
|
103
|
+
expect(wordCount).toBe(1);
|
|
104
|
+
expect(shouldBlock).toBe(true);
|
|
105
|
+
});
|
|
106
|
+
|
|
107
|
+
it('should allow interruption when word count exactly meets threshold', () => {
|
|
108
|
+
const text = 'hello world';
|
|
109
|
+
const minInterruptionWords = 2;
|
|
110
|
+
|
|
111
|
+
const normalizedText = text ?? '';
|
|
112
|
+
const wordCount = splitWords(normalizedText, true).length;
|
|
113
|
+
const shouldBlock = wordCount < minInterruptionWords;
|
|
114
|
+
|
|
115
|
+
expect(normalizedText).toBe('hello world');
|
|
116
|
+
expect(wordCount).toBe(2);
|
|
117
|
+
expect(shouldBlock).toBe(false);
|
|
118
|
+
});
|
|
119
|
+
|
|
120
|
+
it('should allow interruption when word count exceeds threshold', () => {
|
|
121
|
+
const text = 'hello this is a full sentence';
|
|
122
|
+
const minInterruptionWords = 2;
|
|
123
|
+
|
|
124
|
+
const normalizedText = text ?? '';
|
|
125
|
+
const wordCount = splitWords(normalizedText, true).length;
|
|
126
|
+
const shouldBlock = wordCount < minInterruptionWords;
|
|
127
|
+
|
|
128
|
+
expect(normalizedText).toBe('hello this is a full sentence');
|
|
129
|
+
expect(wordCount).toBe(6);
|
|
130
|
+
expect(shouldBlock).toBe(false);
|
|
131
|
+
});
|
|
132
|
+
|
|
133
|
+
it('should apply consistent word counting logic in both methods', () => {
|
|
134
|
+
const transcripts = ['', 'hello', 'hello world', 'this is a longer sentence'];
|
|
135
|
+
const threshold = 2;
|
|
136
|
+
|
|
137
|
+
transcripts.forEach((transcript) => {
|
|
138
|
+
const text1 = transcript;
|
|
139
|
+
const normalizedText1 = text1 ?? '';
|
|
140
|
+
const wordCount1 = splitWords(normalizedText1, true).length;
|
|
141
|
+
const shouldBlock1 = wordCount1 < threshold;
|
|
142
|
+
|
|
143
|
+
const wordCount2 = splitWords(transcript, true).length;
|
|
144
|
+
const shouldBlock2 = wordCount2 < threshold;
|
|
145
|
+
|
|
146
|
+
expect(wordCount1).toBe(wordCount2);
|
|
147
|
+
expect(shouldBlock1).toBe(shouldBlock2);
|
|
148
|
+
});
|
|
149
|
+
});
|
|
150
|
+
});
|
|
151
|
+
});
|
|
@@ -369,6 +369,10 @@ export class RoomIO {
|
|
|
369
369
|
return this.transcriptionSynchronizer.textOutput;
|
|
370
370
|
}
|
|
371
371
|
|
|
372
|
+
get isParticipantAvailable(): boolean {
|
|
373
|
+
return this.participantAvailableFuture.done;
|
|
374
|
+
}
|
|
375
|
+
|
|
372
376
|
/** Switch to a different participant */
|
|
373
377
|
setParticipant(participantIdentity: string | null) {
|
|
374
378
|
this.logger.debug({ participantIdentity }, 'setting participant');
|