@livekit/agents 1.0.50 → 1.0.51

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. package/dist/index.cjs +12 -10
  2. package/dist/index.cjs.map +1 -1
  3. package/dist/index.d.cts +13 -13
  4. package/dist/index.d.ts +13 -13
  5. package/dist/index.d.ts.map +1 -1
  6. package/dist/index.js +11 -10
  7. package/dist/index.js.map +1 -1
  8. package/dist/inference/api_protos.d.cts +67 -67
  9. package/dist/inference/api_protos.d.ts +67 -67
  10. package/dist/inference/llm.cjs +10 -8
  11. package/dist/inference/llm.cjs.map +1 -1
  12. package/dist/inference/llm.d.cts +1 -1
  13. package/dist/inference/llm.d.ts +1 -1
  14. package/dist/inference/llm.d.ts.map +1 -1
  15. package/dist/inference/llm.js +3 -7
  16. package/dist/inference/llm.js.map +1 -1
  17. package/dist/inference/stt.cjs +20 -12
  18. package/dist/inference/stt.cjs.map +1 -1
  19. package/dist/inference/stt.d.cts +3 -2
  20. package/dist/inference/stt.d.ts +3 -2
  21. package/dist/inference/stt.d.ts.map +1 -1
  22. package/dist/inference/stt.js +20 -12
  23. package/dist/inference/stt.js.map +1 -1
  24. package/dist/inference/stt.test.cjs +14 -0
  25. package/dist/inference/stt.test.cjs.map +1 -1
  26. package/dist/inference/stt.test.js +14 -0
  27. package/dist/inference/stt.test.js.map +1 -1
  28. package/dist/inference/tts.cjs +13 -4
  29. package/dist/inference/tts.cjs.map +1 -1
  30. package/dist/inference/tts.d.cts +2 -1
  31. package/dist/inference/tts.d.ts +2 -1
  32. package/dist/inference/tts.d.ts.map +1 -1
  33. package/dist/inference/tts.js +13 -4
  34. package/dist/inference/tts.js.map +1 -1
  35. package/dist/inference/tts.test.cjs +10 -0
  36. package/dist/inference/tts.test.cjs.map +1 -1
  37. package/dist/inference/tts.test.js +10 -0
  38. package/dist/inference/tts.test.js.map +1 -1
  39. package/dist/inference/utils.cjs +5 -5
  40. package/dist/inference/utils.cjs.map +1 -1
  41. package/dist/inference/utils.js +1 -1
  42. package/dist/inference/utils.js.map +1 -1
  43. package/dist/language.cjs +394 -0
  44. package/dist/language.cjs.map +1 -0
  45. package/dist/language.d.cts +15 -0
  46. package/dist/language.d.ts +15 -0
  47. package/dist/language.d.ts.map +1 -0
  48. package/dist/language.js +363 -0
  49. package/dist/language.js.map +1 -0
  50. package/dist/language.test.cjs +43 -0
  51. package/dist/language.test.cjs.map +1 -0
  52. package/dist/language.test.js +49 -0
  53. package/dist/language.test.js.map +1 -0
  54. package/dist/stream/deferred_stream.cjs +6 -2
  55. package/dist/stream/deferred_stream.cjs.map +1 -1
  56. package/dist/stream/deferred_stream.d.ts.map +1 -1
  57. package/dist/stream/deferred_stream.js +6 -2
  58. package/dist/stream/deferred_stream.js.map +1 -1
  59. package/dist/stt/stt.cjs.map +1 -1
  60. package/dist/stt/stt.d.cts +2 -1
  61. package/dist/stt/stt.d.ts +2 -1
  62. package/dist/stt/stt.d.ts.map +1 -1
  63. package/dist/stt/stt.js.map +1 -1
  64. package/dist/version.cjs +1 -1
  65. package/dist/version.js +1 -1
  66. package/dist/voice/agent_activity.cjs +1 -1
  67. package/dist/voice/agent_activity.cjs.map +1 -1
  68. package/dist/voice/agent_activity.js +1 -1
  69. package/dist/voice/agent_activity.js.map +1 -1
  70. package/dist/voice/agent_activity.test.cjs +135 -0
  71. package/dist/voice/agent_activity.test.cjs.map +1 -0
  72. package/dist/voice/agent_activity.test.js +134 -0
  73. package/dist/voice/agent_activity.test.js.map +1 -0
  74. package/dist/voice/audio_recognition.cjs.map +1 -1
  75. package/dist/voice/audio_recognition.d.cts +3 -2
  76. package/dist/voice/audio_recognition.d.ts +3 -2
  77. package/dist/voice/audio_recognition.d.ts.map +1 -1
  78. package/dist/voice/audio_recognition.js.map +1 -1
  79. package/dist/voice/events.cjs.map +1 -1
  80. package/dist/voice/events.d.cts +3 -2
  81. package/dist/voice/events.d.ts +3 -2
  82. package/dist/voice/events.d.ts.map +1 -1
  83. package/dist/voice/events.js.map +1 -1
  84. package/package.json +1 -1
  85. package/src/index.ts +13 -15
  86. package/src/inference/llm.ts +3 -8
  87. package/src/inference/stt.test.ts +17 -0
  88. package/src/inference/stt.ts +22 -14
  89. package/src/inference/tts.test.ts +12 -0
  90. package/src/inference/tts.ts +14 -5
  91. package/src/inference/utils.ts +1 -1
  92. package/src/language.test.ts +62 -0
  93. package/src/language.ts +380 -0
  94. package/src/stream/deferred_stream.ts +5 -1
  95. package/src/stt/stt.ts +2 -1
  96. package/src/voice/agent_activity.test.ts +194 -0
  97. package/src/voice/agent_activity.ts +1 -1
  98. package/src/voice/audio_recognition.ts +4 -3
  99. package/src/voice/events.ts +3 -2
package/src/voice/agent_activity.test.ts +194 -0
@@ -0,0 +1,194 @@
1
+ // SPDX-FileCopyrightText: 2025 LiveKit, Inc.
2
+ //
3
+ // SPDX-License-Identifier: Apache-2.0
4
+
5
+ /**
6
+ * Regression tests for mainTask speech handle processing.
7
+ *
8
+ * When a speech handle is interrupted after _authorizeGeneration() but before the
9
+ * reply task calls _markGenerationDone(), mainTask hangs on _waitForGeneration()
10
+ * indefinitely. All subsequent speech handles queue behind it and the agent becomes
11
+ * unresponsive.
12
+ *
13
+ * Fix: race _waitForGeneration() against the interrupt future via waitIfNotInterrupted().
14
+ *
15
+ * Related: #1124, #1089, #836
16
+ */
17
+ import { Heap } from 'heap-js';
18
+ import { describe, expect, it, vi } from 'vitest';
19
+ import { Future } from '../utils.js';
20
+ import { AgentActivity } from './agent_activity.js';
21
+ import { SpeechHandle } from './speech_handle.js';
22
+
23
+ // Break circular dependency: agent_activity.ts → agent.js → beta/workflows/task_group.ts
24
+ vi.mock('./agent.js', () => {
25
+ class Agent {}
26
+ class AgentTask extends Agent {}
27
+ class StopResponse {}
28
+ return {
29
+ Agent,
30
+ AgentTask,
31
+ StopResponse,
32
+ _getActivityTaskInfo: () => null,
33
+ _setActivityTaskInfo: () => {},
34
+ functionCallStorage: {
35
+ getStore: () => undefined,
36
+ enterWith: () => {},
37
+ run: (_: unknown, fn: () => unknown) => fn(),
38
+ },
39
+ speechHandleStorage: {
40
+ getStore: () => undefined,
41
+ enterWith: () => {},
42
+ },
43
+ };
44
+ });
45
+
46
+ vi.mock('../version.js', () => ({ version: '0.0.0-test' }));
47
+
48
+ async function raceTimeout(promise: Promise<unknown>, ms: number): Promise<'resolved' | 'timeout'> {
49
+ let timer: ReturnType<typeof setTimeout>;
50
+ const timeout = new Promise<'timeout'>((resolve) => {
51
+ timer = setTimeout(() => resolve('timeout'), ms);
52
+ });
53
+ return Promise.race([promise.then(() => 'resolved' as const), timeout]).finally(() =>
54
+ clearTimeout(timer),
55
+ );
56
+ }
57
+
58
+ /**
59
+ * Build a minimal stand-in with just enough state for mainTask to run.
60
+ *
61
+ * mainTask accesses: q_updated, speechQueue, _currentSpeech, _schedulingPaused,
62
+ * getDrainPendingSpeechTasks(), and logger. We provide stubs for all of these,
63
+ * then bind the real AgentActivity.prototype.mainTask to this object.
64
+ */
65
+ function buildMainTaskRunner() {
66
+ const q_updated = new Future<void>();
67
+ type HeapItem = [number, number, SpeechHandle];
68
+ const speechQueue = new Heap<HeapItem>((a: HeapItem, b: HeapItem) => b[0] - a[0] || a[1] - b[1]);
69
+
70
+ const fakeActivity = {
71
+ q_updated,
72
+ speechQueue,
73
+ _currentSpeech: undefined as SpeechHandle | undefined,
74
+ _schedulingPaused: false,
75
+ getDrainPendingSpeechTasks: () => [],
76
+ logger: {
77
+ info: () => {},
78
+ debug: () => {},
79
+ warn: () => {},
80
+ error: () => {},
81
+ },
82
+ };
83
+
84
+ const mainTask = (AgentActivity.prototype as Record<string, unknown>).mainTask as (
85
+ signal: AbortSignal,
86
+ ) => Promise<void>;
87
+
88
+ return {
89
+ fakeActivity,
90
+ mainTask: mainTask.bind(fakeActivity),
91
+ speechQueue,
92
+ q_updated,
93
+ };
94
+ }
95
+
96
+ describe('AgentActivity - mainTask', () => {
97
+ it('should recover when speech handle is interrupted after authorization', async () => {
98
+ const { fakeActivity, mainTask, speechQueue, q_updated } = buildMainTaskRunner();
99
+
100
+ const handle = SpeechHandle.create({ allowInterruptions: true });
101
+
102
+ speechQueue.push([SpeechHandle.SPEECH_PRIORITY_NORMAL, 1, handle]);
103
+ handle._markScheduled();
104
+ q_updated.resolve();
105
+
106
+ const ac = new AbortController();
107
+ const mainTaskPromise = mainTask(ac.signal);
108
+
109
+ // Give mainTask time to pop the handle and call _authorizeGeneration
110
+ await new Promise((r) => setTimeout(r, 50));
111
+
112
+ // Interrupt while waiting for generation
113
+ handle.interrupt();
114
+
115
+ // Let mainTask react to the interrupt, then signal exit
116
+ await new Promise((r) => setTimeout(r, 50));
117
+ fakeActivity._schedulingPaused = true;
118
+ fakeActivity.q_updated = new Future();
119
+ fakeActivity.q_updated.resolve();
120
+ ac.abort();
121
+
122
+ const result = await raceTimeout(mainTaskPromise, 2000);
123
+ expect(result).toBe('resolved');
124
+ });
125
+
126
+ it('should process next queued handle after an interrupted one', async () => {
127
+ const { fakeActivity, mainTask, speechQueue, q_updated } = buildMainTaskRunner();
128
+
129
+ const handleA = SpeechHandle.create({ allowInterruptions: true });
130
+ const handleB = SpeechHandle.create({ allowInterruptions: true });
131
+
132
+ speechQueue.push([SpeechHandle.SPEECH_PRIORITY_NORMAL, 1, handleA]);
133
+ handleA._markScheduled();
134
+ speechQueue.push([SpeechHandle.SPEECH_PRIORITY_NORMAL, 2, handleB]);
135
+ handleB._markScheduled();
136
+ q_updated.resolve();
137
+
138
+ const ac = new AbortController();
139
+ const mainTaskPromise = mainTask(ac.signal);
140
+
141
+ // Wait for mainTask to pick up handle A
142
+ await new Promise((r) => setTimeout(r, 50));
143
+
144
+ // Interrupt handle A
145
+ handleA.interrupt();
146
+
147
+ // Wait for mainTask to move to handle B and authorize it
148
+ await new Promise((r) => setTimeout(r, 50));
149
+
150
+ // Resolve handle B's generation (simulating normal reply task completion).
151
+ // If mainTask is stuck on handle A (bug), handle B was never authorized and this
152
+ // throws — we catch it and let the timeout assert the real failure.
153
+ try {
154
+ handleB._markGenerationDone();
155
+ } catch {
156
+ // Expected when fix is absent: handle B has no active generation
157
+ }
158
+
159
+ // Let mainTask finish
160
+ await new Promise((r) => setTimeout(r, 50));
161
+ fakeActivity._schedulingPaused = true;
162
+ fakeActivity.q_updated = new Future();
163
+ fakeActivity.q_updated.resolve();
164
+ ac.abort();
165
+
166
+ const result = await raceTimeout(mainTaskPromise, 2000);
167
+ expect(result).toBe('resolved');
168
+ });
169
+
170
+ it('should skip handles that were interrupted before being popped', async () => {
171
+ const { fakeActivity, mainTask, speechQueue, q_updated } = buildMainTaskRunner();
172
+
173
+ const handle = SpeechHandle.create({ allowInterruptions: true });
174
+
175
+ // Interrupt before mainTask ever sees it
176
+ handle.interrupt();
177
+
178
+ speechQueue.push([SpeechHandle.SPEECH_PRIORITY_NORMAL, 1, handle]);
179
+ handle._markScheduled();
180
+ q_updated.resolve();
181
+
182
+ const ac = new AbortController();
183
+ const mainTaskPromise = mainTask(ac.signal);
184
+
185
+ await new Promise((r) => setTimeout(r, 50));
186
+ fakeActivity._schedulingPaused = true;
187
+ fakeActivity.q_updated = new Future();
188
+ fakeActivity.q_updated.resolve();
189
+ ac.abort();
190
+
191
+ const result = await raceTimeout(mainTaskPromise, 2000);
192
+ expect(result).toBe('resolved');
193
+ });
194
+ });
package/src/voice/agent_activity.ts +1 -1
@@ -1052,7 +1052,7 @@ export class AgentActivity implements RecognitionHooks {
1052
1052
 
1053
1053
  this._currentSpeech = speechHandle;
1054
1054
  speechHandle._authorizeGeneration();
1055
- await speechHandle._waitForGeneration();
1055
+ await speechHandle.waitIfNotInterrupted([speechHandle._waitForGeneration()]);
1056
1056
  this._currentSpeech = undefined;
1057
1057
  }
1058
1058
 
package/src/voice/audio_recognition.ts +4 -3
@@ -12,6 +12,7 @@ import {
12
12
  } from '@opentelemetry/api';
13
13
  import type { WritableStreamDefaultWriter } from 'node:stream/web';
14
14
  import { ReadableStream } from 'node:stream/web';
15
+ import type { LanguageCode } from '../language.js';
15
16
  import { type ChatContext } from '../llm/chat_context.js';
16
17
  import { log } from '../log.js';
17
18
  import { DeferredReadableStream, isStreamReaderReleaseError } from '../stream/deferred_stream.js';
@@ -58,8 +59,8 @@ export interface RecognitionHooks {
58
59
  }
59
60
 
60
61
  export interface _TurnDetector {
61
- unlikelyThreshold: (language?: string) => Promise<number | undefined>;
62
- supportsLanguage: (language?: string) => Promise<boolean>;
62
+ unlikelyThreshold: (language?: LanguageCode) => Promise<number | undefined>;
63
+ supportsLanguage: (language?: LanguageCode) => Promise<boolean>;
63
64
  predictEndOfTurn(chatCtx: ChatContext): Promise<number>;
64
65
  }
65
66
 
@@ -106,7 +107,7 @@ export class AudioRecognition {
106
107
  private turnDetectionMode?: Exclude<TurnDetectionMode, _TurnDetector>;
107
108
  private minEndpointingDelay: number;
108
109
  private maxEndpointingDelay: number;
109
- private lastLanguage?: string;
110
+ private lastLanguage?: LanguageCode;
110
111
  private rootSpanContext?: Context;
111
112
  private sttModel?: string;
112
113
  private sttProvider?: string;
package/src/voice/events.ts +3 -2
@@ -1,6 +1,7 @@
1
1
  // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
2
2
  //
3
3
  // SPDX-License-Identifier: Apache-2.0
4
+ import type { LanguageCode } from '../language.js';
4
5
  import type {
5
6
  ChatMessage,
6
7
  FunctionCall,
@@ -87,7 +88,7 @@ export type UserInputTranscribedEvent = {
87
88
  /** Not supported yet. Always null by default. */
88
89
  speakerId: string | null;
89
90
  createdAt: number;
90
- language: string | null;
91
+ language: LanguageCode | null;
91
92
  };
92
93
 
93
94
  export const createUserInputTranscribedEvent = ({
@@ -100,7 +101,7 @@ export const createUserInputTranscribedEvent = ({
100
101
  transcript: string;
101
102
  isFinal: boolean;
102
103
  speakerId?: string | null;
103
- language?: string | null;
104
+ language?: LanguageCode | null;
104
105
  createdAt?: number;
105
106
  }): UserInputTranscribedEvent => ({
106
107
  type: 'user_input_transcribed',