@livekit/agents 1.0.45 → 1.0.47
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.cjs +14 -20
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.d.ts.map +1 -1
- package/dist/cli.js +14 -20
- package/dist/cli.js.map +1 -1
- package/dist/ipc/job_proc_lazy_main.cjs +14 -5
- package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
- package/dist/ipc/job_proc_lazy_main.js +14 -5
- package/dist/ipc/job_proc_lazy_main.js.map +1 -1
- package/dist/llm/chat_context.cjs +19 -0
- package/dist/llm/chat_context.cjs.map +1 -1
- package/dist/llm/chat_context.d.cts +4 -0
- package/dist/llm/chat_context.d.ts +4 -0
- package/dist/llm/chat_context.d.ts.map +1 -1
- package/dist/llm/chat_context.js +19 -0
- package/dist/llm/chat_context.js.map +1 -1
- package/dist/llm/provider_format/index.cjs +2 -0
- package/dist/llm/provider_format/index.cjs.map +1 -1
- package/dist/llm/provider_format/index.d.cts +1 -1
- package/dist/llm/provider_format/index.d.ts +1 -1
- package/dist/llm/provider_format/index.d.ts.map +1 -1
- package/dist/llm/provider_format/index.js +6 -1
- package/dist/llm/provider_format/index.js.map +1 -1
- package/dist/llm/provider_format/openai.cjs +82 -2
- package/dist/llm/provider_format/openai.cjs.map +1 -1
- package/dist/llm/provider_format/openai.d.cts +1 -0
- package/dist/llm/provider_format/openai.d.ts +1 -0
- package/dist/llm/provider_format/openai.d.ts.map +1 -1
- package/dist/llm/provider_format/openai.js +80 -1
- package/dist/llm/provider_format/openai.js.map +1 -1
- package/dist/llm/provider_format/openai.test.cjs +326 -0
- package/dist/llm/provider_format/openai.test.cjs.map +1 -1
- package/dist/llm/provider_format/openai.test.js +327 -1
- package/dist/llm/provider_format/openai.test.js.map +1 -1
- package/dist/llm/provider_format/utils.cjs +4 -3
- package/dist/llm/provider_format/utils.cjs.map +1 -1
- package/dist/llm/provider_format/utils.d.ts.map +1 -1
- package/dist/llm/provider_format/utils.js +4 -3
- package/dist/llm/provider_format/utils.js.map +1 -1
- package/dist/llm/realtime.cjs.map +1 -1
- package/dist/llm/realtime.d.cts +1 -0
- package/dist/llm/realtime.d.ts +1 -0
- package/dist/llm/realtime.d.ts.map +1 -1
- package/dist/llm/realtime.js.map +1 -1
- package/dist/log.cjs +5 -2
- package/dist/log.cjs.map +1 -1
- package/dist/log.d.ts.map +1 -1
- package/dist/log.js +5 -2
- package/dist/log.js.map +1 -1
- package/dist/stream/deferred_stream.cjs +15 -6
- package/dist/stream/deferred_stream.cjs.map +1 -1
- package/dist/stream/deferred_stream.d.ts.map +1 -1
- package/dist/stream/deferred_stream.js +15 -6
- package/dist/stream/deferred_stream.js.map +1 -1
- package/dist/stream/index.cjs +3 -0
- package/dist/stream/index.cjs.map +1 -1
- package/dist/stream/index.d.cts +1 -0
- package/dist/stream/index.d.ts +1 -0
- package/dist/stream/index.d.ts.map +1 -1
- package/dist/stream/index.js +2 -0
- package/dist/stream/index.js.map +1 -1
- package/dist/stream/multi_input_stream.cjs +139 -0
- package/dist/stream/multi_input_stream.cjs.map +1 -0
- package/dist/stream/multi_input_stream.d.cts +55 -0
- package/dist/stream/multi_input_stream.d.ts +55 -0
- package/dist/stream/multi_input_stream.d.ts.map +1 -0
- package/dist/stream/multi_input_stream.js +115 -0
- package/dist/stream/multi_input_stream.js.map +1 -0
- package/dist/stream/multi_input_stream.test.cjs +340 -0
- package/dist/stream/multi_input_stream.test.cjs.map +1 -0
- package/dist/stream/multi_input_stream.test.js +339 -0
- package/dist/stream/multi_input_stream.test.js.map +1 -0
- package/dist/telemetry/trace_types.cjs +42 -0
- package/dist/telemetry/trace_types.cjs.map +1 -1
- package/dist/telemetry/trace_types.d.cts +14 -0
- package/dist/telemetry/trace_types.d.ts +14 -0
- package/dist/telemetry/trace_types.d.ts.map +1 -1
- package/dist/telemetry/trace_types.js +28 -0
- package/dist/telemetry/trace_types.js.map +1 -1
- package/dist/utils.cjs +44 -2
- package/dist/utils.cjs.map +1 -1
- package/dist/utils.d.cts +8 -0
- package/dist/utils.d.ts +8 -0
- package/dist/utils.d.ts.map +1 -1
- package/dist/utils.js +44 -2
- package/dist/utils.js.map +1 -1
- package/dist/utils.test.cjs +71 -0
- package/dist/utils.test.cjs.map +1 -1
- package/dist/utils.test.js +71 -0
- package/dist/utils.test.js.map +1 -1
- package/dist/version.cjs +1 -1
- package/dist/version.cjs.map +1 -1
- package/dist/version.d.cts +1 -1
- package/dist/version.d.ts +1 -1
- package/dist/version.d.ts.map +1 -1
- package/dist/version.js +1 -1
- package/dist/version.js.map +1 -1
- package/dist/voice/agent.cjs +144 -12
- package/dist/voice/agent.cjs.map +1 -1
- package/dist/voice/agent.d.cts +29 -4
- package/dist/voice/agent.d.ts +29 -4
- package/dist/voice/agent.d.ts.map +1 -1
- package/dist/voice/agent.js +140 -11
- package/dist/voice/agent.js.map +1 -1
- package/dist/voice/agent.test.cjs +120 -0
- package/dist/voice/agent.test.cjs.map +1 -1
- package/dist/voice/agent.test.js +122 -2
- package/dist/voice/agent.test.js.map +1 -1
- package/dist/voice/agent_activity.cjs +402 -292
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.cts +35 -7
- package/dist/voice/agent_activity.d.ts +35 -7
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +402 -287
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_session.cjs +156 -44
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +22 -9
- package/dist/voice/agent_session.d.ts +22 -9
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +156 -44
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/audio_recognition.cjs +89 -36
- package/dist/voice/audio_recognition.cjs.map +1 -1
- package/dist/voice/audio_recognition.d.cts +22 -1
- package/dist/voice/audio_recognition.d.ts +22 -1
- package/dist/voice/audio_recognition.d.ts.map +1 -1
- package/dist/voice/audio_recognition.js +93 -36
- package/dist/voice/audio_recognition.js.map +1 -1
- package/dist/voice/audio_recognition_span.test.cjs +233 -0
- package/dist/voice/audio_recognition_span.test.cjs.map +1 -0
- package/dist/voice/audio_recognition_span.test.js +232 -0
- package/dist/voice/audio_recognition_span.test.js.map +1 -0
- package/dist/voice/generation.cjs +39 -19
- package/dist/voice/generation.cjs.map +1 -1
- package/dist/voice/generation.d.ts.map +1 -1
- package/dist/voice/generation.js +44 -20
- package/dist/voice/generation.js.map +1 -1
- package/dist/voice/index.cjs +2 -0
- package/dist/voice/index.cjs.map +1 -1
- package/dist/voice/index.d.cts +1 -1
- package/dist/voice/index.d.ts +1 -1
- package/dist/voice/index.d.ts.map +1 -1
- package/dist/voice/index.js +2 -1
- package/dist/voice/index.js.map +1 -1
- package/dist/voice/io.cjs +6 -3
- package/dist/voice/io.cjs.map +1 -1
- package/dist/voice/io.d.cts +3 -2
- package/dist/voice/io.d.ts +3 -2
- package/dist/voice/io.d.ts.map +1 -1
- package/dist/voice/io.js +6 -3
- package/dist/voice/io.js.map +1 -1
- package/dist/voice/recorder_io/recorder_io.cjs +3 -1
- package/dist/voice/recorder_io/recorder_io.cjs.map +1 -1
- package/dist/voice/recorder_io/recorder_io.d.ts.map +1 -1
- package/dist/voice/recorder_io/recorder_io.js +3 -1
- package/dist/voice/recorder_io/recorder_io.js.map +1 -1
- package/dist/voice/room_io/_input.cjs +17 -17
- package/dist/voice/room_io/_input.cjs.map +1 -1
- package/dist/voice/room_io/_input.d.cts +2 -2
- package/dist/voice/room_io/_input.d.ts +2 -2
- package/dist/voice/room_io/_input.d.ts.map +1 -1
- package/dist/voice/room_io/_input.js +7 -6
- package/dist/voice/room_io/_input.js.map +1 -1
- package/dist/voice/room_io/room_io.cjs +9 -0
- package/dist/voice/room_io/room_io.cjs.map +1 -1
- package/dist/voice/room_io/room_io.d.cts +3 -1
- package/dist/voice/room_io/room_io.d.ts +3 -1
- package/dist/voice/room_io/room_io.d.ts.map +1 -1
- package/dist/voice/room_io/room_io.js +9 -0
- package/dist/voice/room_io/room_io.js.map +1 -1
- package/dist/voice/speech_handle.cjs +7 -1
- package/dist/voice/speech_handle.cjs.map +1 -1
- package/dist/voice/speech_handle.d.cts +2 -0
- package/dist/voice/speech_handle.d.ts +2 -0
- package/dist/voice/speech_handle.d.ts.map +1 -1
- package/dist/voice/speech_handle.js +8 -2
- package/dist/voice/speech_handle.js.map +1 -1
- package/dist/voice/testing/run_result.cjs +66 -15
- package/dist/voice/testing/run_result.cjs.map +1 -1
- package/dist/voice/testing/run_result.d.cts +14 -3
- package/dist/voice/testing/run_result.d.ts +14 -3
- package/dist/voice/testing/run_result.d.ts.map +1 -1
- package/dist/voice/testing/run_result.js +66 -15
- package/dist/voice/testing/run_result.js.map +1 -1
- package/dist/voice/utils.cjs +47 -0
- package/dist/voice/utils.cjs.map +1 -0
- package/dist/voice/utils.d.cts +4 -0
- package/dist/voice/utils.d.ts +4 -0
- package/dist/voice/utils.d.ts.map +1 -0
- package/dist/voice/utils.js +23 -0
- package/dist/voice/utils.js.map +1 -0
- package/package.json +1 -1
- package/src/cli.ts +20 -33
- package/src/ipc/job_proc_lazy_main.ts +16 -5
- package/src/llm/chat_context.ts +35 -0
- package/src/llm/provider_format/index.ts +7 -2
- package/src/llm/provider_format/openai.test.ts +385 -1
- package/src/llm/provider_format/openai.ts +103 -0
- package/src/llm/provider_format/utils.ts +6 -4
- package/src/llm/realtime.ts +1 -0
- package/src/log.ts +5 -2
- package/src/stream/deferred_stream.ts +17 -6
- package/src/stream/index.ts +1 -0
- package/src/stream/multi_input_stream.test.ts +540 -0
- package/src/stream/multi_input_stream.ts +172 -0
- package/src/telemetry/trace_types.ts +18 -0
- package/src/utils.test.ts +87 -0
- package/src/utils.ts +52 -2
- package/src/version.ts +1 -1
- package/src/voice/agent.test.ts +140 -2
- package/src/voice/agent.ts +189 -10
- package/src/voice/agent_activity.ts +449 -286
- package/src/voice/agent_session.ts +195 -51
- package/src/voice/audio_recognition.ts +118 -38
- package/src/voice/audio_recognition_span.test.ts +261 -0
- package/src/voice/generation.ts +52 -23
- package/src/voice/index.ts +1 -1
- package/src/voice/io.ts +7 -4
- package/src/voice/recorder_io/recorder_io.ts +2 -1
- package/src/voice/room_io/_input.ts +11 -7
- package/src/voice/room_io/room_io.ts +12 -0
- package/src/voice/speech_handle.ts +9 -2
- package/src/voice/testing/run_result.ts +81 -23
- package/src/voice/utils.ts +29 -0
|
@@ -0,0 +1,261 @@
|
|
|
1
|
+
// SPDX-FileCopyrightText: 2026 LiveKit, Inc.
|
|
2
|
+
//
|
|
3
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
import { ParticipantKind } from '@livekit/rtc-node';
|
|
5
|
+
import { InMemorySpanExporter, SimpleSpanProcessor } from '@opentelemetry/sdk-trace-base';
|
|
6
|
+
import { NodeTracerProvider } from '@opentelemetry/sdk-trace-node';
|
|
7
|
+
import { describe, expect, it, vi } from 'vitest';
|
|
8
|
+
import { initializeLogger } from '../log.js';
|
|
9
|
+
import { type SpeechEvent, SpeechEventType } from '../stt/stt.js';
|
|
10
|
+
import { setTracerProvider } from '../telemetry/index.js';
|
|
11
|
+
import { VAD, type VADEvent, VADEventType, type VADStream } from '../vad.js';
|
|
12
|
+
import { AudioRecognition, type _TurnDetector } from './audio_recognition.js';
|
|
13
|
+
|
|
14
|
+
function setupInMemoryTracing() {
|
|
15
|
+
const exporter = new InMemorySpanExporter();
|
|
16
|
+
const provider = new NodeTracerProvider();
|
|
17
|
+
provider.addSpanProcessor(new SimpleSpanProcessor(exporter));
|
|
18
|
+
provider.register();
|
|
19
|
+
setTracerProvider(provider);
|
|
20
|
+
return { exporter };
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
function spanByName(spans: any[], name: string) {
|
|
24
|
+
return spans.find((s) => s.name === name);
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
class FakeVADStream extends (Object as unknown as { new (): VADStream }) {
|
|
28
|
+
// We intentionally avoid extending the real VADStream (it is not exported as a value in JS output
|
|
29
|
+
// in some bundling contexts). Instead we emulate the async iterator shape used by AudioRecognition.
|
|
30
|
+
private events: VADEvent[];
|
|
31
|
+
private idx = 0;
|
|
32
|
+
constructor(events: VADEvent[]) {
|
|
33
|
+
super();
|
|
34
|
+
this.events = events;
|
|
35
|
+
}
|
|
36
|
+
updateInputStream() {}
|
|
37
|
+
detachInputStream() {}
|
|
38
|
+
close() {}
|
|
39
|
+
[Symbol.asyncIterator]() {
|
|
40
|
+
return this;
|
|
41
|
+
}
|
|
42
|
+
async next(): Promise<IteratorResult<VADEvent>> {
|
|
43
|
+
if (this.idx >= this.events.length) {
|
|
44
|
+
return { done: true, value: undefined };
|
|
45
|
+
}
|
|
46
|
+
const value = this.events[this.idx++]!;
|
|
47
|
+
return { done: false, value };
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
class FakeVAD extends VAD {
|
|
52
|
+
label = 'fake-vad';
|
|
53
|
+
private events: VADEvent[];
|
|
54
|
+
constructor(events: VADEvent[]) {
|
|
55
|
+
super({ updateInterval: 1 });
|
|
56
|
+
this.events = events;
|
|
57
|
+
}
|
|
58
|
+
stream(): any {
|
|
59
|
+
return new FakeVADStream(this.events);
|
|
60
|
+
}
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
const alwaysTrueTurnDetector: _TurnDetector = {
|
|
64
|
+
supportsLanguage: async () => true,
|
|
65
|
+
unlikelyThreshold: async () => undefined,
|
|
66
|
+
predictEndOfTurn: async () => 1.0,
|
|
67
|
+
};
|
|
68
|
+
|
|
69
|
+
describe('AudioRecognition user_turn span parity', () => {
|
|
70
|
+
initializeLogger({ pretty: false, level: 'silent' });
|
|
71
|
+
|
|
72
|
+
it('creates user_turn and parents eou_detection under it (stt mode)', async () => {
|
|
73
|
+
const { exporter } = setupInMemoryTracing();
|
|
74
|
+
|
|
75
|
+
const hooks = {
|
|
76
|
+
onStartOfSpeech: vi.fn(),
|
|
77
|
+
onVADInferenceDone: vi.fn(),
|
|
78
|
+
onEndOfSpeech: vi.fn(),
|
|
79
|
+
onInterimTranscript: vi.fn(),
|
|
80
|
+
onFinalTranscript: vi.fn(),
|
|
81
|
+
onPreemptiveGeneration: vi.fn(),
|
|
82
|
+
retrieveChatCtx: () =>
|
|
83
|
+
({
|
|
84
|
+
copy() {
|
|
85
|
+
return this;
|
|
86
|
+
},
|
|
87
|
+
addMessage() {},
|
|
88
|
+
toJSON() {
|
|
89
|
+
return { items: [] };
|
|
90
|
+
},
|
|
91
|
+
}) as any,
|
|
92
|
+
onEndOfTurn: vi.fn(async () => true),
|
|
93
|
+
};
|
|
94
|
+
|
|
95
|
+
const sttEvents: SpeechEvent[] = [
|
|
96
|
+
{ type: SpeechEventType.START_OF_SPEECH },
|
|
97
|
+
{
|
|
98
|
+
type: SpeechEventType.FINAL_TRANSCRIPT,
|
|
99
|
+
alternatives: [
|
|
100
|
+
{
|
|
101
|
+
language: 'en',
|
|
102
|
+
text: 'hello',
|
|
103
|
+
startTime: 0,
|
|
104
|
+
endTime: 0,
|
|
105
|
+
confidence: 0.9,
|
|
106
|
+
},
|
|
107
|
+
],
|
|
108
|
+
},
|
|
109
|
+
{ type: SpeechEventType.END_OF_SPEECH },
|
|
110
|
+
];
|
|
111
|
+
|
|
112
|
+
const sttNode = async () =>
|
|
113
|
+
new ReadableStream<SpeechEvent>({
|
|
114
|
+
start(controller) {
|
|
115
|
+
for (const ev of sttEvents) controller.enqueue(ev);
|
|
116
|
+
controller.close();
|
|
117
|
+
},
|
|
118
|
+
});
|
|
119
|
+
|
|
120
|
+
const ar = new AudioRecognition({
|
|
121
|
+
recognitionHooks: hooks as any,
|
|
122
|
+
stt: sttNode as any,
|
|
123
|
+
vad: undefined,
|
|
124
|
+
turnDetector: alwaysTrueTurnDetector,
|
|
125
|
+
turnDetectionMode: 'stt',
|
|
126
|
+
minEndpointingDelay: 0,
|
|
127
|
+
maxEndpointingDelay: 0,
|
|
128
|
+
sttModel: 'deepgram-nova2',
|
|
129
|
+
sttProvider: 'deepgram',
|
|
130
|
+
getLinkedParticipant: () => ({ sid: 'p1', identity: 'bob', kind: ParticipantKind.AGENT }),
|
|
131
|
+
});
|
|
132
|
+
|
|
133
|
+
await ar.start();
|
|
134
|
+
// allow background task to drain
|
|
135
|
+
await new Promise((r) => setTimeout(r, 20));
|
|
136
|
+
await ar.close();
|
|
137
|
+
|
|
138
|
+
const spans = exporter.getFinishedSpans();
|
|
139
|
+
const userTurn = spanByName(spans, 'user_turn');
|
|
140
|
+
const eou = spanByName(spans, 'eou_detection');
|
|
141
|
+
expect(userTurn, 'user_turn span missing').toBeTruthy();
|
|
142
|
+
expect(eou, 'eou_detection span missing').toBeTruthy();
|
|
143
|
+
|
|
144
|
+
expect(eou.parentSpanId).toBe(userTurn.spanContext().spanId);
|
|
145
|
+
|
|
146
|
+
// creation-time attributes
|
|
147
|
+
expect(userTurn.attributes['lk.participant_id']).toBe('p1');
|
|
148
|
+
expect(userTurn.attributes['lk.participant_identity']).toBe('bob');
|
|
149
|
+
expect(userTurn.attributes['lk.participant_kind']).toBe('AGENT');
|
|
150
|
+
expect(userTurn.attributes['gen_ai.request.model']).toBe('deepgram-nova2');
|
|
151
|
+
expect(userTurn.attributes['gen_ai.provider.name']).toBe('deepgram');
|
|
152
|
+
|
|
153
|
+
// end-of-turn attributes
|
|
154
|
+
expect(userTurn.attributes['lk.user_transcript']).toContain('hello');
|
|
155
|
+
expect(userTurn.attributes['lk.transcript_confidence']).toBeGreaterThan(0);
|
|
156
|
+
});
|
|
157
|
+
|
|
158
|
+
it('creates user_turn from VAD startTime (vad mode) and keeps same parenting', async () => {
|
|
159
|
+
const { exporter } = setupInMemoryTracing();
|
|
160
|
+
|
|
161
|
+
const hooks = {
|
|
162
|
+
onStartOfSpeech: vi.fn(),
|
|
163
|
+
onVADInferenceDone: vi.fn(),
|
|
164
|
+
onEndOfSpeech: vi.fn(),
|
|
165
|
+
onInterimTranscript: vi.fn(),
|
|
166
|
+
onFinalTranscript: vi.fn(),
|
|
167
|
+
onPreemptiveGeneration: vi.fn(),
|
|
168
|
+
retrieveChatCtx: () =>
|
|
169
|
+
({
|
|
170
|
+
copy() {
|
|
171
|
+
return this;
|
|
172
|
+
},
|
|
173
|
+
addMessage() {},
|
|
174
|
+
toJSON() {
|
|
175
|
+
return { items: [] };
|
|
176
|
+
},
|
|
177
|
+
}) as any,
|
|
178
|
+
onEndOfTurn: vi.fn(async () => true),
|
|
179
|
+
};
|
|
180
|
+
|
|
181
|
+
const now = Date.now();
|
|
182
|
+
const vadEvents: VADEvent[] = [
|
|
183
|
+
{
|
|
184
|
+
type: VADEventType.START_OF_SPEECH,
|
|
185
|
+
samplesIndex: 0,
|
|
186
|
+
timestamp: now,
|
|
187
|
+
speechDuration: 100,
|
|
188
|
+
silenceDuration: 0,
|
|
189
|
+
frames: [],
|
|
190
|
+
probability: 0,
|
|
191
|
+
inferenceDuration: 0,
|
|
192
|
+
speaking: true,
|
|
193
|
+
rawAccumulatedSilence: 0,
|
|
194
|
+
rawAccumulatedSpeech: 0,
|
|
195
|
+
},
|
|
196
|
+
{
|
|
197
|
+
type: VADEventType.END_OF_SPEECH,
|
|
198
|
+
samplesIndex: 0,
|
|
199
|
+
timestamp: now + 200,
|
|
200
|
+
speechDuration: 100,
|
|
201
|
+
silenceDuration: 100,
|
|
202
|
+
frames: [],
|
|
203
|
+
probability: 0,
|
|
204
|
+
inferenceDuration: 0,
|
|
205
|
+
speaking: false,
|
|
206
|
+
rawAccumulatedSilence: 0,
|
|
207
|
+
rawAccumulatedSpeech: 0,
|
|
208
|
+
},
|
|
209
|
+
];
|
|
210
|
+
|
|
211
|
+
const sttEvents: SpeechEvent[] = [
|
|
212
|
+
{
|
|
213
|
+
type: SpeechEventType.FINAL_TRANSCRIPT,
|
|
214
|
+
alternatives: [
|
|
215
|
+
{
|
|
216
|
+
language: 'en',
|
|
217
|
+
text: 'test',
|
|
218
|
+
startTime: 0,
|
|
219
|
+
endTime: 0,
|
|
220
|
+
confidence: 0.8,
|
|
221
|
+
},
|
|
222
|
+
],
|
|
223
|
+
},
|
|
224
|
+
];
|
|
225
|
+
|
|
226
|
+
const sttNode = async () =>
|
|
227
|
+
new ReadableStream<SpeechEvent>({
|
|
228
|
+
start(controller) {
|
|
229
|
+
for (const ev of sttEvents) controller.enqueue(ev);
|
|
230
|
+
controller.close();
|
|
231
|
+
},
|
|
232
|
+
});
|
|
233
|
+
|
|
234
|
+
const ar = new AudioRecognition({
|
|
235
|
+
recognitionHooks: hooks as any,
|
|
236
|
+
stt: sttNode as any,
|
|
237
|
+
vad: new FakeVAD(vadEvents) as any,
|
|
238
|
+
turnDetector: alwaysTrueTurnDetector,
|
|
239
|
+
turnDetectionMode: 'vad',
|
|
240
|
+
minEndpointingDelay: 0,
|
|
241
|
+
maxEndpointingDelay: 0,
|
|
242
|
+
sttModel: 'stt-model',
|
|
243
|
+
sttProvider: 'stt-provider',
|
|
244
|
+
getLinkedParticipant: () => ({ sid: 'p2', identity: 'alice', kind: ParticipantKind.AGENT }),
|
|
245
|
+
});
|
|
246
|
+
|
|
247
|
+
await ar.start();
|
|
248
|
+
await new Promise((r) => setTimeout(r, 20));
|
|
249
|
+
await ar.close();
|
|
250
|
+
|
|
251
|
+
const spans = exporter.getFinishedSpans();
|
|
252
|
+
const userTurn = spanByName(spans, 'user_turn');
|
|
253
|
+
const eou = spanByName(spans, 'eou_detection');
|
|
254
|
+
expect(userTurn).toBeTruthy();
|
|
255
|
+
expect(eou).toBeTruthy();
|
|
256
|
+
expect(eou.parentSpanId).toBe(userTurn.spanContext().spanId);
|
|
257
|
+
|
|
258
|
+
expect(hooks.onStartOfSpeech).toHaveBeenCalled();
|
|
259
|
+
expect(hooks.onEndOfSpeech).toHaveBeenCalled();
|
|
260
|
+
});
|
|
261
|
+
});
|
package/src/voice/generation.ts
CHANGED
|
@@ -26,7 +26,13 @@ import { IdentityTransform } from '../stream/identity_transform.js';
|
|
|
26
26
|
import { traceTypes, tracer } from '../telemetry/index.js';
|
|
27
27
|
import { USERDATA_TIMED_TRANSCRIPT } from '../types.js';
|
|
28
28
|
import { Future, Task, shortuuid, toError, waitForAbort } from '../utils.js';
|
|
29
|
-
import {
|
|
29
|
+
import {
|
|
30
|
+
type Agent,
|
|
31
|
+
type ModelSettings,
|
|
32
|
+
_setActivityTaskInfo,
|
|
33
|
+
functionCallStorage,
|
|
34
|
+
isStopResponse,
|
|
35
|
+
} from './agent.js';
|
|
30
36
|
import type { AgentSession } from './agent_session.js';
|
|
31
37
|
import {
|
|
32
38
|
AudioOutput,
|
|
@@ -719,7 +725,7 @@ export interface _AudioOut {
|
|
|
719
725
|
|
|
720
726
|
async function forwardAudio(
|
|
721
727
|
ttsStream: ReadableStream<AudioFrame>,
|
|
722
|
-
|
|
728
|
+
audioOutput: AudioOutput,
|
|
723
729
|
out: _AudioOut,
|
|
724
730
|
signal?: AbortSignal,
|
|
725
731
|
): Promise<void> {
|
|
@@ -733,8 +739,8 @@ async function forwardAudio(
|
|
|
733
739
|
};
|
|
734
740
|
|
|
735
741
|
try {
|
|
736
|
-
|
|
737
|
-
|
|
742
|
+
audioOutput.on(AudioOutput.EVENT_PLAYBACK_STARTED, onPlaybackStarted);
|
|
743
|
+
audioOutput.resume();
|
|
738
744
|
|
|
739
745
|
while (true) {
|
|
740
746
|
if (signal?.aborted) {
|
|
@@ -748,36 +754,36 @@ async function forwardAudio(
|
|
|
748
754
|
|
|
749
755
|
if (
|
|
750
756
|
!out.firstFrameFut.done &&
|
|
751
|
-
|
|
752
|
-
|
|
757
|
+
audioOutput.sampleRate &&
|
|
758
|
+
audioOutput.sampleRate !== frame.sampleRate &&
|
|
753
759
|
!resampler
|
|
754
760
|
) {
|
|
755
|
-
resampler = new AudioResampler(frame.sampleRate,
|
|
761
|
+
resampler = new AudioResampler(frame.sampleRate, audioOutput.sampleRate, 1);
|
|
756
762
|
}
|
|
757
763
|
|
|
758
764
|
if (resampler) {
|
|
759
765
|
for (const f of resampler.push(frame)) {
|
|
760
|
-
await
|
|
766
|
+
await audioOutput.captureFrame(f);
|
|
761
767
|
}
|
|
762
768
|
} else {
|
|
763
|
-
await
|
|
769
|
+
await audioOutput.captureFrame(frame);
|
|
764
770
|
}
|
|
765
771
|
}
|
|
766
772
|
|
|
767
773
|
if (resampler) {
|
|
768
774
|
for (const f of resampler.flush()) {
|
|
769
|
-
await
|
|
775
|
+
await audioOutput.captureFrame(f);
|
|
770
776
|
}
|
|
771
777
|
}
|
|
772
778
|
} finally {
|
|
773
|
-
|
|
779
|
+
audioOutput.off(AudioOutput.EVENT_PLAYBACK_STARTED, onPlaybackStarted);
|
|
774
780
|
|
|
775
781
|
if (!out.firstFrameFut.done) {
|
|
776
782
|
out.firstFrameFut.reject(new Error('audio forwarding cancelled before playback started'));
|
|
777
783
|
}
|
|
778
784
|
|
|
779
785
|
reader?.releaseLock();
|
|
780
|
-
|
|
786
|
+
audioOutput.flush();
|
|
781
787
|
}
|
|
782
788
|
}
|
|
783
789
|
|
|
@@ -836,7 +842,7 @@ export function performToolExecutions({
|
|
|
836
842
|
const signal = controller.signal;
|
|
837
843
|
const reader = toolCallStream.getReader();
|
|
838
844
|
|
|
839
|
-
const tasks:
|
|
845
|
+
const tasks: Task<void>[] = [];
|
|
840
846
|
while (!signal.aborted) {
|
|
841
847
|
const { done, value: toolCall } = await reader.read();
|
|
842
848
|
if (signal.aborted) break;
|
|
@@ -929,14 +935,6 @@ export function performToolExecutions({
|
|
|
929
935
|
'Executing LLM tool call',
|
|
930
936
|
);
|
|
931
937
|
|
|
932
|
-
const toolExecution = asyncLocalStorage.run({ functionCall: toolCall }, async () => {
|
|
933
|
-
return await tool.execute(parsedArgs, {
|
|
934
|
-
ctx: new RunContext(session, speechHandle, toolCall),
|
|
935
|
-
toolCallId: toolCall.callId,
|
|
936
|
-
abortSignal: signal,
|
|
937
|
-
});
|
|
938
|
-
});
|
|
939
|
-
|
|
940
938
|
const _tracableToolExecutionImpl = async (toolExecTask: Promise<unknown>, span: Span) => {
|
|
941
939
|
span.setAttribute(traceTypes.ATTR_FUNCTION_TOOL_NAME, toolCall.name);
|
|
942
940
|
span.setAttribute(traceTypes.ATTR_FUNCTION_TOOL_ARGS, toolCall.args);
|
|
@@ -993,11 +991,42 @@ export function performToolExecutions({
|
|
|
993
991
|
name: 'function_tool',
|
|
994
992
|
});
|
|
995
993
|
|
|
994
|
+
const toolTask = Task.from(
|
|
995
|
+
async () => {
|
|
996
|
+
// Ensure this task is marked inline before user tool code executes.
|
|
997
|
+
const currentTask = Task.current();
|
|
998
|
+
if (currentTask) {
|
|
999
|
+
_setActivityTaskInfo(currentTask, {
|
|
1000
|
+
speechHandle,
|
|
1001
|
+
functionCall: toolCall,
|
|
1002
|
+
inlineTask: true,
|
|
1003
|
+
});
|
|
1004
|
+
}
|
|
1005
|
+
|
|
1006
|
+
const toolExecution = functionCallStorage.run({ functionCall: toolCall }, async () => {
|
|
1007
|
+
return await tool.execute(parsedArgs, {
|
|
1008
|
+
ctx: new RunContext(session, speechHandle, toolCall),
|
|
1009
|
+
toolCallId: toolCall.callId,
|
|
1010
|
+
abortSignal: signal,
|
|
1011
|
+
});
|
|
1012
|
+
});
|
|
1013
|
+
|
|
1014
|
+
await tracableToolExecution(toolExecution);
|
|
1015
|
+
},
|
|
1016
|
+
controller,
|
|
1017
|
+
`performToolExecution:${toolCall.name}`,
|
|
1018
|
+
);
|
|
1019
|
+
|
|
1020
|
+
_setActivityTaskInfo(toolTask, {
|
|
1021
|
+
speechHandle,
|
|
1022
|
+
functionCall: toolCall,
|
|
1023
|
+
inlineTask: true,
|
|
1024
|
+
});
|
|
996
1025
|
// wait, not cancelling all tool calling tasks
|
|
997
|
-
tasks.push(
|
|
1026
|
+
tasks.push(toolTask);
|
|
998
1027
|
}
|
|
999
1028
|
|
|
1000
|
-
await Promise.allSettled(tasks);
|
|
1029
|
+
await Promise.allSettled(tasks.map((task) => task.result));
|
|
1001
1030
|
if (toolOutput.output.length > 0) {
|
|
1002
1031
|
logger.debug(
|
|
1003
1032
|
{
|
package/src/voice/index.ts
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
// SPDX-FileCopyrightText: 2025 LiveKit, Inc.
|
|
2
2
|
//
|
|
3
3
|
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
-
export { Agent, StopResponse, type AgentOptions, type ModelSettings } from './agent.js';
|
|
4
|
+
export { Agent, AgentTask, StopResponse, type AgentOptions, type ModelSettings } from './agent.js';
|
|
5
5
|
export { AgentSession, type AgentSessionOptions, type VoiceOptions } from './agent_session.js';
|
|
6
6
|
export * from './avatar/index.js';
|
|
7
7
|
export * from './background_audio.js';
|
package/src/voice/io.ts
CHANGED
|
@@ -8,7 +8,7 @@ import type { ChatContext } from '../llm/chat_context.js';
|
|
|
8
8
|
import type { ChatChunk } from '../llm/llm.js';
|
|
9
9
|
import type { ToolContext } from '../llm/tool_context.js';
|
|
10
10
|
import { log } from '../log.js';
|
|
11
|
-
import {
|
|
11
|
+
import { MultiInputStream } from '../stream/multi_input_stream.js';
|
|
12
12
|
import type { SpeechEvent } from '../stt/stt.js';
|
|
13
13
|
import { Future } from '../utils.js';
|
|
14
14
|
import type { ModelSettings } from './agent.js';
|
|
@@ -84,11 +84,14 @@ export interface AudioOutputCapabilities {
|
|
|
84
84
|
}
|
|
85
85
|
|
|
86
86
|
export abstract class AudioInput {
|
|
87
|
-
protected
|
|
88
|
-
new DeferredReadableStream<AudioFrame>();
|
|
87
|
+
protected multiStream: MultiInputStream<AudioFrame> = new MultiInputStream<AudioFrame>();
|
|
89
88
|
|
|
90
89
|
get stream(): ReadableStream<AudioFrame> {
|
|
91
|
-
return this.
|
|
90
|
+
return this.multiStream.stream;
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
async close(): Promise<void> {
|
|
94
|
+
await this.multiStream.close();
|
|
92
95
|
}
|
|
93
96
|
|
|
94
97
|
onAttached(): void {}
|
|
@@ -105,6 +105,7 @@ export class RecorderIO {
|
|
|
105
105
|
await this.outChan.close();
|
|
106
106
|
await this.closeFuture.await;
|
|
107
107
|
await cancelAndWait([this.forwardTask!, this.encodeTask!]);
|
|
108
|
+
await this.inRecord?.close();
|
|
108
109
|
|
|
109
110
|
this.started = false;
|
|
110
111
|
} finally {
|
|
@@ -378,7 +379,7 @@ class RecorderAudioInput extends AudioInput {
|
|
|
378
379
|
this.source = source;
|
|
379
380
|
|
|
380
381
|
// Set up the intercepting stream
|
|
381
|
-
this.
|
|
382
|
+
this.multiStream.addInputStream(this.createInterceptingStream());
|
|
382
383
|
}
|
|
383
384
|
|
|
384
385
|
/**
|
|
@@ -1,9 +1,10 @@
|
|
|
1
1
|
// SPDX-FileCopyrightText: 2025 LiveKit, Inc.
|
|
2
2
|
//
|
|
3
3
|
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
-
import { type AudioFrame, FrameProcessor } from '@livekit/rtc-node';
|
|
5
4
|
import {
|
|
5
|
+
type AudioFrame,
|
|
6
6
|
AudioStream,
|
|
7
|
+
FrameProcessor,
|
|
7
8
|
type NoiseCancellationOptions,
|
|
8
9
|
RemoteParticipant,
|
|
9
10
|
type RemoteTrack,
|
|
@@ -25,7 +26,9 @@ export class ParticipantAudioInputStream extends AudioInput {
|
|
|
25
26
|
private frameProcessor?: FrameProcessor<AudioFrame>;
|
|
26
27
|
private publication: RemoteTrackPublication | null = null;
|
|
27
28
|
private participantIdentity: string | null = null;
|
|
29
|
+
private currentInputId: string | null = null;
|
|
28
30
|
private logger = log();
|
|
31
|
+
|
|
29
32
|
constructor({
|
|
30
33
|
room,
|
|
31
34
|
sampleRate,
|
|
@@ -121,8 +124,9 @@ export class ParticipantAudioInputStream extends AudioInput {
|
|
|
121
124
|
};
|
|
122
125
|
|
|
123
126
|
private closeStream() {
|
|
124
|
-
if (this.
|
|
125
|
-
this.
|
|
127
|
+
if (this.currentInputId) {
|
|
128
|
+
void this.multiStream.removeInputStream(this.currentInputId);
|
|
129
|
+
this.currentInputId = null;
|
|
126
130
|
}
|
|
127
131
|
|
|
128
132
|
this.publication = null;
|
|
@@ -143,7 +147,7 @@ export class ParticipantAudioInputStream extends AudioInput {
|
|
|
143
147
|
}
|
|
144
148
|
this.closeStream();
|
|
145
149
|
this.publication = publication;
|
|
146
|
-
this.
|
|
150
|
+
this.currentInputId = this.multiStream.addInputStream(
|
|
147
151
|
resampleStream({
|
|
148
152
|
stream: this.createStream(track),
|
|
149
153
|
outputRate: this.sampleRate,
|
|
@@ -179,14 +183,14 @@ export class ParticipantAudioInputStream extends AudioInput {
|
|
|
179
183
|
}) as unknown as ReadableStream<AudioFrame>;
|
|
180
184
|
}
|
|
181
185
|
|
|
182
|
-
async close() {
|
|
186
|
+
override async close() {
|
|
183
187
|
this.room.off(RoomEvent.TrackSubscribed, this.onTrackSubscribed);
|
|
184
188
|
this.room.off(RoomEvent.TrackUnpublished, this.onTrackUnpublished);
|
|
185
189
|
this.room.off(RoomEvent.TokenRefreshed, this.onTokenRefreshed);
|
|
186
190
|
this.closeStream();
|
|
191
|
+
await super.close();
|
|
192
|
+
|
|
187
193
|
this.frameProcessor?.close();
|
|
188
194
|
this.frameProcessor = undefined;
|
|
189
|
-
// Ignore errors - stream may be locked by RecorderIO or already cancelled
|
|
190
|
-
await this.deferredStream.stream.cancel().catch(() => {});
|
|
191
195
|
}
|
|
192
196
|
}
|
|
@@ -376,6 +376,18 @@ export class RoomIO {
|
|
|
376
376
|
return this.participantAvailableFuture.done;
|
|
377
377
|
}
|
|
378
378
|
|
|
379
|
+
get linkedParticipant(): RemoteParticipant | undefined {
|
|
380
|
+
if (!this.isParticipantAvailable) {
|
|
381
|
+
return undefined;
|
|
382
|
+
}
|
|
383
|
+
|
|
384
|
+
return this.participantAvailableFuture.result;
|
|
385
|
+
}
|
|
386
|
+
|
|
387
|
+
get localParticipant(): Participant | undefined {
|
|
388
|
+
return this.room.localParticipant ?? undefined;
|
|
389
|
+
}
|
|
390
|
+
|
|
379
391
|
/** Switch to a different participant */
|
|
380
392
|
setParticipant(participantIdentity: string | null) {
|
|
381
393
|
this.logger.debug({ participantIdentity }, 'setting participant');
|
|
@@ -5,7 +5,7 @@ import type { Context } from '@opentelemetry/api';
|
|
|
5
5
|
import type { ChatItem } from '../llm/index.js';
|
|
6
6
|
import type { Task } from '../utils.js';
|
|
7
7
|
import { Event, Future, shortuuid } from '../utils.js';
|
|
8
|
-
import {
|
|
8
|
+
import { functionCallStorage } from './agent.js';
|
|
9
9
|
|
|
10
10
|
/** Symbol used to identify SpeechHandle instances */
|
|
11
11
|
const SPEECH_HANDLE_SYMBOL = Symbol.for('livekit.agents.SpeechHandle');
|
|
@@ -46,6 +46,9 @@ export class SpeechHandle {
|
|
|
46
46
|
/** @internal - OpenTelemetry context for the agent turn span */
|
|
47
47
|
_agentTurnContext?: Context;
|
|
48
48
|
|
|
49
|
+
/** @internal - used by AgentTask/RunResult final output plumbing */
|
|
50
|
+
_maybeRunFinalOutput?: unknown;
|
|
51
|
+
|
|
49
52
|
private itemAddedCallbacks: Set<(item: ChatItem) => void> = new Set();
|
|
50
53
|
private doneCallbacks: Set<(sh: SpeechHandle) => void> = new Set();
|
|
51
54
|
|
|
@@ -148,7 +151,7 @@ export class SpeechHandle {
|
|
|
148
151
|
* has entirely played out, including any tool calls and response follow-ups.
|
|
149
152
|
*/
|
|
150
153
|
async waitForPlayout(): Promise<void> {
|
|
151
|
-
const store =
|
|
154
|
+
const store = functionCallStorage.getStore();
|
|
152
155
|
if (store && store?.functionCall) {
|
|
153
156
|
throw new Error(
|
|
154
157
|
`Cannot call 'SpeechHandle.waitForPlayout()' from inside the function tool '${store.functionCall.name}'. ` +
|
|
@@ -167,6 +170,10 @@ export class SpeechHandle {
|
|
|
167
170
|
}
|
|
168
171
|
|
|
169
172
|
addDoneCallback(callback: (sh: SpeechHandle) => void) {
|
|
173
|
+
if (this.done()) {
|
|
174
|
+
queueMicrotask(() => callback(this));
|
|
175
|
+
return;
|
|
176
|
+
}
|
|
170
177
|
this.doneCallbacks.add(callback);
|
|
171
178
|
}
|
|
172
179
|
|