@livekit/agents 1.0.21 → 1.0.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/inference/api_protos.cjs +2 -2
- package/dist/inference/api_protos.cjs.map +1 -1
- package/dist/inference/api_protos.d.cts +16 -16
- package/dist/inference/api_protos.d.ts +16 -16
- package/dist/inference/api_protos.js +2 -2
- package/dist/inference/api_protos.js.map +1 -1
- package/dist/inference/stt.cjs +42 -30
- package/dist/inference/stt.cjs.map +1 -1
- package/dist/inference/stt.d.ts.map +1 -1
- package/dist/inference/stt.js +42 -30
- package/dist/inference/stt.js.map +1 -1
- package/dist/inference/tts.cjs +2 -3
- package/dist/inference/tts.cjs.map +1 -1
- package/dist/inference/tts.d.ts.map +1 -1
- package/dist/inference/tts.js +2 -3
- package/dist/inference/tts.js.map +1 -1
- package/dist/ipc/job_proc_lazy_main.cjs +35 -1
- package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
- package/dist/ipc/job_proc_lazy_main.js +13 -1
- package/dist/ipc/job_proc_lazy_main.js.map +1 -1
- package/dist/job.cjs +52 -6
- package/dist/job.cjs.map +1 -1
- package/dist/job.d.cts +2 -0
- package/dist/job.d.ts +2 -0
- package/dist/job.d.ts.map +1 -1
- package/dist/job.js +52 -6
- package/dist/job.js.map +1 -1
- package/dist/llm/llm.cjs +38 -3
- package/dist/llm/llm.cjs.map +1 -1
- package/dist/llm/llm.d.cts +1 -0
- package/dist/llm/llm.d.ts +1 -0
- package/dist/llm/llm.d.ts.map +1 -1
- package/dist/llm/llm.js +38 -3
- package/dist/llm/llm.js.map +1 -1
- package/dist/log.cjs +34 -10
- package/dist/log.cjs.map +1 -1
- package/dist/log.d.cts +7 -0
- package/dist/log.d.ts +7 -0
- package/dist/log.d.ts.map +1 -1
- package/dist/log.js +34 -11
- package/dist/log.js.map +1 -1
- package/dist/stt/stt.cjs +18 -5
- package/dist/stt/stt.cjs.map +1 -1
- package/dist/stt/stt.d.ts.map +1 -1
- package/dist/stt/stt.js +18 -5
- package/dist/stt/stt.js.map +1 -1
- package/dist/telemetry/index.cjs +23 -2
- package/dist/telemetry/index.cjs.map +1 -1
- package/dist/telemetry/index.d.cts +4 -1
- package/dist/telemetry/index.d.ts +4 -1
- package/dist/telemetry/index.d.ts.map +1 -1
- package/dist/telemetry/index.js +27 -2
- package/dist/telemetry/index.js.map +1 -1
- package/dist/telemetry/logging.cjs +65 -0
- package/dist/telemetry/logging.cjs.map +1 -0
- package/dist/telemetry/logging.d.cts +21 -0
- package/dist/telemetry/logging.d.ts +21 -0
- package/dist/telemetry/logging.d.ts.map +1 -0
- package/dist/telemetry/logging.js +40 -0
- package/dist/telemetry/logging.js.map +1 -0
- package/dist/telemetry/otel_http_exporter.cjs +144 -0
- package/dist/telemetry/otel_http_exporter.cjs.map +1 -0
- package/dist/telemetry/otel_http_exporter.d.cts +62 -0
- package/dist/telemetry/otel_http_exporter.d.ts +62 -0
- package/dist/telemetry/otel_http_exporter.d.ts.map +1 -0
- package/dist/telemetry/otel_http_exporter.js +120 -0
- package/dist/telemetry/otel_http_exporter.js.map +1 -0
- package/dist/telemetry/pino_otel_transport.cjs +217 -0
- package/dist/telemetry/pino_otel_transport.cjs.map +1 -0
- package/dist/telemetry/pino_otel_transport.d.cts +58 -0
- package/dist/telemetry/pino_otel_transport.d.ts +58 -0
- package/dist/telemetry/pino_otel_transport.d.ts.map +1 -0
- package/dist/telemetry/pino_otel_transport.js +189 -0
- package/dist/telemetry/pino_otel_transport.js.map +1 -0
- package/dist/telemetry/traces.cjs +225 -16
- package/dist/telemetry/traces.cjs.map +1 -1
- package/dist/telemetry/traces.d.cts +17 -0
- package/dist/telemetry/traces.d.ts +17 -0
- package/dist/telemetry/traces.d.ts.map +1 -1
- package/dist/telemetry/traces.js +211 -14
- package/dist/telemetry/traces.js.map +1 -1
- package/dist/tts/tts.cjs +68 -20
- package/dist/tts/tts.cjs.map +1 -1
- package/dist/tts/tts.d.cts +2 -0
- package/dist/tts/tts.d.ts +2 -0
- package/dist/tts/tts.d.ts.map +1 -1
- package/dist/tts/tts.js +68 -20
- package/dist/tts/tts.js.map +1 -1
- package/dist/utils.cjs +6 -0
- package/dist/utils.cjs.map +1 -1
- package/dist/utils.d.cts +1 -0
- package/dist/utils.d.ts +1 -0
- package/dist/utils.d.ts.map +1 -1
- package/dist/utils.js +5 -0
- package/dist/utils.js.map +1 -1
- package/dist/voice/agent_activity.cjs +93 -7
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.cts +3 -0
- package/dist/voice/agent_activity.d.ts +3 -0
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +93 -7
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_session.cjs +122 -27
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +15 -0
- package/dist/voice/agent_session.d.ts +15 -0
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +122 -27
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/audio_recognition.cjs +69 -22
- package/dist/voice/audio_recognition.cjs.map +1 -1
- package/dist/voice/audio_recognition.d.cts +5 -0
- package/dist/voice/audio_recognition.d.ts +5 -0
- package/dist/voice/audio_recognition.d.ts.map +1 -1
- package/dist/voice/audio_recognition.js +69 -22
- package/dist/voice/audio_recognition.js.map +1 -1
- package/dist/voice/generation.cjs +43 -3
- package/dist/voice/generation.cjs.map +1 -1
- package/dist/voice/generation.d.ts.map +1 -1
- package/dist/voice/generation.js +43 -3
- package/dist/voice/generation.js.map +1 -1
- package/dist/voice/report.cjs +3 -2
- package/dist/voice/report.cjs.map +1 -1
- package/dist/voice/report.d.cts +7 -1
- package/dist/voice/report.d.ts +7 -1
- package/dist/voice/report.d.ts.map +1 -1
- package/dist/voice/report.js +3 -2
- package/dist/voice/report.js.map +1 -1
- package/package.json +8 -2
- package/src/inference/api_protos.ts +2 -2
- package/src/inference/stt.ts +48 -33
- package/src/inference/tts.ts +4 -3
- package/src/ipc/job_proc_lazy_main.ts +12 -1
- package/src/job.ts +59 -10
- package/src/llm/llm.ts +48 -5
- package/src/log.ts +52 -15
- package/src/stt/stt.ts +18 -5
- package/src/telemetry/index.ts +22 -4
- package/src/telemetry/logging.ts +55 -0
- package/src/telemetry/otel_http_exporter.ts +191 -0
- package/src/telemetry/pino_otel_transport.ts +265 -0
- package/src/telemetry/traces.ts +320 -20
- package/src/tts/tts.ts +85 -24
- package/src/utils.ts +5 -0
- package/src/voice/agent_activity.ts +140 -22
- package/src/voice/agent_session.ts +174 -34
- package/src/voice/audio_recognition.ts +85 -26
- package/src/voice/generation.ts +59 -7
- package/src/voice/report.ts +10 -4
|
package/src/voice/agent_session.ts
CHANGED
|
@@ -3,6 +3,8 @@
|
|
|
3
3
|
// SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
import type { AudioFrame, Room } from '@livekit/rtc-node';
|
|
5
5
|
import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';
|
|
6
|
+
import type { Context, Span } from '@opentelemetry/api';
|
|
7
|
+
import { ROOT_CONTEXT, context as otelContext, trace } from '@opentelemetry/api';
|
|
6
8
|
import { EventEmitter } from 'node:events';
|
|
7
9
|
import type { ReadableStream } from 'node:stream/web';
|
|
8
10
|
import {
|
|
@@ -14,12 +16,14 @@ import {
|
|
|
14
16
|
type TTSModelString,
|
|
15
17
|
} from '../inference/index.js';
|
|
16
18
|
import { getJobContext } from '../job.js';
|
|
19
|
+
import type { FunctionCall, FunctionCallOutput } from '../llm/chat_context.js';
|
|
17
20
|
import { AgentHandoffItem, ChatContext, ChatMessage } from '../llm/chat_context.js';
|
|
18
21
|
import type { LLM, RealtimeModel, RealtimeModelError, ToolChoice } from '../llm/index.js';
|
|
19
22
|
import type { LLMError } from '../llm/llm.js';
|
|
20
23
|
import { log } from '../log.js';
|
|
21
24
|
import type { STT } from '../stt/index.js';
|
|
22
25
|
import type { STTError } from '../stt/stt.js';
|
|
26
|
+
import { traceTypes, tracer } from '../telemetry/index.js';
|
|
23
27
|
import type { TTS, TTSError } from '../tts/tts.js';
|
|
24
28
|
import type { VAD } from '../vad.js';
|
|
25
29
|
import type { Agent } from './agent.js';
|
|
@@ -128,9 +132,22 @@ export class AgentSession<
|
|
|
128
132
|
private closingTask: Promise<void> | null = null;
|
|
129
133
|
private userAwayTimer: NodeJS.Timeout | null = null;
|
|
130
134
|
|
|
135
|
+
private sessionSpan?: Span;
|
|
136
|
+
private userSpeakingSpan?: Span;
|
|
137
|
+
private agentSpeakingSpan?: Span;
|
|
138
|
+
|
|
139
|
+
/** @internal */
|
|
140
|
+
rootSpanContext?: Context;
|
|
141
|
+
|
|
131
142
|
/** @internal */
|
|
132
143
|
_recordedEvents: AgentEvent[] = [];
|
|
133
144
|
|
|
145
|
+
/** @internal */
|
|
146
|
+
_enableRecording = false;
|
|
147
|
+
|
|
148
|
+
/** @internal - Timestamp when the session started (milliseconds) */
|
|
149
|
+
_startedAt?: number;
|
|
150
|
+
|
|
134
151
|
constructor(opts: AgentSessionOptions<UserData>) {
|
|
135
152
|
super();
|
|
136
153
|
|
|
@@ -175,7 +192,8 @@ export class AgentSession<
|
|
|
175
192
|
this._chatCtx = ChatContext.empty();
|
|
176
193
|
this.options = { ...defaultVoiceOptions, ...voiceOptions };
|
|
177
194
|
|
|
178
|
-
this.
|
|
195
|
+
this._onUserInputTranscribed = this._onUserInputTranscribed.bind(this);
|
|
196
|
+
this.on(AgentSessionEventTypes.UserInputTranscribed, this._onUserInputTranscribed);
|
|
179
197
|
}
|
|
180
198
|
|
|
181
199
|
emit<K extends keyof AgentSessionCallbacks>(
|
|
@@ -211,25 +229,22 @@ export class AgentSession<
|
|
|
211
229
|
this._userData = value;
|
|
212
230
|
}
|
|
213
231
|
|
|
214
|
-
async
|
|
215
|
-
// TODO(brian): PR2 - Add setupCloudTracer() call if on LiveKit Cloud with recording enabled
|
|
216
|
-
// TODO(brian): PR3 - Add span: this._sessionSpan = tracer.startSpan('agent_session'), store as instance property
|
|
217
|
-
// TODO(brian): PR4 - Add setupCloudLogger() call in setupCloudTracer() to setup OTEL logging with Pino bridge
|
|
232
|
+
private async _startImpl({
|
|
218
233
|
agent,
|
|
219
234
|
room,
|
|
220
235
|
inputOptions,
|
|
221
236
|
outputOptions,
|
|
222
|
-
record
|
|
237
|
+
record,
|
|
238
|
+
span,
|
|
223
239
|
}: {
|
|
224
240
|
agent: Agent;
|
|
225
241
|
room: Room;
|
|
226
242
|
inputOptions?: Partial<RoomInputOptions>;
|
|
227
243
|
outputOptions?: Partial<RoomOutputOptions>;
|
|
228
|
-
record
|
|
244
|
+
record: boolean;
|
|
245
|
+
span: Span;
|
|
229
246
|
}): Promise<void> {
|
|
230
|
-
|
|
231
|
-
return;
|
|
232
|
-
}
|
|
247
|
+
span.setAttribute(traceTypes.ATTR_AGENT_LABEL, agent.id);
|
|
233
248
|
|
|
234
249
|
this.agent = agent;
|
|
235
250
|
this._updateAgentState('initializing');
|
|
@@ -291,9 +306,62 @@ export class AgentSession<
|
|
|
291
306
|
);
|
|
292
307
|
|
|
293
308
|
this.started = true;
|
|
309
|
+
this._startedAt = Date.now();
|
|
294
310
|
this._updateAgentState('listening');
|
|
295
311
|
}
|
|
296
312
|
|
|
313
|
+
async start({
|
|
314
|
+
agent,
|
|
315
|
+
room,
|
|
316
|
+
inputOptions,
|
|
317
|
+
outputOptions,
|
|
318
|
+
record = true,
|
|
319
|
+
}: {
|
|
320
|
+
agent: Agent;
|
|
321
|
+
room: Room;
|
|
322
|
+
inputOptions?: Partial<RoomInputOptions>;
|
|
323
|
+
outputOptions?: Partial<RoomOutputOptions>;
|
|
324
|
+
record?: boolean;
|
|
325
|
+
}): Promise<void> {
|
|
326
|
+
if (this.started) {
|
|
327
|
+
return;
|
|
328
|
+
}
|
|
329
|
+
|
|
330
|
+
const ctx = getJobContext();
|
|
331
|
+
|
|
332
|
+
record = record ?? ctx.info.job.enableRecording;
|
|
333
|
+
this._enableRecording = record;
|
|
334
|
+
|
|
335
|
+
this.logger.info(
|
|
336
|
+
{ record, enableRecording: ctx.info.job.enableRecording },
|
|
337
|
+
'Configuring session recording',
|
|
338
|
+
);
|
|
339
|
+
|
|
340
|
+
if (this._enableRecording) {
|
|
341
|
+
await ctx.initRecording();
|
|
342
|
+
}
|
|
343
|
+
|
|
344
|
+
// Create agent_session as a ROOT span (new trace) to match Python behavior
|
|
345
|
+
// This creates a separate trace for better cloud dashboard organization
|
|
346
|
+
this.sessionSpan = tracer.startSpan({
|
|
347
|
+
name: 'agent_session',
|
|
348
|
+
context: ROOT_CONTEXT,
|
|
349
|
+
});
|
|
350
|
+
|
|
351
|
+
// Set the session span as the active span in the context
|
|
352
|
+
// This ensures all child spans (agent_turn, user_turn, etc.) are parented to it
|
|
353
|
+
this.rootSpanContext = trace.setSpan(ROOT_CONTEXT, this.sessionSpan);
|
|
354
|
+
|
|
355
|
+
await this._startImpl({
|
|
356
|
+
agent,
|
|
357
|
+
room,
|
|
358
|
+
inputOptions,
|
|
359
|
+
outputOptions,
|
|
360
|
+
record,
|
|
361
|
+
span: this.sessionSpan,
|
|
362
|
+
});
|
|
363
|
+
}
|
|
364
|
+
|
|
297
365
|
updateAgent(agent: Agent): void {
|
|
298
366
|
this.agent = agent;
|
|
299
367
|
|
|
@@ -367,32 +435,41 @@ export class AgentSession<
|
|
|
367
435
|
}
|
|
368
436
|
|
|
369
437
|
private async updateActivity(agent: Agent): Promise<void> {
|
|
370
|
-
|
|
371
|
-
|
|
438
|
+
const runWithContext = async () => {
|
|
439
|
+
// TODO(AJS-129): add lock to agent activity core lifecycle
|
|
440
|
+
this.nextActivity = new AgentActivity(agent, this);
|
|
372
441
|
|
|
373
|
-
|
|
442
|
+
const previousActivity = this.activity;
|
|
374
443
|
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
444
|
+
if (this.activity) {
|
|
445
|
+
await this.activity.drain();
|
|
446
|
+
await this.activity.close();
|
|
447
|
+
}
|
|
379
448
|
|
|
380
|
-
|
|
381
|
-
|
|
449
|
+
this.activity = this.nextActivity;
|
|
450
|
+
this.nextActivity = undefined;
|
|
382
451
|
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
452
|
+
this._chatCtx.insert(
|
|
453
|
+
new AgentHandoffItem({
|
|
454
|
+
oldAgentId: previousActivity?.agent.id,
|
|
455
|
+
newAgentId: agent.id,
|
|
456
|
+
}),
|
|
457
|
+
);
|
|
458
|
+
this.logger.debug({ previousActivity, agent }, 'Agent handoff inserted into chat context');
|
|
390
459
|
|
|
391
|
-
|
|
460
|
+
await this.activity.start();
|
|
392
461
|
|
|
393
|
-
|
|
394
|
-
|
|
462
|
+
if (this._input.audio) {
|
|
463
|
+
this.activity.attachAudioInput(this._input.audio.stream);
|
|
464
|
+
}
|
|
465
|
+
};
|
|
466
|
+
|
|
467
|
+
// Run within session span context if available
|
|
468
|
+
if (this.rootSpanContext) {
|
|
469
|
+
return otelContext.with(this.rootSpanContext, runWithContext);
|
|
395
470
|
}
|
|
471
|
+
|
|
472
|
+
return runWithContext();
|
|
396
473
|
}
|
|
397
474
|
|
|
398
475
|
get chatCtx(): ChatContext {
|
|
@@ -452,14 +529,35 @@ export class AgentSession<
|
|
|
452
529
|
this.emit(AgentSessionEventTypes.ConversationItemAdded, createConversationItemAddedEvent(item));
|
|
453
530
|
}
|
|
454
531
|
|
|
532
|
+
/** @internal */
|
|
533
|
+
_toolItemsAdded(items: (FunctionCall | FunctionCallOutput)[]): void {
|
|
534
|
+
this._chatCtx.insert(items);
|
|
535
|
+
}
|
|
536
|
+
|
|
455
537
|
/** @internal */
|
|
456
538
|
_updateAgentState(state: AgentState) {
|
|
457
539
|
if (this._agentState === state) {
|
|
458
540
|
return;
|
|
459
541
|
}
|
|
460
542
|
|
|
461
|
-
|
|
462
|
-
|
|
543
|
+
if (state === 'speaking') {
|
|
544
|
+
// TODO(brian): PR4 - Track error counts
|
|
545
|
+
|
|
546
|
+
if (this.agentSpeakingSpan === undefined) {
|
|
547
|
+
this.agentSpeakingSpan = tracer.startSpan({
|
|
548
|
+
name: 'agent_speaking',
|
|
549
|
+
context: this.rootSpanContext,
|
|
550
|
+
});
|
|
551
|
+
|
|
552
|
+
// TODO(brian): PR4 - Set participant attributes if roomIO.room.localParticipant is available
|
|
553
|
+
// (Ref: Python agent_session.py line 1161-1164)
|
|
554
|
+
}
|
|
555
|
+
} else if (this.agentSpeakingSpan !== undefined) {
|
|
556
|
+
// TODO(brian): PR4 - Set ATTR_END_TIME attribute if available
|
|
557
|
+
this.agentSpeakingSpan.end();
|
|
558
|
+
this.agentSpeakingSpan = undefined;
|
|
559
|
+
}
|
|
560
|
+
|
|
463
561
|
const oldState = this._agentState;
|
|
464
562
|
this._agentState = state;
|
|
465
563
|
|
|
@@ -482,8 +580,20 @@ export class AgentSession<
|
|
|
482
580
|
return;
|
|
483
581
|
}
|
|
484
582
|
|
|
485
|
-
|
|
486
|
-
|
|
583
|
+
if (state === 'speaking' && this.userSpeakingSpan === undefined) {
|
|
584
|
+
this.userSpeakingSpan = tracer.startSpan({
|
|
585
|
+
name: 'user_speaking',
|
|
586
|
+
context: this.rootSpanContext,
|
|
587
|
+
});
|
|
588
|
+
|
|
589
|
+
// TODO(brian): PR4 - Set participant attributes if roomIO.linkedParticipant is available
|
|
590
|
+
// (Ref: Python agent_session.py line 1192-1195)
|
|
591
|
+
} else if (this.userSpeakingSpan !== undefined) {
|
|
592
|
+
// TODO(brian): PR4 - Set ATTR_END_TIME attribute with lastSpeakingTime if available
|
|
593
|
+
this.userSpeakingSpan.end();
|
|
594
|
+
this.userSpeakingSpan = undefined;
|
|
595
|
+
}
|
|
596
|
+
|
|
487
597
|
const oldState = this.userState;
|
|
488
598
|
this.userState = state;
|
|
489
599
|
|
|
@@ -550,19 +660,33 @@ export class AgentSession<
|
|
|
550
660
|
reason: CloseReason,
|
|
551
661
|
error: RealtimeModelError | LLMError | TTSError | STTError | null = null,
|
|
552
662
|
drain: boolean = false,
|
|
663
|
+
): Promise<void> {
|
|
664
|
+
if (this.rootSpanContext) {
|
|
665
|
+
return otelContext.with(this.rootSpanContext, async () => {
|
|
666
|
+
await this.closeImplInner(reason, error, drain);
|
|
667
|
+
});
|
|
668
|
+
}
|
|
669
|
+
|
|
670
|
+
return this.closeImplInner(reason, error, drain);
|
|
671
|
+
}
|
|
672
|
+
|
|
673
|
+
private async closeImplInner(
|
|
674
|
+
reason: CloseReason,
|
|
675
|
+
error: RealtimeModelError | LLMError | TTSError | STTError | null = null,
|
|
676
|
+
drain: boolean = false,
|
|
553
677
|
): Promise<void> {
|
|
554
678
|
if (!this.started) {
|
|
555
679
|
return;
|
|
556
680
|
}
|
|
557
681
|
|
|
558
682
|
this._cancelUserAwayTimer();
|
|
683
|
+
this.off(AgentSessionEventTypes.UserInputTranscribed, this._onUserInputTranscribed);
|
|
559
684
|
|
|
560
685
|
if (this.activity) {
|
|
561
686
|
if (!drain) {
|
|
562
687
|
try {
|
|
563
688
|
this.activity.interrupt();
|
|
564
689
|
} catch (error) {
|
|
565
|
-
// uninterruptible speech [copied from python]
|
|
566
690
|
// TODO(shubhra): force interrupt or wait for it to finish?
|
|
567
691
|
// it might be an audio played from the error callback
|
|
568
692
|
}
|
|
@@ -584,12 +708,28 @@ export class AgentSession<
|
|
|
584
708
|
await this.activity?.close();
|
|
585
709
|
this.activity = undefined;
|
|
586
710
|
|
|
711
|
+
if (this.sessionSpan) {
|
|
712
|
+
this.sessionSpan.end();
|
|
713
|
+
this.sessionSpan = undefined;
|
|
714
|
+
}
|
|
715
|
+
|
|
716
|
+
if (this.userSpeakingSpan) {
|
|
717
|
+
this.userSpeakingSpan.end();
|
|
718
|
+
this.userSpeakingSpan = undefined;
|
|
719
|
+
}
|
|
720
|
+
|
|
721
|
+
if (this.agentSpeakingSpan) {
|
|
722
|
+
this.agentSpeakingSpan.end();
|
|
723
|
+
this.agentSpeakingSpan = undefined;
|
|
724
|
+
}
|
|
725
|
+
|
|
587
726
|
this.started = false;
|
|
588
727
|
|
|
589
728
|
this.emit(AgentSessionEventTypes.Close, createCloseEvent(reason, error));
|
|
590
729
|
|
|
591
730
|
this.userState = 'listening';
|
|
592
731
|
this._agentState = 'initializing';
|
|
732
|
+
this.rootSpanContext = undefined;
|
|
593
733
|
|
|
594
734
|
this.logger.info({ reason, error }, 'AgentSession closed');
|
|
595
735
|
}
|
|
package/src/voice/audio_recognition.ts
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
//
|
|
3
3
|
// SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
import { AudioFrame } from '@livekit/rtc-node';
|
|
5
|
+
import type { Context, Span } from '@opentelemetry/api';
|
|
5
6
|
import type { WritableStreamDefaultWriter } from 'node:stream/web';
|
|
6
7
|
import { ReadableStream } from 'node:stream/web';
|
|
7
8
|
import { type ChatContext } from '../llm/chat_context.js';
|
|
@@ -10,6 +11,7 @@ import { DeferredReadableStream, isStreamReaderReleaseError } from '../stream/de
|
|
|
10
11
|
import { IdentityTransform } from '../stream/identity_transform.js';
|
|
11
12
|
import { mergeReadableStreams } from '../stream/merge_readable_streams.js';
|
|
12
13
|
import { type SpeechEvent, SpeechEventType } from '../stt/stt.js';
|
|
14
|
+
import { traceTypes, tracer } from '../telemetry/index.js';
|
|
13
15
|
import { Task, delay } from '../utils.js';
|
|
14
16
|
import { type VAD, type VADEvent, VADEventType } from '../vad.js';
|
|
15
17
|
import type { TurnDetectionMode } from './agent_session.js';
|
|
@@ -55,10 +57,9 @@ export interface AudioRecognitionOptions {
|
|
|
55
57
|
turnDetectionMode?: Exclude<TurnDetectionMode, _TurnDetector>;
|
|
56
58
|
minEndpointingDelay: number;
|
|
57
59
|
maxEndpointingDelay: number;
|
|
60
|
+
rootSpanContext?: Context;
|
|
58
61
|
}
|
|
59
62
|
|
|
60
|
-
// TODO(brian): PR3 - Add span: private _userTurnSpan?: Span, create lazily in _ensureUserTurnSpan() method (tracer.startSpan('user_turn') with participant attributes)
|
|
61
|
-
// TODO(brian): PR3 - Add span: 'eou_detection' span when running EOU detection (in runEOUDetection method)
|
|
62
63
|
export class AudioRecognition {
|
|
63
64
|
private hooks: RecognitionHooks;
|
|
64
65
|
private stt?: STTNode;
|
|
@@ -68,6 +69,7 @@ export class AudioRecognition {
|
|
|
68
69
|
private minEndpointingDelay: number;
|
|
69
70
|
private maxEndpointingDelay: number;
|
|
70
71
|
private lastLanguage?: string;
|
|
72
|
+
private rootSpanContext?: Context;
|
|
71
73
|
|
|
72
74
|
private deferredInputStream: DeferredReadableStream<AudioFrame>;
|
|
73
75
|
private logger = log();
|
|
@@ -82,6 +84,8 @@ export class AudioRecognition {
|
|
|
82
84
|
private speaking = false;
|
|
83
85
|
private sampleRate?: number;
|
|
84
86
|
|
|
87
|
+
private userTurnSpan?: Span;
|
|
88
|
+
|
|
85
89
|
private vadInputStream: ReadableStream<AudioFrame>;
|
|
86
90
|
private sttInputStream: ReadableStream<AudioFrame>;
|
|
87
91
|
private silenceAudioTransform = new IdentityTransform<AudioFrame>();
|
|
@@ -102,6 +106,7 @@ export class AudioRecognition {
|
|
|
102
106
|
this.minEndpointingDelay = opts.minEndpointingDelay;
|
|
103
107
|
this.maxEndpointingDelay = opts.maxEndpointingDelay;
|
|
104
108
|
this.lastLanguage = undefined;
|
|
109
|
+
this.rootSpanContext = opts.rootSpanContext;
|
|
105
110
|
|
|
106
111
|
this.deferredInputStream = new DeferredReadableStream<AudioFrame>();
|
|
107
112
|
const [vadInputStream, sttInputStream] = this.deferredInputStream.stream.tee();
|
|
@@ -357,31 +362,47 @@ export class AudioRecognition {
|
|
|
357
362
|
let endpointingDelay = this.minEndpointingDelay;
|
|
358
363
|
|
|
359
364
|
if (turnDetector) {
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
365
|
+
await tracer.startActiveSpan(
|
|
366
|
+
async (span) => {
|
|
367
|
+
this.logger.debug('Running turn detector model');
|
|
368
|
+
|
|
369
|
+
let endOfTurnProbability = 0.0;
|
|
370
|
+
let unlikelyThreshold: number | undefined;
|
|
371
|
+
|
|
372
|
+
if (!(await turnDetector.supportsLanguage(this.lastLanguage))) {
|
|
373
|
+
this.logger.debug(`Turn detector does not support language ${this.lastLanguage}`);
|
|
374
|
+
} else {
|
|
375
|
+
try {
|
|
376
|
+
endOfTurnProbability = await turnDetector.predictEndOfTurn(chatCtx);
|
|
377
|
+
unlikelyThreshold = await turnDetector.unlikelyThreshold(this.lastLanguage);
|
|
378
|
+
|
|
379
|
+
this.logger.debug(
|
|
380
|
+
{ endOfTurnProbability, unlikelyThreshold, language: this.lastLanguage },
|
|
381
|
+
'end of turn probability',
|
|
382
|
+
);
|
|
383
|
+
|
|
384
|
+
if (unlikelyThreshold && endOfTurnProbability < unlikelyThreshold) {
|
|
385
|
+
endpointingDelay = this.maxEndpointingDelay;
|
|
386
|
+
}
|
|
387
|
+
} catch (error) {
|
|
388
|
+
this.logger.error(error, 'Error predicting end of turn');
|
|
389
|
+
}
|
|
390
|
+
}
|
|
380
391
|
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
392
|
+
span.setAttribute(
|
|
393
|
+
traceTypes.ATTR_CHAT_CTX,
|
|
394
|
+
JSON.stringify(chatCtx.toJSON({ excludeTimestamp: false })),
|
|
395
|
+
);
|
|
396
|
+
span.setAttribute(traceTypes.ATTR_EOU_PROBABILITY, endOfTurnProbability);
|
|
397
|
+
span.setAttribute(traceTypes.ATTR_EOU_UNLIKELY_THRESHOLD, unlikelyThreshold ?? 0);
|
|
398
|
+
span.setAttribute(traceTypes.ATTR_EOU_DELAY, endpointingDelay);
|
|
399
|
+
span.setAttribute(traceTypes.ATTR_EOU_LANGUAGE, this.lastLanguage ?? '');
|
|
400
|
+
},
|
|
401
|
+
{
|
|
402
|
+
name: 'eou_detection',
|
|
403
|
+
context: this.rootSpanContext,
|
|
404
|
+
},
|
|
405
|
+
);
|
|
385
406
|
}
|
|
386
407
|
|
|
387
408
|
let extraSleep = endpointingDelay;
|
|
@@ -430,6 +451,13 @@ export class AudioRecognition {
|
|
|
430
451
|
});
|
|
431
452
|
|
|
432
453
|
if (committed) {
|
|
454
|
+
this._endUserTurnSpan({
|
|
455
|
+
transcript: this.audioTranscript,
|
|
456
|
+
confidence: confidenceAvg,
|
|
457
|
+
transcriptionDelay: transcriptionDelay ?? 0,
|
|
458
|
+
endOfUtteranceDelay: endOfUtteranceDelay ?? 0,
|
|
459
|
+
});
|
|
460
|
+
|
|
433
461
|
// clear the transcript if the user turn was committed
|
|
434
462
|
this.audioTranscript = '';
|
|
435
463
|
this.finalTranscriptConfidence = [];
|
|
@@ -537,6 +565,13 @@ export class AudioRecognition {
|
|
|
537
565
|
this.hooks.onStartOfSpeech(ev);
|
|
538
566
|
this.speaking = true;
|
|
539
567
|
|
|
568
|
+
if (!this.userTurnSpan) {
|
|
569
|
+
this.userTurnSpan = tracer.startSpan({
|
|
570
|
+
name: 'user_turn',
|
|
571
|
+
context: this.rootSpanContext,
|
|
572
|
+
});
|
|
573
|
+
}
|
|
574
|
+
|
|
540
575
|
// Capture sample rate from the first VAD event if not already set
|
|
541
576
|
if (ev.frames.length > 0 && ev.frames[0]) {
|
|
542
577
|
this.sampleRate = ev.frames[0].sampleRate;
|
|
@@ -646,12 +681,36 @@ export class AudioRecognition {
|
|
|
646
681
|
|
|
647
682
|
async close() {
|
|
648
683
|
this.detachInputAudioStream();
|
|
684
|
+
this.silenceAudioWriter.releaseLock();
|
|
649
685
|
await this.commitUserTurnTask?.cancelAndWait();
|
|
650
686
|
await this.sttTask?.cancelAndWait();
|
|
651
687
|
await this.vadTask?.cancelAndWait();
|
|
652
688
|
await this.bounceEOUTask?.cancelAndWait();
|
|
653
689
|
}
|
|
654
690
|
|
|
691
|
+
private _endUserTurnSpan({
|
|
692
|
+
transcript,
|
|
693
|
+
confidence,
|
|
694
|
+
transcriptionDelay,
|
|
695
|
+
endOfUtteranceDelay,
|
|
696
|
+
}: {
|
|
697
|
+
transcript: string;
|
|
698
|
+
confidence: number;
|
|
699
|
+
transcriptionDelay: number;
|
|
700
|
+
endOfUtteranceDelay: number;
|
|
701
|
+
}): void {
|
|
702
|
+
if (this.userTurnSpan) {
|
|
703
|
+
this.userTurnSpan.setAttributes({
|
|
704
|
+
[traceTypes.ATTR_USER_TRANSCRIPT]: transcript,
|
|
705
|
+
[traceTypes.ATTR_TRANSCRIPT_CONFIDENCE]: confidence,
|
|
706
|
+
[traceTypes.ATTR_TRANSCRIPTION_DELAY]: transcriptionDelay,
|
|
707
|
+
[traceTypes.ATTR_END_OF_TURN_DELAY]: endOfUtteranceDelay,
|
|
708
|
+
});
|
|
709
|
+
this.userTurnSpan.end();
|
|
710
|
+
this.userTurnSpan = undefined;
|
|
711
|
+
}
|
|
712
|
+
}
|
|
713
|
+
|
|
655
714
|
private get vadBaseTurnDetection() {
|
|
656
715
|
return ['vad', undefined].includes(this.turnDetectionMode);
|
|
657
716
|
}
|
package/src/voice/generation.ts
CHANGED
|
@@ -3,6 +3,8 @@
|
|
|
3
3
|
// SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
import type { AudioFrame } from '@livekit/rtc-node';
|
|
5
5
|
import { AudioResampler } from '@livekit/rtc-node';
|
|
6
|
+
import type { Span } from '@opentelemetry/api';
|
|
7
|
+
import { context as otelContext } from '@opentelemetry/api';
|
|
6
8
|
import type { ReadableStream, ReadableStreamDefaultReader } from 'stream/web';
|
|
7
9
|
import {
|
|
8
10
|
type ChatContext,
|
|
@@ -21,6 +23,7 @@ import {
|
|
|
21
23
|
import { isZodSchema, parseZodSchema } from '../llm/zod-utils.js';
|
|
22
24
|
import { log } from '../log.js';
|
|
23
25
|
import { IdentityTransform } from '../stream/identity_transform.js';
|
|
26
|
+
import { traceTypes, tracer } from '../telemetry/index.js';
|
|
24
27
|
import { Future, Task, shortuuid, toError } from '../utils.js';
|
|
25
28
|
import { type Agent, type ModelSettings, asyncLocalStorage, isStopResponse } from './agent.js';
|
|
26
29
|
import type { AgentSession } from './agent_session.js';
|
|
@@ -377,7 +380,6 @@ export function updateInstructions(options: {
|
|
|
377
380
|
}
|
|
378
381
|
}
|
|
379
382
|
|
|
380
|
-
// TODO(brian): PR3 - Add @tracer.startActiveSpan('llm_node') decorator/wrapper
|
|
381
383
|
export function performLLMInference(
|
|
382
384
|
node: LLMNode,
|
|
383
385
|
chatCtx: ChatContext,
|
|
@@ -392,7 +394,13 @@ export function performLLMInference(
|
|
|
392
394
|
const toolCallWriter = toolCallStream.writable.getWriter();
|
|
393
395
|
const data = new _LLMGenerationData(textStream.readable, toolCallStream.readable);
|
|
394
396
|
|
|
395
|
-
const
|
|
397
|
+
const _performLLMInferenceImpl = async (signal: AbortSignal, span: Span) => {
|
|
398
|
+
span.setAttribute(
|
|
399
|
+
traceTypes.ATTR_CHAT_CTX,
|
|
400
|
+
JSON.stringify(chatCtx.toJSON({ excludeTimestamp: false })),
|
|
401
|
+
);
|
|
402
|
+
span.setAttribute(traceTypes.ATTR_FUNCTION_TOOLS, JSON.stringify(Object.keys(toolCtx)));
|
|
403
|
+
|
|
396
404
|
let llmStreamReader: ReadableStreamDefaultReader<string | ChatChunk> | null = null;
|
|
397
405
|
let llmStream: ReadableStream<string | ChatChunk> | null = null;
|
|
398
406
|
|
|
@@ -448,6 +456,8 @@ export function performLLMInference(
|
|
|
448
456
|
// No need to check if chunk is of type other than ChatChunk or string like in
|
|
449
457
|
// Python since chunk is defined in the type ChatChunk | string in TypeScript
|
|
450
458
|
}
|
|
459
|
+
|
|
460
|
+
span.setAttribute(traceTypes.ATTR_RESPONSE_TEXT, data.generatedText);
|
|
451
461
|
} catch (error) {
|
|
452
462
|
if (error instanceof DOMException && error.name === 'AbortError') {
|
|
453
463
|
// Abort signal was triggered, handle gracefully
|
|
@@ -462,13 +472,21 @@ export function performLLMInference(
|
|
|
462
472
|
}
|
|
463
473
|
};
|
|
464
474
|
|
|
475
|
+
// Capture the current context (agent_turn) to ensure llm_node is properly parented
|
|
476
|
+
const currentContext = otelContext.active();
|
|
477
|
+
|
|
478
|
+
const inferenceTask = async (signal: AbortSignal) =>
|
|
479
|
+
tracer.startActiveSpan(async (span) => _performLLMInferenceImpl(signal, span), {
|
|
480
|
+
name: 'llm_node',
|
|
481
|
+
context: currentContext,
|
|
482
|
+
});
|
|
483
|
+
|
|
465
484
|
return [
|
|
466
485
|
Task.from((controller) => inferenceTask(controller.signal), controller, 'performLLMInference'),
|
|
467
486
|
data,
|
|
468
487
|
];
|
|
469
488
|
}
|
|
470
489
|
|
|
471
|
-
// TODO(brian): PR3 - Add @tracer.startActiveSpan('tts_node') decorator/wrapper
|
|
472
490
|
export function performTTSInference(
|
|
473
491
|
node: TTSNode,
|
|
474
492
|
text: ReadableStream<string>,
|
|
@@ -479,7 +497,7 @@ export function performTTSInference(
|
|
|
479
497
|
const outputWriter = audioStream.writable.getWriter();
|
|
480
498
|
const audioOutputStream = audioStream.readable;
|
|
481
499
|
|
|
482
|
-
const
|
|
500
|
+
const _performTTSInferenceImpl = async (signal: AbortSignal) => {
|
|
483
501
|
let ttsStreamReader: ReadableStreamDefaultReader<AudioFrame> | null = null;
|
|
484
502
|
let ttsStream: ReadableStream<AudioFrame> | null = null;
|
|
485
503
|
|
|
@@ -514,6 +532,15 @@ export function performTTSInference(
|
|
|
514
532
|
}
|
|
515
533
|
};
|
|
516
534
|
|
|
535
|
+
// Capture the current context (agent_turn) to ensure tts_node is properly parented
|
|
536
|
+
const currentContext = otelContext.active();
|
|
537
|
+
|
|
538
|
+
const inferenceTask = async (signal: AbortSignal) =>
|
|
539
|
+
tracer.startActiveSpan(async () => _performTTSInferenceImpl(signal), {
|
|
540
|
+
name: 'tts_node',
|
|
541
|
+
context: currentContext,
|
|
542
|
+
});
|
|
543
|
+
|
|
517
544
|
return [
|
|
518
545
|
Task.from((controller) => inferenceTask(controller.signal), controller, 'performTTSInference'),
|
|
519
546
|
audioOutputStream,
|
|
@@ -652,7 +679,7 @@ export function performAudioForwarding(
|
|
|
652
679
|
];
|
|
653
680
|
}
|
|
654
681
|
|
|
655
|
-
//
|
|
682
|
+
// function_tool span is already implemented in tracableToolExecution below (line ~796)
|
|
656
683
|
export function performToolExecutions({
|
|
657
684
|
session,
|
|
658
685
|
speechHandle,
|
|
@@ -788,8 +815,9 @@ export function performToolExecutions({
|
|
|
788
815
|
});
|
|
789
816
|
});
|
|
790
817
|
|
|
791
|
-
const
|
|
792
|
-
|
|
818
|
+
const _tracableToolExecutionImpl = async (toolExecTask: Promise<unknown>, span: Span) => {
|
|
819
|
+
span.setAttribute(traceTypes.ATTR_FUNCTION_TOOL_NAME, toolCall.name);
|
|
820
|
+
span.setAttribute(traceTypes.ATTR_FUNCTION_TOOL_ARGS, toolCall.args);
|
|
793
821
|
|
|
794
822
|
// await for task to complete, if task is aborted, set exception
|
|
795
823
|
let toolOutput: ToolExecutionOutput | undefined;
|
|
@@ -800,6 +828,17 @@ export function performToolExecutions({
|
|
|
800
828
|
exception: isAborted ? new Error('tool call was aborted') : undefined,
|
|
801
829
|
output: isAborted ? undefined : result,
|
|
802
830
|
});
|
|
831
|
+
|
|
832
|
+
if (toolOutput.toolCallOutput) {
|
|
833
|
+
span.setAttribute(
|
|
834
|
+
traceTypes.ATTR_FUNCTION_TOOL_OUTPUT,
|
|
835
|
+
toolOutput.toolCallOutput.output,
|
|
836
|
+
);
|
|
837
|
+
span.setAttribute(
|
|
838
|
+
traceTypes.ATTR_FUNCTION_TOOL_IS_ERROR,
|
|
839
|
+
toolOutput.toolCallOutput.isError,
|
|
840
|
+
);
|
|
841
|
+
}
|
|
803
842
|
} catch (rawError) {
|
|
804
843
|
logger.error(
|
|
805
844
|
{
|
|
@@ -813,12 +852,25 @@ export function performToolExecutions({
|
|
|
813
852
|
toolCall,
|
|
814
853
|
exception: toError(rawError),
|
|
815
854
|
});
|
|
855
|
+
|
|
856
|
+
if (toolOutput.toolCallOutput) {
|
|
857
|
+
span.setAttribute(
|
|
858
|
+
traceTypes.ATTR_FUNCTION_TOOL_OUTPUT,
|
|
859
|
+
toolOutput.toolCallOutput.output,
|
|
860
|
+
);
|
|
861
|
+
span.setAttribute(traceTypes.ATTR_FUNCTION_TOOL_IS_ERROR, true);
|
|
862
|
+
}
|
|
816
863
|
} finally {
|
|
817
864
|
if (!toolOutput) throw new Error('toolOutput is undefined');
|
|
818
865
|
toolCompleted(toolOutput);
|
|
819
866
|
}
|
|
820
867
|
};
|
|
821
868
|
|
|
869
|
+
const tracableToolExecution = (toolExecTask: Promise<unknown>) =>
|
|
870
|
+
tracer.startActiveSpan(async (span) => _tracableToolExecutionImpl(toolExecTask, span), {
|
|
871
|
+
name: 'function_tool',
|
|
872
|
+
});
|
|
873
|
+
|
|
822
874
|
// wait, not cancelling all tool calling tasks
|
|
823
875
|
tasks.push(tracableToolExecution(toolExecution));
|
|
824
876
|
}
|