@livekit/agents 1.0.21 → 1.0.23

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (149)
  1. package/dist/inference/api_protos.cjs +2 -2
  2. package/dist/inference/api_protos.cjs.map +1 -1
  3. package/dist/inference/api_protos.d.cts +16 -16
  4. package/dist/inference/api_protos.d.ts +16 -16
  5. package/dist/inference/api_protos.js +2 -2
  6. package/dist/inference/api_protos.js.map +1 -1
  7. package/dist/inference/stt.cjs +42 -30
  8. package/dist/inference/stt.cjs.map +1 -1
  9. package/dist/inference/stt.d.ts.map +1 -1
  10. package/dist/inference/stt.js +42 -30
  11. package/dist/inference/stt.js.map +1 -1
  12. package/dist/inference/tts.cjs +2 -3
  13. package/dist/inference/tts.cjs.map +1 -1
  14. package/dist/inference/tts.d.ts.map +1 -1
  15. package/dist/inference/tts.js +2 -3
  16. package/dist/inference/tts.js.map +1 -1
  17. package/dist/ipc/job_proc_lazy_main.cjs +35 -1
  18. package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
  19. package/dist/ipc/job_proc_lazy_main.js +13 -1
  20. package/dist/ipc/job_proc_lazy_main.js.map +1 -1
  21. package/dist/job.cjs +52 -6
  22. package/dist/job.cjs.map +1 -1
  23. package/dist/job.d.cts +2 -0
  24. package/dist/job.d.ts +2 -0
  25. package/dist/job.d.ts.map +1 -1
  26. package/dist/job.js +52 -6
  27. package/dist/job.js.map +1 -1
  28. package/dist/llm/llm.cjs +38 -3
  29. package/dist/llm/llm.cjs.map +1 -1
  30. package/dist/llm/llm.d.cts +1 -0
  31. package/dist/llm/llm.d.ts +1 -0
  32. package/dist/llm/llm.d.ts.map +1 -1
  33. package/dist/llm/llm.js +38 -3
  34. package/dist/llm/llm.js.map +1 -1
  35. package/dist/log.cjs +34 -10
  36. package/dist/log.cjs.map +1 -1
  37. package/dist/log.d.cts +7 -0
  38. package/dist/log.d.ts +7 -0
  39. package/dist/log.d.ts.map +1 -1
  40. package/dist/log.js +34 -11
  41. package/dist/log.js.map +1 -1
  42. package/dist/stt/stt.cjs +18 -5
  43. package/dist/stt/stt.cjs.map +1 -1
  44. package/dist/stt/stt.d.ts.map +1 -1
  45. package/dist/stt/stt.js +18 -5
  46. package/dist/stt/stt.js.map +1 -1
  47. package/dist/telemetry/index.cjs +23 -2
  48. package/dist/telemetry/index.cjs.map +1 -1
  49. package/dist/telemetry/index.d.cts +4 -1
  50. package/dist/telemetry/index.d.ts +4 -1
  51. package/dist/telemetry/index.d.ts.map +1 -1
  52. package/dist/telemetry/index.js +27 -2
  53. package/dist/telemetry/index.js.map +1 -1
  54. package/dist/telemetry/logging.cjs +65 -0
  55. package/dist/telemetry/logging.cjs.map +1 -0
  56. package/dist/telemetry/logging.d.cts +21 -0
  57. package/dist/telemetry/logging.d.ts +21 -0
  58. package/dist/telemetry/logging.d.ts.map +1 -0
  59. package/dist/telemetry/logging.js +40 -0
  60. package/dist/telemetry/logging.js.map +1 -0
  61. package/dist/telemetry/otel_http_exporter.cjs +144 -0
  62. package/dist/telemetry/otel_http_exporter.cjs.map +1 -0
  63. package/dist/telemetry/otel_http_exporter.d.cts +62 -0
  64. package/dist/telemetry/otel_http_exporter.d.ts +62 -0
  65. package/dist/telemetry/otel_http_exporter.d.ts.map +1 -0
  66. package/dist/telemetry/otel_http_exporter.js +120 -0
  67. package/dist/telemetry/otel_http_exporter.js.map +1 -0
  68. package/dist/telemetry/pino_otel_transport.cjs +217 -0
  69. package/dist/telemetry/pino_otel_transport.cjs.map +1 -0
  70. package/dist/telemetry/pino_otel_transport.d.cts +58 -0
  71. package/dist/telemetry/pino_otel_transport.d.ts +58 -0
  72. package/dist/telemetry/pino_otel_transport.d.ts.map +1 -0
  73. package/dist/telemetry/pino_otel_transport.js +189 -0
  74. package/dist/telemetry/pino_otel_transport.js.map +1 -0
  75. package/dist/telemetry/traces.cjs +225 -16
  76. package/dist/telemetry/traces.cjs.map +1 -1
  77. package/dist/telemetry/traces.d.cts +17 -0
  78. package/dist/telemetry/traces.d.ts +17 -0
  79. package/dist/telemetry/traces.d.ts.map +1 -1
  80. package/dist/telemetry/traces.js +211 -14
  81. package/dist/telemetry/traces.js.map +1 -1
  82. package/dist/tts/tts.cjs +68 -20
  83. package/dist/tts/tts.cjs.map +1 -1
  84. package/dist/tts/tts.d.cts +2 -0
  85. package/dist/tts/tts.d.ts +2 -0
  86. package/dist/tts/tts.d.ts.map +1 -1
  87. package/dist/tts/tts.js +68 -20
  88. package/dist/tts/tts.js.map +1 -1
  89. package/dist/utils.cjs +6 -0
  90. package/dist/utils.cjs.map +1 -1
  91. package/dist/utils.d.cts +1 -0
  92. package/dist/utils.d.ts +1 -0
  93. package/dist/utils.d.ts.map +1 -1
  94. package/dist/utils.js +5 -0
  95. package/dist/utils.js.map +1 -1
  96. package/dist/voice/agent_activity.cjs +93 -7
  97. package/dist/voice/agent_activity.cjs.map +1 -1
  98. package/dist/voice/agent_activity.d.cts +3 -0
  99. package/dist/voice/agent_activity.d.ts +3 -0
  100. package/dist/voice/agent_activity.d.ts.map +1 -1
  101. package/dist/voice/agent_activity.js +93 -7
  102. package/dist/voice/agent_activity.js.map +1 -1
  103. package/dist/voice/agent_session.cjs +122 -27
  104. package/dist/voice/agent_session.cjs.map +1 -1
  105. package/dist/voice/agent_session.d.cts +15 -0
  106. package/dist/voice/agent_session.d.ts +15 -0
  107. package/dist/voice/agent_session.d.ts.map +1 -1
  108. package/dist/voice/agent_session.js +122 -27
  109. package/dist/voice/agent_session.js.map +1 -1
  110. package/dist/voice/audio_recognition.cjs +69 -22
  111. package/dist/voice/audio_recognition.cjs.map +1 -1
  112. package/dist/voice/audio_recognition.d.cts +5 -0
  113. package/dist/voice/audio_recognition.d.ts +5 -0
  114. package/dist/voice/audio_recognition.d.ts.map +1 -1
  115. package/dist/voice/audio_recognition.js +69 -22
  116. package/dist/voice/audio_recognition.js.map +1 -1
  117. package/dist/voice/generation.cjs +43 -3
  118. package/dist/voice/generation.cjs.map +1 -1
  119. package/dist/voice/generation.d.ts.map +1 -1
  120. package/dist/voice/generation.js +43 -3
  121. package/dist/voice/generation.js.map +1 -1
  122. package/dist/voice/report.cjs +3 -2
  123. package/dist/voice/report.cjs.map +1 -1
  124. package/dist/voice/report.d.cts +7 -1
  125. package/dist/voice/report.d.ts +7 -1
  126. package/dist/voice/report.d.ts.map +1 -1
  127. package/dist/voice/report.js +3 -2
  128. package/dist/voice/report.js.map +1 -1
  129. package/package.json +8 -2
  130. package/src/inference/api_protos.ts +2 -2
  131. package/src/inference/stt.ts +48 -33
  132. package/src/inference/tts.ts +4 -3
  133. package/src/ipc/job_proc_lazy_main.ts +12 -1
  134. package/src/job.ts +59 -10
  135. package/src/llm/llm.ts +48 -5
  136. package/src/log.ts +52 -15
  137. package/src/stt/stt.ts +18 -5
  138. package/src/telemetry/index.ts +22 -4
  139. package/src/telemetry/logging.ts +55 -0
  140. package/src/telemetry/otel_http_exporter.ts +191 -0
  141. package/src/telemetry/pino_otel_transport.ts +265 -0
  142. package/src/telemetry/traces.ts +320 -20
  143. package/src/tts/tts.ts +85 -24
  144. package/src/utils.ts +5 -0
  145. package/src/voice/agent_activity.ts +140 -22
  146. package/src/voice/agent_session.ts +174 -34
  147. package/src/voice/audio_recognition.ts +85 -26
  148. package/src/voice/generation.ts +59 -7
  149. package/src/voice/report.ts +10 -4
@@ -3,6 +3,8 @@
3
3
  // SPDX-License-Identifier: Apache-2.0
4
4
  import type { AudioFrame, Room } from '@livekit/rtc-node';
5
5
  import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';
6
+ import type { Context, Span } from '@opentelemetry/api';
7
+ import { ROOT_CONTEXT, context as otelContext, trace } from '@opentelemetry/api';
6
8
  import { EventEmitter } from 'node:events';
7
9
  import type { ReadableStream } from 'node:stream/web';
8
10
  import {
@@ -14,12 +16,14 @@ import {
14
16
  type TTSModelString,
15
17
  } from '../inference/index.js';
16
18
  import { getJobContext } from '../job.js';
19
+ import type { FunctionCall, FunctionCallOutput } from '../llm/chat_context.js';
17
20
  import { AgentHandoffItem, ChatContext, ChatMessage } from '../llm/chat_context.js';
18
21
  import type { LLM, RealtimeModel, RealtimeModelError, ToolChoice } from '../llm/index.js';
19
22
  import type { LLMError } from '../llm/llm.js';
20
23
  import { log } from '../log.js';
21
24
  import type { STT } from '../stt/index.js';
22
25
  import type { STTError } from '../stt/stt.js';
26
+ import { traceTypes, tracer } from '../telemetry/index.js';
23
27
  import type { TTS, TTSError } from '../tts/tts.js';
24
28
  import type { VAD } from '../vad.js';
25
29
  import type { Agent } from './agent.js';
@@ -128,9 +132,22 @@ export class AgentSession<
128
132
  private closingTask: Promise<void> | null = null;
129
133
  private userAwayTimer: NodeJS.Timeout | null = null;
130
134
 
135
+ private sessionSpan?: Span;
136
+ private userSpeakingSpan?: Span;
137
+ private agentSpeakingSpan?: Span;
138
+
139
+ /** @internal */
140
+ rootSpanContext?: Context;
141
+
131
142
  /** @internal */
132
143
  _recordedEvents: AgentEvent[] = [];
133
144
 
145
+ /** @internal */
146
+ _enableRecording = false;
147
+
148
+ /** @internal - Timestamp when the session started (milliseconds) */
149
+ _startedAt?: number;
150
+
134
151
  constructor(opts: AgentSessionOptions<UserData>) {
135
152
  super();
136
153
 
@@ -175,7 +192,8 @@ export class AgentSession<
175
192
  this._chatCtx = ChatContext.empty();
176
193
  this.options = { ...defaultVoiceOptions, ...voiceOptions };
177
194
 
178
- this.on(AgentSessionEventTypes.UserInputTranscribed, this._onUserInputTranscribed.bind(this));
195
+ this._onUserInputTranscribed = this._onUserInputTranscribed.bind(this);
196
+ this.on(AgentSessionEventTypes.UserInputTranscribed, this._onUserInputTranscribed);
179
197
  }
180
198
 
181
199
  emit<K extends keyof AgentSessionCallbacks>(
@@ -211,25 +229,22 @@ export class AgentSession<
211
229
  this._userData = value;
212
230
  }
213
231
 
214
- async start({
215
- // TODO(brian): PR2 - Add setupCloudTracer() call if on LiveKit Cloud with recording enabled
216
- // TODO(brian): PR3 - Add span: this._sessionSpan = tracer.startSpan('agent_session'), store as instance property
217
- // TODO(brian): PR4 - Add setupCloudLogger() call in setupCloudTracer() to setup OTEL logging with Pino bridge
232
+ private async _startImpl({
218
233
  agent,
219
234
  room,
220
235
  inputOptions,
221
236
  outputOptions,
222
- record = true,
237
+ record,
238
+ span,
223
239
  }: {
224
240
  agent: Agent;
225
241
  room: Room;
226
242
  inputOptions?: Partial<RoomInputOptions>;
227
243
  outputOptions?: Partial<RoomOutputOptions>;
228
- record?: boolean;
244
+ record: boolean;
245
+ span: Span;
229
246
  }): Promise<void> {
230
- if (this.started) {
231
- return;
232
- }
247
+ span.setAttribute(traceTypes.ATTR_AGENT_LABEL, agent.id);
233
248
 
234
249
  this.agent = agent;
235
250
  this._updateAgentState('initializing');
@@ -291,9 +306,62 @@ export class AgentSession<
291
306
  );
292
307
 
293
308
  this.started = true;
309
+ this._startedAt = Date.now();
294
310
  this._updateAgentState('listening');
295
311
  }
296
312
 
313
+ async start({
314
+ agent,
315
+ room,
316
+ inputOptions,
317
+ outputOptions,
318
+ record = true,
319
+ }: {
320
+ agent: Agent;
321
+ room: Room;
322
+ inputOptions?: Partial<RoomInputOptions>;
323
+ outputOptions?: Partial<RoomOutputOptions>;
324
+ record?: boolean;
325
+ }): Promise<void> {
326
+ if (this.started) {
327
+ return;
328
+ }
329
+
330
+ const ctx = getJobContext();
331
+
332
+ record = record ?? ctx.info.job.enableRecording;
333
+ this._enableRecording = record;
334
+
335
+ this.logger.info(
336
+ { record, enableRecording: ctx.info.job.enableRecording },
337
+ 'Configuring session recording',
338
+ );
339
+
340
+ if (this._enableRecording) {
341
+ await ctx.initRecording();
342
+ }
343
+
344
+ // Create agent_session as a ROOT span (new trace) to match Python behavior
345
+ // This creates a separate trace for better cloud dashboard organization
346
+ this.sessionSpan = tracer.startSpan({
347
+ name: 'agent_session',
348
+ context: ROOT_CONTEXT,
349
+ });
350
+
351
+ // Set the session span as the active span in the context
352
+ // This ensures all child spans (agent_turn, user_turn, etc.) are parented to it
353
+ this.rootSpanContext = trace.setSpan(ROOT_CONTEXT, this.sessionSpan);
354
+
355
+ await this._startImpl({
356
+ agent,
357
+ room,
358
+ inputOptions,
359
+ outputOptions,
360
+ record,
361
+ span: this.sessionSpan,
362
+ });
363
+ }
364
+
297
365
  updateAgent(agent: Agent): void {
298
366
  this.agent = agent;
299
367
 
@@ -367,32 +435,41 @@ export class AgentSession<
367
435
  }
368
436
 
369
437
  private async updateActivity(agent: Agent): Promise<void> {
370
- // TODO(AJS-129): add lock to agent activity core lifecycle
371
- this.nextActivity = new AgentActivity(agent, this);
438
+ const runWithContext = async () => {
439
+ // TODO(AJS-129): add lock to agent activity core lifecycle
440
+ this.nextActivity = new AgentActivity(agent, this);
372
441
 
373
- const previousActivity = this.activity;
442
+ const previousActivity = this.activity;
374
443
 
375
- if (this.activity) {
376
- await this.activity.drain();
377
- await this.activity.close();
378
- }
444
+ if (this.activity) {
445
+ await this.activity.drain();
446
+ await this.activity.close();
447
+ }
379
448
 
380
- this.activity = this.nextActivity;
381
- this.nextActivity = undefined;
449
+ this.activity = this.nextActivity;
450
+ this.nextActivity = undefined;
382
451
 
383
- this._chatCtx.insert(
384
- new AgentHandoffItem({
385
- oldAgentId: previousActivity?.agent.id,
386
- newAgentId: agent.id,
387
- }),
388
- );
389
- this.logger.debug({ previousActivity, agent }, 'Agent handoff inserted into chat context');
452
+ this._chatCtx.insert(
453
+ new AgentHandoffItem({
454
+ oldAgentId: previousActivity?.agent.id,
455
+ newAgentId: agent.id,
456
+ }),
457
+ );
458
+ this.logger.debug({ previousActivity, agent }, 'Agent handoff inserted into chat context');
390
459
 
391
- await this.activity.start();
460
+ await this.activity.start();
392
461
 
393
- if (this._input.audio) {
394
- this.activity.attachAudioInput(this._input.audio.stream);
462
+ if (this._input.audio) {
463
+ this.activity.attachAudioInput(this._input.audio.stream);
464
+ }
465
+ };
466
+
467
+ // Run within session span context if available
468
+ if (this.rootSpanContext) {
469
+ return otelContext.with(this.rootSpanContext, runWithContext);
395
470
  }
471
+
472
+ return runWithContext();
396
473
  }
397
474
 
398
475
  get chatCtx(): ChatContext {
@@ -452,14 +529,35 @@ export class AgentSession<
452
529
  this.emit(AgentSessionEventTypes.ConversationItemAdded, createConversationItemAddedEvent(item));
453
530
  }
454
531
 
532
+ /** @internal */
533
+ _toolItemsAdded(items: (FunctionCall | FunctionCallOutput)[]): void {
534
+ this._chatCtx.insert(items);
535
+ }
536
+
455
537
  /** @internal */
456
538
  _updateAgentState(state: AgentState) {
457
539
  if (this._agentState === state) {
458
540
  return;
459
541
  }
460
542
 
461
- // TODO(brian): PR3 - Add span: if state === 'speaking' && !this._agentSpeakingSpan, create tracer.startSpan('agent_speaking') with participant attributes
462
- // TODO(brian): PR3 - Add span: if state !== 'speaking' && this._agentSpeakingSpan, end and clear this._agentSpeakingSpan
543
+ if (state === 'speaking') {
544
+ // TODO(brian): PR4 - Track error counts
545
+
546
+ if (this.agentSpeakingSpan === undefined) {
547
+ this.agentSpeakingSpan = tracer.startSpan({
548
+ name: 'agent_speaking',
549
+ context: this.rootSpanContext,
550
+ });
551
+
552
+ // TODO(brian): PR4 - Set participant attributes if roomIO.room.localParticipant is available
553
+ // (Ref: Python agent_session.py line 1161-1164)
554
+ }
555
+ } else if (this.agentSpeakingSpan !== undefined) {
556
+ // TODO(brian): PR4 - Set ATTR_END_TIME attribute if available
557
+ this.agentSpeakingSpan.end();
558
+ this.agentSpeakingSpan = undefined;
559
+ }
560
+
463
561
  const oldState = this._agentState;
464
562
  this._agentState = state;
465
563
 
@@ -482,8 +580,20 @@ export class AgentSession<
482
580
  return;
483
581
  }
484
582
 
485
- // TODO(brian): PR3 - Add span: if state === 'speaking' && !this._userSpeakingSpan, create tracer.startSpan('user_speaking') with participant attributes
486
- // TODO(brian): PR3 - Add span: if state !== 'speaking' && this._userSpeakingSpan, end and clear this._userSpeakingSpan
583
+ if (state === 'speaking' && this.userSpeakingSpan === undefined) {
584
+ this.userSpeakingSpan = tracer.startSpan({
585
+ name: 'user_speaking',
586
+ context: this.rootSpanContext,
587
+ });
588
+
589
+ // TODO(brian): PR4 - Set participant attributes if roomIO.linkedParticipant is available
590
+ // (Ref: Python agent_session.py line 1192-1195)
591
+ } else if (this.userSpeakingSpan !== undefined) {
592
+ // TODO(brian): PR4 - Set ATTR_END_TIME attribute with lastSpeakingTime if available
593
+ this.userSpeakingSpan.end();
594
+ this.userSpeakingSpan = undefined;
595
+ }
596
+
487
597
  const oldState = this.userState;
488
598
  this.userState = state;
489
599
 
@@ -550,19 +660,33 @@ export class AgentSession<
550
660
  reason: CloseReason,
551
661
  error: RealtimeModelError | LLMError | TTSError | STTError | null = null,
552
662
  drain: boolean = false,
663
+ ): Promise<void> {
664
+ if (this.rootSpanContext) {
665
+ return otelContext.with(this.rootSpanContext, async () => {
666
+ await this.closeImplInner(reason, error, drain);
667
+ });
668
+ }
669
+
670
+ return this.closeImplInner(reason, error, drain);
671
+ }
672
+
673
+ private async closeImplInner(
674
+ reason: CloseReason,
675
+ error: RealtimeModelError | LLMError | TTSError | STTError | null = null,
676
+ drain: boolean = false,
553
677
  ): Promise<void> {
554
678
  if (!this.started) {
555
679
  return;
556
680
  }
557
681
 
558
682
  this._cancelUserAwayTimer();
683
+ this.off(AgentSessionEventTypes.UserInputTranscribed, this._onUserInputTranscribed);
559
684
 
560
685
  if (this.activity) {
561
686
  if (!drain) {
562
687
  try {
563
688
  this.activity.interrupt();
564
689
  } catch (error) {
565
- // uninterruptible speech [copied from python]
566
690
  // TODO(shubhra): force interrupt or wait for it to finish?
567
691
  // it might be an audio played from the error callback
568
692
  }
@@ -584,12 +708,28 @@ export class AgentSession<
584
708
  await this.activity?.close();
585
709
  this.activity = undefined;
586
710
 
711
+ if (this.sessionSpan) {
712
+ this.sessionSpan.end();
713
+ this.sessionSpan = undefined;
714
+ }
715
+
716
+ if (this.userSpeakingSpan) {
717
+ this.userSpeakingSpan.end();
718
+ this.userSpeakingSpan = undefined;
719
+ }
720
+
721
+ if (this.agentSpeakingSpan) {
722
+ this.agentSpeakingSpan.end();
723
+ this.agentSpeakingSpan = undefined;
724
+ }
725
+
587
726
  this.started = false;
588
727
 
589
728
  this.emit(AgentSessionEventTypes.Close, createCloseEvent(reason, error));
590
729
 
591
730
  this.userState = 'listening';
592
731
  this._agentState = 'initializing';
732
+ this.rootSpanContext = undefined;
593
733
 
594
734
  this.logger.info({ reason, error }, 'AgentSession closed');
595
735
  }
@@ -2,6 +2,7 @@
2
2
  //
3
3
  // SPDX-License-Identifier: Apache-2.0
4
4
  import { AudioFrame } from '@livekit/rtc-node';
5
+ import type { Context, Span } from '@opentelemetry/api';
5
6
  import type { WritableStreamDefaultWriter } from 'node:stream/web';
6
7
  import { ReadableStream } from 'node:stream/web';
7
8
  import { type ChatContext } from '../llm/chat_context.js';
@@ -10,6 +11,7 @@ import { DeferredReadableStream, isStreamReaderReleaseError } from '../stream/de
10
11
  import { IdentityTransform } from '../stream/identity_transform.js';
11
12
  import { mergeReadableStreams } from '../stream/merge_readable_streams.js';
12
13
  import { type SpeechEvent, SpeechEventType } from '../stt/stt.js';
14
+ import { traceTypes, tracer } from '../telemetry/index.js';
13
15
  import { Task, delay } from '../utils.js';
14
16
  import { type VAD, type VADEvent, VADEventType } from '../vad.js';
15
17
  import type { TurnDetectionMode } from './agent_session.js';
@@ -55,10 +57,9 @@ export interface AudioRecognitionOptions {
55
57
  turnDetectionMode?: Exclude<TurnDetectionMode, _TurnDetector>;
56
58
  minEndpointingDelay: number;
57
59
  maxEndpointingDelay: number;
60
+ rootSpanContext?: Context;
58
61
  }
59
62
 
60
- // TODO(brian): PR3 - Add span: private _userTurnSpan?: Span, create lazily in _ensureUserTurnSpan() method (tracer.startSpan('user_turn') with participant attributes)
61
- // TODO(brian): PR3 - Add span: 'eou_detection' span when running EOU detection (in runEOUDetection method)
62
63
  export class AudioRecognition {
63
64
  private hooks: RecognitionHooks;
64
65
  private stt?: STTNode;
@@ -68,6 +69,7 @@ export class AudioRecognition {
68
69
  private minEndpointingDelay: number;
69
70
  private maxEndpointingDelay: number;
70
71
  private lastLanguage?: string;
72
+ private rootSpanContext?: Context;
71
73
 
72
74
  private deferredInputStream: DeferredReadableStream<AudioFrame>;
73
75
  private logger = log();
@@ -82,6 +84,8 @@ export class AudioRecognition {
82
84
  private speaking = false;
83
85
  private sampleRate?: number;
84
86
 
87
+ private userTurnSpan?: Span;
88
+
85
89
  private vadInputStream: ReadableStream<AudioFrame>;
86
90
  private sttInputStream: ReadableStream<AudioFrame>;
87
91
  private silenceAudioTransform = new IdentityTransform<AudioFrame>();
@@ -102,6 +106,7 @@ export class AudioRecognition {
102
106
  this.minEndpointingDelay = opts.minEndpointingDelay;
103
107
  this.maxEndpointingDelay = opts.maxEndpointingDelay;
104
108
  this.lastLanguage = undefined;
109
+ this.rootSpanContext = opts.rootSpanContext;
105
110
 
106
111
  this.deferredInputStream = new DeferredReadableStream<AudioFrame>();
107
112
  const [vadInputStream, sttInputStream] = this.deferredInputStream.stream.tee();
@@ -357,31 +362,47 @@ export class AudioRecognition {
357
362
  let endpointingDelay = this.minEndpointingDelay;
358
363
 
359
364
  if (turnDetector) {
360
- this.logger.debug('Running turn detector model');
361
- if (!(await turnDetector.supportsLanguage(this.lastLanguage))) {
362
- this.logger.debug(`Turn detector does not support language ${this.lastLanguage}`);
363
- } else {
364
- const endOfTurnProbability = await turnDetector.predictEndOfTurn(chatCtx);
365
- this.logger.debug(
366
- { endOfTurnProbability, language: this.lastLanguage },
367
- 'end of turn probability',
368
- );
369
-
370
- const unlikelyThreshold = await turnDetector.unlikelyThreshold(this.lastLanguage);
371
- this.logger.debug(
372
- {
373
- unlikelyThreshold,
374
- endOfTurnProbability,
375
- language: this.lastLanguage,
376
- transcript: this.audioTranscript,
377
- },
378
- 'EOU Detection',
379
- );
365
+ await tracer.startActiveSpan(
366
+ async (span) => {
367
+ this.logger.debug('Running turn detector model');
368
+
369
+ let endOfTurnProbability = 0.0;
370
+ let unlikelyThreshold: number | undefined;
371
+
372
+ if (!(await turnDetector.supportsLanguage(this.lastLanguage))) {
373
+ this.logger.debug(`Turn detector does not support language ${this.lastLanguage}`);
374
+ } else {
375
+ try {
376
+ endOfTurnProbability = await turnDetector.predictEndOfTurn(chatCtx);
377
+ unlikelyThreshold = await turnDetector.unlikelyThreshold(this.lastLanguage);
378
+
379
+ this.logger.debug(
380
+ { endOfTurnProbability, unlikelyThreshold, language: this.lastLanguage },
381
+ 'end of turn probability',
382
+ );
383
+
384
+ if (unlikelyThreshold && endOfTurnProbability < unlikelyThreshold) {
385
+ endpointingDelay = this.maxEndpointingDelay;
386
+ }
387
+ } catch (error) {
388
+ this.logger.error(error, 'Error predicting end of turn');
389
+ }
390
+ }
380
391
 
381
- if (unlikelyThreshold && endOfTurnProbability < unlikelyThreshold) {
382
- endpointingDelay = this.maxEndpointingDelay;
383
- }
384
- }
392
+ span.setAttribute(
393
+ traceTypes.ATTR_CHAT_CTX,
394
+ JSON.stringify(chatCtx.toJSON({ excludeTimestamp: false })),
395
+ );
396
+ span.setAttribute(traceTypes.ATTR_EOU_PROBABILITY, endOfTurnProbability);
397
+ span.setAttribute(traceTypes.ATTR_EOU_UNLIKELY_THRESHOLD, unlikelyThreshold ?? 0);
398
+ span.setAttribute(traceTypes.ATTR_EOU_DELAY, endpointingDelay);
399
+ span.setAttribute(traceTypes.ATTR_EOU_LANGUAGE, this.lastLanguage ?? '');
400
+ },
401
+ {
402
+ name: 'eou_detection',
403
+ context: this.rootSpanContext,
404
+ },
405
+ );
385
406
  }
386
407
 
387
408
  let extraSleep = endpointingDelay;
@@ -430,6 +451,13 @@ export class AudioRecognition {
430
451
  });
431
452
 
432
453
  if (committed) {
454
+ this._endUserTurnSpan({
455
+ transcript: this.audioTranscript,
456
+ confidence: confidenceAvg,
457
+ transcriptionDelay: transcriptionDelay ?? 0,
458
+ endOfUtteranceDelay: endOfUtteranceDelay ?? 0,
459
+ });
460
+
433
461
  // clear the transcript if the user turn was committed
434
462
  this.audioTranscript = '';
435
463
  this.finalTranscriptConfidence = [];
@@ -537,6 +565,13 @@ export class AudioRecognition {
537
565
  this.hooks.onStartOfSpeech(ev);
538
566
  this.speaking = true;
539
567
 
568
+ if (!this.userTurnSpan) {
569
+ this.userTurnSpan = tracer.startSpan({
570
+ name: 'user_turn',
571
+ context: this.rootSpanContext,
572
+ });
573
+ }
574
+
540
575
  // Capture sample rate from the first VAD event if not already set
541
576
  if (ev.frames.length > 0 && ev.frames[0]) {
542
577
  this.sampleRate = ev.frames[0].sampleRate;
@@ -646,12 +681,36 @@ export class AudioRecognition {
646
681
 
647
682
  async close() {
648
683
  this.detachInputAudioStream();
684
+ this.silenceAudioWriter.releaseLock();
649
685
  await this.commitUserTurnTask?.cancelAndWait();
650
686
  await this.sttTask?.cancelAndWait();
651
687
  await this.vadTask?.cancelAndWait();
652
688
  await this.bounceEOUTask?.cancelAndWait();
653
689
  }
654
690
 
691
+ private _endUserTurnSpan({
692
+ transcript,
693
+ confidence,
694
+ transcriptionDelay,
695
+ endOfUtteranceDelay,
696
+ }: {
697
+ transcript: string;
698
+ confidence: number;
699
+ transcriptionDelay: number;
700
+ endOfUtteranceDelay: number;
701
+ }): void {
702
+ if (this.userTurnSpan) {
703
+ this.userTurnSpan.setAttributes({
704
+ [traceTypes.ATTR_USER_TRANSCRIPT]: transcript,
705
+ [traceTypes.ATTR_TRANSCRIPT_CONFIDENCE]: confidence,
706
+ [traceTypes.ATTR_TRANSCRIPTION_DELAY]: transcriptionDelay,
707
+ [traceTypes.ATTR_END_OF_TURN_DELAY]: endOfUtteranceDelay,
708
+ });
709
+ this.userTurnSpan.end();
710
+ this.userTurnSpan = undefined;
711
+ }
712
+ }
713
+
655
714
  private get vadBaseTurnDetection() {
656
715
  return ['vad', undefined].includes(this.turnDetectionMode);
657
716
  }
@@ -3,6 +3,8 @@
3
3
  // SPDX-License-Identifier: Apache-2.0
4
4
  import type { AudioFrame } from '@livekit/rtc-node';
5
5
  import { AudioResampler } from '@livekit/rtc-node';
6
+ import type { Span } from '@opentelemetry/api';
7
+ import { context as otelContext } from '@opentelemetry/api';
6
8
  import type { ReadableStream, ReadableStreamDefaultReader } from 'stream/web';
7
9
  import {
8
10
  type ChatContext,
@@ -21,6 +23,7 @@ import {
21
23
  import { isZodSchema, parseZodSchema } from '../llm/zod-utils.js';
22
24
  import { log } from '../log.js';
23
25
  import { IdentityTransform } from '../stream/identity_transform.js';
26
+ import { traceTypes, tracer } from '../telemetry/index.js';
24
27
  import { Future, Task, shortuuid, toError } from '../utils.js';
25
28
  import { type Agent, type ModelSettings, asyncLocalStorage, isStopResponse } from './agent.js';
26
29
  import type { AgentSession } from './agent_session.js';
@@ -377,7 +380,6 @@ export function updateInstructions(options: {
377
380
  }
378
381
  }
379
382
 
380
- // TODO(brian): PR3 - Add @tracer.startActiveSpan('llm_node') decorator/wrapper
381
383
  export function performLLMInference(
382
384
  node: LLMNode,
383
385
  chatCtx: ChatContext,
@@ -392,7 +394,13 @@ export function performLLMInference(
392
394
  const toolCallWriter = toolCallStream.writable.getWriter();
393
395
  const data = new _LLMGenerationData(textStream.readable, toolCallStream.readable);
394
396
 
395
- const inferenceTask = async (signal: AbortSignal) => {
397
+ const _performLLMInferenceImpl = async (signal: AbortSignal, span: Span) => {
398
+ span.setAttribute(
399
+ traceTypes.ATTR_CHAT_CTX,
400
+ JSON.stringify(chatCtx.toJSON({ excludeTimestamp: false })),
401
+ );
402
+ span.setAttribute(traceTypes.ATTR_FUNCTION_TOOLS, JSON.stringify(Object.keys(toolCtx)));
403
+
396
404
  let llmStreamReader: ReadableStreamDefaultReader<string | ChatChunk> | null = null;
397
405
  let llmStream: ReadableStream<string | ChatChunk> | null = null;
398
406
 
@@ -448,6 +456,8 @@ export function performLLMInference(
448
456
  // No need to check if chunk is of type other than ChatChunk or string like in
449
457
  // Python since chunk is defined in the type ChatChunk | string in TypeScript
450
458
  }
459
+
460
+ span.setAttribute(traceTypes.ATTR_RESPONSE_TEXT, data.generatedText);
451
461
  } catch (error) {
452
462
  if (error instanceof DOMException && error.name === 'AbortError') {
453
463
  // Abort signal was triggered, handle gracefully
@@ -462,13 +472,21 @@ export function performLLMInference(
462
472
  }
463
473
  };
464
474
 
475
+ // Capture the current context (agent_turn) to ensure llm_node is properly parented
476
+ const currentContext = otelContext.active();
477
+
478
+ const inferenceTask = async (signal: AbortSignal) =>
479
+ tracer.startActiveSpan(async (span) => _performLLMInferenceImpl(signal, span), {
480
+ name: 'llm_node',
481
+ context: currentContext,
482
+ });
483
+
465
484
  return [
466
485
  Task.from((controller) => inferenceTask(controller.signal), controller, 'performLLMInference'),
467
486
  data,
468
487
  ];
469
488
  }
470
489
 
471
- // TODO(brian): PR3 - Add @tracer.startActiveSpan('tts_node') decorator/wrapper
472
490
  export function performTTSInference(
473
491
  node: TTSNode,
474
492
  text: ReadableStream<string>,
@@ -479,7 +497,7 @@ export function performTTSInference(
479
497
  const outputWriter = audioStream.writable.getWriter();
480
498
  const audioOutputStream = audioStream.readable;
481
499
 
482
- const inferenceTask = async (signal: AbortSignal) => {
500
+ const _performTTSInferenceImpl = async (signal: AbortSignal) => {
483
501
  let ttsStreamReader: ReadableStreamDefaultReader<AudioFrame> | null = null;
484
502
  let ttsStream: ReadableStream<AudioFrame> | null = null;
485
503
 
@@ -514,6 +532,15 @@ export function performTTSInference(
514
532
  }
515
533
  };
516
534
 
535
+ // Capture the current context (agent_turn) to ensure tts_node is properly parented
536
+ const currentContext = otelContext.active();
537
+
538
+ const inferenceTask = async (signal: AbortSignal) =>
539
+ tracer.startActiveSpan(async () => _performTTSInferenceImpl(signal), {
540
+ name: 'tts_node',
541
+ context: currentContext,
542
+ });
543
+
517
544
  return [
518
545
  Task.from((controller) => inferenceTask(controller.signal), controller, 'performTTSInference'),
519
546
  audioOutputStream,
@@ -652,7 +679,7 @@ export function performAudioForwarding(
652
679
  ];
653
680
  }
654
681
 
655
- // TODO(brian): PR3 - Add @tracer.startActiveSpan('function_tool') wrapper for each tool execution
682
+ // function_tool span is created by tracableToolExecution, defined further down in this function
656
683
  export function performToolExecutions({
657
684
  session,
658
685
  speechHandle,
@@ -788,8 +815,9 @@ export function performToolExecutions({
788
815
  });
789
816
  });
790
817
 
791
- const tracableToolExecution = async (toolExecTask: Promise<unknown>) => {
792
- // TODO(brian): add tracing
818
+ const _tracableToolExecutionImpl = async (toolExecTask: Promise<unknown>, span: Span) => {
819
+ span.setAttribute(traceTypes.ATTR_FUNCTION_TOOL_NAME, toolCall.name);
820
+ span.setAttribute(traceTypes.ATTR_FUNCTION_TOOL_ARGS, toolCall.args);
793
821
 
794
822
  // await for task to complete, if task is aborted, set exception
795
823
  let toolOutput: ToolExecutionOutput | undefined;
@@ -800,6 +828,17 @@ export function performToolExecutions({
800
828
  exception: isAborted ? new Error('tool call was aborted') : undefined,
801
829
  output: isAborted ? undefined : result,
802
830
  });
831
+
832
+ if (toolOutput.toolCallOutput) {
833
+ span.setAttribute(
834
+ traceTypes.ATTR_FUNCTION_TOOL_OUTPUT,
835
+ toolOutput.toolCallOutput.output,
836
+ );
837
+ span.setAttribute(
838
+ traceTypes.ATTR_FUNCTION_TOOL_IS_ERROR,
839
+ toolOutput.toolCallOutput.isError,
840
+ );
841
+ }
803
842
  } catch (rawError) {
804
843
  logger.error(
805
844
  {
@@ -813,12 +852,25 @@ export function performToolExecutions({
813
852
  toolCall,
814
853
  exception: toError(rawError),
815
854
  });
855
+
856
+ if (toolOutput.toolCallOutput) {
857
+ span.setAttribute(
858
+ traceTypes.ATTR_FUNCTION_TOOL_OUTPUT,
859
+ toolOutput.toolCallOutput.output,
860
+ );
861
+ span.setAttribute(traceTypes.ATTR_FUNCTION_TOOL_IS_ERROR, true);
862
+ }
816
863
  } finally {
817
864
  if (!toolOutput) throw new Error('toolOutput is undefined');
818
865
  toolCompleted(toolOutput);
819
866
  }
820
867
  };
821
868
 
869
+ const tracableToolExecution = (toolExecTask: Promise<unknown>) =>
870
+ tracer.startActiveSpan(async (span) => _tracableToolExecutionImpl(toolExecTask, span), {
871
+ name: 'function_tool',
872
+ });
873
+
822
874
  // wait, not cancelling all tool calling tasks
823
875
  tasks.push(tracableToolExecution(toolExecution));
824
876
  }