@livekit/agents 1.0.44 → 1.0.46

Files changed (157)
  1. package/dist/ipc/supervised_proc.cjs +1 -1
  2. package/dist/ipc/supervised_proc.cjs.map +1 -1
  3. package/dist/ipc/supervised_proc.js +1 -1
  4. package/dist/ipc/supervised_proc.js.map +1 -1
  5. package/dist/llm/llm.cjs +1 -1
  6. package/dist/llm/llm.cjs.map +1 -1
  7. package/dist/llm/llm.js +1 -1
  8. package/dist/llm/llm.js.map +1 -1
  9. package/dist/log.cjs +13 -9
  10. package/dist/log.cjs.map +1 -1
  11. package/dist/log.d.cts +1 -1
  12. package/dist/log.d.ts +1 -1
  13. package/dist/log.d.ts.map +1 -1
  14. package/dist/log.js +13 -9
  15. package/dist/log.js.map +1 -1
  16. package/dist/stream/index.cjs +3 -0
  17. package/dist/stream/index.cjs.map +1 -1
  18. package/dist/stream/index.d.cts +1 -0
  19. package/dist/stream/index.d.ts +1 -0
  20. package/dist/stream/index.d.ts.map +1 -1
  21. package/dist/stream/index.js +2 -0
  22. package/dist/stream/index.js.map +1 -1
  23. package/dist/stream/multi_input_stream.cjs +139 -0
  24. package/dist/stream/multi_input_stream.cjs.map +1 -0
  25. package/dist/stream/multi_input_stream.d.cts +55 -0
  26. package/dist/stream/multi_input_stream.d.ts +55 -0
  27. package/dist/stream/multi_input_stream.d.ts.map +1 -0
  28. package/dist/stream/multi_input_stream.js +115 -0
  29. package/dist/stream/multi_input_stream.js.map +1 -0
  30. package/dist/stream/multi_input_stream.test.cjs +340 -0
  31. package/dist/stream/multi_input_stream.test.cjs.map +1 -0
  32. package/dist/stream/multi_input_stream.test.js +339 -0
  33. package/dist/stream/multi_input_stream.test.js.map +1 -0
  34. package/dist/stt/stt.cjs +2 -2
  35. package/dist/stt/stt.cjs.map +1 -1
  36. package/dist/stt/stt.js +2 -2
  37. package/dist/stt/stt.js.map +1 -1
  38. package/dist/telemetry/trace_types.cjs +42 -0
  39. package/dist/telemetry/trace_types.cjs.map +1 -1
  40. package/dist/telemetry/trace_types.d.cts +14 -0
  41. package/dist/telemetry/trace_types.d.ts +14 -0
  42. package/dist/telemetry/trace_types.d.ts.map +1 -1
  43. package/dist/telemetry/trace_types.js +28 -0
  44. package/dist/telemetry/trace_types.js.map +1 -1
  45. package/dist/tts/fallback_adapter.cjs +466 -0
  46. package/dist/tts/fallback_adapter.cjs.map +1 -0
  47. package/dist/tts/fallback_adapter.d.cts +110 -0
  48. package/dist/tts/fallback_adapter.d.ts +110 -0
  49. package/dist/tts/fallback_adapter.d.ts.map +1 -0
  50. package/dist/tts/fallback_adapter.js +442 -0
  51. package/dist/tts/fallback_adapter.js.map +1 -0
  52. package/dist/tts/index.cjs +3 -0
  53. package/dist/tts/index.cjs.map +1 -1
  54. package/dist/tts/index.d.cts +1 -0
  55. package/dist/tts/index.d.ts +1 -0
  56. package/dist/tts/index.d.ts.map +1 -1
  57. package/dist/tts/index.js +2 -0
  58. package/dist/tts/index.js.map +1 -1
  59. package/dist/tts/tts.cjs +2 -2
  60. package/dist/tts/tts.cjs.map +1 -1
  61. package/dist/tts/tts.js +2 -2
  62. package/dist/tts/tts.js.map +1 -1
  63. package/dist/utils.cjs +13 -0
  64. package/dist/utils.cjs.map +1 -1
  65. package/dist/utils.d.cts +1 -0
  66. package/dist/utils.d.ts +1 -0
  67. package/dist/utils.d.ts.map +1 -1
  68. package/dist/utils.js +13 -0
  69. package/dist/utils.js.map +1 -1
  70. package/dist/vad.cjs +11 -10
  71. package/dist/vad.cjs.map +1 -1
  72. package/dist/vad.d.cts +5 -3
  73. package/dist/vad.d.ts +5 -3
  74. package/dist/vad.d.ts.map +1 -1
  75. package/dist/vad.js +11 -10
  76. package/dist/vad.js.map +1 -1
  77. package/dist/voice/agent_activity.cjs +35 -10
  78. package/dist/voice/agent_activity.cjs.map +1 -1
  79. package/dist/voice/agent_activity.d.cts +1 -0
  80. package/dist/voice/agent_activity.d.ts +1 -0
  81. package/dist/voice/agent_activity.d.ts.map +1 -1
  82. package/dist/voice/agent_activity.js +35 -10
  83. package/dist/voice/agent_activity.js.map +1 -1
  84. package/dist/voice/agent_session.cjs +19 -7
  85. package/dist/voice/agent_session.cjs.map +1 -1
  86. package/dist/voice/agent_session.d.cts +3 -2
  87. package/dist/voice/agent_session.d.ts +3 -2
  88. package/dist/voice/agent_session.d.ts.map +1 -1
  89. package/dist/voice/agent_session.js +19 -7
  90. package/dist/voice/agent_session.js.map +1 -1
  91. package/dist/voice/audio_recognition.cjs +85 -36
  92. package/dist/voice/audio_recognition.cjs.map +1 -1
  93. package/dist/voice/audio_recognition.d.cts +22 -1
  94. package/dist/voice/audio_recognition.d.ts +22 -1
  95. package/dist/voice/audio_recognition.d.ts.map +1 -1
  96. package/dist/voice/audio_recognition.js +89 -36
  97. package/dist/voice/audio_recognition.js.map +1 -1
  98. package/dist/voice/audio_recognition_span.test.cjs +233 -0
  99. package/dist/voice/audio_recognition_span.test.cjs.map +1 -0
  100. package/dist/voice/audio_recognition_span.test.js +232 -0
  101. package/dist/voice/audio_recognition_span.test.js.map +1 -0
  102. package/dist/voice/io.cjs +6 -3
  103. package/dist/voice/io.cjs.map +1 -1
  104. package/dist/voice/io.d.cts +3 -2
  105. package/dist/voice/io.d.ts +3 -2
  106. package/dist/voice/io.d.ts.map +1 -1
  107. package/dist/voice/io.js +6 -3
  108. package/dist/voice/io.js.map +1 -1
  109. package/dist/voice/recorder_io/recorder_io.cjs +3 -1
  110. package/dist/voice/recorder_io/recorder_io.cjs.map +1 -1
  111. package/dist/voice/recorder_io/recorder_io.d.ts.map +1 -1
  112. package/dist/voice/recorder_io/recorder_io.js +3 -1
  113. package/dist/voice/recorder_io/recorder_io.js.map +1 -1
  114. package/dist/voice/room_io/_input.cjs +23 -20
  115. package/dist/voice/room_io/_input.cjs.map +1 -1
  116. package/dist/voice/room_io/_input.d.cts +2 -2
  117. package/dist/voice/room_io/_input.d.ts +2 -2
  118. package/dist/voice/room_io/_input.d.ts.map +1 -1
  119. package/dist/voice/room_io/_input.js +13 -9
  120. package/dist/voice/room_io/_input.js.map +1 -1
  121. package/dist/voice/room_io/room_io.cjs +9 -0
  122. package/dist/voice/room_io/room_io.cjs.map +1 -1
  123. package/dist/voice/room_io/room_io.d.cts +3 -1
  124. package/dist/voice/room_io/room_io.d.ts +3 -1
  125. package/dist/voice/room_io/room_io.d.ts.map +1 -1
  126. package/dist/voice/room_io/room_io.js +9 -0
  127. package/dist/voice/room_io/room_io.js.map +1 -1
  128. package/dist/voice/utils.cjs +47 -0
  129. package/dist/voice/utils.cjs.map +1 -0
  130. package/dist/voice/utils.d.cts +4 -0
  131. package/dist/voice/utils.d.ts +4 -0
  132. package/dist/voice/utils.d.ts.map +1 -0
  133. package/dist/voice/utils.js +23 -0
  134. package/dist/voice/utils.js.map +1 -0
  135. package/package.json +1 -1
  136. package/src/ipc/supervised_proc.ts +1 -1
  137. package/src/llm/llm.ts +1 -1
  138. package/src/log.ts +22 -11
  139. package/src/stream/index.ts +1 -0
  140. package/src/stream/multi_input_stream.test.ts +540 -0
  141. package/src/stream/multi_input_stream.ts +172 -0
  142. package/src/stt/stt.ts +2 -2
  143. package/src/telemetry/trace_types.ts +18 -0
  144. package/src/tts/fallback_adapter.ts +579 -0
  145. package/src/tts/index.ts +1 -0
  146. package/src/tts/tts.ts +2 -2
  147. package/src/utils.ts +16 -0
  148. package/src/vad.ts +12 -11
  149. package/src/voice/agent_activity.ts +25 -0
  150. package/src/voice/agent_session.ts +17 -11
  151. package/src/voice/audio_recognition.ts +114 -38
  152. package/src/voice/audio_recognition_span.test.ts +261 -0
  153. package/src/voice/io.ts +7 -4
  154. package/src/voice/recorder_io/recorder_io.ts +2 -1
  155. package/src/voice/room_io/_input.ts +16 -10
  156. package/src/voice/room_io/room_io.ts +12 -0
  157. package/src/voice/utils.ts +29 -0
package/src/utils.ts CHANGED
@@ -126,6 +126,8 @@ export class Future<T = void> {
   #rejectPromise!: (error: Error) => void;
   #done: boolean = false;
   #rejected: boolean = false;
+  #result: T | undefined = undefined;
+  #error: Error | undefined = undefined;

   constructor() {
     this.#await = new Promise<T>((resolve, reject) => {
@@ -142,6 +144,18 @@ export class Future<T = void> {
     return this.#done;
   }

+  get result(): T {
+    if (!this.#done) {
+      throw new Error('Future is not done');
+    }
+
+    if (this.#rejected) {
+      throw this.#error;
+    }
+
+    return this.#result!;
+  }
+
   /** Whether the future was rejected (cancelled) */
   get rejected() {
     return this.#rejected;
@@ -149,12 +163,14 @@ export class Future<T = void> {

   resolve(value: T) {
     this.#done = true;
+    this.#result = value;
     this.#resolvePromise(value);
   }

   reject(error: Error) {
     this.#done = true;
     this.#rejected = true;
+    this.#error = error;
     this.#rejectPromise(error);
   }
 }
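The new getter makes a settled Future's outcome synchronously inspectable. A minimal sketch of the semantics, assuming `Future` is reachable from the package exports (in this repo it is defined in src/utils.ts):

```ts
import { Future } from '@livekit/agents'; // assumed import path, for illustration only

const fut = new Future<number>();
// fut.result here would throw Error('Future is not done')

fut.resolve(42);
console.log(fut.done, fut.result); // true 42 (the value is cached, no await needed)

const cancelled = new Future<void>();
cancelled.reject(new Error('cancelled'));
console.log(cancelled.rejected); // true
// cancelled.result would rethrow the cached Error('cancelled')
```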
package/src/vad.ts CHANGED
@@ -98,14 +98,15 @@ export abstract class VADStream implements AsyncIterableIterator<VADEvent> {
   protected closed = false;
   protected inputClosed = false;

-  #vad: VAD;
-  #lastActivityTime = BigInt(0);
-  private logger = log();
-  private deferredInputStream: DeferredReadableStream<AudioFrame>;
+  protected vad: VAD;
+  protected lastActivityTime = BigInt(0);
+  protected logger;
+  protected deferredInputStream: DeferredReadableStream<AudioFrame>;

   private metricsStream: ReadableStream<VADEvent>;
   constructor(vad: VAD) {
-    this.#vad = vad;
+    this.logger = log();
+    this.vad = vad;
     this.deferredInputStream = new DeferredReadableStream<AudioFrame>();

     this.inputWriter = this.input.writable.getWriter();
@@ -155,16 +156,16 @@ export abstract class VADStream implements AsyncIterableIterator<VADEvent> {
       switch (value.type) {
         case VADEventType.START_OF_SPEECH:
           inferenceCount++;
-          if (inferenceCount >= 1000 / this.#vad.capabilities.updateInterval) {
-            this.#vad.emit('metrics_collected', {
+          if (inferenceCount >= 1000 / this.vad.capabilities.updateInterval) {
+            this.vad.emit('metrics_collected', {
               type: 'vad_metrics',
               timestamp: Date.now(),
               idleTimeMs: Math.trunc(
-                Number((process.hrtime.bigint() - this.#lastActivityTime) / BigInt(1000000)),
+                Number((process.hrtime.bigint() - this.lastActivityTime) / BigInt(1000000)),
               ),
               inferenceDurationTotalMs,
               inferenceCount,
-              label: this.#vad.label,
+              label: this.vad.label,
             });

             inferenceCount = 0;
@@ -173,10 +174,10 @@ export abstract class VADStream implements AsyncIterableIterator<VADEvent> {
           break;
         case VADEventType.INFERENCE_DONE:
           inferenceDurationTotalMs += Math.round(value.inferenceDuration);
-          this.#lastActivityTime = process.hrtime.bigint();
+          this.lastActivityTime = process.hrtime.bigint();
           break;
         case VADEventType.END_OF_SPEECH:
-          this.#lastActivityTime = process.hrtime.bigint();
+          this.lastActivityTime = process.hrtime.bigint();
           break;
       }
     }
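The visibility changes (`#vad` to `protected vad`, etc.) do not alter the metrics cadence: `metrics_collected` is still gated on `inferenceCount >= 1000 / updateInterval`. Assuming `updateInterval` is in milliseconds, the threshold works out to roughly one batch per second's worth of inferences. A worked sketch of the arithmetic (the 32 ms value is illustrative, not from this diff):

```ts
// Hypothetical VAD that runs one inference every 32 ms:
const updateInterval = 32; // ms between inferences
const threshold = 1000 / updateInterval; // 31.25

let inferenceCount = 0;
for (let i = 1; i <= 64; i++) {
  inferenceCount++;
  if (inferenceCount >= threshold) {
    // fires at i = 32 and i = 64: every 32 inferences, about 1.024 s of audio
    console.log(`vad_metrics emitted at inference #${i}`);
    inferenceCount = 0;
  }
}
```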
package/src/voice/agent_activity.ts CHANGED
@@ -74,6 +74,7 @@ import {
 } from './generation.js';
 import type { TimedString } from './io.js';
 import { SpeechHandle } from './speech_handle.js';
+import { setParticipantSpanAttributes } from './utils.js';

 const speechHandleStorage = new AsyncLocalStorage<SpeechHandle>();

@@ -299,6 +300,9 @@ export class AgentActivity implements RecognitionHooks {
       minEndpointingDelay: this.agentSession.options.minEndpointingDelay,
       maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay,
       rootSpanContext: this.agentSession.rootSpanContext,
+      sttModel: this.stt?.label,
+      sttProvider: this.getSttProvider(),
+      getLinkedParticipant: () => this.agentSession._roomIO?.linkedParticipant,
     });
     this.audioRecognition.start();
     this.started = true;
@@ -335,6 +339,17 @@ export class AgentActivity implements RecognitionHooks {
     return this.agent.stt || this.agentSession.stt;
   }

+  private getSttProvider(): string | undefined {
+    const label = this.stt?.label;
+    if (!label) {
+      return undefined;
+    }
+
+    // Heuristic: most labels look like "<provider>-<model>"
+    const [provider] = label.split('-', 1);
+    return provider || label;
+  }
+
   get llm(): LLM | RealtimeModel | undefined {
     return this.agent.llm || this.agentSession.llm;
   }
@@ -1355,6 +1370,11 @@ export class AgentActivity implements RecognitionHooks {
       span.setAttribute(traceTypes.ATTR_USER_INPUT, newMessage.textContent || '');
     }

+    const localParticipant = this.agentSession._roomIO?.localParticipant;
+    if (localParticipant) {
+      setParticipantSpanAttributes(span, localParticipant);
+    }
+
    speechHandleStorage.enterWith(speechHandle);

    const audioOutput = this.agentSession.output.audioEnabled
@@ -1815,6 +1835,11 @@ export class AgentActivity implements RecognitionHooks {

     span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);

+    const localParticipant = this.agentSession._roomIO?.localParticipant;
+    if (localParticipant) {
+      setParticipantSpanAttributes(span, localParticipant);
+    }
+
     speechHandleStorage.enterWith(speechHandle);

     if (!this.realtimeSession) {
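The provider heuristic above leans on the `limit` argument of `String.prototype.split`: with a limit of 1, only the first `-`-separated segment is returned. A quick sketch of the behavior (the label values are hypothetical):

```ts
function getSttProvider(label?: string): string | undefined {
  if (!label) return undefined;
  const [provider] = label.split('-', 1);
  return provider || label;
}

getSttProvider('deepgram-nova-2'); // 'deepgram'
getSttProvider('whisper');         // 'whisper' (no '-', the whole label)
getSttProvider('-oddball');        // '-oddball' (first segment is '', falls back to label)
getSttProvider(undefined);         // undefined
```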
package/src/voice/agent_session.ts CHANGED
@@ -62,6 +62,7 @@ import { RoomIO, type RoomInputOptions, type RoomOutputOptions } from './room_io
 import type { UnknownUserData } from './run_context.js';
 import type { SpeechHandle } from './speech_handle.js';
 import { RunResult } from './testing/run_result.js';
+import { setParticipantSpanAttributes } from './utils.js';

 export interface VoiceOptions {
   allowInterruptions: boolean;
@@ -131,7 +132,8 @@ export class AgentSession<
   private started = false;
   private userState: UserState = 'listening';

-  private roomIO?: RoomIO;
+  /** @internal */
+  _roomIO?: RoomIO;
   private logger = log();

   private _chatCtx: ChatContext;
@@ -294,7 +296,7 @@ export class AgentSession<

     const tasks: Promise<void>[] = [];

-    if (room && !this.roomIO) {
+    if (room && !this._roomIO) {
       // Check for existing input/output configuration and warn if needed
       if (this.input.audio && inputOptions?.audioEnabled !== false) {
         this.logger.warn(
@@ -314,13 +316,13 @@ export class AgentSession<
         );
       }

-      this.roomIO = new RoomIO({
+      this._roomIO = new RoomIO({
         agentSession: this,
         room,
         inputOptions,
         outputOptions,
       });
-      this.roomIO.start();
+      this._roomIO.start();
     }

     let ctx: JobContext | undefined = undefined;
@@ -700,8 +702,10 @@ export class AgentSession<
           startTime: options?.startTime,
         });

-        // TODO(brian): PR4 - Set participant attributes if roomIO.room.localParticipant is available
-        // (Ref: Python agent_session.py line 1161-1164)
+        const localParticipant = this._roomIO?.localParticipant;
+        if (localParticipant) {
+          setParticipantSpanAttributes(this.agentSpeakingSpan, localParticipant);
+        }
       }
     } else if (this.agentSpeakingSpan !== undefined) {
       // TODO(brian): PR4 - Set ATTR_END_TIME attribute if available
@@ -738,8 +742,10 @@ export class AgentSession<
           startTime: lastSpeakingTime,
         });

-        // TODO(brian): PR4 - Set participant attributes if roomIO.linkedParticipant is available
-        // (Ref: Python agent_session.py line 1192-1195)
+        const linked = this._roomIO?.linkedParticipant;
+        if (linked) {
+          setParticipantSpanAttributes(this.userSpeakingSpan, linked);
+        }
     } else if (this.userSpeakingSpan !== undefined) {
       this.userSpeakingSpan.end(lastSpeakingTime);
       this.userSpeakingSpan = undefined;
@@ -783,7 +789,7 @@ export class AgentSession<
       return;
     }

-    if (this.roomIO && !this.roomIO.isParticipantAvailable) {
+    if (this._roomIO && !this._roomIO.isParticipantAvailable) {
       return;
     }

@@ -862,8 +868,8 @@ export class AgentSession<
     this.output.audio = null;
     this.output.transcription = null;

-    await this.roomIO?.close();
-    this.roomIO = undefined;
+    await this._roomIO?.close();
+    this._roomIO = undefined;

     await this.activity?.close();
     this.activity = undefined;
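The body of `setParticipantSpanAttributes` (added in package/src/voice/utils.ts, not shown in this excerpt) is what both speaking spans now call. Based only on the `ParticipantLike` shape declared in audio_recognition.ts below, a plausible sketch looks like this; the attribute keys are invented for illustration and are not necessarily what the package actually uses:

```ts
import type { Span } from '@opentelemetry/api';
import type { ParticipantLike } from './audio_recognition.js';

// Illustrative only: the attribute keys are assumptions, not the real ones.
export function setParticipantSpanAttributesSketch(span: Span, p: ParticipantLike): void {
  if (p.sid !== undefined) span.setAttribute('participant.sid', p.sid);
  span.setAttribute('participant.identity', p.identity);
  span.setAttribute('participant.kind', String(p.kind));
}
```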
package/src/voice/audio_recognition.ts CHANGED
@@ -1,8 +1,15 @@
 // SPDX-FileCopyrightText: 2025 LiveKit, Inc.
 //
 // SPDX-License-Identifier: Apache-2.0
+import type { ParticipantKind } from '@livekit/rtc-node';
 import { AudioFrame } from '@livekit/rtc-node';
-import type { Context, Span } from '@opentelemetry/api';
+import {
+  type Context,
+  ROOT_CONTEXT,
+  type Span,
+  context as otelContext,
+  trace,
+} from '@opentelemetry/api';
 import type { WritableStreamDefaultWriter } from 'node:stream/web';
 import { ReadableStream } from 'node:stream/web';
 import { type ChatContext } from '../llm/chat_context.js';
@@ -16,6 +23,7 @@ import { Task, delay } from '../utils.js';
 import { type VAD, type VADEvent, VADEventType } from '../vad.js';
 import type { TurnDetectionMode } from './agent_session.js';
 import type { STTNode } from './io.js';
+import { setParticipantSpanAttributes } from './utils.js';

 export interface EndOfTurnInfo {
   /** The new transcript text from the user's speech. */
@@ -72,6 +80,22 @@ export interface AudioRecognitionOptions {
   maxEndpointingDelay: number;
   /** Root span context for tracing. */
   rootSpanContext?: Context;
+  /** STT model name for tracing */
+  sttModel?: string;
+  /** STT provider name for tracing */
+  sttProvider?: string;
+  /** Getter for linked participant for span attribution */
+  getLinkedParticipant?: () => ParticipantLike | undefined;
+}
+
+/**
+ * Minimal participant shape for span attribution.
+ * Compatible with both `LocalParticipant` and `RemoteParticipant` from `@livekit/rtc-node`.
+ */
+export interface ParticipantLike {
+  sid: string | undefined;
+  identity: string;
+  kind: ParticipantKind;
 }

 export class AudioRecognition {
@@ -84,6 +108,9 @@ export class AudioRecognition {
   private maxEndpointingDelay: number;
   private lastLanguage?: string;
   private rootSpanContext?: Context;
+  private sttModel?: string;
+  private sttProvider?: string;
+  private getLinkedParticipant?: () => ParticipantLike | undefined;

   private deferredInputStream: DeferredReadableStream<AudioFrame>;
   private logger = log();
@@ -121,6 +148,9 @@ export class AudioRecognition {
     this.maxEndpointingDelay = opts.maxEndpointingDelay;
     this.lastLanguage = undefined;
     this.rootSpanContext = opts.rootSpanContext;
+    this.sttModel = opts.sttModel;
+    this.sttProvider = opts.sttProvider;
+    this.getLinkedParticipant = opts.getLinkedParticipant;

     this.deferredInputStream = new DeferredReadableStream<AudioFrame>();
     const [vadInputStream, sttInputStream] = this.deferredInputStream.stream.tee();
@@ -151,6 +181,37 @@ export class AudioRecognition {
     });
   }

+  private ensureUserTurnSpan(startTime?: number): Span {
+    if (this.userTurnSpan && this.userTurnSpan.isRecording()) {
+      return this.userTurnSpan;
+    }
+
+    this.userTurnSpan = tracer.startSpan({
+      name: 'user_turn',
+      context: this.rootSpanContext,
+      startTime,
+    });
+
+    const participant = this.getLinkedParticipant?.();
+    if (participant) {
+      setParticipantSpanAttributes(this.userTurnSpan, participant);
+    }
+
+    if (this.sttModel) {
+      this.userTurnSpan.setAttribute(traceTypes.ATTR_GEN_AI_REQUEST_MODEL, this.sttModel);
+    }
+    if (this.sttProvider) {
+      this.userTurnSpan.setAttribute(traceTypes.ATTR_GEN_AI_PROVIDER_NAME, this.sttProvider);
+    }
+
+    return this.userTurnSpan;
+  }
+
+  private userTurnContext(span: Span): Context {
+    const base = this.rootSpanContext ?? ROOT_CONTEXT;
+    return trace.setSpan(base, span);
+  }
+
   private async onSTTEvent(ev: SpeechEvent) {
     if (
       this.turnDetectionMode === 'manual' &&
@@ -299,19 +360,25 @@ export class AudioRecognition {
         break;
       case SpeechEventType.START_OF_SPEECH:
         if (this.turnDetectionMode !== 'stt') break;
-        this.hooks.onStartOfSpeech({
-          type: VADEventType.START_OF_SPEECH,
-          samplesIndex: 0,
-          timestamp: Date.now(),
-          speechDuration: 0,
-          silenceDuration: 0,
-          frames: [],
-          probability: 0,
-          inferenceDuration: 0,
-          speaking: true,
-          rawAccumulatedSilence: 0,
-          rawAccumulatedSpeech: 0,
-        });
+        {
+          const span = this.ensureUserTurnSpan(Date.now());
+          const ctx = this.userTurnContext(span);
+          otelContext.with(ctx, () => {
+            this.hooks.onStartOfSpeech({
+              type: VADEventType.START_OF_SPEECH,
+              samplesIndex: 0,
+              timestamp: Date.now(),
+              speechDuration: 0,
+              silenceDuration: 0,
+              frames: [],
+              probability: 0,
+              inferenceDuration: 0,
+              speaking: true,
+              rawAccumulatedSilence: 0,
+              rawAccumulatedSpeech: 0,
+            });
+          });
+        }
         this.speaking = true;
         this.lastSpeakingTime = Date.now();

@@ -319,19 +386,25 @@
         break;
       case SpeechEventType.END_OF_SPEECH:
         if (this.turnDetectionMode !== 'stt') break;
-        this.hooks.onEndOfSpeech({
-          type: VADEventType.END_OF_SPEECH,
-          samplesIndex: 0,
-          timestamp: Date.now(),
-          speechDuration: 0,
-          silenceDuration: 0,
-          frames: [],
-          probability: 0,
-          inferenceDuration: 0,
-          speaking: false,
-          rawAccumulatedSilence: 0,
-          rawAccumulatedSpeech: 0,
-        });
+        {
+          const span = this.ensureUserTurnSpan();
+          const ctx = this.userTurnContext(span);
+          otelContext.with(ctx, () => {
+            this.hooks.onEndOfSpeech({
+              type: VADEventType.END_OF_SPEECH,
+              samplesIndex: 0,
+              timestamp: Date.now(),
+              speechDuration: 0,
+              silenceDuration: 0,
+              frames: [],
+              probability: 0,
+              inferenceDuration: 0,
+              speaking: false,
+              rawAccumulatedSilence: 0,
+              rawAccumulatedSpeech: 0,
+            });
+          });
+        }
         this.speaking = false;
         this.userTurnCommitted = true;
         this.lastSpeakingTime = Date.now();
@@ -376,6 +449,9 @@
       async (controller: AbortController) => {
         let endpointingDelay = this.minEndpointingDelay;

+        const userTurnSpan = this.ensureUserTurnSpan();
+        const userTurnCtx = this.userTurnContext(userTurnSpan);
+
         if (turnDetector) {
           await tracer.startActiveSpan(
             async (span) => {
@@ -415,7 +491,7 @@
             },
             {
               name: 'eou_detection',
-              context: this.rootSpanContext,
+              context: userTurnCtx,
             },
           );
         }
@@ -577,17 +653,13 @@
       switch (ev.type) {
         case VADEventType.START_OF_SPEECH:
           this.logger.debug('VAD task: START_OF_SPEECH');
-          this.hooks.onStartOfSpeech(ev);
-          this.speaking = true;
-
-          if (!this.userTurnSpan) {
+          {
            const startTime = Date.now() - ev.speechDuration;
-            this.userTurnSpan = tracer.startSpan({
-              name: 'user_turn',
-              context: this.rootSpanContext,
-              startTime,
-            });
+            const span = this.ensureUserTurnSpan(startTime);
+            const ctx = this.userTurnContext(span);
+            otelContext.with(ctx, () => this.hooks.onStartOfSpeech(ev));
           }
+          this.speaking = true;

           // Capture sample rate from the first VAD event if not already set
           if (ev.frames.length > 0 && ev.frames[0]) {
@@ -609,7 +681,11 @@
           break;
         case VADEventType.END_OF_SPEECH:
           this.logger.debug('VAD task: END_OF_SPEECH');
-          this.hooks.onEndOfSpeech(ev);
+          {
+            const span = this.ensureUserTurnSpan();
+            const ctx = this.userTurnContext(span);
+            otelContext.with(ctx, () => this.hooks.onEndOfSpeech(ev));
+          }

           // when VAD fires END_OF_SPEECH, it already waited for the silence_duration
           this.speaking = false;
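For context on the wrapping pattern above: `trace.setSpan(base, span)` returns a Context carrying `user_turn` as the active span, and `otelContext.with(ctx, fn)` runs the hook with that context active, so any span the hook starts (and the `eou_detection` span, which now receives `userTurnCtx` explicitly) nests under `user_turn` instead of the session root. A self-contained sketch using only the public @opentelemetry/api surface (the tracer name and span names are illustrative):

```ts
import { ROOT_CONTEXT, context, trace } from '@opentelemetry/api';

const tracer = trace.getTracer('sketch');

const userTurn = tracer.startSpan('user_turn');
// Build a Context in which userTurn is the active span.
const userTurnCtx = trace.setSpan(ROOT_CONTEXT, userTurn);

context.with(userTurnCtx, () => {
  // Inside this callback, trace.getSpan(context.active()) === userTurn,
  // so a span started here becomes its child.
  const child = tracer.startSpan('eou_detection');
  child.end();
});

userTurn.end();
```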