@livekit/agents 1.0.36 → 1.0.38

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (149)
  1. package/dist/cli.cjs.map +1 -1
  2. package/dist/inference/api_protos.cjs +68 -0
  3. package/dist/inference/api_protos.cjs.map +1 -1
  4. package/dist/inference/api_protos.d.cts +345 -4
  5. package/dist/inference/api_protos.d.ts +345 -4
  6. package/dist/inference/api_protos.d.ts.map +1 -1
  7. package/dist/inference/api_protos.js +60 -0
  8. package/dist/inference/api_protos.js.map +1 -1
  9. package/dist/inference/stt.cjs +32 -21
  10. package/dist/inference/stt.cjs.map +1 -1
  11. package/dist/inference/stt.d.ts.map +1 -1
  12. package/dist/inference/stt.js +34 -21
  13. package/dist/inference/stt.js.map +1 -1
  14. package/dist/ipc/inference_proc_executor.cjs.map +1 -1
  15. package/dist/ipc/job_proc_executor.cjs.map +1 -1
  16. package/dist/stt/stt.cjs +10 -0
  17. package/dist/stt/stt.cjs.map +1 -1
  18. package/dist/stt/stt.d.cts +12 -0
  19. package/dist/stt/stt.d.ts +12 -0
  20. package/dist/stt/stt.d.ts.map +1 -1
  21. package/dist/stt/stt.js +10 -0
  22. package/dist/stt/stt.js.map +1 -1
  23. package/dist/telemetry/traces.cjs +4 -3
  24. package/dist/telemetry/traces.cjs.map +1 -1
  25. package/dist/telemetry/traces.d.cts +2 -0
  26. package/dist/telemetry/traces.d.ts +2 -0
  27. package/dist/telemetry/traces.d.ts.map +1 -1
  28. package/dist/telemetry/traces.js +4 -3
  29. package/dist/telemetry/traces.js.map +1 -1
  30. package/dist/utils.cjs +6 -0
  31. package/dist/utils.cjs.map +1 -1
  32. package/dist/utils.d.cts +2 -0
  33. package/dist/utils.d.ts +2 -0
  34. package/dist/utils.d.ts.map +1 -1
  35. package/dist/utils.js +6 -0
  36. package/dist/utils.js.map +1 -1
  37. package/dist/voice/agent.cjs +5 -0
  38. package/dist/voice/agent.cjs.map +1 -1
  39. package/dist/voice/agent.d.ts.map +1 -1
  40. package/dist/voice/agent.js +5 -0
  41. package/dist/voice/agent.js.map +1 -1
  42. package/dist/voice/agent_activity.cjs +49 -23
  43. package/dist/voice/agent_activity.cjs.map +1 -1
  44. package/dist/voice/agent_activity.d.cts +1 -1
  45. package/dist/voice/agent_activity.d.ts +1 -1
  46. package/dist/voice/agent_activity.d.ts.map +1 -1
  47. package/dist/voice/agent_activity.js +50 -24
  48. package/dist/voice/agent_activity.js.map +1 -1
  49. package/dist/voice/agent_session.cjs +7 -5
  50. package/dist/voice/agent_session.cjs.map +1 -1
  51. package/dist/voice/agent_session.d.cts +5 -2
  52. package/dist/voice/agent_session.d.ts +5 -2
  53. package/dist/voice/agent_session.d.ts.map +1 -1
  54. package/dist/voice/agent_session.js +7 -5
  55. package/dist/voice/agent_session.js.map +1 -1
  56. package/dist/voice/audio_recognition.cjs +3 -1
  57. package/dist/voice/audio_recognition.cjs.map +1 -1
  58. package/dist/voice/audio_recognition.d.ts.map +1 -1
  59. package/dist/voice/audio_recognition.js +3 -1
  60. package/dist/voice/audio_recognition.js.map +1 -1
  61. package/dist/voice/avatar/datastream_io.cjs +6 -0
  62. package/dist/voice/avatar/datastream_io.cjs.map +1 -1
  63. package/dist/voice/avatar/datastream_io.d.cts +1 -0
  64. package/dist/voice/avatar/datastream_io.d.ts +1 -0
  65. package/dist/voice/avatar/datastream_io.d.ts.map +1 -1
  66. package/dist/voice/avatar/datastream_io.js +6 -0
  67. package/dist/voice/avatar/datastream_io.js.map +1 -1
  68. package/dist/voice/background_audio.cjs.map +1 -1
  69. package/dist/voice/generation.cjs +14 -5
  70. package/dist/voice/generation.cjs.map +1 -1
  71. package/dist/voice/generation.d.cts +3 -2
  72. package/dist/voice/generation.d.ts +3 -2
  73. package/dist/voice/generation.d.ts.map +1 -1
  74. package/dist/voice/generation.js +14 -5
  75. package/dist/voice/generation.js.map +1 -1
  76. package/dist/voice/io.cjs +12 -0
  77. package/dist/voice/io.cjs.map +1 -1
  78. package/dist/voice/io.d.cts +19 -1
  79. package/dist/voice/io.d.ts +19 -1
  80. package/dist/voice/io.d.ts.map +1 -1
  81. package/dist/voice/io.js +12 -0
  82. package/dist/voice/io.js.map +1 -1
  83. package/dist/voice/recorder_io/recorder_io.cjs +91 -28
  84. package/dist/voice/recorder_io/recorder_io.cjs.map +1 -1
  85. package/dist/voice/recorder_io/recorder_io.d.cts +7 -1
  86. package/dist/voice/recorder_io/recorder_io.d.ts +7 -1
  87. package/dist/voice/recorder_io/recorder_io.d.ts.map +1 -1
  88. package/dist/voice/recorder_io/recorder_io.js +91 -28
  89. package/dist/voice/recorder_io/recorder_io.js.map +1 -1
  90. package/dist/voice/room_io/_input.cjs +40 -11
  91. package/dist/voice/room_io/_input.cjs.map +1 -1
  92. package/dist/voice/room_io/_input.d.cts +4 -1
  93. package/dist/voice/room_io/_input.d.ts +4 -1
  94. package/dist/voice/room_io/_input.d.ts.map +1 -1
  95. package/dist/voice/room_io/_input.js +31 -2
  96. package/dist/voice/room_io/_input.js.map +1 -1
  97. package/dist/voice/room_io/_output.cjs +6 -0
  98. package/dist/voice/room_io/_output.cjs.map +1 -1
  99. package/dist/voice/room_io/_output.d.cts +1 -0
  100. package/dist/voice/room_io/_output.d.ts +1 -0
  101. package/dist/voice/room_io/_output.d.ts.map +1 -1
  102. package/dist/voice/room_io/_output.js +6 -0
  103. package/dist/voice/room_io/_output.js.map +1 -1
  104. package/dist/voice/room_io/room_io.cjs.map +1 -1
  105. package/dist/voice/room_io/room_io.d.cts +2 -2
  106. package/dist/voice/room_io/room_io.d.ts +2 -2
  107. package/dist/voice/room_io/room_io.d.ts.map +1 -1
  108. package/dist/voice/room_io/room_io.js.map +1 -1
  109. package/dist/voice/speech_handle.cjs +2 -0
  110. package/dist/voice/speech_handle.cjs.map +1 -1
  111. package/dist/voice/speech_handle.d.cts +3 -0
  112. package/dist/voice/speech_handle.d.ts +3 -0
  113. package/dist/voice/speech_handle.d.ts.map +1 -1
  114. package/dist/voice/speech_handle.js +2 -0
  115. package/dist/voice/speech_handle.js.map +1 -1
  116. package/dist/voice/testing/index.cjs +2 -0
  117. package/dist/voice/testing/index.cjs.map +1 -1
  118. package/dist/voice/testing/index.d.cts +1 -1
  119. package/dist/voice/testing/index.d.ts +1 -1
  120. package/dist/voice/testing/index.d.ts.map +1 -1
  121. package/dist/voice/testing/index.js +2 -0
  122. package/dist/voice/testing/index.js.map +1 -1
  123. package/dist/voice/testing/run_result.cjs +294 -5
  124. package/dist/voice/testing/run_result.cjs.map +1 -1
  125. package/dist/voice/testing/run_result.d.cts +149 -1
  126. package/dist/voice/testing/run_result.d.ts +149 -1
  127. package/dist/voice/testing/run_result.d.ts.map +1 -1
  128. package/dist/voice/testing/run_result.js +293 -5
  129. package/dist/voice/testing/run_result.js.map +1 -1
  130. package/package.json +1 -1
  131. package/src/inference/api_protos.ts +83 -0
  132. package/src/inference/stt.ts +39 -22
  133. package/src/stt/stt.ts +21 -0
  134. package/src/telemetry/traces.ts +6 -2
  135. package/src/utils.ts +7 -0
  136. package/src/voice/agent.ts +9 -0
  137. package/src/voice/agent_activity.ts +72 -26
  138. package/src/voice/agent_session.ts +6 -5
  139. package/src/voice/audio_recognition.ts +2 -0
  140. package/src/voice/avatar/datastream_io.ts +8 -0
  141. package/src/voice/generation.ts +24 -12
  142. package/src/voice/io.ts +27 -5
  143. package/src/voice/recorder_io/recorder_io.ts +123 -31
  144. package/src/voice/room_io/_input.ts +32 -4
  145. package/src/voice/room_io/_output.ts +8 -0
  146. package/src/voice/room_io/room_io.ts +3 -1
  147. package/src/voice/speech_handle.ts +4 -0
  148. package/src/voice/testing/index.ts +1 -0
  149. package/src/voice/testing/run_result.ts +373 -12
package/src/inference/stt.ts CHANGED
@@ -16,6 +16,12 @@ import {
 } from '../stt/index.js';
 import { type APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS } from '../types.js';
 import { type AudioBuffer, Event, Task, cancelAndWait, shortuuid, waitForAbort } from '../utils.js';
+import type { TimedString } from '../voice/io.js';
+import {
+  type SttServerEvent,
+  type SttTranscriptEvent,
+  sttServerEventSchema,
+} from './api_protos.js';
 import { type AnyString, connectWs, createAccessToken } from './utils.js';

 export type DeepgramModels =
@@ -122,7 +128,7 @@ export class STT<TModel extends STTModels> extends BaseSTT {
     apiSecret?: string;
     modelOptions?: STTOptions<TModel>;
   }) {
-    super({ streaming: true, interimResults: true });
+    super({ streaming: true, interimResults: true, alignedTranscript: 'word' });

     const {
       model,
@@ -271,7 +277,6 @@ export class SpeechStream<TModel extends STTModels> extends BaseSpeechStream {
     let closing = false;
     let finalReceived = false;

-    type SttServerEvent = Record<string, any>;
     const eventChannel = createStreamChannel<SttServerEvent>();

     const resourceCleanup = () => {
@@ -380,10 +385,19 @@ export class SpeechStream<TModel extends STTModels> extends BaseSpeechStream {
         if (signal.aborted) return;
         if (result.done) return;

-        const json = result.value;
-        const type = json.type as string | undefined;
+        // Parse and validate with Zod schema
+        const parseResult = await sttServerEventSchema.safeParseAsync(result.value);
+        if (!parseResult.success) {
+          this.#logger.warn(
+            { error: parseResult.error, rawData: result.value },
+            'Failed to parse STT server event',
+          );
+          continue;
+        }
+
+        const event: SttServerEvent = parseResult.data;

-        switch (type) {
+        switch (event.type) {
           case 'session.created':
           case 'session.finalized':
             break;
@@ -392,21 +406,15 @@ export class SpeechStream<TModel extends STTModels> extends BaseSpeechStream {
             resourceCleanup();
             break;
           case 'interim_transcript':
-            this.processTranscript(json, false);
+            this.processTranscript(event, false);
             break;
           case 'final_transcript':
-            this.processTranscript(json, true);
+            this.processTranscript(event, true);
             break;
           case 'error':
-            this.#logger.error({ error: json }, 'Received error from LiveKit STT');
+            this.#logger.error({ error: event }, 'Received error from LiveKit STT');
             resourceCleanup();
-            throw new APIError(`LiveKit STT returned error: ${JSON.stringify(json)}`);
-          default:
-            this.#logger.warn(
-              { message: json },
-              'Received unexpected message from LiveKit STT',
-            );
-            break;
+            throw new APIError(`LiveKit STT returned error: ${JSON.stringify(event)}`);
         }
       }
     } finally {
@@ -457,13 +465,13 @@ export class SpeechStream<TModel extends STTModels> extends BaseSpeechStream {
     }
   }

-  private processTranscript(data: Record<string, any>, isFinal: boolean) {
+  private processTranscript(data: SttTranscriptEvent, isFinal: boolean) {
     // Check if queue is closed to avoid race condition during disconnect
     if (this.queue.closed) return;

-    const requestId = data.request_id ?? this.requestId;
-    const text = data.transcript ?? '';
-    const language = data.language ?? this.opts.language ?? 'en';
+    const requestId = data.session_id || this.requestId;
+    const text = data.transcript;
+    const language = data.language || this.opts.language || 'en';

     if (!text && !isFinal) return;

@@ -476,10 +484,19 @@ export class SpeechStream<TModel extends STTModels> extends BaseSpeechStream {

     const speechData: SpeechData = {
       language,
-      startTime: data.start ?? 0,
-      endTime: data.duration ?? 0,
-      confidence: data.confidence ?? 1.0,
+      startTime: this.startTimeOffset + data.start,
+      endTime: this.startTimeOffset + data.start + data.duration,
+      confidence: data.confidence,
       text,
+      words: data.words.map(
+        (word): TimedString => ({
+          text: word.word,
+          startTime: word.start + this.startTimeOffset,
+          endTime: word.end + this.startTimeOffset,
+          startTimeOffset: this.startTimeOffset,
+          confidence: word.confidence,
+        }),
+      ),
     };

     if (isFinal) {
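With word-level alignment enabled above, final transcripts now carry per-word timing. A minimal consumer-side sketch, assuming SpeechEvent exposes an alternatives array whose first entry is the SpeechData shown in this diff (import path and stream setup are illustrative):

import { SpeechEventType, type SpeechEvent } from '@livekit/agents';

// Log each recognized word with its absolute timestamp on the session
// timeline. `words` is the optional TimedString[] added to SpeechData in
// this release; providers without aligned transcripts simply omit it.
async function logAlignedWords(events: AsyncIterable<SpeechEvent>) {
  for await (const ev of events) {
    if (ev.type !== SpeechEventType.FINAL_TRANSCRIPT) continue;
    const alt = ev.alternatives?.[0];
    for (const word of alt?.words ?? []) {
      // startTime/endTime already include the stream's startTimeOffset
      console.log(`${word.startTime.toFixed(2)}s-${word.endTime.toFixed(2)}s ${word.text}`);
    }
  }
}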
package/src/stt/stt.ts CHANGED
@@ -13,6 +13,7 @@ import { DeferredReadableStream } from '../stream/deferred_stream.js';
 import { type APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS, intervalForRetry } from '../types.js';
 import type { AudioBuffer } from '../utils.js';
 import { AsyncIterableQueue, delay, startSoon, toError } from '../utils.js';
+import type { TimedString } from '../voice/index.js';

 /** Indicates start/middle/end of speech */
 export enum SpeechEventType {
@@ -53,6 +54,7 @@ export interface SpeechData {
   startTime: number;
   endTime: number;
   confidence: number;
+  words?: TimedString[];
 }

 export interface RecognitionUsage {
@@ -76,6 +78,13 @@ export interface SpeechEvent {
 export interface STTCapabilities {
   streaming: boolean;
   interimResults: boolean;
+  /**
+   * Whether this STT supports aligned transcripts with word/chunk timestamps.
+   * - 'word': Provider returns word-level timestamps
+   * - 'chunk': Provider returns chunk-level timestamps (e.g., sentence/phrase boundaries)
+   * - false: Provider does not support aligned transcripts
+   */
+  alignedTranscript?: 'word' | 'chunk' | false;
 }

 export interface STTError {
@@ -176,6 +185,7 @@ export abstract class SpeechStream implements AsyncIterableIterator<SpeechEvent>
   private deferredInputStream: DeferredReadableStream<AudioFrame>;
   private logger = log();
   private _connOptions: APIConnectOptions;
+  private _startTimeOffset: number = 0;

   protected abortController = new AbortController();

@@ -300,6 +310,17 @@ export abstract class SpeechStream implements AsyncIterableIterator<SpeechEvent>
     return this.abortController.signal;
   }

+  get startTimeOffset(): number {
+    return this._startTimeOffset;
+  }
+
+  set startTimeOffset(value: number) {
+    if (value < 0) {
+      throw new Error('startTimeOffset must be non-negative');
+    }
+    this._startTimeOffset = value;
+  }
+
   updateInputStream(audioStream: ReadableStream<AudioFrame>) {
     this.deferredInputStream.setSource(audioStream);
   }
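Downstream features can be gated on the new capability flag instead of probing event payloads. A short sketch, assuming the STT type and its capabilities getter are importable as shown (paths illustrative):

import type { STT } from '@livekit/agents';

// `alignedTranscript` is optional on STTCapabilities, so a missing value is
// treated the same as `false` (no aligned transcript support).
function supportsWordTimestamps(stt: STT): boolean {
  return stt.capabilities.alignedTranscript === 'word';
}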
package/src/telemetry/traces.ts CHANGED
@@ -37,6 +37,8 @@ export interface StartSpanOptions {
   attributes?: Attributes;
   /** Whether to end the span when the function exits (default: true) */
   endOnExit?: boolean;
+  /** Optional start time for the span in milliseconds (Date.now() format) */
+  startTime?: number;
 }

 /**
@@ -79,10 +81,12 @@ class DynamicTracer {
    */
   startSpan(options: StartSpanOptions): Span {
     const ctx = options.context || otelContext.active();
+
     const span = this.tracer.startSpan(
       options.name,
       {
         attributes: options.attributes,
+        startTime: options.startTime,
       },
       ctx,
     );
@@ -101,7 +105,7 @@ class DynamicTracer {
   async startActiveSpan<T>(fn: (span: Span) => Promise<T>, options: StartSpanOptions): Promise<T> {
     const ctx = options.context || otelContext.active();
     const endOnExit = options.endOnExit === undefined ? true : options.endOnExit; // default true
-    const opts: SpanOptions = { attributes: options.attributes };
+    const opts: SpanOptions = { attributes: options.attributes, startTime: options.startTime };

     // Directly return the tracer's startActiveSpan result - it handles async correctly
     return await this.tracer.startActiveSpan(options.name, opts, ctx, async (span) => {
@@ -125,7 +129,7 @@ class DynamicTracer {
   startActiveSpanSync<T>(fn: (span: Span) => T, options: StartSpanOptions): T {
     const ctx = options.context || otelContext.active();
     const endOnExit = options.endOnExit === undefined ? true : options.endOnExit; // default true
-    const opts: SpanOptions = { attributes: options.attributes };
+    const opts: SpanOptions = { attributes: options.attributes, startTime: options.startTime };

     return this.tracer.startActiveSpan(options.name, opts, ctx, (span) => {
       try {
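The pass-through works because SpanOptions.startTime in @opentelemetry/api accepts a TimeInput, which includes epoch milliseconds, so spans can be backdated to when the underlying event actually began. A standalone sketch against the raw OTel API:

import { trace } from '@opentelemetry/api';

const tracer = trace.getTracer('example');

// VAD reports the speech has already been running for 1.5s by the time the
// callback fires; start the span retroactively so its duration covers the
// whole utterance, then end it at an explicit timestamp.
const speechDurationMs = 1500;
const span = tracer.startSpan('user_speaking', { startTime: Date.now() - speechDurationMs });
span.end(Date.now());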
package/src/utils.ts CHANGED
@@ -125,6 +125,7 @@ export class Future<T = void> {
   #resolvePromise!: (value: T) => void;
   #rejectPromise!: (error: Error) => void;
   #done: boolean = false;
+  #rejected: boolean = false;

   constructor() {
     this.#await = new Promise<T>((resolve, reject) => {
@@ -141,6 +142,11 @@ export class Future<T = void> {
     return this.#done;
   }

+  /** Whether the future was rejected (cancelled) */
+  get rejected() {
+    return this.#rejected;
+  }
+
   resolve(value: T) {
     this.#done = true;
     this.#resolvePromise(value);
@@ -148,6 +154,7 @@ export class Future<T = void> {

   reject(error: Error) {
     this.#done = true;
+    this.#rejected = true;
     this.#rejectPromise(error);
   }
 }
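A short sketch of what the new flag distinguishes: `done` is true after either resolve() or reject(), so `done` alone cannot tell a completed future from a cancelled one (the import path is illustrative):

import { Future } from '@livekit/agents';

const fut = new Future<number>();
fut.await.catch(() => {}); // swallow the rejection for this sketch

fut.reject(new Error('cancelled'));

console.log(fut.done);     // true - the future has settled...
console.log(fut.rejected); // true - ...but by rejection, not with a value

if (fut.done && !fut.rejected) {
  // only here is it safe to treat the future as successfully resolved
}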
package/src/voice/agent.ts CHANGED
@@ -271,6 +271,15 @@ export class Agent<UserData = any> {

     const connOptions = activity.agentSession.connOptions.sttConnOptions;
     const stream = wrapped_stt.stream({ connOptions });
+
+    // Set startTimeOffset to provide linear timestamps across reconnections
+    const audioInputStartedAt =
+      activity.agentSession._recorderIO?.recordingStartedAt ?? // Use recording start time if available
+      activity.agentSession._startedAt ?? // Fallback to session start time
+      Date.now(); // Fallback to current time
+
+    stream.startTimeOffset = (Date.now() - audioInputStartedAt) / 1000;
+
     stream.updateInputStream(audio);

     let cleaned = false;
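The offset arithmetic as a worked example (values illustrative): the offset is the number of seconds between the start of the audio input and the creation of this STT stream, so provider-relative timestamps stay on one linear timeline across reconnections:

// Recording began 12.5s before this STT stream was (re)created.
const recordingStartedAt = Date.now() - 12_500;
const startTimeOffset = (Date.now() - recordingStartedAt) / 1000; // 12.5

// A word the provider reports at 1.2s into *this* stream maps to ~13.7s on
// the recording's timeline, the same scale an earlier, pre-reconnect stream
// would have used.
const absoluteStart = startTimeOffset + 1.2; // ≈ 13.7
console.log(absoluteStart);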
package/src/voice/agent_activity.ts CHANGED
@@ -4,7 +4,7 @@
 import { Mutex } from '@livekit/mutex';
 import type { AudioFrame } from '@livekit/rtc-node';
 import type { Span } from '@opentelemetry/api';
-import { ROOT_CONTEXT, trace } from '@opentelemetry/api';
+import { ROOT_CONTEXT, context as otelContext, trace } from '@opentelemetry/api';
 import { Heap } from 'heap-js';
 import { AsyncLocalStorage } from 'node:async_hooks';
 import { ReadableStream } from 'node:stream/web';
@@ -637,9 +637,12 @@ export class AgentActivity implements RecognitionHooks {
   }

   // recognition hooks
-
-  onStartOfSpeech(_ev: VADEvent): void {
-    this.agentSession._updateUserState('speaking');
+  onStartOfSpeech(ev: VADEvent): void {
+    let speechStartTime = Date.now();
+    if (ev) {
+      speechStartTime = speechStartTime - ev.speechDuration;
+    }
+    this.agentSession._updateUserState('speaking', speechStartTime);
   }

   onEndOfSpeech(ev: VADEvent): void {
@@ -1168,6 +1171,8 @@ export class AgentActivity implements RecognitionHooks {
     replyAbortController: AbortController,
     audio?: ReadableStream<AudioFrame> | null,
   ): Promise<void> {
+    speechHandle._agentTurnContext = otelContext.active();
+
     speechHandleStorage.enterWith(speechHandle);

     const transcriptionOutput = this.agentSession.output.transcriptionEnabled
@@ -1212,13 +1217,18 @@ export class AgentActivity implements RecognitionHooks {
       tasks.push(textForwardTask);
     }

-    const onFirstFrame = () => {
-      this.agentSession._updateAgentState('speaking');
+    const onFirstFrame = (startedSpeakingAt?: number) => {
+      this.agentSession._updateAgentState('speaking', {
+        startTime: startedSpeakingAt,
+        otelContext: speechHandle._agentTurnContext,
+      });
     };

     if (!audioOutput) {
       if (textOut) {
-        textOut.firstTextFut.await.finally(onFirstFrame);
+        textOut.firstTextFut.await
+          .then(() => onFirstFrame())
+          .catch(() => this.logger.debug('firstTextFut cancelled before first frame'));
       }
     } else {
       let audioOut: _AudioOut | null = null;
@@ -1249,7 +1259,9 @@ export class AgentActivity implements RecognitionHooks {
         tasks.push(forwardTask);
         audioOut = _audioOut;
       }
-      audioOut.firstFrameFut.await.finally(onFirstFrame);
+      audioOut.firstFrameFut.await
+        .then((ts) => onFirstFrame(ts))
+        .catch(() => this.logger.debug('firstFrameFut cancelled before first frame'));
     }

     await speechHandle.waitIfNotInterrupted(tasks.map((task) => task.result));
@@ -1303,6 +1315,8 @@ export class AgentActivity implements RecognitionHooks {
     toolsMessages?: ChatItem[];
     span: Span;
   }): Promise<void> => {
+    speechHandle._agentTurnContext = otelContext.active();
+
     span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);
     if (instructions) {
       span.setAttribute(traceTypes.ATTR_INSTRUCTIONS, instructions);
@@ -1402,8 +1416,11 @@ export class AgentActivity implements RecognitionHooks {
       textOut = _textOut;
     }

-    const onFirstFrame = () => {
-      this.agentSession._updateAgentState('speaking');
+    const onFirstFrame = (startedSpeakingAt?: number) => {
+      this.agentSession._updateAgentState('speaking', {
+        startTime: startedSpeakingAt,
+        otelContext: speechHandle._agentTurnContext,
+      });
     };

     let audioOut: _AudioOut | null = null;
@@ -1416,12 +1433,16 @@ export class AgentActivity implements RecognitionHooks {
         );
         audioOut = _audioOut;
         tasks.push(forwardTask);
-        audioOut.firstFrameFut.await.finally(onFirstFrame);
+        audioOut.firstFrameFut.await
+          .then((ts) => onFirstFrame(ts))
+          .catch(() => this.logger.debug('firstFrameFut cancelled before first frame'));
       } else {
         throw Error('ttsStream is null when audioOutput is enabled');
       }
     } else {
-      textOut?.firstTextFut.await.finally(onFirstFrame);
+      textOut?.firstTextFut.await
+        .then(() => onFirstFrame())
+        .catch(() => this.logger.debug('firstTextFut cancelled before first frame'));
     }

     //TODO(AJS-272): before executing tools, make sure we generated all the text
@@ -1462,8 +1483,14 @@ export class AgentActivity implements RecognitionHooks {
         msg.createdAt = replyStartedAt;
       }
       this.agent._chatCtx.insert(toolsMessages);
-      // Also add to session history (matches Python agent_session.py _tool_items_added)
-      this.agentSession._toolItemsAdded(toolsMessages as (FunctionCall | FunctionCallOutput)[]);
+      // Only add FunctionCallOutput items to session history since FunctionCall items
+      // were already added by onToolExecutionStarted when the tool execution began
+      const toolCallOutputs = toolsMessages.filter(
+        (m): m is FunctionCallOutput => m.type === 'function_call_output',
+      );
+      if (toolCallOutputs.length > 0) {
+        this.agentSession._toolItemsAdded(toolCallOutputs);
+      }
     }

     if (speechHandle.interrupted) {
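The `(m): m is FunctionCallOutput` predicate above is what lets TypeScript narrow the filtered array from ChatItem[] to FunctionCallOutput[]. The idiom in isolation, with simplified stand-ins for the package's chat item types:

interface FunctionCall { type: 'function_call'; name: string }
interface FunctionCallOutput { type: 'function_call_output'; output: string }
type ChatItem = FunctionCall | FunctionCallOutput;

const items: ChatItem[] = [
  { type: 'function_call', name: 'lookup' },
  { type: 'function_call_output', output: '42' },
];

// Without the predicate, filter() would still return ChatItem[]; with it,
// `outputs` is typed FunctionCallOutput[] and `.output` is accessible.
const outputs = items.filter(
  (m): m is FunctionCallOutput => m.type === 'function_call_output',
);
console.log(outputs[0]?.output); // '42'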
@@ -1487,10 +1514,10 @@ export class AgentActivity implements RecognitionHooks {

     if (audioOutput) {
       const playbackEv = await audioOutput.waitForPlayout();
-      if (audioOut?.firstFrameFut.done) {
+      if (audioOut?.firstFrameFut.done && !audioOut.firstFrameFut.rejected) {
         // playback EV is valid only if the first frame was already played
         this.logger.info(
-          { speech_id: speechHandle.id, playbackPosition: playbackEv.playbackPosition },
+          { speech_id: speechHandle.id, playbackPositionInS: playbackEv.playbackPosition },
           'playout interrupted',
         );
         if (playbackEv.synchronizedTranscript) {
@@ -1656,8 +1683,18 @@ export class AgentActivity implements RecognitionHooks {
       for (const msg of toolMessages) {
         msg.createdAt = replyStartedAt;
       }
+
       this.agent._chatCtx.insert(toolMessages);
-      this.agentSession._toolItemsAdded(toolMessages as (FunctionCall | FunctionCallOutput)[]);
+
+      // Only add FunctionCallOutput items to session history since FunctionCall items
+      // were already added by onToolExecutionStarted when the tool execution began
+      const toolCallOutputs = toolMessages.filter(
+        (m): m is FunctionCallOutput => m.type === 'function_call_output',
+      );
+
+      if (toolCallOutputs.length > 0) {
+        this.agentSession._toolItemsAdded(toolCallOutputs);
+      }
     }
   };

@@ -1725,6 +1762,8 @@ export class AgentActivity implements RecognitionHooks {
     replyAbortController: AbortController;
     span: Span;
   }): Promise<void> {
+    speechHandle._agentTurnContext = otelContext.active();
+
     span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);

     speechHandleStorage.enterWith(speechHandle);
@@ -1762,8 +1801,11 @@ export class AgentActivity implements RecognitionHooks {
       return;
     }

-    const onFirstFrame = () => {
-      this.agentSession._updateAgentState('speaking');
+    const onFirstFrame = (startedSpeakingAt?: number) => {
+      this.agentSession._updateAgentState('speaking', {
+        startTime: startedSpeakingAt,
+        otelContext: speechHandle._agentTurnContext,
+      });
     };

     const readMessages = async (
@@ -1851,10 +1893,14 @@ export class AgentActivity implements RecognitionHooks {
         );
         forwardTasks.push(forwardTask);
         audioOut = _audioOut;
-        audioOut.firstFrameFut.await.finally(onFirstFrame);
+        audioOut.firstFrameFut.await
+          .then((ts) => onFirstFrame(ts))
+          .catch(() => this.logger.debug('firstFrameFut cancelled before first frame'));
       }
     } else if (textOut) {
-      textOut.firstTextFut.await.finally(onFirstFrame);
+      textOut.firstTextFut.await
+        .then(() => onFirstFrame())
+        .catch(() => this.logger.debug('firstTextFut cancelled before first frame'));
     }
     outputs.push([msg.messageId, textOut, audioOut, msgModalities]);
   }
@@ -1955,11 +2001,11 @@ export class AgentActivity implements RecognitionHooks {
     if (audioOutput) {
       audioOutput.clearBuffer();
       const playbackEv = await audioOutput.waitForPlayout();
-      let playbackPosition = playbackEv.playbackPosition;
-      if (audioOut?.firstFrameFut.done) {
+      let playbackPositionInS = playbackEv.playbackPosition;
+      if (audioOut?.firstFrameFut.done && !audioOut.firstFrameFut.rejected) {
         // playback EV is valid only if the first frame was already played
         this.logger.info(
-          { speech_id: speechHandle.id, playbackPosition: playbackEv.playbackPosition },
+          { speech_id: speechHandle.id, playbackPositionInS },
          'playout interrupted',
        );
        if (playbackEv.synchronizedTranscript) {
@@ -1967,13 +2013,13 @@ export class AgentActivity implements RecognitionHooks {
       }
     } else {
       forwardedText = '';
-      playbackPosition = 0;
+      playbackPositionInS = 0;
     }

     // truncate server-side message
     this.realtimeSession.truncate({
       messageId: msgId,
-      audioEndMs: Math.floor(playbackPosition),
+      audioEndMs: Math.floor(playbackPositionInS * 1000),
       modalities: msgModalities,
       audioTranscript: forwardedText,
     });
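The unit handling in the truncate call as plain arithmetic, reading playbackPosition from waitForPlayout() as seconds (which the InS suffix above makes explicit):

const playbackPositionInS = 2.35; // seconds of audio actually played
const audioEndMs = Math.floor(playbackPositionInS * 1000); // 2350 ms, not 2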
package/src/voice/agent_session.ts CHANGED
@@ -677,7 +677,7 @@ export class AgentSession<
   }

   /** @internal */
-  _updateAgentState(state: AgentState) {
+  _updateAgentState(state: AgentState, options?: { startTime?: number; otelContext?: Context }) {
     if (this._agentState === state) {
       return;
     }
@@ -690,7 +690,8 @@ export class AgentSession<
     if (this.agentSpeakingSpan === undefined) {
       this.agentSpeakingSpan = tracer.startSpan({
         name: 'agent_speaking',
-        context: this.rootSpanContext,
+        context: options?.otelContext ?? this.rootSpanContext,
+        startTime: options?.startTime,
       });

       // TODO(brian): PR4 - Set participant attributes if roomIO.room.localParticipant is available
@@ -719,7 +720,7 @@ export class AgentSession<
   }

   /** @internal */
-  _updateUserState(state: UserState, _lastSpeakingTime?: number) {
+  _updateUserState(state: UserState, lastSpeakingTime?: number) {
     if (this.userState === state) {
       return;
     }
@@ -728,13 +729,13 @@ export class AgentSession<
       this.userSpeakingSpan = tracer.startSpan({
         name: 'user_speaking',
         context: this.rootSpanContext,
+        startTime: lastSpeakingTime,
       });

       // TODO(brian): PR4 - Set participant attributes if roomIO.linkedParticipant is available
       // (Ref: Python agent_session.py line 1192-1195)
     } else if (this.userSpeakingSpan !== undefined) {
-      // TODO(brian): PR4 - Set ATTR_END_TIME attribute with lastSpeakingTime if available
-      this.userSpeakingSpan.end();
+      this.userSpeakingSpan.end(lastSpeakingTime);
       this.userSpeakingSpan = undefined;
     }

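Passing an otelContext through lets the agent_speaking span nest under the turn span even when it is created from a callback that runs outside the turn's active scope. A standalone sketch with the raw OTel API:

import { context as otelContext, trace } from '@opentelemetry/api';

const tracer = trace.getTracer('example');

// Capture the context while the turn span is active...
const turnSpan = tracer.startSpan('agent_turn');
const turnContext = trace.setSpan(otelContext.active(), turnSpan);

// ...and later create the speaking span as its child by passing the captured
// context explicitly instead of relying on the (now different) active one.
const speakingSpan = tracer.startSpan('agent_speaking', {}, turnContext);
speakingSpan.end();
turnSpan.end();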
package/src/voice/audio_recognition.ts CHANGED
@@ -566,9 +566,11 @@ export class AudioRecognition {
     this.speaking = true;

     if (!this.userTurnSpan) {
+      const startTime = Date.now() - ev.speechDuration;
       this.userTurnSpan = tracer.startSpan({
         name: 'user_turn',
         context: this.rootSpanContext,
+        startTime,
       });
     }

package/src/voice/avatar/datastream_io.ts CHANGED
@@ -47,6 +47,7 @@ export class DataStreamAudioOutput extends AudioOutput {
   private started: boolean = false;
   private lock = new Mutex();
   private startTask?: Task<void>;
+  private firstFrameEmitted: boolean = false;

   #logger = log();

@@ -146,6 +147,11 @@ export class DataStreamAudioOutput extends AudioOutput {
     await this.startTask.result;
     await super.captureFrame(frame);

+    if (!this.firstFrameEmitted) {
+      this.firstFrameEmitted = true;
+      this.onPlaybackStarted(Date.now());
+    }
+
     if (!this.streamWriter) {
       this.streamWriter = await this.room.localParticipant!.streamBytes({
         name: shortuuid('AUDIO_'),
@@ -174,6 +180,8 @@ export class DataStreamAudioOutput extends AudioOutput {
     this.streamWriter.close().finally(() => {
       this.streamWriter = undefined;
     });
+
+    this.firstFrameEmitted = false;
   }

   clearBuffer(): void {
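The once-per-stream latch used above, reduced to a standalone sketch (the class and callback names are illustrative, not part of the package API):

// Report a playback-started timestamp exactly once per stream; reset the
// latch on flush so the next utterance reports its own first frame.
class FirstFrameLatch {
  private emitted = false;

  constructor(private readonly onStarted: (timestampMs: number) => void) {}

  onFrame(): void {
    if (this.emitted) return;
    this.emitted = true;
    this.onStarted(Date.now());
  }

  reset(): void {
    this.emitted = false;
  }
}

const latch = new FirstFrameLatch((ts) => console.log('playback started at', ts));
latch.onFrame(); // fires the callback
latch.onFrame(); // no-op
latch.reset();   // the next onFrame() will fire again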
package/src/voice/generation.ts CHANGED
@@ -27,7 +27,7 @@ import { traceTypes, tracer } from '../telemetry/index.js';
 import { Future, Task, shortuuid, toError, waitForAbort } from '../utils.js';
 import { type Agent, type ModelSettings, asyncLocalStorage, isStopResponse } from './agent.js';
 import type { AgentSession } from './agent_session.js';
-import type { AudioOutput, LLMNode, TTSNode, TextOutput } from './io.js';
+import { AudioOutput, type LLMNode, type TTSNode, type TextOutput } from './io.js';
 import { RunContext } from './run_context.js';
 import type { SpeechHandle } from './speech_handle.js';
@@ -608,7 +608,8 @@ export function performTextForwarding(

 export interface _AudioOut {
   audio: Array<AudioFrame>;
-  firstFrameFut: Future;
+  /** Future that will be set with the timestamp of the first frame's capture */
+  firstFrameFut: Future<number>;
 }

 async function forwardAudio(
@@ -620,7 +621,16 @@ async function forwardAudio(
   const reader = ttsStream.getReader();
   let resampler: AudioResampler | null = null;

+  const onPlaybackStarted = (ev: { createdAt: number }) => {
+    if (!out.firstFrameFut.done) {
+      out.firstFrameFut.resolve(ev.createdAt);
+    }
+  };
+
   try {
+    audioOuput.on(AudioOutput.EVENT_PLAYBACK_STARTED, onPlaybackStarted);
+    audioOuput.resume();
+
     while (true) {
       if (signal?.aborted) {
         break;
@@ -647,20 +657,21 @@ async function forwardAudio(
       } else {
         await audioOuput.captureFrame(frame);
       }
-
-      // set the first frame future if not already set
-      // (after completing the first frame)
-      if (!out.firstFrameFut.done) {
-        out.firstFrameFut.resolve();
-      }
     }
-  } finally {
-    reader?.releaseLock();
+
     if (resampler) {
       for (const f of resampler.flush()) {
         await audioOuput.captureFrame(f);
       }
     }
+  } finally {
+    audioOuput.off(AudioOutput.EVENT_PLAYBACK_STARTED, onPlaybackStarted);
+
+    if (!out.firstFrameFut.done) {
+      out.firstFrameFut.reject(new Error('audio forwarding cancelled before playback started'));
+    }
+
+    reader?.releaseLock();
     audioOuput.flush();
   }
 }
@@ -670,10 +681,11 @@ export function performAudioForwarding(
   audioOutput: AudioOutput,
   controller: AbortController,
 ): [Task<void>, _AudioOut] {
-  const out = {
+  const out: _AudioOut = {
     audio: [],
-    firstFrameFut: new Future(),
+    firstFrameFut: new Future<number>(),
   };
+
   return [
     Task.from(
       (controller) => forwardAudio(ttsStream, audioOutput, out, controller.signal),
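The settle-on-event / reject-on-cancel pattern from forwardAudio, as a standalone sketch using Node's EventEmitter in place of AudioOutput (event and field names are illustrative):

import { EventEmitter } from 'node:events';

// Resolve with the timestamp carried by the first 'playbackStarted' event,
// or reject if the stream is torn down before playback ever begins,
// mirroring how firstFrameFut is settled in the diff above.
function firstPlayback(emitter: EventEmitter, signal: AbortSignal): Promise<number> {
  return new Promise<number>((resolve, reject) => {
    const onStarted = (ev: { createdAt: number }) => resolve(ev.createdAt);
    emitter.once('playbackStarted', onStarted);
    signal.addEventListener('abort', () => {
      emitter.off('playbackStarted', onStarted);
      reject(new Error('audio forwarding cancelled before playback started'));
    });
  });
}

const emitter = new EventEmitter();
const ac = new AbortController();
firstPlayback(emitter, ac.signal).then(
  (ts) => console.log('first frame at', ts),
  (err) => console.log('cancelled:', (err as Error).message),
);
emitter.emit('playbackStarted', { createdAt: Date.now() });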