@livekit/agents 1.0.37 → 1.0.39

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155)
  1. package/dist/cli.cjs.map +1 -1
  2. package/dist/inference/api_protos.cjs +68 -0
  3. package/dist/inference/api_protos.cjs.map +1 -1
  4. package/dist/inference/api_protos.d.cts +345 -4
  5. package/dist/inference/api_protos.d.ts +345 -4
  6. package/dist/inference/api_protos.d.ts.map +1 -1
  7. package/dist/inference/api_protos.js +60 -0
  8. package/dist/inference/api_protos.js.map +1 -1
  9. package/dist/inference/llm.cjs +7 -3
  10. package/dist/inference/llm.cjs.map +1 -1
  11. package/dist/inference/llm.d.cts +5 -6
  12. package/dist/inference/llm.d.ts +5 -6
  13. package/dist/inference/llm.d.ts.map +1 -1
  14. package/dist/inference/llm.js +7 -3
  15. package/dist/inference/llm.js.map +1 -1
  16. package/dist/inference/stt.cjs +32 -21
  17. package/dist/inference/stt.cjs.map +1 -1
  18. package/dist/inference/stt.d.cts +5 -4
  19. package/dist/inference/stt.d.ts +5 -4
  20. package/dist/inference/stt.d.ts.map +1 -1
  21. package/dist/inference/stt.js +34 -21
  22. package/dist/inference/stt.js.map +1 -1
  23. package/dist/inference/tts.cjs.map +1 -1
  24. package/dist/inference/tts.d.cts +10 -7
  25. package/dist/inference/tts.d.ts +10 -7
  26. package/dist/inference/tts.d.ts.map +1 -1
  27. package/dist/inference/tts.js.map +1 -1
  28. package/dist/ipc/inference_proc_executor.cjs.map +1 -1
  29. package/dist/ipc/job_proc_executor.cjs.map +1 -1
  30. package/dist/stt/stream_adapter.cjs +9 -1
  31. package/dist/stt/stream_adapter.cjs.map +1 -1
  32. package/dist/stt/stream_adapter.d.ts.map +1 -1
  33. package/dist/stt/stream_adapter.js +9 -1
  34. package/dist/stt/stream_adapter.js.map +1 -1
  35. package/dist/stt/stt.cjs +10 -0
  36. package/dist/stt/stt.cjs.map +1 -1
  37. package/dist/stt/stt.d.cts +12 -0
  38. package/dist/stt/stt.d.ts +12 -0
  39. package/dist/stt/stt.d.ts.map +1 -1
  40. package/dist/stt/stt.js +10 -0
  41. package/dist/stt/stt.js.map +1 -1
  42. package/dist/telemetry/traces.cjs +4 -3
  43. package/dist/telemetry/traces.cjs.map +1 -1
  44. package/dist/telemetry/traces.d.cts +2 -0
  45. package/dist/telemetry/traces.d.ts +2 -0
  46. package/dist/telemetry/traces.d.ts.map +1 -1
  47. package/dist/telemetry/traces.js +4 -3
  48. package/dist/telemetry/traces.js.map +1 -1
  49. package/dist/utils.cjs +11 -0
  50. package/dist/utils.cjs.map +1 -1
  51. package/dist/utils.d.cts +10 -0
  52. package/dist/utils.d.ts +10 -0
  53. package/dist/utils.d.ts.map +1 -1
  54. package/dist/utils.js +10 -0
  55. package/dist/utils.js.map +1 -1
  56. package/dist/voice/agent.cjs +6 -2
  57. package/dist/voice/agent.cjs.map +1 -1
  58. package/dist/voice/agent.d.ts.map +1 -1
  59. package/dist/voice/agent.js +6 -2
  60. package/dist/voice/agent.js.map +1 -1
  61. package/dist/voice/agent_activity.cjs +72 -37
  62. package/dist/voice/agent_activity.cjs.map +1 -1
  63. package/dist/voice/agent_activity.d.cts +2 -1
  64. package/dist/voice/agent_activity.d.ts +2 -1
  65. package/dist/voice/agent_activity.d.ts.map +1 -1
  66. package/dist/voice/agent_activity.js +73 -38
  67. package/dist/voice/agent_activity.js.map +1 -1
  68. package/dist/voice/agent_session.cjs +7 -5
  69. package/dist/voice/agent_session.cjs.map +1 -1
  70. package/dist/voice/agent_session.d.cts +5 -2
  71. package/dist/voice/agent_session.d.ts +5 -2
  72. package/dist/voice/agent_session.d.ts.map +1 -1
  73. package/dist/voice/agent_session.js +7 -5
  74. package/dist/voice/agent_session.js.map +1 -1
  75. package/dist/voice/audio_recognition.cjs +3 -1
  76. package/dist/voice/audio_recognition.cjs.map +1 -1
  77. package/dist/voice/audio_recognition.d.ts.map +1 -1
  78. package/dist/voice/audio_recognition.js +3 -1
  79. package/dist/voice/audio_recognition.js.map +1 -1
  80. package/dist/voice/avatar/datastream_io.cjs +6 -0
  81. package/dist/voice/avatar/datastream_io.cjs.map +1 -1
  82. package/dist/voice/avatar/datastream_io.d.cts +1 -0
  83. package/dist/voice/avatar/datastream_io.d.ts +1 -0
  84. package/dist/voice/avatar/datastream_io.d.ts.map +1 -1
  85. package/dist/voice/avatar/datastream_io.js +6 -0
  86. package/dist/voice/avatar/datastream_io.js.map +1 -1
  87. package/dist/voice/background_audio.cjs.map +1 -1
  88. package/dist/voice/generation.cjs +14 -5
  89. package/dist/voice/generation.cjs.map +1 -1
  90. package/dist/voice/generation.d.cts +3 -2
  91. package/dist/voice/generation.d.ts +3 -2
  92. package/dist/voice/generation.d.ts.map +1 -1
  93. package/dist/voice/generation.js +14 -5
  94. package/dist/voice/generation.js.map +1 -1
  95. package/dist/voice/io.cjs +12 -0
  96. package/dist/voice/io.cjs.map +1 -1
  97. package/dist/voice/io.d.cts +19 -1
  98. package/dist/voice/io.d.ts +19 -1
  99. package/dist/voice/io.d.ts.map +1 -1
  100. package/dist/voice/io.js +12 -0
  101. package/dist/voice/io.js.map +1 -1
  102. package/dist/voice/recorder_io/recorder_io.cjs +91 -28
  103. package/dist/voice/recorder_io/recorder_io.cjs.map +1 -1
  104. package/dist/voice/recorder_io/recorder_io.d.cts +7 -1
  105. package/dist/voice/recorder_io/recorder_io.d.ts +7 -1
  106. package/dist/voice/recorder_io/recorder_io.d.ts.map +1 -1
  107. package/dist/voice/recorder_io/recorder_io.js +91 -28
  108. package/dist/voice/recorder_io/recorder_io.js.map +1 -1
  109. package/dist/voice/room_io/_input.cjs +40 -11
  110. package/dist/voice/room_io/_input.cjs.map +1 -1
  111. package/dist/voice/room_io/_input.d.cts +4 -1
  112. package/dist/voice/room_io/_input.d.ts +4 -1
  113. package/dist/voice/room_io/_input.d.ts.map +1 -1
  114. package/dist/voice/room_io/_input.js +31 -2
  115. package/dist/voice/room_io/_input.js.map +1 -1
  116. package/dist/voice/room_io/_output.cjs +6 -0
  117. package/dist/voice/room_io/_output.cjs.map +1 -1
  118. package/dist/voice/room_io/_output.d.cts +1 -0
  119. package/dist/voice/room_io/_output.d.ts +1 -0
  120. package/dist/voice/room_io/_output.d.ts.map +1 -1
  121. package/dist/voice/room_io/_output.js +6 -0
  122. package/dist/voice/room_io/_output.js.map +1 -1
  123. package/dist/voice/room_io/room_io.cjs.map +1 -1
  124. package/dist/voice/room_io/room_io.d.cts +2 -2
  125. package/dist/voice/room_io/room_io.d.ts +2 -2
  126. package/dist/voice/room_io/room_io.d.ts.map +1 -1
  127. package/dist/voice/room_io/room_io.js.map +1 -1
  128. package/dist/voice/speech_handle.cjs +2 -0
  129. package/dist/voice/speech_handle.cjs.map +1 -1
  130. package/dist/voice/speech_handle.d.cts +3 -0
  131. package/dist/voice/speech_handle.d.ts +3 -0
  132. package/dist/voice/speech_handle.d.ts.map +1 -1
  133. package/dist/voice/speech_handle.js +2 -0
  134. package/dist/voice/speech_handle.js.map +1 -1
  135. package/package.json +2 -2
  136. package/src/inference/api_protos.ts +83 -0
  137. package/src/inference/llm.ts +20 -15
  138. package/src/inference/stt.ts +48 -29
  139. package/src/inference/tts.ts +36 -16
  140. package/src/stt/stream_adapter.ts +12 -1
  141. package/src/stt/stt.ts +21 -0
  142. package/src/telemetry/traces.ts +6 -2
  143. package/src/utils.ts +21 -0
  144. package/src/voice/agent.ts +11 -2
  145. package/src/voice/agent_activity.ts +108 -41
  146. package/src/voice/agent_session.ts +6 -5
  147. package/src/voice/audio_recognition.ts +2 -0
  148. package/src/voice/avatar/datastream_io.ts +8 -0
  149. package/src/voice/generation.ts +24 -12
  150. package/src/voice/io.ts +27 -5
  151. package/src/voice/recorder_io/recorder_io.ts +123 -31
  152. package/src/voice/room_io/_input.ts +32 -4
  153. package/src/voice/room_io/_output.ts +8 -0
  154. package/src/voice/room_io/room_io.ts +3 -1
  155. package/src/voice/speech_handle.ts +4 -0
@@ -16,22 +16,30 @@ import {
16
16
  } from '../stt/index.js';
17
17
  import { type APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS } from '../types.js';
18
18
  import { type AudioBuffer, Event, Task, cancelAndWait, shortuuid, waitForAbort } from '../utils.js';
19
+ import type { TimedString } from '../voice/io.js';
20
+ import {
21
+ type SttServerEvent,
22
+ type SttTranscriptEvent,
23
+ sttServerEventSchema,
24
+ } from './api_protos.js';
19
25
  import { type AnyString, connectWs, createAccessToken } from './utils.js';
20
26
 
21
27
  export type DeepgramModels =
22
- | 'deepgram'
28
+ | 'deepgram/flux-general'
23
29
  | 'deepgram/nova-3'
24
- | 'deepgram/nova-3-general'
25
30
  | 'deepgram/nova-3-medical'
26
- | 'deepgram/nova-2-conversationalai'
27
31
  | 'deepgram/nova-2'
28
- | 'deepgram/nova-2-general'
29
32
  | 'deepgram/nova-2-medical'
33
+ | 'deepgram/nova-2-conversationalai'
30
34
  | 'deepgram/nova-2-phonecall';
31
35
 
32
- export type CartesiaModels = 'cartesia' | 'cartesia/ink-whisper';
36
+ export type CartesiaModels = 'cartesia/ink-whisper';
37
+
38
+ export type AssemblyaiModels =
39
+ | 'assemblyai/universal-streaming'
40
+ | 'assemblyai/universal-streaming-multilingual';
33
41
 
34
- export type AssemblyaiModels = 'assemblyai' | 'assemblyai/universal-streaming';
42
+ export type ElevenlabsSTTModels = 'elevenlabs/scribe_v2_realtime';
35
43
 
36
44
  export interface CartesiaOptions {
37
45
  min_volume?: number; // default: not specified
@@ -71,7 +79,7 @@ export type STTLanguages =
71
79
  | 'hi'
72
80
  | AnyString;
73
81
 
74
- type _STTModels = DeepgramModels | CartesiaModels | AssemblyaiModels;
82
+ type _STTModels = DeepgramModels | CartesiaModels | AssemblyaiModels | ElevenlabsSTTModels;
75
83
 
76
84
  export type STTModels = _STTModels | 'auto' | AnyString;
77
85
 
@@ -122,7 +130,7 @@ export class STT<TModel extends STTModels> extends BaseSTT {
122
130
  apiSecret?: string;
123
131
  modelOptions?: STTOptions<TModel>;
124
132
  }) {
125
- super({ streaming: true, interimResults: true });
133
+ super({ streaming: true, interimResults: true, alignedTranscript: 'word' });
126
134
 
127
135
  const {
128
136
  model,
@@ -271,7 +279,6 @@ export class SpeechStream<TModel extends STTModels> extends BaseSpeechStream {
271
279
  let closing = false;
272
280
  let finalReceived = false;
273
281
 
274
- type SttServerEvent = Record<string, any>;
275
282
  const eventChannel = createStreamChannel<SttServerEvent>();
276
283
 
277
284
  const resourceCleanup = () => {
@@ -380,10 +387,19 @@ export class SpeechStream<TModel extends STTModels> extends BaseSpeechStream {
380
387
  if (signal.aborted) return;
381
388
  if (result.done) return;
382
389
 
383
- const json = result.value;
384
- const type = json.type as string | undefined;
390
+ // Parse and validate with Zod schema
391
+ const parseResult = await sttServerEventSchema.safeParseAsync(result.value);
392
+ if (!parseResult.success) {
393
+ this.#logger.warn(
394
+ { error: parseResult.error, rawData: result.value },
395
+ 'Failed to parse STT server event',
396
+ );
397
+ continue;
398
+ }
399
+
400
+ const event: SttServerEvent = parseResult.data;
385
401
 
386
- switch (type) {
402
+ switch (event.type) {
387
403
  case 'session.created':
388
404
  case 'session.finalized':
389
405
  break;
@@ -392,21 +408,15 @@ export class SpeechStream<TModel extends STTModels> extends BaseSpeechStream {
392
408
  resourceCleanup();
393
409
  break;
394
410
  case 'interim_transcript':
395
- this.processTranscript(json, false);
411
+ this.processTranscript(event, false);
396
412
  break;
397
413
  case 'final_transcript':
398
- this.processTranscript(json, true);
414
+ this.processTranscript(event, true);
399
415
  break;
400
416
  case 'error':
401
- this.#logger.error({ error: json }, 'Received error from LiveKit STT');
417
+ this.#logger.error({ error: event }, 'Received error from LiveKit STT');
402
418
  resourceCleanup();
403
- throw new APIError(`LiveKit STT returned error: ${JSON.stringify(json)}`);
404
- default:
405
- this.#logger.warn(
406
- { message: json },
407
- 'Received unexpected message from LiveKit STT',
408
- );
409
- break;
419
+ throw new APIError(`LiveKit STT returned error: ${JSON.stringify(event)}`);
410
420
  }
411
421
  }
412
422
  } finally {
@@ -457,13 +467,13 @@ export class SpeechStream<TModel extends STTModels> extends BaseSpeechStream {
457
467
  }
458
468
  }
459
469
 
460
- private processTranscript(data: Record<string, any>, isFinal: boolean) {
470
+ private processTranscript(data: SttTranscriptEvent, isFinal: boolean) {
461
471
  // Check if queue is closed to avoid race condition during disconnect
462
472
  if (this.queue.closed) return;
463
473
 
464
- const requestId = data.request_id ?? this.requestId;
465
- const text = data.transcript ?? '';
466
- const language = data.language ?? this.opts.language ?? 'en';
474
+ const requestId = data.session_id || this.requestId;
475
+ const text = data.transcript;
476
+ const language = data.language || this.opts.language || 'en';
467
477
 
468
478
  if (!text && !isFinal) return;
469
479
 
@@ -476,10 +486,19 @@ export class SpeechStream<TModel extends STTModels> extends BaseSpeechStream {
476
486
 
477
487
  const speechData: SpeechData = {
478
488
  language,
479
- startTime: data.start ?? 0,
480
- endTime: data.duration ?? 0,
481
- confidence: data.confidence ?? 1.0,
489
+ startTime: this.startTimeOffset + data.start,
490
+ endTime: this.startTimeOffset + data.start + data.duration,
491
+ confidence: data.confidence,
482
492
  text,
493
+ words: data.words.map(
494
+ (word): TimedString => ({
495
+ text: word.word,
496
+ startTime: word.start + this.startTimeOffset,
497
+ endTime: word.end + this.startTimeOffset,
498
+ startTimeOffset: this.startTimeOffset,
499
+ confidence: word.confidence,
500
+ }),
501
+ ),
483
502
  };
484
503
 
485
504
  if (isFinal) {
@@ -23,22 +23,27 @@ import {
23
23
  import { type AnyString, connectWs, createAccessToken } from './utils.js';
24
24
 
25
25
  export type CartesiaModels =
26
- | 'cartesia'
27
- | 'cartesia/sonic'
26
+ | 'cartesia/sonic-3'
28
27
  | 'cartesia/sonic-2'
29
- | 'cartesia/sonic-turbo';
28
+ | 'cartesia/sonic-turbo'
29
+ | 'cartesia/sonic';
30
+
31
+ export type DeepgramTTSModels = 'deepgram/aura' | 'deepgram/aura-2';
30
32
 
31
33
  export type ElevenlabsModels =
32
- | 'elevenlabs'
33
34
  | 'elevenlabs/eleven_flash_v2'
34
35
  | 'elevenlabs/eleven_flash_v2_5'
35
36
  | 'elevenlabs/eleven_turbo_v2'
36
37
  | 'elevenlabs/eleven_turbo_v2_5'
37
38
  | 'elevenlabs/eleven_multilingual_v2';
38
39
 
39
- export type RimeModels = 'rime' | 'rime/mist' | 'rime/mistv2' | 'rime/arcana';
40
+ export type InworldModels =
41
+ | 'inworld/inworld-tts-1.5-max'
42
+ | 'inworld/inworld-tts-1.5-mini'
43
+ | 'inworld/inworld-tts-1-max'
44
+ | 'inworld/inworld-tts-1';
40
45
 
41
- export type InworldModels = 'inworld' | 'inworld/inworld-tts-1';
46
+ export type RimeModels = 'rime/arcana' | 'rime/mistv2';
42
47
 
43
48
  export interface CartesiaOptions {
44
49
  duration?: number; // max duration of audio in seconds
@@ -50,25 +55,40 @@ export interface ElevenlabsOptions {
50
55
  apply_text_normalization?: 'auto' | 'off' | 'on'; // default: "auto"
51
56
  }
52
57
 
58
+ export interface DeepgramTTSOptions {}
59
+
53
60
  export interface RimeOptions {}
54
61
 
55
62
  export interface InworldOptions {}
56
63
 
57
- type _TTSModels = CartesiaModels | ElevenlabsModels | RimeModels | InworldModels;
58
-
59
- export type TTSModels = CartesiaModels | ElevenlabsModels | RimeModels | InworldModels | AnyString;
64
+ type _TTSModels =
65
+ | CartesiaModels
66
+ | DeepgramTTSModels
67
+ | ElevenlabsModels
68
+ | RimeModels
69
+ | InworldModels;
70
+
71
+ export type TTSModels =
72
+ | CartesiaModels
73
+ | DeepgramTTSModels
74
+ | ElevenlabsModels
75
+ | RimeModels
76
+ | InworldModels
77
+ | AnyString;
60
78
 
61
79
  export type ModelWithVoice = `${_TTSModels}:${string}` | TTSModels;
62
80
 
63
81
  export type TTSOptions<TModel extends TTSModels> = TModel extends CartesiaModels
64
82
  ? CartesiaOptions
65
- : TModel extends ElevenlabsModels
66
- ? ElevenlabsOptions
67
- : TModel extends RimeOptions
68
- ? RimeOptions
69
- : TModel extends InworldOptions
70
- ? InworldOptions
71
- : Record<string, unknown>;
83
+ : TModel extends DeepgramTTSModels
84
+ ? DeepgramTTSOptions
85
+ : TModel extends ElevenlabsModels
86
+ ? ElevenlabsOptions
87
+ : TModel extends RimeModels
88
+ ? RimeOptions
89
+ : TModel extends InworldModels
90
+ ? InworldOptions
91
+ : Record<string, unknown>;
72
92
 
73
93
  type TTSEncoding = 'pcm_s16le';
74
94
 
@@ -4,6 +4,7 @@
4
4
  import type { AudioFrame } from '@livekit/rtc-node';
5
5
  import { log } from '../log.js';
6
6
  import type { APIConnectOptions } from '../types.js';
7
+ import { isStreamClosedError } from '../utils.js';
7
8
  import type { VAD, VADStream } from '../vad.js';
8
9
  import { VADEventType } from '../vad.js';
9
10
  import type { SpeechEvent } from './stt.js';
@@ -68,7 +69,17 @@ export class StreamAdapterWrapper extends SpeechStream {
68
69
  this.#vadStream.pushFrame(input);
69
70
  }
70
71
  }
71
- this.#vadStream.endInput();
72
+
73
+ // Guard against calling endInput() on already-closed stream
74
+ // This happens during handover when close() is called while forwardInput is running
75
+ try {
76
+ this.#vadStream.endInput();
77
+ } catch (e) {
78
+ if (isStreamClosedError(e)) {
79
+ return;
80
+ }
81
+ throw e;
82
+ }
72
83
  };
73
84
 
74
85
  const recognize = async () => {
package/src/stt/stt.ts CHANGED
@@ -13,6 +13,7 @@ import { DeferredReadableStream } from '../stream/deferred_stream.js';
13
13
  import { type APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS, intervalForRetry } from '../types.js';
14
14
  import type { AudioBuffer } from '../utils.js';
15
15
  import { AsyncIterableQueue, delay, startSoon, toError } from '../utils.js';
16
+ import type { TimedString } from '../voice/index.js';
16
17
 
17
18
  /** Indicates start/middle/end of speech */
18
19
  export enum SpeechEventType {
@@ -53,6 +54,7 @@ export interface SpeechData {
53
54
  startTime: number;
54
55
  endTime: number;
55
56
  confidence: number;
57
+ words?: TimedString[];
56
58
  }
57
59
 
58
60
  export interface RecognitionUsage {
@@ -76,6 +78,13 @@ export interface SpeechEvent {
76
78
  export interface STTCapabilities {
77
79
  streaming: boolean;
78
80
  interimResults: boolean;
81
+ /**
82
+ * Whether this STT supports aligned transcripts with word/chunk timestamps.
83
+ * - 'word': Provider returns word-level timestamps
84
+ * - 'chunk': Provider returns chunk-level timestamps (e.g., sentence/phrase boundaries)
85
+ * - false: Provider does not support aligned transcripts
86
+ */
87
+ alignedTranscript?: 'word' | 'chunk' | false;
79
88
  }
80
89
 
81
90
  export interface STTError {
@@ -176,6 +185,7 @@ export abstract class SpeechStream implements AsyncIterableIterator<SpeechEvent>
176
185
  private deferredInputStream: DeferredReadableStream<AudioFrame>;
177
186
  private logger = log();
178
187
  private _connOptions: APIConnectOptions;
188
+ private _startTimeOffset: number = 0;
179
189
 
180
190
  protected abortController = new AbortController();
181
191
 
@@ -300,6 +310,17 @@ export abstract class SpeechStream implements AsyncIterableIterator<SpeechEvent>
300
310
  return this.abortController.signal;
301
311
  }
302
312
 
313
+ get startTimeOffset(): number {
314
+ return this._startTimeOffset;
315
+ }
316
+
317
+ set startTimeOffset(value: number) {
318
+ if (value < 0) {
319
+ throw new Error('startTimeOffset must be non-negative');
320
+ }
321
+ this._startTimeOffset = value;
322
+ }
323
+
303
324
  updateInputStream(audioStream: ReadableStream<AudioFrame>) {
304
325
  this.deferredInputStream.setSource(audioStream);
305
326
  }
@@ -37,6 +37,8 @@ export interface StartSpanOptions {
37
37
  attributes?: Attributes;
38
38
  /** Whether to end the span when the function exits (default: true) */
39
39
  endOnExit?: boolean;
40
+ /** Optional start time for the span in milliseconds (Date.now() format) */
41
+ startTime?: number;
40
42
  }
41
43
 
42
44
  /**
@@ -79,10 +81,12 @@ class DynamicTracer {
79
81
  */
80
82
  startSpan(options: StartSpanOptions): Span {
81
83
  const ctx = options.context || otelContext.active();
84
+
82
85
  const span = this.tracer.startSpan(
83
86
  options.name,
84
87
  {
85
88
  attributes: options.attributes,
89
+ startTime: options.startTime,
86
90
  },
87
91
  ctx,
88
92
  );
@@ -101,7 +105,7 @@ class DynamicTracer {
101
105
  async startActiveSpan<T>(fn: (span: Span) => Promise<T>, options: StartSpanOptions): Promise<T> {
102
106
  const ctx = options.context || otelContext.active();
103
107
  const endOnExit = options.endOnExit === undefined ? true : options.endOnExit; // default true
104
- const opts: SpanOptions = { attributes: options.attributes };
108
+ const opts: SpanOptions = { attributes: options.attributes, startTime: options.startTime };
105
109
 
106
110
  // Directly return the tracer's startActiveSpan result - it handles async correctly
107
111
  return await this.tracer.startActiveSpan(options.name, opts, ctx, async (span) => {
@@ -125,7 +129,7 @@ class DynamicTracer {
125
129
  startActiveSpanSync<T>(fn: (span: Span) => T, options: StartSpanOptions): T {
126
130
  const ctx = options.context || otelContext.active();
127
131
  const endOnExit = options.endOnExit === undefined ? true : options.endOnExit; // default true
128
- const opts: SpanOptions = { attributes: options.attributes };
132
+ const opts: SpanOptions = { attributes: options.attributes, startTime: options.startTime };
129
133
 
130
134
  return this.tracer.startActiveSpan(options.name, opts, ctx, (span) => {
131
135
  try {
package/src/utils.ts CHANGED
@@ -125,6 +125,7 @@ export class Future<T = void> {
125
125
  #resolvePromise!: (value: T) => void;
126
126
  #rejectPromise!: (error: Error) => void;
127
127
  #done: boolean = false;
128
+ #rejected: boolean = false;
128
129
 
129
130
  constructor() {
130
131
  this.#await = new Promise<T>((resolve, reject) => {
@@ -141,6 +142,11 @@ export class Future<T = void> {
141
142
  return this.#done;
142
143
  }
143
144
 
145
+ /** Whether the future was rejected (cancelled) */
146
+ get rejected() {
147
+ return this.#rejected;
148
+ }
149
+
144
150
  resolve(value: T) {
145
151
  this.#done = true;
146
152
  this.#resolvePromise(value);
@@ -148,6 +154,7 @@ export class Future<T = void> {
148
154
 
149
155
  reject(error: Error) {
150
156
  this.#done = true;
157
+ this.#rejected = true;
151
158
  this.#rejectPromise(error);
152
159
  }
153
160
  }
@@ -668,6 +675,20 @@ export class InvalidErrorType extends Error {
668
675
  }
669
676
  }
670
677
 
678
+ /**
679
+ * Check if an error is a stream closed error that can be safely ignored during cleanup.
680
+ * This happens during handover/cleanup when close() is called while operations are still running.
681
+ *
682
+ * @param error - The error to check.
683
+ * @returns True if the error is a stream closed error.
684
+ */
685
+ export function isStreamClosedError(error: unknown): boolean {
686
+ return (
687
+ error instanceof Error &&
688
+ (error.message === 'Stream is closed' || error.message === 'Input is closed')
689
+ );
690
+ }
691
+
671
692
  /**
672
693
  * In JS an error can be any arbitrary value.
673
694
  * This function converts an unknown error to an Error and stores the original value in the error object.
@@ -271,6 +271,15 @@ export class Agent<UserData = any> {
271
271
 
272
272
  const connOptions = activity.agentSession.connOptions.sttConnOptions;
273
273
  const stream = wrapped_stt.stream({ connOptions });
274
+
275
+ // Set startTimeOffset to provide linear timestamps across reconnections
276
+ const audioInputStartedAt =
277
+ activity.agentSession._recorderIO?.recordingStartedAt ?? // Use recording start time if available
278
+ activity.agentSession._startedAt ?? // Fallback to session start time
279
+ Date.now(); // Fallback to current time
280
+
281
+ stream.startTimeOffset = (Date.now() - audioInputStartedAt) / 1000;
282
+
274
283
  stream.updateInputStream(audio);
275
284
 
276
285
  let cleaned = false;
@@ -316,16 +325,16 @@ export class Agent<UserData = any> {
316
325
  );
317
326
  }
318
327
 
319
- // TODO(brian): make parallelToolCalls configurable
320
328
  const { toolChoice } = modelSettings;
321
329
  const connOptions = activity.agentSession.connOptions.llmConnOptions;
322
330
 
331
+ // parallelToolCalls is not passed here - it will use the value from LLM's modelOptions
332
+ // This allows users to configure it via: new inference.LLM({ modelOptions: { parallel_tool_calls: false } })
323
333
  const stream = activity.llm.chat({
324
334
  chatCtx,
325
335
  toolCtx,
326
336
  toolChoice,
327
337
  connOptions,
328
- parallelToolCalls: true,
329
338
  });
330
339
 
331
340
  let cleaned = false;