@livekit/agents 1.0.36 → 1.0.38
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.cjs.map +1 -1
- package/dist/inference/api_protos.cjs +68 -0
- package/dist/inference/api_protos.cjs.map +1 -1
- package/dist/inference/api_protos.d.cts +345 -4
- package/dist/inference/api_protos.d.ts +345 -4
- package/dist/inference/api_protos.d.ts.map +1 -1
- package/dist/inference/api_protos.js +60 -0
- package/dist/inference/api_protos.js.map +1 -1
- package/dist/inference/stt.cjs +32 -21
- package/dist/inference/stt.cjs.map +1 -1
- package/dist/inference/stt.d.ts.map +1 -1
- package/dist/inference/stt.js +34 -21
- package/dist/inference/stt.js.map +1 -1
- package/dist/ipc/inference_proc_executor.cjs.map +1 -1
- package/dist/ipc/job_proc_executor.cjs.map +1 -1
- package/dist/stt/stt.cjs +10 -0
- package/dist/stt/stt.cjs.map +1 -1
- package/dist/stt/stt.d.cts +12 -0
- package/dist/stt/stt.d.ts +12 -0
- package/dist/stt/stt.d.ts.map +1 -1
- package/dist/stt/stt.js +10 -0
- package/dist/stt/stt.js.map +1 -1
- package/dist/telemetry/traces.cjs +4 -3
- package/dist/telemetry/traces.cjs.map +1 -1
- package/dist/telemetry/traces.d.cts +2 -0
- package/dist/telemetry/traces.d.ts +2 -0
- package/dist/telemetry/traces.d.ts.map +1 -1
- package/dist/telemetry/traces.js +4 -3
- package/dist/telemetry/traces.js.map +1 -1
- package/dist/utils.cjs +6 -0
- package/dist/utils.cjs.map +1 -1
- package/dist/utils.d.cts +2 -0
- package/dist/utils.d.ts +2 -0
- package/dist/utils.d.ts.map +1 -1
- package/dist/utils.js +6 -0
- package/dist/utils.js.map +1 -1
- package/dist/voice/agent.cjs +5 -0
- package/dist/voice/agent.cjs.map +1 -1
- package/dist/voice/agent.d.ts.map +1 -1
- package/dist/voice/agent.js +5 -0
- package/dist/voice/agent.js.map +1 -1
- package/dist/voice/agent_activity.cjs +49 -23
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.cts +1 -1
- package/dist/voice/agent_activity.d.ts +1 -1
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +50 -24
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_session.cjs +7 -5
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +5 -2
- package/dist/voice/agent_session.d.ts +5 -2
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +7 -5
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/audio_recognition.cjs +3 -1
- package/dist/voice/audio_recognition.cjs.map +1 -1
- package/dist/voice/audio_recognition.d.ts.map +1 -1
- package/dist/voice/audio_recognition.js +3 -1
- package/dist/voice/audio_recognition.js.map +1 -1
- package/dist/voice/avatar/datastream_io.cjs +6 -0
- package/dist/voice/avatar/datastream_io.cjs.map +1 -1
- package/dist/voice/avatar/datastream_io.d.cts +1 -0
- package/dist/voice/avatar/datastream_io.d.ts +1 -0
- package/dist/voice/avatar/datastream_io.d.ts.map +1 -1
- package/dist/voice/avatar/datastream_io.js +6 -0
- package/dist/voice/avatar/datastream_io.js.map +1 -1
- package/dist/voice/background_audio.cjs.map +1 -1
- package/dist/voice/generation.cjs +14 -5
- package/dist/voice/generation.cjs.map +1 -1
- package/dist/voice/generation.d.cts +3 -2
- package/dist/voice/generation.d.ts +3 -2
- package/dist/voice/generation.d.ts.map +1 -1
- package/dist/voice/generation.js +14 -5
- package/dist/voice/generation.js.map +1 -1
- package/dist/voice/io.cjs +12 -0
- package/dist/voice/io.cjs.map +1 -1
- package/dist/voice/io.d.cts +19 -1
- package/dist/voice/io.d.ts +19 -1
- package/dist/voice/io.d.ts.map +1 -1
- package/dist/voice/io.js +12 -0
- package/dist/voice/io.js.map +1 -1
- package/dist/voice/recorder_io/recorder_io.cjs +91 -28
- package/dist/voice/recorder_io/recorder_io.cjs.map +1 -1
- package/dist/voice/recorder_io/recorder_io.d.cts +7 -1
- package/dist/voice/recorder_io/recorder_io.d.ts +7 -1
- package/dist/voice/recorder_io/recorder_io.d.ts.map +1 -1
- package/dist/voice/recorder_io/recorder_io.js +91 -28
- package/dist/voice/recorder_io/recorder_io.js.map +1 -1
- package/dist/voice/room_io/_input.cjs +40 -11
- package/dist/voice/room_io/_input.cjs.map +1 -1
- package/dist/voice/room_io/_input.d.cts +4 -1
- package/dist/voice/room_io/_input.d.ts +4 -1
- package/dist/voice/room_io/_input.d.ts.map +1 -1
- package/dist/voice/room_io/_input.js +31 -2
- package/dist/voice/room_io/_input.js.map +1 -1
- package/dist/voice/room_io/_output.cjs +6 -0
- package/dist/voice/room_io/_output.cjs.map +1 -1
- package/dist/voice/room_io/_output.d.cts +1 -0
- package/dist/voice/room_io/_output.d.ts +1 -0
- package/dist/voice/room_io/_output.d.ts.map +1 -1
- package/dist/voice/room_io/_output.js +6 -0
- package/dist/voice/room_io/_output.js.map +1 -1
- package/dist/voice/room_io/room_io.cjs.map +1 -1
- package/dist/voice/room_io/room_io.d.cts +2 -2
- package/dist/voice/room_io/room_io.d.ts +2 -2
- package/dist/voice/room_io/room_io.d.ts.map +1 -1
- package/dist/voice/room_io/room_io.js.map +1 -1
- package/dist/voice/speech_handle.cjs +2 -0
- package/dist/voice/speech_handle.cjs.map +1 -1
- package/dist/voice/speech_handle.d.cts +3 -0
- package/dist/voice/speech_handle.d.ts +3 -0
- package/dist/voice/speech_handle.d.ts.map +1 -1
- package/dist/voice/speech_handle.js +2 -0
- package/dist/voice/speech_handle.js.map +1 -1
- package/dist/voice/testing/index.cjs +2 -0
- package/dist/voice/testing/index.cjs.map +1 -1
- package/dist/voice/testing/index.d.cts +1 -1
- package/dist/voice/testing/index.d.ts +1 -1
- package/dist/voice/testing/index.d.ts.map +1 -1
- package/dist/voice/testing/index.js +2 -0
- package/dist/voice/testing/index.js.map +1 -1
- package/dist/voice/testing/run_result.cjs +294 -5
- package/dist/voice/testing/run_result.cjs.map +1 -1
- package/dist/voice/testing/run_result.d.cts +149 -1
- package/dist/voice/testing/run_result.d.ts +149 -1
- package/dist/voice/testing/run_result.d.ts.map +1 -1
- package/dist/voice/testing/run_result.js +293 -5
- package/dist/voice/testing/run_result.js.map +1 -1
- package/package.json +1 -1
- package/src/inference/api_protos.ts +83 -0
- package/src/inference/stt.ts +39 -22
- package/src/stt/stt.ts +21 -0
- package/src/telemetry/traces.ts +6 -2
- package/src/utils.ts +7 -0
- package/src/voice/agent.ts +9 -0
- package/src/voice/agent_activity.ts +72 -26
- package/src/voice/agent_session.ts +6 -5
- package/src/voice/audio_recognition.ts +2 -0
- package/src/voice/avatar/datastream_io.ts +8 -0
- package/src/voice/generation.ts +24 -12
- package/src/voice/io.ts +27 -5
- package/src/voice/recorder_io/recorder_io.ts +123 -31
- package/src/voice/room_io/_input.ts +32 -4
- package/src/voice/room_io/_output.ts +8 -0
- package/src/voice/room_io/room_io.ts +3 -1
- package/src/voice/speech_handle.ts +4 -0
- package/src/voice/testing/index.ts +1 -0
- package/src/voice/testing/run_result.ts +373 -12
package/src/inference/stt.ts
CHANGED

@@ -16,6 +16,12 @@ import {
 } from '../stt/index.js';
 import { type APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS } from '../types.js';
 import { type AudioBuffer, Event, Task, cancelAndWait, shortuuid, waitForAbort } from '../utils.js';
+import type { TimedString } from '../voice/io.js';
+import {
+  type SttServerEvent,
+  type SttTranscriptEvent,
+  sttServerEventSchema,
+} from './api_protos.js';
 import { type AnyString, connectWs, createAccessToken } from './utils.js';

 export type DeepgramModels =
@@ -122,7 +128,7 @@ export class STT<TModel extends STTModels> extends BaseSTT {
     apiSecret?: string;
     modelOptions?: STTOptions<TModel>;
   }) {
-    super({ streaming: true, interimResults: true });
+    super({ streaming: true, interimResults: true, alignedTranscript: 'word' });

     const {
       model,
@@ -271,7 +277,6 @@ export class SpeechStream<TModel extends STTModels> extends BaseSpeechStream {
     let closing = false;
     let finalReceived = false;

-    type SttServerEvent = Record<string, any>;
     const eventChannel = createStreamChannel<SttServerEvent>();

     const resourceCleanup = () => {
@@ -380,10 +385,19 @@ export class SpeechStream<TModel extends STTModels> extends BaseSpeechStream {
         if (signal.aborted) return;
         if (result.done) return;

-
-        const
+        // Parse and validate with Zod schema
+        const parseResult = await sttServerEventSchema.safeParseAsync(result.value);
+        if (!parseResult.success) {
+          this.#logger.warn(
+            { error: parseResult.error, rawData: result.value },
+            'Failed to parse STT server event',
+          );
+          continue;
+        }
+
+        const event: SttServerEvent = parseResult.data;

-        switch (type) {
+        switch (event.type) {
           case 'session.created':
           case 'session.finalized':
             break;
@@ -392,21 +406,15 @@ export class SpeechStream<TModel extends STTModels> extends BaseSpeechStream {
             resourceCleanup();
             break;
           case 'interim_transcript':
-            this.processTranscript(
+            this.processTranscript(event, false);
             break;
           case 'final_transcript':
-            this.processTranscript(
+            this.processTranscript(event, true);
             break;
           case 'error':
-            this.#logger.error({ error:
+            this.#logger.error({ error: event }, 'Received error from LiveKit STT');
             resourceCleanup();
-            throw new APIError(`LiveKit STT returned error: ${JSON.stringify(
-          default:
-            this.#logger.warn(
-              { message: json },
-              'Received unexpected message from LiveKit STT',
-            );
-            break;
+            throw new APIError(`LiveKit STT returned error: ${JSON.stringify(event)}`);
         }
       }
     } finally {
@@ -457,13 +465,13 @@ export class SpeechStream<TModel extends STTModels> extends BaseSpeechStream {
     }
   }

-  private processTranscript(data:
+  private processTranscript(data: SttTranscriptEvent, isFinal: boolean) {
     // Check if queue is closed to avoid race condition during disconnect
     if (this.queue.closed) return;

-    const requestId = data.
-    const text = data.transcript
-    const language = data.language
+    const requestId = data.session_id || this.requestId;
+    const text = data.transcript;
+    const language = data.language || this.opts.language || 'en';

     if (!text && !isFinal) return;

@@ -476,10 +484,19 @@ export class SpeechStream<TModel extends STTModels> extends BaseSpeechStream {

     const speechData: SpeechData = {
       language,
-      startTime: data.start
-      endTime: data.
-      confidence: data.confidence
+      startTime: this.startTimeOffset + data.start,
+      endTime: this.startTimeOffset + data.start + data.duration,
+      confidence: data.confidence,
       text,
+      words: data.words.map(
+        (word): TimedString => ({
+          text: word.word,
+          startTime: word.start + this.startTimeOffset,
+          endTime: word.end + this.startTimeOffset,
+          startTimeOffset: this.startTimeOffset,
+          confidence: word.confidence,
+        }),
+      ),
     };

     if (isFinal) {
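
Note: the rewrite above replaces the loosely typed Record<string, any> events with schemas from api_protos.ts and validates every server message before dispatching on its type. A minimal sketch of the same safeParseAsync pattern, assuming a simplified transcript schema (the real sttServerEventSchema is a larger union defined in src/inference/api_protos.ts):

import { z } from 'zod';

// Hypothetical, trimmed-down schema for illustration only.
const sttTranscriptEventSchema = z.object({
  type: z.enum(['interim_transcript', 'final_transcript']),
  session_id: z.string().optional(),
  transcript: z.string(),
  language: z.string().optional(),
  start: z.number(),
  duration: z.number(),
  confidence: z.number(),
  words: z.array(
    z.object({ word: z.string(), start: z.number(), end: z.number(), confidence: z.number() }),
  ),
});

// Validate before switching on `type`, skipping malformed events instead of throwing.
async function handleServerMessage(raw: unknown): Promise<void> {
  const parsed = await sttTranscriptEventSchema.safeParseAsync(raw);
  if (!parsed.success) {
    console.warn('failed to parse STT server event', parsed.error.issues);
    return;
  }
  console.log(`${parsed.data.type}: "${parsed.data.transcript}"`);
}
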
package/src/stt/stt.ts
CHANGED

@@ -13,6 +13,7 @@ import { DeferredReadableStream } from '../stream/deferred_stream.js';
 import { type APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS, intervalForRetry } from '../types.js';
 import type { AudioBuffer } from '../utils.js';
 import { AsyncIterableQueue, delay, startSoon, toError } from '../utils.js';
+import type { TimedString } from '../voice/index.js';

 /** Indicates start/middle/end of speech */
 export enum SpeechEventType {
@@ -53,6 +54,7 @@ export interface SpeechData {
   startTime: number;
   endTime: number;
   confidence: number;
+  words?: TimedString[];
 }

 export interface RecognitionUsage {
@@ -76,6 +78,13 @@ export interface SpeechEvent {
 export interface STTCapabilities {
   streaming: boolean;
   interimResults: boolean;
+  /**
+   * Whether this STT supports aligned transcripts with word/chunk timestamps.
+   * - 'word': Provider returns word-level timestamps
+   * - 'chunk': Provider returns chunk-level timestamps (e.g., sentence/phrase boundaries)
+   * - false: Provider does not support aligned transcripts
+   */
+  alignedTranscript?: 'word' | 'chunk' | false;
 }

 export interface STTError {
@@ -176,6 +185,7 @@ export abstract class SpeechStream implements AsyncIterableIterator<SpeechEvent>
   private deferredInputStream: DeferredReadableStream<AudioFrame>;
   private logger = log();
   private _connOptions: APIConnectOptions;
+  private _startTimeOffset: number = 0;

   protected abortController = new AbortController();

@@ -300,6 +310,17 @@ export abstract class SpeechStream implements AsyncIterableIterator<SpeechEvent>
     return this.abortController.signal;
   }

+  get startTimeOffset(): number {
+    return this._startTimeOffset;
+  }
+
+  set startTimeOffset(value: number) {
+    if (value < 0) {
+      throw new Error('startTimeOffset must be non-negative');
+    }
+    this._startTimeOffset = value;
+  }
+
   updateInputStream(audioStream: ReadableStream<AudioFrame>) {
     this.deferredInputStream.setSource(audioStream);
   }
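
A minimal sketch of the arithmetic this setter enables, with illustrative values: providers report timestamps relative to their own (re)connected stream, and startTimeOffset shifts them onto the session's audio timeline.

// Suppose the STT stream was (re)created 12.5 s after audio input began.
const startTimeOffset = 12.5;

interface ProviderWord {
  word: string;
  start: number; // seconds, relative to the provider connection
  end: number;
}

// Shift provider-relative timestamps onto the session timeline.
function toSessionTimeline(words: ProviderWord[]) {
  return words.map((w) => ({
    text: w.word,
    startTime: w.start + startTimeOffset,
    endTime: w.end + startTimeOffset,
  }));
}

// A word the provider places at 0.2-0.6 s lands at 12.7-13.1 s on the session timeline.
console.log(toSessionTimeline([{ word: 'hello', start: 0.2, end: 0.6 }]));
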
package/src/telemetry/traces.ts
CHANGED

@@ -37,6 +37,8 @@ export interface StartSpanOptions {
   attributes?: Attributes;
   /** Whether to end the span when the function exits (default: true) */
   endOnExit?: boolean;
+  /** Optional start time for the span in milliseconds (Date.now() format) */
+  startTime?: number;
 }

 /**
@@ -79,10 +81,12 @@ class DynamicTracer {
    */
   startSpan(options: StartSpanOptions): Span {
     const ctx = options.context || otelContext.active();
+
     const span = this.tracer.startSpan(
       options.name,
       {
         attributes: options.attributes,
+        startTime: options.startTime,
       },
       ctx,
     );
@@ -101,7 +105,7 @@ class DynamicTracer {
   async startActiveSpan<T>(fn: (span: Span) => Promise<T>, options: StartSpanOptions): Promise<T> {
     const ctx = options.context || otelContext.active();
     const endOnExit = options.endOnExit === undefined ? true : options.endOnExit; // default true
-    const opts: SpanOptions = { attributes: options.attributes };
+    const opts: SpanOptions = { attributes: options.attributes, startTime: options.startTime };

     // Directly return the tracer's startActiveSpan result - it handles async correctly
     return await this.tracer.startActiveSpan(options.name, opts, ctx, async (span) => {
@@ -125,7 +129,7 @@ class DynamicTracer {
   startActiveSpanSync<T>(fn: (span: Span) => T, options: StartSpanOptions): T {
     const ctx = options.context || otelContext.active();
     const endOnExit = options.endOnExit === undefined ? true : options.endOnExit; // default true
-    const opts: SpanOptions = { attributes: options.attributes };
+    const opts: SpanOptions = { attributes: options.attributes, startTime: options.startTime };

     return this.tracer.startActiveSpan(options.name, opts, ctx, (span) => {
       try {
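
The new startTime option exists because spans are often opened only after the event they describe has begun, for example when the VAD detects speech with some lag. A minimal sketch against the public OpenTelemetry API, with a hypothetical lag value:

import { trace } from '@opentelemetry/api';

const exampleTracer = trace.getTracer('example');

// Hypothetical: speech is detected 350 ms after it actually began,
// so the span is backdated by that amount.
const speechDetectionLagMs = 350;
const span = exampleTracer.startSpan('user_speaking', {
  // SpanOptions.startTime accepts epoch milliseconds (Date.now() format).
  startTime: Date.now() - speechDetectionLagMs,
});

// ...later, when end-of-speech is detected:
span.end();
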
package/src/utils.ts
CHANGED

@@ -125,6 +125,7 @@ export class Future<T = void> {
   #resolvePromise!: (value: T) => void;
   #rejectPromise!: (error: Error) => void;
   #done: boolean = false;
+  #rejected: boolean = false;

   constructor() {
     this.#await = new Promise<T>((resolve, reject) => {
@@ -141,6 +142,11 @@ export class Future<T = void> {
     return this.#done;
   }

+  /** Whether the future was rejected (cancelled) */
+  get rejected() {
+    return this.#rejected;
+  }
+
   resolve(value: T) {
     this.#done = true;
     this.#resolvePromise(value);
@@ -148,6 +154,7 @@ export class Future<T = void> {

   reject(error: Error) {
     this.#done = true;
+    this.#rejected = true;
     this.#rejectPromise(error);
   }
 }
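
A short usage sketch of the new flag (the import path is assumed for illustration): done alone cannot distinguish a resolved future from a rejected (cancelled) one, which is the distinction the playback code in this release relies on.

import { Future } from './utils.js'; // assumed import path for illustration

const fut = new Future<number>();
fut.await.catch(() => {}); // swallow the rejection so this sketch has no unhandled promise

fut.reject(new Error('cancelled before first frame'));

console.log(fut.done);     // true  - the future has settled...
console.log(fut.rejected); // true  - ...but by rejection, so its value must not be trusted
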
package/src/voice/agent.ts
CHANGED

@@ -271,6 +271,15 @@ export class Agent<UserData = any> {

     const connOptions = activity.agentSession.connOptions.sttConnOptions;
     const stream = wrapped_stt.stream({ connOptions });
+
+    // Set startTimeOffset to provide linear timestamps across reconnections
+    const audioInputStartedAt =
+      activity.agentSession._recorderIO?.recordingStartedAt ?? // Use recording start time if available
+      activity.agentSession._startedAt ?? // Fallback to session start time
+      Date.now(); // Fallback to current time
+
+    stream.startTimeOffset = (Date.now() - audioInputStartedAt) / 1000;
+
     stream.updateInputStream(audio);

     let cleaned = false;
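
A worked example of the offset computation above, with illustrative epoch values:

// Audio input (or recording) began at t0; the STT stream is (re)created 42 s later.
const audioInputStartedAt = 1_700_000_000_000; // ms epoch
const streamCreatedAt = audioInputStartedAt + 42_000;

const startTimeOffset = (streamCreatedAt - audioInputStartedAt) / 1000;
console.log(startTimeOffset); // 42 - provider-relative timestamps get shifted by 42 s
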
package/src/voice/agent_activity.ts
CHANGED

@@ -4,7 +4,7 @@
 import { Mutex } from '@livekit/mutex';
 import type { AudioFrame } from '@livekit/rtc-node';
 import type { Span } from '@opentelemetry/api';
-import { ROOT_CONTEXT, trace } from '@opentelemetry/api';
+import { ROOT_CONTEXT, context as otelContext, trace } from '@opentelemetry/api';
 import { Heap } from 'heap-js';
 import { AsyncLocalStorage } from 'node:async_hooks';
 import { ReadableStream } from 'node:stream/web';
@@ -637,9 +637,12 @@ export class AgentActivity implements RecognitionHooks {
   }

   // recognition hooks
-
-
-
+  onStartOfSpeech(ev: VADEvent): void {
+    let speechStartTime = Date.now();
+    if (ev) {
+      speechStartTime = speechStartTime - ev.speechDuration;
+    }
+    this.agentSession._updateUserState('speaking', speechStartTime);
   }

   onEndOfSpeech(ev: VADEvent): void {
@@ -1168,6 +1171,8 @@ export class AgentActivity implements RecognitionHooks {
     replyAbortController: AbortController,
     audio?: ReadableStream<AudioFrame> | null,
   ): Promise<void> {
+    speechHandle._agentTurnContext = otelContext.active();
+
     speechHandleStorage.enterWith(speechHandle);

     const transcriptionOutput = this.agentSession.output.transcriptionEnabled
@@ -1212,13 +1217,18 @@
       tasks.push(textForwardTask);
     }

-    const onFirstFrame = () => {
-      this.agentSession._updateAgentState('speaking'
+    const onFirstFrame = (startedSpeakingAt?: number) => {
+      this.agentSession._updateAgentState('speaking', {
+        startTime: startedSpeakingAt,
+        otelContext: speechHandle._agentTurnContext,
+      });
     };

     if (!audioOutput) {
       if (textOut) {
-        textOut.firstTextFut.await
+        textOut.firstTextFut.await
+          .then(() => onFirstFrame())
+          .catch(() => this.logger.debug('firstTextFut cancelled before first frame'));
       }
     } else {
       let audioOut: _AudioOut | null = null;
@@ -1249,7 +1259,9 @@
         tasks.push(forwardTask);
         audioOut = _audioOut;
       }
-      audioOut.firstFrameFut.await
+      audioOut.firstFrameFut.await
+        .then((ts) => onFirstFrame(ts))
+        .catch(() => this.logger.debug('firstFrameFut cancelled before first frame'));
     }

     await speechHandle.waitIfNotInterrupted(tasks.map((task) => task.result));
@@ -1303,6 +1315,8 @@
       toolsMessages?: ChatItem[];
       span: Span;
     }): Promise<void> => {
+      speechHandle._agentTurnContext = otelContext.active();
+
       span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);
       if (instructions) {
         span.setAttribute(traceTypes.ATTR_INSTRUCTIONS, instructions);
@@ -1402,8 +1416,11 @@
       textOut = _textOut;
     }

-    const onFirstFrame = () => {
-      this.agentSession._updateAgentState('speaking'
+    const onFirstFrame = (startedSpeakingAt?: number) => {
+      this.agentSession._updateAgentState('speaking', {
+        startTime: startedSpeakingAt,
+        otelContext: speechHandle._agentTurnContext,
+      });
     };

     let audioOut: _AudioOut | null = null;
@@ -1416,12 +1433,16 @@
         );
         audioOut = _audioOut;
         tasks.push(forwardTask);
-        audioOut.firstFrameFut.await
+        audioOut.firstFrameFut.await
+          .then((ts) => onFirstFrame(ts))
+          .catch(() => this.logger.debug('firstFrameFut cancelled before first frame'));
       } else {
         throw Error('ttsStream is null when audioOutput is enabled');
       }
     } else {
-      textOut?.firstTextFut.await
+      textOut?.firstTextFut.await
+        .then(() => onFirstFrame())
+        .catch(() => this.logger.debug('firstTextFut cancelled before first frame'));
     }

     //TODO(AJS-272): before executing tools, make sure we generated all the text
@@ -1462,8 +1483,14 @@
         msg.createdAt = replyStartedAt;
       }
       this.agent._chatCtx.insert(toolsMessages);
-      //
-
+      // Only add FunctionCallOutput items to session history since FunctionCall items
+      // were already added by onToolExecutionStarted when the tool execution began
+      const toolCallOutputs = toolsMessages.filter(
+        (m): m is FunctionCallOutput => m.type === 'function_call_output',
+      );
+      if (toolCallOutputs.length > 0) {
+        this.agentSession._toolItemsAdded(toolCallOutputs);
+      }
     }

     if (speechHandle.interrupted) {
@@ -1487,10 +1514,10 @@

     if (audioOutput) {
       const playbackEv = await audioOutput.waitForPlayout();
-      if (audioOut?.firstFrameFut.done) {
+      if (audioOut?.firstFrameFut.done && !audioOut.firstFrameFut.rejected) {
         // playback EV is valid only if the first frame was already played
         this.logger.info(
-          { speech_id: speechHandle.id,
+          { speech_id: speechHandle.id, playbackPositionInS: playbackEv.playbackPosition },
           'playout interrupted',
         );
         if (playbackEv.synchronizedTranscript) {
@@ -1656,8 +1683,18 @@
       for (const msg of toolMessages) {
         msg.createdAt = replyStartedAt;
       }
+
       this.agent._chatCtx.insert(toolMessages);
-
+
+      // Only add FunctionCallOutput items to session history since FunctionCall items
+      // were already added by onToolExecutionStarted when the tool execution began
+      const toolCallOutputs = toolMessages.filter(
+        (m): m is FunctionCallOutput => m.type === 'function_call_output',
+      );
+
+      if (toolCallOutputs.length > 0) {
+        this.agentSession._toolItemsAdded(toolCallOutputs);
+      }
     }
   };

@@ -1725,6 +1762,8 @@
     replyAbortController: AbortController;
     span: Span;
   }): Promise<void> {
+    speechHandle._agentTurnContext = otelContext.active();
+
     span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);

     speechHandleStorage.enterWith(speechHandle);
@@ -1762,8 +1801,11 @@
       return;
     }

-    const onFirstFrame = () => {
-      this.agentSession._updateAgentState('speaking'
+    const onFirstFrame = (startedSpeakingAt?: number) => {
+      this.agentSession._updateAgentState('speaking', {
+        startTime: startedSpeakingAt,
+        otelContext: speechHandle._agentTurnContext,
+      });
     };

     const readMessages = async (
@@ -1851,10 +1893,14 @@
           );
           forwardTasks.push(forwardTask);
           audioOut = _audioOut;
-          audioOut.firstFrameFut.await
+          audioOut.firstFrameFut.await
+            .then((ts) => onFirstFrame(ts))
+            .catch(() => this.logger.debug('firstFrameFut cancelled before first frame'));
         }
       } else if (textOut) {
-        textOut.firstTextFut.await
+        textOut.firstTextFut.await
+          .then(() => onFirstFrame())
+          .catch(() => this.logger.debug('firstTextFut cancelled before first frame'));
       }
       outputs.push([msg.messageId, textOut, audioOut, msgModalities]);
     }
@@ -1955,11 +2001,11 @@
     if (audioOutput) {
       audioOutput.clearBuffer();
       const playbackEv = await audioOutput.waitForPlayout();
-      let
-      if (audioOut?.firstFrameFut.done) {
+      let playbackPositionInS = playbackEv.playbackPosition;
+      if (audioOut?.firstFrameFut.done && !audioOut.firstFrameFut.rejected) {
         // playback EV is valid only if the first frame was already played
         this.logger.info(
-          { speech_id: speechHandle.id,
+          { speech_id: speechHandle.id, playbackPositionInS },
          'playout interrupted',
        );
        if (playbackEv.synchronizedTranscript) {
@@ -1967,13 +2013,13 @@
        }
      } else {
        forwardedText = '';
-
+        playbackPositionInS = 0;
      }

      // truncate server-side message
      this.realtimeSession.truncate({
        messageId: msgId,
-        audioEndMs: Math.floor(
+        audioEndMs: Math.floor(playbackPositionInS * 1000),
        modalities: msgModalities,
        audioTranscript: forwardedText,
      });
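
The recurring pattern above (attaching .then/.catch to firstFrameFut, then later checking done together with the new rejected flag) is easiest to see in isolation. A self-contained sketch, using a MiniFuture stand-in for the package's Future (illustrative only):

class MiniFuture<T> {
  done = false;
  rejected = false;
  readonly await: Promise<T>;
  private res!: (v: T) => void;
  private rej!: (e: Error) => void;

  constructor() {
    this.await = new Promise<T>((res, rej) => {
      this.res = res;
      this.rej = rej;
    });
  }

  resolve(v: T) { this.done = true; this.res(v); }
  reject(e: Error) { this.done = true; this.rejected = true; this.rej(e); }
}

const firstFrameFut = new MiniFuture<number>();
firstFrameFut.await
  .then((ts) => console.log('agent started speaking at', ts))
  .catch(() => console.debug('firstFrameFut cancelled before first frame'));

// Simulate an interruption before any audio frame was played:
firstFrameFut.reject(new Error('cancelled'));

// A playback event is only meaningful if a first frame was actually played:
if (firstFrameFut.done && !firstFrameFut.rejected) {
  console.log('trust the reported playback position');
} else {
  console.log('no frame played; fall back to position 0');
}
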
package/src/voice/agent_session.ts
CHANGED

@@ -677,7 +677,7 @@ export class AgentSession<
   }

   /** @internal */
-  _updateAgentState(state: AgentState) {
+  _updateAgentState(state: AgentState, options?: { startTime?: number; otelContext?: Context }) {
     if (this._agentState === state) {
       return;
     }
@@ -690,7 +690,8 @@
     if (this.agentSpeakingSpan === undefined) {
       this.agentSpeakingSpan = tracer.startSpan({
         name: 'agent_speaking',
-        context: this.rootSpanContext,
+        context: options?.otelContext ?? this.rootSpanContext,
+        startTime: options?.startTime,
       });

       // TODO(brian): PR4 - Set participant attributes if roomIO.room.localParticipant is available
@@ -719,7 +720,7 @@
   }

   /** @internal */
-  _updateUserState(state: UserState,
+  _updateUserState(state: UserState, lastSpeakingTime?: number) {
     if (this.userState === state) {
       return;
     }
@@ -728,13 +729,13 @@
       this.userSpeakingSpan = tracer.startSpan({
         name: 'user_speaking',
         context: this.rootSpanContext,
+        startTime: lastSpeakingTime,
       });

       // TODO(brian): PR4 - Set participant attributes if roomIO.linkedParticipant is available
       // (Ref: Python agent_session.py line 1192-1195)
     } else if (this.userSpeakingSpan !== undefined) {
-
-      this.userSpeakingSpan.end();
+      this.userSpeakingSpan.end(lastSpeakingTime);
       this.userSpeakingSpan = undefined;
     }
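
Both ends of the user_speaking span are now backdated: the span starts at the computed speech start time and ends at lastSpeakingTime. A minimal sketch against the public OpenTelemetry API, with hypothetical detection lags:

import { trace } from '@opentelemetry/api';

const exampleTracer = trace.getTracer('example');

// Hypothetical lags between real speech boundaries and their detection.
const startDetectedLateByMs = 300;
const endDetectedLateByMs = 150;

const span = exampleTracer.startSpan('user_speaking', {
  startTime: Date.now() - startDetectedLateByMs,
});

// Span.end() also accepts a TimeInput, so the end can be backdated the same way.
span.end(Date.now() - endDetectedLateByMs);
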
package/src/voice/audio_recognition.ts
CHANGED

@@ -566,9 +566,11 @@ export class AudioRecognition {
     this.speaking = true;

     if (!this.userTurnSpan) {
+      const startTime = Date.now() - ev.speechDuration;
       this.userTurnSpan = tracer.startSpan({
         name: 'user_turn',
         context: this.rootSpanContext,
+        startTime,
       });
     }

package/src/voice/avatar/datastream_io.ts
CHANGED

@@ -47,6 +47,7 @@ export class DataStreamAudioOutput extends AudioOutput {
   private started: boolean = false;
   private lock = new Mutex();
   private startTask?: Task<void>;
+  private firstFrameEmitted: boolean = false;

   #logger = log();

@@ -146,6 +147,11 @@
     await this.startTask.result;
     await super.captureFrame(frame);

+    if (!this.firstFrameEmitted) {
+      this.firstFrameEmitted = true;
+      this.onPlaybackStarted(Date.now());
+    }
+
     if (!this.streamWriter) {
       this.streamWriter = await this.room.localParticipant!.streamBytes({
         name: shortuuid('AUDIO_'),
@@ -174,6 +180,8 @@
     this.streamWriter.close().finally(() => {
       this.streamWriter = undefined;
     });
+
+    this.firstFrameEmitted = false;
   }

   clearBuffer(): void {
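
The firstFrameEmitted flag implements a fire-once-per-segment guard that is re-armed on flush. A self-contained sketch of just that guard (the byte-stream plumbing and the real onPlaybackStarted callback are elided; console.log stands in for the event):

class FirstFrameGuard {
  private firstFrameEmitted = false;

  captureFrame(frameNo: number): void {
    // Fire exactly once per playout segment, stamped with the wall-clock
    // time of the first captured frame.
    if (!this.firstFrameEmitted) {
      this.firstFrameEmitted = true;
      console.log(`playback started at ${Date.now()} (frame ${frameNo})`);
    }
  }

  flush(): void {
    // Re-arm so the next segment reports its own first frame.
    this.firstFrameEmitted = false;
  }
}

const guard = new FirstFrameGuard();
guard.captureFrame(1); // logs "playback started..."
guard.captureFrame(2); // silent
guard.flush();
guard.captureFrame(3); // logs again for the next segment
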
package/src/voice/generation.ts
CHANGED

@@ -27,7 +27,7 @@ import { traceTypes, tracer } from '../telemetry/index.js';
 import { Future, Task, shortuuid, toError, waitForAbort } from '../utils.js';
 import { type Agent, type ModelSettings, asyncLocalStorage, isStopResponse } from './agent.js';
 import type { AgentSession } from './agent_session.js';
-import
+import { AudioOutput, type LLMNode, type TTSNode, type TextOutput } from './io.js';
 import { RunContext } from './run_context.js';
 import type { SpeechHandle } from './speech_handle.js';

@@ -608,7 +608,8 @@ export function performTextForwarding(

 export interface _AudioOut {
   audio: Array<AudioFrame>;
-
+  /** Future that will be set with the timestamp of the first frame's capture */
+  firstFrameFut: Future<number>;
 }

 async function forwardAudio(
@@ -620,7 +621,16 @@ async function forwardAudio(
   const reader = ttsStream.getReader();
   let resampler: AudioResampler | null = null;

+  const onPlaybackStarted = (ev: { createdAt: number }) => {
+    if (!out.firstFrameFut.done) {
+      out.firstFrameFut.resolve(ev.createdAt);
+    }
+  };
+
   try {
+    audioOuput.on(AudioOutput.EVENT_PLAYBACK_STARTED, onPlaybackStarted);
+    audioOuput.resume();
+
     while (true) {
       if (signal?.aborted) {
         break;
@@ -647,20 +657,21 @@ async function forwardAudio(
       } else {
         await audioOuput.captureFrame(frame);
       }
-
-      // set the first frame future if not already set
-      // (after completing the first frame)
-      if (!out.firstFrameFut.done) {
-        out.firstFrameFut.resolve();
-      }
     }
-
-    reader?.releaseLock();
+
     if (resampler) {
       for (const f of resampler.flush()) {
         await audioOuput.captureFrame(f);
       }
     }
+  } finally {
+    audioOuput.off(AudioOutput.EVENT_PLAYBACK_STARTED, onPlaybackStarted);
+
+    if (!out.firstFrameFut.done) {
+      out.firstFrameFut.reject(new Error('audio forwarding cancelled before playback started'));
+    }
+
+    reader?.releaseLock();
     audioOuput.flush();
   }
 }
@@ -670,10 +681,11 @@ export function performAudioForwarding(
   audioOutput: AudioOutput,
   controller: AbortController,
 ): [Task<void>, _AudioOut] {
-  const out = {
+  const out: _AudioOut = {
     audio: [],
-    firstFrameFut: new Future(),
+    firstFrameFut: new Future<number>(),
   };
+
   return [
     Task.from(
       (controller) => forwardAudio(ttsStream, audioOutput, out, controller.signal),
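
The new forwardAudio wiring subscribes to AudioOutput.EVENT_PLAYBACK_STARTED, resolves firstFrameFut with the first frame's createdAt, and uses a finally block to detach the listener and reject the future if playback never started. A self-contained sketch of that lifecycle, using a plain Node EventEmitter as a stand-in for AudioOutput (names are illustrative):

import { EventEmitter } from 'node:events';

const EVENT_PLAYBACK_STARTED = 'playbackStarted';

// Returns the timestamp of the first played frame, or rejects if forwarding
// ends before playback ever starts (mirroring the finally block above).
async function forwardWithFirstFrame(output: EventEmitter): Promise<number> {
  let resolveFirst!: (ts: number) => void;
  let rejectFirst!: (e: Error) => void;
  const firstFrame = new Promise<number>((res, rej) => {
    resolveFirst = res;
    rejectFirst = rej;
  });

  const onStarted = (ev: { createdAt: number }) => resolveFirst(ev.createdAt);
  output.on(EVENT_PLAYBACK_STARTED, onStarted);
  try {
    // ...capture frames here; the sink emits the event on its first frame.
    output.emit(EVENT_PLAYBACK_STARTED, { createdAt: Date.now() });
    return await firstFrame;
  } finally {
    // Always detach the listener; rejecting an already-settled promise is a no-op.
    output.off(EVENT_PLAYBACK_STARTED, onStarted);
    rejectFirst(new Error('audio forwarding cancelled before playback started'));
  }
}

forwardWithFirstFrame(new EventEmitter()).then((ts) => console.log('first frame at', ts));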