@livekit/agents 1.0.44 → 1.0.46
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/ipc/supervised_proc.cjs +1 -1
- package/dist/ipc/supervised_proc.cjs.map +1 -1
- package/dist/ipc/supervised_proc.js +1 -1
- package/dist/ipc/supervised_proc.js.map +1 -1
- package/dist/llm/llm.cjs +1 -1
- package/dist/llm/llm.cjs.map +1 -1
- package/dist/llm/llm.js +1 -1
- package/dist/llm/llm.js.map +1 -1
- package/dist/log.cjs +13 -9
- package/dist/log.cjs.map +1 -1
- package/dist/log.d.cts +1 -1
- package/dist/log.d.ts +1 -1
- package/dist/log.d.ts.map +1 -1
- package/dist/log.js +13 -9
- package/dist/log.js.map +1 -1
- package/dist/stream/index.cjs +3 -0
- package/dist/stream/index.cjs.map +1 -1
- package/dist/stream/index.d.cts +1 -0
- package/dist/stream/index.d.ts +1 -0
- package/dist/stream/index.d.ts.map +1 -1
- package/dist/stream/index.js +2 -0
- package/dist/stream/index.js.map +1 -1
- package/dist/stream/multi_input_stream.cjs +139 -0
- package/dist/stream/multi_input_stream.cjs.map +1 -0
- package/dist/stream/multi_input_stream.d.cts +55 -0
- package/dist/stream/multi_input_stream.d.ts +55 -0
- package/dist/stream/multi_input_stream.d.ts.map +1 -0
- package/dist/stream/multi_input_stream.js +115 -0
- package/dist/stream/multi_input_stream.js.map +1 -0
- package/dist/stream/multi_input_stream.test.cjs +340 -0
- package/dist/stream/multi_input_stream.test.cjs.map +1 -0
- package/dist/stream/multi_input_stream.test.js +339 -0
- package/dist/stream/multi_input_stream.test.js.map +1 -0
- package/dist/stt/stt.cjs +2 -2
- package/dist/stt/stt.cjs.map +1 -1
- package/dist/stt/stt.js +2 -2
- package/dist/stt/stt.js.map +1 -1
- package/dist/telemetry/trace_types.cjs +42 -0
- package/dist/telemetry/trace_types.cjs.map +1 -1
- package/dist/telemetry/trace_types.d.cts +14 -0
- package/dist/telemetry/trace_types.d.ts +14 -0
- package/dist/telemetry/trace_types.d.ts.map +1 -1
- package/dist/telemetry/trace_types.js +28 -0
- package/dist/telemetry/trace_types.js.map +1 -1
- package/dist/tts/fallback_adapter.cjs +466 -0
- package/dist/tts/fallback_adapter.cjs.map +1 -0
- package/dist/tts/fallback_adapter.d.cts +110 -0
- package/dist/tts/fallback_adapter.d.ts +110 -0
- package/dist/tts/fallback_adapter.d.ts.map +1 -0
- package/dist/tts/fallback_adapter.js +442 -0
- package/dist/tts/fallback_adapter.js.map +1 -0
- package/dist/tts/index.cjs +3 -0
- package/dist/tts/index.cjs.map +1 -1
- package/dist/tts/index.d.cts +1 -0
- package/dist/tts/index.d.ts +1 -0
- package/dist/tts/index.d.ts.map +1 -1
- package/dist/tts/index.js +2 -0
- package/dist/tts/index.js.map +1 -1
- package/dist/tts/tts.cjs +2 -2
- package/dist/tts/tts.cjs.map +1 -1
- package/dist/tts/tts.js +2 -2
- package/dist/tts/tts.js.map +1 -1
- package/dist/utils.cjs +13 -0
- package/dist/utils.cjs.map +1 -1
- package/dist/utils.d.cts +1 -0
- package/dist/utils.d.ts +1 -0
- package/dist/utils.d.ts.map +1 -1
- package/dist/utils.js +13 -0
- package/dist/utils.js.map +1 -1
- package/dist/vad.cjs +11 -10
- package/dist/vad.cjs.map +1 -1
- package/dist/vad.d.cts +5 -3
- package/dist/vad.d.ts +5 -3
- package/dist/vad.d.ts.map +1 -1
- package/dist/vad.js +11 -10
- package/dist/vad.js.map +1 -1
- package/dist/voice/agent_activity.cjs +35 -10
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.cts +1 -0
- package/dist/voice/agent_activity.d.ts +1 -0
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +35 -10
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_session.cjs +19 -7
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +3 -2
- package/dist/voice/agent_session.d.ts +3 -2
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +19 -7
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/audio_recognition.cjs +85 -36
- package/dist/voice/audio_recognition.cjs.map +1 -1
- package/dist/voice/audio_recognition.d.cts +22 -1
- package/dist/voice/audio_recognition.d.ts +22 -1
- package/dist/voice/audio_recognition.d.ts.map +1 -1
- package/dist/voice/audio_recognition.js +89 -36
- package/dist/voice/audio_recognition.js.map +1 -1
- package/dist/voice/audio_recognition_span.test.cjs +233 -0
- package/dist/voice/audio_recognition_span.test.cjs.map +1 -0
- package/dist/voice/audio_recognition_span.test.js +232 -0
- package/dist/voice/audio_recognition_span.test.js.map +1 -0
- package/dist/voice/io.cjs +6 -3
- package/dist/voice/io.cjs.map +1 -1
- package/dist/voice/io.d.cts +3 -2
- package/dist/voice/io.d.ts +3 -2
- package/dist/voice/io.d.ts.map +1 -1
- package/dist/voice/io.js +6 -3
- package/dist/voice/io.js.map +1 -1
- package/dist/voice/recorder_io/recorder_io.cjs +3 -1
- package/dist/voice/recorder_io/recorder_io.cjs.map +1 -1
- package/dist/voice/recorder_io/recorder_io.d.ts.map +1 -1
- package/dist/voice/recorder_io/recorder_io.js +3 -1
- package/dist/voice/recorder_io/recorder_io.js.map +1 -1
- package/dist/voice/room_io/_input.cjs +23 -20
- package/dist/voice/room_io/_input.cjs.map +1 -1
- package/dist/voice/room_io/_input.d.cts +2 -2
- package/dist/voice/room_io/_input.d.ts +2 -2
- package/dist/voice/room_io/_input.d.ts.map +1 -1
- package/dist/voice/room_io/_input.js +13 -9
- package/dist/voice/room_io/_input.js.map +1 -1
- package/dist/voice/room_io/room_io.cjs +9 -0
- package/dist/voice/room_io/room_io.cjs.map +1 -1
- package/dist/voice/room_io/room_io.d.cts +3 -1
- package/dist/voice/room_io/room_io.d.ts +3 -1
- package/dist/voice/room_io/room_io.d.ts.map +1 -1
- package/dist/voice/room_io/room_io.js +9 -0
- package/dist/voice/room_io/room_io.js.map +1 -1
- package/dist/voice/utils.cjs +47 -0
- package/dist/voice/utils.cjs.map +1 -0
- package/dist/voice/utils.d.cts +4 -0
- package/dist/voice/utils.d.ts +4 -0
- package/dist/voice/utils.d.ts.map +1 -0
- package/dist/voice/utils.js +23 -0
- package/dist/voice/utils.js.map +1 -0
- package/package.json +1 -1
- package/src/ipc/supervised_proc.ts +1 -1
- package/src/llm/llm.ts +1 -1
- package/src/log.ts +22 -11
- package/src/stream/index.ts +1 -0
- package/src/stream/multi_input_stream.test.ts +540 -0
- package/src/stream/multi_input_stream.ts +172 -0
- package/src/stt/stt.ts +2 -2
- package/src/telemetry/trace_types.ts +18 -0
- package/src/tts/fallback_adapter.ts +579 -0
- package/src/tts/index.ts +1 -0
- package/src/tts/tts.ts +2 -2
- package/src/utils.ts +16 -0
- package/src/vad.ts +12 -11
- package/src/voice/agent_activity.ts +25 -0
- package/src/voice/agent_session.ts +17 -11
- package/src/voice/audio_recognition.ts +114 -38
- package/src/voice/audio_recognition_span.test.ts +261 -0
- package/src/voice/io.ts +7 -4
- package/src/voice/recorder_io/recorder_io.ts +2 -1
- package/src/voice/room_io/_input.ts +16 -10
- package/src/voice/room_io/room_io.ts +12 -0
- package/src/voice/utils.ts +29 -0
package/src/utils.ts
CHANGED

```diff
@@ -126,6 +126,8 @@ export class Future<T = void> {
   #rejectPromise!: (error: Error) => void;
   #done: boolean = false;
   #rejected: boolean = false;
+  #result: T | undefined = undefined;
+  #error: Error | undefined = undefined;
 
   constructor() {
     this.#await = new Promise<T>((resolve, reject) => {
@@ -142,6 +144,18 @@ export class Future<T = void> {
     return this.#done;
   }
 
+  get result(): T {
+    if (!this.#done) {
+      throw new Error('Future is not done');
+    }
+
+    if (this.#rejected) {
+      throw this.#error;
+    }
+
+    return this.#result!;
+  }
+
   /** Whether the future was rejected (cancelled) */
   get rejected() {
     return this.#rejected;
@@ -149,12 +163,14 @@ export class Future<T = void> {
 
   resolve(value: T) {
     this.#done = true;
+    this.#result = value;
    this.#resolvePromise(value);
   }
 
   reject(error: Error) {
     this.#done = true;
     this.#rejected = true;
+    this.#error = error;
     this.#rejectPromise(error);
   }
 }
```
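
The new `result` getter makes a settled `Future` readable without awaiting it. A minimal usage sketch (the calling code below is hypothetical; only the `Future` members visible in the diff are assumed):

```typescript
// Hypothetical caller of the patched Future class.
const fut = new Future<string>();
fut.resolve('done');

if (fut.done && !fut.rejected) {
  console.log(fut.result); // 'done' — readable synchronously once settled
}

// Before resolution, `result` throws Error('Future is not done');
// after reject(err), it rethrows the stored `err` instead of returning undefined.
```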
package/src/vad.ts
CHANGED

```diff
@@ -98,14 +98,15 @@ export abstract class VADStream implements AsyncIterableIterator<VADEvent> {
   protected closed = false;
   protected inputClosed = false;
 
-  #vad: VAD;
-  #lastActivityTime = BigInt(0);
-  #logger = log();
-  #deferredInputStream: DeferredReadableStream<AudioFrame>;
+  protected vad: VAD;
+  protected lastActivityTime = BigInt(0);
+  protected logger;
+  protected deferredInputStream: DeferredReadableStream<AudioFrame>;
 
   private metricsStream: ReadableStream<VADEvent>;
   constructor(vad: VAD) {
-    this.#vad = vad;
+    this.logger = log();
+    this.vad = vad;
     this.deferredInputStream = new DeferredReadableStream<AudioFrame>();
 
     this.inputWriter = this.input.writable.getWriter();
@@ -155,16 +156,16 @@ export abstract class VADStream implements AsyncIterableIterator<VADEvent> {
     switch (value.type) {
       case VADEventType.START_OF_SPEECH:
         inferenceCount++;
-        if (inferenceCount >= 1000 / this.#vad.capabilities.updateInterval) {
-          this.#vad.emit('metrics_collected', {
+        if (inferenceCount >= 1000 / this.vad.capabilities.updateInterval) {
+          this.vad.emit('metrics_collected', {
             type: 'vad_metrics',
             timestamp: Date.now(),
             idleTimeMs: Math.trunc(
-              Number((process.hrtime.bigint() - this.#lastActivityTime) / BigInt(1000000)),
+              Number((process.hrtime.bigint() - this.lastActivityTime) / BigInt(1000000)),
             ),
             inferenceDurationTotalMs,
             inferenceCount,
-            label: this.#vad.label,
+            label: this.vad.label,
           });
 
           inferenceCount = 0;
@@ -173,10 +174,10 @@ export abstract class VADStream implements AsyncIterableIterator<VADEvent> {
         break;
       case VADEventType.INFERENCE_DONE:
         inferenceDurationTotalMs += Math.round(value.inferenceDuration);
-        this.#lastActivityTime = process.hrtime.bigint();
+        this.lastActivityTime = process.hrtime.bigint();
         break;
       case VADEventType.END_OF_SPEECH:
-        this.#lastActivityTime = process.hrtime.bigint();
+        this.lastActivityTime = process.hrtime.bigint();
         break;
     }
   }
```
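
The rename from `#`-private to `protected` fields lets subclasses reach `vad`, `logger`, `lastActivityTime`, and the deferred input stream; the metrics payload itself is unchanged. A sketch of consuming the `metrics_collected` event, assuming the Silero plugin (`@livekit/agents-plugin-silero`) as the concrete `VAD`:

```typescript
import * as silero from '@livekit/agents-plugin-silero';

// Assumption: silero.VAD.load() yields a VAD whose streams drive the metrics
// loop shown above; the payload fields mirror the emit() call in the diff.
const vad = await silero.VAD.load();
vad.on('metrics_collected', (m) => {
  console.log(`${m.label}: ${m.inferenceCount} inferences, idle ${m.idleTimeMs} ms`);
});
```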
package/src/voice/agent_activity.ts
CHANGED

```diff
@@ -74,6 +74,7 @@ import {
 } from './generation.js';
 import type { TimedString } from './io.js';
 import { SpeechHandle } from './speech_handle.js';
+import { setParticipantSpanAttributes } from './utils.js';
 
 const speechHandleStorage = new AsyncLocalStorage<SpeechHandle>();
 
@@ -299,6 +300,9 @@ export class AgentActivity implements RecognitionHooks {
       minEndpointingDelay: this.agentSession.options.minEndpointingDelay,
       maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay,
       rootSpanContext: this.agentSession.rootSpanContext,
+      sttModel: this.stt?.label,
+      sttProvider: this.getSttProvider(),
+      getLinkedParticipant: () => this.agentSession._roomIO?.linkedParticipant,
     });
     this.audioRecognition.start();
     this.started = true;
@@ -335,6 +339,17 @@ export class AgentActivity implements RecognitionHooks {
     return this.agent.stt || this.agentSession.stt;
   }
 
+  private getSttProvider(): string | undefined {
+    const label = this.stt?.label;
+    if (!label) {
+      return undefined;
+    }
+
+    // Heuristic: most labels look like "<provider>-<model>"
+    const [provider] = label.split('-', 1);
+    return provider || label;
+  }
+
   get llm(): LLM | RealtimeModel | undefined {
     return this.agent.llm || this.agentSession.llm;
   }
@@ -1355,6 +1370,11 @@ export class AgentActivity implements RecognitionHooks {
       span.setAttribute(traceTypes.ATTR_USER_INPUT, newMessage.textContent || '');
     }
 
+    const localParticipant = this.agentSession._roomIO?.localParticipant;
+    if (localParticipant) {
+      setParticipantSpanAttributes(span, localParticipant);
+    }
+
     speechHandleStorage.enterWith(speechHandle);
 
     const audioOutput = this.agentSession.output.audioEnabled
@@ -1815,6 +1835,11 @@ export class AgentActivity implements RecognitionHooks {
 
     span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);
 
+    const localParticipant = this.agentSession._roomIO?.localParticipant;
+    if (localParticipant) {
+      setParticipantSpanAttributes(span, localParticipant);
+    }
+
     speechHandleStorage.enterWith(speechHandle);
 
     if (!this.realtimeSession) {
```
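
One subtlety in `getSttProvider`: JavaScript's `split(separator, limit)` truncates the result array rather than limiting the number of split points (as Python's `str.split` does), so the heuristic keeps only the text before the first hyphen. Standalone illustration with made-up labels:

```typescript
// The provider heuristic, extracted for illustration (labels are hypothetical).
const providerOf = (label: string): string => {
  const [provider] = label.split('-', 1); // ['deepgram'], not ['deepgram', 'nova-2']
  return provider || label; // falls back to the whole label, e.g. for '-model'
};

providerOf('deepgram-nova-2'); // 'deepgram'
providerOf('whisper');         // 'whisper' (no hyphen: whole label)
```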
package/src/voice/agent_session.ts
CHANGED

```diff
@@ -62,6 +62,7 @@ import { RoomIO, type RoomInputOptions, type RoomOutputOptions } from './room_io
 import type { UnknownUserData } from './run_context.js';
 import type { SpeechHandle } from './speech_handle.js';
 import { RunResult } from './testing/run_result.js';
+import { setParticipantSpanAttributes } from './utils.js';
 
 export interface VoiceOptions {
   allowInterruptions: boolean;
@@ -131,7 +132,8 @@ export class AgentSession<
   private started = false;
   private userState: UserState = 'listening';
 
-  #roomIO?: RoomIO;
+  /** @internal */
+  _roomIO?: RoomIO;
   private logger = log();
 
   private _chatCtx: ChatContext;
@@ -294,7 +296,7 @@ export class AgentSession<
 
     const tasks: Promise<void>[] = [];
 
-    if (room && !this.#roomIO) {
+    if (room && !this._roomIO) {
       // Check for existing input/output configuration and warn if needed
       if (this.input.audio && inputOptions?.audioEnabled !== false) {
         this.logger.warn(
@@ -314,13 +316,13 @@ export class AgentSession<
        );
      }
 
-      this.#roomIO = new RoomIO({
+      this._roomIO = new RoomIO({
        agentSession: this,
        room,
        inputOptions,
        outputOptions,
      });
-      this.#roomIO.start();
+      this._roomIO.start();
    }
 
     let ctx: JobContext | undefined = undefined;
@@ -700,8 +702,10 @@ export class AgentSession<
          startTime: options?.startTime,
        });
 
-        …
-        …
+        const localParticipant = this._roomIO?.localParticipant;
+        if (localParticipant) {
+          setParticipantSpanAttributes(this.agentSpeakingSpan, localParticipant);
+        }
      }
    } else if (this.agentSpeakingSpan !== undefined) {
      // TODO(brian): PR4 - Set ATTR_END_TIME attribute if available
@@ -738,8 +742,10 @@ export class AgentSession<
        startTime: lastSpeakingTime,
      });
 
-      …
-      …
+      const linked = this._roomIO?.linkedParticipant;
+      if (linked) {
+        setParticipantSpanAttributes(this.userSpeakingSpan, linked);
+      }
    } else if (this.userSpeakingSpan !== undefined) {
      this.userSpeakingSpan.end(lastSpeakingTime);
      this.userSpeakingSpan = undefined;
@@ -783,7 +789,7 @@ export class AgentSession<
       return;
     }
 
-    if (this.#roomIO && !this.#roomIO.isParticipantAvailable) {
+    if (this._roomIO && !this._roomIO.isParticipantAvailable) {
       return;
     }
 
@@ -862,8 +868,8 @@ export class AgentSession<
     this.output.audio = null;
     this.output.transcription = null;
 
-    await this.#roomIO?.close();
-    this.#roomIO = undefined;
+    await this._roomIO?.close();
+    this._roomIO = undefined;
 
     await this.activity?.close();
     this.activity = undefined;
```
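
`setParticipantSpanAttributes` itself lives in the new `package/src/voice/utils.ts`, whose body is not shown in this diff; only its call signature and the `ParticipantLike` shape (`sid`, `identity`, `kind`) are visible. A speculative sketch of what such a helper could look like — the attribute keys below are assumptions, not the actual implementation:

```typescript
import type { Span } from '@opentelemetry/api';
import type { ParticipantKind } from '@livekit/rtc-node';

interface ParticipantLike {
  sid: string | undefined;
  identity: string;
  kind: ParticipantKind;
}

// Speculative: copies the participant fields onto a span under assumed keys.
function setParticipantSpanAttributes(span: Span, participant: ParticipantLike): void {
  if (participant.sid) span.setAttribute('lk.participant.sid', participant.sid);
  span.setAttribute('lk.participant.identity', participant.identity);
  span.setAttribute('lk.participant.kind', participant.kind);
}
```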
package/src/voice/audio_recognition.ts
CHANGED

```diff
@@ -1,8 +1,15 @@
 // SPDX-FileCopyrightText: 2025 LiveKit, Inc.
 //
 // SPDX-License-Identifier: Apache-2.0
+import type { ParticipantKind } from '@livekit/rtc-node';
 import { AudioFrame } from '@livekit/rtc-node';
-import { type Context } from '@opentelemetry/api';
+import {
+  type Context,
+  ROOT_CONTEXT,
+  type Span,
+  context as otelContext,
+  trace,
+} from '@opentelemetry/api';
 import type { WritableStreamDefaultWriter } from 'node:stream/web';
 import { ReadableStream } from 'node:stream/web';
 import { type ChatContext } from '../llm/chat_context.js';
@@ -16,6 +23,7 @@ import { Task, delay } from '../utils.js';
 import { type VAD, type VADEvent, VADEventType } from '../vad.js';
 import type { TurnDetectionMode } from './agent_session.js';
 import type { STTNode } from './io.js';
+import { setParticipantSpanAttributes } from './utils.js';
 
 export interface EndOfTurnInfo {
   /** The new transcript text from the user's speech. */
@@ -72,6 +80,22 @@ export interface AudioRecognitionOptions {
   maxEndpointingDelay: number;
   /** Root span context for tracing. */
   rootSpanContext?: Context;
+  /** STT model name for tracing */
+  sttModel?: string;
+  /** STT provider name for tracing */
+  sttProvider?: string;
+  /** Getter for linked participant for span attribution */
+  getLinkedParticipant?: () => ParticipantLike | undefined;
+}
+
+/**
+ * Minimal participant shape for span attribution.
+ * Compatible with both `LocalParticipant` and `RemoteParticipant` from `@livekit/rtc-node`.
+ */
+export interface ParticipantLike {
+  sid: string | undefined;
+  identity: string;
+  kind: ParticipantKind;
 }
 
 export class AudioRecognition {
@@ -84,6 +108,9 @@ export class AudioRecognition {
   private maxEndpointingDelay: number;
   private lastLanguage?: string;
   private rootSpanContext?: Context;
+  private sttModel?: string;
+  private sttProvider?: string;
+  private getLinkedParticipant?: () => ParticipantLike | undefined;
 
   private deferredInputStream: DeferredReadableStream<AudioFrame>;
   private logger = log();
@@ -121,6 +148,9 @@ export class AudioRecognition {
     this.maxEndpointingDelay = opts.maxEndpointingDelay;
     this.lastLanguage = undefined;
     this.rootSpanContext = opts.rootSpanContext;
+    this.sttModel = opts.sttModel;
+    this.sttProvider = opts.sttProvider;
+    this.getLinkedParticipant = opts.getLinkedParticipant;
 
     this.deferredInputStream = new DeferredReadableStream<AudioFrame>();
     const [vadInputStream, sttInputStream] = this.deferredInputStream.stream.tee();
@@ -151,6 +181,37 @@ export class AudioRecognition {
     });
   }
 
+  private ensureUserTurnSpan(startTime?: number): Span {
+    if (this.userTurnSpan && this.userTurnSpan.isRecording()) {
+      return this.userTurnSpan;
+    }
+
+    this.userTurnSpan = tracer.startSpan({
+      name: 'user_turn',
+      context: this.rootSpanContext,
+      startTime,
+    });
+
+    const participant = this.getLinkedParticipant?.();
+    if (participant) {
+      setParticipantSpanAttributes(this.userTurnSpan, participant);
+    }
+
+    if (this.sttModel) {
+      this.userTurnSpan.setAttribute(traceTypes.ATTR_GEN_AI_REQUEST_MODEL, this.sttModel);
+    }
+    if (this.sttProvider) {
+      this.userTurnSpan.setAttribute(traceTypes.ATTR_GEN_AI_PROVIDER_NAME, this.sttProvider);
+    }
+
+    return this.userTurnSpan;
+  }
+
+  private userTurnContext(span: Span): Context {
+    const base = this.rootSpanContext ?? ROOT_CONTEXT;
+    return trace.setSpan(base, span);
+  }
+
   private async onSTTEvent(ev: SpeechEvent) {
     if (
       this.turnDetectionMode === 'manual' &&
@@ -299,19 +360,25 @@ export class AudioRecognition {
         break;
       case SpeechEventType.START_OF_SPEECH:
         if (this.turnDetectionMode !== 'stt') break;
-        this.hooks.onStartOfSpeech({
-          type: VADEventType.START_OF_SPEECH,
-          samplesIndex: 0,
-          timestamp: Date.now(),
-          speechDuration: 0,
-          silenceDuration: 0,
-          frames: [],
-          probability: 0,
-          inferenceDuration: 0,
-          speaking: true,
-          rawAccumulatedSilence: 0,
-          rawAccumulatedSpeech: 0,
-        });
+        {
+          const span = this.ensureUserTurnSpan(Date.now());
+          const ctx = this.userTurnContext(span);
+          otelContext.with(ctx, () => {
+            this.hooks.onStartOfSpeech({
+              type: VADEventType.START_OF_SPEECH,
+              samplesIndex: 0,
+              timestamp: Date.now(),
+              speechDuration: 0,
+              silenceDuration: 0,
+              frames: [],
+              probability: 0,
+              inferenceDuration: 0,
+              speaking: true,
+              rawAccumulatedSilence: 0,
+              rawAccumulatedSpeech: 0,
+            });
+          });
+        }
         this.speaking = true;
         this.lastSpeakingTime = Date.now();
 
@@ -319,19 +386,25 @@ export class AudioRecognition {
         break;
       case SpeechEventType.END_OF_SPEECH:
         if (this.turnDetectionMode !== 'stt') break;
-        this.hooks.onEndOfSpeech({
-          type: VADEventType.END_OF_SPEECH,
-          samplesIndex: 0,
-          timestamp: Date.now(),
-          speechDuration: 0,
-          silenceDuration: 0,
-          frames: [],
-          probability: 0,
-          inferenceDuration: 0,
-          speaking: false,
-          rawAccumulatedSilence: 0,
-          rawAccumulatedSpeech: 0,
-        });
+        {
+          const span = this.ensureUserTurnSpan();
+          const ctx = this.userTurnContext(span);
+          otelContext.with(ctx, () => {
+            this.hooks.onEndOfSpeech({
+              type: VADEventType.END_OF_SPEECH,
+              samplesIndex: 0,
+              timestamp: Date.now(),
+              speechDuration: 0,
+              silenceDuration: 0,
+              frames: [],
+              probability: 0,
+              inferenceDuration: 0,
+              speaking: false,
+              rawAccumulatedSilence: 0,
+              rawAccumulatedSpeech: 0,
+            });
+          });
+        }
         this.speaking = false;
         this.userTurnCommitted = true;
         this.lastSpeakingTime = Date.now();
@@ -376,6 +449,9 @@ export class AudioRecognition {
       async (controller: AbortController) => {
         let endpointingDelay = this.minEndpointingDelay;
 
+        const userTurnSpan = this.ensureUserTurnSpan();
+        const userTurnCtx = this.userTurnContext(userTurnSpan);
+
         if (turnDetector) {
           await tracer.startActiveSpan(
             async (span) => {
@@ -415,7 +491,7 @@ export class AudioRecognition {
             },
             {
               name: 'eou_detection',
-              context: this.rootSpanContext,
+              context: userTurnCtx,
             },
           );
         }
@@ -577,17 +653,13 @@ export class AudioRecognition {
     switch (ev.type) {
       case VADEventType.START_OF_SPEECH:
         this.logger.debug('VAD task: START_OF_SPEECH');
-
-        this.speaking = true;
-
-        if (!this.userTurnSpan) {
+        {
           const startTime = Date.now() - ev.speechDuration;
-          this.userTurnSpan = tracer.startSpan({
-            name: 'user_turn',
-            context: this.rootSpanContext,
-            startTime,
-          });
+          const span = this.ensureUserTurnSpan(startTime);
+          const ctx = this.userTurnContext(span);
+          otelContext.with(ctx, () => this.hooks.onStartOfSpeech(ev));
         }
+        this.speaking = true;
 
         // Capture sample rate from the first VAD event if not already set
        if (ev.frames.length > 0 && ev.frames[0]) {
@@ -609,7 +681,11 @@ export class AudioRecognition {
         break;
       case VADEventType.END_OF_SPEECH:
         this.logger.debug('VAD task: END_OF_SPEECH');
-        this.hooks.onEndOfSpeech(ev);
+        {
+          const span = this.ensureUserTurnSpan();
+          const ctx = this.userTurnContext(span);
+          otelContext.with(ctx, () => this.hooks.onEndOfSpeech(ev));
+        }
 
         // when VAD fires END_OF_SPEECH, it already waited for the silence_duration
         this.speaking = false;
```