@livekit/agents 1.0.45 → 1.0.46
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/stream/index.cjs +3 -0
- package/dist/stream/index.cjs.map +1 -1
- package/dist/stream/index.d.cts +1 -0
- package/dist/stream/index.d.ts +1 -0
- package/dist/stream/index.d.ts.map +1 -1
- package/dist/stream/index.js +2 -0
- package/dist/stream/index.js.map +1 -1
- package/dist/stream/multi_input_stream.cjs +139 -0
- package/dist/stream/multi_input_stream.cjs.map +1 -0
- package/dist/stream/multi_input_stream.d.cts +55 -0
- package/dist/stream/multi_input_stream.d.ts +55 -0
- package/dist/stream/multi_input_stream.d.ts.map +1 -0
- package/dist/stream/multi_input_stream.js +115 -0
- package/dist/stream/multi_input_stream.js.map +1 -0
- package/dist/stream/multi_input_stream.test.cjs +340 -0
- package/dist/stream/multi_input_stream.test.cjs.map +1 -0
- package/dist/stream/multi_input_stream.test.js +339 -0
- package/dist/stream/multi_input_stream.test.js.map +1 -0
- package/dist/telemetry/trace_types.cjs +42 -0
- package/dist/telemetry/trace_types.cjs.map +1 -1
- package/dist/telemetry/trace_types.d.cts +14 -0
- package/dist/telemetry/trace_types.d.ts +14 -0
- package/dist/telemetry/trace_types.d.ts.map +1 -1
- package/dist/telemetry/trace_types.js +28 -0
- package/dist/telemetry/trace_types.js.map +1 -1
- package/dist/utils.cjs +13 -0
- package/dist/utils.cjs.map +1 -1
- package/dist/utils.d.cts +1 -0
- package/dist/utils.d.ts +1 -0
- package/dist/utils.d.ts.map +1 -1
- package/dist/utils.js +13 -0
- package/dist/utils.js.map +1 -1
- package/dist/voice/agent_activity.cjs +35 -10
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.cts +1 -0
- package/dist/voice/agent_activity.d.ts +1 -0
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +35 -10
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_session.cjs +19 -7
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +3 -2
- package/dist/voice/agent_session.d.ts +3 -2
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +19 -7
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/audio_recognition.cjs +85 -36
- package/dist/voice/audio_recognition.cjs.map +1 -1
- package/dist/voice/audio_recognition.d.cts +22 -1
- package/dist/voice/audio_recognition.d.ts +22 -1
- package/dist/voice/audio_recognition.d.ts.map +1 -1
- package/dist/voice/audio_recognition.js +89 -36
- package/dist/voice/audio_recognition.js.map +1 -1
- package/dist/voice/audio_recognition_span.test.cjs +233 -0
- package/dist/voice/audio_recognition_span.test.cjs.map +1 -0
- package/dist/voice/audio_recognition_span.test.js +232 -0
- package/dist/voice/audio_recognition_span.test.js.map +1 -0
- package/dist/voice/io.cjs +6 -3
- package/dist/voice/io.cjs.map +1 -1
- package/dist/voice/io.d.cts +3 -2
- package/dist/voice/io.d.ts +3 -2
- package/dist/voice/io.d.ts.map +1 -1
- package/dist/voice/io.js +6 -3
- package/dist/voice/io.js.map +1 -1
- package/dist/voice/recorder_io/recorder_io.cjs +3 -1
- package/dist/voice/recorder_io/recorder_io.cjs.map +1 -1
- package/dist/voice/recorder_io/recorder_io.d.ts.map +1 -1
- package/dist/voice/recorder_io/recorder_io.js +3 -1
- package/dist/voice/recorder_io/recorder_io.js.map +1 -1
- package/dist/voice/room_io/_input.cjs +17 -17
- package/dist/voice/room_io/_input.cjs.map +1 -1
- package/dist/voice/room_io/_input.d.cts +2 -2
- package/dist/voice/room_io/_input.d.ts +2 -2
- package/dist/voice/room_io/_input.d.ts.map +1 -1
- package/dist/voice/room_io/_input.js +7 -6
- package/dist/voice/room_io/_input.js.map +1 -1
- package/dist/voice/room_io/room_io.cjs +9 -0
- package/dist/voice/room_io/room_io.cjs.map +1 -1
- package/dist/voice/room_io/room_io.d.cts +3 -1
- package/dist/voice/room_io/room_io.d.ts +3 -1
- package/dist/voice/room_io/room_io.d.ts.map +1 -1
- package/dist/voice/room_io/room_io.js +9 -0
- package/dist/voice/room_io/room_io.js.map +1 -1
- package/dist/voice/utils.cjs +47 -0
- package/dist/voice/utils.cjs.map +1 -0
- package/dist/voice/utils.d.cts +4 -0
- package/dist/voice/utils.d.ts +4 -0
- package/dist/voice/utils.d.ts.map +1 -0
- package/dist/voice/utils.js +23 -0
- package/dist/voice/utils.js.map +1 -0
- package/package.json +1 -1
- package/src/stream/index.ts +1 -0
- package/src/stream/multi_input_stream.test.ts +540 -0
- package/src/stream/multi_input_stream.ts +172 -0
- package/src/telemetry/trace_types.ts +18 -0
- package/src/utils.ts +16 -0
- package/src/voice/agent_activity.ts +25 -0
- package/src/voice/agent_session.ts +17 -11
- package/src/voice/audio_recognition.ts +114 -38
- package/src/voice/audio_recognition_span.test.ts +261 -0
- package/src/voice/io.ts +7 -4
- package/src/voice/recorder_io/recorder_io.ts +2 -1
- package/src/voice/room_io/_input.ts +11 -7
- package/src/voice/room_io/room_io.ts +12 -0
- package/src/voice/utils.ts +29 -0
package/src/telemetry/trace_types.ts
CHANGED

@@ -20,6 +20,8 @@ export const ATTR_ROOM_NAME = 'lk.room_name';
 export const ATTR_SESSION_OPTIONS = 'lk.session_options';
 
 // assistant turn
+export const ATTR_AGENT_TURN_ID = 'lk.generation_id';
+export const ATTR_AGENT_PARENT_TURN_ID = 'lk.parent_generation_id';
 export const ATTR_USER_INPUT = 'lk.user_input';
 export const ATTR_INSTRUCTIONS = 'lk.instructions';
 export const ATTR_SPEECH_INTERRUPTED = 'lk.interrupted';
@@ -27,10 +29,14 @@ export const ATTR_SPEECH_INTERRUPTED = 'lk.interrupted';
 // llm node
 export const ATTR_CHAT_CTX = 'lk.chat_ctx';
 export const ATTR_FUNCTION_TOOLS = 'lk.function_tools';
+export const ATTR_PROVIDER_TOOLS = 'lk.provider_tools';
+export const ATTR_TOOL_SETS = 'lk.tool_sets';
 export const ATTR_RESPONSE_TEXT = 'lk.response.text';
 export const ATTR_RESPONSE_FUNCTION_CALLS = 'lk.response.function_calls';
+export const ATTR_RESPONSE_TTFT = 'lk.response.ttft';
 
 // function tool
+export const ATTR_FUNCTION_TOOL_ID = 'lk.function_tool.id';
 export const ATTR_FUNCTION_TOOL_NAME = 'lk.function_tool.name';
 export const ATTR_FUNCTION_TOOL_ARGS = 'lk.function_tool.arguments';
 export const ATTR_FUNCTION_TOOL_IS_ERROR = 'lk.function_tool.is_error';
@@ -40,6 +46,7 @@ export const ATTR_FUNCTION_TOOL_OUTPUT = 'lk.function_tool.output';
 export const ATTR_TTS_INPUT_TEXT = 'lk.input_text';
 export const ATTR_TTS_STREAMING = 'lk.tts.streaming';
 export const ATTR_TTS_LABEL = 'lk.tts.label';
+export const ATTR_RESPONSE_TTFB = 'lk.response.ttfb';
 
 // eou detection
 export const ATTR_EOU_PROBABILITY = 'lk.eou.probability';
@@ -56,10 +63,14 @@ export const ATTR_LLM_METRICS = 'lk.llm_metrics';
 export const ATTR_TTS_METRICS = 'lk.tts_metrics';
 export const ATTR_REALTIME_MODEL_METRICS = 'lk.realtime_model_metrics';
 
+// latency span attributes
+export const ATTR_E2E_LATENCY = 'lk.e2e_latency';
+
 // OpenTelemetry GenAI attributes
 // OpenTelemetry specification: https://opentelemetry.io/docs/specs/semconv/registry/attributes/gen-ai/
 export const ATTR_GEN_AI_OPERATION_NAME = 'gen_ai.operation.name';
 export const ATTR_GEN_AI_REQUEST_MODEL = 'gen_ai.request.model';
+export const ATTR_GEN_AI_PROVIDER_NAME = 'gen_ai.provider.name';
 export const ATTR_GEN_AI_USAGE_INPUT_TOKENS = 'gen_ai.usage.input_tokens';
 export const ATTR_GEN_AI_USAGE_OUTPUT_TOKENS = 'gen_ai.usage.output_tokens';
 
@@ -86,3 +97,10 @@ export const ATTR_EXCEPTION_MESSAGE = 'exception.message';
 
 // Platform-specific attributes
 export const ATTR_LANGFUSE_COMPLETION_START_TIME = 'langfuse.observation.completion_start_time';
+
+// Adaptive Interruption attributes
+export const ATTR_IS_INTERRUPTION = 'lk.is_interruption';
+export const ATTR_INTERRUPTION_PROBABILITY = 'lk.interruption.probability';
+export const ATTR_INTERRUPTION_TOTAL_DURATION = 'lk.interruption.total_duration';
+export const ATTR_INTERRUPTION_PREDICTION_DURATION = 'lk.interruption.prediction_duration';
+export const ATTR_INTERRUPTION_DETECTION_DELAY = 'lk.interruption.detection_delay';
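
These constants are plain string attribute keys. A minimal, hypothetical sketch of how a consumer might set the new adaptive-interruption attributes on an OpenTelemetry span; the key strings are copied from the constants above, but the span name, values, and units are assumptions, since this diff does not show any call sites:

import { trace } from '@opentelemetry/api';

// Hypothetical usage; attribute keys match the constants added above.
const span = trace.getTracer('example').startSpan('user_turn');
span.setAttribute('lk.is_interruption', true);
span.setAttribute('lk.interruption.probability', 0.92);
span.setAttribute('lk.interruption.total_duration', 350); // unit not stated in the diff
span.end();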
package/src/utils.ts
CHANGED
@@ -126,6 +126,8 @@ export class Future<T = void> {
   #rejectPromise!: (error: Error) => void;
   #done: boolean = false;
   #rejected: boolean = false;
+  #result: T | undefined = undefined;
+  #error: Error | undefined = undefined;
 
   constructor() {
     this.#await = new Promise<T>((resolve, reject) => {
@@ -142,6 +144,18 @@ export class Future<T = void> {
     return this.#done;
   }
 
+  get result(): T {
+    if (!this.#done) {
+      throw new Error('Future is not done');
+    }
+
+    if (this.#rejected) {
+      throw this.#error;
+    }
+
+    return this.#result!;
+  }
+
   /** Whether the future was rejected (cancelled) */
   get rejected() {
     return this.#rejected;
@@ -149,12 +163,14 @@ export class Future<T = void> {
 
   resolve(value: T) {
     this.#done = true;
+    this.#result = value;
     this.#resolvePromise(value);
   }
 
   reject(error: Error) {
     this.#done = true;
    this.#rejected = true;
+    this.#error = error;
    this.#rejectPromise(error);
   }
 }
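
The new result getter gives synchronous access to a settled Future: it throws if the future is still pending, rethrows the stored error if it was rejected, and otherwise returns the stored value (the non-null assertion on #result is safe because #done is checked first). A short usage sketch; the surrounding code is illustrative:

const fut = new Future<number>();
fut.resolve(42);
console.log(fut.done, fut.result); // true 42, no await needed

const failed = new Future<number>();
failed.reject(new Error('boom'));
try {
  failed.result; // rethrows the stored rejection
} catch (err) {
  console.error((err as Error).message); // 'boom'
}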
package/src/voice/agent_activity.ts
CHANGED

@@ -74,6 +74,7 @@ import {
 } from './generation.js';
 import type { TimedString } from './io.js';
 import { SpeechHandle } from './speech_handle.js';
+import { setParticipantSpanAttributes } from './utils.js';
 
 const speechHandleStorage = new AsyncLocalStorage<SpeechHandle>();
 
@@ -299,6 +300,9 @@ export class AgentActivity implements RecognitionHooks {
       minEndpointingDelay: this.agentSession.options.minEndpointingDelay,
       maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay,
       rootSpanContext: this.agentSession.rootSpanContext,
+      sttModel: this.stt?.label,
+      sttProvider: this.getSttProvider(),
+      getLinkedParticipant: () => this.agentSession._roomIO?.linkedParticipant,
     });
     this.audioRecognition.start();
     this.started = true;
@@ -335,6 +339,17 @@ export class AgentActivity implements RecognitionHooks {
     return this.agent.stt || this.agentSession.stt;
   }
 
+  private getSttProvider(): string | undefined {
+    const label = this.stt?.label;
+    if (!label) {
+      return undefined;
+    }
+
+    // Heuristic: most labels look like "<provider>-<model>"
+    const [provider] = label.split('-', 1);
+    return provider || label;
+  }
+
   get llm(): LLM | RealtimeModel | undefined {
     return this.agent.llm || this.agentSession.llm;
   }
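
Note how the limit argument to split, which the getSttProvider heuristic above leans on, behaves; the labels below are made up for illustration:

'deepgram-nova-2'.split('-', 1); // ['deepgram'] -> provider 'deepgram'
'openai'.split('-', 1);          // ['openai']   -> no separator, whole label returned
'-odd'.split('-', 1);            // ['']         -> falsy, `provider || label` falls back to the full label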
@@ -1355,6 +1370,11 @@ export class AgentActivity implements RecognitionHooks {
       span.setAttribute(traceTypes.ATTR_USER_INPUT, newMessage.textContent || '');
     }
 
+    const localParticipant = this.agentSession._roomIO?.localParticipant;
+    if (localParticipant) {
+      setParticipantSpanAttributes(span, localParticipant);
+    }
+
     speechHandleStorage.enterWith(speechHandle);
 
     const audioOutput = this.agentSession.output.audioEnabled
@@ -1815,6 +1835,11 @@ export class AgentActivity implements RecognitionHooks {
 
     span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);
 
+    const localParticipant = this.agentSession._roomIO?.localParticipant;
+    if (localParticipant) {
+      setParticipantSpanAttributes(span, localParticipant);
+    }
+
     speechHandleStorage.enterWith(speechHandle);
 
     if (!this.realtimeSession) {
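
setParticipantSpanAttributes comes from the new package/src/voice/utils.ts (+29 lines), which this diff view does not expand. Based on the ParticipantLike shape introduced in audio_recognition.ts below, a plausible sketch is a small helper that copies participant identity fields onto a span; the attribute keys here are invented for illustration and are not the package's actual keys:

import type { Span } from '@opentelemetry/api';
import type { ParticipantKind } from '@livekit/rtc-node';

interface ParticipantLike {
  sid: string | undefined;
  identity: string;
  kind: ParticipantKind;
}

// Hypothetical implementation; the real one lives in voice/utils.ts (not shown).
function setParticipantSpanAttributes(span: Span, p: ParticipantLike): void {
  if (p.sid) span.setAttribute('lk.participant.sid', p.sid);
  span.setAttribute('lk.participant.identity', p.identity);
  span.setAttribute('lk.participant.kind', String(p.kind));
}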
package/src/voice/agent_session.ts
CHANGED

@@ -62,6 +62,7 @@ import { RoomIO, type RoomInputOptions, type RoomOutputOptions } from './room_io
 import type { UnknownUserData } from './run_context.js';
 import type { SpeechHandle } from './speech_handle.js';
 import { RunResult } from './testing/run_result.js';
+import { setParticipantSpanAttributes } from './utils.js';
 
 export interface VoiceOptions {
   allowInterruptions: boolean;
@@ -131,7 +132,8 @@ export class AgentSession<
   private started = false;
   private userState: UserState = 'listening';
 
-  private roomIO?: RoomIO;
+  /** @internal */
+  _roomIO?: RoomIO;
   private logger = log();
 
   private _chatCtx: ChatContext;
@@ -294,7 +296,7 @@ export class AgentSession<
 
     const tasks: Promise<void>[] = [];
 
-    if (room && !this.roomIO) {
+    if (room && !this._roomIO) {
       // Check for existing input/output configuration and warn if needed
       if (this.input.audio && inputOptions?.audioEnabled !== false) {
         this.logger.warn(
@@ -314,13 +316,13 @@ export class AgentSession<
         );
       }
 
-      this.roomIO = new RoomIO({
+      this._roomIO = new RoomIO({
        agentSession: this,
        room,
        inputOptions,
        outputOptions,
      });
-      this.roomIO.start();
+      this._roomIO.start();
     }
 
     let ctx: JobContext | undefined = undefined;
@@ -700,8 +702,10 @@ export class AgentSession<
          startTime: options?.startTime,
        });
 
-
-
+        const localParticipant = this._roomIO?.localParticipant;
+        if (localParticipant) {
+          setParticipantSpanAttributes(this.agentSpeakingSpan, localParticipant);
+        }
       }
     } else if (this.agentSpeakingSpan !== undefined) {
       // TODO(brian): PR4 - Set ATTR_END_TIME attribute if available
@@ -738,8 +742,10 @@ export class AgentSession<
          startTime: lastSpeakingTime,
        });
 
-
-
+        const linked = this._roomIO?.linkedParticipant;
+        if (linked) {
+          setParticipantSpanAttributes(this.userSpeakingSpan, linked);
+        }
     } else if (this.userSpeakingSpan !== undefined) {
       this.userSpeakingSpan.end(lastSpeakingTime);
       this.userSpeakingSpan = undefined;
@@ -783,7 +789,7 @@ export class AgentSession<
       return;
     }
 
-    if (this.roomIO && !this.roomIO.isParticipantAvailable) {
+    if (this._roomIO && !this._roomIO.isParticipantAvailable) {
       return;
     }
 
@@ -862,8 +868,8 @@ export class AgentSession<
     this.output.audio = null;
     this.output.transcription = null;
 
-    await this.roomIO?.close();
-    this.roomIO = undefined;
+    await this._roomIO?.close();
+    this._roomIO = undefined;
 
     await this.activity?.close();
     this.activity = undefined;
package/src/voice/audio_recognition.ts
CHANGED

@@ -1,8 +1,15 @@
 // SPDX-FileCopyrightText: 2025 LiveKit, Inc.
 //
 // SPDX-License-Identifier: Apache-2.0
+import type { ParticipantKind } from '@livekit/rtc-node';
 import { AudioFrame } from '@livekit/rtc-node';
-import type { Context } from '@opentelemetry/api';
+import {
+  type Context,
+  ROOT_CONTEXT,
+  type Span,
+  context as otelContext,
+  trace,
+} from '@opentelemetry/api';
 import type { WritableStreamDefaultWriter } from 'node:stream/web';
 import { ReadableStream } from 'node:stream/web';
 import { type ChatContext } from '../llm/chat_context.js';
@@ -16,6 +23,7 @@ import { Task, delay } from '../utils.js';
 import { type VAD, type VADEvent, VADEventType } from '../vad.js';
 import type { TurnDetectionMode } from './agent_session.js';
 import type { STTNode } from './io.js';
+import { setParticipantSpanAttributes } from './utils.js';
 
 export interface EndOfTurnInfo {
   /** The new transcript text from the user's speech. */
@@ -72,6 +80,22 @@ export interface AudioRecognitionOptions {
   maxEndpointingDelay: number;
   /** Root span context for tracing. */
   rootSpanContext?: Context;
+  /** STT model name for tracing */
+  sttModel?: string;
+  /** STT provider name for tracing */
+  sttProvider?: string;
+  /** Getter for linked participant for span attribution */
+  getLinkedParticipant?: () => ParticipantLike | undefined;
+}
+
+/**
+ * Minimal participant shape for span attribution.
+ * Compatible with both `LocalParticipant` and `RemoteParticipant` from `@livekit/rtc-node`.
+ */
+export interface ParticipantLike {
+  sid: string | undefined;
+  identity: string;
+  kind: ParticipantKind;
 }
 
 export class AudioRecognition {
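
Since ParticipantLike is structural, any object with those three fields satisfies it, which is handy in tests. A hypothetical illustration; the values and the enum member name are assumptions:

import { ParticipantKind } from '@livekit/rtc-node';

const fakeParticipant: ParticipantLike = {
  sid: 'PA_test123',              // made-up sid
  identity: 'user-42',
  kind: ParticipantKind.STANDARD, // enum member name assumed
};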
@@ -84,6 +108,9 @@ export class AudioRecognition {
   private maxEndpointingDelay: number;
   private lastLanguage?: string;
   private rootSpanContext?: Context;
+  private sttModel?: string;
+  private sttProvider?: string;
+  private getLinkedParticipant?: () => ParticipantLike | undefined;
 
   private deferredInputStream: DeferredReadableStream<AudioFrame>;
   private logger = log();
@@ -121,6 +148,9 @@ export class AudioRecognition {
     this.maxEndpointingDelay = opts.maxEndpointingDelay;
     this.lastLanguage = undefined;
     this.rootSpanContext = opts.rootSpanContext;
+    this.sttModel = opts.sttModel;
+    this.sttProvider = opts.sttProvider;
+    this.getLinkedParticipant = opts.getLinkedParticipant;
 
     this.deferredInputStream = new DeferredReadableStream<AudioFrame>();
     const [vadInputStream, sttInputStream] = this.deferredInputStream.stream.tee();
@@ -151,6 +181,37 @@ export class AudioRecognition {
     });
   }
 
+  private ensureUserTurnSpan(startTime?: number): Span {
+    if (this.userTurnSpan && this.userTurnSpan.isRecording()) {
+      return this.userTurnSpan;
+    }
+
+    this.userTurnSpan = tracer.startSpan({
+      name: 'user_turn',
+      context: this.rootSpanContext,
+      startTime,
+    });
+
+    const participant = this.getLinkedParticipant?.();
+    if (participant) {
+      setParticipantSpanAttributes(this.userTurnSpan, participant);
+    }
+
+    if (this.sttModel) {
+      this.userTurnSpan.setAttribute(traceTypes.ATTR_GEN_AI_REQUEST_MODEL, this.sttModel);
+    }
+    if (this.sttProvider) {
+      this.userTurnSpan.setAttribute(traceTypes.ATTR_GEN_AI_PROVIDER_NAME, this.sttProvider);
+    }
+
+    return this.userTurnSpan;
+  }
+
+  private userTurnContext(span: Span): Context {
+    const base = this.rootSpanContext ?? ROOT_CONTEXT;
+    return trace.setSpan(base, span);
+  }
+
   private async onSTTEvent(ev: SpeechEvent) {
     if (
       this.turnDetectionMode === 'manual' &&
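
userTurnContext is standard OpenTelemetry context plumbing: trace.setSpan returns a new immutable Context that carries the span, and context.with later activates it around a callback, so spans started inside the callback become children of user_turn. A self-contained sketch of the same pattern using the raw OpenTelemetry API (the package's internal tracer wraps startSpan with an options-object signature; the tracer name below is illustrative):

import { ROOT_CONTEXT, context, trace } from '@opentelemetry/api';

const tracer = trace.getTracer('example');
const userTurn = tracer.startSpan('user_turn');

// Derive a context that carries `userTurn`, then run work under it.
const ctx = trace.setSpan(ROOT_CONTEXT, userTurn);
context.with(ctx, () => {
  // Started under ctx, so it is parented to user_turn.
  tracer.startSpan('eou_detection').end();
});
userTurn.end();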
@@ -299,19 +360,25 @@ export class AudioRecognition {
         break;
       case SpeechEventType.START_OF_SPEECH:
         if (this.turnDetectionMode !== 'stt') break;
-        this.hooks.onStartOfSpeech({
-          type: VADEventType.START_OF_SPEECH,
-          samplesIndex: 0,
-          timestamp: Date.now(),
-          speechDuration: 0,
-          silenceDuration: 0,
-          frames: [],
-          probability: 0,
-          inferenceDuration: 0,
-          speaking: true,
-          rawAccumulatedSilence: 0,
-          rawAccumulatedSpeech: 0,
-        });
+        {
+          const span = this.ensureUserTurnSpan(Date.now());
+          const ctx = this.userTurnContext(span);
+          otelContext.with(ctx, () => {
+            this.hooks.onStartOfSpeech({
+              type: VADEventType.START_OF_SPEECH,
+              samplesIndex: 0,
+              timestamp: Date.now(),
+              speechDuration: 0,
+              silenceDuration: 0,
+              frames: [],
+              probability: 0,
+              inferenceDuration: 0,
+              speaking: true,
+              rawAccumulatedSilence: 0,
+              rawAccumulatedSpeech: 0,
+            });
+          });
+        }
         this.speaking = true;
         this.lastSpeakingTime = Date.now();
 
@@ -319,19 +386,25 @@ export class AudioRecognition {
         break;
       case SpeechEventType.END_OF_SPEECH:
         if (this.turnDetectionMode !== 'stt') break;
-        this.hooks.onEndOfSpeech({
-          type: VADEventType.END_OF_SPEECH,
-          samplesIndex: 0,
-          timestamp: Date.now(),
-          speechDuration: 0,
-          silenceDuration: 0,
-          frames: [],
-          probability: 0,
-          inferenceDuration: 0,
-          speaking: false,
-          rawAccumulatedSilence: 0,
-          rawAccumulatedSpeech: 0,
-        });
+        {
+          const span = this.ensureUserTurnSpan();
+          const ctx = this.userTurnContext(span);
+          otelContext.with(ctx, () => {
+            this.hooks.onEndOfSpeech({
+              type: VADEventType.END_OF_SPEECH,
+              samplesIndex: 0,
+              timestamp: Date.now(),
+              speechDuration: 0,
+              silenceDuration: 0,
+              frames: [],
+              probability: 0,
+              inferenceDuration: 0,
+              speaking: false,
+              rawAccumulatedSilence: 0,
+              rawAccumulatedSpeech: 0,
+            });
+          });
+        }
         this.speaking = false;
         this.userTurnCommitted = true;
         this.lastSpeakingTime = Date.now();
@@ -376,6 +449,9 @@ export class AudioRecognition {
       async (controller: AbortController) => {
         let endpointingDelay = this.minEndpointingDelay;
 
+        const userTurnSpan = this.ensureUserTurnSpan();
+        const userTurnCtx = this.userTurnContext(userTurnSpan);
+
         if (turnDetector) {
           await tracer.startActiveSpan(
             async (span) => {
@@ -415,7 +491,7 @@ export class AudioRecognition {
            },
            {
              name: 'eou_detection',
-              context: this.rootSpanContext,
+              context: userTurnCtx,
            },
          );
        }
@@ -577,17 +653,13 @@ export class AudioRecognition {
      switch (ev.type) {
        case VADEventType.START_OF_SPEECH:
          this.logger.debug('VAD task: START_OF_SPEECH');
-          this.hooks.onStartOfSpeech(ev);
-          this.speaking = true;
-
-          if (!this.userTurnSpan) {
+          {
            const startTime = Date.now() - ev.speechDuration;
-            this.userTurnSpan = tracer.startSpan({
-              name: 'user_turn',
-              context: this.rootSpanContext,
-              startTime,
-            });
+            const span = this.ensureUserTurnSpan(startTime);
+            const ctx = this.userTurnContext(span);
+            otelContext.with(ctx, () => this.hooks.onStartOfSpeech(ev));
          }
+          this.speaking = true;
 
          // Capture sample rate from the first VAD event if not already set
          if (ev.frames.length > 0 && ev.frames[0]) {
@@ -609,7 +681,11 @@ export class AudioRecognition {
          break;
        case VADEventType.END_OF_SPEECH:
          this.logger.debug('VAD task: END_OF_SPEECH');
-          this.hooks.onEndOfSpeech(ev);
+          {
+            const span = this.ensureUserTurnSpan();
+            const ctx = this.userTurnContext(span);
+            otelContext.with(ctx, () => this.hooks.onEndOfSpeech(ev));
+          }
 
          // when VAD fires END_OF_SPEECH, it already waited for the silence_duration
          this.speaking = false;