@livekit/agents 1.0.45 → 1.0.47
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.cjs +14 -20
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.d.ts.map +1 -1
- package/dist/cli.js +14 -20
- package/dist/cli.js.map +1 -1
- package/dist/ipc/job_proc_lazy_main.cjs +14 -5
- package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
- package/dist/ipc/job_proc_lazy_main.js +14 -5
- package/dist/ipc/job_proc_lazy_main.js.map +1 -1
- package/dist/llm/chat_context.cjs +19 -0
- package/dist/llm/chat_context.cjs.map +1 -1
- package/dist/llm/chat_context.d.cts +4 -0
- package/dist/llm/chat_context.d.ts +4 -0
- package/dist/llm/chat_context.d.ts.map +1 -1
- package/dist/llm/chat_context.js +19 -0
- package/dist/llm/chat_context.js.map +1 -1
- package/dist/llm/provider_format/index.cjs +2 -0
- package/dist/llm/provider_format/index.cjs.map +1 -1
- package/dist/llm/provider_format/index.d.cts +1 -1
- package/dist/llm/provider_format/index.d.ts +1 -1
- package/dist/llm/provider_format/index.d.ts.map +1 -1
- package/dist/llm/provider_format/index.js +6 -1
- package/dist/llm/provider_format/index.js.map +1 -1
- package/dist/llm/provider_format/openai.cjs +82 -2
- package/dist/llm/provider_format/openai.cjs.map +1 -1
- package/dist/llm/provider_format/openai.d.cts +1 -0
- package/dist/llm/provider_format/openai.d.ts +1 -0
- package/dist/llm/provider_format/openai.d.ts.map +1 -1
- package/dist/llm/provider_format/openai.js +80 -1
- package/dist/llm/provider_format/openai.js.map +1 -1
- package/dist/llm/provider_format/openai.test.cjs +326 -0
- package/dist/llm/provider_format/openai.test.cjs.map +1 -1
- package/dist/llm/provider_format/openai.test.js +327 -1
- package/dist/llm/provider_format/openai.test.js.map +1 -1
- package/dist/llm/provider_format/utils.cjs +4 -3
- package/dist/llm/provider_format/utils.cjs.map +1 -1
- package/dist/llm/provider_format/utils.d.ts.map +1 -1
- package/dist/llm/provider_format/utils.js +4 -3
- package/dist/llm/provider_format/utils.js.map +1 -1
- package/dist/llm/realtime.cjs.map +1 -1
- package/dist/llm/realtime.d.cts +1 -0
- package/dist/llm/realtime.d.ts +1 -0
- package/dist/llm/realtime.d.ts.map +1 -1
- package/dist/llm/realtime.js.map +1 -1
- package/dist/log.cjs +5 -2
- package/dist/log.cjs.map +1 -1
- package/dist/log.d.ts.map +1 -1
- package/dist/log.js +5 -2
- package/dist/log.js.map +1 -1
- package/dist/stream/deferred_stream.cjs +15 -6
- package/dist/stream/deferred_stream.cjs.map +1 -1
- package/dist/stream/deferred_stream.d.ts.map +1 -1
- package/dist/stream/deferred_stream.js +15 -6
- package/dist/stream/deferred_stream.js.map +1 -1
- package/dist/stream/index.cjs +3 -0
- package/dist/stream/index.cjs.map +1 -1
- package/dist/stream/index.d.cts +1 -0
- package/dist/stream/index.d.ts +1 -0
- package/dist/stream/index.d.ts.map +1 -1
- package/dist/stream/index.js +2 -0
- package/dist/stream/index.js.map +1 -1
- package/dist/stream/multi_input_stream.cjs +139 -0
- package/dist/stream/multi_input_stream.cjs.map +1 -0
- package/dist/stream/multi_input_stream.d.cts +55 -0
- package/dist/stream/multi_input_stream.d.ts +55 -0
- package/dist/stream/multi_input_stream.d.ts.map +1 -0
- package/dist/stream/multi_input_stream.js +115 -0
- package/dist/stream/multi_input_stream.js.map +1 -0
- package/dist/stream/multi_input_stream.test.cjs +340 -0
- package/dist/stream/multi_input_stream.test.cjs.map +1 -0
- package/dist/stream/multi_input_stream.test.js +339 -0
- package/dist/stream/multi_input_stream.test.js.map +1 -0
- package/dist/telemetry/trace_types.cjs +42 -0
- package/dist/telemetry/trace_types.cjs.map +1 -1
- package/dist/telemetry/trace_types.d.cts +14 -0
- package/dist/telemetry/trace_types.d.ts +14 -0
- package/dist/telemetry/trace_types.d.ts.map +1 -1
- package/dist/telemetry/trace_types.js +28 -0
- package/dist/telemetry/trace_types.js.map +1 -1
- package/dist/utils.cjs +44 -2
- package/dist/utils.cjs.map +1 -1
- package/dist/utils.d.cts +8 -0
- package/dist/utils.d.ts +8 -0
- package/dist/utils.d.ts.map +1 -1
- package/dist/utils.js +44 -2
- package/dist/utils.js.map +1 -1
- package/dist/utils.test.cjs +71 -0
- package/dist/utils.test.cjs.map +1 -1
- package/dist/utils.test.js +71 -0
- package/dist/utils.test.js.map +1 -1
- package/dist/version.cjs +1 -1
- package/dist/version.cjs.map +1 -1
- package/dist/version.d.cts +1 -1
- package/dist/version.d.ts +1 -1
- package/dist/version.d.ts.map +1 -1
- package/dist/version.js +1 -1
- package/dist/version.js.map +1 -1
- package/dist/voice/agent.cjs +144 -12
- package/dist/voice/agent.cjs.map +1 -1
- package/dist/voice/agent.d.cts +29 -4
- package/dist/voice/agent.d.ts +29 -4
- package/dist/voice/agent.d.ts.map +1 -1
- package/dist/voice/agent.js +140 -11
- package/dist/voice/agent.js.map +1 -1
- package/dist/voice/agent.test.cjs +120 -0
- package/dist/voice/agent.test.cjs.map +1 -1
- package/dist/voice/agent.test.js +122 -2
- package/dist/voice/agent.test.js.map +1 -1
- package/dist/voice/agent_activity.cjs +402 -292
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.cts +35 -7
- package/dist/voice/agent_activity.d.ts +35 -7
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +402 -287
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_session.cjs +156 -44
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +22 -9
- package/dist/voice/agent_session.d.ts +22 -9
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +156 -44
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/audio_recognition.cjs +89 -36
- package/dist/voice/audio_recognition.cjs.map +1 -1
- package/dist/voice/audio_recognition.d.cts +22 -1
- package/dist/voice/audio_recognition.d.ts +22 -1
- package/dist/voice/audio_recognition.d.ts.map +1 -1
- package/dist/voice/audio_recognition.js +93 -36
- package/dist/voice/audio_recognition.js.map +1 -1
- package/dist/voice/audio_recognition_span.test.cjs +233 -0
- package/dist/voice/audio_recognition_span.test.cjs.map +1 -0
- package/dist/voice/audio_recognition_span.test.js +232 -0
- package/dist/voice/audio_recognition_span.test.js.map +1 -0
- package/dist/voice/generation.cjs +39 -19
- package/dist/voice/generation.cjs.map +1 -1
- package/dist/voice/generation.d.ts.map +1 -1
- package/dist/voice/generation.js +44 -20
- package/dist/voice/generation.js.map +1 -1
- package/dist/voice/index.cjs +2 -0
- package/dist/voice/index.cjs.map +1 -1
- package/dist/voice/index.d.cts +1 -1
- package/dist/voice/index.d.ts +1 -1
- package/dist/voice/index.d.ts.map +1 -1
- package/dist/voice/index.js +2 -1
- package/dist/voice/index.js.map +1 -1
- package/dist/voice/io.cjs +6 -3
- package/dist/voice/io.cjs.map +1 -1
- package/dist/voice/io.d.cts +3 -2
- package/dist/voice/io.d.ts +3 -2
- package/dist/voice/io.d.ts.map +1 -1
- package/dist/voice/io.js +6 -3
- package/dist/voice/io.js.map +1 -1
- package/dist/voice/recorder_io/recorder_io.cjs +3 -1
- package/dist/voice/recorder_io/recorder_io.cjs.map +1 -1
- package/dist/voice/recorder_io/recorder_io.d.ts.map +1 -1
- package/dist/voice/recorder_io/recorder_io.js +3 -1
- package/dist/voice/recorder_io/recorder_io.js.map +1 -1
- package/dist/voice/room_io/_input.cjs +17 -17
- package/dist/voice/room_io/_input.cjs.map +1 -1
- package/dist/voice/room_io/_input.d.cts +2 -2
- package/dist/voice/room_io/_input.d.ts +2 -2
- package/dist/voice/room_io/_input.d.ts.map +1 -1
- package/dist/voice/room_io/_input.js +7 -6
- package/dist/voice/room_io/_input.js.map +1 -1
- package/dist/voice/room_io/room_io.cjs +9 -0
- package/dist/voice/room_io/room_io.cjs.map +1 -1
- package/dist/voice/room_io/room_io.d.cts +3 -1
- package/dist/voice/room_io/room_io.d.ts +3 -1
- package/dist/voice/room_io/room_io.d.ts.map +1 -1
- package/dist/voice/room_io/room_io.js +9 -0
- package/dist/voice/room_io/room_io.js.map +1 -1
- package/dist/voice/speech_handle.cjs +7 -1
- package/dist/voice/speech_handle.cjs.map +1 -1
- package/dist/voice/speech_handle.d.cts +2 -0
- package/dist/voice/speech_handle.d.ts +2 -0
- package/dist/voice/speech_handle.d.ts.map +1 -1
- package/dist/voice/speech_handle.js +8 -2
- package/dist/voice/speech_handle.js.map +1 -1
- package/dist/voice/testing/run_result.cjs +66 -15
- package/dist/voice/testing/run_result.cjs.map +1 -1
- package/dist/voice/testing/run_result.d.cts +14 -3
- package/dist/voice/testing/run_result.d.ts +14 -3
- package/dist/voice/testing/run_result.d.ts.map +1 -1
- package/dist/voice/testing/run_result.js +66 -15
- package/dist/voice/testing/run_result.js.map +1 -1
- package/dist/voice/utils.cjs +47 -0
- package/dist/voice/utils.cjs.map +1 -0
- package/dist/voice/utils.d.cts +4 -0
- package/dist/voice/utils.d.ts +4 -0
- package/dist/voice/utils.d.ts.map +1 -0
- package/dist/voice/utils.js +23 -0
- package/dist/voice/utils.js.map +1 -0
- package/package.json +1 -1
- package/src/cli.ts +20 -33
- package/src/ipc/job_proc_lazy_main.ts +16 -5
- package/src/llm/chat_context.ts +35 -0
- package/src/llm/provider_format/index.ts +7 -2
- package/src/llm/provider_format/openai.test.ts +385 -1
- package/src/llm/provider_format/openai.ts +103 -0
- package/src/llm/provider_format/utils.ts +6 -4
- package/src/llm/realtime.ts +1 -0
- package/src/log.ts +5 -2
- package/src/stream/deferred_stream.ts +17 -6
- package/src/stream/index.ts +1 -0
- package/src/stream/multi_input_stream.test.ts +540 -0
- package/src/stream/multi_input_stream.ts +172 -0
- package/src/telemetry/trace_types.ts +18 -0
- package/src/utils.test.ts +87 -0
- package/src/utils.ts +52 -2
- package/src/version.ts +1 -1
- package/src/voice/agent.test.ts +140 -2
- package/src/voice/agent.ts +189 -10
- package/src/voice/agent_activity.ts +449 -286
- package/src/voice/agent_session.ts +195 -51
- package/src/voice/audio_recognition.ts +118 -38
- package/src/voice/audio_recognition_span.test.ts +261 -0
- package/src/voice/generation.ts +52 -23
- package/src/voice/index.ts +1 -1
- package/src/voice/io.ts +7 -4
- package/src/voice/recorder_io/recorder_io.ts +2 -1
- package/src/voice/room_io/_input.ts +11 -7
- package/src/voice/room_io/room_io.ts +12 -0
- package/src/voice/speech_handle.ts +9 -2
- package/src/voice/testing/run_result.ts +81 -23
- package/src/voice/utils.ts +29 -0
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var import_rtc_node = require("@livekit/rtc-node");
|
|
3
|
+
var import_sdk_trace_base = require("@opentelemetry/sdk-trace-base");
|
|
4
|
+
var import_sdk_trace_node = require("@opentelemetry/sdk-trace-node");
|
|
5
|
+
var import_vitest = require("vitest");
|
|
6
|
+
var import_log = require("../log.cjs");
|
|
7
|
+
var import_stt = require("../stt/stt.cjs");
|
|
8
|
+
var import_telemetry = require("../telemetry/index.cjs");
|
|
9
|
+
var import_vad = require("../vad.cjs");
|
|
10
|
+
var import_audio_recognition = require("./audio_recognition.cjs");
|
|
11
|
+
function setupInMemoryTracing() {
|
|
12
|
+
const exporter = new import_sdk_trace_base.InMemorySpanExporter();
|
|
13
|
+
const provider = new import_sdk_trace_node.NodeTracerProvider();
|
|
14
|
+
provider.addSpanProcessor(new import_sdk_trace_base.SimpleSpanProcessor(exporter));
|
|
15
|
+
provider.register();
|
|
16
|
+
(0, import_telemetry.setTracerProvider)(provider);
|
|
17
|
+
return { exporter };
|
|
18
|
+
}
|
|
19
|
+
function spanByName(spans, name) {
|
|
20
|
+
return spans.find((s) => s.name === name);
|
|
21
|
+
}
|
|
22
|
+
class FakeVADStream extends Object {
|
|
23
|
+
// We intentionally avoid extending the real VADStream (it is not exported as a value in JS output
|
|
24
|
+
// in some bundling contexts). Instead we emulate the async iterator shape used by AudioRecognition.
|
|
25
|
+
events;
|
|
26
|
+
idx = 0;
|
|
27
|
+
constructor(events) {
|
|
28
|
+
super();
|
|
29
|
+
this.events = events;
|
|
30
|
+
}
|
|
31
|
+
updateInputStream() {
|
|
32
|
+
}
|
|
33
|
+
detachInputStream() {
|
|
34
|
+
}
|
|
35
|
+
close() {
|
|
36
|
+
}
|
|
37
|
+
[Symbol.asyncIterator]() {
|
|
38
|
+
return this;
|
|
39
|
+
}
|
|
40
|
+
async next() {
|
|
41
|
+
if (this.idx >= this.events.length) {
|
|
42
|
+
return { done: true, value: void 0 };
|
|
43
|
+
}
|
|
44
|
+
const value = this.events[this.idx++];
|
|
45
|
+
return { done: false, value };
|
|
46
|
+
}
|
|
47
|
+
}
|
|
48
|
+
class FakeVAD extends import_vad.VAD {
|
|
49
|
+
label = "fake-vad";
|
|
50
|
+
events;
|
|
51
|
+
constructor(events) {
|
|
52
|
+
super({ updateInterval: 1 });
|
|
53
|
+
this.events = events;
|
|
54
|
+
}
|
|
55
|
+
stream() {
|
|
56
|
+
return new FakeVADStream(this.events);
|
|
57
|
+
}
|
|
58
|
+
}
|
|
59
|
+
const alwaysTrueTurnDetector = {
|
|
60
|
+
supportsLanguage: async () => true,
|
|
61
|
+
unlikelyThreshold: async () => void 0,
|
|
62
|
+
predictEndOfTurn: async () => 1
|
|
63
|
+
};
|
|
64
|
+
(0, import_vitest.describe)("AudioRecognition user_turn span parity", () => {
|
|
65
|
+
(0, import_log.initializeLogger)({ pretty: false, level: "silent" });
|
|
66
|
+
(0, import_vitest.it)("creates user_turn and parents eou_detection under it (stt mode)", async () => {
|
|
67
|
+
const { exporter } = setupInMemoryTracing();
|
|
68
|
+
const hooks = {
|
|
69
|
+
onStartOfSpeech: import_vitest.vi.fn(),
|
|
70
|
+
onVADInferenceDone: import_vitest.vi.fn(),
|
|
71
|
+
onEndOfSpeech: import_vitest.vi.fn(),
|
|
72
|
+
onInterimTranscript: import_vitest.vi.fn(),
|
|
73
|
+
onFinalTranscript: import_vitest.vi.fn(),
|
|
74
|
+
onPreemptiveGeneration: import_vitest.vi.fn(),
|
|
75
|
+
retrieveChatCtx: () => ({
|
|
76
|
+
copy() {
|
|
77
|
+
return this;
|
|
78
|
+
},
|
|
79
|
+
addMessage() {
|
|
80
|
+
},
|
|
81
|
+
toJSON() {
|
|
82
|
+
return { items: [] };
|
|
83
|
+
}
|
|
84
|
+
}),
|
|
85
|
+
onEndOfTurn: import_vitest.vi.fn(async () => true)
|
|
86
|
+
};
|
|
87
|
+
const sttEvents = [
|
|
88
|
+
{ type: import_stt.SpeechEventType.START_OF_SPEECH },
|
|
89
|
+
{
|
|
90
|
+
type: import_stt.SpeechEventType.FINAL_TRANSCRIPT,
|
|
91
|
+
alternatives: [
|
|
92
|
+
{
|
|
93
|
+
language: "en",
|
|
94
|
+
text: "hello",
|
|
95
|
+
startTime: 0,
|
|
96
|
+
endTime: 0,
|
|
97
|
+
confidence: 0.9
|
|
98
|
+
}
|
|
99
|
+
]
|
|
100
|
+
},
|
|
101
|
+
{ type: import_stt.SpeechEventType.END_OF_SPEECH }
|
|
102
|
+
];
|
|
103
|
+
const sttNode = async () => new ReadableStream({
|
|
104
|
+
start(controller) {
|
|
105
|
+
for (const ev of sttEvents) controller.enqueue(ev);
|
|
106
|
+
controller.close();
|
|
107
|
+
}
|
|
108
|
+
});
|
|
109
|
+
const ar = new import_audio_recognition.AudioRecognition({
|
|
110
|
+
recognitionHooks: hooks,
|
|
111
|
+
stt: sttNode,
|
|
112
|
+
vad: void 0,
|
|
113
|
+
turnDetector: alwaysTrueTurnDetector,
|
|
114
|
+
turnDetectionMode: "stt",
|
|
115
|
+
minEndpointingDelay: 0,
|
|
116
|
+
maxEndpointingDelay: 0,
|
|
117
|
+
sttModel: "deepgram-nova2",
|
|
118
|
+
sttProvider: "deepgram",
|
|
119
|
+
getLinkedParticipant: () => ({ sid: "p1", identity: "bob", kind: import_rtc_node.ParticipantKind.AGENT })
|
|
120
|
+
});
|
|
121
|
+
await ar.start();
|
|
122
|
+
await new Promise((r) => setTimeout(r, 20));
|
|
123
|
+
await ar.close();
|
|
124
|
+
const spans = exporter.getFinishedSpans();
|
|
125
|
+
const userTurn = spanByName(spans, "user_turn");
|
|
126
|
+
const eou = spanByName(spans, "eou_detection");
|
|
127
|
+
(0, import_vitest.expect)(userTurn, "user_turn span missing").toBeTruthy();
|
|
128
|
+
(0, import_vitest.expect)(eou, "eou_detection span missing").toBeTruthy();
|
|
129
|
+
(0, import_vitest.expect)(eou.parentSpanId).toBe(userTurn.spanContext().spanId);
|
|
130
|
+
(0, import_vitest.expect)(userTurn.attributes["lk.participant_id"]).toBe("p1");
|
|
131
|
+
(0, import_vitest.expect)(userTurn.attributes["lk.participant_identity"]).toBe("bob");
|
|
132
|
+
(0, import_vitest.expect)(userTurn.attributes["lk.participant_kind"]).toBe("AGENT");
|
|
133
|
+
(0, import_vitest.expect)(userTurn.attributes["gen_ai.request.model"]).toBe("deepgram-nova2");
|
|
134
|
+
(0, import_vitest.expect)(userTurn.attributes["gen_ai.provider.name"]).toBe("deepgram");
|
|
135
|
+
(0, import_vitest.expect)(userTurn.attributes["lk.user_transcript"]).toContain("hello");
|
|
136
|
+
(0, import_vitest.expect)(userTurn.attributes["lk.transcript_confidence"]).toBeGreaterThan(0);
|
|
137
|
+
});
|
|
138
|
+
(0, import_vitest.it)("creates user_turn from VAD startTime (vad mode) and keeps same parenting", async () => {
|
|
139
|
+
const { exporter } = setupInMemoryTracing();
|
|
140
|
+
const hooks = {
|
|
141
|
+
onStartOfSpeech: import_vitest.vi.fn(),
|
|
142
|
+
onVADInferenceDone: import_vitest.vi.fn(),
|
|
143
|
+
onEndOfSpeech: import_vitest.vi.fn(),
|
|
144
|
+
onInterimTranscript: import_vitest.vi.fn(),
|
|
145
|
+
onFinalTranscript: import_vitest.vi.fn(),
|
|
146
|
+
onPreemptiveGeneration: import_vitest.vi.fn(),
|
|
147
|
+
retrieveChatCtx: () => ({
|
|
148
|
+
copy() {
|
|
149
|
+
return this;
|
|
150
|
+
},
|
|
151
|
+
addMessage() {
|
|
152
|
+
},
|
|
153
|
+
toJSON() {
|
|
154
|
+
return { items: [] };
|
|
155
|
+
}
|
|
156
|
+
}),
|
|
157
|
+
onEndOfTurn: import_vitest.vi.fn(async () => true)
|
|
158
|
+
};
|
|
159
|
+
const now = Date.now();
|
|
160
|
+
const vadEvents = [
|
|
161
|
+
{
|
|
162
|
+
type: import_vad.VADEventType.START_OF_SPEECH,
|
|
163
|
+
samplesIndex: 0,
|
|
164
|
+
timestamp: now,
|
|
165
|
+
speechDuration: 100,
|
|
166
|
+
silenceDuration: 0,
|
|
167
|
+
frames: [],
|
|
168
|
+
probability: 0,
|
|
169
|
+
inferenceDuration: 0,
|
|
170
|
+
speaking: true,
|
|
171
|
+
rawAccumulatedSilence: 0,
|
|
172
|
+
rawAccumulatedSpeech: 0
|
|
173
|
+
},
|
|
174
|
+
{
|
|
175
|
+
type: import_vad.VADEventType.END_OF_SPEECH,
|
|
176
|
+
samplesIndex: 0,
|
|
177
|
+
timestamp: now + 200,
|
|
178
|
+
speechDuration: 100,
|
|
179
|
+
silenceDuration: 100,
|
|
180
|
+
frames: [],
|
|
181
|
+
probability: 0,
|
|
182
|
+
inferenceDuration: 0,
|
|
183
|
+
speaking: false,
|
|
184
|
+
rawAccumulatedSilence: 0,
|
|
185
|
+
rawAccumulatedSpeech: 0
|
|
186
|
+
}
|
|
187
|
+
];
|
|
188
|
+
const sttEvents = [
|
|
189
|
+
{
|
|
190
|
+
type: import_stt.SpeechEventType.FINAL_TRANSCRIPT,
|
|
191
|
+
alternatives: [
|
|
192
|
+
{
|
|
193
|
+
language: "en",
|
|
194
|
+
text: "test",
|
|
195
|
+
startTime: 0,
|
|
196
|
+
endTime: 0,
|
|
197
|
+
confidence: 0.8
|
|
198
|
+
}
|
|
199
|
+
]
|
|
200
|
+
}
|
|
201
|
+
];
|
|
202
|
+
const sttNode = async () => new ReadableStream({
|
|
203
|
+
start(controller) {
|
|
204
|
+
for (const ev of sttEvents) controller.enqueue(ev);
|
|
205
|
+
controller.close();
|
|
206
|
+
}
|
|
207
|
+
});
|
|
208
|
+
const ar = new import_audio_recognition.AudioRecognition({
|
|
209
|
+
recognitionHooks: hooks,
|
|
210
|
+
stt: sttNode,
|
|
211
|
+
vad: new FakeVAD(vadEvents),
|
|
212
|
+
turnDetector: alwaysTrueTurnDetector,
|
|
213
|
+
turnDetectionMode: "vad",
|
|
214
|
+
minEndpointingDelay: 0,
|
|
215
|
+
maxEndpointingDelay: 0,
|
|
216
|
+
sttModel: "stt-model",
|
|
217
|
+
sttProvider: "stt-provider",
|
|
218
|
+
getLinkedParticipant: () => ({ sid: "p2", identity: "alice", kind: import_rtc_node.ParticipantKind.AGENT })
|
|
219
|
+
});
|
|
220
|
+
await ar.start();
|
|
221
|
+
await new Promise((r) => setTimeout(r, 20));
|
|
222
|
+
await ar.close();
|
|
223
|
+
const spans = exporter.getFinishedSpans();
|
|
224
|
+
const userTurn = spanByName(spans, "user_turn");
|
|
225
|
+
const eou = spanByName(spans, "eou_detection");
|
|
226
|
+
(0, import_vitest.expect)(userTurn).toBeTruthy();
|
|
227
|
+
(0, import_vitest.expect)(eou).toBeTruthy();
|
|
228
|
+
(0, import_vitest.expect)(eou.parentSpanId).toBe(userTurn.spanContext().spanId);
|
|
229
|
+
(0, import_vitest.expect)(hooks.onStartOfSpeech).toHaveBeenCalled();
|
|
230
|
+
(0, import_vitest.expect)(hooks.onEndOfSpeech).toHaveBeenCalled();
|
|
231
|
+
});
|
|
232
|
+
});
|
|
233
|
+
//# sourceMappingURL=audio_recognition_span.test.cjs.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../../src/voice/audio_recognition_span.test.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2026 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport { ParticipantKind } from '@livekit/rtc-node';\nimport { InMemorySpanExporter, SimpleSpanProcessor } from '@opentelemetry/sdk-trace-base';\nimport { NodeTracerProvider } from '@opentelemetry/sdk-trace-node';\nimport { describe, expect, it, vi } from 'vitest';\nimport { initializeLogger } from '../log.js';\nimport { type SpeechEvent, SpeechEventType } from '../stt/stt.js';\nimport { setTracerProvider } from '../telemetry/index.js';\nimport { VAD, type VADEvent, VADEventType, type VADStream } from '../vad.js';\nimport { AudioRecognition, type _TurnDetector } from './audio_recognition.js';\n\nfunction setupInMemoryTracing() {\n const exporter = new InMemorySpanExporter();\n const provider = new NodeTracerProvider();\n provider.addSpanProcessor(new SimpleSpanProcessor(exporter));\n provider.register();\n setTracerProvider(provider);\n return { exporter };\n}\n\nfunction spanByName(spans: any[], name: string) {\n return spans.find((s) => s.name === name);\n}\n\nclass FakeVADStream extends (Object as unknown as { new (): VADStream }) {\n // We intentionally avoid extending the real VADStream (it is not exported as a value in JS output\n // in some bundling contexts). 
Instead we emulate the async iterator shape used by AudioRecognition.\n private events: VADEvent[];\n private idx = 0;\n constructor(events: VADEvent[]) {\n super();\n this.events = events;\n }\n updateInputStream() {}\n detachInputStream() {}\n close() {}\n [Symbol.asyncIterator]() {\n return this;\n }\n async next(): Promise<IteratorResult<VADEvent>> {\n if (this.idx >= this.events.length) {\n return { done: true, value: undefined };\n }\n const value = this.events[this.idx++]!;\n return { done: false, value };\n }\n}\n\nclass FakeVAD extends VAD {\n label = 'fake-vad';\n private events: VADEvent[];\n constructor(events: VADEvent[]) {\n super({ updateInterval: 1 });\n this.events = events;\n }\n stream(): any {\n return new FakeVADStream(this.events);\n }\n}\n\nconst alwaysTrueTurnDetector: _TurnDetector = {\n supportsLanguage: async () => true,\n unlikelyThreshold: async () => undefined,\n predictEndOfTurn: async () => 1.0,\n};\n\ndescribe('AudioRecognition user_turn span parity', () => {\n initializeLogger({ pretty: false, level: 'silent' });\n\n it('creates user_turn and parents eou_detection under it (stt mode)', async () => {\n const { exporter } = setupInMemoryTracing();\n\n const hooks = {\n onStartOfSpeech: vi.fn(),\n onVADInferenceDone: vi.fn(),\n onEndOfSpeech: vi.fn(),\n onInterimTranscript: vi.fn(),\n onFinalTranscript: vi.fn(),\n onPreemptiveGeneration: vi.fn(),\n retrieveChatCtx: () =>\n ({\n copy() {\n return this;\n },\n addMessage() {},\n toJSON() {\n return { items: [] };\n },\n }) as any,\n onEndOfTurn: vi.fn(async () => true),\n };\n\n const sttEvents: SpeechEvent[] = [\n { type: SpeechEventType.START_OF_SPEECH },\n {\n type: SpeechEventType.FINAL_TRANSCRIPT,\n alternatives: [\n {\n language: 'en',\n text: 'hello',\n startTime: 0,\n endTime: 0,\n confidence: 0.9,\n },\n ],\n },\n { type: SpeechEventType.END_OF_SPEECH },\n ];\n\n const sttNode = async () =>\n new ReadableStream<SpeechEvent>({\n start(controller) {\n for (const ev of sttEvents) 
controller.enqueue(ev);\n controller.close();\n },\n });\n\n const ar = new AudioRecognition({\n recognitionHooks: hooks as any,\n stt: sttNode as any,\n vad: undefined,\n turnDetector: alwaysTrueTurnDetector,\n turnDetectionMode: 'stt',\n minEndpointingDelay: 0,\n maxEndpointingDelay: 0,\n sttModel: 'deepgram-nova2',\n sttProvider: 'deepgram',\n getLinkedParticipant: () => ({ sid: 'p1', identity: 'bob', kind: ParticipantKind.AGENT }),\n });\n\n await ar.start();\n // allow background task to drain\n await new Promise((r) => setTimeout(r, 20));\n await ar.close();\n\n const spans = exporter.getFinishedSpans();\n const userTurn = spanByName(spans, 'user_turn');\n const eou = spanByName(spans, 'eou_detection');\n expect(userTurn, 'user_turn span missing').toBeTruthy();\n expect(eou, 'eou_detection span missing').toBeTruthy();\n\n expect(eou.parentSpanId).toBe(userTurn.spanContext().spanId);\n\n // creation-time attributes\n expect(userTurn.attributes['lk.participant_id']).toBe('p1');\n expect(userTurn.attributes['lk.participant_identity']).toBe('bob');\n expect(userTurn.attributes['lk.participant_kind']).toBe('AGENT');\n expect(userTurn.attributes['gen_ai.request.model']).toBe('deepgram-nova2');\n expect(userTurn.attributes['gen_ai.provider.name']).toBe('deepgram');\n\n // end-of-turn attributes\n expect(userTurn.attributes['lk.user_transcript']).toContain('hello');\n expect(userTurn.attributes['lk.transcript_confidence']).toBeGreaterThan(0);\n });\n\n it('creates user_turn from VAD startTime (vad mode) and keeps same parenting', async () => {\n const { exporter } = setupInMemoryTracing();\n\n const hooks = {\n onStartOfSpeech: vi.fn(),\n onVADInferenceDone: vi.fn(),\n onEndOfSpeech: vi.fn(),\n onInterimTranscript: vi.fn(),\n onFinalTranscript: vi.fn(),\n onPreemptiveGeneration: vi.fn(),\n retrieveChatCtx: () =>\n ({\n copy() {\n return this;\n },\n addMessage() {},\n toJSON() {\n return { items: [] };\n },\n }) as any,\n onEndOfTurn: vi.fn(async () => true),\n 
};\n\n const now = Date.now();\n const vadEvents: VADEvent[] = [\n {\n type: VADEventType.START_OF_SPEECH,\n samplesIndex: 0,\n timestamp: now,\n speechDuration: 100,\n silenceDuration: 0,\n frames: [],\n probability: 0,\n inferenceDuration: 0,\n speaking: true,\n rawAccumulatedSilence: 0,\n rawAccumulatedSpeech: 0,\n },\n {\n type: VADEventType.END_OF_SPEECH,\n samplesIndex: 0,\n timestamp: now + 200,\n speechDuration: 100,\n silenceDuration: 100,\n frames: [],\n probability: 0,\n inferenceDuration: 0,\n speaking: false,\n rawAccumulatedSilence: 0,\n rawAccumulatedSpeech: 0,\n },\n ];\n\n const sttEvents: SpeechEvent[] = [\n {\n type: SpeechEventType.FINAL_TRANSCRIPT,\n alternatives: [\n {\n language: 'en',\n text: 'test',\n startTime: 0,\n endTime: 0,\n confidence: 0.8,\n },\n ],\n },\n ];\n\n const sttNode = async () =>\n new ReadableStream<SpeechEvent>({\n start(controller) {\n for (const ev of sttEvents) controller.enqueue(ev);\n controller.close();\n },\n });\n\n const ar = new AudioRecognition({\n recognitionHooks: hooks as any,\n stt: sttNode as any,\n vad: new FakeVAD(vadEvents) as any,\n turnDetector: alwaysTrueTurnDetector,\n turnDetectionMode: 'vad',\n minEndpointingDelay: 0,\n maxEndpointingDelay: 0,\n sttModel: 'stt-model',\n sttProvider: 'stt-provider',\n getLinkedParticipant: () => ({ sid: 'p2', identity: 'alice', kind: ParticipantKind.AGENT }),\n });\n\n await ar.start();\n await new Promise((r) => setTimeout(r, 20));\n await ar.close();\n\n const spans = exporter.getFinishedSpans();\n const userTurn = spanByName(spans, 'user_turn');\n const eou = spanByName(spans, 'eou_detection');\n expect(userTurn).toBeTruthy();\n expect(eou).toBeTruthy();\n expect(eou.parentSpanId).toBe(userTurn.spanContext().spanId);\n\n expect(hooks.onStartOfSpeech).toHaveBeenCalled();\n expect(hooks.onEndOfSpeech).toHaveBeenCalled();\n 
});\n});\n"],"mappings":";AAGA,sBAAgC;AAChC,4BAA0D;AAC1D,4BAAmC;AACnC,oBAAyC;AACzC,iBAAiC;AACjC,iBAAkD;AAClD,uBAAkC;AAClC,iBAAiE;AACjE,+BAAqD;AAErD,SAAS,uBAAuB;AAC9B,QAAM,WAAW,IAAI,2CAAqB;AAC1C,QAAM,WAAW,IAAI,yCAAmB;AACxC,WAAS,iBAAiB,IAAI,0CAAoB,QAAQ,CAAC;AAC3D,WAAS,SAAS;AAClB,0CAAkB,QAAQ;AAC1B,SAAO,EAAE,SAAS;AACpB;AAEA,SAAS,WAAW,OAAc,MAAc;AAC9C,SAAO,MAAM,KAAK,CAAC,MAAM,EAAE,SAAS,IAAI;AAC1C;AAEA,MAAM,sBAAuB,OAA4C;AAAA;AAAA;AAAA,EAG/D;AAAA,EACA,MAAM;AAAA,EACd,YAAY,QAAoB;AAC9B,UAAM;AACN,SAAK,SAAS;AAAA,EAChB;AAAA,EACA,oBAAoB;AAAA,EAAC;AAAA,EACrB,oBAAoB;AAAA,EAAC;AAAA,EACrB,QAAQ;AAAA,EAAC;AAAA,EACT,CAAC,OAAO,aAAa,IAAI;AACvB,WAAO;AAAA,EACT;AAAA,EACA,MAAM,OAA0C;AAC9C,QAAI,KAAK,OAAO,KAAK,OAAO,QAAQ;AAClC,aAAO,EAAE,MAAM,MAAM,OAAO,OAAU;AAAA,IACxC;AACA,UAAM,QAAQ,KAAK,OAAO,KAAK,KAAK;AACpC,WAAO,EAAE,MAAM,OAAO,MAAM;AAAA,EAC9B;AACF;AAEA,MAAM,gBAAgB,eAAI;AAAA,EACxB,QAAQ;AAAA,EACA;AAAA,EACR,YAAY,QAAoB;AAC9B,UAAM,EAAE,gBAAgB,EAAE,CAAC;AAC3B,SAAK,SAAS;AAAA,EAChB;AAAA,EACA,SAAc;AACZ,WAAO,IAAI,cAAc,KAAK,MAAM;AAAA,EACtC;AACF;AAEA,MAAM,yBAAwC;AAAA,EAC5C,kBAAkB,YAAY;AAAA,EAC9B,mBAAmB,YAAY;AAAA,EAC/B,kBAAkB,YAAY;AAChC;AAAA,IAEA,wBAAS,0CAA0C,MAAM;AACvD,mCAAiB,EAAE,QAAQ,OAAO,OAAO,SAAS,CAAC;AAEnD,wBAAG,mEAAmE,YAAY;AAChF,UAAM,EAAE,SAAS,IAAI,qBAAqB;AAE1C,UAAM,QAAQ;AAAA,MACZ,iBAAiB,iBAAG,GAAG;AAAA,MACvB,oBAAoB,iBAAG,GAAG;AAAA,MAC1B,eAAe,iBAAG,GAAG;AAAA,MACrB,qBAAqB,iBAAG,GAAG;AAAA,MAC3B,mBAAmB,iBAAG,GAAG;AAAA,MACzB,wBAAwB,iBAAG,GAAG;AAAA,MAC9B,iBAAiB,OACd;AAAA,QACC,OAAO;AACL,iBAAO;AAAA,QACT;AAAA,QACA,aAAa;AAAA,QAAC;AAAA,QACd,SAAS;AACP,iBAAO,EAAE,OAAO,CAAC,EAAE;AAAA,QACrB;AAAA,MACF;AAAA,MACF,aAAa,iBAAG,GAAG,YAAY,IAAI;AAAA,IACrC;AAEA,UAAM,YAA2B;AAAA,MAC/B,EAAE,MAAM,2BAAgB,gBAAgB;AAAA,MACxC;AAAA,QACE,MAAM,2BAAgB;AAAA,QACtB,cAAc;AAAA,UACZ;AAAA,YACE,UAAU;AAAA,YACV,MAAM;AAAA,YACN,WAAW;AAAA,YACX,SAAS;AAAA,YACT,YAAY;AAAA,UACd;AAAA,QACF;AAAA,MACF;AAAA,MACA,EAAE,MAAM,2BAAgB,cAAc;AAAA,IACxC;AAEA,UAAM,UAAU,YACd,IAAI,eAA4B;AAAA,MAC9B,MAAM,YAAY;AAChB,mBAAW,MAAM,UAAW,YAAW,QAAQ,EAAE;AACjD,mBAAW,MAAM;AAAA,MACnB;A
AAA,IACF,CAAC;AAEH,UAAM,KAAK,IAAI,0CAAiB;AAAA,MAC9B,kBAAkB;AAAA,MAClB,KAAK;AAAA,MACL,KAAK;AAAA,MACL,cAAc;AAAA,MACd,mBAAmB;AAAA,MACnB,qBAAqB;AAAA,MACrB,qBAAqB;AAAA,MACrB,UAAU;AAAA,MACV,aAAa;AAAA,MACb,sBAAsB,OAAO,EAAE,KAAK,MAAM,UAAU,OAAO,MAAM,gCAAgB,MAAM;AAAA,IACzF,CAAC;AAED,UAAM,GAAG,MAAM;AAEf,UAAM,IAAI,QAAQ,CAAC,MAAM,WAAW,GAAG,EAAE,CAAC;AAC1C,UAAM,GAAG,MAAM;AAEf,UAAM,QAAQ,SAAS,iBAAiB;AACxC,UAAM,WAAW,WAAW,OAAO,WAAW;AAC9C,UAAM,MAAM,WAAW,OAAO,eAAe;AAC7C,8BAAO,UAAU,wBAAwB,EAAE,WAAW;AACtD,8BAAO,KAAK,4BAA4B,EAAE,WAAW;AAErD,8BAAO,IAAI,YAAY,EAAE,KAAK,SAAS,YAAY,EAAE,MAAM;AAG3D,8BAAO,SAAS,WAAW,mBAAmB,CAAC,EAAE,KAAK,IAAI;AAC1D,8BAAO,SAAS,WAAW,yBAAyB,CAAC,EAAE,KAAK,KAAK;AACjE,8BAAO,SAAS,WAAW,qBAAqB,CAAC,EAAE,KAAK,OAAO;AAC/D,8BAAO,SAAS,WAAW,sBAAsB,CAAC,EAAE,KAAK,gBAAgB;AACzE,8BAAO,SAAS,WAAW,sBAAsB,CAAC,EAAE,KAAK,UAAU;AAGnE,8BAAO,SAAS,WAAW,oBAAoB,CAAC,EAAE,UAAU,OAAO;AACnE,8BAAO,SAAS,WAAW,0BAA0B,CAAC,EAAE,gBAAgB,CAAC;AAAA,EAC3E,CAAC;AAED,wBAAG,4EAA4E,YAAY;AACzF,UAAM,EAAE,SAAS,IAAI,qBAAqB;AAE1C,UAAM,QAAQ;AAAA,MACZ,iBAAiB,iBAAG,GAAG;AAAA,MACvB,oBAAoB,iBAAG,GAAG;AAAA,MAC1B,eAAe,iBAAG,GAAG;AAAA,MACrB,qBAAqB,iBAAG,GAAG;AAAA,MAC3B,mBAAmB,iBAAG,GAAG;AAAA,MACzB,wBAAwB,iBAAG,GAAG;AAAA,MAC9B,iBAAiB,OACd;AAAA,QACC,OAAO;AACL,iBAAO;AAAA,QACT;AAAA,QACA,aAAa;AAAA,QAAC;AAAA,QACd,SAAS;AACP,iBAAO,EAAE,OAAO,CAAC,EAAE;AAAA,QACrB;AAAA,MACF;AAAA,MACF,aAAa,iBAAG,GAAG,YAAY,IAAI;AAAA,IACrC;AAEA,UAAM,MAAM,KAAK,IAAI;AACrB,UAAM,YAAwB;AAAA,MAC5B;AAAA,QACE,MAAM,wBAAa;AAAA,QACnB,cAAc;AAAA,QACd,WAAW;AAAA,QACX,gBAAgB;AAAA,QAChB,iBAAiB;AAAA,QACjB,QAAQ,CAAC;AAAA,QACT,aAAa;AAAA,QACb,mBAAmB;AAAA,QACnB,UAAU;AAAA,QACV,uBAAuB;AAAA,QACvB,sBAAsB;AAAA,MACxB;AAAA,MACA;AAAA,QACE,MAAM,wBAAa;AAAA,QACnB,cAAc;AAAA,QACd,WAAW,MAAM;AAAA,QACjB,gBAAgB;AAAA,QAChB,iBAAiB;AAAA,QACjB,QAAQ,CAAC;AAAA,QACT,aAAa;AAAA,QACb,mBAAmB;AAAA,QACnB,UAAU;AAAA,QACV,uBAAuB;AAAA,QACvB,sBAAsB;AAAA,MACxB;AAAA,IACF;AAEA,UAAM,YAA2B;AAAA,MAC/B;AAAA,QACE,MAAM,2BAAgB;AAAA,QACtB,cAAc;AAAA,UACZ;AAAA,YACE,UAAU;AAAA,YACV,MAAM;AAAA,YACN,WAAW;AAAA,YACX,SAAS;AAAA,YACT
,YAAY;AAAA,UACd;AAAA,QACF;AAAA,MACF;AAAA,IACF;AAEA,UAAM,UAAU,YACd,IAAI,eAA4B;AAAA,MAC9B,MAAM,YAAY;AAChB,mBAAW,MAAM,UAAW,YAAW,QAAQ,EAAE;AACjD,mBAAW,MAAM;AAAA,MACnB;AAAA,IACF,CAAC;AAEH,UAAM,KAAK,IAAI,0CAAiB;AAAA,MAC9B,kBAAkB;AAAA,MAClB,KAAK;AAAA,MACL,KAAK,IAAI,QAAQ,SAAS;AAAA,MAC1B,cAAc;AAAA,MACd,mBAAmB;AAAA,MACnB,qBAAqB;AAAA,MACrB,qBAAqB;AAAA,MACrB,UAAU;AAAA,MACV,aAAa;AAAA,MACb,sBAAsB,OAAO,EAAE,KAAK,MAAM,UAAU,SAAS,MAAM,gCAAgB,MAAM;AAAA,IAC3F,CAAC;AAED,UAAM,GAAG,MAAM;AACf,UAAM,IAAI,QAAQ,CAAC,MAAM,WAAW,GAAG,EAAE,CAAC;AAC1C,UAAM,GAAG,MAAM;AAEf,UAAM,QAAQ,SAAS,iBAAiB;AACxC,UAAM,WAAW,WAAW,OAAO,WAAW;AAC9C,UAAM,MAAM,WAAW,OAAO,eAAe;AAC7C,8BAAO,QAAQ,EAAE,WAAW;AAC5B,8BAAO,GAAG,EAAE,WAAW;AACvB,8BAAO,IAAI,YAAY,EAAE,KAAK,SAAS,YAAY,EAAE,MAAM;AAE3D,8BAAO,MAAM,eAAe,EAAE,iBAAiB;AAC/C,8BAAO,MAAM,aAAa,EAAE,iBAAiB;AAAA,EAC/C,CAAC;AACH,CAAC;","names":[]}
|
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
import { ParticipantKind } from "@livekit/rtc-node";
|
|
2
|
+
import { InMemorySpanExporter, SimpleSpanProcessor } from "@opentelemetry/sdk-trace-base";
|
|
3
|
+
import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node";
|
|
4
|
+
import { describe, expect, it, vi } from "vitest";
|
|
5
|
+
import { initializeLogger } from "../log.js";
|
|
6
|
+
import { SpeechEventType } from "../stt/stt.js";
|
|
7
|
+
import { setTracerProvider } from "../telemetry/index.js";
|
|
8
|
+
import { VAD, VADEventType } from "../vad.js";
|
|
9
|
+
import { AudioRecognition } from "./audio_recognition.js";
|
|
10
|
+
/**
 * Wires up tracing so that finished spans can be inspected by a test.
 *
 * Builds a fresh `NodeTracerProvider`, attaches an `InMemorySpanExporter` to it
 * through a `SimpleSpanProcessor`, calls `register()` on the provider and hands
 * it to `setTracerProvider`.
 *
 * @returns {{ exporter: InMemorySpanExporter }} the exporter that collects spans.
 */
function setupInMemoryTracing() {
  const memoryExporter = new InMemorySpanExporter();
  const tracerProvider = new NodeTracerProvider();
  const spanProcessor = new SimpleSpanProcessor(memoryExporter);

  tracerProvider.addSpanProcessor(spanProcessor);
  tracerProvider.register();
  setTracerProvider(tracerProvider);

  return { exporter: memoryExporter };
}
|
|
18
|
+
/**
 * Looks up the first span whose `name` property equals `name`.
 *
 * @param {Array<{ name: string }>} spans - spans to search, in order.
 * @param {string} name - span name to match (strict equality).
 * @returns the first matching span, or `undefined` when none matches.
 */
function spanByName(spans, name) {
  for (const candidate of spans) {
    if (candidate.name === name) {
      return candidate;
    }
  }
  return undefined;
}
|
|
21
|
+
/**
 * Scripted stand-in for a VAD stream.
 *
 * The real VADStream is deliberately not extended (it is not exported as a value
 * in the JS output in some bundling contexts). This class only emulates the async
 * iterator shape: it is its own async iterator and replays the events passed to
 * the constructor, one per `next()` call.
 */
class FakeVADStream extends Object {
  events;
  idx = 0;

  /**
   * @param {Array} events - events to replay, in order.
   */
  constructor(events) {
    super();
    this.events = events;
  }

  // No-op stubs: these methods exist but intentionally do nothing.
  updateInputStream() {}

  detachInputStream() {}

  close() {}

  [Symbol.asyncIterator]() {
    return this;
  }

  /**
   * Yields the next scripted event, or a `done` result once all were consumed.
   *
   * @returns {Promise<{ done: boolean, value: * }>}
   */
  async next() {
    const exhausted = this.idx >= this.events.length;
    if (exhausted) {
      return { done: true, value: undefined };
    }
    const current = this.events[this.idx];
    this.idx += 1;
    return { done: false, value: current };
  }
}
|
|
47
|
+
/**
 * VAD test double whose `stream()` hands out a `FakeVADStream` that replays a
 * fixed list of events.
 */
class FakeVAD extends VAD {
  label = "fake-vad";
  events;

  /**
   * @param {Array} scriptedEvents - events every created stream will replay.
   */
  constructor(scriptedEvents) {
    const capabilities = { updateInterval: 1 };
    super(capabilities);
    this.events = scriptedEvents;
  }

  /**
   * @returns {FakeVADStream} a new stream over the scripted events.
   */
  stream() {
    const { events } = this;
    return new FakeVADStream(events);
  }
}
|
|
58
|
+
/**
 * Turn-detector stub with fixed answers: every language is supported, there is
 * no unlikely-threshold (`undefined`), and the end-of-turn prediction is always 1.
 */
const alwaysTrueTurnDetector = {
  async supportsLanguage() {
    return true;
  },
  async unlikelyThreshold() {
    return undefined;
  },
  async predictEndOfTurn() {
    return 1;
  },
};
|
|
63
|
+
// Checks that AudioRecognition emits a `user_turn` span and that the
// `eou_detection` span is parented under it, in both "stt" and "vad" modes.
describe("AudioRecognition user_turn span parity", () => {
  initializeLogger({ pretty: false, level: "silent" });

  // Fresh recognition hooks: vitest mocks plus a minimal chat-context stand-in.
  const buildHooks = () => ({
    onStartOfSpeech: vi.fn(),
    onVADInferenceDone: vi.fn(),
    onEndOfSpeech: vi.fn(),
    onInterimTranscript: vi.fn(),
    onFinalTranscript: vi.fn(),
    onPreemptiveGeneration: vi.fn(),
    retrieveChatCtx: () => ({
      copy() {
        return this;
      },
      addMessage() {},
      toJSON() {
        return { items: [] };
      },
    }),
    onEndOfTurn: vi.fn(async () => true),
  });

  // FINAL_TRANSCRIPT speech event carrying a single English alternative.
  const finalTranscript = (text, confidence) => ({
    type: SpeechEventType.FINAL_TRANSCRIPT,
    alternatives: [
      {
        language: "en",
        text,
        startTime: 0,
        endTime: 0,
        confidence,
      },
    ],
  });

  // STT node factory: resolves to a stream that enqueues `events` and closes.
  const sttNodeFrom = (events) => async () =>
    new ReadableStream({
      start(controller) {
        events.forEach((ev) => {
          controller.enqueue(ev);
        });
        controller.close();
      },
    });

  // VAD event literal; only the fields that differ between events are parameters.
  const vadEvent = ({ type, timestamp, silenceDuration, speaking }) => ({
    type,
    samplesIndex: 0,
    timestamp,
    speechDuration: 100,
    silenceDuration,
    frames: [],
    probability: 0,
    inferenceDuration: 0,
    speaking,
    rawAccumulatedSilence: 0,
    rawAccumulatedSpeech: 0,
  });

  // Allow background tasks to drain before closing the recognizer.
  const drain = () =>
    new Promise((resolve) => {
      setTimeout(resolve, 20);
    });

  // Picks the two spans under test out of the exporter's finished spans.
  const turnSpans = (exporter) => {
    const finished = exporter.getFinishedSpans();
    return {
      userTurn: spanByName(finished, "user_turn"),
      eou: spanByName(finished, "eou_detection"),
    };
  };

  it("creates user_turn and parents eou_detection under it (stt mode)", async () => {
    const { exporter } = setupInMemoryTracing();
    const hooks = buildHooks();
    const sttEvents = [
      { type: SpeechEventType.START_OF_SPEECH },
      finalTranscript("hello", 0.9),
      { type: SpeechEventType.END_OF_SPEECH },
    ];

    const recognition = new AudioRecognition({
      recognitionHooks: hooks,
      stt: sttNodeFrom(sttEvents),
      vad: undefined,
      turnDetector: alwaysTrueTurnDetector,
      turnDetectionMode: "stt",
      minEndpointingDelay: 0,
      maxEndpointingDelay: 0,
      sttModel: "deepgram-nova2",
      sttProvider: "deepgram",
      getLinkedParticipant: () => ({ sid: "p1", identity: "bob", kind: ParticipantKind.AGENT }),
    });

    await recognition.start();
    await drain();
    await recognition.close();

    const { userTurn, eou } = turnSpans(exporter);
    expect(userTurn, "user_turn span missing").toBeTruthy();
    expect(eou, "eou_detection span missing").toBeTruthy();
    expect(eou.parentSpanId).toBe(userTurn.spanContext().spanId);

    // Creation-time attributes.
    const expectedAttributes = [
      ["lk.participant_id", "p1"],
      ["lk.participant_identity", "bob"],
      ["lk.participant_kind", "AGENT"],
      ["gen_ai.request.model", "deepgram-nova2"],
      ["gen_ai.provider.name", "deepgram"],
    ];
    for (const [key, expected] of expectedAttributes) {
      expect(userTurn.attributes[key]).toBe(expected);
    }

    // End-of-turn attributes.
    expect(userTurn.attributes["lk.user_transcript"]).toContain("hello");
    expect(userTurn.attributes["lk.transcript_confidence"]).toBeGreaterThan(0);
  });

  it("creates user_turn from VAD startTime (vad mode) and keeps same parenting", async () => {
    const { exporter } = setupInMemoryTracing();
    const hooks = buildHooks();

    const now = Date.now();
    const vadEvents = [
      vadEvent({
        type: VADEventType.START_OF_SPEECH,
        timestamp: now,
        silenceDuration: 0,
        speaking: true,
      }),
      vadEvent({
        type: VADEventType.END_OF_SPEECH,
        timestamp: now + 200,
        silenceDuration: 100,
        speaking: false,
      }),
    ];
    const sttEvents = [finalTranscript("test", 0.8)];

    const recognition = new AudioRecognition({
      recognitionHooks: hooks,
      stt: sttNodeFrom(sttEvents),
      vad: new FakeVAD(vadEvents),
      turnDetector: alwaysTrueTurnDetector,
      turnDetectionMode: "vad",
      minEndpointingDelay: 0,
      maxEndpointingDelay: 0,
      sttModel: "stt-model",
      sttProvider: "stt-provider",
      getLinkedParticipant: () => ({ sid: "p2", identity: "alice", kind: ParticipantKind.AGENT }),
    });

    await recognition.start();
    await drain();
    await recognition.close();

    const { userTurn, eou } = turnSpans(exporter);
    expect(userTurn).toBeTruthy();
    expect(eou).toBeTruthy();
    expect(eou.parentSpanId).toBe(userTurn.spanContext().spanId);

    expect(hooks.onStartOfSpeech).toHaveBeenCalled();
    expect(hooks.onEndOfSpeech).toHaveBeenCalled();
  });
});
|
|
232
|
+
//# sourceMappingURL=audio_recognition_span.test.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../../src/voice/audio_recognition_span.test.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2026 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport { ParticipantKind } from '@livekit/rtc-node';\nimport { InMemorySpanExporter, SimpleSpanProcessor } from '@opentelemetry/sdk-trace-base';\nimport { NodeTracerProvider } from '@opentelemetry/sdk-trace-node';\nimport { describe, expect, it, vi } from 'vitest';\nimport { initializeLogger } from '../log.js';\nimport { type SpeechEvent, SpeechEventType } from '../stt/stt.js';\nimport { setTracerProvider } from '../telemetry/index.js';\nimport { VAD, type VADEvent, VADEventType, type VADStream } from '../vad.js';\nimport { AudioRecognition, type _TurnDetector } from './audio_recognition.js';\n\nfunction setupInMemoryTracing() {\n const exporter = new InMemorySpanExporter();\n const provider = new NodeTracerProvider();\n provider.addSpanProcessor(new SimpleSpanProcessor(exporter));\n provider.register();\n setTracerProvider(provider);\n return { exporter };\n}\n\nfunction spanByName(spans: any[], name: string) {\n return spans.find((s) => s.name === name);\n}\n\nclass FakeVADStream extends (Object as unknown as { new (): VADStream }) {\n // We intentionally avoid extending the real VADStream (it is not exported as a value in JS output\n // in some bundling contexts). 
Instead we emulate the async iterator shape used by AudioRecognition.\n private events: VADEvent[];\n private idx = 0;\n constructor(events: VADEvent[]) {\n super();\n this.events = events;\n }\n updateInputStream() {}\n detachInputStream() {}\n close() {}\n [Symbol.asyncIterator]() {\n return this;\n }\n async next(): Promise<IteratorResult<VADEvent>> {\n if (this.idx >= this.events.length) {\n return { done: true, value: undefined };\n }\n const value = this.events[this.idx++]!;\n return { done: false, value };\n }\n}\n\nclass FakeVAD extends VAD {\n label = 'fake-vad';\n private events: VADEvent[];\n constructor(events: VADEvent[]) {\n super({ updateInterval: 1 });\n this.events = events;\n }\n stream(): any {\n return new FakeVADStream(this.events);\n }\n}\n\nconst alwaysTrueTurnDetector: _TurnDetector = {\n supportsLanguage: async () => true,\n unlikelyThreshold: async () => undefined,\n predictEndOfTurn: async () => 1.0,\n};\n\ndescribe('AudioRecognition user_turn span parity', () => {\n initializeLogger({ pretty: false, level: 'silent' });\n\n it('creates user_turn and parents eou_detection under it (stt mode)', async () => {\n const { exporter } = setupInMemoryTracing();\n\n const hooks = {\n onStartOfSpeech: vi.fn(),\n onVADInferenceDone: vi.fn(),\n onEndOfSpeech: vi.fn(),\n onInterimTranscript: vi.fn(),\n onFinalTranscript: vi.fn(),\n onPreemptiveGeneration: vi.fn(),\n retrieveChatCtx: () =>\n ({\n copy() {\n return this;\n },\n addMessage() {},\n toJSON() {\n return { items: [] };\n },\n }) as any,\n onEndOfTurn: vi.fn(async () => true),\n };\n\n const sttEvents: SpeechEvent[] = [\n { type: SpeechEventType.START_OF_SPEECH },\n {\n type: SpeechEventType.FINAL_TRANSCRIPT,\n alternatives: [\n {\n language: 'en',\n text: 'hello',\n startTime: 0,\n endTime: 0,\n confidence: 0.9,\n },\n ],\n },\n { type: SpeechEventType.END_OF_SPEECH },\n ];\n\n const sttNode = async () =>\n new ReadableStream<SpeechEvent>({\n start(controller) {\n for (const ev of sttEvents) 
controller.enqueue(ev);\n controller.close();\n },\n });\n\n const ar = new AudioRecognition({\n recognitionHooks: hooks as any,\n stt: sttNode as any,\n vad: undefined,\n turnDetector: alwaysTrueTurnDetector,\n turnDetectionMode: 'stt',\n minEndpointingDelay: 0,\n maxEndpointingDelay: 0,\n sttModel: 'deepgram-nova2',\n sttProvider: 'deepgram',\n getLinkedParticipant: () => ({ sid: 'p1', identity: 'bob', kind: ParticipantKind.AGENT }),\n });\n\n await ar.start();\n // allow background task to drain\n await new Promise((r) => setTimeout(r, 20));\n await ar.close();\n\n const spans = exporter.getFinishedSpans();\n const userTurn = spanByName(spans, 'user_turn');\n const eou = spanByName(spans, 'eou_detection');\n expect(userTurn, 'user_turn span missing').toBeTruthy();\n expect(eou, 'eou_detection span missing').toBeTruthy();\n\n expect(eou.parentSpanId).toBe(userTurn.spanContext().spanId);\n\n // creation-time attributes\n expect(userTurn.attributes['lk.participant_id']).toBe('p1');\n expect(userTurn.attributes['lk.participant_identity']).toBe('bob');\n expect(userTurn.attributes['lk.participant_kind']).toBe('AGENT');\n expect(userTurn.attributes['gen_ai.request.model']).toBe('deepgram-nova2');\n expect(userTurn.attributes['gen_ai.provider.name']).toBe('deepgram');\n\n // end-of-turn attributes\n expect(userTurn.attributes['lk.user_transcript']).toContain('hello');\n expect(userTurn.attributes['lk.transcript_confidence']).toBeGreaterThan(0);\n });\n\n it('creates user_turn from VAD startTime (vad mode) and keeps same parenting', async () => {\n const { exporter } = setupInMemoryTracing();\n\n const hooks = {\n onStartOfSpeech: vi.fn(),\n onVADInferenceDone: vi.fn(),\n onEndOfSpeech: vi.fn(),\n onInterimTranscript: vi.fn(),\n onFinalTranscript: vi.fn(),\n onPreemptiveGeneration: vi.fn(),\n retrieveChatCtx: () =>\n ({\n copy() {\n return this;\n },\n addMessage() {},\n toJSON() {\n return { items: [] };\n },\n }) as any,\n onEndOfTurn: vi.fn(async () => true),\n 
};\n\n const now = Date.now();\n const vadEvents: VADEvent[] = [\n {\n type: VADEventType.START_OF_SPEECH,\n samplesIndex: 0,\n timestamp: now,\n speechDuration: 100,\n silenceDuration: 0,\n frames: [],\n probability: 0,\n inferenceDuration: 0,\n speaking: true,\n rawAccumulatedSilence: 0,\n rawAccumulatedSpeech: 0,\n },\n {\n type: VADEventType.END_OF_SPEECH,\n samplesIndex: 0,\n timestamp: now + 200,\n speechDuration: 100,\n silenceDuration: 100,\n frames: [],\n probability: 0,\n inferenceDuration: 0,\n speaking: false,\n rawAccumulatedSilence: 0,\n rawAccumulatedSpeech: 0,\n },\n ];\n\n const sttEvents: SpeechEvent[] = [\n {\n type: SpeechEventType.FINAL_TRANSCRIPT,\n alternatives: [\n {\n language: 'en',\n text: 'test',\n startTime: 0,\n endTime: 0,\n confidence: 0.8,\n },\n ],\n },\n ];\n\n const sttNode = async () =>\n new ReadableStream<SpeechEvent>({\n start(controller) {\n for (const ev of sttEvents) controller.enqueue(ev);\n controller.close();\n },\n });\n\n const ar = new AudioRecognition({\n recognitionHooks: hooks as any,\n stt: sttNode as any,\n vad: new FakeVAD(vadEvents) as any,\n turnDetector: alwaysTrueTurnDetector,\n turnDetectionMode: 'vad',\n minEndpointingDelay: 0,\n maxEndpointingDelay: 0,\n sttModel: 'stt-model',\n sttProvider: 'stt-provider',\n getLinkedParticipant: () => ({ sid: 'p2', identity: 'alice', kind: ParticipantKind.AGENT }),\n });\n\n await ar.start();\n await new Promise((r) => setTimeout(r, 20));\n await ar.close();\n\n const spans = exporter.getFinishedSpans();\n const userTurn = spanByName(spans, 'user_turn');\n const eou = spanByName(spans, 'eou_detection');\n expect(userTurn).toBeTruthy();\n expect(eou).toBeTruthy();\n expect(eou.parentSpanId).toBe(userTurn.spanContext().spanId);\n\n expect(hooks.onStartOfSpeech).toHaveBeenCalled();\n expect(hooks.onEndOfSpeech).toHaveBeenCalled();\n 
});\n});\n"],"mappings":"AAGA,SAAS,uBAAuB;AAChC,SAAS,sBAAsB,2BAA2B;AAC1D,SAAS,0BAA0B;AACnC,SAAS,UAAU,QAAQ,IAAI,UAAU;AACzC,SAAS,wBAAwB;AACjC,SAA2B,uBAAuB;AAClD,SAAS,yBAAyB;AAClC,SAAS,KAAoB,oBAAoC;AACjE,SAAS,wBAA4C;AAErD,SAAS,uBAAuB;AAC9B,QAAM,WAAW,IAAI,qBAAqB;AAC1C,QAAM,WAAW,IAAI,mBAAmB;AACxC,WAAS,iBAAiB,IAAI,oBAAoB,QAAQ,CAAC;AAC3D,WAAS,SAAS;AAClB,oBAAkB,QAAQ;AAC1B,SAAO,EAAE,SAAS;AACpB;AAEA,SAAS,WAAW,OAAc,MAAc;AAC9C,SAAO,MAAM,KAAK,CAAC,MAAM,EAAE,SAAS,IAAI;AAC1C;AAEA,MAAM,sBAAuB,OAA4C;AAAA;AAAA;AAAA,EAG/D;AAAA,EACA,MAAM;AAAA,EACd,YAAY,QAAoB;AAC9B,UAAM;AACN,SAAK,SAAS;AAAA,EAChB;AAAA,EACA,oBAAoB;AAAA,EAAC;AAAA,EACrB,oBAAoB;AAAA,EAAC;AAAA,EACrB,QAAQ;AAAA,EAAC;AAAA,EACT,CAAC,OAAO,aAAa,IAAI;AACvB,WAAO;AAAA,EACT;AAAA,EACA,MAAM,OAA0C;AAC9C,QAAI,KAAK,OAAO,KAAK,OAAO,QAAQ;AAClC,aAAO,EAAE,MAAM,MAAM,OAAO,OAAU;AAAA,IACxC;AACA,UAAM,QAAQ,KAAK,OAAO,KAAK,KAAK;AACpC,WAAO,EAAE,MAAM,OAAO,MAAM;AAAA,EAC9B;AACF;AAEA,MAAM,gBAAgB,IAAI;AAAA,EACxB,QAAQ;AAAA,EACA;AAAA,EACR,YAAY,QAAoB;AAC9B,UAAM,EAAE,gBAAgB,EAAE,CAAC;AAC3B,SAAK,SAAS;AAAA,EAChB;AAAA,EACA,SAAc;AACZ,WAAO,IAAI,cAAc,KAAK,MAAM;AAAA,EACtC;AACF;AAEA,MAAM,yBAAwC;AAAA,EAC5C,kBAAkB,YAAY;AAAA,EAC9B,mBAAmB,YAAY;AAAA,EAC/B,kBAAkB,YAAY;AAChC;AAEA,SAAS,0CAA0C,MAAM;AACvD,mBAAiB,EAAE,QAAQ,OAAO,OAAO,SAAS,CAAC;AAEnD,KAAG,mEAAmE,YAAY;AAChF,UAAM,EAAE,SAAS,IAAI,qBAAqB;AAE1C,UAAM,QAAQ;AAAA,MACZ,iBAAiB,GAAG,GAAG;AAAA,MACvB,oBAAoB,GAAG,GAAG;AAAA,MAC1B,eAAe,GAAG,GAAG;AAAA,MACrB,qBAAqB,GAAG,GAAG;AAAA,MAC3B,mBAAmB,GAAG,GAAG;AAAA,MACzB,wBAAwB,GAAG,GAAG;AAAA,MAC9B,iBAAiB,OACd;AAAA,QACC,OAAO;AACL,iBAAO;AAAA,QACT;AAAA,QACA,aAAa;AAAA,QAAC;AAAA,QACd,SAAS;AACP,iBAAO,EAAE,OAAO,CAAC,EAAE;AAAA,QACrB;AAAA,MACF;AAAA,MACF,aAAa,GAAG,GAAG,YAAY,IAAI;AAAA,IACrC;AAEA,UAAM,YAA2B;AAAA,MAC/B,EAAE,MAAM,gBAAgB,gBAAgB;AAAA,MACxC;AAAA,QACE,MAAM,gBAAgB;AAAA,QACtB,cAAc;AAAA,UACZ;AAAA,YACE,UAAU;AAAA,YACV,MAAM;AAAA,YACN,WAAW;AAAA,YACX,SAAS;AAAA,YACT,YAAY;AAAA,UACd;AAAA,QACF;AAAA,MACF;AAAA,MACA,EAAE,MAAM,gBAAgB,cAAc;AAAA,IACxC;AAEA,UAAM,UAAU,YACd,IAAI,eAA4B;AAAA,MAC9B,MAAM,YAAY;AAChB,mBA
AW,MAAM,UAAW,YAAW,QAAQ,EAAE;AACjD,mBAAW,MAAM;AAAA,MACnB;AAAA,IACF,CAAC;AAEH,UAAM,KAAK,IAAI,iBAAiB;AAAA,MAC9B,kBAAkB;AAAA,MAClB,KAAK;AAAA,MACL,KAAK;AAAA,MACL,cAAc;AAAA,MACd,mBAAmB;AAAA,MACnB,qBAAqB;AAAA,MACrB,qBAAqB;AAAA,MACrB,UAAU;AAAA,MACV,aAAa;AAAA,MACb,sBAAsB,OAAO,EAAE,KAAK,MAAM,UAAU,OAAO,MAAM,gBAAgB,MAAM;AAAA,IACzF,CAAC;AAED,UAAM,GAAG,MAAM;AAEf,UAAM,IAAI,QAAQ,CAAC,MAAM,WAAW,GAAG,EAAE,CAAC;AAC1C,UAAM,GAAG,MAAM;AAEf,UAAM,QAAQ,SAAS,iBAAiB;AACxC,UAAM,WAAW,WAAW,OAAO,WAAW;AAC9C,UAAM,MAAM,WAAW,OAAO,eAAe;AAC7C,WAAO,UAAU,wBAAwB,EAAE,WAAW;AACtD,WAAO,KAAK,4BAA4B,EAAE,WAAW;AAErD,WAAO,IAAI,YAAY,EAAE,KAAK,SAAS,YAAY,EAAE,MAAM;AAG3D,WAAO,SAAS,WAAW,mBAAmB,CAAC,EAAE,KAAK,IAAI;AAC1D,WAAO,SAAS,WAAW,yBAAyB,CAAC,EAAE,KAAK,KAAK;AACjE,WAAO,SAAS,WAAW,qBAAqB,CAAC,EAAE,KAAK,OAAO;AAC/D,WAAO,SAAS,WAAW,sBAAsB,CAAC,EAAE,KAAK,gBAAgB;AACzE,WAAO,SAAS,WAAW,sBAAsB,CAAC,EAAE,KAAK,UAAU;AAGnE,WAAO,SAAS,WAAW,oBAAoB,CAAC,EAAE,UAAU,OAAO;AACnE,WAAO,SAAS,WAAW,0BAA0B,CAAC,EAAE,gBAAgB,CAAC;AAAA,EAC3E,CAAC;AAED,KAAG,4EAA4E,YAAY;AACzF,UAAM,EAAE,SAAS,IAAI,qBAAqB;AAE1C,UAAM,QAAQ;AAAA,MACZ,iBAAiB,GAAG,GAAG;AAAA,MACvB,oBAAoB,GAAG,GAAG;AAAA,MAC1B,eAAe,GAAG,GAAG;AAAA,MACrB,qBAAqB,GAAG,GAAG;AAAA,MAC3B,mBAAmB,GAAG,GAAG;AAAA,MACzB,wBAAwB,GAAG,GAAG;AAAA,MAC9B,iBAAiB,OACd;AAAA,QACC,OAAO;AACL,iBAAO;AAAA,QACT;AAAA,QACA,aAAa;AAAA,QAAC;AAAA,QACd,SAAS;AACP,iBAAO,EAAE,OAAO,CAAC,EAAE;AAAA,QACrB;AAAA,MACF;AAAA,MACF,aAAa,GAAG,GAAG,YAAY,IAAI;AAAA,IACrC;AAEA,UAAM,MAAM,KAAK,IAAI;AACrB,UAAM,YAAwB;AAAA,MAC5B;AAAA,QACE,MAAM,aAAa;AAAA,QACnB,cAAc;AAAA,QACd,WAAW;AAAA,QACX,gBAAgB;AAAA,QAChB,iBAAiB;AAAA,QACjB,QAAQ,CAAC;AAAA,QACT,aAAa;AAAA,QACb,mBAAmB;AAAA,QACnB,UAAU;AAAA,QACV,uBAAuB;AAAA,QACvB,sBAAsB;AAAA,MACxB;AAAA,MACA;AAAA,QACE,MAAM,aAAa;AAAA,QACnB,cAAc;AAAA,QACd,WAAW,MAAM;AAAA,QACjB,gBAAgB;AAAA,QAChB,iBAAiB;AAAA,QACjB,QAAQ,CAAC;AAAA,QACT,aAAa;AAAA,QACb,mBAAmB;AAAA,QACnB,UAAU;AAAA,QACV,uBAAuB;AAAA,QACvB,sBAAsB;AAAA,MACxB;AAAA,IACF;AAEA,UAAM,YAA2B;AAAA,MAC/B;AAAA,QACE,MAAM,gBAAgB;AAAA,QACtB,cAAc;AAAA,UACZ;AAAA,YACE,UAAU;AAAA,YACV,MAAM;AA
AA,YACN,WAAW;AAAA,YACX,SAAS;AAAA,YACT,YAAY;AAAA,UACd;AAAA,QACF;AAAA,MACF;AAAA,IACF;AAEA,UAAM,UAAU,YACd,IAAI,eAA4B;AAAA,MAC9B,MAAM,YAAY;AAChB,mBAAW,MAAM,UAAW,YAAW,QAAQ,EAAE;AACjD,mBAAW,MAAM;AAAA,MACnB;AAAA,IACF,CAAC;AAEH,UAAM,KAAK,IAAI,iBAAiB;AAAA,MAC9B,kBAAkB;AAAA,MAClB,KAAK;AAAA,MACL,KAAK,IAAI,QAAQ,SAAS;AAAA,MAC1B,cAAc;AAAA,MACd,mBAAmB;AAAA,MACnB,qBAAqB;AAAA,MACrB,qBAAqB;AAAA,MACrB,UAAU;AAAA,MACV,aAAa;AAAA,MACb,sBAAsB,OAAO,EAAE,KAAK,MAAM,UAAU,SAAS,MAAM,gBAAgB,MAAM;AAAA,IAC3F,CAAC;AAED,UAAM,GAAG,MAAM;AACf,UAAM,IAAI,QAAQ,CAAC,MAAM,WAAW,GAAG,EAAE,CAAC;AAC1C,UAAM,GAAG,MAAM;AAEf,UAAM,QAAQ,SAAS,iBAAiB;AACxC,UAAM,WAAW,WAAW,OAAO,WAAW;AAC9C,UAAM,MAAM,WAAW,OAAO,eAAe;AAC7C,WAAO,QAAQ,EAAE,WAAW;AAC5B,WAAO,GAAG,EAAE,WAAW;AACvB,WAAO,IAAI,YAAY,EAAE,KAAK,SAAS,YAAY,EAAE,MAAM;AAE3D,WAAO,MAAM,eAAe,EAAE,iBAAiB;AAC/C,WAAO,MAAM,aAAa,EAAE,iBAAiB;AAAA,EAC/C,CAAC;AACH,CAAC;","names":[]}
|
|
@@ -528,7 +528,7 @@ function performTextForwarding(source, controller, textOutput) {
|
|
|
528
528
|
out
|
|
529
529
|
];
|
|
530
530
|
}
|
|
531
|
-
async function forwardAudio(ttsStream,
|
|
531
|
+
async function forwardAudio(ttsStream, audioOutput, out, signal) {
|
|
532
532
|
const reader = ttsStream.getReader();
|
|
533
533
|
let resampler = null;
|
|
534
534
|
const onPlaybackStarted = (ev) => {
|
|
@@ -537,8 +537,8 @@ async function forwardAudio(ttsStream, audioOuput, out, signal) {
|
|
|
537
537
|
}
|
|
538
538
|
};
|
|
539
539
|
try {
|
|
540
|
-
|
|
541
|
-
|
|
540
|
+
audioOutput.on(import_io.AudioOutput.EVENT_PLAYBACK_STARTED, onPlaybackStarted);
|
|
541
|
+
audioOutput.resume();
|
|
542
542
|
while (true) {
|
|
543
543
|
if (signal == null ? void 0 : signal.aborted) {
|
|
544
544
|
break;
|
|
@@ -546,29 +546,29 @@ async function forwardAudio(ttsStream, audioOuput, out, signal) {
|
|
|
546
546
|
const { done, value: frame } = await reader.read();
|
|
547
547
|
if (done) break;
|
|
548
548
|
out.audio.push(frame);
|
|
549
|
-
if (!out.firstFrameFut.done &&
|
|
550
|
-
resampler = new import_rtc_node.AudioResampler(frame.sampleRate,
|
|
549
|
+
if (!out.firstFrameFut.done && audioOutput.sampleRate && audioOutput.sampleRate !== frame.sampleRate && !resampler) {
|
|
550
|
+
resampler = new import_rtc_node.AudioResampler(frame.sampleRate, audioOutput.sampleRate, 1);
|
|
551
551
|
}
|
|
552
552
|
if (resampler) {
|
|
553
553
|
for (const f of resampler.push(frame)) {
|
|
554
|
-
await
|
|
554
|
+
await audioOutput.captureFrame(f);
|
|
555
555
|
}
|
|
556
556
|
} else {
|
|
557
|
-
await
|
|
557
|
+
await audioOutput.captureFrame(frame);
|
|
558
558
|
}
|
|
559
559
|
}
|
|
560
560
|
if (resampler) {
|
|
561
561
|
for (const f of resampler.flush()) {
|
|
562
|
-
await
|
|
562
|
+
await audioOutput.captureFrame(f);
|
|
563
563
|
}
|
|
564
564
|
}
|
|
565
565
|
} finally {
|
|
566
|
-
|
|
566
|
+
audioOutput.off(import_io.AudioOutput.EVENT_PLAYBACK_STARTED, onPlaybackStarted);
|
|
567
567
|
if (!out.firstFrameFut.done) {
|
|
568
568
|
out.firstFrameFut.reject(new Error("audio forwarding cancelled before playback started"));
|
|
569
569
|
}
|
|
570
570
|
reader == null ? void 0 : reader.releaseLock();
|
|
571
|
-
|
|
571
|
+
audioOutput.flush();
|
|
572
572
|
}
|
|
573
573
|
}
|
|
574
574
|
function performAudioForwarding(ttsStream, audioOutput, controller) {
|
|
@@ -689,13 +689,6 @@ function performToolExecutions({
|
|
|
689
689
|
},
|
|
690
690
|
"Executing LLM tool call"
|
|
691
691
|
);
|
|
692
|
-
const toolExecution = import_agent.asyncLocalStorage.run({ functionCall: toolCall }, async () => {
|
|
693
|
-
return await tool.execute(parsedArgs, {
|
|
694
|
-
ctx: new import_run_context.RunContext(session, speechHandle, toolCall),
|
|
695
|
-
toolCallId: toolCall.callId,
|
|
696
|
-
abortSignal: signal
|
|
697
|
-
});
|
|
698
|
-
});
|
|
699
692
|
const _tracableToolExecutionImpl = async (toolExecTask, span) => {
|
|
700
693
|
span.setAttribute(import_telemetry.traceTypes.ATTR_FUNCTION_TOOL_NAME, toolCall.name);
|
|
701
694
|
span.setAttribute(import_telemetry.traceTypes.ATTR_FUNCTION_TOOL_ARGS, toolCall.args);
|
|
@@ -745,9 +738,36 @@ function performToolExecutions({
|
|
|
745
738
|
const tracableToolExecution = (toolExecTask) => import_telemetry.tracer.startActiveSpan(async (span) => _tracableToolExecutionImpl(toolExecTask, span), {
|
|
746
739
|
name: "function_tool"
|
|
747
740
|
});
|
|
748
|
-
|
|
741
|
+
const toolTask = import_utils.Task.from(
|
|
742
|
+
async () => {
|
|
743
|
+
const currentTask = import_utils.Task.current();
|
|
744
|
+
if (currentTask) {
|
|
745
|
+
(0, import_agent._setActivityTaskInfo)(currentTask, {
|
|
746
|
+
speechHandle,
|
|
747
|
+
functionCall: toolCall,
|
|
748
|
+
inlineTask: true
|
|
749
|
+
});
|
|
750
|
+
}
|
|
751
|
+
const toolExecution = import_agent.functionCallStorage.run({ functionCall: toolCall }, async () => {
|
|
752
|
+
return await tool.execute(parsedArgs, {
|
|
753
|
+
ctx: new import_run_context.RunContext(session, speechHandle, toolCall),
|
|
754
|
+
toolCallId: toolCall.callId,
|
|
755
|
+
abortSignal: signal
|
|
756
|
+
});
|
|
757
|
+
});
|
|
758
|
+
await tracableToolExecution(toolExecution);
|
|
759
|
+
},
|
|
760
|
+
controller2,
|
|
761
|
+
`performToolExecution:${toolCall.name}`
|
|
762
|
+
);
|
|
763
|
+
(0, import_agent._setActivityTaskInfo)(toolTask, {
|
|
764
|
+
speechHandle,
|
|
765
|
+
functionCall: toolCall,
|
|
766
|
+
inlineTask: true
|
|
767
|
+
});
|
|
768
|
+
tasks.push(toolTask);
|
|
749
769
|
}
|
|
750
|
-
await Promise.allSettled(tasks);
|
|
770
|
+
await Promise.allSettled(tasks.map((task) => task.result));
|
|
751
771
|
if (toolOutput.output.length > 0) {
|
|
752
772
|
logger.debug(
|
|
753
773
|
{
|