@livekit/agents 1.0.44 → 1.0.46
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/ipc/supervised_proc.cjs +1 -1
- package/dist/ipc/supervised_proc.cjs.map +1 -1
- package/dist/ipc/supervised_proc.js +1 -1
- package/dist/ipc/supervised_proc.js.map +1 -1
- package/dist/llm/llm.cjs +1 -1
- package/dist/llm/llm.cjs.map +1 -1
- package/dist/llm/llm.js +1 -1
- package/dist/llm/llm.js.map +1 -1
- package/dist/log.cjs +13 -9
- package/dist/log.cjs.map +1 -1
- package/dist/log.d.cts +1 -1
- package/dist/log.d.ts +1 -1
- package/dist/log.d.ts.map +1 -1
- package/dist/log.js +13 -9
- package/dist/log.js.map +1 -1
- package/dist/stream/index.cjs +3 -0
- package/dist/stream/index.cjs.map +1 -1
- package/dist/stream/index.d.cts +1 -0
- package/dist/stream/index.d.ts +1 -0
- package/dist/stream/index.d.ts.map +1 -1
- package/dist/stream/index.js +2 -0
- package/dist/stream/index.js.map +1 -1
- package/dist/stream/multi_input_stream.cjs +139 -0
- package/dist/stream/multi_input_stream.cjs.map +1 -0
- package/dist/stream/multi_input_stream.d.cts +55 -0
- package/dist/stream/multi_input_stream.d.ts +55 -0
- package/dist/stream/multi_input_stream.d.ts.map +1 -0
- package/dist/stream/multi_input_stream.js +115 -0
- package/dist/stream/multi_input_stream.js.map +1 -0
- package/dist/stream/multi_input_stream.test.cjs +340 -0
- package/dist/stream/multi_input_stream.test.cjs.map +1 -0
- package/dist/stream/multi_input_stream.test.js +339 -0
- package/dist/stream/multi_input_stream.test.js.map +1 -0
- package/dist/stt/stt.cjs +2 -2
- package/dist/stt/stt.cjs.map +1 -1
- package/dist/stt/stt.js +2 -2
- package/dist/stt/stt.js.map +1 -1
- package/dist/telemetry/trace_types.cjs +42 -0
- package/dist/telemetry/trace_types.cjs.map +1 -1
- package/dist/telemetry/trace_types.d.cts +14 -0
- package/dist/telemetry/trace_types.d.ts +14 -0
- package/dist/telemetry/trace_types.d.ts.map +1 -1
- package/dist/telemetry/trace_types.js +28 -0
- package/dist/telemetry/trace_types.js.map +1 -1
- package/dist/tts/fallback_adapter.cjs +466 -0
- package/dist/tts/fallback_adapter.cjs.map +1 -0
- package/dist/tts/fallback_adapter.d.cts +110 -0
- package/dist/tts/fallback_adapter.d.ts +110 -0
- package/dist/tts/fallback_adapter.d.ts.map +1 -0
- package/dist/tts/fallback_adapter.js +442 -0
- package/dist/tts/fallback_adapter.js.map +1 -0
- package/dist/tts/index.cjs +3 -0
- package/dist/tts/index.cjs.map +1 -1
- package/dist/tts/index.d.cts +1 -0
- package/dist/tts/index.d.ts +1 -0
- package/dist/tts/index.d.ts.map +1 -1
- package/dist/tts/index.js +2 -0
- package/dist/tts/index.js.map +1 -1
- package/dist/tts/tts.cjs +2 -2
- package/dist/tts/tts.cjs.map +1 -1
- package/dist/tts/tts.js +2 -2
- package/dist/tts/tts.js.map +1 -1
- package/dist/utils.cjs +13 -0
- package/dist/utils.cjs.map +1 -1
- package/dist/utils.d.cts +1 -0
- package/dist/utils.d.ts +1 -0
- package/dist/utils.d.ts.map +1 -1
- package/dist/utils.js +13 -0
- package/dist/utils.js.map +1 -1
- package/dist/vad.cjs +11 -10
- package/dist/vad.cjs.map +1 -1
- package/dist/vad.d.cts +5 -3
- package/dist/vad.d.ts +5 -3
- package/dist/vad.d.ts.map +1 -1
- package/dist/vad.js +11 -10
- package/dist/vad.js.map +1 -1
- package/dist/voice/agent_activity.cjs +35 -10
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.cts +1 -0
- package/dist/voice/agent_activity.d.ts +1 -0
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +35 -10
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_session.cjs +19 -7
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +3 -2
- package/dist/voice/agent_session.d.ts +3 -2
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +19 -7
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/audio_recognition.cjs +85 -36
- package/dist/voice/audio_recognition.cjs.map +1 -1
- package/dist/voice/audio_recognition.d.cts +22 -1
- package/dist/voice/audio_recognition.d.ts +22 -1
- package/dist/voice/audio_recognition.d.ts.map +1 -1
- package/dist/voice/audio_recognition.js +89 -36
- package/dist/voice/audio_recognition.js.map +1 -1
- package/dist/voice/audio_recognition_span.test.cjs +233 -0
- package/dist/voice/audio_recognition_span.test.cjs.map +1 -0
- package/dist/voice/audio_recognition_span.test.js +232 -0
- package/dist/voice/audio_recognition_span.test.js.map +1 -0
- package/dist/voice/io.cjs +6 -3
- package/dist/voice/io.cjs.map +1 -1
- package/dist/voice/io.d.cts +3 -2
- package/dist/voice/io.d.ts +3 -2
- package/dist/voice/io.d.ts.map +1 -1
- package/dist/voice/io.js +6 -3
- package/dist/voice/io.js.map +1 -1
- package/dist/voice/recorder_io/recorder_io.cjs +3 -1
- package/dist/voice/recorder_io/recorder_io.cjs.map +1 -1
- package/dist/voice/recorder_io/recorder_io.d.ts.map +1 -1
- package/dist/voice/recorder_io/recorder_io.js +3 -1
- package/dist/voice/recorder_io/recorder_io.js.map +1 -1
- package/dist/voice/room_io/_input.cjs +23 -20
- package/dist/voice/room_io/_input.cjs.map +1 -1
- package/dist/voice/room_io/_input.d.cts +2 -2
- package/dist/voice/room_io/_input.d.ts +2 -2
- package/dist/voice/room_io/_input.d.ts.map +1 -1
- package/dist/voice/room_io/_input.js +13 -9
- package/dist/voice/room_io/_input.js.map +1 -1
- package/dist/voice/room_io/room_io.cjs +9 -0
- package/dist/voice/room_io/room_io.cjs.map +1 -1
- package/dist/voice/room_io/room_io.d.cts +3 -1
- package/dist/voice/room_io/room_io.d.ts +3 -1
- package/dist/voice/room_io/room_io.d.ts.map +1 -1
- package/dist/voice/room_io/room_io.js +9 -0
- package/dist/voice/room_io/room_io.js.map +1 -1
- package/dist/voice/utils.cjs +47 -0
- package/dist/voice/utils.cjs.map +1 -0
- package/dist/voice/utils.d.cts +4 -0
- package/dist/voice/utils.d.ts +4 -0
- package/dist/voice/utils.d.ts.map +1 -0
- package/dist/voice/utils.js +23 -0
- package/dist/voice/utils.js.map +1 -0
- package/package.json +1 -1
- package/src/ipc/supervised_proc.ts +1 -1
- package/src/llm/llm.ts +1 -1
- package/src/log.ts +22 -11
- package/src/stream/index.ts +1 -0
- package/src/stream/multi_input_stream.test.ts +540 -0
- package/src/stream/multi_input_stream.ts +172 -0
- package/src/stt/stt.ts +2 -2
- package/src/telemetry/trace_types.ts +18 -0
- package/src/tts/fallback_adapter.ts +579 -0
- package/src/tts/index.ts +1 -0
- package/src/tts/tts.ts +2 -2
- package/src/utils.ts +16 -0
- package/src/vad.ts +12 -11
- package/src/voice/agent_activity.ts +25 -0
- package/src/voice/agent_session.ts +17 -11
- package/src/voice/audio_recognition.ts +114 -38
- package/src/voice/audio_recognition_span.test.ts +261 -0
- package/src/voice/io.ts +7 -4
- package/src/voice/recorder_io/recorder_io.ts +2 -1
- package/src/voice/room_io/_input.ts +16 -10
- package/src/voice/room_io/room_io.ts +12 -0
- package/src/voice/utils.ts +29 -0
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var import_rtc_node = require("@livekit/rtc-node");
|
|
3
|
+
var import_sdk_trace_base = require("@opentelemetry/sdk-trace-base");
|
|
4
|
+
var import_sdk_trace_node = require("@opentelemetry/sdk-trace-node");
|
|
5
|
+
var import_vitest = require("vitest");
|
|
6
|
+
var import_log = require("../log.cjs");
|
|
7
|
+
var import_stt = require("../stt/stt.cjs");
|
|
8
|
+
var import_telemetry = require("../telemetry/index.cjs");
|
|
9
|
+
var import_vad = require("../vad.cjs");
|
|
10
|
+
var import_audio_recognition = require("./audio_recognition.cjs");
|
|
11
|
+
function setupInMemoryTracing() {
|
|
12
|
+
const exporter = new import_sdk_trace_base.InMemorySpanExporter();
|
|
13
|
+
const provider = new import_sdk_trace_node.NodeTracerProvider();
|
|
14
|
+
provider.addSpanProcessor(new import_sdk_trace_base.SimpleSpanProcessor(exporter));
|
|
15
|
+
provider.register();
|
|
16
|
+
(0, import_telemetry.setTracerProvider)(provider);
|
|
17
|
+
return { exporter };
|
|
18
|
+
}
|
|
19
|
+
function spanByName(spans, name) {
|
|
20
|
+
return spans.find((s) => s.name === name);
|
|
21
|
+
}
|
|
22
|
+
class FakeVADStream extends Object {
|
|
23
|
+
// We intentionally avoid extending the real VADStream (it is not exported as a value in JS output
|
|
24
|
+
// in some bundling contexts). Instead we emulate the async iterator shape used by AudioRecognition.
|
|
25
|
+
events;
|
|
26
|
+
idx = 0;
|
|
27
|
+
constructor(events) {
|
|
28
|
+
super();
|
|
29
|
+
this.events = events;
|
|
30
|
+
}
|
|
31
|
+
updateInputStream() {
|
|
32
|
+
}
|
|
33
|
+
detachInputStream() {
|
|
34
|
+
}
|
|
35
|
+
close() {
|
|
36
|
+
}
|
|
37
|
+
[Symbol.asyncIterator]() {
|
|
38
|
+
return this;
|
|
39
|
+
}
|
|
40
|
+
async next() {
|
|
41
|
+
if (this.idx >= this.events.length) {
|
|
42
|
+
return { done: true, value: void 0 };
|
|
43
|
+
}
|
|
44
|
+
const value = this.events[this.idx++];
|
|
45
|
+
return { done: false, value };
|
|
46
|
+
}
|
|
47
|
+
}
|
|
48
|
+
class FakeVAD extends import_vad.VAD {
|
|
49
|
+
label = "fake-vad";
|
|
50
|
+
events;
|
|
51
|
+
constructor(events) {
|
|
52
|
+
super({ updateInterval: 1 });
|
|
53
|
+
this.events = events;
|
|
54
|
+
}
|
|
55
|
+
stream() {
|
|
56
|
+
return new FakeVADStream(this.events);
|
|
57
|
+
}
|
|
58
|
+
}
|
|
59
|
+
const alwaysTrueTurnDetector = {
|
|
60
|
+
supportsLanguage: async () => true,
|
|
61
|
+
unlikelyThreshold: async () => void 0,
|
|
62
|
+
predictEndOfTurn: async () => 1
|
|
63
|
+
};
|
|
64
|
+
(0, import_vitest.describe)("AudioRecognition user_turn span parity", () => {
|
|
65
|
+
(0, import_log.initializeLogger)({ pretty: false, level: "silent" });
|
|
66
|
+
(0, import_vitest.it)("creates user_turn and parents eou_detection under it (stt mode)", async () => {
|
|
67
|
+
const { exporter } = setupInMemoryTracing();
|
|
68
|
+
const hooks = {
|
|
69
|
+
onStartOfSpeech: import_vitest.vi.fn(),
|
|
70
|
+
onVADInferenceDone: import_vitest.vi.fn(),
|
|
71
|
+
onEndOfSpeech: import_vitest.vi.fn(),
|
|
72
|
+
onInterimTranscript: import_vitest.vi.fn(),
|
|
73
|
+
onFinalTranscript: import_vitest.vi.fn(),
|
|
74
|
+
onPreemptiveGeneration: import_vitest.vi.fn(),
|
|
75
|
+
retrieveChatCtx: () => ({
|
|
76
|
+
copy() {
|
|
77
|
+
return this;
|
|
78
|
+
},
|
|
79
|
+
addMessage() {
|
|
80
|
+
},
|
|
81
|
+
toJSON() {
|
|
82
|
+
return { items: [] };
|
|
83
|
+
}
|
|
84
|
+
}),
|
|
85
|
+
onEndOfTurn: import_vitest.vi.fn(async () => true)
|
|
86
|
+
};
|
|
87
|
+
const sttEvents = [
|
|
88
|
+
{ type: import_stt.SpeechEventType.START_OF_SPEECH },
|
|
89
|
+
{
|
|
90
|
+
type: import_stt.SpeechEventType.FINAL_TRANSCRIPT,
|
|
91
|
+
alternatives: [
|
|
92
|
+
{
|
|
93
|
+
language: "en",
|
|
94
|
+
text: "hello",
|
|
95
|
+
startTime: 0,
|
|
96
|
+
endTime: 0,
|
|
97
|
+
confidence: 0.9
|
|
98
|
+
}
|
|
99
|
+
]
|
|
100
|
+
},
|
|
101
|
+
{ type: import_stt.SpeechEventType.END_OF_SPEECH }
|
|
102
|
+
];
|
|
103
|
+
const sttNode = async () => new ReadableStream({
|
|
104
|
+
start(controller) {
|
|
105
|
+
for (const ev of sttEvents) controller.enqueue(ev);
|
|
106
|
+
controller.close();
|
|
107
|
+
}
|
|
108
|
+
});
|
|
109
|
+
const ar = new import_audio_recognition.AudioRecognition({
|
|
110
|
+
recognitionHooks: hooks,
|
|
111
|
+
stt: sttNode,
|
|
112
|
+
vad: void 0,
|
|
113
|
+
turnDetector: alwaysTrueTurnDetector,
|
|
114
|
+
turnDetectionMode: "stt",
|
|
115
|
+
minEndpointingDelay: 0,
|
|
116
|
+
maxEndpointingDelay: 0,
|
|
117
|
+
sttModel: "deepgram-nova2",
|
|
118
|
+
sttProvider: "deepgram",
|
|
119
|
+
getLinkedParticipant: () => ({ sid: "p1", identity: "bob", kind: import_rtc_node.ParticipantKind.AGENT })
|
|
120
|
+
});
|
|
121
|
+
await ar.start();
|
|
122
|
+
await new Promise((r) => setTimeout(r, 20));
|
|
123
|
+
await ar.close();
|
|
124
|
+
const spans = exporter.getFinishedSpans();
|
|
125
|
+
const userTurn = spanByName(spans, "user_turn");
|
|
126
|
+
const eou = spanByName(spans, "eou_detection");
|
|
127
|
+
(0, import_vitest.expect)(userTurn, "user_turn span missing").toBeTruthy();
|
|
128
|
+
(0, import_vitest.expect)(eou, "eou_detection span missing").toBeTruthy();
|
|
129
|
+
(0, import_vitest.expect)(eou.parentSpanId).toBe(userTurn.spanContext().spanId);
|
|
130
|
+
(0, import_vitest.expect)(userTurn.attributes["lk.participant_id"]).toBe("p1");
|
|
131
|
+
(0, import_vitest.expect)(userTurn.attributes["lk.participant_identity"]).toBe("bob");
|
|
132
|
+
(0, import_vitest.expect)(userTurn.attributes["lk.participant_kind"]).toBe("AGENT");
|
|
133
|
+
(0, import_vitest.expect)(userTurn.attributes["gen_ai.request.model"]).toBe("deepgram-nova2");
|
|
134
|
+
(0, import_vitest.expect)(userTurn.attributes["gen_ai.provider.name"]).toBe("deepgram");
|
|
135
|
+
(0, import_vitest.expect)(userTurn.attributes["lk.user_transcript"]).toContain("hello");
|
|
136
|
+
(0, import_vitest.expect)(userTurn.attributes["lk.transcript_confidence"]).toBeGreaterThan(0);
|
|
137
|
+
});
|
|
138
|
+
(0, import_vitest.it)("creates user_turn from VAD startTime (vad mode) and keeps same parenting", async () => {
|
|
139
|
+
const { exporter } = setupInMemoryTracing();
|
|
140
|
+
const hooks = {
|
|
141
|
+
onStartOfSpeech: import_vitest.vi.fn(),
|
|
142
|
+
onVADInferenceDone: import_vitest.vi.fn(),
|
|
143
|
+
onEndOfSpeech: import_vitest.vi.fn(),
|
|
144
|
+
onInterimTranscript: import_vitest.vi.fn(),
|
|
145
|
+
onFinalTranscript: import_vitest.vi.fn(),
|
|
146
|
+
onPreemptiveGeneration: import_vitest.vi.fn(),
|
|
147
|
+
retrieveChatCtx: () => ({
|
|
148
|
+
copy() {
|
|
149
|
+
return this;
|
|
150
|
+
},
|
|
151
|
+
addMessage() {
|
|
152
|
+
},
|
|
153
|
+
toJSON() {
|
|
154
|
+
return { items: [] };
|
|
155
|
+
}
|
|
156
|
+
}),
|
|
157
|
+
onEndOfTurn: import_vitest.vi.fn(async () => true)
|
|
158
|
+
};
|
|
159
|
+
const now = Date.now();
|
|
160
|
+
const vadEvents = [
|
|
161
|
+
{
|
|
162
|
+
type: import_vad.VADEventType.START_OF_SPEECH,
|
|
163
|
+
samplesIndex: 0,
|
|
164
|
+
timestamp: now,
|
|
165
|
+
speechDuration: 100,
|
|
166
|
+
silenceDuration: 0,
|
|
167
|
+
frames: [],
|
|
168
|
+
probability: 0,
|
|
169
|
+
inferenceDuration: 0,
|
|
170
|
+
speaking: true,
|
|
171
|
+
rawAccumulatedSilence: 0,
|
|
172
|
+
rawAccumulatedSpeech: 0
|
|
173
|
+
},
|
|
174
|
+
{
|
|
175
|
+
type: import_vad.VADEventType.END_OF_SPEECH,
|
|
176
|
+
samplesIndex: 0,
|
|
177
|
+
timestamp: now + 200,
|
|
178
|
+
speechDuration: 100,
|
|
179
|
+
silenceDuration: 100,
|
|
180
|
+
frames: [],
|
|
181
|
+
probability: 0,
|
|
182
|
+
inferenceDuration: 0,
|
|
183
|
+
speaking: false,
|
|
184
|
+
rawAccumulatedSilence: 0,
|
|
185
|
+
rawAccumulatedSpeech: 0
|
|
186
|
+
}
|
|
187
|
+
];
|
|
188
|
+
const sttEvents = [
|
|
189
|
+
{
|
|
190
|
+
type: import_stt.SpeechEventType.FINAL_TRANSCRIPT,
|
|
191
|
+
alternatives: [
|
|
192
|
+
{
|
|
193
|
+
language: "en",
|
|
194
|
+
text: "test",
|
|
195
|
+
startTime: 0,
|
|
196
|
+
endTime: 0,
|
|
197
|
+
confidence: 0.8
|
|
198
|
+
}
|
|
199
|
+
]
|
|
200
|
+
}
|
|
201
|
+
];
|
|
202
|
+
const sttNode = async () => new ReadableStream({
|
|
203
|
+
start(controller) {
|
|
204
|
+
for (const ev of sttEvents) controller.enqueue(ev);
|
|
205
|
+
controller.close();
|
|
206
|
+
}
|
|
207
|
+
});
|
|
208
|
+
const ar = new import_audio_recognition.AudioRecognition({
|
|
209
|
+
recognitionHooks: hooks,
|
|
210
|
+
stt: sttNode,
|
|
211
|
+
vad: new FakeVAD(vadEvents),
|
|
212
|
+
turnDetector: alwaysTrueTurnDetector,
|
|
213
|
+
turnDetectionMode: "vad",
|
|
214
|
+
minEndpointingDelay: 0,
|
|
215
|
+
maxEndpointingDelay: 0,
|
|
216
|
+
sttModel: "stt-model",
|
|
217
|
+
sttProvider: "stt-provider",
|
|
218
|
+
getLinkedParticipant: () => ({ sid: "p2", identity: "alice", kind: import_rtc_node.ParticipantKind.AGENT })
|
|
219
|
+
});
|
|
220
|
+
await ar.start();
|
|
221
|
+
await new Promise((r) => setTimeout(r, 20));
|
|
222
|
+
await ar.close();
|
|
223
|
+
const spans = exporter.getFinishedSpans();
|
|
224
|
+
const userTurn = spanByName(spans, "user_turn");
|
|
225
|
+
const eou = spanByName(spans, "eou_detection");
|
|
226
|
+
(0, import_vitest.expect)(userTurn).toBeTruthy();
|
|
227
|
+
(0, import_vitest.expect)(eou).toBeTruthy();
|
|
228
|
+
(0, import_vitest.expect)(eou.parentSpanId).toBe(userTurn.spanContext().spanId);
|
|
229
|
+
(0, import_vitest.expect)(hooks.onStartOfSpeech).toHaveBeenCalled();
|
|
230
|
+
(0, import_vitest.expect)(hooks.onEndOfSpeech).toHaveBeenCalled();
|
|
231
|
+
});
|
|
232
|
+
});
|
|
233
|
+
//# sourceMappingURL=audio_recognition_span.test.cjs.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../../src/voice/audio_recognition_span.test.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2026 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport { ParticipantKind } from '@livekit/rtc-node';\nimport { InMemorySpanExporter, SimpleSpanProcessor } from '@opentelemetry/sdk-trace-base';\nimport { NodeTracerProvider } from '@opentelemetry/sdk-trace-node';\nimport { describe, expect, it, vi } from 'vitest';\nimport { initializeLogger } from '../log.js';\nimport { type SpeechEvent, SpeechEventType } from '../stt/stt.js';\nimport { setTracerProvider } from '../telemetry/index.js';\nimport { VAD, type VADEvent, VADEventType, type VADStream } from '../vad.js';\nimport { AudioRecognition, type _TurnDetector } from './audio_recognition.js';\n\nfunction setupInMemoryTracing() {\n const exporter = new InMemorySpanExporter();\n const provider = new NodeTracerProvider();\n provider.addSpanProcessor(new SimpleSpanProcessor(exporter));\n provider.register();\n setTracerProvider(provider);\n return { exporter };\n}\n\nfunction spanByName(spans: any[], name: string) {\n return spans.find((s) => s.name === name);\n}\n\nclass FakeVADStream extends (Object as unknown as { new (): VADStream }) {\n // We intentionally avoid extending the real VADStream (it is not exported as a value in JS output\n // in some bundling contexts). Instead we emulate the async iterator shape used by AudioRecognition.\n private events: VADEvent[];\n private idx = 0;\n constructor(events: VADEvent[]) {\n super();\n this.events = events;\n }\n updateInputStream() {}\n detachInputStream() {}\n close() {}\n [Symbol.asyncIterator]() {\n return this;\n }\n async next(): Promise<IteratorResult<VADEvent>> {\n if (this.idx >= this.events.length) {\n return { done: true, value: undefined };\n }\n const value = this.events[this.idx++]!;\n return { done: false, value };\n }\n}\n\nclass FakeVAD extends VAD {\n label = 'fake-vad';\n private events: VADEvent[];\n constructor(events: VADEvent[]) {\n super({ updateInterval: 1 });\n this.events = events;\n }\n stream(): any {\n return new FakeVADStream(this.events);\n }\n}\n\nconst alwaysTrueTurnDetector: _TurnDetector = {\n supportsLanguage: async () => true,\n unlikelyThreshold: async () => undefined,\n predictEndOfTurn: async () => 1.0,\n};\n\ndescribe('AudioRecognition user_turn span parity', () => {\n initializeLogger({ pretty: false, level: 'silent' });\n\n it('creates user_turn and parents eou_detection under it (stt mode)', async () => {\n const { exporter } = setupInMemoryTracing();\n\n const hooks = {\n onStartOfSpeech: vi.fn(),\n onVADInferenceDone: vi.fn(),\n onEndOfSpeech: vi.fn(),\n onInterimTranscript: vi.fn(),\n onFinalTranscript: vi.fn(),\n onPreemptiveGeneration: vi.fn(),\n retrieveChatCtx: () =>\n ({\n copy() {\n return this;\n },\n addMessage() {},\n toJSON() {\n return { items: [] };\n },\n }) as any,\n onEndOfTurn: vi.fn(async () => true),\n };\n\n const sttEvents: SpeechEvent[] = [\n { type: SpeechEventType.START_OF_SPEECH },\n {\n type: SpeechEventType.FINAL_TRANSCRIPT,\n alternatives: [\n {\n language: 'en',\n text: 'hello',\n startTime: 0,\n endTime: 0,\n confidence: 0.9,\n },\n ],\n },\n { type: SpeechEventType.END_OF_SPEECH },\n ];\n\n const sttNode = async () =>\n new ReadableStream<SpeechEvent>({\n start(controller) {\n for (const ev of sttEvents) controller.enqueue(ev);\n controller.close();\n },\n });\n\n const ar = new AudioRecognition({\n recognitionHooks: hooks as any,\n stt: sttNode as any,\n vad: undefined,\n turnDetector: alwaysTrueTurnDetector,\n turnDetectionMode: 'stt',\n minEndpointingDelay: 0,\n maxEndpointingDelay: 0,\n sttModel: 'deepgram-nova2',\n sttProvider: 'deepgram',\n getLinkedParticipant: () => ({ sid: 'p1', identity: 'bob', kind: ParticipantKind.AGENT }),\n });\n\n await ar.start();\n // allow background task to drain\n await new Promise((r) => setTimeout(r, 20));\n await ar.close();\n\n const spans = exporter.getFinishedSpans();\n const userTurn = spanByName(spans, 'user_turn');\n const eou = spanByName(spans, 'eou_detection');\n expect(userTurn, 'user_turn span missing').toBeTruthy();\n expect(eou, 'eou_detection span missing').toBeTruthy();\n\n expect(eou.parentSpanId).toBe(userTurn.spanContext().spanId);\n\n // creation-time attributes\n expect(userTurn.attributes['lk.participant_id']).toBe('p1');\n expect(userTurn.attributes['lk.participant_identity']).toBe('bob');\n expect(userTurn.attributes['lk.participant_kind']).toBe('AGENT');\n expect(userTurn.attributes['gen_ai.request.model']).toBe('deepgram-nova2');\n expect(userTurn.attributes['gen_ai.provider.name']).toBe('deepgram');\n\n // end-of-turn attributes\n expect(userTurn.attributes['lk.user_transcript']).toContain('hello');\n expect(userTurn.attributes['lk.transcript_confidence']).toBeGreaterThan(0);\n });\n\n it('creates user_turn from VAD startTime (vad mode) and keeps same parenting', async () => {\n const { exporter } = setupInMemoryTracing();\n\n const hooks = {\n onStartOfSpeech: vi.fn(),\n onVADInferenceDone: vi.fn(),\n onEndOfSpeech: vi.fn(),\n onInterimTranscript: vi.fn(),\n onFinalTranscript: vi.fn(),\n onPreemptiveGeneration: vi.fn(),\n retrieveChatCtx: () =>\n ({\n copy() {\n return this;\n },\n addMessage() {},\n toJSON() {\n return { items: [] };\n },\n }) as any,\n onEndOfTurn: vi.fn(async () => true),\n };\n\n const now = Date.now();\n const vadEvents: VADEvent[] = [\n {\n type: VADEventType.START_OF_SPEECH,\n samplesIndex: 0,\n timestamp: now,\n speechDuration: 100,\n silenceDuration: 0,\n frames: [],\n probability: 0,\n inferenceDuration: 0,\n speaking: true,\n rawAccumulatedSilence: 0,\n rawAccumulatedSpeech: 0,\n },\n {\n type: VADEventType.END_OF_SPEECH,\n samplesIndex: 0,\n timestamp: now + 200,\n speechDuration: 100,\n silenceDuration: 100,\n frames: [],\n probability: 0,\n inferenceDuration: 0,\n speaking: false,\n rawAccumulatedSilence: 0,\n rawAccumulatedSpeech: 0,\n },\n ];\n\n const sttEvents: SpeechEvent[] = [\n {\n type: SpeechEventType.FINAL_TRANSCRIPT,\n alternatives: [\n {\n language: 'en',\n text: 'test',\n startTime: 0,\n endTime: 0,\n confidence: 0.8,\n },\n ],\n },\n ];\n\n const sttNode = async () =>\n new ReadableStream<SpeechEvent>({\n start(controller) {\n for (const ev of sttEvents) controller.enqueue(ev);\n controller.close();\n },\n });\n\n const ar = new AudioRecognition({\n recognitionHooks: hooks as any,\n stt: sttNode as any,\n vad: new FakeVAD(vadEvents) as any,\n turnDetector: alwaysTrueTurnDetector,\n turnDetectionMode: 'vad',\n minEndpointingDelay: 0,\n maxEndpointingDelay: 0,\n sttModel: 'stt-model',\n sttProvider: 'stt-provider',\n getLinkedParticipant: () => ({ sid: 'p2', identity: 'alice', kind: ParticipantKind.AGENT }),\n });\n\n await ar.start();\n await new Promise((r) => setTimeout(r, 20));\n await ar.close();\n\n const spans = exporter.getFinishedSpans();\n const userTurn = spanByName(spans, 'user_turn');\n const eou = spanByName(spans, 'eou_detection');\n expect(userTurn).toBeTruthy();\n expect(eou).toBeTruthy();\n expect(eou.parentSpanId).toBe(userTurn.spanContext().spanId);\n\n expect(hooks.onStartOfSpeech).toHaveBeenCalled();\n expect(hooks.onEndOfSpeech).toHaveBeenCalled();\n });\n});\n"],"mappings":";AAGA,sBAAgC;AAChC,4BAA0D;AAC1D,4BAAmC;AACnC,oBAAyC;AACzC,iBAAiC;AACjC,iBAAkD;AAClD,uBAAkC;AAClC,iBAAiE;AACjE,+BAAqD;AAErD,SAAS,uBAAuB;AAC9B,QAAM,WAAW,IAAI,2CAAqB;AAC1C,QAAM,WAAW,IAAI,yCAAmB;AACxC,WAAS,iBAAiB,IAAI,0CAAoB,QAAQ,CAAC;AAC3D,WAAS,SAAS;AAClB,0CAAkB,QAAQ;AAC1B,SAAO,EAAE,SAAS;AACpB;AAEA,SAAS,WAAW,OAAc,MAAc;AAC9C,SAAO,MAAM,KAAK,CAAC,MAAM,EAAE,SAAS,IAAI;AAC1C;AAEA,MAAM,sBAAuB,OAA4C;AAAA;AAAA;AAAA,EAG/D;AAAA,EACA,MAAM;AAAA,EACd,YAAY,QAAoB;AAC9B,UAAM;AACN,SAAK,SAAS;AAAA,EAChB;AAAA,EACA,oBAAoB;AAAA,EAAC;AAAA,EACrB,oBAAoB;AAAA,EAAC;AAAA,EACrB,QAAQ;AAAA,EAAC;AAAA,EACT,CAAC,OAAO,aAAa,IAAI;AACvB,WAAO;AAAA,EACT;AAAA,EACA,MAAM,OAA0C;AAC9C,QAAI,KAAK,OAAO,KAAK,OAAO,QAAQ;AAClC,aAAO,EAAE,MAAM,MAAM,OAAO,OAAU;AAAA,IACxC;AACA,UAAM,QAAQ,KAAK,OAAO,KAAK,KAAK;AACpC,WAAO,EAAE,MAAM,OAAO,MAAM;AAAA,EAC9B;AACF;AAEA,MAAM,gBAAgB,eAAI;AAAA,EACxB,QAAQ;AAAA,EACA;AAAA,EACR,YAAY,QAAoB;AAC9B,UAAM,EAAE,gBAAgB,EAAE,CAAC;AAC3B,SAAK,SAAS;AAAA,EAChB;AAAA,EACA,SAAc;AACZ,WAAO,IAAI,cAAc,KAAK,MAAM;AAAA,EACtC;AACF;AAEA,MAAM,yBAAwC;AAAA,EAC5C,kBAAkB,YAAY;AAAA,EAC9B,mBAAmB,YAAY;AAAA,EAC/B,kBAAkB,YAAY;AAChC;AAAA,IAEA,wBAAS,0CAA0C,MAAM;AACvD,mCAAiB,EAAE,QAAQ,OAAO,OAAO,SAAS,CAAC;AAEnD,wBAAG,mEAAmE,YAAY;AAChF,UAAM,EAAE,SAAS,IAAI,qBAAqB;AAE1C,UAAM,QAAQ;AAAA,MACZ,iBAAiB,iBAAG,GAAG;AAAA,MACvB,oBAAoB,iBAAG,GAAG;AAAA,MAC1B,eAAe,iBAAG,GAAG;AAAA,MACrB,qBAAqB,iBAAG,GAAG;AAAA,MAC3B,mBAAmB,iBAAG,GAAG;AAAA,MACzB,wBAAwB,iBAAG,GAAG;AAAA,MAC9B,iBAAiB,OACd;AAAA,QACC,OAAO;AACL,iBAAO;AAAA,QACT;AAAA,QACA,aAAa;AAAA,QAAC;AAAA,QACd,SAAS;AACP,iBAAO,EAAE,OAAO,CAAC,EAAE;AAAA,QACrB;AAAA,MACF;AAAA,MACF,aAAa,iBAAG,GAAG,YAAY,IAAI;AAAA,IACrC;AAEA,UAAM,YAA2B;AAAA,MAC/B,EAAE,MAAM,2BAAgB,gBAAgB;AAAA,MACxC;AAAA,QACE,MAAM,2BAAgB;AAAA,QACtB,cAAc;AAAA,UACZ;AAAA,YACE,UAAU;AAAA,YACV,MAAM;AAAA,YACN,WAAW;AAAA,YACX,SAAS;AAAA,YACT,YAAY;AAAA,UACd;AAAA,QACF;AAAA,MACF;AAAA,MACA,EAAE,MAAM,2BAAgB,cAAc;AAAA,IACxC;AAEA,UAAM,UAAU,YACd,IAAI,eAA4B;AAAA,MAC9B,MAAM,YAAY;AAChB,mBAAW,MAAM,UAAW,YAAW,QAAQ,EAAE;AACjD,mBAAW,MAAM;AAAA,MACnB;AAAA,IACF,CAAC;AAEH,UAAM,KAAK,IAAI,0CAAiB;AAAA,MAC9B,kBAAkB;AAAA,MAClB,KAAK;AAAA,MACL,KAAK;AAAA,MACL,cAAc;AAAA,MACd,mBAAmB;AAAA,MACnB,qBAAqB;AAAA,MACrB,qBAAqB;AAAA,MACrB,UAAU;AAAA,MACV,aAAa;AAAA,MACb,sBAAsB,OAAO,EAAE,KAAK,MAAM,UAAU,OAAO,MAAM,gCAAgB,MAAM;AAAA,IACzF,CAAC;AAED,UAAM,GAAG,MAAM;AAEf,UAAM,IAAI,QAAQ,CAAC,MAAM,WAAW,GAAG,EAAE,CAAC;AAC1C,UAAM,GAAG,MAAM;AAEf,UAAM,QAAQ,SAAS,iBAAiB;AACxC,UAAM,WAAW,WAAW,OAAO,WAAW;AAC9C,UAAM,MAAM,WAAW,OAAO,eAAe;AAC7C,8BAAO,UAAU,wBAAwB,EAAE,WAAW;AACtD,8BAAO,KAAK,4BAA4B,EAAE,WAAW;AAErD,8BAAO,IAAI,YAAY,EAAE,KAAK,SAAS,YAAY,EAAE,MAAM;AAG3D,8BAAO,SAAS,WAAW,mBAAmB,CAAC,EAAE,KAAK,IAAI;AAC1D,8BAAO,SAAS,WAAW,yBAAyB,CAAC,EAAE,KAAK,KAAK;AACjE,8BAAO,SAAS,WAAW,qBAAqB,CAAC,EAAE,KAAK,OAAO;AAC/D,8BAAO,SAAS,WAAW,sBAAsB,CAAC,EAAE,KAAK,gBAAgB;AACzE,8BAAO,SAAS,WAAW,sBAAsB,CAAC,EAAE,KAAK,UAAU;AAGnE,8BAAO,SAAS,WAAW,oBAAoB,CAAC,EAAE,UAAU,OAAO;AACnE,8BAAO,SAAS,WAAW,0BAA0B,CAAC,EAAE,gBAAgB,CAAC;AAAA,EAC3E,CAAC;AAED,wBAAG,4EAA4E,YAAY;AACzF,UAAM,EAAE,SAAS,IAAI,qBAAqB;AAE1C,UAAM,QAAQ;AAAA,MACZ,iBAAiB,iBAAG,GAAG;AAAA,MACvB,oBAAoB,iBAAG,GAAG;AAAA,MAC1B,eAAe,iBAAG,GAAG;AAAA,MACrB,qBAAqB,iBAAG,GAAG;AAAA,MAC3B,mBAAmB,iBAAG,GAAG;AAAA,MACzB,wBAAwB,iBAAG,GAAG;AAAA,MAC9B,iBAAiB,OACd;AAAA,QACC,OAAO;AACL,iBAAO;AAAA,QACT;AAAA,QACA,aAAa;AAAA,QAAC;AAAA,QACd,SAAS;AACP,iBAAO,EAAE,OAAO,CAAC,EAAE;AAAA,QACrB;AAAA,MACF;AAAA,MACF,aAAa,iBAAG,GAAG,YAAY,IAAI;AAAA,IACrC;AAEA,UAAM,MAAM,KAAK,IAAI;AACrB,UAAM,YAAwB;AAAA,MAC5B;AAAA,QACE,MAAM,wBAAa;AAAA,QACnB,cAAc;AAAA,QACd,WAAW;AAAA,QACX,gBAAgB;AAAA,QAChB,iBAAiB;AAAA,QACjB,QAAQ,CAAC;AAAA,QACT,aAAa;AAAA,QACb,mBAAmB;AAAA,QACnB,UAAU;AAAA,QACV,uBAAuB;AAAA,QACvB,sBAAsB;AAAA,MACxB;AAAA,MACA;AAAA,QACE,MAAM,wBAAa;AAAA,QACnB,cAAc;AAAA,QACd,WAAW,MAAM;AAAA,QACjB,gBAAgB;AAAA,QAChB,iBAAiB;AAAA,QACjB,QAAQ,CAAC;AAAA,QACT,aAAa;AAAA,QACb,mBAAmB;AAAA,QACnB,UAAU;AAAA,QACV,uBAAuB;AAAA,QACvB,sBAAsB;AAAA,MACxB;AAAA,IACF;AAEA,UAAM,YAA2B;AAAA,MAC/B;AAAA,QACE,MAAM,2BAAgB;AAAA,QACtB,cAAc;AAAA,UACZ;AAAA,YACE,UAAU;AAAA,YACV,MAAM;AAAA,YACN,WAAW;AAAA,YACX,SAAS;AAAA,YACT,YAAY;AAAA,UACd;AAAA,QACF;AAAA,MACF;AAAA,IACF;AAEA,UAAM,UAAU,YACd,IAAI,eAA4B;AAAA,MAC9B,MAAM,YAAY;AAChB,mBAAW,MAAM,UAAW,YAAW,QAAQ,EAAE;AACjD,mBAAW,MAAM;AAAA,MACnB;AAAA,IACF,CAAC;AAEH,UAAM,KAAK,IAAI,0CAAiB;AAAA,MAC9B,kBAAkB;AAAA,MAClB,KAAK;AAAA,MACL,KAAK,IAAI,QAAQ,SAAS;AAAA,MAC1B,cAAc;AAAA,MACd,mBAAmB;AAAA,MACnB,qBAAqB;AAAA,MACrB,qBAAqB;AAAA,MACrB,UAAU;AAAA,MACV,aAAa;AAAA,MACb,sBAAsB,OAAO,EAAE,KAAK,MAAM,UAAU,SAAS,MAAM,gCAAgB,MAAM;AAAA,IAC3F,CAAC;AAED,UAAM,GAAG,MAAM;AACf,UAAM,IAAI,QAAQ,CAAC,MAAM,WAAW,GAAG,EAAE,CAAC;AAC1C,UAAM,GAAG,MAAM;AAEf,UAAM,QAAQ,SAAS,iBAAiB;AACxC,UAAM,WAAW,WAAW,OAAO,WAAW;AAC9C,UAAM,MAAM,WAAW,OAAO,eAAe;AAC7C,8BAAO,QAAQ,EAAE,WAAW;AAC5B,8BAAO,GAAG,EAAE,WAAW;AACvB,8BAAO,IAAI,YAAY,EAAE,KAAK,SAAS,YAAY,EAAE,MAAM;AAE3D,8BAAO,MAAM,eAAe,EAAE,iBAAiB;AAC/C,8BAAO,MAAM,aAAa,EAAE,iBAAiB;AAAA,EAC/C,CAAC;AACH,CAAC;","names":[]}
|
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
import { ParticipantKind } from "@livekit/rtc-node";
|
|
2
|
+
import { InMemorySpanExporter, SimpleSpanProcessor } from "@opentelemetry/sdk-trace-base";
|
|
3
|
+
import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node";
|
|
4
|
+
import { describe, expect, it, vi } from "vitest";
|
|
5
|
+
import { initializeLogger } from "../log.js";
|
|
6
|
+
import { SpeechEventType } from "../stt/stt.js";
|
|
7
|
+
import { setTracerProvider } from "../telemetry/index.js";
|
|
8
|
+
import { VAD, VADEventType } from "../vad.js";
|
|
9
|
+
import { AudioRecognition } from "./audio_recognition.js";
|
|
10
|
+
function setupInMemoryTracing() {
|
|
11
|
+
const exporter = new InMemorySpanExporter();
|
|
12
|
+
const provider = new NodeTracerProvider();
|
|
13
|
+
provider.addSpanProcessor(new SimpleSpanProcessor(exporter));
|
|
14
|
+
provider.register();
|
|
15
|
+
setTracerProvider(provider);
|
|
16
|
+
return { exporter };
|
|
17
|
+
}
|
|
18
|
+
function spanByName(spans, name) {
|
|
19
|
+
return spans.find((s) => s.name === name);
|
|
20
|
+
}
|
|
21
|
+
class FakeVADStream extends Object {
|
|
22
|
+
// We intentionally avoid extending the real VADStream (it is not exported as a value in JS output
|
|
23
|
+
// in some bundling contexts). Instead we emulate the async iterator shape used by AudioRecognition.
|
|
24
|
+
events;
|
|
25
|
+
idx = 0;
|
|
26
|
+
constructor(events) {
|
|
27
|
+
super();
|
|
28
|
+
this.events = events;
|
|
29
|
+
}
|
|
30
|
+
updateInputStream() {
|
|
31
|
+
}
|
|
32
|
+
detachInputStream() {
|
|
33
|
+
}
|
|
34
|
+
close() {
|
|
35
|
+
}
|
|
36
|
+
[Symbol.asyncIterator]() {
|
|
37
|
+
return this;
|
|
38
|
+
}
|
|
39
|
+
async next() {
|
|
40
|
+
if (this.idx >= this.events.length) {
|
|
41
|
+
return { done: true, value: void 0 };
|
|
42
|
+
}
|
|
43
|
+
const value = this.events[this.idx++];
|
|
44
|
+
return { done: false, value };
|
|
45
|
+
}
|
|
46
|
+
}
|
|
47
|
+
class FakeVAD extends VAD {
|
|
48
|
+
label = "fake-vad";
|
|
49
|
+
events;
|
|
50
|
+
constructor(events) {
|
|
51
|
+
super({ updateInterval: 1 });
|
|
52
|
+
this.events = events;
|
|
53
|
+
}
|
|
54
|
+
stream() {
|
|
55
|
+
return new FakeVADStream(this.events);
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
const alwaysTrueTurnDetector = {
|
|
59
|
+
supportsLanguage: async () => true,
|
|
60
|
+
unlikelyThreshold: async () => void 0,
|
|
61
|
+
predictEndOfTurn: async () => 1
|
|
62
|
+
};
|
|
63
|
+
describe("AudioRecognition user_turn span parity", () => {
|
|
64
|
+
initializeLogger({ pretty: false, level: "silent" });
|
|
65
|
+
it("creates user_turn and parents eou_detection under it (stt mode)", async () => {
|
|
66
|
+
const { exporter } = setupInMemoryTracing();
|
|
67
|
+
const hooks = {
|
|
68
|
+
onStartOfSpeech: vi.fn(),
|
|
69
|
+
onVADInferenceDone: vi.fn(),
|
|
70
|
+
onEndOfSpeech: vi.fn(),
|
|
71
|
+
onInterimTranscript: vi.fn(),
|
|
72
|
+
onFinalTranscript: vi.fn(),
|
|
73
|
+
onPreemptiveGeneration: vi.fn(),
|
|
74
|
+
retrieveChatCtx: () => ({
|
|
75
|
+
copy() {
|
|
76
|
+
return this;
|
|
77
|
+
},
|
|
78
|
+
addMessage() {
|
|
79
|
+
},
|
|
80
|
+
toJSON() {
|
|
81
|
+
return { items: [] };
|
|
82
|
+
}
|
|
83
|
+
}),
|
|
84
|
+
onEndOfTurn: vi.fn(async () => true)
|
|
85
|
+
};
|
|
86
|
+
const sttEvents = [
|
|
87
|
+
{ type: SpeechEventType.START_OF_SPEECH },
|
|
88
|
+
{
|
|
89
|
+
type: SpeechEventType.FINAL_TRANSCRIPT,
|
|
90
|
+
alternatives: [
|
|
91
|
+
{
|
|
92
|
+
language: "en",
|
|
93
|
+
text: "hello",
|
|
94
|
+
startTime: 0,
|
|
95
|
+
endTime: 0,
|
|
96
|
+
confidence: 0.9
|
|
97
|
+
}
|
|
98
|
+
]
|
|
99
|
+
},
|
|
100
|
+
{ type: SpeechEventType.END_OF_SPEECH }
|
|
101
|
+
];
|
|
102
|
+
const sttNode = async () => new ReadableStream({
|
|
103
|
+
start(controller) {
|
|
104
|
+
for (const ev of sttEvents) controller.enqueue(ev);
|
|
105
|
+
controller.close();
|
|
106
|
+
}
|
|
107
|
+
});
|
|
108
|
+
const ar = new AudioRecognition({
|
|
109
|
+
recognitionHooks: hooks,
|
|
110
|
+
stt: sttNode,
|
|
111
|
+
vad: void 0,
|
|
112
|
+
turnDetector: alwaysTrueTurnDetector,
|
|
113
|
+
turnDetectionMode: "stt",
|
|
114
|
+
minEndpointingDelay: 0,
|
|
115
|
+
maxEndpointingDelay: 0,
|
|
116
|
+
sttModel: "deepgram-nova2",
|
|
117
|
+
sttProvider: "deepgram",
|
|
118
|
+
getLinkedParticipant: () => ({ sid: "p1", identity: "bob", kind: ParticipantKind.AGENT })
|
|
119
|
+
});
|
|
120
|
+
await ar.start();
|
|
121
|
+
await new Promise((r) => setTimeout(r, 20));
|
|
122
|
+
await ar.close();
|
|
123
|
+
const spans = exporter.getFinishedSpans();
|
|
124
|
+
const userTurn = spanByName(spans, "user_turn");
|
|
125
|
+
const eou = spanByName(spans, "eou_detection");
|
|
126
|
+
expect(userTurn, "user_turn span missing").toBeTruthy();
|
|
127
|
+
expect(eou, "eou_detection span missing").toBeTruthy();
|
|
128
|
+
expect(eou.parentSpanId).toBe(userTurn.spanContext().spanId);
|
|
129
|
+
expect(userTurn.attributes["lk.participant_id"]).toBe("p1");
|
|
130
|
+
expect(userTurn.attributes["lk.participant_identity"]).toBe("bob");
|
|
131
|
+
expect(userTurn.attributes["lk.participant_kind"]).toBe("AGENT");
|
|
132
|
+
expect(userTurn.attributes["gen_ai.request.model"]).toBe("deepgram-nova2");
|
|
133
|
+
expect(userTurn.attributes["gen_ai.provider.name"]).toBe("deepgram");
|
|
134
|
+
expect(userTurn.attributes["lk.user_transcript"]).toContain("hello");
|
|
135
|
+
expect(userTurn.attributes["lk.transcript_confidence"]).toBeGreaterThan(0);
|
|
136
|
+
});
|
|
137
|
+
it("creates user_turn from VAD startTime (vad mode) and keeps same parenting", async () => {
|
|
138
|
+
const { exporter } = setupInMemoryTracing();
|
|
139
|
+
const hooks = {
|
|
140
|
+
onStartOfSpeech: vi.fn(),
|
|
141
|
+
onVADInferenceDone: vi.fn(),
|
|
142
|
+
onEndOfSpeech: vi.fn(),
|
|
143
|
+
onInterimTranscript: vi.fn(),
|
|
144
|
+
onFinalTranscript: vi.fn(),
|
|
145
|
+
onPreemptiveGeneration: vi.fn(),
|
|
146
|
+
retrieveChatCtx: () => ({
|
|
147
|
+
copy() {
|
|
148
|
+
return this;
|
|
149
|
+
},
|
|
150
|
+
addMessage() {
|
|
151
|
+
},
|
|
152
|
+
toJSON() {
|
|
153
|
+
return { items: [] };
|
|
154
|
+
}
|
|
155
|
+
}),
|
|
156
|
+
onEndOfTurn: vi.fn(async () => true)
|
|
157
|
+
};
|
|
158
|
+
const now = Date.now();
|
|
159
|
+
const vadEvents = [
|
|
160
|
+
{
|
|
161
|
+
type: VADEventType.START_OF_SPEECH,
|
|
162
|
+
samplesIndex: 0,
|
|
163
|
+
timestamp: now,
|
|
164
|
+
speechDuration: 100,
|
|
165
|
+
silenceDuration: 0,
|
|
166
|
+
frames: [],
|
|
167
|
+
probability: 0,
|
|
168
|
+
inferenceDuration: 0,
|
|
169
|
+
speaking: true,
|
|
170
|
+
rawAccumulatedSilence: 0,
|
|
171
|
+
rawAccumulatedSpeech: 0
|
|
172
|
+
},
|
|
173
|
+
{
|
|
174
|
+
type: VADEventType.END_OF_SPEECH,
|
|
175
|
+
samplesIndex: 0,
|
|
176
|
+
timestamp: now + 200,
|
|
177
|
+
speechDuration: 100,
|
|
178
|
+
silenceDuration: 100,
|
|
179
|
+
frames: [],
|
|
180
|
+
probability: 0,
|
|
181
|
+
inferenceDuration: 0,
|
|
182
|
+
speaking: false,
|
|
183
|
+
rawAccumulatedSilence: 0,
|
|
184
|
+
rawAccumulatedSpeech: 0
|
|
185
|
+
}
|
|
186
|
+
];
|
|
187
|
+
const sttEvents = [
|
|
188
|
+
{
|
|
189
|
+
type: SpeechEventType.FINAL_TRANSCRIPT,
|
|
190
|
+
alternatives: [
|
|
191
|
+
{
|
|
192
|
+
language: "en",
|
|
193
|
+
text: "test",
|
|
194
|
+
startTime: 0,
|
|
195
|
+
endTime: 0,
|
|
196
|
+
confidence: 0.8
|
|
197
|
+
}
|
|
198
|
+
]
|
|
199
|
+
}
|
|
200
|
+
];
|
|
201
|
+
const sttNode = async () => new ReadableStream({
|
|
202
|
+
start(controller) {
|
|
203
|
+
for (const ev of sttEvents) controller.enqueue(ev);
|
|
204
|
+
controller.close();
|
|
205
|
+
}
|
|
206
|
+
});
|
|
207
|
+
const ar = new AudioRecognition({
|
|
208
|
+
recognitionHooks: hooks,
|
|
209
|
+
stt: sttNode,
|
|
210
|
+
vad: new FakeVAD(vadEvents),
|
|
211
|
+
turnDetector: alwaysTrueTurnDetector,
|
|
212
|
+
turnDetectionMode: "vad",
|
|
213
|
+
minEndpointingDelay: 0,
|
|
214
|
+
maxEndpointingDelay: 0,
|
|
215
|
+
sttModel: "stt-model",
|
|
216
|
+
sttProvider: "stt-provider",
|
|
217
|
+
getLinkedParticipant: () => ({ sid: "p2", identity: "alice", kind: ParticipantKind.AGENT })
|
|
218
|
+
});
|
|
219
|
+
await ar.start();
|
|
220
|
+
await new Promise((r) => setTimeout(r, 20));
|
|
221
|
+
await ar.close();
|
|
222
|
+
const spans = exporter.getFinishedSpans();
|
|
223
|
+
const userTurn = spanByName(spans, "user_turn");
|
|
224
|
+
const eou = spanByName(spans, "eou_detection");
|
|
225
|
+
expect(userTurn).toBeTruthy();
|
|
226
|
+
expect(eou).toBeTruthy();
|
|
227
|
+
expect(eou.parentSpanId).toBe(userTurn.spanContext().spanId);
|
|
228
|
+
expect(hooks.onStartOfSpeech).toHaveBeenCalled();
|
|
229
|
+
expect(hooks.onEndOfSpeech).toHaveBeenCalled();
|
|
230
|
+
});
|
|
231
|
+
});
|
|
232
|
+
//# sourceMappingURL=audio_recognition_span.test.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../../src/voice/audio_recognition_span.test.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2026 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport { ParticipantKind } from '@livekit/rtc-node';\nimport { InMemorySpanExporter, SimpleSpanProcessor } from '@opentelemetry/sdk-trace-base';\nimport { NodeTracerProvider } from '@opentelemetry/sdk-trace-node';\nimport { describe, expect, it, vi } from 'vitest';\nimport { initializeLogger } from '../log.js';\nimport { type SpeechEvent, SpeechEventType } from '../stt/stt.js';\nimport { setTracerProvider } from '../telemetry/index.js';\nimport { VAD, type VADEvent, VADEventType, type VADStream } from '../vad.js';\nimport { AudioRecognition, type _TurnDetector } from './audio_recognition.js';\n\nfunction setupInMemoryTracing() {\n const exporter = new InMemorySpanExporter();\n const provider = new NodeTracerProvider();\n provider.addSpanProcessor(new SimpleSpanProcessor(exporter));\n provider.register();\n setTracerProvider(provider);\n return { exporter };\n}\n\nfunction spanByName(spans: any[], name: string) {\n return spans.find((s) => s.name === name);\n}\n\nclass FakeVADStream extends (Object as unknown as { new (): VADStream }) {\n // We intentionally avoid extending the real VADStream (it is not exported as a value in JS output\n // in some bundling contexts). Instead we emulate the async iterator shape used by AudioRecognition.\n private events: VADEvent[];\n private idx = 0;\n constructor(events: VADEvent[]) {\n super();\n this.events = events;\n }\n updateInputStream() {}\n detachInputStream() {}\n close() {}\n [Symbol.asyncIterator]() {\n return this;\n }\n async next(): Promise<IteratorResult<VADEvent>> {\n if (this.idx >= this.events.length) {\n return { done: true, value: undefined };\n }\n const value = this.events[this.idx++]!;\n return { done: false, value };\n }\n}\n\nclass FakeVAD extends VAD {\n label = 'fake-vad';\n private events: VADEvent[];\n constructor(events: VADEvent[]) {\n super({ updateInterval: 1 });\n this.events = events;\n }\n stream(): any {\n return new FakeVADStream(this.events);\n }\n}\n\nconst alwaysTrueTurnDetector: _TurnDetector = {\n supportsLanguage: async () => true,\n unlikelyThreshold: async () => undefined,\n predictEndOfTurn: async () => 1.0,\n};\n\ndescribe('AudioRecognition user_turn span parity', () => {\n initializeLogger({ pretty: false, level: 'silent' });\n\n it('creates user_turn and parents eou_detection under it (stt mode)', async () => {\n const { exporter } = setupInMemoryTracing();\n\n const hooks = {\n onStartOfSpeech: vi.fn(),\n onVADInferenceDone: vi.fn(),\n onEndOfSpeech: vi.fn(),\n onInterimTranscript: vi.fn(),\n onFinalTranscript: vi.fn(),\n onPreemptiveGeneration: vi.fn(),\n retrieveChatCtx: () =>\n ({\n copy() {\n return this;\n },\n addMessage() {},\n toJSON() {\n return { items: [] };\n },\n }) as any,\n onEndOfTurn: vi.fn(async () => true),\n };\n\n const sttEvents: SpeechEvent[] = [\n { type: SpeechEventType.START_OF_SPEECH },\n {\n type: SpeechEventType.FINAL_TRANSCRIPT,\n alternatives: [\n {\n language: 'en',\n text: 'hello',\n startTime: 0,\n endTime: 0,\n confidence: 0.9,\n },\n ],\n },\n { type: SpeechEventType.END_OF_SPEECH },\n ];\n\n const sttNode = async () =>\n new ReadableStream<SpeechEvent>({\n start(controller) {\n for (const ev of sttEvents) controller.enqueue(ev);\n controller.close();\n },\n });\n\n const ar = new AudioRecognition({\n recognitionHooks: hooks as any,\n stt: sttNode as any,\n vad: undefined,\n turnDetector: alwaysTrueTurnDetector,\n turnDetectionMode: 'stt',\n minEndpointingDelay: 0,\n maxEndpointingDelay: 0,\n sttModel: 'deepgram-nova2',\n sttProvider: 'deepgram',\n getLinkedParticipant: () => ({ sid: 'p1', identity: 'bob', kind: ParticipantKind.AGENT }),\n });\n\n await ar.start();\n // allow background task to drain\n await new Promise((r) => setTimeout(r, 20));\n await ar.close();\n\n const spans = exporter.getFinishedSpans();\n const userTurn = spanByName(spans, 'user_turn');\n const eou = spanByName(spans, 'eou_detection');\n expect(userTurn, 'user_turn span missing').toBeTruthy();\n expect(eou, 'eou_detection span missing').toBeTruthy();\n\n expect(eou.parentSpanId).toBe(userTurn.spanContext().spanId);\n\n // creation-time attributes\n expect(userTurn.attributes['lk.participant_id']).toBe('p1');\n expect(userTurn.attributes['lk.participant_identity']).toBe('bob');\n expect(userTurn.attributes['lk.participant_kind']).toBe('AGENT');\n expect(userTurn.attributes['gen_ai.request.model']).toBe('deepgram-nova2');\n expect(userTurn.attributes['gen_ai.provider.name']).toBe('deepgram');\n\n // end-of-turn attributes\n expect(userTurn.attributes['lk.user_transcript']).toContain('hello');\n expect(userTurn.attributes['lk.transcript_confidence']).toBeGreaterThan(0);\n });\n\n it('creates user_turn from VAD startTime (vad mode) and keeps same parenting', async () => {\n const { exporter } = setupInMemoryTracing();\n\n const hooks = {\n onStartOfSpeech: vi.fn(),\n onVADInferenceDone: vi.fn(),\n onEndOfSpeech: vi.fn(),\n onInterimTranscript: vi.fn(),\n onFinalTranscript: vi.fn(),\n onPreemptiveGeneration: vi.fn(),\n retrieveChatCtx: () =>\n ({\n copy() {\n return this;\n },\n addMessage() {},\n toJSON() {\n return { items: [] };\n },\n }) as any,\n onEndOfTurn: vi.fn(async () => true),\n };\n\n const now = Date.now();\n const vadEvents: VADEvent[] = [\n {\n type: VADEventType.START_OF_SPEECH,\n samplesIndex: 0,\n timestamp: now,\n speechDuration: 100,\n silenceDuration: 0,\n frames: [],\n probability: 0,\n inferenceDuration: 0,\n speaking: true,\n rawAccumulatedSilence: 0,\n rawAccumulatedSpeech: 0,\n },\n {\n type: VADEventType.END_OF_SPEECH,\n samplesIndex: 0,\n timestamp: now + 200,\n speechDuration: 100,\n silenceDuration: 100,\n frames: [],\n probability: 0,\n inferenceDuration: 0,\n speaking: false,\n rawAccumulatedSilence: 0,\n rawAccumulatedSpeech: 0,\n },\n ];\n\n const sttEvents: SpeechEvent[] = [\n {\n type: SpeechEventType.FINAL_TRANSCRIPT,\n alternatives: [\n {\n language: 'en',\n text: 'test',\n startTime: 0,\n endTime: 0,\n confidence: 0.8,\n },\n ],\n },\n ];\n\n const sttNode = async () =>\n new ReadableStream<SpeechEvent>({\n start(controller) {\n for (const ev of sttEvents) controller.enqueue(ev);\n controller.close();\n },\n });\n\n const ar = new AudioRecognition({\n recognitionHooks: hooks as any,\n stt: sttNode as any,\n vad: new FakeVAD(vadEvents) as any,\n turnDetector: alwaysTrueTurnDetector,\n turnDetectionMode: 'vad',\n minEndpointingDelay: 0,\n maxEndpointingDelay: 0,\n sttModel: 'stt-model',\n sttProvider: 'stt-provider',\n getLinkedParticipant: () => ({ sid: 'p2', identity: 'alice', kind: ParticipantKind.AGENT }),\n });\n\n await ar.start();\n await new Promise((r) => setTimeout(r, 20));\n await ar.close();\n\n const spans = exporter.getFinishedSpans();\n const userTurn = spanByName(spans, 'user_turn');\n const eou = spanByName(spans, 'eou_detection');\n expect(userTurn).toBeTruthy();\n expect(eou).toBeTruthy();\n expect(eou.parentSpanId).toBe(userTurn.spanContext().spanId);\n\n expect(hooks.onStartOfSpeech).toHaveBeenCalled();\n expect(hooks.onEndOfSpeech).toHaveBeenCalled();\n });\n});\n"],"mappings":"AAGA,SAAS,uBAAuB;AAChC,SAAS,sBAAsB,2BAA2B;AAC1D,SAAS,0BAA0B;AACnC,SAAS,UAAU,QAAQ,IAAI,UAAU;AACzC,SAAS,wBAAwB;AACjC,SAA2B,uBAAuB;AAClD,SAAS,yBAAyB;AAClC,SAAS,KAAoB,oBAAoC;AACjE,SAAS,wBAA4C;AAErD,SAAS,uBAAuB;AAC9B,QAAM,WAAW,IAAI,qBAAqB;AAC1C,QAAM,WAAW,IAAI,mBAAmB;AACxC,WAAS,iBAAiB,IAAI,oBAAoB,QAAQ,CAAC;AAC3D,WAAS,SAAS;AAClB,oBAAkB,QAAQ;AAC1B,SAAO,EAAE,SAAS;AACpB;AAEA,SAAS,WAAW,OAAc,MAAc;AAC9C,SAAO,MAAM,KAAK,CAAC,MAAM,EAAE,SAAS,IAAI;AAC1C;AAEA,MAAM,sBAAuB,OAA4C;AAAA;AAAA;AAAA,EAG/D;AAAA,EACA,MAAM;AAAA,EACd,YAAY,QAAoB;AAC9B,UAAM;AACN,SAAK,SAAS;AAAA,EAChB;AAAA,EACA,oBAAoB;AAAA,EAAC;AAAA,EACrB,oBAAoB;AAAA,EAAC;AAAA,EACrB,QAAQ;AAAA,EAAC;AAAA,EACT,CAAC,OAAO,aAAa,IAAI;AACvB,WAAO;AAAA,EACT;AAAA,EACA,MAAM,OAA0C;AAC9C,QAAI,KAAK,OAAO,KAAK,OAAO,QAAQ;AAClC,aAAO,EAAE,MAAM,MAAM,OAAO,OAAU;AAAA,IACxC;AACA,UAAM,QAAQ,KAAK,OAAO,KAAK,KAAK;AACpC,WAAO,EAAE,MAAM,OAAO,MAAM;AAAA,EAC9B;AACF;AAEA,MAAM,gBAAgB,IAAI;AAAA,EACxB,QAAQ;AAAA,EACA;AAAA,EACR,YAAY,QAAoB;AAC9B,UAAM,EAAE,gBAAgB,EAAE,CAAC;AAC3B,SAAK,SAAS;AAAA,EAChB;AAAA,EACA,SAAc;AACZ,WAAO,IAAI,cAAc,KAAK,MAAM;AAAA,EACtC;AACF;AAEA,MAAM,yBAAwC;AAAA,EAC5C,kBAAkB,YAAY;AAAA,EAC9B,mBAAmB,YAAY;AAAA,EAC/B,kBAAkB,YAAY;AAChC;AAEA,SAAS,0CAA0C,MAAM;AACvD,mBAAiB,EAAE,QAAQ,OAAO,OAAO,SAAS,CAAC;AAEnD,KAAG,mEAAmE,YAAY;AAChF,UAAM,EAAE,SAAS,IAAI,qBAAqB;AAE1C,UAAM,QAAQ;AAAA,MACZ,iBAAiB,GAAG,GAAG;AAAA,MACvB,oBAAoB,GAAG,GAAG;AAAA,MAC1B,eAAe,GAAG,GAAG;AAAA,MACrB,qBAAqB,GAAG,GAAG;AAAA,MAC3B,mBAAmB,GAAG,GAAG;AAAA,MACzB,wBAAwB,GAAG,GAAG;AAAA,MAC9B,iBAAiB,OACd;AAAA,QACC,OAAO;AACL,iBAAO;AAAA,QACT;AAAA,QACA,aAAa;AAAA,QAAC;AAAA,QACd,SAAS;AACP,iBAAO,EAAE,OAAO,CAAC,EAAE;AAAA,QACrB;AAAA,MACF;AAAA,MACF,aAAa,GAAG,GAAG,YAAY,IAAI;AAAA,IACrC;AAEA,UAAM,YAA2B;AAAA,MAC/B,EAAE,MAAM,gBAAgB,gBAAgB;AAAA,MACxC;AAAA,QACE,MAAM,gBAAgB;AAAA,QACtB,cAAc;AAAA,UACZ;AAAA,YACE,UAAU;AAAA,YACV,MAAM;AAAA,YACN,WAAW;AAAA,YACX,SAAS;AAAA,YACT,YAAY;AAAA,UACd;AAAA,QACF;AAAA,MACF;AAAA,MACA,EAAE,MAAM,gBAAgB,cAAc;AAAA,IACxC;AAEA,UAAM,UAAU,YACd,IAAI,eAA4B;AAAA,MAC9B,MAAM,YAAY;AAChB,mBAAW,MAAM,UAAW,YAAW,QAAQ,EAAE;AACjD,mBAAW,MAAM;AAAA,MACnB;AAAA,IACF,CAAC;AAEH,UAAM,KAAK,IAAI,iBAAiB;AAAA,MAC9B,kBAAkB;AAAA,MAClB,KAAK;AAAA,MACL,KAAK;AAAA,MACL,cAAc;AAAA,MACd,mBAAmB;AAAA,MACnB,qBAAqB;AAAA,MACrB,qBAAqB;AAAA,MACrB,UAAU;AAAA,MACV,aAAa;AAAA,MACb,sBAAsB,OAAO,EAAE,KAAK,MAAM,UAAU,OAAO,MAAM,gBAAgB,MAAM;AAAA,IACzF,CAAC;AAED,UAAM,GAAG,MAAM;AAEf,UAAM,IAAI,QAAQ,CAAC,MAAM,WAAW,GAAG,EAAE,CAAC;AAC1C,UAAM,GAAG,MAAM;AAEf,UAAM,QAAQ,SAAS,iBAAiB;AACxC,UAAM,WAAW,WAAW,OAAO,WAAW;AAC9C,UAAM,MAAM,WAAW,OAAO,eAAe;AAC7C,WAAO,UAAU,wBAAwB,EAAE,WAAW;AACtD,WAAO,KAAK,4BAA4B,EAAE,WAAW;AAErD,WAAO,IAAI,YAAY,EAAE,KAAK,SAAS,YAAY,EAAE,MAAM;AAG3D,WAAO,SAAS,WAAW,mBAAmB,CAAC,EAAE,KAAK,IAAI;AAC1D,WAAO,SAAS,WAAW,yBAAyB,CAAC,EAAE,KAAK,KAAK;AACjE,WAAO,SAAS,WAAW,qBAAqB,CAAC,EAAE,KAAK,OAAO;AAC/D,WAAO,SAAS,WAAW,sBAAsB,CAAC,EAAE,KAAK,gBAAgB;AACzE,WAAO,SAAS,WAAW,sBAAsB,CAAC,EAAE,KAAK,UAAU;AAGnE,WAAO,SAAS,WAAW,oBAAoB,CAAC,EAAE,UAAU,OAAO;AACnE,WAAO,SAAS,WAAW,0BAA0B,CAAC,EAAE,gBAAgB,CAAC;AAAA,EAC3E,CAAC;AAED,KAAG,4EAA4E,YAAY;AACzF,UAAM,EAAE,SAAS,IAAI,qBAAqB;AAE1C,UAAM,QAAQ;AAAA,MACZ,iBAAiB,GAAG,GAAG;AAAA,MACvB,oBAAoB,GAAG,GAAG;AAAA,MAC1B,eAAe,GAAG,GAAG;AAAA,MACrB,qBAAqB,GAAG,GAAG;AAAA,MAC3B,mBAAmB,GAAG,GAAG;AAAA,MACzB,wBAAwB,GAAG,GAAG;AAAA,MAC9B,iBAAiB,OACd;AAAA,QACC,OAAO;AACL,iBAAO;AAAA,QACT;AAAA,QACA,aAAa;AAAA,QAAC;AAAA,QACd,SAAS;AACP,iBAAO,EAAE,OAAO,CAAC,EAAE;AAAA,QACrB;AAAA,MACF;AAAA,MACF,aAAa,GAAG,GAAG,YAAY,IAAI;AAAA,IACrC;AAEA,UAAM,MAAM,KAAK,IAAI;AACrB,UAAM,YAAwB;AAAA,MAC5B;AAAA,QACE,MAAM,aAAa;AAAA,QACnB,cAAc;AAAA,QACd,WAAW;AAAA,QACX,gBAAgB;AAAA,QAChB,iBAAiB;AAAA,QACjB,QAAQ,CAAC;AAAA,QACT,aAAa;AAAA,QACb,mBAAmB;AAAA,QACnB,UAAU;AAAA,QACV,uBAAuB;AAAA,QACvB,sBAAsB;AAAA,MACxB;AAAA,MACA;AAAA,QACE,MAAM,aAAa;AAAA,QACnB,cAAc;AAAA,QACd,WAAW,MAAM;AAAA,QACjB,gBAAgB;AAAA,QAChB,iBAAiB;AAAA,QACjB,QAAQ,CAAC;AAAA,QACT,aAAa;AAAA,QACb,mBAAmB;AAAA,QACnB,UAAU;AAAA,QACV,uBAAuB;AAAA,QACvB,sBAAsB;AAAA,MACxB;AAAA,IACF;AAEA,UAAM,YAA2B;AAAA,MAC/B;AAAA,QACE,MAAM,gBAAgB;AAAA,QACtB,cAAc;AAAA,UACZ;AAAA,YACE,UAAU;AAAA,YACV,MAAM;AAAA,YACN,WAAW;AAAA,YACX,SAAS;AAAA,YACT,YAAY;AAAA,UACd;AAAA,QACF;AAAA,MACF;AAAA,IACF;AAEA,UAAM,UAAU,YACd,IAAI,eAA4B;AAAA,MAC9B,MAAM,YAAY;AAChB,mBAAW,MAAM,UAAW,YAAW,QAAQ,EAAE;AACjD,mBAAW,MAAM;AAAA,MACnB;AAAA,IACF,CAAC;AAEH,UAAM,KAAK,IAAI,iBAAiB;AAAA,MAC9B,kBAAkB;AAAA,MAClB,KAAK;AAAA,MACL,KAAK,IAAI,QAAQ,SAAS;AAAA,MAC1B,cAAc;AAAA,MACd,mBAAmB;AAAA,MACnB,qBAAqB;AAAA,MACrB,qBAAqB;AAAA,MACrB,UAAU;AAAA,MACV,aAAa;AAAA,MACb,sBAAsB,OAAO,EAAE,KAAK,MAAM,UAAU,SAAS,MAAM,gBAAgB,MAAM;AAAA,IAC3F,CAAC;AAED,UAAM,GAAG,MAAM;AACf,UAAM,IAAI,QAAQ,CAAC,MAAM,WAAW,GAAG,EAAE,CAAC;AAC1C,UAAM,GAAG,MAAM;AAEf,UAAM,QAAQ,SAAS,iBAAiB;AACxC,UAAM,WAAW,WAAW,OAAO,WAAW;AAC9C,UAAM,MAAM,WAAW,OAAO,eAAe;AAC7C,WAAO,QAAQ,EAAE,WAAW;AAC5B,WAAO,GAAG,EAAE,WAAW;AACvB,WAAO,IAAI,YAAY,EAAE,KAAK,SAAS,YAAY,EAAE,MAAM;AAE3D,WAAO,MAAM,eAAe,EAAE,iBAAiB;AAC/C,WAAO,MAAM,aAAa,EAAE,iBAAiB;AAAA,EAC/C,CAAC;AACH,CAAC;","names":[]}
|
package/dist/voice/io.cjs
CHANGED
|
@@ -30,7 +30,7 @@ __export(io_exports, {
|
|
|
30
30
|
module.exports = __toCommonJS(io_exports);
|
|
31
31
|
var import_node_events = require("node:events");
|
|
32
32
|
var import_log = require("../log.cjs");
|
|
33
|
-
var
|
|
33
|
+
var import_multi_input_stream = require("../stream/multi_input_stream.cjs");
|
|
34
34
|
var import_utils = require("../utils.cjs");
|
|
35
35
|
const TIMED_STRING_SYMBOL = Symbol.for("lk.TimedString");
|
|
36
36
|
function createTimedString(opts) {
|
|
@@ -47,9 +47,12 @@ function isTimedString(value) {
|
|
|
47
47
|
return typeof value === "object" && value !== null && TIMED_STRING_SYMBOL in value && value[TIMED_STRING_SYMBOL] === true;
|
|
48
48
|
}
|
|
49
49
|
class AudioInput {
|
|
50
|
-
|
|
50
|
+
multiStream = new import_multi_input_stream.MultiInputStream();
|
|
51
51
|
get stream() {
|
|
52
|
-
return this.
|
|
52
|
+
return this.multiStream.stream;
|
|
53
|
+
}
|
|
54
|
+
async close() {
|
|
55
|
+
await this.multiStream.close();
|
|
53
56
|
}
|
|
54
57
|
onAttached() {
|
|
55
58
|
}
|
package/dist/voice/io.cjs.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../../src/voice/io.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2025 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport type { AudioFrame } from '@livekit/rtc-node';\nimport { EventEmitter } from 'node:events';\nimport type { ReadableStream } from 'node:stream/web';\nimport type { ChatContext } from '../llm/chat_context.js';\nimport type { ChatChunk } from '../llm/llm.js';\nimport type { ToolContext } from '../llm/tool_context.js';\nimport { log } from '../log.js';\nimport { DeferredReadableStream } from '../stream/deferred_stream.js';\nimport type { SpeechEvent } from '../stt/stt.js';\nimport { Future } from '../utils.js';\nimport type { ModelSettings } from './agent.js';\n\nexport type STTNode = (\n audio: ReadableStream<AudioFrame>,\n modelSettings: ModelSettings,\n) => Promise<ReadableStream<SpeechEvent | string> | null>;\n\nexport type LLMNode = (\n chatCtx: ChatContext,\n toolCtx: ToolContext,\n modelSettings: ModelSettings,\n) => Promise<ReadableStream<ChatChunk | string> | null>;\n\nexport type TTSNode = (\n text: ReadableStream<string>,\n modelSettings: ModelSettings,\n) => Promise<ReadableStream<AudioFrame> | null>;\n\n/**\n * Symbol used to identify TimedString objects.\n */\nexport const TIMED_STRING_SYMBOL = Symbol.for('lk.TimedString');\n\n/**\n * A string with optional start and end timestamps for word-level alignment.\n */\nexport interface TimedString {\n readonly [TIMED_STRING_SYMBOL]: true;\n text: string;\n startTime?: number; // seconds\n endTime?: number; // seconds\n confidence?: number;\n startTimeOffset?: number;\n}\n\n/**\n * Factory function to create a TimedString object.\n */\nexport function createTimedString(opts: {\n text: string;\n startTime?: number;\n endTime?: number;\n confidence?: number;\n startTimeOffset?: number;\n}): TimedString {\n return {\n [TIMED_STRING_SYMBOL]: true,\n text: opts.text,\n startTime: opts.startTime,\n endTime: opts.endTime,\n confidence: opts.confidence,\n startTimeOffset: opts.startTimeOffset,\n };\n}\n\n/**\n * Type guard to check if a value is a TimedString.\n */\nexport function isTimedString(value: unknown): value is TimedString {\n return (\n typeof value === 'object' &&\n value !== null &&\n TIMED_STRING_SYMBOL in value &&\n (value as TimedString)[TIMED_STRING_SYMBOL] === true\n );\n}\n\nexport interface AudioOutputCapabilities {\n /** Whether this output supports pause/resume functionality */\n pause: boolean;\n}\n\nexport abstract class AudioInput {\n protected deferredStream: DeferredReadableStream<AudioFrame> =\n new DeferredReadableStream<AudioFrame>();\n\n get stream(): ReadableStream<AudioFrame> {\n return this.deferredStream.stream;\n }\n\n onAttached(): void {}\n\n onDetached(): void {}\n}\n\nexport abstract class AudioOutput extends EventEmitter {\n static readonly EVENT_PLAYBACK_STARTED = 'playbackStarted';\n static readonly EVENT_PLAYBACK_FINISHED = 'playbackFinished';\n\n private playbackFinishedFuture: Future<void> = new Future();\n private _capturing: boolean = false;\n private playbackFinishedCount: number = 0;\n private playbackSegmentsCount: number = 0;\n private lastPlaybackEvent: PlaybackFinishedEvent = {\n playbackPosition: 0,\n interrupted: false,\n };\n protected logger = log();\n protected readonly capabilities: AudioOutputCapabilities;\n\n constructor(\n public sampleRate?: number,\n protected readonly nextInChain?: AudioOutput,\n capabilities: AudioOutputCapabilities = { pause: false },\n ) {\n super();\n this.capabilities = capabilities;\n\n if (this.nextInChain) {\n this.nextInChain.on(AudioOutput.EVENT_PLAYBACK_STARTED, (ev: PlaybackStartedEvent) =>\n this.onPlaybackStarted(ev.createdAt),\n );\n this.nextInChain.on(AudioOutput.EVENT_PLAYBACK_FINISHED, (ev: PlaybackFinishedEvent) =>\n this.onPlaybackFinished(ev),\n );\n }\n }\n\n /**\n * Whether this output and all outputs in the chain support pause/resume.\n */\n get canPause(): boolean {\n return this.capabilities.pause && (this.nextInChain?.canPause ?? true);\n }\n\n /**\n * Capture an audio frame for playback, frames can be pushed faster than real-time\n */\n async captureFrame(_frame: AudioFrame): Promise<void> {\n if (!this._capturing) {\n this._capturing = true;\n this.playbackSegmentsCount++;\n }\n }\n\n /**\n * Wait for the past audio segments to finish playing out.\n *\n * @returns The event that was emitted when the audio finished playing out (only the last segment information)\n */\n async waitForPlayout(): Promise<PlaybackFinishedEvent> {\n const target = this.playbackSegmentsCount;\n\n while (this.playbackFinishedCount < target) {\n await this.playbackFinishedFuture.await;\n this.playbackFinishedFuture = new Future();\n }\n\n return this.lastPlaybackEvent;\n }\n\n /**\n * Called when playback actually starts (first frame is sent to output).\n * Developers building audio sinks should call this when the first frame is captured.\n */\n onPlaybackStarted(createdAt: number): void {\n this.emit(AudioOutput.EVENT_PLAYBACK_STARTED, { createdAt } as PlaybackStartedEvent);\n }\n\n /**\n * Developers building audio sinks must call this method when a playback/segment is finished.\n * Segments are segmented by calls to flush() or clearBuffer()\n */\n onPlaybackFinished(options: PlaybackFinishedEvent) {\n if (this.playbackFinishedCount >= this.playbackSegmentsCount) {\n this.logger.warn('playback_finished called more times than playback segments were captured');\n return;\n }\n\n this.lastPlaybackEvent = options;\n this.playbackFinishedCount++;\n this.playbackFinishedFuture.resolve();\n this.emit(AudioOutput.EVENT_PLAYBACK_FINISHED, options);\n }\n\n flush(): void {\n this._capturing = false;\n }\n\n /**\n * Clear the buffer, stopping playback immediately\n */\n abstract clearBuffer(): void;\n\n onAttached(): void {\n if (this.nextInChain) {\n this.nextInChain.onAttached();\n }\n }\n\n onDetached(): void {\n if (this.nextInChain) {\n this.nextInChain.onDetached();\n }\n }\n\n /**\n * Pause the audio playback\n */\n pause(): void {\n if (this.nextInChain) {\n this.nextInChain.pause();\n }\n }\n\n /**\n * Resume the audio playback\n */\n resume(): void {\n if (this.nextInChain) {\n this.nextInChain.resume();\n }\n }\n}\n\nexport interface PlaybackFinishedEvent {\n /** How much of the audio was played back, in seconds */\n playbackPosition: number;\n /** True if playback was interrupted (clearBuffer() was called) */\n interrupted: boolean;\n /**\n * Transcript synced with playback; may be partial if the audio was interrupted.\n * When undefined, the transcript is not synchronized with the playback.\n */\n synchronizedTranscript?: string;\n}\n\nexport interface PlaybackStartedEvent {\n /** The timestamp (Date.now()) when the playback started */\n createdAt: number;\n}\n\nexport abstract class TextOutput {\n constructor(protected readonly nextInChain?: TextOutput) {}\n\n abstract captureText(text: string | TimedString): Promise<void>;\n\n /**\n * Mark the current text segment as complete (e.g LLM generation is complete)\n */\n abstract flush(): void;\n\n onAttached(): void {\n if (this.nextInChain) {\n this.nextInChain.onAttached();\n }\n }\n\n onDetached(): void {\n if (this.nextInChain) {\n this.nextInChain.onDetached();\n }\n }\n}\n\nexport class AgentInput {\n private _audioStream: AudioInput | null = null;\n // enabled by default\n private _audioEnabled: boolean = true;\n\n constructor(private readonly audioChanged: () => void) {}\n\n setAudioEnabled(enable: boolean): void {\n if (enable === this._audioEnabled) {\n return;\n }\n\n this._audioEnabled = enable;\n\n if (!this._audioStream) {\n return;\n }\n\n if (enable) {\n this._audioStream.onAttached();\n } else {\n this._audioStream.onDetached();\n }\n }\n\n get audioEnabled(): boolean {\n return this._audioEnabled;\n }\n\n get audio(): AudioInput | null {\n return this._audioStream;\n }\n\n set audio(stream: AudioInput | null) {\n this._audioStream = stream;\n this.audioChanged();\n }\n}\n\nexport class AgentOutput {\n private _audioSink: AudioOutput | null = null;\n private _transcriptionSink: TextOutput | null = null;\n private _audioEnabled: boolean = true;\n private _transcriptionEnabled: boolean = true;\n\n constructor(\n private readonly audioChanged: () => void,\n private readonly transcriptionChanged: () => void,\n ) {}\n\n setAudioEnabled(enabled: boolean): void {\n if (enabled === this._audioEnabled) {\n return;\n }\n\n this._audioEnabled = enabled;\n\n if (!this._audioSink) {\n return;\n }\n\n if (enabled) {\n this._audioSink.onAttached();\n } else {\n this._audioSink.onDetached();\n }\n }\n\n setTranscriptionEnabled(enabled: boolean): void {\n if (enabled === this._transcriptionEnabled) {\n return;\n }\n\n this._transcriptionEnabled = enabled;\n\n if (!this._transcriptionSink) {\n return;\n }\n\n if (enabled) {\n this._transcriptionSink.onAttached();\n } else {\n this._transcriptionSink.onDetached();\n }\n }\n\n get audioEnabled(): boolean {\n return this._audioEnabled;\n }\n\n get transcriptionEnabled(): boolean {\n return this._transcriptionEnabled;\n }\n\n get audio(): AudioOutput | null {\n return this._audioSink;\n }\n\n set audio(sink: AudioOutput | null) {\n if (sink === this._audioSink) {\n return;\n }\n\n if (this._audioSink) {\n this._audioSink.onDetached();\n }\n\n this._audioSink = sink;\n this.audioChanged();\n\n if (this._audioSink) {\n this._audioSink.onAttached();\n }\n }\n\n get transcription(): TextOutput | null {\n return this._transcriptionSink;\n }\n\n set transcription(sink: TextOutput | null) {\n if (sink === this._transcriptionSink) {\n return;\n }\n\n if (this._transcriptionSink) {\n this._transcriptionSink.onDetached();\n }\n\n this._transcriptionSink = sink;\n this.transcriptionChanged();\n\n if (this._transcriptionSink) {\n this._transcriptionSink.onAttached();\n }\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAIA,yBAA6B;AAK7B,iBAAoB;AACpB,6BAAuC;AAEvC,mBAAuB;AAsBhB,MAAM,sBAAsB,OAAO,IAAI,gBAAgB;AAiBvD,SAAS,kBAAkB,MAMlB;AACd,SAAO;AAAA,IACL,CAAC,mBAAmB,GAAG;AAAA,IACvB,MAAM,KAAK;AAAA,IACX,WAAW,KAAK;AAAA,IAChB,SAAS,KAAK;AAAA,IACd,YAAY,KAAK;AAAA,IACjB,iBAAiB,KAAK;AAAA,EACxB;AACF;AAKO,SAAS,cAAc,OAAsC;AAClE,SACE,OAAO,UAAU,YACjB,UAAU,QACV,uBAAuB,SACtB,MAAsB,mBAAmB,MAAM;AAEpD;AAOO,MAAe,WAAW;AAAA,EACrB,iBACR,IAAI,8CAAmC;AAAA,EAEzC,IAAI,SAAqC;AACvC,WAAO,KAAK,eAAe;AAAA,EAC7B;AAAA,EAEA,aAAmB;AAAA,EAAC;AAAA,EAEpB,aAAmB;AAAA,EAAC;AACtB;AAEO,MAAe,oBAAoB,gCAAa;AAAA,EAerD,YACS,YACY,aACnB,eAAwC,EAAE,OAAO,MAAM,GACvD;AACA,UAAM;AAJC;AACY;AAInB,SAAK,eAAe;AAEpB,QAAI,KAAK,aAAa;AACpB,WAAK,YAAY;AAAA,QAAG,YAAY;AAAA,QAAwB,CAAC,OACvD,KAAK,kBAAkB,GAAG,SAAS;AAAA,MACrC;AACA,WAAK,YAAY;AAAA,QAAG,YAAY;AAAA,QAAyB,CAAC,OACxD,KAAK,mBAAmB,EAAE;AAAA,MAC5B;AAAA,IACF;AAAA,EACF;AAAA,EA9BA,OAAgB,yBAAyB;AAAA,EACzC,OAAgB,0BAA0B;AAAA,EAElC,yBAAuC,IAAI,oBAAO;AAAA,EAClD,aAAsB;AAAA,EACtB,wBAAgC;AAAA,EAChC,wBAAgC;AAAA,EAChC,oBAA2C;AAAA,IACjD,kBAAkB;AAAA,IAClB,aAAa;AAAA,EACf;AAAA,EACU,aAAS,gBAAI;AAAA,EACJ;AAAA;AAAA;AAAA;AAAA,EAuBnB,IAAI,WAAoB;AAtI1B;AAuII,WAAO,KAAK,aAAa,YAAU,UAAK,gBAAL,mBAAkB,aAAY;AAAA,EACnE;AAAA;AAAA;AAAA;AAAA,EAKA,MAAM,aAAa,QAAmC;AACpD,QAAI,CAAC,KAAK,YAAY;AACpB,WAAK,aAAa;AAClB,WAAK;AAAA,IACP;AAAA,EACF;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAOA,MAAM,iBAAiD;AACrD,UAAM,SAAS,KAAK;AAEpB,WAAO,KAAK,wBAAwB,QAAQ;AAC1C,YAAM,KAAK,uBAAuB;AAClC,WAAK,yBAAyB,IAAI,oBAAO;AAAA,IAC3C;AAEA,WAAO,KAAK;AAAA,EACd;AAAA;AAAA;AAAA;AAAA;AAAA,EAMA,kBAAkB,WAAyB;AACzC,SAAK,KAAK,YAAY,wBAAwB,EAAE,UAAU,CAAyB;AAAA,EACrF;AAAA;AAAA;AAAA;AAAA;AAAA,EAMA,mBAAmB,SAAgC;AACjD,QAAI,KAAK,yBAAyB,KAAK,uBAAuB;AAC5D,WAAK,OAAO,KAAK,0EAA0E;AAC3F;AAAA,IACF;AAEA,SAAK,oBAAoB;AACzB,SAAK;AACL,SAAK,uBAAuB,QAAQ;AACpC,SAAK,KAAK,YAAY,yBAAyB,OAAO;AAAA,EACxD;AAAA,EAEA,QAAc;AACZ,SAAK,aAAa;AAAA,EACpB;AAAA,EAOA,aAAmB;AACjB,QAAI,KAAK,aAAa;AACpB,WAAK,YAAY,WAAW;AAAA,IAC9B;AAAA,EACF;AAAA,EAEA,aAAmB;AACjB,QAAI,KAAK,aAAa;AACpB,WAAK,YAAY,WAAW;AAAA,IAC9B;AAAA,EACF;AAAA;AAAA;AAAA;AAAA,EAKA,QAAc;AACZ,QAAI,KAAK,aAAa;AACpB,WAAK,YAAY,MAAM;AAAA,IACzB;AAAA,EACF;AAAA;AAAA;AAAA;AAAA,EAKA,SAAe;AACb,QAAI,KAAK,aAAa;AACpB,WAAK,YAAY,OAAO;AAAA,IAC1B;AAAA,EACF;AACF;AAmBO,MAAe,WAAW;AAAA,EAC/B,YAA+B,aAA0B;AAA1B;AAAA,EAA2B;AAAA,EAS1D,aAAmB;AACjB,QAAI,KAAK,aAAa;AACpB,WAAK,YAAY,WAAW;AAAA,IAC9B;AAAA,EACF;AAAA,EAEA,aAAmB;AACjB,QAAI,KAAK,aAAa;AACpB,WAAK,YAAY,WAAW;AAAA,IAC9B;AAAA,EACF;AACF;AAEO,MAAM,WAAW;AAAA,EAKtB,YAA6B,cAA0B;AAA1B;AAAA,EAA2B;AAAA,EAJhD,eAAkC;AAAA;AAAA,EAElC,gBAAyB;AAAA,EAIjC,gBAAgB,QAAuB;AACrC,QAAI,WAAW,KAAK,eAAe;AACjC;AAAA,IACF;AAEA,SAAK,gBAAgB;AAErB,QAAI,CAAC,KAAK,cAAc;AACtB;AAAA,IACF;AAEA,QAAI,QAAQ;AACV,WAAK,aAAa,WAAW;AAAA,IAC/B,OAAO;AACL,WAAK,aAAa,WAAW;AAAA,IAC/B;AAAA,EACF;AAAA,EAEA,IAAI,eAAwB;AAC1B,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,QAA2B;AAC7B,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,MAAM,QAA2B;AACnC,SAAK,eAAe;AACpB,SAAK,aAAa;AAAA,EACpB;AACF;AAEO,MAAM,YAAY;AAAA,EAMvB,YACmB,cACA,sBACjB;AAFiB;AACA;AAAA,EAChB;AAAA,EARK,aAAiC;AAAA,EACjC,qBAAwC;AAAA,EACxC,gBAAyB;AAAA,EACzB,wBAAiC;AAAA,EAOzC,gBAAgB,SAAwB;AACtC,QAAI,YAAY,KAAK,eAAe;AAClC;AAAA,IACF;AAEA,SAAK,gBAAgB;AAErB,QAAI,CAAC,KAAK,YAAY;AACpB;AAAA,IACF;AAEA,QAAI,SAAS;AACX,WAAK,WAAW,WAAW;AAAA,IAC7B,OAAO;AACL,WAAK,WAAW,WAAW;AAAA,IAC7B;AAAA,EACF;AAAA,EAEA,wBAAwB,SAAwB;AAC9C,QAAI,YAAY,KAAK,uBAAuB;AAC1C;AAAA,IACF;AAEA,SAAK,wBAAwB;AAE7B,QAAI,CAAC,KAAK,oBAAoB;AAC5B;AAAA,IACF;AAEA,QAAI,SAAS;AACX,WAAK,mBAAmB,WAAW;AAAA,IACrC,OAAO;AACL,WAAK,mBAAmB,WAAW;AAAA,IACrC;AAAA,EACF;AAAA,EAEA,IAAI,eAAwB;AAC1B,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,uBAAgC;AAClC,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,QAA4B;AAC9B,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,MAAM,MAA0B;AAClC,QAAI,SAAS,KAAK,YAAY;AAC5B;AAAA,IACF;AAEA,QAAI,KAAK,YAAY;AACnB,WAAK,WAAW,WAAW;AAAA,IAC7B;AAEA,SAAK,aAAa;AAClB,SAAK,aAAa;AAElB,QAAI,KAAK,YAAY;AACnB,WAAK,WAAW,WAAW;AAAA,IAC7B;AAAA,EACF;AAAA,EAEA,IAAI,gBAAmC;AACrC,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,cAAc,MAAyB;AACzC,QAAI,SAAS,KAAK,oBAAoB;AACpC;AAAA,IACF;AAEA,QAAI,KAAK,oBAAoB;AAC3B,WAAK,mBAAmB,WAAW;AAAA,IACrC;AAEA,SAAK,qBAAqB;AAC1B,SAAK,qBAAqB;AAE1B,QAAI,KAAK,oBAAoB;AAC3B,WAAK,mBAAmB,WAAW;AAAA,IACrC;AAAA,EACF;AACF;","names":[]}
|
|
1
|
+
{"version":3,"sources":["../../src/voice/io.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2025 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport type { AudioFrame } from '@livekit/rtc-node';\nimport { EventEmitter } from 'node:events';\nimport type { ReadableStream } from 'node:stream/web';\nimport type { ChatContext } from '../llm/chat_context.js';\nimport type { ChatChunk } from '../llm/llm.js';\nimport type { ToolContext } from '../llm/tool_context.js';\nimport { log } from '../log.js';\nimport { MultiInputStream } from '../stream/multi_input_stream.js';\nimport type { SpeechEvent } from '../stt/stt.js';\nimport { Future } from '../utils.js';\nimport type { ModelSettings } from './agent.js';\n\nexport type STTNode = (\n audio: ReadableStream<AudioFrame>,\n modelSettings: ModelSettings,\n) => Promise<ReadableStream<SpeechEvent | string> | null>;\n\nexport type LLMNode = (\n chatCtx: ChatContext,\n toolCtx: ToolContext,\n modelSettings: ModelSettings,\n) => Promise<ReadableStream<ChatChunk | string> | null>;\n\nexport type TTSNode = (\n text: ReadableStream<string>,\n modelSettings: ModelSettings,\n) => Promise<ReadableStream<AudioFrame> | null>;\n\n/**\n * Symbol used to identify TimedString objects.\n */\nexport const TIMED_STRING_SYMBOL = Symbol.for('lk.TimedString');\n\n/**\n * A string with optional start and end timestamps for word-level alignment.\n */\nexport interface TimedString {\n readonly [TIMED_STRING_SYMBOL]: true;\n text: string;\n startTime?: number; // seconds\n endTime?: number; // seconds\n confidence?: number;\n startTimeOffset?: number;\n}\n\n/**\n * Factory function to create a TimedString object.\n */\nexport function createTimedString(opts: {\n text: string;\n startTime?: number;\n endTime?: number;\n confidence?: number;\n startTimeOffset?: number;\n}): TimedString {\n return {\n [TIMED_STRING_SYMBOL]: true,\n text: opts.text,\n startTime: opts.startTime,\n endTime: opts.endTime,\n confidence: opts.confidence,\n startTimeOffset: opts.startTimeOffset,\n };\n}\n\n/**\n * Type guard to check if a value is a TimedString.\n */\nexport function isTimedString(value: unknown): value is TimedString {\n return (\n typeof value === 'object' &&\n value !== null &&\n TIMED_STRING_SYMBOL in value &&\n (value as TimedString)[TIMED_STRING_SYMBOL] === true\n );\n}\n\nexport interface AudioOutputCapabilities {\n /** Whether this output supports pause/resume functionality */\n pause: boolean;\n}\n\nexport abstract class AudioInput {\n protected multiStream: MultiInputStream<AudioFrame> = new MultiInputStream<AudioFrame>();\n\n get stream(): ReadableStream<AudioFrame> {\n return this.multiStream.stream;\n }\n\n async close(): Promise<void> {\n await this.multiStream.close();\n }\n\n onAttached(): void {}\n\n onDetached(): void {}\n}\n\nexport abstract class AudioOutput extends EventEmitter {\n static readonly EVENT_PLAYBACK_STARTED = 'playbackStarted';\n static readonly EVENT_PLAYBACK_FINISHED = 'playbackFinished';\n\n private playbackFinishedFuture: Future<void> = new Future();\n private _capturing: boolean = false;\n private playbackFinishedCount: number = 0;\n private playbackSegmentsCount: number = 0;\n private lastPlaybackEvent: PlaybackFinishedEvent = {\n playbackPosition: 0,\n interrupted: false,\n };\n protected logger = log();\n protected readonly capabilities: AudioOutputCapabilities;\n\n constructor(\n public sampleRate?: number,\n protected readonly nextInChain?: AudioOutput,\n capabilities: AudioOutputCapabilities = { pause: false },\n ) {\n super();\n this.capabilities = capabilities;\n\n if (this.nextInChain) {\n this.nextInChain.on(AudioOutput.EVENT_PLAYBACK_STARTED, (ev: PlaybackStartedEvent) =>\n this.onPlaybackStarted(ev.createdAt),\n );\n this.nextInChain.on(AudioOutput.EVENT_PLAYBACK_FINISHED, (ev: PlaybackFinishedEvent) =>\n this.onPlaybackFinished(ev),\n );\n }\n }\n\n /**\n * Whether this output and all outputs in the chain support pause/resume.\n */\n get canPause(): boolean {\n return this.capabilities.pause && (this.nextInChain?.canPause ?? true);\n }\n\n /**\n * Capture an audio frame for playback, frames can be pushed faster than real-time\n */\n async captureFrame(_frame: AudioFrame): Promise<void> {\n if (!this._capturing) {\n this._capturing = true;\n this.playbackSegmentsCount++;\n }\n }\n\n /**\n * Wait for the past audio segments to finish playing out.\n *\n * @returns The event that was emitted when the audio finished playing out (only the last segment information)\n */\n async waitForPlayout(): Promise<PlaybackFinishedEvent> {\n const target = this.playbackSegmentsCount;\n\n while (this.playbackFinishedCount < target) {\n await this.playbackFinishedFuture.await;\n this.playbackFinishedFuture = new Future();\n }\n\n return this.lastPlaybackEvent;\n }\n\n /**\n * Called when playback actually starts (first frame is sent to output).\n * Developers building audio sinks should call this when the first frame is captured.\n */\n onPlaybackStarted(createdAt: number): void {\n this.emit(AudioOutput.EVENT_PLAYBACK_STARTED, { createdAt } as PlaybackStartedEvent);\n }\n\n /**\n * Developers building audio sinks must call this method when a playback/segment is finished.\n * Segments are segmented by calls to flush() or clearBuffer()\n */\n onPlaybackFinished(options: PlaybackFinishedEvent) {\n if (this.playbackFinishedCount >= this.playbackSegmentsCount) {\n this.logger.warn('playback_finished called more times than playback segments were captured');\n return;\n }\n\n this.lastPlaybackEvent = options;\n this.playbackFinishedCount++;\n this.playbackFinishedFuture.resolve();\n this.emit(AudioOutput.EVENT_PLAYBACK_FINISHED, options);\n }\n\n flush(): void {\n this._capturing = false;\n }\n\n /**\n * Clear the buffer, stopping playback immediately\n */\n abstract clearBuffer(): void;\n\n onAttached(): void {\n if (this.nextInChain) {\n this.nextInChain.onAttached();\n }\n }\n\n onDetached(): void {\n if (this.nextInChain) {\n this.nextInChain.onDetached();\n }\n }\n\n /**\n * Pause the audio playback\n */\n pause(): void {\n if (this.nextInChain) {\n this.nextInChain.pause();\n }\n }\n\n /**\n * Resume the audio playback\n */\n resume(): void {\n if (this.nextInChain) {\n this.nextInChain.resume();\n }\n }\n}\n\nexport interface PlaybackFinishedEvent {\n /** How much of the audio was played back, in seconds */\n playbackPosition: number;\n /** True if playback was interrupted (clearBuffer() was called) */\n interrupted: boolean;\n /**\n * Transcript synced with playback; may be partial if the audio was interrupted.\n * When undefined, the transcript is not synchronized with the playback.\n */\n synchronizedTranscript?: string;\n}\n\nexport interface PlaybackStartedEvent {\n /** The timestamp (Date.now()) when the playback started */\n createdAt: number;\n}\n\nexport abstract class TextOutput {\n constructor(protected readonly nextInChain?: TextOutput) {}\n\n abstract captureText(text: string | TimedString): Promise<void>;\n\n /**\n * Mark the current text segment as complete (e.g LLM generation is complete)\n */\n abstract flush(): void;\n\n onAttached(): void {\n if (this.nextInChain) {\n this.nextInChain.onAttached();\n }\n }\n\n onDetached(): void {\n if (this.nextInChain) {\n this.nextInChain.onDetached();\n }\n }\n}\n\nexport class AgentInput {\n private _audioStream: AudioInput | null = null;\n // enabled by default\n private _audioEnabled: boolean = true;\n\n constructor(private readonly audioChanged: () => void) {}\n\n setAudioEnabled(enable: boolean): void {\n if (enable === this._audioEnabled) {\n return;\n }\n\n this._audioEnabled = enable;\n\n if (!this._audioStream) {\n return;\n }\n\n if (enable) {\n this._audioStream.onAttached();\n } else {\n this._audioStream.onDetached();\n }\n }\n\n get audioEnabled(): boolean {\n return this._audioEnabled;\n }\n\n get audio(): AudioInput | null {\n return this._audioStream;\n }\n\n set audio(stream: AudioInput | null) {\n this._audioStream = stream;\n this.audioChanged();\n }\n}\n\nexport class AgentOutput {\n private _audioSink: AudioOutput | null = null;\n private _transcriptionSink: TextOutput | null = null;\n private _audioEnabled: boolean = true;\n private _transcriptionEnabled: boolean = true;\n\n constructor(\n private readonly audioChanged: () => void,\n private readonly transcriptionChanged: () => void,\n ) {}\n\n setAudioEnabled(enabled: boolean): void {\n if (enabled === this._audioEnabled) {\n return;\n }\n\n this._audioEnabled = enabled;\n\n if (!this._audioSink) {\n return;\n }\n\n if (enabled) {\n this._audioSink.onAttached();\n } else {\n this._audioSink.onDetached();\n }\n }\n\n setTranscriptionEnabled(enabled: boolean): void {\n if (enabled === this._transcriptionEnabled) {\n return;\n }\n\n this._transcriptionEnabled = enabled;\n\n if (!this._transcriptionSink) {\n return;\n }\n\n if (enabled) {\n this._transcriptionSink.onAttached();\n } else {\n this._transcriptionSink.onDetached();\n }\n }\n\n get audioEnabled(): boolean {\n return this._audioEnabled;\n }\n\n get transcriptionEnabled(): boolean {\n return this._transcriptionEnabled;\n }\n\n get audio(): AudioOutput | null {\n return this._audioSink;\n }\n\n set audio(sink: AudioOutput | null) {\n if (sink === this._audioSink) {\n return;\n }\n\n if (this._audioSink) {\n this._audioSink.onDetached();\n }\n\n this._audioSink = sink;\n this.audioChanged();\n\n if (this._audioSink) {\n this._audioSink.onAttached();\n }\n }\n\n get transcription(): TextOutput | null {\n return this._transcriptionSink;\n }\n\n set transcription(sink: TextOutput | null) {\n if (sink === this._transcriptionSink) {\n return;\n }\n\n if (this._transcriptionSink) {\n this._transcriptionSink.onDetached();\n }\n\n this._transcriptionSink = sink;\n this.transcriptionChanged();\n\n if (this._transcriptionSink) {\n this._transcriptionSink.onAttached();\n }\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAIA,yBAA6B;AAK7B,iBAAoB;AACpB,gCAAiC;AAEjC,mBAAuB;AAsBhB,MAAM,sBAAsB,OAAO,IAAI,gBAAgB;AAiBvD,SAAS,kBAAkB,MAMlB;AACd,SAAO;AAAA,IACL,CAAC,mBAAmB,GAAG;AAAA,IACvB,MAAM,KAAK;AAAA,IACX,WAAW,KAAK;AAAA,IAChB,SAAS,KAAK;AAAA,IACd,YAAY,KAAK;AAAA,IACjB,iBAAiB,KAAK;AAAA,EACxB;AACF;AAKO,SAAS,cAAc,OAAsC;AAClE,SACE,OAAO,UAAU,YACjB,UAAU,QACV,uBAAuB,SACtB,MAAsB,mBAAmB,MAAM;AAEpD;AAOO,MAAe,WAAW;AAAA,EACrB,cAA4C,IAAI,2CAA6B;AAAA,EAEvF,IAAI,SAAqC;AACvC,WAAO,KAAK,YAAY;AAAA,EAC1B;AAAA,EAEA,MAAM,QAAuB;AAC3B,UAAM,KAAK,YAAY,MAAM;AAAA,EAC/B;AAAA,EAEA,aAAmB;AAAA,EAAC;AAAA,EAEpB,aAAmB;AAAA,EAAC;AACtB;AAEO,MAAe,oBAAoB,gCAAa;AAAA,EAerD,YACS,YACY,aACnB,eAAwC,EAAE,OAAO,MAAM,GACvD;AACA,UAAM;AAJC;AACY;AAInB,SAAK,eAAe;AAEpB,QAAI,KAAK,aAAa;AACpB,WAAK,YAAY;AAAA,QAAG,YAAY;AAAA,QAAwB,CAAC,OACvD,KAAK,kBAAkB,GAAG,SAAS;AAAA,MACrC;AACA,WAAK,YAAY;AAAA,QAAG,YAAY;AAAA,QAAyB,CAAC,OACxD,KAAK,mBAAmB,EAAE;AAAA,MAC5B;AAAA,IACF;AAAA,EACF;AAAA,EA9BA,OAAgB,yBAAyB;AAAA,EACzC,OAAgB,0BAA0B;AAAA,EAElC,yBAAuC,IAAI,oBAAO;AAAA,EAClD,aAAsB;AAAA,EACtB,wBAAgC;AAAA,EAChC,wBAAgC;AAAA,EAChC,oBAA2C;AAAA,IACjD,kBAAkB;AAAA,IAClB,aAAa;AAAA,EACf;AAAA,EACU,aAAS,gBAAI;AAAA,EACJ;AAAA;AAAA;AAAA;AAAA,EAuBnB,IAAI,WAAoB;AAzI1B;AA0II,WAAO,KAAK,aAAa,YAAU,UAAK,gBAAL,mBAAkB,aAAY;AAAA,EACnE;AAAA;AAAA;AAAA;AAAA,EAKA,MAAM,aAAa,QAAmC;AACpD,QAAI,CAAC,KAAK,YAAY;AACpB,WAAK,aAAa;AAClB,WAAK;AAAA,IACP;AAAA,EACF;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAOA,MAAM,iBAAiD;AACrD,UAAM,SAAS,KAAK;AAEpB,WAAO,KAAK,wBAAwB,QAAQ;AAC1C,YAAM,KAAK,uBAAuB;AAClC,WAAK,yBAAyB,IAAI,oBAAO;AAAA,IAC3C;AAEA,WAAO,KAAK;AAAA,EACd;AAAA;AAAA;AAAA;AAAA;AAAA,EAMA,kBAAkB,WAAyB;AACzC,SAAK,KAAK,YAAY,wBAAwB,EAAE,UAAU,CAAyB;AAAA,EACrF;AAAA;AAAA;AAAA;AAAA;AAAA,EAMA,mBAAmB,SAAgC;AACjD,QAAI,KAAK,yBAAyB,KAAK,uBAAuB;AAC5D,WAAK,OAAO,KAAK,0EAA0E;AAC3F;AAAA,IACF;AAEA,SAAK,oBAAoB;AACzB,SAAK;AACL,SAAK,uBAAuB,QAAQ;AACpC,SAAK,KAAK,YAAY,yBAAyB,OAAO;AAAA,EACxD;AAAA,EAEA,QAAc;AACZ,SAAK,aAAa;AAAA,EACpB;AAAA,EAOA,aAAmB;AACjB,QAAI,KAAK,aAAa;AACpB,WAAK,YAAY,WAAW;AAAA,IAC9B;AAAA,EACF;AAAA,EAEA,aAAmB;AACjB,QAAI,KAAK,aAAa;AACpB,WAAK,YAAY,WAAW;AAAA,IAC9B;AAAA,EACF;AAAA;AAAA;AAAA;AAAA,EAKA,QAAc;AACZ,QAAI,KAAK,aAAa;AACpB,WAAK,YAAY,MAAM;AAAA,IACzB;AAAA,EACF;AAAA;AAAA;AAAA;AAAA,EAKA,SAAe;AACb,QAAI,KAAK,aAAa;AACpB,WAAK,YAAY,OAAO;AAAA,IAC1B;AAAA,EACF;AACF;AAmBO,MAAe,WAAW;AAAA,EAC/B,YAA+B,aAA0B;AAA1B;AAAA,EAA2B;AAAA,EAS1D,aAAmB;AACjB,QAAI,KAAK,aAAa;AACpB,WAAK,YAAY,WAAW;AAAA,IAC9B;AAAA,EACF;AAAA,EAEA,aAAmB;AACjB,QAAI,KAAK,aAAa;AACpB,WAAK,YAAY,WAAW;AAAA,IAC9B;AAAA,EACF;AACF;AAEO,MAAM,WAAW;AAAA,EAKtB,YAA6B,cAA0B;AAA1B;AAAA,EAA2B;AAAA,EAJhD,eAAkC;AAAA;AAAA,EAElC,gBAAyB;AAAA,EAIjC,gBAAgB,QAAuB;AACrC,QAAI,WAAW,KAAK,eAAe;AACjC;AAAA,IACF;AAEA,SAAK,gBAAgB;AAErB,QAAI,CAAC,KAAK,cAAc;AACtB;AAAA,IACF;AAEA,QAAI,QAAQ;AACV,WAAK,aAAa,WAAW;AAAA,IAC/B,OAAO;AACL,WAAK,aAAa,WAAW;AAAA,IAC/B;AAAA,EACF;AAAA,EAEA,IAAI,eAAwB;AAC1B,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,QAA2B;AAC7B,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,MAAM,QAA2B;AACnC,SAAK,eAAe;AACpB,SAAK,aAAa;AAAA,EACpB;AACF;AAEO,MAAM,YAAY;AAAA,EAMvB,YACmB,cACA,sBACjB;AAFiB;AACA;AAAA,EAChB;AAAA,EARK,aAAiC;AAAA,EACjC,qBAAwC;AAAA,EACxC,gBAAyB;AAAA,EACzB,wBAAiC;AAAA,EAOzC,gBAAgB,SAAwB;AACtC,QAAI,YAAY,KAAK,eAAe;AAClC;AAAA,IACF;AAEA,SAAK,gBAAgB;AAErB,QAAI,CAAC,KAAK,YAAY;AACpB;AAAA,IACF;AAEA,QAAI,SAAS;AACX,WAAK,WAAW,WAAW;AAAA,IAC7B,OAAO;AACL,WAAK,WAAW,WAAW;AAAA,IAC7B;AAAA,EACF;AAAA,EAEA,wBAAwB,SAAwB;AAC9C,QAAI,YAAY,KAAK,uBAAuB;AAC1C;AAAA,IACF;AAEA,SAAK,wBAAwB;AAE7B,QAAI,CAAC,KAAK,oBAAoB;AAC5B;AAAA,IACF;AAEA,QAAI,SAAS;AACX,WAAK,mBAAmB,WAAW;AAAA,IACrC,OAAO;AACL,WAAK,mBAAmB,WAAW;AAAA,IACrC;AAAA,EACF;AAAA,EAEA,IAAI,eAAwB;AAC1B,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,uBAAgC;AAClC,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,QAA4B;AAC9B,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,MAAM,MAA0B;AAClC,QAAI,SAAS,KAAK,YAAY;AAC5B;AAAA,IACF;AAEA,QAAI,KAAK,YAAY;AACnB,WAAK,WAAW,WAAW;AAAA,IAC7B;AAEA,SAAK,aAAa;AAClB,SAAK,aAAa;AAElB,QAAI,KAAK,YAAY;AACnB,WAAK,WAAW,WAAW;AAAA,IAC7B;AAAA,EACF;AAAA,EAEA,IAAI,gBAAmC;AACrC,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,cAAc,MAAyB;AACzC,QAAI,SAAS,KAAK,oBAAoB;AACpC;AAAA,IACF;AAEA,QAAI,KAAK,oBAAoB;AAC3B,WAAK,mBAAmB,WAAW;AAAA,IACrC;AAEA,SAAK,qBAAqB;AAC1B,SAAK,qBAAqB;AAE1B,QAAI,KAAK,oBAAoB;AAC3B,WAAK,mBAAmB,WAAW;AAAA,IACrC;AAAA,EACF;AACF;","names":[]}
|
package/dist/voice/io.d.cts
CHANGED
|
@@ -6,7 +6,7 @@ import type { ReadableStream } from 'node:stream/web';
|
|
|
6
6
|
import type { ChatContext } from '../llm/chat_context.js';
|
|
7
7
|
import type { ChatChunk } from '../llm/llm.js';
|
|
8
8
|
import type { ToolContext } from '../llm/tool_context.js';
|
|
9
|
-
import {
|
|
9
|
+
import { MultiInputStream } from '../stream/multi_input_stream.js';
|
|
10
10
|
import type { SpeechEvent } from '../stt/stt.js';
|
|
11
11
|
import type { ModelSettings } from './agent.js';
|
|
12
12
|
export type STTNode = (audio: ReadableStream<AudioFrame>, modelSettings: ModelSettings) => Promise<ReadableStream<SpeechEvent | string> | null>;
|
|
@@ -46,8 +46,9 @@ export interface AudioOutputCapabilities {
|
|
|
46
46
|
pause: boolean;
|
|
47
47
|
}
|
|
48
48
|
export declare abstract class AudioInput {
|
|
49
|
-
protected
|
|
49
|
+
protected multiStream: MultiInputStream<AudioFrame>;
|
|
50
50
|
get stream(): ReadableStream<AudioFrame>;
|
|
51
|
+
close(): Promise<void>;
|
|
51
52
|
onAttached(): void;
|
|
52
53
|
onDetached(): void;
|
|
53
54
|
}
|
package/dist/voice/io.d.ts
CHANGED
|
@@ -6,7 +6,7 @@ import type { ReadableStream } from 'node:stream/web';
|
|
|
6
6
|
import type { ChatContext } from '../llm/chat_context.js';
|
|
7
7
|
import type { ChatChunk } from '../llm/llm.js';
|
|
8
8
|
import type { ToolContext } from '../llm/tool_context.js';
|
|
9
|
-
import {
|
|
9
|
+
import { MultiInputStream } from '../stream/multi_input_stream.js';
|
|
10
10
|
import type { SpeechEvent } from '../stt/stt.js';
|
|
11
11
|
import type { ModelSettings } from './agent.js';
|
|
12
12
|
export type STTNode = (audio: ReadableStream<AudioFrame>, modelSettings: ModelSettings) => Promise<ReadableStream<SpeechEvent | string> | null>;
|
|
@@ -46,8 +46,9 @@ export interface AudioOutputCapabilities {
|
|
|
46
46
|
pause: boolean;
|
|
47
47
|
}
|
|
48
48
|
export declare abstract class AudioInput {
|
|
49
|
-
protected
|
|
49
|
+
protected multiStream: MultiInputStream<AudioFrame>;
|
|
50
50
|
get stream(): ReadableStream<AudioFrame>;
|
|
51
|
+
close(): Promise<void>;
|
|
51
52
|
onAttached(): void;
|
|
52
53
|
onDetached(): void;
|
|
53
54
|
}
|