@livekit/agents 1.0.24 → 1.0.25
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/inference/llm.cjs +1 -2
- package/dist/inference/llm.cjs.map +1 -1
- package/dist/inference/llm.d.ts.map +1 -1
- package/dist/inference/llm.js +1 -2
- package/dist/inference/llm.js.map +1 -1
- package/dist/inference/stt.cjs +1 -1
- package/dist/inference/stt.cjs.map +1 -1
- package/dist/inference/stt.d.ts.map +1 -1
- package/dist/inference/stt.js +1 -1
- package/dist/inference/stt.js.map +1 -1
- package/dist/inference/tts.cjs +4 -2
- package/dist/inference/tts.cjs.map +1 -1
- package/dist/inference/tts.d.ts.map +1 -1
- package/dist/inference/tts.js +4 -2
- package/dist/inference/tts.js.map +1 -1
- package/dist/ipc/job_proc_lazy_main.cjs +1 -1
- package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
- package/dist/ipc/job_proc_lazy_main.js +1 -1
- package/dist/ipc/job_proc_lazy_main.js.map +1 -1
- package/dist/job.cjs +29 -2
- package/dist/job.cjs.map +1 -1
- package/dist/job.d.cts +6 -0
- package/dist/job.d.ts +6 -0
- package/dist/job.d.ts.map +1 -1
- package/dist/job.js +19 -2
- package/dist/job.js.map +1 -1
- package/dist/llm/llm.cjs +2 -1
- package/dist/llm/llm.cjs.map +1 -1
- package/dist/llm/llm.d.cts +1 -1
- package/dist/llm/llm.d.ts +1 -1
- package/dist/llm/llm.d.ts.map +1 -1
- package/dist/llm/llm.js +2 -1
- package/dist/llm/llm.js.map +1 -1
- package/dist/stream/deferred_stream.cjs +12 -4
- package/dist/stream/deferred_stream.cjs.map +1 -1
- package/dist/stream/deferred_stream.d.cts +6 -1
- package/dist/stream/deferred_stream.d.ts +6 -1
- package/dist/stream/deferred_stream.d.ts.map +1 -1
- package/dist/stream/deferred_stream.js +12 -4
- package/dist/stream/deferred_stream.js.map +1 -1
- package/dist/stream/deferred_stream.test.cjs +2 -2
- package/dist/stream/deferred_stream.test.cjs.map +1 -1
- package/dist/stream/deferred_stream.test.js +2 -2
- package/dist/stream/deferred_stream.test.js.map +1 -1
- package/dist/stt/stream_adapter.cjs +15 -8
- package/dist/stt/stream_adapter.cjs.map +1 -1
- package/dist/stt/stream_adapter.d.cts +7 -3
- package/dist/stt/stream_adapter.d.ts +7 -3
- package/dist/stt/stream_adapter.d.ts.map +1 -1
- package/dist/stt/stream_adapter.js +15 -8
- package/dist/stt/stream_adapter.js.map +1 -1
- package/dist/stt/stt.cjs +8 -3
- package/dist/stt/stt.cjs.map +1 -1
- package/dist/stt/stt.d.cts +9 -3
- package/dist/stt/stt.d.ts +9 -3
- package/dist/stt/stt.d.ts.map +1 -1
- package/dist/stt/stt.js +9 -4
- package/dist/stt/stt.js.map +1 -1
- package/dist/telemetry/traces.cjs +23 -2
- package/dist/telemetry/traces.cjs.map +1 -1
- package/dist/telemetry/traces.d.ts.map +1 -1
- package/dist/telemetry/traces.js +23 -2
- package/dist/telemetry/traces.js.map +1 -1
- package/dist/tts/stream_adapter.cjs +4 -4
- package/dist/tts/stream_adapter.cjs.map +1 -1
- package/dist/tts/stream_adapter.d.cts +5 -2
- package/dist/tts/stream_adapter.d.ts +5 -2
- package/dist/tts/stream_adapter.d.ts.map +1 -1
- package/dist/tts/stream_adapter.js +4 -4
- package/dist/tts/stream_adapter.js.map +1 -1
- package/dist/tts/tts.cjs +2 -2
- package/dist/tts/tts.cjs.map +1 -1
- package/dist/tts/tts.d.cts +5 -1
- package/dist/tts/tts.d.ts +5 -1
- package/dist/tts/tts.d.ts.map +1 -1
- package/dist/tts/tts.js +3 -3
- package/dist/tts/tts.js.map +1 -1
- package/dist/types.cjs +21 -32
- package/dist/types.cjs.map +1 -1
- package/dist/types.d.cts +41 -10
- package/dist/types.d.ts +41 -10
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js +18 -30
- package/dist/types.js.map +1 -1
- package/dist/voice/agent.cjs +54 -19
- package/dist/voice/agent.cjs.map +1 -1
- package/dist/voice/agent.d.ts.map +1 -1
- package/dist/voice/agent.js +54 -19
- package/dist/voice/agent.js.map +1 -1
- package/dist/voice/agent_activity.cjs +0 -3
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +0 -3
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_session.cjs +105 -27
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +16 -2
- package/dist/voice/agent_session.d.ts +16 -2
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +108 -27
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/events.cjs.map +1 -1
- package/dist/voice/events.d.cts +4 -4
- package/dist/voice/events.d.ts +4 -4
- package/dist/voice/events.d.ts.map +1 -1
- package/dist/voice/events.js.map +1 -1
- package/dist/voice/generation.cjs +6 -7
- package/dist/voice/generation.cjs.map +1 -1
- package/dist/voice/generation.d.ts.map +1 -1
- package/dist/voice/generation.js +7 -8
- package/dist/voice/generation.js.map +1 -1
- package/dist/voice/io.cjs +16 -0
- package/dist/voice/io.cjs.map +1 -1
- package/dist/voice/io.d.cts +8 -0
- package/dist/voice/io.d.ts +8 -0
- package/dist/voice/io.d.ts.map +1 -1
- package/dist/voice/io.js +16 -0
- package/dist/voice/io.js.map +1 -1
- package/dist/voice/recorder_io/index.cjs +23 -0
- package/dist/voice/recorder_io/index.cjs.map +1 -0
- package/dist/voice/recorder_io/index.d.cts +2 -0
- package/dist/voice/recorder_io/index.d.ts +2 -0
- package/dist/voice/recorder_io/index.d.ts.map +1 -0
- package/dist/voice/recorder_io/index.js +2 -0
- package/dist/voice/recorder_io/index.js.map +1 -0
- package/dist/voice/recorder_io/recorder_io.cjs +542 -0
- package/dist/voice/recorder_io/recorder_io.cjs.map +1 -0
- package/dist/voice/recorder_io/recorder_io.d.cts +100 -0
- package/dist/voice/recorder_io/recorder_io.d.ts +100 -0
- package/dist/voice/recorder_io/recorder_io.d.ts.map +1 -0
- package/dist/voice/recorder_io/recorder_io.js +508 -0
- package/dist/voice/recorder_io/recorder_io.js.map +1 -0
- package/dist/voice/report.cjs +7 -2
- package/dist/voice/report.cjs.map +1 -1
- package/dist/voice/report.d.cts +11 -1
- package/dist/voice/report.d.ts +11 -1
- package/dist/voice/report.d.ts.map +1 -1
- package/dist/voice/report.js +7 -2
- package/dist/voice/report.js.map +1 -1
- package/dist/voice/room_io/_input.cjs +2 -1
- package/dist/voice/room_io/_input.cjs.map +1 -1
- package/dist/voice/room_io/_input.d.ts.map +1 -1
- package/dist/voice/room_io/_input.js +2 -1
- package/dist/voice/room_io/_input.js.map +1 -1
- package/dist/voice/room_io/_output.cjs +8 -7
- package/dist/voice/room_io/_output.cjs.map +1 -1
- package/dist/voice/room_io/_output.d.cts +2 -1
- package/dist/voice/room_io/_output.d.ts +2 -1
- package/dist/voice/room_io/_output.d.ts.map +1 -1
- package/dist/voice/room_io/_output.js +8 -7
- package/dist/voice/room_io/_output.js.map +1 -1
- package/dist/worker.cjs +4 -3
- package/dist/worker.cjs.map +1 -1
- package/dist/worker.js +4 -3
- package/dist/worker.js.map +1 -1
- package/package.json +1 -1
- package/src/inference/llm.ts +0 -1
- package/src/inference/stt.ts +1 -2
- package/src/inference/tts.ts +5 -2
- package/src/ipc/job_proc_lazy_main.ts +1 -1
- package/src/job.ts +21 -2
- package/src/llm/llm.ts +2 -2
- package/src/stream/deferred_stream.test.ts +3 -3
- package/src/stream/deferred_stream.ts +22 -5
- package/src/stt/stream_adapter.ts +18 -8
- package/src/stt/stt.ts +19 -6
- package/src/telemetry/traces.ts +25 -3
- package/src/tts/stream_adapter.ts +5 -4
- package/src/tts/tts.ts +6 -4
- package/src/types.ts +57 -33
- package/src/voice/agent.ts +59 -19
- package/src/voice/agent_activity.ts +0 -3
- package/src/voice/agent_session.ts +140 -35
- package/src/voice/events.ts +6 -3
- package/src/voice/generation.ts +10 -8
- package/src/voice/io.ts +19 -0
- package/src/voice/recorder_io/index.ts +4 -0
- package/src/voice/recorder_io/recorder_io.ts +690 -0
- package/src/voice/report.ts +20 -3
- package/src/voice/room_io/_input.ts +2 -1
- package/src/voice/room_io/_output.ts +10 -7
- package/src/worker.ts +1 -1
package/src/stream/deferred_stream.ts
CHANGED
|
@@ -9,15 +9,22 @@ import type {
|
|
|
9
9
|
import { IdentityTransform } from './identity_transform.js';
|
|
10
10
|
|
|
11
11
|
/**
|
|
12
|
-
* Check if error is related to
|
|
12
|
+
* Check if error is related to stream cleanup operations.
|
|
13
|
+
*
|
|
14
|
+
* These errors are expected when calling reader.read() after releaseLock()
|
|
15
|
+
* or when writing to already closed streams during cleanup:
|
|
13
16
|
*
|
|
14
17
|
* Invalid state: Releasing reader
|
|
15
18
|
* Invalid state: The reader is not attached to a stream
|
|
19
|
+
* Invalid state: Controller is already closed
|
|
20
|
+
* Invalid state: WritableStream is closed
|
|
16
21
|
*/
|
|
17
22
|
export function isStreamReaderReleaseError(e: unknown) {
|
|
18
23
|
const allowedMessages = [
|
|
19
24
|
'Invalid state: Releasing reader',
|
|
20
25
|
'Invalid state: The reader is not attached to a stream',
|
|
26
|
+
'Controller is already closed',
|
|
27
|
+
'WritableStream is closed',
|
|
21
28
|
];
|
|
22
29
|
|
|
23
30
|
if (e instanceof TypeError) {
|
|
@@ -66,18 +73,27 @@ export class DeferredReadableStream<T> {
|
|
|
66
73
|
await this.writer.write(value);
|
|
67
74
|
}
|
|
68
75
|
} catch (e) {
|
|
69
|
-
// skip
|
|
76
|
+
// skip stream cleanup related errors
|
|
70
77
|
if (isStreamReaderReleaseError(e)) return;
|
|
78
|
+
|
|
71
79
|
sourceError = e;
|
|
72
80
|
} finally {
|
|
73
81
|
// any other error from source will be propagated to the consumer
|
|
74
82
|
if (sourceError) {
|
|
75
|
-
|
|
83
|
+
try {
|
|
84
|
+
this.writer.abort(sourceError);
|
|
85
|
+
} catch (e) {
|
|
86
|
+
// ignore if writer is already closed
|
|
87
|
+
}
|
|
76
88
|
return;
|
|
77
89
|
}
|
|
78
90
|
|
|
79
91
|
// release lock so this.stream.getReader().read() will terminate with done: true
|
|
80
|
-
|
|
92
|
+
try {
|
|
93
|
+
this.writer.releaseLock();
|
|
94
|
+
} catch (e) {
|
|
95
|
+
// ignore if writer lock is already released
|
|
96
|
+
}
|
|
81
97
|
|
|
82
98
|
// we only close the writable stream after done
|
|
83
99
|
try {
|
|
@@ -98,7 +114,8 @@ export class DeferredReadableStream<T> {
|
|
|
98
114
|
*/
|
|
99
115
|
async detachSource() {
|
|
100
116
|
if (!this.isSourceSet) {
|
|
101
|
-
|
|
117
|
+
// No-op if source was never set - this is a common case during cleanup
|
|
118
|
+
return;
|
|
102
119
|
}
|
|
103
120
|
|
|
104
121
|
// release lock will make any pending read() throw TypeError
|
package/src/stt/stream_adapter.ts
CHANGED
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
// SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
import type { AudioFrame } from '@livekit/rtc-node';
|
|
5
5
|
import { log } from '../log.js';
|
|
6
|
+
import type { APIConnectOptions } from '../types.js';
|
|
6
7
|
import type { VAD, VADStream } from '../vad.js';
|
|
7
8
|
import { VADEventType } from '../vad.js';
|
|
8
9
|
import type { SpeechEvent } from './stt.js';
|
|
@@ -22,14 +23,18 @@ export class StreamAdapter extends STT {
|
|
|
22
23
|
this.#stt.on('metrics_collected', (metrics) => {
|
|
23
24
|
this.emit('metrics_collected', metrics);
|
|
24
25
|
});
|
|
26
|
+
|
|
27
|
+
this.#stt.on('error', (error) => {
|
|
28
|
+
this.emit('error', error);
|
|
29
|
+
});
|
|
25
30
|
}
|
|
26
31
|
|
|
27
|
-
_recognize(frame: AudioFrame): Promise<SpeechEvent> {
|
|
28
|
-
return this.#stt.recognize(frame);
|
|
32
|
+
_recognize(frame: AudioFrame, abortSignal?: AbortSignal): Promise<SpeechEvent> {
|
|
33
|
+
return this.#stt.recognize(frame, abortSignal);
|
|
29
34
|
}
|
|
30
35
|
|
|
31
|
-
stream(): StreamAdapterWrapper {
|
|
32
|
-
return new StreamAdapterWrapper(this.#stt, this.#vad);
|
|
36
|
+
stream(options?: { connOptions?: APIConnectOptions }): StreamAdapterWrapper {
|
|
37
|
+
return new StreamAdapterWrapper(this.#stt, this.#vad, options?.connOptions);
|
|
33
38
|
}
|
|
34
39
|
}
|
|
35
40
|
|
|
@@ -38,13 +43,18 @@ export class StreamAdapterWrapper extends SpeechStream {
|
|
|
38
43
|
#vadStream: VADStream;
|
|
39
44
|
label: string;
|
|
40
45
|
|
|
41
|
-
constructor(stt: STT, vad: VAD) {
|
|
42
|
-
super(stt);
|
|
46
|
+
constructor(stt: STT, vad: VAD, connOptions?: APIConnectOptions) {
|
|
47
|
+
super(stt, undefined, connOptions);
|
|
43
48
|
this.#stt = stt;
|
|
44
49
|
this.#vadStream = vad.stream();
|
|
45
50
|
this.label = `stt.StreamAdapterWrapper<${this.#stt.label}>`;
|
|
46
51
|
}
|
|
47
52
|
|
|
53
|
+
close() {
|
|
54
|
+
super.close();
|
|
55
|
+
this.#vadStream.close();
|
|
56
|
+
}
|
|
57
|
+
|
|
48
58
|
async monitorMetrics() {
|
|
49
59
|
return; // do nothing
|
|
50
60
|
}
|
|
@@ -71,7 +81,7 @@ export class StreamAdapterWrapper extends SpeechStream {
|
|
|
71
81
|
this.output.put({ type: SpeechEventType.END_OF_SPEECH });
|
|
72
82
|
|
|
73
83
|
try {
|
|
74
|
-
const event = await this.#stt.recognize(ev.frames);
|
|
84
|
+
const event = await this.#stt.recognize(ev.frames, this.abortSignal);
|
|
75
85
|
if (!event.alternatives![0].text) {
|
|
76
86
|
continue;
|
|
77
87
|
}
|
|
@@ -92,6 +102,6 @@ export class StreamAdapterWrapper extends SpeechStream {
|
|
|
92
102
|
}
|
|
93
103
|
};
|
|
94
104
|
|
|
95
|
-
Promise.all([forwardInput(), recognize()]);
|
|
105
|
+
await Promise.all([forwardInput(), recognize()]);
|
|
96
106
|
}
|
|
97
107
|
}
|
package/src/stt/stt.ts
CHANGED
|
@@ -10,7 +10,7 @@ import { calculateAudioDurationSeconds } from '../audio.js';
|
|
|
10
10
|
import { log } from '../log.js';
|
|
11
11
|
import type { STTMetrics } from '../metrics/base.js';
|
|
12
12
|
import { DeferredReadableStream } from '../stream/deferred_stream.js';
|
|
13
|
-
import { type APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS } from '../types.js';
|
|
13
|
+
import { type APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS, intervalForRetry } from '../types.js';
|
|
14
14
|
import type { AudioBuffer } from '../utils.js';
|
|
15
15
|
import { AsyncIterableQueue, delay, startSoon, toError } from '../utils.js';
|
|
16
16
|
|
|
@@ -113,9 +113,9 @@ export abstract class STT extends (EventEmitter as new () => TypedEmitter<STTCal
|
|
|
113
113
|
}
|
|
114
114
|
|
|
115
115
|
/** Receives an audio buffer and returns transcription in the form of a {@link SpeechEvent} */
|
|
116
|
-
async recognize(frame: AudioBuffer): Promise<SpeechEvent> {
|
|
116
|
+
async recognize(frame: AudioBuffer, abortSignal?: AbortSignal): Promise<SpeechEvent> {
|
|
117
117
|
const startTime = process.hrtime.bigint();
|
|
118
|
-
const event = await this._recognize(frame);
|
|
118
|
+
const event = await this._recognize(frame, abortSignal);
|
|
119
119
|
const durationMs = Number((process.hrtime.bigint() - startTime) / BigInt(1000000));
|
|
120
120
|
this.emit('metrics_collected', {
|
|
121
121
|
type: 'stt_metrics',
|
|
@@ -128,13 +128,19 @@ export abstract class STT extends (EventEmitter as new () => TypedEmitter<STTCal
|
|
|
128
128
|
});
|
|
129
129
|
return event;
|
|
130
130
|
}
|
|
131
|
-
|
|
131
|
+
|
|
132
|
+
protected abstract _recognize(
|
|
133
|
+
frame: AudioBuffer,
|
|
134
|
+
abortSignal?: AbortSignal,
|
|
135
|
+
): Promise<SpeechEvent>;
|
|
132
136
|
|
|
133
137
|
/**
|
|
134
138
|
* Returns a {@link SpeechStream} that can be used to push audio frames and receive
|
|
135
139
|
* transcriptions
|
|
140
|
+
*
|
|
141
|
+
* @param options - Optional configuration including connection options
|
|
136
142
|
*/
|
|
137
|
-
abstract stream(): SpeechStream;
|
|
143
|
+
abstract stream(options?: { connOptions?: APIConnectOptions }): SpeechStream;
|
|
138
144
|
|
|
139
145
|
async close(): Promise<void> {
|
|
140
146
|
return;
|
|
@@ -171,6 +177,8 @@ export abstract class SpeechStream implements AsyncIterableIterator<SpeechEvent>
|
|
|
171
177
|
private logger = log();
|
|
172
178
|
private _connOptions: APIConnectOptions;
|
|
173
179
|
|
|
180
|
+
protected abortController = new AbortController();
|
|
181
|
+
|
|
174
182
|
constructor(
|
|
175
183
|
stt: STT,
|
|
176
184
|
sampleRate?: number,
|
|
@@ -196,7 +204,7 @@ export abstract class SpeechStream implements AsyncIterableIterator<SpeechEvent>
|
|
|
196
204
|
return await this.run();
|
|
197
205
|
} catch (error) {
|
|
198
206
|
if (error instanceof APIError) {
|
|
199
|
-
const retryInterval = this._connOptions
|
|
207
|
+
const retryInterval = intervalForRetry(this._connOptions, i);
|
|
200
208
|
|
|
201
209
|
if (this._connOptions.maxRetry === 0 || !error.retryable) {
|
|
202
210
|
this.emitError({ error, recoverable: false });
|
|
@@ -288,6 +296,10 @@ export abstract class SpeechStream implements AsyncIterableIterator<SpeechEvent>
|
|
|
288
296
|
|
|
289
297
|
protected abstract run(): Promise<void>;
|
|
290
298
|
|
|
299
|
+
protected get abortSignal(): AbortSignal {
|
|
300
|
+
return this.abortController.signal;
|
|
301
|
+
}
|
|
302
|
+
|
|
291
303
|
updateInputStream(audioStream: ReadableStream<AudioFrame>) {
|
|
292
304
|
this.deferredInputStream.setSource(audioStream);
|
|
293
305
|
}
|
|
@@ -352,6 +364,7 @@ export abstract class SpeechStream implements AsyncIterableIterator<SpeechEvent>
|
|
|
352
364
|
if (!this.input.closed) this.input.close();
|
|
353
365
|
if (!this.queue.closed) this.queue.close();
|
|
354
366
|
if (!this.output.closed) this.output.close();
|
|
367
|
+
if (!this.abortController.signal.aborted) this.abortController.abort();
|
|
355
368
|
this.closed = true;
|
|
356
369
|
}
|
|
357
370
|
|
package/src/telemetry/traces.ts
CHANGED
|
@@ -21,6 +21,7 @@ import { BatchSpanProcessor, NodeTracerProvider } from '@opentelemetry/sdk-trace
|
|
|
21
21
|
import { ATTR_SERVICE_NAME } from '@opentelemetry/semantic-conventions';
|
|
22
22
|
import FormData from 'form-data';
|
|
23
23
|
import { AccessToken } from 'livekit-server-sdk';
|
|
24
|
+
import fs from 'node:fs/promises';
|
|
24
25
|
import type { ChatContent, ChatItem } from '../llm/index.js';
|
|
25
26
|
import { enableOtelLogging } from '../log.js';
|
|
26
27
|
import type { SessionReport } from '../voice/report.js';
|
|
@@ -497,12 +498,13 @@ export async function uploadSessionReport(options: {
|
|
|
497
498
|
const formData = new FormData();
|
|
498
499
|
|
|
499
500
|
// Add header (protobuf MetricsRecordingHeader)
|
|
501
|
+
const audioStartTime = report.audioRecordingStartedAt ?? 0;
|
|
500
502
|
const headerMsg = new MetricsRecordingHeader({
|
|
501
503
|
roomId: report.roomId,
|
|
502
504
|
duration: BigInt(0), // TODO: Calculate actual duration from report
|
|
503
505
|
startTime: {
|
|
504
|
-
seconds: BigInt(Math.floor(
|
|
505
|
-
nanos: Math.floor((
|
|
506
|
+
seconds: BigInt(Math.floor(audioStartTime / 1000)),
|
|
507
|
+
nanos: Math.floor((audioStartTime % 1000) * 1e6),
|
|
506
508
|
},
|
|
507
509
|
});
|
|
508
510
|
|
|
@@ -530,7 +532,27 @@ export async function uploadSessionReport(options: {
|
|
|
530
532
|
},
|
|
531
533
|
});
|
|
532
534
|
|
|
533
|
-
//
|
|
535
|
+
// Add audio recording file if available
|
|
536
|
+
if (report.audioRecordingPath && report.audioRecordingStartedAt) {
|
|
537
|
+
let audioBytes: Buffer;
|
|
538
|
+
try {
|
|
539
|
+
audioBytes = await fs.readFile(report.audioRecordingPath);
|
|
540
|
+
} catch {
|
|
541
|
+
audioBytes = Buffer.alloc(0);
|
|
542
|
+
}
|
|
543
|
+
|
|
544
|
+
if (audioBytes.length > 0) {
|
|
545
|
+
formData.append('audio', audioBytes, {
|
|
546
|
+
filename: 'recording.ogg',
|
|
547
|
+
contentType: 'audio/ogg',
|
|
548
|
+
knownLength: audioBytes.length,
|
|
549
|
+
header: {
|
|
550
|
+
'Content-Type': 'audio/ogg',
|
|
551
|
+
'Content-Length': audioBytes.length.toString(),
|
|
552
|
+
},
|
|
553
|
+
});
|
|
554
|
+
}
|
|
555
|
+
}
|
|
534
556
|
|
|
535
557
|
// Upload to LiveKit Cloud using form-data's submit method
|
|
536
558
|
// This properly streams the multipart form with all headers including Content-Length
|
package/src/tts/stream_adapter.ts
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
//
|
|
3
3
|
// SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
import type { SentenceStream, SentenceTokenizer } from '../tokenize/index.js';
|
|
5
|
+
import type { APIConnectOptions } from '../types.js';
|
|
5
6
|
import { Task } from '../utils.js';
|
|
6
7
|
import type { ChunkedStream } from './tts.js';
|
|
7
8
|
import { SynthesizeStream, TTS } from './tts.js';
|
|
@@ -27,8 +28,8 @@ export class StreamAdapter extends TTS {
|
|
|
27
28
|
return this.#tts.synthesize(text);
|
|
28
29
|
}
|
|
29
30
|
|
|
30
|
-
stream(): StreamAdapterWrapper {
|
|
31
|
-
return new StreamAdapterWrapper(this.#tts, this.#sentenceTokenizer);
|
|
31
|
+
stream(options?: { connOptions?: APIConnectOptions }): StreamAdapterWrapper {
|
|
32
|
+
return new StreamAdapterWrapper(this.#tts, this.#sentenceTokenizer, options?.connOptions);
|
|
32
33
|
}
|
|
33
34
|
}
|
|
34
35
|
|
|
@@ -37,8 +38,8 @@ export class StreamAdapterWrapper extends SynthesizeStream {
|
|
|
37
38
|
#sentenceStream: SentenceStream;
|
|
38
39
|
label: string;
|
|
39
40
|
|
|
40
|
-
constructor(tts: TTS, sentenceTokenizer: SentenceTokenizer) {
|
|
41
|
-
super(tts);
|
|
41
|
+
constructor(tts: TTS, sentenceTokenizer: SentenceTokenizer, connOptions?: APIConnectOptions) {
|
|
42
|
+
super(tts, connOptions);
|
|
42
43
|
this.#tts = tts;
|
|
43
44
|
this.#sentenceStream = sentenceTokenizer.stream();
|
|
44
45
|
this.label = `tts.StreamAdapterWrapper<${this.#tts.label}>`;
|
package/src/tts/tts.ts
CHANGED
|
@@ -11,7 +11,7 @@ import { log } from '../log.js';
|
|
|
11
11
|
import type { TTSMetrics } from '../metrics/base.js';
|
|
12
12
|
import { DeferredReadableStream } from '../stream/deferred_stream.js';
|
|
13
13
|
import { recordException, traceTypes, tracer } from '../telemetry/index.js';
|
|
14
|
-
import { type APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS } from '../types.js';
|
|
14
|
+
import { type APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS, intervalForRetry } from '../types.js';
|
|
15
15
|
import { AsyncIterableQueue, delay, mergeFrames, startSoon, toError } from '../utils.js';
|
|
16
16
|
|
|
17
17
|
/** SynthesizedAudio is a packet of speech synthesis as returned by the TTS. */
|
|
@@ -94,8 +94,10 @@ export abstract class TTS extends (EventEmitter as new () => TypedEmitter<TTSCal
|
|
|
94
94
|
|
|
95
95
|
/**
|
|
96
96
|
* Returns a {@link SynthesizeStream} that can be used to push text and receive audio data
|
|
97
|
+
*
|
|
98
|
+
* @param options - Optional configuration including connection options
|
|
97
99
|
*/
|
|
98
|
-
abstract stream(): SynthesizeStream;
|
|
100
|
+
abstract stream(options?: { connOptions?: APIConnectOptions }): SynthesizeStream;
|
|
99
101
|
|
|
100
102
|
async close(): Promise<void> {
|
|
101
103
|
return;
|
|
@@ -186,7 +188,7 @@ export abstract class SynthesizeStream
|
|
|
186
188
|
);
|
|
187
189
|
} catch (error) {
|
|
188
190
|
if (error instanceof APIError) {
|
|
189
|
-
const retryInterval = this._connOptions
|
|
191
|
+
const retryInterval = intervalForRetry(this._connOptions, i);
|
|
190
192
|
|
|
191
193
|
if (this._connOptions.maxRetry === 0 || !error.retryable) {
|
|
192
194
|
this.emitError({ error, recoverable: false });
|
|
@@ -454,7 +456,7 @@ export abstract class ChunkedStream implements AsyncIterableIterator<Synthesized
|
|
|
454
456
|
);
|
|
455
457
|
} catch (error) {
|
|
456
458
|
if (error instanceof APIError) {
|
|
457
|
-
const retryInterval = this._connOptions
|
|
459
|
+
const retryInterval = intervalForRetry(this._connOptions, i);
|
|
458
460
|
|
|
459
461
|
if (this._connOptions.maxRetry === 0 || !error.retryable) {
|
|
460
462
|
this.emitError({ error, recoverable: false });
|
package/src/types.ts
CHANGED
|
@@ -1,42 +1,66 @@
|
|
|
1
1
|
// SPDX-FileCopyrightText: 2025 LiveKit, Inc.
|
|
2
2
|
//
|
|
3
3
|
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
-
export class APIConnectOptions {
|
|
5
|
-
/** Maximum number of retries to connect to the API. */
|
|
6
|
-
readonly maxRetry: number;
|
|
7
|
-
/** Interval between retries to connect to the API in milliseconds. */
|
|
8
|
-
readonly retryIntervalMs: number;
|
|
9
|
-
/** Timeout for connecting to the API in milliseconds. */
|
|
10
|
-
readonly timeoutMs: number;
|
|
11
4
|
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
5
|
+
/**
|
|
6
|
+
* Connection options for API calls, controlling retry and timeout behavior.
|
|
7
|
+
*/
|
|
8
|
+
export interface APIConnectOptions {
|
|
9
|
+
/** Maximum number of retries to connect to the API. Default: 3 */
|
|
10
|
+
maxRetry: number;
|
|
11
|
+
/** Interval between retries to connect to the API in milliseconds. Default: 2000 */
|
|
12
|
+
retryIntervalMs: number;
|
|
13
|
+
/** Timeout for connecting to the API in milliseconds. Default: 10000 */
|
|
14
|
+
timeoutMs: number;
|
|
15
|
+
}
|
|
16
16
|
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
}
|
|
23
|
-
if (this.timeoutMs < 0) {
|
|
24
|
-
throw new Error('timeoutMs must be greater than or equal to 0');
|
|
25
|
-
}
|
|
26
|
-
}
|
|
17
|
+
export const DEFAULT_API_CONNECT_OPTIONS: APIConnectOptions = {
|
|
18
|
+
maxRetry: 3,
|
|
19
|
+
retryIntervalMs: 2000,
|
|
20
|
+
timeoutMs: 10000,
|
|
21
|
+
};
|
|
27
22
|
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
return 0.1;
|
|
37
|
-
}
|
|
38
|
-
return this.retryIntervalMs;
|
|
23
|
+
/**
|
|
24
|
+
* Return the interval for the given number of retries.
|
|
25
|
+
* The first retry is immediate, and then uses specified retryIntervalMs.
|
|
26
|
+
* @internal
|
|
27
|
+
*/
|
|
28
|
+
export function intervalForRetry(connOptions: APIConnectOptions, numRetries: number): number {
|
|
29
|
+
if (numRetries === 0) {
|
|
30
|
+
return 0.1;
|
|
39
31
|
}
|
|
32
|
+
return connOptions.retryIntervalMs;
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
/**
|
|
36
|
+
* Connection options for the agent session, controlling retry and timeout behavior
|
|
37
|
+
* for STT, LLM, and TTS connections.
|
|
38
|
+
*/
|
|
39
|
+
export interface SessionConnectOptions {
|
|
40
|
+
/** Connection options for speech-to-text. */
|
|
41
|
+
sttConnOptions?: Partial<APIConnectOptions>;
|
|
42
|
+
/** Connection options for the language model. */
|
|
43
|
+
llmConnOptions?: Partial<APIConnectOptions>;
|
|
44
|
+
/** Connection options for text-to-speech. */
|
|
45
|
+
ttsConnOptions?: Partial<APIConnectOptions>;
|
|
46
|
+
/** Maximum number of consecutive unrecoverable errors from LLM or TTS before closing the session. Default: 3 */
|
|
47
|
+
maxUnrecoverableErrors?: number;
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
/**
|
|
51
|
+
* Resolved session connect options with all values populated.
|
|
52
|
+
* @internal
|
|
53
|
+
*/
|
|
54
|
+
export interface ResolvedSessionConnectOptions {
|
|
55
|
+
sttConnOptions: APIConnectOptions;
|
|
56
|
+
llmConnOptions: APIConnectOptions;
|
|
57
|
+
ttsConnOptions: APIConnectOptions;
|
|
58
|
+
maxUnrecoverableErrors: number;
|
|
40
59
|
}
|
|
41
60
|
|
|
42
|
-
export const
|
|
61
|
+
export const DEFAULT_SESSION_CONNECT_OPTIONS: ResolvedSessionConnectOptions = {
|
|
62
|
+
sttConnOptions: DEFAULT_API_CONNECT_OPTIONS,
|
|
63
|
+
llmConnOptions: DEFAULT_API_CONNECT_OPTIONS,
|
|
64
|
+
ttsConnOptions: DEFAULT_API_CONNECT_OPTIONS,
|
|
65
|
+
maxUnrecoverableErrors: 3,
|
|
66
|
+
};
|
package/src/voice/agent.ts
CHANGED
|
@@ -260,27 +260,41 @@ export class Agent<UserData = any> {
|
|
|
260
260
|
let wrapped_stt = activity.stt;
|
|
261
261
|
|
|
262
262
|
if (!wrapped_stt.capabilities.streaming) {
|
|
263
|
-
|
|
263
|
+
const vad = agent.vad || activity.vad;
|
|
264
|
+
if (!vad) {
|
|
264
265
|
throw new Error(
|
|
265
266
|
'STT does not support streaming, add a VAD to the AgentTask/VoiceAgent to enable streaming',
|
|
266
267
|
);
|
|
267
268
|
}
|
|
268
|
-
wrapped_stt = new STTStreamAdapter(wrapped_stt,
|
|
269
|
+
wrapped_stt = new STTStreamAdapter(wrapped_stt, vad);
|
|
269
270
|
}
|
|
270
271
|
|
|
271
|
-
const
|
|
272
|
+
const connOptions = activity.agentSession.connOptions.sttConnOptions;
|
|
273
|
+
const stream = wrapped_stt.stream({ connOptions });
|
|
272
274
|
stream.updateInputStream(audio);
|
|
273
275
|
|
|
276
|
+
let cleaned = false;
|
|
277
|
+
const cleanup = () => {
|
|
278
|
+
if (cleaned) return;
|
|
279
|
+
cleaned = true;
|
|
280
|
+
stream.detachInputStream();
|
|
281
|
+
stream.close();
|
|
282
|
+
};
|
|
283
|
+
|
|
274
284
|
return new ReadableStream({
|
|
275
285
|
async start(controller) {
|
|
276
|
-
|
|
277
|
-
|
|
286
|
+
try {
|
|
287
|
+
for await (const event of stream) {
|
|
288
|
+
controller.enqueue(event);
|
|
289
|
+
}
|
|
290
|
+
controller.close();
|
|
291
|
+
} finally {
|
|
292
|
+
// Always clean up the STT stream, whether it ends naturally or is cancelled
|
|
293
|
+
cleanup();
|
|
278
294
|
}
|
|
279
|
-
controller.close();
|
|
280
295
|
},
|
|
281
296
|
cancel() {
|
|
282
|
-
|
|
283
|
-
stream.close();
|
|
297
|
+
cleanup();
|
|
284
298
|
},
|
|
285
299
|
});
|
|
286
300
|
},
|
|
@@ -304,22 +318,36 @@ export class Agent<UserData = any> {
|
|
|
304
318
|
|
|
305
319
|
// TODO(brian): make parallelToolCalls configurable
|
|
306
320
|
const { toolChoice } = modelSettings;
|
|
321
|
+
const connOptions = activity.agentSession.connOptions.llmConnOptions;
|
|
307
322
|
|
|
308
323
|
const stream = activity.llm.chat({
|
|
309
324
|
chatCtx,
|
|
310
325
|
toolCtx,
|
|
311
326
|
toolChoice,
|
|
327
|
+
connOptions,
|
|
312
328
|
parallelToolCalls: true,
|
|
313
329
|
});
|
|
330
|
+
|
|
331
|
+
let cleaned = false;
|
|
332
|
+
const cleanup = () => {
|
|
333
|
+
if (cleaned) return;
|
|
334
|
+
cleaned = true;
|
|
335
|
+
stream.close();
|
|
336
|
+
};
|
|
337
|
+
|
|
314
338
|
return new ReadableStream({
|
|
315
339
|
async start(controller) {
|
|
316
|
-
|
|
317
|
-
|
|
340
|
+
try {
|
|
341
|
+
for await (const chunk of stream) {
|
|
342
|
+
controller.enqueue(chunk);
|
|
343
|
+
}
|
|
344
|
+
controller.close();
|
|
345
|
+
} finally {
|
|
346
|
+
cleanup();
|
|
318
347
|
}
|
|
319
|
-
controller.close();
|
|
320
348
|
},
|
|
321
349
|
cancel() {
|
|
322
|
-
|
|
350
|
+
cleanup();
|
|
323
351
|
},
|
|
324
352
|
});
|
|
325
353
|
},
|
|
@@ -340,21 +368,33 @@ export class Agent<UserData = any> {
|
|
|
340
368
|
wrapped_tts = new TTSStreamAdapter(wrapped_tts, new BasicSentenceTokenizer());
|
|
341
369
|
}
|
|
342
370
|
|
|
343
|
-
const
|
|
371
|
+
const connOptions = activity.agentSession.connOptions.ttsConnOptions;
|
|
372
|
+
const stream = wrapped_tts.stream({ connOptions });
|
|
344
373
|
stream.updateInputStream(text);
|
|
345
374
|
|
|
375
|
+
let cleaned = false;
|
|
376
|
+
const cleanup = () => {
|
|
377
|
+
if (cleaned) return;
|
|
378
|
+
cleaned = true;
|
|
379
|
+
stream.close();
|
|
380
|
+
};
|
|
381
|
+
|
|
346
382
|
return new ReadableStream({
|
|
347
383
|
async start(controller) {
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
384
|
+
try {
|
|
385
|
+
for await (const chunk of stream) {
|
|
386
|
+
if (chunk === SynthesizeStream.END_OF_STREAM) {
|
|
387
|
+
break;
|
|
388
|
+
}
|
|
389
|
+
controller.enqueue(chunk.frame);
|
|
351
390
|
}
|
|
352
|
-
controller.
|
|
391
|
+
controller.close();
|
|
392
|
+
} finally {
|
|
393
|
+
cleanup();
|
|
353
394
|
}
|
|
354
|
-
controller.close();
|
|
355
395
|
},
|
|
356
396
|
cancel() {
|
|
357
|
-
|
|
397
|
+
cleanup();
|
|
358
398
|
},
|
|
359
399
|
});
|
|
360
400
|
},
|
package/src/voice/agent_activity.ts
CHANGED
|
@@ -2259,15 +2259,12 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
2259
2259
|
}
|
|
2260
2260
|
if (this.stt instanceof STT) {
|
|
2261
2261
|
this.stt.off('metrics_collected', this.onMetricsCollected);
|
|
2262
|
-
await this.stt.close();
|
|
2263
2262
|
}
|
|
2264
2263
|
if (this.tts instanceof TTS) {
|
|
2265
2264
|
this.tts.off('metrics_collected', this.onMetricsCollected);
|
|
2266
|
-
await this.tts.close();
|
|
2267
2265
|
}
|
|
2268
2266
|
if (this.vad instanceof VAD) {
|
|
2269
2267
|
this.vad.off('metrics_collected', this.onMetricsCollected);
|
|
2270
|
-
await this.vad.close();
|
|
2271
2268
|
}
|
|
2272
2269
|
|
|
2273
2270
|
this.detachAudioInput();
|