@livekit/agents 1.0.24 → 1.0.27
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/inference/llm.cjs +1 -2
- package/dist/inference/llm.cjs.map +1 -1
- package/dist/inference/llm.d.ts.map +1 -1
- package/dist/inference/llm.js +1 -2
- package/dist/inference/llm.js.map +1 -1
- package/dist/inference/stt.cjs +1 -1
- package/dist/inference/stt.cjs.map +1 -1
- package/dist/inference/stt.d.ts.map +1 -1
- package/dist/inference/stt.js +1 -1
- package/dist/inference/stt.js.map +1 -1
- package/dist/inference/tts.cjs +4 -4
- package/dist/inference/tts.cjs.map +1 -1
- package/dist/inference/tts.d.cts +0 -1
- package/dist/inference/tts.d.ts +0 -1
- package/dist/inference/tts.d.ts.map +1 -1
- package/dist/inference/tts.js +4 -4
- package/dist/inference/tts.js.map +1 -1
- package/dist/ipc/job_proc_lazy_main.cjs +1 -1
- package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
- package/dist/ipc/job_proc_lazy_main.js +1 -1
- package/dist/ipc/job_proc_lazy_main.js.map +1 -1
- package/dist/job.cjs +29 -2
- package/dist/job.cjs.map +1 -1
- package/dist/job.d.cts +6 -0
- package/dist/job.d.ts +6 -0
- package/dist/job.d.ts.map +1 -1
- package/dist/job.js +19 -2
- package/dist/job.js.map +1 -1
- package/dist/llm/llm.cjs +2 -1
- package/dist/llm/llm.cjs.map +1 -1
- package/dist/llm/llm.d.cts +1 -1
- package/dist/llm/llm.d.ts +1 -1
- package/dist/llm/llm.d.ts.map +1 -1
- package/dist/llm/llm.js +2 -1
- package/dist/llm/llm.js.map +1 -1
- package/dist/stream/deferred_stream.cjs +12 -4
- package/dist/stream/deferred_stream.cjs.map +1 -1
- package/dist/stream/deferred_stream.d.cts +6 -1
- package/dist/stream/deferred_stream.d.ts +6 -1
- package/dist/stream/deferred_stream.d.ts.map +1 -1
- package/dist/stream/deferred_stream.js +12 -4
- package/dist/stream/deferred_stream.js.map +1 -1
- package/dist/stream/deferred_stream.test.cjs +2 -2
- package/dist/stream/deferred_stream.test.cjs.map +1 -1
- package/dist/stream/deferred_stream.test.js +2 -2
- package/dist/stream/deferred_stream.test.js.map +1 -1
- package/dist/stt/stream_adapter.cjs +15 -8
- package/dist/stt/stream_adapter.cjs.map +1 -1
- package/dist/stt/stream_adapter.d.cts +7 -3
- package/dist/stt/stream_adapter.d.ts +7 -3
- package/dist/stt/stream_adapter.d.ts.map +1 -1
- package/dist/stt/stream_adapter.js +15 -8
- package/dist/stt/stream_adapter.js.map +1 -1
- package/dist/stt/stt.cjs +8 -3
- package/dist/stt/stt.cjs.map +1 -1
- package/dist/stt/stt.d.cts +9 -3
- package/dist/stt/stt.d.ts +9 -3
- package/dist/stt/stt.d.ts.map +1 -1
- package/dist/stt/stt.js +9 -4
- package/dist/stt/stt.js.map +1 -1
- package/dist/telemetry/traces.cjs +23 -2
- package/dist/telemetry/traces.cjs.map +1 -1
- package/dist/telemetry/traces.d.ts.map +1 -1
- package/dist/telemetry/traces.js +23 -2
- package/dist/telemetry/traces.js.map +1 -1
- package/dist/tts/stream_adapter.cjs +10 -7
- package/dist/tts/stream_adapter.cjs.map +1 -1
- package/dist/tts/stream_adapter.d.cts +6 -3
- package/dist/tts/stream_adapter.d.ts +6 -3
- package/dist/tts/stream_adapter.d.ts.map +1 -1
- package/dist/tts/stream_adapter.js +10 -7
- package/dist/tts/stream_adapter.js.map +1 -1
- package/dist/tts/tts.cjs +27 -16
- package/dist/tts/tts.cjs.map +1 -1
- package/dist/tts/tts.d.cts +12 -5
- package/dist/tts/tts.d.ts +12 -5
- package/dist/tts/tts.d.ts.map +1 -1
- package/dist/tts/tts.js +28 -17
- package/dist/tts/tts.js.map +1 -1
- package/dist/types.cjs +21 -32
- package/dist/types.cjs.map +1 -1
- package/dist/types.d.cts +41 -10
- package/dist/types.d.ts +41 -10
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js +18 -30
- package/dist/types.js.map +1 -1
- package/dist/voice/agent.cjs +54 -19
- package/dist/voice/agent.cjs.map +1 -1
- package/dist/voice/agent.d.ts.map +1 -1
- package/dist/voice/agent.js +54 -19
- package/dist/voice/agent.js.map +1 -1
- package/dist/voice/agent_activity.cjs +0 -3
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +0 -3
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_session.cjs +107 -27
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +16 -2
- package/dist/voice/agent_session.d.ts +16 -2
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +110 -27
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/events.cjs.map +1 -1
- package/dist/voice/events.d.cts +4 -4
- package/dist/voice/events.d.ts +4 -4
- package/dist/voice/events.d.ts.map +1 -1
- package/dist/voice/events.js.map +1 -1
- package/dist/voice/generation.cjs +6 -7
- package/dist/voice/generation.cjs.map +1 -1
- package/dist/voice/generation.d.ts.map +1 -1
- package/dist/voice/generation.js +7 -8
- package/dist/voice/generation.js.map +1 -1
- package/dist/voice/io.cjs +16 -0
- package/dist/voice/io.cjs.map +1 -1
- package/dist/voice/io.d.cts +8 -0
- package/dist/voice/io.d.ts +8 -0
- package/dist/voice/io.d.ts.map +1 -1
- package/dist/voice/io.js +16 -0
- package/dist/voice/io.js.map +1 -1
- package/dist/voice/recorder_io/index.cjs +23 -0
- package/dist/voice/recorder_io/index.cjs.map +1 -0
- package/dist/voice/recorder_io/index.d.cts +2 -0
- package/dist/voice/recorder_io/index.d.ts +2 -0
- package/dist/voice/recorder_io/index.d.ts.map +1 -0
- package/dist/voice/recorder_io/index.js +2 -0
- package/dist/voice/recorder_io/index.js.map +1 -0
- package/dist/voice/recorder_io/recorder_io.cjs +542 -0
- package/dist/voice/recorder_io/recorder_io.cjs.map +1 -0
- package/dist/voice/recorder_io/recorder_io.d.cts +100 -0
- package/dist/voice/recorder_io/recorder_io.d.ts +100 -0
- package/dist/voice/recorder_io/recorder_io.d.ts.map +1 -0
- package/dist/voice/recorder_io/recorder_io.js +508 -0
- package/dist/voice/recorder_io/recorder_io.js.map +1 -0
- package/dist/voice/report.cjs +7 -2
- package/dist/voice/report.cjs.map +1 -1
- package/dist/voice/report.d.cts +11 -1
- package/dist/voice/report.d.ts +11 -1
- package/dist/voice/report.d.ts.map +1 -1
- package/dist/voice/report.js +7 -2
- package/dist/voice/report.js.map +1 -1
- package/dist/voice/room_io/_input.cjs +2 -1
- package/dist/voice/room_io/_input.cjs.map +1 -1
- package/dist/voice/room_io/_input.d.ts.map +1 -1
- package/dist/voice/room_io/_input.js +2 -1
- package/dist/voice/room_io/_input.js.map +1 -1
- package/dist/voice/room_io/_output.cjs +8 -7
- package/dist/voice/room_io/_output.cjs.map +1 -1
- package/dist/voice/room_io/_output.d.cts +2 -1
- package/dist/voice/room_io/_output.d.ts +2 -1
- package/dist/voice/room_io/_output.d.ts.map +1 -1
- package/dist/voice/room_io/_output.js +8 -7
- package/dist/voice/room_io/_output.js.map +1 -1
- package/dist/worker.cjs +4 -3
- package/dist/worker.cjs.map +1 -1
- package/dist/worker.js +4 -3
- package/dist/worker.js.map +1 -1
- package/package.json +1 -1
- package/src/inference/llm.ts +0 -1
- package/src/inference/stt.ts +1 -2
- package/src/inference/tts.ts +5 -4
- package/src/ipc/job_proc_lazy_main.ts +1 -1
- package/src/job.ts +21 -2
- package/src/llm/llm.ts +2 -2
- package/src/stream/deferred_stream.test.ts +3 -3
- package/src/stream/deferred_stream.ts +22 -5
- package/src/stt/stream_adapter.ts +18 -8
- package/src/stt/stt.ts +19 -6
- package/src/telemetry/traces.ts +25 -3
- package/src/tts/stream_adapter.ts +15 -7
- package/src/tts/tts.ts +46 -21
- package/src/types.ts +57 -33
- package/src/voice/agent.ts +59 -19
- package/src/voice/agent_activity.ts +0 -3
- package/src/voice/agent_session.ts +142 -35
- package/src/voice/events.ts +6 -3
- package/src/voice/generation.ts +10 -8
- package/src/voice/io.ts +19 -0
- package/src/voice/recorder_io/index.ts +4 -0
- package/src/voice/recorder_io/recorder_io.ts +690 -0
- package/src/voice/report.ts +20 -3
- package/src/voice/room_io/_input.ts +2 -1
- package/src/voice/room_io/_output.ts +10 -7
- package/src/worker.ts +1 -1
package/src/stream/deferred_stream.ts
CHANGED
|
@@ -9,15 +9,22 @@ import type {
|
|
|
9
9
|
import { IdentityTransform } from './identity_transform.js';
|
|
10
10
|
|
|
11
11
|
/**
|
|
12
|
-
* Check if error is related to
|
|
12
|
+
* Check if error is related to stream cleanup operations.
|
|
13
|
+
*
|
|
14
|
+
* These errors are expected when calling reader.read() after releaseLock()
|
|
15
|
+
* or when writing to already closed streams during cleanup:
|
|
13
16
|
*
|
|
14
17
|
* Invalid state: Releasing reader
|
|
15
18
|
* Invalid state: The reader is not attached to a stream
|
|
19
|
+
* Invalid state: Controller is already closed
|
|
20
|
+
* Invalid state: WritableStream is closed
|
|
16
21
|
*/
|
|
17
22
|
export function isStreamReaderReleaseError(e: unknown) {
|
|
18
23
|
const allowedMessages = [
|
|
19
24
|
'Invalid state: Releasing reader',
|
|
20
25
|
'Invalid state: The reader is not attached to a stream',
|
|
26
|
+
'Controller is already closed',
|
|
27
|
+
'WritableStream is closed',
|
|
21
28
|
];
|
|
22
29
|
|
|
23
30
|
if (e instanceof TypeError) {
|
|
@@ -66,18 +73,27 @@ export class DeferredReadableStream<T> {
|
|
|
66
73
|
await this.writer.write(value);
|
|
67
74
|
}
|
|
68
75
|
} catch (e) {
|
|
69
|
-
// skip
|
|
76
|
+
// skip stream cleanup related errors
|
|
70
77
|
if (isStreamReaderReleaseError(e)) return;
|
|
78
|
+
|
|
71
79
|
sourceError = e;
|
|
72
80
|
} finally {
|
|
73
81
|
// any other error from source will be propagated to the consumer
|
|
74
82
|
if (sourceError) {
|
|
75
|
-
|
|
83
|
+
try {
|
|
84
|
+
this.writer.abort(sourceError);
|
|
85
|
+
} catch (e) {
|
|
86
|
+
// ignore if writer is already closed
|
|
87
|
+
}
|
|
76
88
|
return;
|
|
77
89
|
}
|
|
78
90
|
|
|
79
91
|
// release lock so this.stream.getReader().read() will terminate with done: true
|
|
80
|
-
|
|
92
|
+
try {
|
|
93
|
+
this.writer.releaseLock();
|
|
94
|
+
} catch (e) {
|
|
95
|
+
// ignore if writer lock is already released
|
|
96
|
+
}
|
|
81
97
|
|
|
82
98
|
// we only close the writable stream after done
|
|
83
99
|
try {
|
|
@@ -98,7 +114,8 @@ export class DeferredReadableStream<T> {
|
|
|
98
114
|
*/
|
|
99
115
|
async detachSource() {
|
|
100
116
|
if (!this.isSourceSet) {
|
|
101
|
-
|
|
117
|
+
// No-op if source was never set - this is a common case during cleanup
|
|
118
|
+
return;
|
|
102
119
|
}
|
|
103
120
|
|
|
104
121
|
// release lock will make any pending read() throw TypeError
|
package/src/stt/stream_adapter.ts
CHANGED
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
// SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
import type { AudioFrame } from '@livekit/rtc-node';
|
|
5
5
|
import { log } from '../log.js';
|
|
6
|
+
import type { APIConnectOptions } from '../types.js';
|
|
6
7
|
import type { VAD, VADStream } from '../vad.js';
|
|
7
8
|
import { VADEventType } from '../vad.js';
|
|
8
9
|
import type { SpeechEvent } from './stt.js';
|
|
@@ -22,14 +23,18 @@ export class StreamAdapter extends STT {
|
|
|
22
23
|
this.#stt.on('metrics_collected', (metrics) => {
|
|
23
24
|
this.emit('metrics_collected', metrics);
|
|
24
25
|
});
|
|
26
|
+
|
|
27
|
+
this.#stt.on('error', (error) => {
|
|
28
|
+
this.emit('error', error);
|
|
29
|
+
});
|
|
25
30
|
}
|
|
26
31
|
|
|
27
|
-
_recognize(frame: AudioFrame): Promise<SpeechEvent> {
|
|
28
|
-
return this.#stt.recognize(frame);
|
|
32
|
+
_recognize(frame: AudioFrame, abortSignal?: AbortSignal): Promise<SpeechEvent> {
|
|
33
|
+
return this.#stt.recognize(frame, abortSignal);
|
|
29
34
|
}
|
|
30
35
|
|
|
31
|
-
stream(): StreamAdapterWrapper {
|
|
32
|
-
return new StreamAdapterWrapper(this.#stt, this.#vad);
|
|
36
|
+
stream(options?: { connOptions?: APIConnectOptions }): StreamAdapterWrapper {
|
|
37
|
+
return new StreamAdapterWrapper(this.#stt, this.#vad, options?.connOptions);
|
|
33
38
|
}
|
|
34
39
|
}
|
|
35
40
|
|
|
@@ -38,13 +43,18 @@ export class StreamAdapterWrapper extends SpeechStream {
|
|
|
38
43
|
#vadStream: VADStream;
|
|
39
44
|
label: string;
|
|
40
45
|
|
|
41
|
-
constructor(stt: STT, vad: VAD) {
|
|
42
|
-
super(stt);
|
|
46
|
+
constructor(stt: STT, vad: VAD, connOptions?: APIConnectOptions) {
|
|
47
|
+
super(stt, undefined, connOptions);
|
|
43
48
|
this.#stt = stt;
|
|
44
49
|
this.#vadStream = vad.stream();
|
|
45
50
|
this.label = `stt.StreamAdapterWrapper<${this.#stt.label}>`;
|
|
46
51
|
}
|
|
47
52
|
|
|
53
|
+
close() {
|
|
54
|
+
super.close();
|
|
55
|
+
this.#vadStream.close();
|
|
56
|
+
}
|
|
57
|
+
|
|
48
58
|
async monitorMetrics() {
|
|
49
59
|
return; // do nothing
|
|
50
60
|
}
|
|
@@ -71,7 +81,7 @@ export class StreamAdapterWrapper extends SpeechStream {
|
|
|
71
81
|
this.output.put({ type: SpeechEventType.END_OF_SPEECH });
|
|
72
82
|
|
|
73
83
|
try {
|
|
74
|
-
const event = await this.#stt.recognize(ev.frames);
|
|
84
|
+
const event = await this.#stt.recognize(ev.frames, this.abortSignal);
|
|
75
85
|
if (!event.alternatives![0].text) {
|
|
76
86
|
continue;
|
|
77
87
|
}
|
|
@@ -92,6 +102,6 @@ export class StreamAdapterWrapper extends SpeechStream {
|
|
|
92
102
|
}
|
|
93
103
|
};
|
|
94
104
|
|
|
95
|
-
Promise.all([forwardInput(), recognize()]);
|
|
105
|
+
await Promise.all([forwardInput(), recognize()]);
|
|
96
106
|
}
|
|
97
107
|
}
|
package/src/stt/stt.ts
CHANGED
|
@@ -10,7 +10,7 @@ import { calculateAudioDurationSeconds } from '../audio.js';
|
|
|
10
10
|
import { log } from '../log.js';
|
|
11
11
|
import type { STTMetrics } from '../metrics/base.js';
|
|
12
12
|
import { DeferredReadableStream } from '../stream/deferred_stream.js';
|
|
13
|
-
import { type APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS } from '../types.js';
|
|
13
|
+
import { type APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS, intervalForRetry } from '../types.js';
|
|
14
14
|
import type { AudioBuffer } from '../utils.js';
|
|
15
15
|
import { AsyncIterableQueue, delay, startSoon, toError } from '../utils.js';
|
|
16
16
|
|
|
@@ -113,9 +113,9 @@ export abstract class STT extends (EventEmitter as new () => TypedEmitter<STTCal
|
|
|
113
113
|
}
|
|
114
114
|
|
|
115
115
|
/** Receives an audio buffer and returns transcription in the form of a {@link SpeechEvent} */
|
|
116
|
-
async recognize(frame: AudioBuffer): Promise<SpeechEvent> {
|
|
116
|
+
async recognize(frame: AudioBuffer, abortSignal?: AbortSignal): Promise<SpeechEvent> {
|
|
117
117
|
const startTime = process.hrtime.bigint();
|
|
118
|
-
const event = await this._recognize(frame);
|
|
118
|
+
const event = await this._recognize(frame, abortSignal);
|
|
119
119
|
const durationMs = Number((process.hrtime.bigint() - startTime) / BigInt(1000000));
|
|
120
120
|
this.emit('metrics_collected', {
|
|
121
121
|
type: 'stt_metrics',
|
|
@@ -128,13 +128,19 @@ export abstract class STT extends (EventEmitter as new () => TypedEmitter<STTCal
|
|
|
128
128
|
});
|
|
129
129
|
return event;
|
|
130
130
|
}
|
|
131
|
-
|
|
131
|
+
|
|
132
|
+
protected abstract _recognize(
|
|
133
|
+
frame: AudioBuffer,
|
|
134
|
+
abortSignal?: AbortSignal,
|
|
135
|
+
): Promise<SpeechEvent>;
|
|
132
136
|
|
|
133
137
|
/**
|
|
134
138
|
* Returns a {@link SpeechStream} that can be used to push audio frames and receive
|
|
135
139
|
* transcriptions
|
|
140
|
+
*
|
|
141
|
+
* @param options - Optional configuration including connection options
|
|
136
142
|
*/
|
|
137
|
-
abstract stream(): SpeechStream;
|
|
143
|
+
abstract stream(options?: { connOptions?: APIConnectOptions }): SpeechStream;
|
|
138
144
|
|
|
139
145
|
async close(): Promise<void> {
|
|
140
146
|
return;
|
|
@@ -171,6 +177,8 @@ export abstract class SpeechStream implements AsyncIterableIterator<SpeechEvent>
|
|
|
171
177
|
private logger = log();
|
|
172
178
|
private _connOptions: APIConnectOptions;
|
|
173
179
|
|
|
180
|
+
protected abortController = new AbortController();
|
|
181
|
+
|
|
174
182
|
constructor(
|
|
175
183
|
stt: STT,
|
|
176
184
|
sampleRate?: number,
|
|
@@ -196,7 +204,7 @@ export abstract class SpeechStream implements AsyncIterableIterator<SpeechEvent>
|
|
|
196
204
|
return await this.run();
|
|
197
205
|
} catch (error) {
|
|
198
206
|
if (error instanceof APIError) {
|
|
199
|
-
const retryInterval = this._connOptions
|
|
207
|
+
const retryInterval = intervalForRetry(this._connOptions, i);
|
|
200
208
|
|
|
201
209
|
if (this._connOptions.maxRetry === 0 || !error.retryable) {
|
|
202
210
|
this.emitError({ error, recoverable: false });
|
|
@@ -288,6 +296,10 @@ export abstract class SpeechStream implements AsyncIterableIterator<SpeechEvent>
|
|
|
288
296
|
|
|
289
297
|
protected abstract run(): Promise<void>;
|
|
290
298
|
|
|
299
|
+
protected get abortSignal(): AbortSignal {
|
|
300
|
+
return this.abortController.signal;
|
|
301
|
+
}
|
|
302
|
+
|
|
291
303
|
updateInputStream(audioStream: ReadableStream<AudioFrame>) {
|
|
292
304
|
this.deferredInputStream.setSource(audioStream);
|
|
293
305
|
}
|
|
@@ -352,6 +364,7 @@ export abstract class SpeechStream implements AsyncIterableIterator<SpeechEvent>
|
|
|
352
364
|
if (!this.input.closed) this.input.close();
|
|
353
365
|
if (!this.queue.closed) this.queue.close();
|
|
354
366
|
if (!this.output.closed) this.output.close();
|
|
367
|
+
if (!this.abortController.signal.aborted) this.abortController.abort();
|
|
355
368
|
this.closed = true;
|
|
356
369
|
}
|
|
357
370
|
|
package/src/telemetry/traces.ts
CHANGED
|
@@ -21,6 +21,7 @@ import { BatchSpanProcessor, NodeTracerProvider } from '@opentelemetry/sdk-trace
|
|
|
21
21
|
import { ATTR_SERVICE_NAME } from '@opentelemetry/semantic-conventions';
|
|
22
22
|
import FormData from 'form-data';
|
|
23
23
|
import { AccessToken } from 'livekit-server-sdk';
|
|
24
|
+
import fs from 'node:fs/promises';
|
|
24
25
|
import type { ChatContent, ChatItem } from '../llm/index.js';
|
|
25
26
|
import { enableOtelLogging } from '../log.js';
|
|
26
27
|
import type { SessionReport } from '../voice/report.js';
|
|
@@ -497,12 +498,13 @@ export async function uploadSessionReport(options: {
|
|
|
497
498
|
const formData = new FormData();
|
|
498
499
|
|
|
499
500
|
// Add header (protobuf MetricsRecordingHeader)
|
|
501
|
+
const audioStartTime = report.audioRecordingStartedAt ?? 0;
|
|
500
502
|
const headerMsg = new MetricsRecordingHeader({
|
|
501
503
|
roomId: report.roomId,
|
|
502
504
|
duration: BigInt(0), // TODO: Calculate actual duration from report
|
|
503
505
|
startTime: {
|
|
504
|
-
seconds: BigInt(Math.floor(
|
|
505
|
-
nanos: Math.floor((
|
|
506
|
+
seconds: BigInt(Math.floor(audioStartTime / 1000)),
|
|
507
|
+
nanos: Math.floor((audioStartTime % 1000) * 1e6),
|
|
506
508
|
},
|
|
507
509
|
});
|
|
508
510
|
|
|
@@ -530,7 +532,27 @@ export async function uploadSessionReport(options: {
|
|
|
530
532
|
},
|
|
531
533
|
});
|
|
532
534
|
|
|
533
|
-
//
|
|
535
|
+
// Add audio recording file if available
|
|
536
|
+
if (report.audioRecordingPath && report.audioRecordingStartedAt) {
|
|
537
|
+
let audioBytes: Buffer;
|
|
538
|
+
try {
|
|
539
|
+
audioBytes = await fs.readFile(report.audioRecordingPath);
|
|
540
|
+
} catch {
|
|
541
|
+
audioBytes = Buffer.alloc(0);
|
|
542
|
+
}
|
|
543
|
+
|
|
544
|
+
if (audioBytes.length > 0) {
|
|
545
|
+
formData.append('audio', audioBytes, {
|
|
546
|
+
filename: 'recording.ogg',
|
|
547
|
+
contentType: 'audio/ogg',
|
|
548
|
+
knownLength: audioBytes.length,
|
|
549
|
+
header: {
|
|
550
|
+
'Content-Type': 'audio/ogg',
|
|
551
|
+
'Content-Length': audioBytes.length.toString(),
|
|
552
|
+
},
|
|
553
|
+
});
|
|
554
|
+
}
|
|
555
|
+
}
|
|
534
556
|
|
|
535
557
|
// Upload to LiveKit Cloud using form-data's submit method
|
|
536
558
|
// This properly streams the multipart form with all headers including Content-Length
|
package/src/tts/stream_adapter.ts
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
//
|
|
3
3
|
// SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
import type { SentenceStream, SentenceTokenizer } from '../tokenize/index.js';
|
|
5
|
+
import type { APIConnectOptions } from '../types.js';
|
|
5
6
|
import { Task } from '../utils.js';
|
|
6
7
|
import type { ChunkedStream } from './tts.js';
|
|
7
8
|
import { SynthesizeStream, TTS } from './tts.js';
|
|
@@ -21,14 +22,21 @@ export class StreamAdapter extends TTS {
|
|
|
21
22
|
this.#tts.on('metrics_collected', (metrics) => {
|
|
22
23
|
this.emit('metrics_collected', metrics);
|
|
23
24
|
});
|
|
25
|
+
this.#tts.on('error', (error) => {
|
|
26
|
+
this.emit('error', error);
|
|
27
|
+
});
|
|
24
28
|
}
|
|
25
29
|
|
|
26
|
-
synthesize(
|
|
27
|
-
|
|
30
|
+
synthesize(
|
|
31
|
+
text: string,
|
|
32
|
+
connOptions?: APIConnectOptions,
|
|
33
|
+
abortSignal?: AbortSignal,
|
|
34
|
+
): ChunkedStream {
|
|
35
|
+
return this.#tts.synthesize(text, connOptions, abortSignal);
|
|
28
36
|
}
|
|
29
37
|
|
|
30
|
-
stream(): StreamAdapterWrapper {
|
|
31
|
-
return new StreamAdapterWrapper(this.#tts, this.#sentenceTokenizer);
|
|
38
|
+
stream(options?: { connOptions?: APIConnectOptions }): StreamAdapterWrapper {
|
|
39
|
+
return new StreamAdapterWrapper(this.#tts, this.#sentenceTokenizer, options?.connOptions);
|
|
32
40
|
}
|
|
33
41
|
}
|
|
34
42
|
|
|
@@ -37,8 +45,8 @@ export class StreamAdapterWrapper extends SynthesizeStream {
|
|
|
37
45
|
#sentenceStream: SentenceStream;
|
|
38
46
|
label: string;
|
|
39
47
|
|
|
40
|
-
constructor(tts: TTS, sentenceTokenizer: SentenceTokenizer) {
|
|
41
|
-
super(tts);
|
|
48
|
+
constructor(tts: TTS, sentenceTokenizer: SentenceTokenizer, connOptions?: APIConnectOptions) {
|
|
49
|
+
super(tts, connOptions);
|
|
42
50
|
this.#tts = tts;
|
|
43
51
|
this.#sentenceStream = sentenceTokenizer.stream();
|
|
44
52
|
this.label = `tts.StreamAdapterWrapper<${this.#tts.label}>`;
|
|
@@ -84,7 +92,7 @@ export class StreamAdapterWrapper extends SynthesizeStream {
|
|
|
84
92
|
prevTask: Task<void> | undefined,
|
|
85
93
|
controller: AbortController,
|
|
86
94
|
) => {
|
|
87
|
-
const audioStream = this.#tts.synthesize(token);
|
|
95
|
+
const audioStream = this.#tts.synthesize(token, this.connOptions, this.abortSignal);
|
|
88
96
|
|
|
89
97
|
// wait for previous audio transcription to complete before starting
|
|
90
98
|
// to queuing audio frames of the current token
|
package/src/tts/tts.ts
CHANGED
|
@@ -11,7 +11,7 @@ import { log } from '../log.js';
|
|
|
11
11
|
import type { TTSMetrics } from '../metrics/base.js';
|
|
12
12
|
import { DeferredReadableStream } from '../stream/deferred_stream.js';
|
|
13
13
|
import { recordException, traceTypes, tracer } from '../telemetry/index.js';
|
|
14
|
-
import { type APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS } from '../types.js';
|
|
14
|
+
import { type APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS, intervalForRetry } from '../types.js';
|
|
15
15
|
import { AsyncIterableQueue, delay, mergeFrames, startSoon, toError } from '../utils.js';
|
|
16
16
|
|
|
17
17
|
/** SynthesizedAudio is a packet of speech synthesis as returned by the TTS. */
|
|
@@ -90,12 +90,18 @@ export abstract class TTS extends (EventEmitter as new () => TypedEmitter<TTSCal
|
|
|
90
90
|
/**
|
|
91
91
|
* Receives text and returns synthesis in the form of a {@link ChunkedStream}
|
|
92
92
|
*/
|
|
93
|
-
abstract synthesize(
|
|
93
|
+
abstract synthesize(
|
|
94
|
+
text: string,
|
|
95
|
+
connOptions?: APIConnectOptions,
|
|
96
|
+
abortSignal?: AbortSignal,
|
|
97
|
+
): ChunkedStream;
|
|
94
98
|
|
|
95
99
|
/**
|
|
96
100
|
* Returns a {@link SynthesizeStream} that can be used to push text and receive audio data
|
|
101
|
+
*
|
|
102
|
+
* @param options - Optional configuration including connection options
|
|
97
103
|
*/
|
|
98
|
-
abstract stream(): SynthesizeStream;
|
|
104
|
+
abstract stream(options?: { connOptions?: APIConnectOptions }): SynthesizeStream;
|
|
99
105
|
|
|
100
106
|
async close(): Promise<void> {
|
|
101
107
|
return;
|
|
@@ -129,30 +135,33 @@ export abstract class SynthesizeStream
|
|
|
129
135
|
SynthesizedAudio | typeof SynthesizeStream.END_OF_STREAM
|
|
130
136
|
>();
|
|
131
137
|
protected closed = false;
|
|
132
|
-
|
|
133
|
-
#tts: TTS;
|
|
134
|
-
#metricsPendingTexts: string[] = [];
|
|
135
|
-
#metricsText = '';
|
|
136
|
-
#monitorMetricsTask?: Promise<void>;
|
|
137
|
-
private _connOptions: APIConnectOptions;
|
|
138
|
+
protected connOptions: APIConnectOptions;
|
|
138
139
|
protected abortController = new AbortController();
|
|
139
|
-
#ttsRequestSpan?: Span;
|
|
140
140
|
|
|
141
141
|
private deferredInputStream: DeferredReadableStream<
|
|
142
142
|
string | typeof SynthesizeStream.FLUSH_SENTINEL
|
|
143
143
|
>;
|
|
144
144
|
private logger = log();
|
|
145
145
|
|
|
146
|
+
abstract label: string;
|
|
147
|
+
|
|
148
|
+
#tts: TTS;
|
|
149
|
+
#metricsPendingTexts: string[] = [];
|
|
150
|
+
#metricsText = '';
|
|
151
|
+
#monitorMetricsTask?: Promise<void>;
|
|
152
|
+
#ttsRequestSpan?: Span;
|
|
153
|
+
|
|
146
154
|
constructor(tts: TTS, connOptions: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS) {
|
|
147
155
|
this.#tts = tts;
|
|
148
|
-
this.
|
|
156
|
+
this.connOptions = connOptions;
|
|
149
157
|
this.deferredInputStream = new DeferredReadableStream();
|
|
150
158
|
this.pumpInput();
|
|
159
|
+
|
|
151
160
|
this.abortController.signal.addEventListener('abort', () => {
|
|
152
161
|
this.deferredInputStream.detachSource();
|
|
153
162
|
// TODO (AJS-36) clean this up when we refactor with streams
|
|
154
|
-
this.input.close();
|
|
155
|
-
this.output.close();
|
|
163
|
+
if (!this.input.closed) this.input.close();
|
|
164
|
+
if (!this.output.closed) this.output.close();
|
|
156
165
|
this.closed = true;
|
|
157
166
|
});
|
|
158
167
|
|
|
@@ -170,7 +179,7 @@ export abstract class SynthesizeStream
|
|
|
170
179
|
[traceTypes.ATTR_TTS_LABEL]: this.#tts.label,
|
|
171
180
|
});
|
|
172
181
|
|
|
173
|
-
for (let i = 0; i < this.
|
|
182
|
+
for (let i = 0; i < this.connOptions.maxRetry + 1; i++) {
|
|
174
183
|
try {
|
|
175
184
|
return await tracer.startActiveSpan(
|
|
176
185
|
async (attemptSpan) => {
|
|
@@ -186,15 +195,15 @@ export abstract class SynthesizeStream
|
|
|
186
195
|
);
|
|
187
196
|
} catch (error) {
|
|
188
197
|
if (error instanceof APIError) {
|
|
189
|
-
const retryInterval = this.
|
|
198
|
+
const retryInterval = intervalForRetry(this.connOptions, i);
|
|
190
199
|
|
|
191
|
-
if (this.
|
|
200
|
+
if (this.connOptions.maxRetry === 0 || !error.retryable) {
|
|
192
201
|
this.emitError({ error, recoverable: false });
|
|
193
202
|
throw error;
|
|
194
|
-
} else if (i === this.
|
|
203
|
+
} else if (i === this.connOptions.maxRetry) {
|
|
195
204
|
this.emitError({ error, recoverable: false });
|
|
196
205
|
throw new APIConnectionError({
|
|
197
|
-
message: `failed to generate TTS completion after ${this.
|
|
206
|
+
message: `failed to generate TTS completion after ${this.connOptions.maxRetry + 1} attempts`,
|
|
198
207
|
options: { retryable: false },
|
|
199
208
|
});
|
|
200
209
|
} else {
|
|
@@ -378,6 +387,10 @@ export abstract class SynthesizeStream
|
|
|
378
387
|
return this.output.next();
|
|
379
388
|
}
|
|
380
389
|
|
|
390
|
+
get abortSignal(): AbortSignal {
|
|
391
|
+
return this.abortController.signal;
|
|
392
|
+
}
|
|
393
|
+
|
|
381
394
|
/** Close both the input and output of the TTS stream */
|
|
382
395
|
close() {
|
|
383
396
|
this.abortController.abort();
|
|
@@ -413,15 +426,22 @@ export abstract class ChunkedStream implements AsyncIterableIterator<Synthesized
|
|
|
413
426
|
private _connOptions: APIConnectOptions;
|
|
414
427
|
private logger = log();
|
|
415
428
|
|
|
429
|
+
protected abortController = new AbortController();
|
|
430
|
+
|
|
416
431
|
constructor(
|
|
417
432
|
text: string,
|
|
418
433
|
tts: TTS,
|
|
419
434
|
connOptions: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
|
|
435
|
+
abortSignal?: AbortSignal,
|
|
420
436
|
) {
|
|
421
437
|
this.#text = text;
|
|
422
438
|
this.#tts = tts;
|
|
423
439
|
this._connOptions = connOptions;
|
|
424
440
|
|
|
441
|
+
if (abortSignal) {
|
|
442
|
+
abortSignal.addEventListener('abort', () => this.abortController.abort(), { once: true });
|
|
443
|
+
}
|
|
444
|
+
|
|
425
445
|
this.monitorMetrics();
|
|
426
446
|
|
|
427
447
|
// this is a hack to immitate asyncio.create_task so that mainTask
|
|
@@ -454,7 +474,7 @@ export abstract class ChunkedStream implements AsyncIterableIterator<Synthesized
|
|
|
454
474
|
);
|
|
455
475
|
} catch (error) {
|
|
456
476
|
if (error instanceof APIError) {
|
|
457
|
-
const retryInterval = this._connOptions
|
|
477
|
+
const retryInterval = intervalForRetry(this._connOptions, i);
|
|
458
478
|
|
|
459
479
|
if (this._connOptions.maxRetry === 0 || !error.retryable) {
|
|
460
480
|
this.emitError({ error, recoverable: false });
|
|
@@ -508,6 +528,10 @@ export abstract class ChunkedStream implements AsyncIterableIterator<Synthesized
|
|
|
508
528
|
return this.#text;
|
|
509
529
|
}
|
|
510
530
|
|
|
531
|
+
get abortSignal(): AbortSignal {
|
|
532
|
+
return this.abortController.signal;
|
|
533
|
+
}
|
|
534
|
+
|
|
511
535
|
protected async monitorMetrics() {
|
|
512
536
|
const startTime = process.hrtime.bigint();
|
|
513
537
|
let audioDurationMs = 0;
|
|
@@ -562,8 +586,9 @@ export abstract class ChunkedStream implements AsyncIterableIterator<Synthesized
|
|
|
562
586
|
|
|
563
587
|
/** Close both the input and output of the TTS stream */
|
|
564
588
|
close() {
|
|
565
|
-
this.queue.close();
|
|
566
|
-
this.output.close();
|
|
589
|
+
if (!this.queue.closed) this.queue.close();
|
|
590
|
+
if (!this.output.closed) this.output.close();
|
|
591
|
+
if (!this.abortController.signal.aborted) this.abortController.abort();
|
|
567
592
|
this.closed = true;
|
|
568
593
|
}
|
|
569
594
|
|
package/src/types.ts
CHANGED
|
@@ -1,42 +1,66 @@
|
|
|
1
1
|
// SPDX-FileCopyrightText: 2025 LiveKit, Inc.
|
|
2
2
|
//
|
|
3
3
|
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
-
export class APIConnectOptions {
|
|
5
|
-
/** Maximum number of retries to connect to the API. */
|
|
6
|
-
readonly maxRetry: number;
|
|
7
|
-
/** Interval between retries to connect to the API in milliseconds. */
|
|
8
|
-
readonly retryIntervalMs: number;
|
|
9
|
-
/** Timeout for connecting to the API in milliseconds. */
|
|
10
|
-
readonly timeoutMs: number;
|
|
11
4
|
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
5
|
+
/**
|
|
6
|
+
* Connection options for API calls, controlling retry and timeout behavior.
|
|
7
|
+
*/
|
|
8
|
+
export interface APIConnectOptions {
|
|
9
|
+
/** Maximum number of retries to connect to the API. Default: 3 */
|
|
10
|
+
maxRetry: number;
|
|
11
|
+
/** Interval between retries to connect to the API in milliseconds. Default: 2000 */
|
|
12
|
+
retryIntervalMs: number;
|
|
13
|
+
/** Timeout for connecting to the API in milliseconds. Default: 10000 */
|
|
14
|
+
timeoutMs: number;
|
|
15
|
+
}
|
|
16
16
|
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
}
|
|
23
|
-
if (this.timeoutMs < 0) {
|
|
24
|
-
throw new Error('timeoutMs must be greater than or equal to 0');
|
|
25
|
-
}
|
|
26
|
-
}
|
|
17
|
+
export const DEFAULT_API_CONNECT_OPTIONS: APIConnectOptions = {
|
|
18
|
+
maxRetry: 3,
|
|
19
|
+
retryIntervalMs: 2000,
|
|
20
|
+
timeoutMs: 10000,
|
|
21
|
+
};
|
|
27
22
|
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
return 0.1;
|
|
37
|
-
}
|
|
38
|
-
return this.retryIntervalMs;
|
|
23
|
+
/**
|
|
24
|
+
* Return the interval for the given number of retries.
|
|
25
|
+
* The first retry is immediate, and then uses specified retryIntervalMs.
|
|
26
|
+
* @internal
|
|
27
|
+
*/
|
|
28
|
+
export function intervalForRetry(connOptions: APIConnectOptions, numRetries: number): number {
|
|
29
|
+
if (numRetries === 0) {
|
|
30
|
+
return 0.1;
|
|
39
31
|
}
|
|
32
|
+
return connOptions.retryIntervalMs;
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
/**
|
|
36
|
+
* Connection options for the agent session, controlling retry and timeout behavior
|
|
37
|
+
* for STT, LLM, and TTS connections.
|
|
38
|
+
*/
|
|
39
|
+
export interface SessionConnectOptions {
|
|
40
|
+
/** Connection options for speech-to-text. */
|
|
41
|
+
sttConnOptions?: Partial<APIConnectOptions>;
|
|
42
|
+
/** Connection options for the language model. */
|
|
43
|
+
llmConnOptions?: Partial<APIConnectOptions>;
|
|
44
|
+
/** Connection options for text-to-speech. */
|
|
45
|
+
ttsConnOptions?: Partial<APIConnectOptions>;
|
|
46
|
+
/** Maximum number of consecutive unrecoverable errors from LLM or TTS before closing the session. Default: 3 */
|
|
47
|
+
maxUnrecoverableErrors?: number;
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
/**
|
|
51
|
+
* Resolved session connect options with all values populated.
|
|
52
|
+
* @internal
|
|
53
|
+
*/
|
|
54
|
+
export interface ResolvedSessionConnectOptions {
|
|
55
|
+
sttConnOptions: APIConnectOptions;
|
|
56
|
+
llmConnOptions: APIConnectOptions;
|
|
57
|
+
ttsConnOptions: APIConnectOptions;
|
|
58
|
+
maxUnrecoverableErrors: number;
|
|
40
59
|
}
|
|
41
60
|
|
|
42
|
-
export const
|
|
61
|
+
export const DEFAULT_SESSION_CONNECT_OPTIONS: ResolvedSessionConnectOptions = {
|
|
62
|
+
sttConnOptions: DEFAULT_API_CONNECT_OPTIONS,
|
|
63
|
+
llmConnOptions: DEFAULT_API_CONNECT_OPTIONS,
|
|
64
|
+
ttsConnOptions: DEFAULT_API_CONNECT_OPTIONS,
|
|
65
|
+
maxUnrecoverableErrors: 3,
|
|
66
|
+
};
|