@livekit/agents 1.0.37 → 1.0.39
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.cjs.map +1 -1
- package/dist/inference/api_protos.cjs +68 -0
- package/dist/inference/api_protos.cjs.map +1 -1
- package/dist/inference/api_protos.d.cts +345 -4
- package/dist/inference/api_protos.d.ts +345 -4
- package/dist/inference/api_protos.d.ts.map +1 -1
- package/dist/inference/api_protos.js +60 -0
- package/dist/inference/api_protos.js.map +1 -1
- package/dist/inference/llm.cjs +7 -3
- package/dist/inference/llm.cjs.map +1 -1
- package/dist/inference/llm.d.cts +5 -6
- package/dist/inference/llm.d.ts +5 -6
- package/dist/inference/llm.d.ts.map +1 -1
- package/dist/inference/llm.js +7 -3
- package/dist/inference/llm.js.map +1 -1
- package/dist/inference/stt.cjs +32 -21
- package/dist/inference/stt.cjs.map +1 -1
- package/dist/inference/stt.d.cts +5 -4
- package/dist/inference/stt.d.ts +5 -4
- package/dist/inference/stt.d.ts.map +1 -1
- package/dist/inference/stt.js +34 -21
- package/dist/inference/stt.js.map +1 -1
- package/dist/inference/tts.cjs.map +1 -1
- package/dist/inference/tts.d.cts +10 -7
- package/dist/inference/tts.d.ts +10 -7
- package/dist/inference/tts.d.ts.map +1 -1
- package/dist/inference/tts.js.map +1 -1
- package/dist/ipc/inference_proc_executor.cjs.map +1 -1
- package/dist/ipc/job_proc_executor.cjs.map +1 -1
- package/dist/stt/stream_adapter.cjs +9 -1
- package/dist/stt/stream_adapter.cjs.map +1 -1
- package/dist/stt/stream_adapter.d.ts.map +1 -1
- package/dist/stt/stream_adapter.js +9 -1
- package/dist/stt/stream_adapter.js.map +1 -1
- package/dist/stt/stt.cjs +10 -0
- package/dist/stt/stt.cjs.map +1 -1
- package/dist/stt/stt.d.cts +12 -0
- package/dist/stt/stt.d.ts +12 -0
- package/dist/stt/stt.d.ts.map +1 -1
- package/dist/stt/stt.js +10 -0
- package/dist/stt/stt.js.map +1 -1
- package/dist/telemetry/traces.cjs +4 -3
- package/dist/telemetry/traces.cjs.map +1 -1
- package/dist/telemetry/traces.d.cts +2 -0
- package/dist/telemetry/traces.d.ts +2 -0
- package/dist/telemetry/traces.d.ts.map +1 -1
- package/dist/telemetry/traces.js +4 -3
- package/dist/telemetry/traces.js.map +1 -1
- package/dist/utils.cjs +11 -0
- package/dist/utils.cjs.map +1 -1
- package/dist/utils.d.cts +10 -0
- package/dist/utils.d.ts +10 -0
- package/dist/utils.d.ts.map +1 -1
- package/dist/utils.js +10 -0
- package/dist/utils.js.map +1 -1
- package/dist/voice/agent.cjs +6 -2
- package/dist/voice/agent.cjs.map +1 -1
- package/dist/voice/agent.d.ts.map +1 -1
- package/dist/voice/agent.js +6 -2
- package/dist/voice/agent.js.map +1 -1
- package/dist/voice/agent_activity.cjs +72 -37
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.cts +2 -1
- package/dist/voice/agent_activity.d.ts +2 -1
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +73 -38
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_session.cjs +7 -5
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +5 -2
- package/dist/voice/agent_session.d.ts +5 -2
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +7 -5
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/audio_recognition.cjs +3 -1
- package/dist/voice/audio_recognition.cjs.map +1 -1
- package/dist/voice/audio_recognition.d.ts.map +1 -1
- package/dist/voice/audio_recognition.js +3 -1
- package/dist/voice/audio_recognition.js.map +1 -1
- package/dist/voice/avatar/datastream_io.cjs +6 -0
- package/dist/voice/avatar/datastream_io.cjs.map +1 -1
- package/dist/voice/avatar/datastream_io.d.cts +1 -0
- package/dist/voice/avatar/datastream_io.d.ts +1 -0
- package/dist/voice/avatar/datastream_io.d.ts.map +1 -1
- package/dist/voice/avatar/datastream_io.js +6 -0
- package/dist/voice/avatar/datastream_io.js.map +1 -1
- package/dist/voice/background_audio.cjs.map +1 -1
- package/dist/voice/generation.cjs +14 -5
- package/dist/voice/generation.cjs.map +1 -1
- package/dist/voice/generation.d.cts +3 -2
- package/dist/voice/generation.d.ts +3 -2
- package/dist/voice/generation.d.ts.map +1 -1
- package/dist/voice/generation.js +14 -5
- package/dist/voice/generation.js.map +1 -1
- package/dist/voice/io.cjs +12 -0
- package/dist/voice/io.cjs.map +1 -1
- package/dist/voice/io.d.cts +19 -1
- package/dist/voice/io.d.ts +19 -1
- package/dist/voice/io.d.ts.map +1 -1
- package/dist/voice/io.js +12 -0
- package/dist/voice/io.js.map +1 -1
- package/dist/voice/recorder_io/recorder_io.cjs +91 -28
- package/dist/voice/recorder_io/recorder_io.cjs.map +1 -1
- package/dist/voice/recorder_io/recorder_io.d.cts +7 -1
- package/dist/voice/recorder_io/recorder_io.d.ts +7 -1
- package/dist/voice/recorder_io/recorder_io.d.ts.map +1 -1
- package/dist/voice/recorder_io/recorder_io.js +91 -28
- package/dist/voice/recorder_io/recorder_io.js.map +1 -1
- package/dist/voice/room_io/_input.cjs +40 -11
- package/dist/voice/room_io/_input.cjs.map +1 -1
- package/dist/voice/room_io/_input.d.cts +4 -1
- package/dist/voice/room_io/_input.d.ts +4 -1
- package/dist/voice/room_io/_input.d.ts.map +1 -1
- package/dist/voice/room_io/_input.js +31 -2
- package/dist/voice/room_io/_input.js.map +1 -1
- package/dist/voice/room_io/_output.cjs +6 -0
- package/dist/voice/room_io/_output.cjs.map +1 -1
- package/dist/voice/room_io/_output.d.cts +1 -0
- package/dist/voice/room_io/_output.d.ts +1 -0
- package/dist/voice/room_io/_output.d.ts.map +1 -1
- package/dist/voice/room_io/_output.js +6 -0
- package/dist/voice/room_io/_output.js.map +1 -1
- package/dist/voice/room_io/room_io.cjs.map +1 -1
- package/dist/voice/room_io/room_io.d.cts +2 -2
- package/dist/voice/room_io/room_io.d.ts +2 -2
- package/dist/voice/room_io/room_io.d.ts.map +1 -1
- package/dist/voice/room_io/room_io.js.map +1 -1
- package/dist/voice/speech_handle.cjs +2 -0
- package/dist/voice/speech_handle.cjs.map +1 -1
- package/dist/voice/speech_handle.d.cts +3 -0
- package/dist/voice/speech_handle.d.ts +3 -0
- package/dist/voice/speech_handle.d.ts.map +1 -1
- package/dist/voice/speech_handle.js +2 -0
- package/dist/voice/speech_handle.js.map +1 -1
- package/package.json +2 -2
- package/src/inference/api_protos.ts +83 -0
- package/src/inference/llm.ts +20 -15
- package/src/inference/stt.ts +48 -29
- package/src/inference/tts.ts +36 -16
- package/src/stt/stream_adapter.ts +12 -1
- package/src/stt/stt.ts +21 -0
- package/src/telemetry/traces.ts +6 -2
- package/src/utils.ts +21 -0
- package/src/voice/agent.ts +11 -2
- package/src/voice/agent_activity.ts +108 -41
- package/src/voice/agent_session.ts +6 -5
- package/src/voice/audio_recognition.ts +2 -0
- package/src/voice/avatar/datastream_io.ts +8 -0
- package/src/voice/generation.ts +24 -12
- package/src/voice/io.ts +27 -5
- package/src/voice/recorder_io/recorder_io.ts +123 -31
- package/src/voice/room_io/_input.ts +32 -4
- package/src/voice/room_io/_output.ts +8 -0
- package/src/voice/room_io/room_io.ts +3 -1
- package/src/voice/speech_handle.ts +4 -0
package/src/inference/stt.ts
CHANGED
|
@@ -16,22 +16,30 @@ import {
|
|
|
16
16
|
} from '../stt/index.js';
|
|
17
17
|
import { type APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS } from '../types.js';
|
|
18
18
|
import { type AudioBuffer, Event, Task, cancelAndWait, shortuuid, waitForAbort } from '../utils.js';
|
|
19
|
+
import type { TimedString } from '../voice/io.js';
|
|
20
|
+
import {
|
|
21
|
+
type SttServerEvent,
|
|
22
|
+
type SttTranscriptEvent,
|
|
23
|
+
sttServerEventSchema,
|
|
24
|
+
} from './api_protos.js';
|
|
19
25
|
import { type AnyString, connectWs, createAccessToken } from './utils.js';
|
|
20
26
|
|
|
21
27
|
export type DeepgramModels =
|
|
22
|
-
| 'deepgram'
|
|
28
|
+
| 'deepgram/flux-general'
|
|
23
29
|
| 'deepgram/nova-3'
|
|
24
|
-
| 'deepgram/nova-3-general'
|
|
25
30
|
| 'deepgram/nova-3-medical'
|
|
26
|
-
| 'deepgram/nova-2-conversationalai'
|
|
27
31
|
| 'deepgram/nova-2'
|
|
28
|
-
| 'deepgram/nova-2-general'
|
|
29
32
|
| 'deepgram/nova-2-medical'
|
|
33
|
+
| 'deepgram/nova-2-conversationalai'
|
|
30
34
|
| 'deepgram/nova-2-phonecall';
|
|
31
35
|
|
|
32
|
-
export type CartesiaModels = 'cartesia
|
|
36
|
+
export type CartesiaModels = 'cartesia/ink-whisper';
|
|
37
|
+
|
|
38
|
+
export type AssemblyaiModels =
|
|
39
|
+
| 'assemblyai/universal-streaming'
|
|
40
|
+
| 'assemblyai/universal-streaming-multilingual';
|
|
33
41
|
|
|
34
|
-
export type
|
|
42
|
+
export type ElevenlabsSTTModels = 'elevenlabs/scribe_v2_realtime';
|
|
35
43
|
|
|
36
44
|
export interface CartesiaOptions {
|
|
37
45
|
min_volume?: number; // default: not specified
|
|
@@ -71,7 +79,7 @@ export type STTLanguages =
|
|
|
71
79
|
| 'hi'
|
|
72
80
|
| AnyString;
|
|
73
81
|
|
|
74
|
-
type _STTModels = DeepgramModels | CartesiaModels | AssemblyaiModels;
|
|
82
|
+
type _STTModels = DeepgramModels | CartesiaModels | AssemblyaiModels | ElevenlabsSTTModels;
|
|
75
83
|
|
|
76
84
|
export type STTModels = _STTModels | 'auto' | AnyString;
|
|
77
85
|
|
|
@@ -122,7 +130,7 @@ export class STT<TModel extends STTModels> extends BaseSTT {
|
|
|
122
130
|
apiSecret?: string;
|
|
123
131
|
modelOptions?: STTOptions<TModel>;
|
|
124
132
|
}) {
|
|
125
|
-
super({ streaming: true, interimResults: true });
|
|
133
|
+
super({ streaming: true, interimResults: true, alignedTranscript: 'word' });
|
|
126
134
|
|
|
127
135
|
const {
|
|
128
136
|
model,
|
|
@@ -271,7 +279,6 @@ export class SpeechStream<TModel extends STTModels> extends BaseSpeechStream {
|
|
|
271
279
|
let closing = false;
|
|
272
280
|
let finalReceived = false;
|
|
273
281
|
|
|
274
|
-
type SttServerEvent = Record<string, any>;
|
|
275
282
|
const eventChannel = createStreamChannel<SttServerEvent>();
|
|
276
283
|
|
|
277
284
|
const resourceCleanup = () => {
|
|
@@ -380,10 +387,19 @@ export class SpeechStream<TModel extends STTModels> extends BaseSpeechStream {
|
|
|
380
387
|
if (signal.aborted) return;
|
|
381
388
|
if (result.done) return;
|
|
382
389
|
|
|
383
|
-
|
|
384
|
-
const
|
|
390
|
+
// Parse and validate with Zod schema
|
|
391
|
+
const parseResult = await sttServerEventSchema.safeParseAsync(result.value);
|
|
392
|
+
if (!parseResult.success) {
|
|
393
|
+
this.#logger.warn(
|
|
394
|
+
{ error: parseResult.error, rawData: result.value },
|
|
395
|
+
'Failed to parse STT server event',
|
|
396
|
+
);
|
|
397
|
+
continue;
|
|
398
|
+
}
|
|
399
|
+
|
|
400
|
+
const event: SttServerEvent = parseResult.data;
|
|
385
401
|
|
|
386
|
-
switch (type) {
|
|
402
|
+
switch (event.type) {
|
|
387
403
|
case 'session.created':
|
|
388
404
|
case 'session.finalized':
|
|
389
405
|
break;
|
|
@@ -392,21 +408,15 @@ export class SpeechStream<TModel extends STTModels> extends BaseSpeechStream {
|
|
|
392
408
|
resourceCleanup();
|
|
393
409
|
break;
|
|
394
410
|
case 'interim_transcript':
|
|
395
|
-
this.processTranscript(
|
|
411
|
+
this.processTranscript(event, false);
|
|
396
412
|
break;
|
|
397
413
|
case 'final_transcript':
|
|
398
|
-
this.processTranscript(
|
|
414
|
+
this.processTranscript(event, true);
|
|
399
415
|
break;
|
|
400
416
|
case 'error':
|
|
401
|
-
this.#logger.error({ error:
|
|
417
|
+
this.#logger.error({ error: event }, 'Received error from LiveKit STT');
|
|
402
418
|
resourceCleanup();
|
|
403
|
-
throw new APIError(`LiveKit STT returned error: ${JSON.stringify(
|
|
404
|
-
default:
|
|
405
|
-
this.#logger.warn(
|
|
406
|
-
{ message: json },
|
|
407
|
-
'Received unexpected message from LiveKit STT',
|
|
408
|
-
);
|
|
409
|
-
break;
|
|
419
|
+
throw new APIError(`LiveKit STT returned error: ${JSON.stringify(event)}`);
|
|
410
420
|
}
|
|
411
421
|
}
|
|
412
422
|
} finally {
|
|
@@ -457,13 +467,13 @@ export class SpeechStream<TModel extends STTModels> extends BaseSpeechStream {
|
|
|
457
467
|
}
|
|
458
468
|
}
|
|
459
469
|
|
|
460
|
-
private processTranscript(data:
|
|
470
|
+
private processTranscript(data: SttTranscriptEvent, isFinal: boolean) {
|
|
461
471
|
// Check if queue is closed to avoid race condition during disconnect
|
|
462
472
|
if (this.queue.closed) return;
|
|
463
473
|
|
|
464
|
-
const requestId = data.
|
|
465
|
-
const text = data.transcript
|
|
466
|
-
const language = data.language
|
|
474
|
+
const requestId = data.session_id || this.requestId;
|
|
475
|
+
const text = data.transcript;
|
|
476
|
+
const language = data.language || this.opts.language || 'en';
|
|
467
477
|
|
|
468
478
|
if (!text && !isFinal) return;
|
|
469
479
|
|
|
@@ -476,10 +486,19 @@ export class SpeechStream<TModel extends STTModels> extends BaseSpeechStream {
|
|
|
476
486
|
|
|
477
487
|
const speechData: SpeechData = {
|
|
478
488
|
language,
|
|
479
|
-
startTime: data.start
|
|
480
|
-
endTime: data.
|
|
481
|
-
confidence: data.confidence
|
|
489
|
+
startTime: this.startTimeOffset + data.start,
|
|
490
|
+
endTime: this.startTimeOffset + data.start + data.duration,
|
|
491
|
+
confidence: data.confidence,
|
|
482
492
|
text,
|
|
493
|
+
words: data.words.map(
|
|
494
|
+
(word): TimedString => ({
|
|
495
|
+
text: word.word,
|
|
496
|
+
startTime: word.start + this.startTimeOffset,
|
|
497
|
+
endTime: word.end + this.startTimeOffset,
|
|
498
|
+
startTimeOffset: this.startTimeOffset,
|
|
499
|
+
confidence: word.confidence,
|
|
500
|
+
}),
|
|
501
|
+
),
|
|
483
502
|
};
|
|
484
503
|
|
|
485
504
|
if (isFinal) {
|
package/src/inference/tts.ts
CHANGED
|
@@ -23,22 +23,27 @@ import {
|
|
|
23
23
|
import { type AnyString, connectWs, createAccessToken } from './utils.js';
|
|
24
24
|
|
|
25
25
|
export type CartesiaModels =
|
|
26
|
-
| 'cartesia'
|
|
27
|
-
| 'cartesia/sonic'
|
|
26
|
+
| 'cartesia/sonic-3'
|
|
28
27
|
| 'cartesia/sonic-2'
|
|
29
|
-
| 'cartesia/sonic-turbo'
|
|
28
|
+
| 'cartesia/sonic-turbo'
|
|
29
|
+
| 'cartesia/sonic';
|
|
30
|
+
|
|
31
|
+
export type DeepgramTTSModels = 'deepgram/aura' | 'deepgram/aura-2';
|
|
30
32
|
|
|
31
33
|
export type ElevenlabsModels =
|
|
32
|
-
| 'elevenlabs'
|
|
33
34
|
| 'elevenlabs/eleven_flash_v2'
|
|
34
35
|
| 'elevenlabs/eleven_flash_v2_5'
|
|
35
36
|
| 'elevenlabs/eleven_turbo_v2'
|
|
36
37
|
| 'elevenlabs/eleven_turbo_v2_5'
|
|
37
38
|
| 'elevenlabs/eleven_multilingual_v2';
|
|
38
39
|
|
|
39
|
-
export type
|
|
40
|
+
export type InworldModels =
|
|
41
|
+
| 'inworld/inworld-tts-1.5-max'
|
|
42
|
+
| 'inworld/inworld-tts-1.5-mini'
|
|
43
|
+
| 'inworld/inworld-tts-1-max'
|
|
44
|
+
| 'inworld/inworld-tts-1';
|
|
40
45
|
|
|
41
|
-
export type
|
|
46
|
+
export type RimeModels = 'rime/arcana' | 'rime/mistv2';
|
|
42
47
|
|
|
43
48
|
export interface CartesiaOptions {
|
|
44
49
|
duration?: number; // max duration of audio in seconds
|
|
@@ -50,25 +55,40 @@ export interface ElevenlabsOptions {
|
|
|
50
55
|
apply_text_normalization?: 'auto' | 'off' | 'on'; // default: "auto"
|
|
51
56
|
}
|
|
52
57
|
|
|
58
|
+
export interface DeepgramTTSOptions {}
|
|
59
|
+
|
|
53
60
|
export interface RimeOptions {}
|
|
54
61
|
|
|
55
62
|
export interface InworldOptions {}
|
|
56
63
|
|
|
57
|
-
type _TTSModels =
|
|
58
|
-
|
|
59
|
-
|
|
64
|
+
type _TTSModels =
|
|
65
|
+
| CartesiaModels
|
|
66
|
+
| DeepgramTTSModels
|
|
67
|
+
| ElevenlabsModels
|
|
68
|
+
| RimeModels
|
|
69
|
+
| InworldModels;
|
|
70
|
+
|
|
71
|
+
export type TTSModels =
|
|
72
|
+
| CartesiaModels
|
|
73
|
+
| DeepgramTTSModels
|
|
74
|
+
| ElevenlabsModels
|
|
75
|
+
| RimeModels
|
|
76
|
+
| InworldModels
|
|
77
|
+
| AnyString;
|
|
60
78
|
|
|
61
79
|
export type ModelWithVoice = `${_TTSModels}:${string}` | TTSModels;
|
|
62
80
|
|
|
63
81
|
export type TTSOptions<TModel extends TTSModels> = TModel extends CartesiaModels
|
|
64
82
|
? CartesiaOptions
|
|
65
|
-
: TModel extends
|
|
66
|
-
?
|
|
67
|
-
: TModel extends
|
|
68
|
-
?
|
|
69
|
-
: TModel extends
|
|
70
|
-
?
|
|
71
|
-
:
|
|
83
|
+
: TModel extends DeepgramTTSModels
|
|
84
|
+
? DeepgramTTSOptions
|
|
85
|
+
: TModel extends ElevenlabsModels
|
|
86
|
+
? ElevenlabsOptions
|
|
87
|
+
: TModel extends RimeModels
|
|
88
|
+
? RimeOptions
|
|
89
|
+
: TModel extends InworldModels
|
|
90
|
+
? InworldOptions
|
|
91
|
+
: Record<string, unknown>;
|
|
72
92
|
|
|
73
93
|
type TTSEncoding = 'pcm_s16le';
|
|
74
94
|
|
package/src/stt/stream_adapter.ts
CHANGED
|
@@ -4,6 +4,7 @@
|
|
|
4
4
|
import type { AudioFrame } from '@livekit/rtc-node';
|
|
5
5
|
import { log } from '../log.js';
|
|
6
6
|
import type { APIConnectOptions } from '../types.js';
|
|
7
|
+
import { isStreamClosedError } from '../utils.js';
|
|
7
8
|
import type { VAD, VADStream } from '../vad.js';
|
|
8
9
|
import { VADEventType } from '../vad.js';
|
|
9
10
|
import type { SpeechEvent } from './stt.js';
|
|
@@ -68,7 +69,17 @@ export class StreamAdapterWrapper extends SpeechStream {
|
|
|
68
69
|
this.#vadStream.pushFrame(input);
|
|
69
70
|
}
|
|
70
71
|
}
|
|
71
|
-
|
|
72
|
+
|
|
73
|
+
// Guard against calling endInput() on already-closed stream
|
|
74
|
+
// This happens during handover when close() is called while forwardInput is running
|
|
75
|
+
try {
|
|
76
|
+
this.#vadStream.endInput();
|
|
77
|
+
} catch (e) {
|
|
78
|
+
if (isStreamClosedError(e)) {
|
|
79
|
+
return;
|
|
80
|
+
}
|
|
81
|
+
throw e;
|
|
82
|
+
}
|
|
72
83
|
};
|
|
73
84
|
|
|
74
85
|
const recognize = async () => {
|
package/src/stt/stt.ts
CHANGED
|
@@ -13,6 +13,7 @@ import { DeferredReadableStream } from '../stream/deferred_stream.js';
|
|
|
13
13
|
import { type APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS, intervalForRetry } from '../types.js';
|
|
14
14
|
import type { AudioBuffer } from '../utils.js';
|
|
15
15
|
import { AsyncIterableQueue, delay, startSoon, toError } from '../utils.js';
|
|
16
|
+
import type { TimedString } from '../voice/index.js';
|
|
16
17
|
|
|
17
18
|
/** Indicates start/middle/end of speech */
|
|
18
19
|
export enum SpeechEventType {
|
|
@@ -53,6 +54,7 @@ export interface SpeechData {
|
|
|
53
54
|
startTime: number;
|
|
54
55
|
endTime: number;
|
|
55
56
|
confidence: number;
|
|
57
|
+
words?: TimedString[];
|
|
56
58
|
}
|
|
57
59
|
|
|
58
60
|
export interface RecognitionUsage {
|
|
@@ -76,6 +78,13 @@ export interface SpeechEvent {
|
|
|
76
78
|
export interface STTCapabilities {
|
|
77
79
|
streaming: boolean;
|
|
78
80
|
interimResults: boolean;
|
|
81
|
+
/**
|
|
82
|
+
* Whether this STT supports aligned transcripts with word/chunk timestamps.
|
|
83
|
+
* - 'word': Provider returns word-level timestamps
|
|
84
|
+
* - 'chunk': Provider returns chunk-level timestamps (e.g., sentence/phrase boundaries)
|
|
85
|
+
* - false: Provider does not support aligned transcripts
|
|
86
|
+
*/
|
|
87
|
+
alignedTranscript?: 'word' | 'chunk' | false;
|
|
79
88
|
}
|
|
80
89
|
|
|
81
90
|
export interface STTError {
|
|
@@ -176,6 +185,7 @@ export abstract class SpeechStream implements AsyncIterableIterator<SpeechEvent>
|
|
|
176
185
|
private deferredInputStream: DeferredReadableStream<AudioFrame>;
|
|
177
186
|
private logger = log();
|
|
178
187
|
private _connOptions: APIConnectOptions;
|
|
188
|
+
private _startTimeOffset: number = 0;
|
|
179
189
|
|
|
180
190
|
protected abortController = new AbortController();
|
|
181
191
|
|
|
@@ -300,6 +310,17 @@ export abstract class SpeechStream implements AsyncIterableIterator<SpeechEvent>
|
|
|
300
310
|
return this.abortController.signal;
|
|
301
311
|
}
|
|
302
312
|
|
|
313
|
+
get startTimeOffset(): number {
|
|
314
|
+
return this._startTimeOffset;
|
|
315
|
+
}
|
|
316
|
+
|
|
317
|
+
set startTimeOffset(value: number) {
|
|
318
|
+
if (value < 0) {
|
|
319
|
+
throw new Error('startTimeOffset must be non-negative');
|
|
320
|
+
}
|
|
321
|
+
this._startTimeOffset = value;
|
|
322
|
+
}
|
|
323
|
+
|
|
303
324
|
updateInputStream(audioStream: ReadableStream<AudioFrame>) {
|
|
304
325
|
this.deferredInputStream.setSource(audioStream);
|
|
305
326
|
}
|
package/src/telemetry/traces.ts
CHANGED
|
@@ -37,6 +37,8 @@ export interface StartSpanOptions {
|
|
|
37
37
|
attributes?: Attributes;
|
|
38
38
|
/** Whether to end the span when the function exits (default: true) */
|
|
39
39
|
endOnExit?: boolean;
|
|
40
|
+
/** Optional start time for the span in milliseconds (Date.now() format) */
|
|
41
|
+
startTime?: number;
|
|
40
42
|
}
|
|
41
43
|
|
|
42
44
|
/**
|
|
@@ -79,10 +81,12 @@ class DynamicTracer {
|
|
|
79
81
|
*/
|
|
80
82
|
startSpan(options: StartSpanOptions): Span {
|
|
81
83
|
const ctx = options.context || otelContext.active();
|
|
84
|
+
|
|
82
85
|
const span = this.tracer.startSpan(
|
|
83
86
|
options.name,
|
|
84
87
|
{
|
|
85
88
|
attributes: options.attributes,
|
|
89
|
+
startTime: options.startTime,
|
|
86
90
|
},
|
|
87
91
|
ctx,
|
|
88
92
|
);
|
|
@@ -101,7 +105,7 @@ class DynamicTracer {
|
|
|
101
105
|
async startActiveSpan<T>(fn: (span: Span) => Promise<T>, options: StartSpanOptions): Promise<T> {
|
|
102
106
|
const ctx = options.context || otelContext.active();
|
|
103
107
|
const endOnExit = options.endOnExit === undefined ? true : options.endOnExit; // default true
|
|
104
|
-
const opts: SpanOptions = { attributes: options.attributes };
|
|
108
|
+
const opts: SpanOptions = { attributes: options.attributes, startTime: options.startTime };
|
|
105
109
|
|
|
106
110
|
// Directly return the tracer's startActiveSpan result - it handles async correctly
|
|
107
111
|
return await this.tracer.startActiveSpan(options.name, opts, ctx, async (span) => {
|
|
@@ -125,7 +129,7 @@ class DynamicTracer {
|
|
|
125
129
|
startActiveSpanSync<T>(fn: (span: Span) => T, options: StartSpanOptions): T {
|
|
126
130
|
const ctx = options.context || otelContext.active();
|
|
127
131
|
const endOnExit = options.endOnExit === undefined ? true : options.endOnExit; // default true
|
|
128
|
-
const opts: SpanOptions = { attributes: options.attributes };
|
|
132
|
+
const opts: SpanOptions = { attributes: options.attributes, startTime: options.startTime };
|
|
129
133
|
|
|
130
134
|
return this.tracer.startActiveSpan(options.name, opts, ctx, (span) => {
|
|
131
135
|
try {
|
package/src/utils.ts
CHANGED
|
@@ -125,6 +125,7 @@ export class Future<T = void> {
|
|
|
125
125
|
#resolvePromise!: (value: T) => void;
|
|
126
126
|
#rejectPromise!: (error: Error) => void;
|
|
127
127
|
#done: boolean = false;
|
|
128
|
+
#rejected: boolean = false;
|
|
128
129
|
|
|
129
130
|
constructor() {
|
|
130
131
|
this.#await = new Promise<T>((resolve, reject) => {
|
|
@@ -141,6 +142,11 @@ export class Future<T = void> {
|
|
|
141
142
|
return this.#done;
|
|
142
143
|
}
|
|
143
144
|
|
|
145
|
+
/** Whether the future was rejected (cancelled) */
|
|
146
|
+
get rejected() {
|
|
147
|
+
return this.#rejected;
|
|
148
|
+
}
|
|
149
|
+
|
|
144
150
|
resolve(value: T) {
|
|
145
151
|
this.#done = true;
|
|
146
152
|
this.#resolvePromise(value);
|
|
@@ -148,6 +154,7 @@ export class Future<T = void> {
|
|
|
148
154
|
|
|
149
155
|
reject(error: Error) {
|
|
150
156
|
this.#done = true;
|
|
157
|
+
this.#rejected = true;
|
|
151
158
|
this.#rejectPromise(error);
|
|
152
159
|
}
|
|
153
160
|
}
|
|
@@ -668,6 +675,20 @@ export class InvalidErrorType extends Error {
|
|
|
668
675
|
}
|
|
669
676
|
}
|
|
670
677
|
|
|
678
|
+
/**
|
|
679
|
+
* Check if an error is a stream closed error that can be safely ignored during cleanup.
|
|
680
|
+
* This happens during handover/cleanup when close() is called while operations are still running.
|
|
681
|
+
*
|
|
682
|
+
* @param error - The error to check.
|
|
683
|
+
* @returns True if the error is a stream closed error.
|
|
684
|
+
*/
|
|
685
|
+
export function isStreamClosedError(error: unknown): boolean {
|
|
686
|
+
return (
|
|
687
|
+
error instanceof Error &&
|
|
688
|
+
(error.message === 'Stream is closed' || error.message === 'Input is closed')
|
|
689
|
+
);
|
|
690
|
+
}
|
|
691
|
+
|
|
671
692
|
/**
|
|
672
693
|
* In JS an error can be any arbitrary value.
|
|
673
694
|
* This function converts an unknown error to an Error and stores the original value in the error object.
|
package/src/voice/agent.ts
CHANGED
|
@@ -271,6 +271,15 @@ export class Agent<UserData = any> {
|
|
|
271
271
|
|
|
272
272
|
const connOptions = activity.agentSession.connOptions.sttConnOptions;
|
|
273
273
|
const stream = wrapped_stt.stream({ connOptions });
|
|
274
|
+
|
|
275
|
+
// Set startTimeOffset to provide linear timestamps across reconnections
|
|
276
|
+
const audioInputStartedAt =
|
|
277
|
+
activity.agentSession._recorderIO?.recordingStartedAt ?? // Use recording start time if available
|
|
278
|
+
activity.agentSession._startedAt ?? // Fallback to session start time
|
|
279
|
+
Date.now(); // Fallback to current time
|
|
280
|
+
|
|
281
|
+
stream.startTimeOffset = (Date.now() - audioInputStartedAt) / 1000;
|
|
282
|
+
|
|
274
283
|
stream.updateInputStream(audio);
|
|
275
284
|
|
|
276
285
|
let cleaned = false;
|
|
@@ -316,16 +325,16 @@ export class Agent<UserData = any> {
|
|
|
316
325
|
);
|
|
317
326
|
}
|
|
318
327
|
|
|
319
|
-
// TODO(brian): make parallelToolCalls configurable
|
|
320
328
|
const { toolChoice } = modelSettings;
|
|
321
329
|
const connOptions = activity.agentSession.connOptions.llmConnOptions;
|
|
322
330
|
|
|
331
|
+
// parallelToolCalls is not passed here - it will use the value from LLM's modelOptions
|
|
332
|
+
// This allows users to configure it via: new inference.LLM({ modelOptions: { parallel_tool_calls: false } })
|
|
323
333
|
const stream = activity.llm.chat({
|
|
324
334
|
chatCtx,
|
|
325
335
|
toolCtx,
|
|
326
336
|
toolChoice,
|
|
327
337
|
connOptions,
|
|
328
|
-
parallelToolCalls: true,
|
|
329
338
|
});
|
|
330
339
|
|
|
331
340
|
let cleaned = false;
|