@livekit/agents 1.0.36 → 1.0.38
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.cjs.map +1 -1
- package/dist/inference/api_protos.cjs +68 -0
- package/dist/inference/api_protos.cjs.map +1 -1
- package/dist/inference/api_protos.d.cts +345 -4
- package/dist/inference/api_protos.d.ts +345 -4
- package/dist/inference/api_protos.d.ts.map +1 -1
- package/dist/inference/api_protos.js +60 -0
- package/dist/inference/api_protos.js.map +1 -1
- package/dist/inference/stt.cjs +32 -21
- package/dist/inference/stt.cjs.map +1 -1
- package/dist/inference/stt.d.ts.map +1 -1
- package/dist/inference/stt.js +34 -21
- package/dist/inference/stt.js.map +1 -1
- package/dist/ipc/inference_proc_executor.cjs.map +1 -1
- package/dist/ipc/job_proc_executor.cjs.map +1 -1
- package/dist/stt/stt.cjs +10 -0
- package/dist/stt/stt.cjs.map +1 -1
- package/dist/stt/stt.d.cts +12 -0
- package/dist/stt/stt.d.ts +12 -0
- package/dist/stt/stt.d.ts.map +1 -1
- package/dist/stt/stt.js +10 -0
- package/dist/stt/stt.js.map +1 -1
- package/dist/telemetry/traces.cjs +4 -3
- package/dist/telemetry/traces.cjs.map +1 -1
- package/dist/telemetry/traces.d.cts +2 -0
- package/dist/telemetry/traces.d.ts +2 -0
- package/dist/telemetry/traces.d.ts.map +1 -1
- package/dist/telemetry/traces.js +4 -3
- package/dist/telemetry/traces.js.map +1 -1
- package/dist/utils.cjs +6 -0
- package/dist/utils.cjs.map +1 -1
- package/dist/utils.d.cts +2 -0
- package/dist/utils.d.ts +2 -0
- package/dist/utils.d.ts.map +1 -1
- package/dist/utils.js +6 -0
- package/dist/utils.js.map +1 -1
- package/dist/voice/agent.cjs +5 -0
- package/dist/voice/agent.cjs.map +1 -1
- package/dist/voice/agent.d.ts.map +1 -1
- package/dist/voice/agent.js +5 -0
- package/dist/voice/agent.js.map +1 -1
- package/dist/voice/agent_activity.cjs +49 -23
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.cts +1 -1
- package/dist/voice/agent_activity.d.ts +1 -1
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +50 -24
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_session.cjs +7 -5
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +5 -2
- package/dist/voice/agent_session.d.ts +5 -2
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +7 -5
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/audio_recognition.cjs +3 -1
- package/dist/voice/audio_recognition.cjs.map +1 -1
- package/dist/voice/audio_recognition.d.ts.map +1 -1
- package/dist/voice/audio_recognition.js +3 -1
- package/dist/voice/audio_recognition.js.map +1 -1
- package/dist/voice/avatar/datastream_io.cjs +6 -0
- package/dist/voice/avatar/datastream_io.cjs.map +1 -1
- package/dist/voice/avatar/datastream_io.d.cts +1 -0
- package/dist/voice/avatar/datastream_io.d.ts +1 -0
- package/dist/voice/avatar/datastream_io.d.ts.map +1 -1
- package/dist/voice/avatar/datastream_io.js +6 -0
- package/dist/voice/avatar/datastream_io.js.map +1 -1
- package/dist/voice/background_audio.cjs.map +1 -1
- package/dist/voice/generation.cjs +14 -5
- package/dist/voice/generation.cjs.map +1 -1
- package/dist/voice/generation.d.cts +3 -2
- package/dist/voice/generation.d.ts +3 -2
- package/dist/voice/generation.d.ts.map +1 -1
- package/dist/voice/generation.js +14 -5
- package/dist/voice/generation.js.map +1 -1
- package/dist/voice/io.cjs +12 -0
- package/dist/voice/io.cjs.map +1 -1
- package/dist/voice/io.d.cts +19 -1
- package/dist/voice/io.d.ts +19 -1
- package/dist/voice/io.d.ts.map +1 -1
- package/dist/voice/io.js +12 -0
- package/dist/voice/io.js.map +1 -1
- package/dist/voice/recorder_io/recorder_io.cjs +91 -28
- package/dist/voice/recorder_io/recorder_io.cjs.map +1 -1
- package/dist/voice/recorder_io/recorder_io.d.cts +7 -1
- package/dist/voice/recorder_io/recorder_io.d.ts +7 -1
- package/dist/voice/recorder_io/recorder_io.d.ts.map +1 -1
- package/dist/voice/recorder_io/recorder_io.js +91 -28
- package/dist/voice/recorder_io/recorder_io.js.map +1 -1
- package/dist/voice/room_io/_input.cjs +40 -11
- package/dist/voice/room_io/_input.cjs.map +1 -1
- package/dist/voice/room_io/_input.d.cts +4 -1
- package/dist/voice/room_io/_input.d.ts +4 -1
- package/dist/voice/room_io/_input.d.ts.map +1 -1
- package/dist/voice/room_io/_input.js +31 -2
- package/dist/voice/room_io/_input.js.map +1 -1
- package/dist/voice/room_io/_output.cjs +6 -0
- package/dist/voice/room_io/_output.cjs.map +1 -1
- package/dist/voice/room_io/_output.d.cts +1 -0
- package/dist/voice/room_io/_output.d.ts +1 -0
- package/dist/voice/room_io/_output.d.ts.map +1 -1
- package/dist/voice/room_io/_output.js +6 -0
- package/dist/voice/room_io/_output.js.map +1 -1
- package/dist/voice/room_io/room_io.cjs.map +1 -1
- package/dist/voice/room_io/room_io.d.cts +2 -2
- package/dist/voice/room_io/room_io.d.ts +2 -2
- package/dist/voice/room_io/room_io.d.ts.map +1 -1
- package/dist/voice/room_io/room_io.js.map +1 -1
- package/dist/voice/speech_handle.cjs +2 -0
- package/dist/voice/speech_handle.cjs.map +1 -1
- package/dist/voice/speech_handle.d.cts +3 -0
- package/dist/voice/speech_handle.d.ts +3 -0
- package/dist/voice/speech_handle.d.ts.map +1 -1
- package/dist/voice/speech_handle.js +2 -0
- package/dist/voice/speech_handle.js.map +1 -1
- package/dist/voice/testing/index.cjs +2 -0
- package/dist/voice/testing/index.cjs.map +1 -1
- package/dist/voice/testing/index.d.cts +1 -1
- package/dist/voice/testing/index.d.ts +1 -1
- package/dist/voice/testing/index.d.ts.map +1 -1
- package/dist/voice/testing/index.js +2 -0
- package/dist/voice/testing/index.js.map +1 -1
- package/dist/voice/testing/run_result.cjs +294 -5
- package/dist/voice/testing/run_result.cjs.map +1 -1
- package/dist/voice/testing/run_result.d.cts +149 -1
- package/dist/voice/testing/run_result.d.ts +149 -1
- package/dist/voice/testing/run_result.d.ts.map +1 -1
- package/dist/voice/testing/run_result.js +293 -5
- package/dist/voice/testing/run_result.js.map +1 -1
- package/package.json +1 -1
- package/src/inference/api_protos.ts +83 -0
- package/src/inference/stt.ts +39 -22
- package/src/stt/stt.ts +21 -0
- package/src/telemetry/traces.ts +6 -2
- package/src/utils.ts +7 -0
- package/src/voice/agent.ts +9 -0
- package/src/voice/agent_activity.ts +72 -26
- package/src/voice/agent_session.ts +6 -5
- package/src/voice/audio_recognition.ts +2 -0
- package/src/voice/avatar/datastream_io.ts +8 -0
- package/src/voice/generation.ts +24 -12
- package/src/voice/io.ts +27 -5
- package/src/voice/recorder_io/recorder_io.ts +123 -31
- package/src/voice/room_io/_input.ts +32 -4
- package/src/voice/room_io/_output.ts +8 -0
- package/src/voice/room_io/room_io.ts +3 -1
- package/src/voice/speech_handle.ts +4 -0
- package/src/voice/testing/index.ts +1 -0
- package/src/voice/testing/run_result.ts +373 -12
package/src/voice/io.ts
CHANGED
|
@@ -30,12 +30,14 @@ export type TTSNode = (
|
|
|
30
30
|
) => Promise<ReadableStream<AudioFrame> | null>;
|
|
31
31
|
|
|
32
32
|
/**
|
|
33
|
-
*
|
|
33
|
+
* A string with optional start and end timestamps for word-level alignment.
|
|
34
34
|
*/
|
|
35
35
|
export interface TimedString {
|
|
36
36
|
text: string;
|
|
37
37
|
startTime?: number; // seconds
|
|
38
38
|
endTime?: number; // seconds
|
|
39
|
+
confidence?: number;
|
|
40
|
+
startTimeOffset?: number;
|
|
39
41
|
}
|
|
40
42
|
|
|
41
43
|
export interface AudioOutputCapabilities {
|
|
@@ -57,6 +59,7 @@ export abstract class AudioInput {
|
|
|
57
59
|
}
|
|
58
60
|
|
|
59
61
|
export abstract class AudioOutput extends EventEmitter {
|
|
62
|
+
static readonly EVENT_PLAYBACK_STARTED = 'playbackStarted';
|
|
60
63
|
static readonly EVENT_PLAYBACK_FINISHED = 'playbackFinished';
|
|
61
64
|
|
|
62
65
|
private playbackFinishedFuture: Future<void> = new Future();
|
|
@@ -77,7 +80,11 @@ export abstract class AudioOutput extends EventEmitter {
|
|
|
77
80
|
) {
|
|
78
81
|
super();
|
|
79
82
|
this.capabilities = capabilities;
|
|
83
|
+
|
|
80
84
|
if (this.nextInChain) {
|
|
85
|
+
this.nextInChain.on(AudioOutput.EVENT_PLAYBACK_STARTED, (ev: PlaybackStartedEvent) =>
|
|
86
|
+
this.onPlaybackStarted(ev.createdAt),
|
|
87
|
+
);
|
|
81
88
|
this.nextInChain.on(AudioOutput.EVENT_PLAYBACK_FINISHED, (ev: PlaybackFinishedEvent) =>
|
|
82
89
|
this.onPlaybackFinished(ev),
|
|
83
90
|
);
|
|
@@ -117,6 +124,14 @@ export abstract class AudioOutput extends EventEmitter {
|
|
|
117
124
|
return this.lastPlaybackEvent;
|
|
118
125
|
}
|
|
119
126
|
|
|
127
|
+
/**
|
|
128
|
+
* Called when playback actually starts (first frame is sent to output).
|
|
129
|
+
* Developers building audio sinks should call this when the first frame is captured.
|
|
130
|
+
*/
|
|
131
|
+
onPlaybackStarted(createdAt: number): void {
|
|
132
|
+
this.emit(AudioOutput.EVENT_PLAYBACK_STARTED, { createdAt } as PlaybackStartedEvent);
|
|
133
|
+
}
|
|
134
|
+
|
|
120
135
|
/**
|
|
121
136
|
* Developers building audio sinks must call this method when a playback/segment is finished.
|
|
122
137
|
* Segments are segmented by calls to flush() or clearBuffer()
|
|
@@ -174,15 +189,22 @@ export abstract class AudioOutput extends EventEmitter {
|
|
|
174
189
|
}
|
|
175
190
|
|
|
176
191
|
export interface PlaybackFinishedEvent {
|
|
177
|
-
|
|
192
|
+
/** How much of the audio was played back, in seconds */
|
|
178
193
|
playbackPosition: number;
|
|
179
|
-
|
|
194
|
+
/** True if playback was interrupted (clearBuffer() was called) */
|
|
180
195
|
interrupted: boolean;
|
|
181
|
-
|
|
182
|
-
|
|
196
|
+
/**
|
|
197
|
+
* Transcript synced with playback; may be partial if the audio was interrupted.
|
|
198
|
+
* When undefined, the transcript is not synchronized with the playback.
|
|
199
|
+
*/
|
|
183
200
|
synchronizedTranscript?: string;
|
|
184
201
|
}
|
|
185
202
|
|
|
203
|
+
export interface PlaybackStartedEvent {
|
|
204
|
+
/** The timestamp (Date.now()) when the playback started */
|
|
205
|
+
createdAt: number;
|
|
206
|
+
}
|
|
207
|
+
|
|
186
208
|
export abstract class TextOutput {
|
|
187
209
|
constructor(protected readonly nextInChain?: TextOutput) {}
|
|
188
210
|
|
|
@@ -123,7 +123,7 @@ export class RecorderIO {
|
|
|
123
123
|
}
|
|
124
124
|
|
|
125
125
|
private writeCb(buf: AudioFrame[]): void {
|
|
126
|
-
const inputBuf = this.inRecord!.takeBuf();
|
|
126
|
+
const inputBuf = this.inRecord!.takeBuf(this.outRecord?._lastSpeechEndTime);
|
|
127
127
|
this.inChan.write(inputBuf);
|
|
128
128
|
this.outChan.write(buf);
|
|
129
129
|
}
|
|
@@ -137,8 +137,18 @@ export class RecorderIO {
|
|
|
137
137
|
}
|
|
138
138
|
|
|
139
139
|
get recordingStartedAt(): number | undefined {
|
|
140
|
-
|
|
141
|
-
|
|
140
|
+
const inT = this.inRecord?.startedWallTime;
|
|
141
|
+
const outT = this.outRecord?.startedWallTime;
|
|
142
|
+
|
|
143
|
+
if (inT === undefined) {
|
|
144
|
+
return outT;
|
|
145
|
+
}
|
|
146
|
+
|
|
147
|
+
if (outT === undefined) {
|
|
148
|
+
return inT;
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
return Math.min(inT, outT);
|
|
142
152
|
}
|
|
143
153
|
|
|
144
154
|
/**
|
|
@@ -159,7 +169,7 @@ export class RecorderIO {
|
|
|
159
169
|
}
|
|
160
170
|
|
|
161
171
|
// Flush input buffer
|
|
162
|
-
const inputBuf = this.inRecord!.takeBuf();
|
|
172
|
+
const inputBuf = this.inRecord!.takeBuf(this.outRecord!._lastSpeechEndTime);
|
|
163
173
|
this.inChan
|
|
164
174
|
.write(inputBuf)
|
|
165
175
|
.catch((err) => this.logger.error({ err }, 'Error writing RecorderIO input buffer'));
|
|
@@ -359,6 +369,8 @@ class RecorderAudioInput extends AudioInput {
|
|
|
359
369
|
private recorderIO: RecorderIO;
|
|
360
370
|
private accFrames: AudioFrame[] = [];
|
|
361
371
|
private _startedWallTime?: number;
|
|
372
|
+
private _padded: boolean = false;
|
|
373
|
+
private logger = log();
|
|
362
374
|
|
|
363
375
|
constructor(recorderIO: RecorderIO, source: AudioInput) {
|
|
364
376
|
super();
|
|
@@ -378,10 +390,46 @@ class RecorderAudioInput extends AudioInput {
|
|
|
378
390
|
|
|
379
391
|
/**
|
|
380
392
|
* Take accumulated frames and clear the buffer
|
|
393
|
+
* @param padSince - If provided and input started after this time, pad with silence
|
|
381
394
|
*/
|
|
382
|
-
takeBuf(): AudioFrame[] {
|
|
383
|
-
|
|
395
|
+
takeBuf(padSince?: number): AudioFrame[] {
|
|
396
|
+
let frames = this.accFrames;
|
|
384
397
|
this.accFrames = [];
|
|
398
|
+
|
|
399
|
+
if (
|
|
400
|
+
padSince !== undefined &&
|
|
401
|
+
this._startedWallTime !== undefined &&
|
|
402
|
+
this._startedWallTime > padSince &&
|
|
403
|
+
!this._padded &&
|
|
404
|
+
frames.length > 0
|
|
405
|
+
) {
|
|
406
|
+
const padding = this._startedWallTime - padSince;
|
|
407
|
+
this.logger.warn(
|
|
408
|
+
{
|
|
409
|
+
lastAgentSpeechTime: padSince,
|
|
410
|
+
inputStartedTime: this._startedWallTime,
|
|
411
|
+
},
|
|
412
|
+
'input speech started after last agent speech ended',
|
|
413
|
+
);
|
|
414
|
+
this._padded = true;
|
|
415
|
+
const firstFrame = frames[0]!;
|
|
416
|
+
frames = [
|
|
417
|
+
createSilenceFrame(padding / 1000, firstFrame.sampleRate, firstFrame.channels),
|
|
418
|
+
...frames,
|
|
419
|
+
];
|
|
420
|
+
} else if (
|
|
421
|
+
padSince !== undefined &&
|
|
422
|
+
this._startedWallTime === undefined &&
|
|
423
|
+
!this._padded &&
|
|
424
|
+
frames.length === 0
|
|
425
|
+
) {
|
|
426
|
+
// We could pad with silence here with some fixed SR and channels,
|
|
427
|
+
// but it's better for the user to know that this is happening
|
|
428
|
+
this.logger.warn(
|
|
429
|
+
"input speech hasn't started yet, skipping silence padding, recording may be inaccurate until the speech starts",
|
|
430
|
+
);
|
|
431
|
+
}
|
|
432
|
+
|
|
385
433
|
return frames;
|
|
386
434
|
}
|
|
387
435
|
|
|
@@ -455,6 +503,10 @@ class RecorderAudioOutput extends AudioOutput {
|
|
|
455
503
|
private writeFn: (buf: AudioFrame[]) => void;
|
|
456
504
|
private accFrames: AudioFrame[] = [];
|
|
457
505
|
private _startedWallTime?: number;
|
|
506
|
+
private _logger = log();
|
|
507
|
+
|
|
508
|
+
_lastSpeechEndTime?: number;
|
|
509
|
+
private _lastSpeechStartTime?: number;
|
|
458
510
|
|
|
459
511
|
// Pause tracking
|
|
460
512
|
private currentPauseStart?: number;
|
|
@@ -508,9 +560,32 @@ class RecorderAudioOutput extends AudioOutput {
|
|
|
508
560
|
}
|
|
509
561
|
|
|
510
562
|
onPlaybackFinished(options: PlaybackFinishedEvent): void {
|
|
511
|
-
const finishTime = Date.now();
|
|
563
|
+
const finishTime = this.currentPauseStart ?? Date.now();
|
|
564
|
+
const trailingSilenceDuration = Math.max(0, Date.now() - finishTime);
|
|
565
|
+
|
|
566
|
+
// Convert playbackPosition from seconds to ms for internal calculations
|
|
567
|
+
let playbackPosition = options.playbackPosition * 1000;
|
|
568
|
+
|
|
569
|
+
if (this._lastSpeechStartTime === undefined) {
|
|
570
|
+
this._logger.warn(
|
|
571
|
+
{
|
|
572
|
+
finishTime,
|
|
573
|
+
playbackPosition,
|
|
574
|
+
interrupted: options.interrupted,
|
|
575
|
+
},
|
|
576
|
+
'playback finished before speech started',
|
|
577
|
+
);
|
|
578
|
+
playbackPosition = 0;
|
|
579
|
+
}
|
|
580
|
+
|
|
581
|
+
// Clamp playbackPosition to actual elapsed time (all in ms)
|
|
582
|
+
playbackPosition = Math.max(
|
|
583
|
+
0,
|
|
584
|
+
Math.min(finishTime - (this._lastSpeechStartTime ?? 0), playbackPosition),
|
|
585
|
+
);
|
|
512
586
|
|
|
513
|
-
|
|
587
|
+
// Convert back to seconds for the event
|
|
588
|
+
super.onPlaybackFinished({ ...options, playbackPosition: playbackPosition / 1000 });
|
|
514
589
|
|
|
515
590
|
if (!this.recorderIO.recording) {
|
|
516
591
|
return;
|
|
@@ -523,28 +598,29 @@ class RecorderAudioOutput extends AudioOutput {
|
|
|
523
598
|
|
|
524
599
|
if (this.accFrames.length === 0) {
|
|
525
600
|
this.resetPauseState();
|
|
601
|
+
this._lastSpeechEndTime = Date.now();
|
|
602
|
+
this._lastSpeechStartTime = undefined;
|
|
526
603
|
return;
|
|
527
604
|
}
|
|
528
605
|
|
|
529
|
-
|
|
530
|
-
|
|
606
|
+
// pauseEvents stores (position, duration) in ms
|
|
531
607
|
const pauseEvents: Array<[number, number]> = [];
|
|
608
|
+
let playbackStartTime = finishTime - playbackPosition;
|
|
532
609
|
|
|
533
610
|
if (this.pauseWallTimes.length > 0) {
|
|
534
611
|
const totalPauseDuration = this.pauseWallTimes.reduce(
|
|
535
612
|
(sum, [start, end]) => sum + (end - start),
|
|
536
613
|
0,
|
|
537
614
|
);
|
|
538
|
-
|
|
539
|
-
const playbackStartTime = finishTime - playbackPosition * 1000 - totalPauseDuration;
|
|
615
|
+
playbackStartTime = finishTime - playbackPosition - totalPauseDuration;
|
|
540
616
|
|
|
541
617
|
let accumulatedPause = 0;
|
|
542
618
|
for (const [pauseStart, pauseEnd] of this.pauseWallTimes) {
|
|
543
|
-
let position =
|
|
544
|
-
const duration =
|
|
619
|
+
let position = pauseStart - playbackStartTime - accumulatedPause;
|
|
620
|
+
const duration = pauseEnd - pauseStart;
|
|
545
621
|
position = Math.max(0, Math.min(position, playbackPosition));
|
|
546
622
|
pauseEvents.push([position, duration]);
|
|
547
|
-
accumulatedPause +=
|
|
623
|
+
accumulatedPause += duration;
|
|
548
624
|
}
|
|
549
625
|
}
|
|
550
626
|
|
|
@@ -558,10 +634,10 @@ class RecorderAudioOutput extends AudioOutput {
|
|
|
558
634
|
|
|
559
635
|
for (const frame of this.accFrames) {
|
|
560
636
|
let currentFrame = frame;
|
|
561
|
-
const frameDuration = frame.samplesPerChannel / frame.sampleRate;
|
|
637
|
+
const frameDuration = (frame.samplesPerChannel / frame.sampleRate) * 1000;
|
|
562
638
|
|
|
563
639
|
if (frameDuration + accDur > playbackPosition) {
|
|
564
|
-
const [left] = splitFrame(currentFrame, playbackPosition - accDur);
|
|
640
|
+
const [left] = splitFrame(currentFrame, (playbackPosition - accDur) / 1000);
|
|
565
641
|
currentFrame = left;
|
|
566
642
|
shouldBreak = true;
|
|
567
643
|
}
|
|
@@ -569,27 +645,29 @@ class RecorderAudioOutput extends AudioOutput {
|
|
|
569
645
|
// Process any pauses before this frame starts
|
|
570
646
|
while (pauseIdx < pauseEvents.length && pauseEvents[pauseIdx]![0] <= accDur) {
|
|
571
647
|
const [, pauseDur] = pauseEvents[pauseIdx]!;
|
|
572
|
-
buf.push(createSilenceFrame(pauseDur, sampleRate, numChannels));
|
|
648
|
+
buf.push(createSilenceFrame(pauseDur / 1000, sampleRate, numChannels));
|
|
573
649
|
pauseIdx++;
|
|
574
650
|
}
|
|
575
651
|
|
|
576
652
|
// Process any pauses within this frame
|
|
577
|
-
const currentFrameDuration =
|
|
653
|
+
const currentFrameDuration =
|
|
654
|
+
(currentFrame.samplesPerChannel / currentFrame.sampleRate) * 1000;
|
|
578
655
|
while (
|
|
579
656
|
pauseIdx < pauseEvents.length &&
|
|
580
657
|
pauseEvents[pauseIdx]![0] < accDur + currentFrameDuration
|
|
581
658
|
) {
|
|
582
659
|
const [pausePos, pauseDur] = pauseEvents[pauseIdx]!;
|
|
583
|
-
const [left, right] = splitFrame(currentFrame, pausePos - accDur);
|
|
660
|
+
const [left, right] = splitFrame(currentFrame, (pausePos - accDur) / 1000);
|
|
584
661
|
buf.push(left);
|
|
585
|
-
accDur += left.samplesPerChannel / left.sampleRate;
|
|
586
|
-
buf.push(createSilenceFrame(pauseDur, sampleRate, numChannels));
|
|
662
|
+
accDur += (left.samplesPerChannel / left.sampleRate) * 1000;
|
|
663
|
+
buf.push(createSilenceFrame(pauseDur / 1000, sampleRate, numChannels));
|
|
664
|
+
|
|
587
665
|
currentFrame = right;
|
|
588
666
|
pauseIdx++;
|
|
589
667
|
}
|
|
590
668
|
|
|
591
669
|
buf.push(currentFrame);
|
|
592
|
-
accDur += currentFrame.samplesPerChannel / currentFrame.sampleRate;
|
|
670
|
+
accDur += (currentFrame.samplesPerChannel / currentFrame.sampleRate) * 1000;
|
|
593
671
|
|
|
594
672
|
if (shouldBreak) {
|
|
595
673
|
break;
|
|
@@ -600,31 +678,41 @@ class RecorderAudioOutput extends AudioOutput {
|
|
|
600
678
|
while (pauseIdx < pauseEvents.length) {
|
|
601
679
|
const [pausePos, pauseDur] = pauseEvents[pauseIdx]!;
|
|
602
680
|
if (pausePos <= playbackPosition) {
|
|
603
|
-
buf.push(createSilenceFrame(pauseDur, sampleRate, numChannels));
|
|
681
|
+
buf.push(createSilenceFrame(pauseDur / 1000, sampleRate, numChannels));
|
|
604
682
|
}
|
|
605
683
|
pauseIdx++;
|
|
606
684
|
}
|
|
607
685
|
|
|
608
686
|
if (buf.length > 0) {
|
|
687
|
+
if (trailingSilenceDuration > 0) {
|
|
688
|
+
buf.push(createSilenceFrame(trailingSilenceDuration / 1000, sampleRate, numChannels));
|
|
689
|
+
}
|
|
609
690
|
this.writeFn(buf);
|
|
610
691
|
}
|
|
611
692
|
|
|
612
693
|
this.accFrames = [];
|
|
613
694
|
this.resetPauseState();
|
|
695
|
+
this._lastSpeechEndTime = Date.now();
|
|
696
|
+
this._lastSpeechStartTime = undefined;
|
|
614
697
|
}
|
|
615
698
|
|
|
616
699
|
async captureFrame(frame: AudioFrame): Promise<void> {
|
|
700
|
+
if (this.nextInChain) {
|
|
701
|
+
await this.nextInChain.captureFrame(frame);
|
|
702
|
+
}
|
|
703
|
+
|
|
617
704
|
await super.captureFrame(frame);
|
|
618
705
|
|
|
619
706
|
if (this.recorderIO.recording) {
|
|
620
|
-
if (this._startedWallTime === undefined) {
|
|
621
|
-
this._startedWallTime = Date.now();
|
|
622
|
-
}
|
|
623
707
|
this.accFrames.push(frame);
|
|
624
708
|
}
|
|
625
709
|
|
|
626
|
-
if (this.
|
|
627
|
-
|
|
710
|
+
if (this._startedWallTime === undefined) {
|
|
711
|
+
this._startedWallTime = Date.now();
|
|
712
|
+
}
|
|
713
|
+
|
|
714
|
+
if (this._lastSpeechStartTime === undefined) {
|
|
715
|
+
this._lastSpeechStartTime = Date.now();
|
|
628
716
|
}
|
|
629
717
|
}
|
|
630
718
|
|
|
@@ -646,8 +734,12 @@ class RecorderAudioOutput extends AudioOutput {
|
|
|
646
734
|
/**
|
|
647
735
|
* Create a silent audio frame with the given duration
|
|
648
736
|
*/
|
|
649
|
-
function createSilenceFrame(
|
|
650
|
-
|
|
737
|
+
function createSilenceFrame(
|
|
738
|
+
durationInS: number,
|
|
739
|
+
sampleRate: number,
|
|
740
|
+
numChannels: number,
|
|
741
|
+
): AudioFrame {
|
|
742
|
+
const samples = Math.floor(durationInS * sampleRate);
|
|
651
743
|
const data = new Int16Array(samples * numChannels); // Zero-filled by default
|
|
652
744
|
return new AudioFrame(data, sampleRate, numChannels, samples);
|
|
653
745
|
}
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
// SPDX-FileCopyrightText: 2025 LiveKit, Inc.
|
|
2
2
|
//
|
|
3
3
|
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
-
import type
|
|
4
|
+
import { type AudioFrame, FrameProcessor } from '@livekit/rtc-node';
|
|
5
5
|
import {
|
|
6
6
|
AudioStream,
|
|
7
7
|
type NoiseCancellationOptions,
|
|
@@ -22,6 +22,7 @@ export class ParticipantAudioInputStream extends AudioInput {
|
|
|
22
22
|
private sampleRate: number;
|
|
23
23
|
private numChannels: number;
|
|
24
24
|
private noiseCancellation?: NoiseCancellationOptions;
|
|
25
|
+
private frameProcessor?: FrameProcessor<AudioFrame>;
|
|
25
26
|
private publication: RemoteTrackPublication | null = null;
|
|
26
27
|
private participantIdentity: string | null = null;
|
|
27
28
|
private logger = log();
|
|
@@ -34,16 +35,21 @@ export class ParticipantAudioInputStream extends AudioInput {
|
|
|
34
35
|
room: Room;
|
|
35
36
|
sampleRate: number;
|
|
36
37
|
numChannels: number;
|
|
37
|
-
noiseCancellation?: NoiseCancellationOptions
|
|
38
|
+
noiseCancellation?: NoiseCancellationOptions | FrameProcessor<AudioFrame>;
|
|
38
39
|
}) {
|
|
39
40
|
super();
|
|
40
41
|
this.room = room;
|
|
41
42
|
this.sampleRate = sampleRate;
|
|
42
43
|
this.numChannels = numChannels;
|
|
43
|
-
|
|
44
|
+
if (noiseCancellation instanceof FrameProcessor) {
|
|
45
|
+
this.frameProcessor = noiseCancellation;
|
|
46
|
+
} else {
|
|
47
|
+
this.noiseCancellation = noiseCancellation;
|
|
48
|
+
}
|
|
44
49
|
|
|
45
50
|
this.room.on(RoomEvent.TrackSubscribed, this.onTrackSubscribed);
|
|
46
51
|
this.room.on(RoomEvent.TrackUnpublished, this.onTrackUnpublished);
|
|
52
|
+
this.room.on(RoomEvent.TokenRefreshed, this.onTokenRefreshed);
|
|
47
53
|
}
|
|
48
54
|
|
|
49
55
|
setParticipant(participant: RemoteParticipant | string | null) {
|
|
@@ -116,6 +122,9 @@ export class ParticipantAudioInputStream extends AudioInput {
|
|
|
116
122
|
if (this.deferredStream.isSourceSet) {
|
|
117
123
|
this.deferredStream.detachSource();
|
|
118
124
|
}
|
|
125
|
+
|
|
126
|
+
this.frameProcessor?.close();
|
|
127
|
+
|
|
119
128
|
this.publication = null;
|
|
120
129
|
}
|
|
121
130
|
|
|
@@ -140,14 +149,32 @@ export class ParticipantAudioInputStream extends AudioInput {
|
|
|
140
149
|
outputRate: this.sampleRate,
|
|
141
150
|
}),
|
|
142
151
|
);
|
|
152
|
+
this.frameProcessor?.onStreamInfoUpdated({
|
|
153
|
+
participantIdentity: participant.identity,
|
|
154
|
+
roomName: this.room.name!,
|
|
155
|
+
publicationSid: publication.sid!,
|
|
156
|
+
});
|
|
157
|
+
this.frameProcessor?.onCredentialsUpdated({
|
|
158
|
+
token: this.room.token!,
|
|
159
|
+
url: this.room.serverUrl!,
|
|
160
|
+
});
|
|
143
161
|
return true;
|
|
144
162
|
};
|
|
145
163
|
|
|
164
|
+
private onTokenRefreshed = () => {
|
|
165
|
+
if (this.room.token && this.room.serverUrl) {
|
|
166
|
+
this.frameProcessor?.onCredentialsUpdated({
|
|
167
|
+
token: this.room.token,
|
|
168
|
+
url: this.room.serverUrl,
|
|
169
|
+
});
|
|
170
|
+
}
|
|
171
|
+
};
|
|
172
|
+
|
|
146
173
|
private createStream(track: RemoteTrack): ReadableStream<AudioFrame> {
|
|
147
174
|
return new AudioStream(track, {
|
|
148
175
|
sampleRate: this.sampleRate,
|
|
149
176
|
numChannels: this.numChannels,
|
|
150
|
-
noiseCancellation: this.noiseCancellation,
|
|
177
|
+
noiseCancellation: this.frameProcessor || this.noiseCancellation,
|
|
151
178
|
// TODO(AJS-269): resolve compatibility issue with node-sdk to remove the forced type casting
|
|
152
179
|
}) as unknown as ReadableStream<AudioFrame>;
|
|
153
180
|
}
|
|
@@ -155,6 +182,7 @@ export class ParticipantAudioInputStream extends AudioInput {
|
|
|
155
182
|
async close() {
|
|
156
183
|
this.room.off(RoomEvent.TrackSubscribed, this.onTrackSubscribed);
|
|
157
184
|
this.room.off(RoomEvent.TrackUnpublished, this.onTrackUnpublished);
|
|
185
|
+
this.room.off(RoomEvent.TokenRefreshed, this.onTokenRefreshed);
|
|
158
186
|
this.closeStream();
|
|
159
187
|
// Ignore errors - stream may be locked by RecorderIO or already cancelled
|
|
160
188
|
await this.deferredStream.stream.cancel().catch(() => {});
|
|
@@ -326,6 +326,7 @@ export class ParticipantAudioOutput extends AudioOutput {
|
|
|
326
326
|
private pushedDuration: number = 0;
|
|
327
327
|
private startedFuture: Future<void> = new Future();
|
|
328
328
|
private interruptedFuture: Future<void> = new Future();
|
|
329
|
+
private firstFrameEmitted: boolean = false;
|
|
329
330
|
|
|
330
331
|
constructor(room: Room, options: AudioOutputOptions) {
|
|
331
332
|
super(options.sampleRate, undefined, { pause: true });
|
|
@@ -347,6 +348,11 @@ export class ParticipantAudioOutput extends AudioOutput {
|
|
|
347
348
|
|
|
348
349
|
super.captureFrame(frame);
|
|
349
350
|
|
|
351
|
+
if (!this.firstFrameEmitted) {
|
|
352
|
+
this.firstFrameEmitted = true;
|
|
353
|
+
this.onPlaybackStarted(Date.now());
|
|
354
|
+
}
|
|
355
|
+
|
|
350
356
|
// TODO(AJS-102): use frame.durationMs once available in rtc-node
|
|
351
357
|
this.pushedDuration += frame.samplesPerChannel / frame.sampleRate;
|
|
352
358
|
await this.audioSource.captureFrame(frame);
|
|
@@ -382,6 +388,8 @@ export class ParticipantAudioOutput extends AudioOutput {
|
|
|
382
388
|
|
|
383
389
|
this.pushedDuration = 0;
|
|
384
390
|
this.interruptedFuture = new Future();
|
|
391
|
+
this.firstFrameEmitted = false;
|
|
392
|
+
|
|
385
393
|
this.onPlaybackFinished({
|
|
386
394
|
playbackPosition: pushedDuration,
|
|
387
395
|
interrupted,
|
|
@@ -2,8 +2,10 @@
|
|
|
2
2
|
//
|
|
3
3
|
// SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
import {
|
|
5
|
+
type AudioFrame,
|
|
5
6
|
ConnectionState,
|
|
6
7
|
DisconnectReason,
|
|
8
|
+
type FrameProcessor,
|
|
7
9
|
type NoiseCancellationOptions,
|
|
8
10
|
type Participant,
|
|
9
11
|
ParticipantKind,
|
|
@@ -75,7 +77,7 @@ export interface RoomInputOptions {
|
|
|
75
77
|
Can be overridden by the `participant` argument of RoomIO constructor or `set_participant`.
|
|
76
78
|
*/
|
|
77
79
|
participantIdentity?: string;
|
|
78
|
-
noiseCancellation?: NoiseCancellationOptions
|
|
80
|
+
noiseCancellation?: NoiseCancellationOptions | FrameProcessor<AudioFrame>;
|
|
79
81
|
textInputCallback?: TextInputCallback;
|
|
80
82
|
/** Participant kinds accepted for auto subscription. If not provided,
|
|
81
83
|
accept `DEFAULT_PARTICIPANT_KINDS`
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
|
|
2
2
|
//
|
|
3
3
|
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
import type { Context } from '@opentelemetry/api';
|
|
4
5
|
import type { ChatItem } from '../llm/index.js';
|
|
5
6
|
import type { Task } from '../utils.js';
|
|
6
7
|
import { Event, Future, shortuuid } from '../utils.js';
|
|
@@ -42,6 +43,9 @@ export class SpeechHandle {
|
|
|
42
43
|
/** @internal */
|
|
43
44
|
_numSteps = 1;
|
|
44
45
|
|
|
46
|
+
/** @internal - OpenTelemetry context for the agent turn span */
|
|
47
|
+
_agentTurnContext?: Context;
|
|
48
|
+
|
|
45
49
|
private itemAddedCallbacks: Set<(item: ChatItem) => void> = new Set();
|
|
46
50
|
private doneCallbacks: Set<(sh: SpeechHandle) => void> = new Set();
|
|
47
51
|
|