@livekit/agents 1.0.34 → 1.0.36-dev.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.cjs.map +1 -1
- package/dist/index.cjs +3 -1
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +1 -0
- package/dist/index.d.ts +1 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +1 -0
- package/dist/index.js.map +1 -1
- package/dist/inference/api_protos.d.cts +4 -4
- package/dist/inference/api_protos.d.ts +4 -4
- package/dist/inference/interruption/AdaptiveInterruptionDetector.cjs +152 -0
- package/dist/inference/interruption/AdaptiveInterruptionDetector.cjs.map +1 -0
- package/dist/inference/interruption/AdaptiveInterruptionDetector.d.cts +50 -0
- package/dist/inference/interruption/AdaptiveInterruptionDetector.d.ts +50 -0
- package/dist/inference/interruption/AdaptiveInterruptionDetector.d.ts.map +1 -0
- package/dist/inference/interruption/AdaptiveInterruptionDetector.js +125 -0
- package/dist/inference/interruption/AdaptiveInterruptionDetector.js.map +1 -0
- package/dist/inference/interruption/InterruptionStream.cjs +310 -0
- package/dist/inference/interruption/InterruptionStream.cjs.map +1 -0
- package/dist/inference/interruption/InterruptionStream.d.cts +57 -0
- package/dist/inference/interruption/InterruptionStream.d.ts +57 -0
- package/dist/inference/interruption/InterruptionStream.d.ts.map +1 -0
- package/dist/inference/interruption/InterruptionStream.js +288 -0
- package/dist/inference/interruption/InterruptionStream.js.map +1 -0
- package/dist/inference/interruption/defaults.cjs +76 -0
- package/dist/inference/interruption/defaults.cjs.map +1 -0
- package/dist/inference/interruption/defaults.d.cts +14 -0
- package/dist/inference/interruption/defaults.d.ts +14 -0
- package/dist/inference/interruption/defaults.d.ts.map +1 -0
- package/dist/inference/interruption/defaults.js +42 -0
- package/dist/inference/interruption/defaults.js.map +1 -0
- package/dist/inference/interruption/errors.cjs +2 -0
- package/dist/inference/interruption/errors.cjs.map +1 -0
- package/dist/inference/interruption/errors.d.cts +2 -0
- package/dist/inference/interruption/errors.d.ts +2 -0
- package/dist/inference/interruption/errors.d.ts.map +1 -0
- package/dist/inference/interruption/errors.js +1 -0
- package/dist/inference/interruption/errors.js.map +1 -0
- package/dist/inference/interruption/http_transport.cjs +57 -0
- package/dist/inference/interruption/http_transport.cjs.map +1 -0
- package/dist/inference/interruption/http_transport.d.cts +23 -0
- package/dist/inference/interruption/http_transport.d.ts +23 -0
- package/dist/inference/interruption/http_transport.d.ts.map +1 -0
- package/dist/inference/interruption/http_transport.js +33 -0
- package/dist/inference/interruption/http_transport.js.map +1 -0
- package/dist/inference/interruption/index.cjs +34 -0
- package/dist/inference/interruption/index.cjs.map +1 -0
- package/dist/inference/interruption/index.d.cts +5 -0
- package/dist/inference/interruption/index.d.ts +5 -0
- package/dist/inference/interruption/index.d.ts.map +1 -0
- package/dist/inference/interruption/index.js +7 -0
- package/dist/inference/interruption/index.js.map +1 -0
- package/dist/inference/interruption/interruption.cjs +85 -0
- package/dist/inference/interruption/interruption.cjs.map +1 -0
- package/dist/inference/interruption/interruption.d.cts +48 -0
- package/dist/inference/interruption/interruption.d.ts +48 -0
- package/dist/inference/interruption/interruption.d.ts.map +1 -0
- package/dist/inference/interruption/interruption.js +59 -0
- package/dist/inference/interruption/interruption.js.map +1 -0
- package/dist/inference/llm.cjs +30 -3
- package/dist/inference/llm.cjs.map +1 -1
- package/dist/inference/llm.d.cts +3 -1
- package/dist/inference/llm.d.ts +3 -1
- package/dist/inference/llm.d.ts.map +1 -1
- package/dist/inference/llm.js +30 -3
- package/dist/inference/llm.js.map +1 -1
- package/dist/inference/utils.cjs +15 -2
- package/dist/inference/utils.cjs.map +1 -1
- package/dist/inference/utils.d.cts +1 -0
- package/dist/inference/utils.d.ts +1 -0
- package/dist/inference/utils.d.ts.map +1 -1
- package/dist/inference/utils.js +13 -1
- package/dist/inference/utils.js.map +1 -1
- package/dist/inference/utils.test.cjs +20 -0
- package/dist/inference/utils.test.cjs.map +1 -0
- package/dist/inference/utils.test.js +19 -0
- package/dist/inference/utils.test.js.map +1 -0
- package/dist/ipc/inference_proc_executor.cjs.map +1 -1
- package/dist/ipc/job_proc_executor.cjs.map +1 -1
- package/dist/ipc/job_proc_lazy_main.cjs +1 -1
- package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
- package/dist/ipc/job_proc_lazy_main.js +1 -1
- package/dist/ipc/job_proc_lazy_main.js.map +1 -1
- package/dist/llm/chat_context.cjs +20 -2
- package/dist/llm/chat_context.cjs.map +1 -1
- package/dist/llm/chat_context.d.cts +9 -0
- package/dist/llm/chat_context.d.ts +9 -0
- package/dist/llm/chat_context.d.ts.map +1 -1
- package/dist/llm/chat_context.js +20 -2
- package/dist/llm/chat_context.js.map +1 -1
- package/dist/llm/llm.cjs.map +1 -1
- package/dist/llm/llm.d.cts +1 -0
- package/dist/llm/llm.d.ts +1 -0
- package/dist/llm/llm.d.ts.map +1 -1
- package/dist/llm/llm.js.map +1 -1
- package/dist/llm/provider_format/openai.cjs +43 -20
- package/dist/llm/provider_format/openai.cjs.map +1 -1
- package/dist/llm/provider_format/openai.d.ts.map +1 -1
- package/dist/llm/provider_format/openai.js +43 -20
- package/dist/llm/provider_format/openai.js.map +1 -1
- package/dist/llm/provider_format/openai.test.cjs +35 -0
- package/dist/llm/provider_format/openai.test.cjs.map +1 -1
- package/dist/llm/provider_format/openai.test.js +35 -0
- package/dist/llm/provider_format/openai.test.js.map +1 -1
- package/dist/llm/provider_format/utils.cjs +1 -1
- package/dist/llm/provider_format/utils.cjs.map +1 -1
- package/dist/llm/provider_format/utils.d.ts.map +1 -1
- package/dist/llm/provider_format/utils.js +1 -1
- package/dist/llm/provider_format/utils.js.map +1 -1
- package/dist/stream/stream_channel.cjs +3 -0
- package/dist/stream/stream_channel.cjs.map +1 -1
- package/dist/stream/stream_channel.d.cts +3 -2
- package/dist/stream/stream_channel.d.ts +3 -2
- package/dist/stream/stream_channel.d.ts.map +1 -1
- package/dist/stream/stream_channel.js +3 -0
- package/dist/stream/stream_channel.js.map +1 -1
- package/dist/telemetry/trace_types.cjs +15 -0
- package/dist/telemetry/trace_types.cjs.map +1 -1
- package/dist/telemetry/trace_types.d.cts +5 -0
- package/dist/telemetry/trace_types.d.ts +5 -0
- package/dist/telemetry/trace_types.d.ts.map +1 -1
- package/dist/telemetry/trace_types.js +10 -0
- package/dist/telemetry/trace_types.js.map +1 -1
- package/dist/utils/ws_transport.cjs +51 -0
- package/dist/utils/ws_transport.cjs.map +1 -0
- package/dist/utils/ws_transport.d.cts +9 -0
- package/dist/utils/ws_transport.d.ts +9 -0
- package/dist/utils/ws_transport.d.ts.map +1 -0
- package/dist/utils/ws_transport.js +17 -0
- package/dist/utils/ws_transport.js.map +1 -0
- package/dist/utils/ws_transport.test.cjs +212 -0
- package/dist/utils/ws_transport.test.cjs.map +1 -0
- package/dist/utils/ws_transport.test.js +211 -0
- package/dist/utils/ws_transport.test.js.map +1 -0
- package/dist/voice/agent_activity.cjs +49 -0
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.cts +14 -0
- package/dist/voice/agent_activity.d.ts +14 -0
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +49 -0
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_session.cjs +12 -1
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +3 -0
- package/dist/voice/agent_session.d.ts +3 -0
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +12 -1
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/audio_recognition.cjs +124 -2
- package/dist/voice/audio_recognition.cjs.map +1 -1
- package/dist/voice/audio_recognition.d.cts +32 -1
- package/dist/voice/audio_recognition.d.ts +32 -1
- package/dist/voice/audio_recognition.d.ts.map +1 -1
- package/dist/voice/audio_recognition.js +127 -2
- package/dist/voice/audio_recognition.js.map +1 -1
- package/dist/voice/background_audio.cjs.map +1 -1
- package/dist/voice/generation.cjs +2 -1
- package/dist/voice/generation.cjs.map +1 -1
- package/dist/voice/generation.d.ts.map +1 -1
- package/dist/voice/generation.js +2 -1
- package/dist/voice/generation.js.map +1 -1
- package/package.json +2 -1
- package/src/index.ts +2 -0
- package/src/inference/interruption/AdaptiveInterruptionDetector.ts +166 -0
- package/src/inference/interruption/InterruptionStream.ts +397 -0
- package/src/inference/interruption/defaults.ts +33 -0
- package/src/inference/interruption/errors.ts +0 -0
- package/src/inference/interruption/http_transport.ts +61 -0
- package/src/inference/interruption/index.ts +4 -0
- package/src/inference/interruption/interruption.ts +88 -0
- package/src/inference/llm.ts +42 -3
- package/src/inference/utils.test.ts +31 -0
- package/src/inference/utils.ts +15 -0
- package/src/ipc/job_proc_lazy_main.ts +1 -1
- package/src/llm/chat_context.ts +32 -2
- package/src/llm/llm.ts +1 -0
- package/src/llm/provider_format/openai.test.ts +40 -0
- package/src/llm/provider_format/openai.ts +46 -19
- package/src/llm/provider_format/utils.ts +5 -1
- package/src/stream/stream_channel.ts +6 -2
- package/src/telemetry/trace_types.ts +7 -0
- package/src/utils/ws_transport.test.ts +282 -0
- package/src/utils/ws_transport.ts +22 -0
- package/src/voice/agent_activity.ts +61 -0
- package/src/voice/agent_session.ts +22 -2
- package/src/voice/audio_recognition.ts +161 -1
- package/src/voice/generation.ts +1 -0
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
import type { TypedEventEmitter } from '@livekit/typed-emitter';
|
|
2
|
+
import EventEmitter from 'events';
|
|
3
|
+
import { type ReadableStream, TransformStream } from 'stream/web';
|
|
4
|
+
import { InterruptionStreamBase } from './InterruptionStream.js';
|
|
5
|
+
import {
|
|
6
|
+
DEFAULT_BASE_URL,
|
|
7
|
+
FRAMES_PER_SECOND,
|
|
8
|
+
SAMPLE_RATE,
|
|
9
|
+
interruptionOptionDefaults,
|
|
10
|
+
} from './defaults.js';
|
|
11
|
+
import {
|
|
12
|
+
type InterruptionDetectionError,
|
|
13
|
+
type InterruptionEvent,
|
|
14
|
+
InterruptionEventType,
|
|
15
|
+
} from './interruption.js';
|
|
16
|
+
|
|
17
|
+
// Events mirrored onto the detector's EventEmitter interface while a consumer
// reads from stream().
type InterruptionCallbacks = {
  // An inference result classified the overlapping speech as an interruption.
  interruptionDetected: () => void;
  // An overlap-speech-ended event flowed through the stream.
  overlapSpeechDetected: () => void;
  // A detection error occurred.
  error: (error: InterruptionDetectionError) => void;
};

/**
 * Fully-resolved interruption detection configuration, as stored on
 * `AdaptiveInterruptionDetector.options` after defaults and environment
 * fallbacks have been applied.
 */
export interface InterruptionOptions {
  // Audio sample rate in Hz expected by the model (the constructor fixes this to SAMPLE_RATE).
  sampleRate: number;
  // Detection threshold forwarded with each inference request.
  threshold: number;
  // Minimum consecutive model frames; derived as ceil(minInterruptionDuration * FRAMES_PER_SECOND).
  minFrames: number;
  // Rolling audio window length in seconds; the constructor rejects values > 3.0.
  maxAudioDuration: number;
  // Seconds of audio kept before the overlap start as model context.
  audioPrefixDuration: number;
  // Seconds of audio accumulated between inference requests during overlap.
  detectionInterval: number;
  // Timeout forwarded to the inference request.
  // NOTE(review): defaults use 10_000 — presumably milliseconds; confirm.
  inferenceTimeout: number;
  // Minimum user-speech duration (seconds) to count as an interruption.
  minInterruptionDuration: number;
  // Inference endpoint; defaults to the LiveKit gateway (DEFAULT_BASE_URL).
  baseUrl: string;
  // API credentials; for the default base URL these fall back to
  // LIVEKIT_(INFERENCE_)API_KEY / LIVEKIT_(INFERENCE_)API_SECRET.
  apiKey: string;
  apiSecret: string;
  // Whether to route through the LiveKit proxy; forced to true when the
  // default base URL is used.
  useProxy: boolean;
}

/** User-facing constructor options: every field optional, defaults applied. */
export type AdaptiveInterruptionDetectorOptions = Partial<InterruptionOptions>;
|
|
39
|
+
|
|
40
|
+
export class AdaptiveInterruptionDetector extends (EventEmitter as new () => TypedEventEmitter<InterruptionCallbacks>) {
|
|
41
|
+
options: InterruptionOptions;
|
|
42
|
+
private label: string;
|
|
43
|
+
private streams: WeakSet<object>; // TODO: Union of InterruptionHttpStream | InterruptionWebSocketStream
|
|
44
|
+
|
|
45
|
+
constructor(options: AdaptiveInterruptionDetectorOptions = {}) {
|
|
46
|
+
super();
|
|
47
|
+
|
|
48
|
+
const {
|
|
49
|
+
maxAudioDuration,
|
|
50
|
+
baseUrl,
|
|
51
|
+
apiKey,
|
|
52
|
+
apiSecret,
|
|
53
|
+
useProxy: useProxyArg,
|
|
54
|
+
audioPrefixDuration,
|
|
55
|
+
threshold,
|
|
56
|
+
detectionInterval,
|
|
57
|
+
inferenceTimeout,
|
|
58
|
+
minInterruptionDuration,
|
|
59
|
+
} = { ...interruptionOptionDefaults, ...options };
|
|
60
|
+
|
|
61
|
+
if (maxAudioDuration > 3.0) {
|
|
62
|
+
throw new Error('maxAudioDuration must be less than or equal to 3.0 seconds');
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
const lkBaseUrl = baseUrl ?? process.env.LIVEKIT_REMOTE_EOT_URL ?? DEFAULT_BASE_URL;
|
|
66
|
+
let lkApiKey = apiKey ?? '';
|
|
67
|
+
let lkApiSecret = apiSecret ?? '';
|
|
68
|
+
let useProxy: boolean;
|
|
69
|
+
|
|
70
|
+
// use LiveKit credentials if using the default base URL (inference)
|
|
71
|
+
if (lkBaseUrl === DEFAULT_BASE_URL) {
|
|
72
|
+
lkApiKey =
|
|
73
|
+
apiKey ?? process.env.LIVEKIT_INFERENCE_API_KEY ?? process.env.LIVEKIT_API_KEY ?? '';
|
|
74
|
+
if (!lkApiKey) {
|
|
75
|
+
throw new Error(
|
|
76
|
+
'apiKey is required, either as argument or set LIVEKIT_API_KEY environmental variable',
|
|
77
|
+
);
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
lkApiSecret =
|
|
81
|
+
apiSecret ??
|
|
82
|
+
process.env.LIVEKIT_INFERENCE_API_SECRET ??
|
|
83
|
+
process.env.LIVEKIT_API_SECRET ??
|
|
84
|
+
'';
|
|
85
|
+
if (!lkApiSecret) {
|
|
86
|
+
throw new Error(
|
|
87
|
+
'apiSecret is required, either as argument or set LIVEKIT_API_SECRET environmental variable',
|
|
88
|
+
);
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
useProxy = true;
|
|
92
|
+
} else {
|
|
93
|
+
useProxy = useProxyArg ?? false;
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
this.options = {
|
|
97
|
+
sampleRate: SAMPLE_RATE,
|
|
98
|
+
threshold,
|
|
99
|
+
minFrames: Math.ceil(minInterruptionDuration * FRAMES_PER_SECOND),
|
|
100
|
+
maxAudioDuration,
|
|
101
|
+
audioPrefixDuration,
|
|
102
|
+
detectionInterval,
|
|
103
|
+
inferenceTimeout,
|
|
104
|
+
baseUrl: lkBaseUrl,
|
|
105
|
+
apiKey: lkApiKey,
|
|
106
|
+
apiSecret: lkApiSecret,
|
|
107
|
+
useProxy,
|
|
108
|
+
minInterruptionDuration,
|
|
109
|
+
};
|
|
110
|
+
|
|
111
|
+
this.label = `${this.constructor.name}`;
|
|
112
|
+
this.streams = new WeakSet();
|
|
113
|
+
|
|
114
|
+
console.info('adaptive interruption detector initialized', {
|
|
115
|
+
baseUrl: this.options.baseUrl,
|
|
116
|
+
detectionInterval: this.options.detectionInterval,
|
|
117
|
+
audioPrefixDuration: this.options.audioPrefixDuration,
|
|
118
|
+
maxAudioDuration: this.options.maxAudioDuration,
|
|
119
|
+
minFrames: this.options.minFrames,
|
|
120
|
+
threshold: this.options.threshold,
|
|
121
|
+
inferenceTimeout: this.options.inferenceTimeout,
|
|
122
|
+
useProxy: this.options.useProxy,
|
|
123
|
+
});
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
/**
|
|
127
|
+
* Creates a new InterruptionStreamBase for internal use.
|
|
128
|
+
* The stream can receive audio frames and sentinels via pushFrame().
|
|
129
|
+
* Use this when you need direct access to the stream for pushing frames.
|
|
130
|
+
*/
|
|
131
|
+
createStream(): InterruptionStreamBase {
|
|
132
|
+
const stream = new InterruptionStreamBase(this, {});
|
|
133
|
+
this.streams.add(stream);
|
|
134
|
+
return stream;
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
/**
|
|
138
|
+
* Creates a new interruption stream and returns a ReadableStream of InterruptionEvents.
|
|
139
|
+
* This is a convenience method for consuming interruption events without needing
|
|
140
|
+
* to manage the underlying stream directly.
|
|
141
|
+
*/
|
|
142
|
+
stream(): ReadableStream<InterruptionEvent> {
|
|
143
|
+
const httpStream = this.createStream();
|
|
144
|
+
const transformer = new TransformStream<InterruptionEvent, InterruptionEvent>({
|
|
145
|
+
transform: (chunk, controller) => {
|
|
146
|
+
if (chunk.type === InterruptionEventType.INTERRUPTION) {
|
|
147
|
+
this.emit('interruptionDetected'); // TODO payload
|
|
148
|
+
} else if (chunk.type === InterruptionEventType.OVERLAP_SPEECH_ENDED) {
|
|
149
|
+
this.emit('overlapSpeechDetected'); // TODO payload
|
|
150
|
+
}
|
|
151
|
+
controller.enqueue(chunk);
|
|
152
|
+
},
|
|
153
|
+
});
|
|
154
|
+
const stream = httpStream.stream.pipeThrough(transformer);
|
|
155
|
+
return stream;
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
updateOptions(options: { threshold?: number; minInterruptionDuration?: number }): void {
|
|
159
|
+
if (options.threshold !== undefined) {
|
|
160
|
+
this.options.threshold = options.threshold;
|
|
161
|
+
}
|
|
162
|
+
if (options.minInterruptionDuration !== undefined) {
|
|
163
|
+
this.options.minFrames = Math.ceil(options.minInterruptionDuration * FRAMES_PER_SECOND);
|
|
164
|
+
}
|
|
165
|
+
}
|
|
166
|
+
}
|
|
@@ -0,0 +1,397 @@
|
|
|
1
|
+
import { AudioFrame, AudioResampler } from '@livekit/rtc-node';
|
|
2
|
+
import type { Span } from '@opentelemetry/api';
|
|
3
|
+
import { traceTypes } from '../../telemetry/index.js';
|
|
4
|
+
import { type ReadableStream, TransformStream, WritableStream } from 'stream/web';
|
|
5
|
+
import { log } from '../../log.js';
|
|
6
|
+
import { type StreamChannel, createStreamChannel } from '../../stream/stream_channel.js';
|
|
7
|
+
import { createAccessToken } from '../utils.js';
|
|
8
|
+
import type {
|
|
9
|
+
AdaptiveInterruptionDetector,
|
|
10
|
+
InterruptionOptions,
|
|
11
|
+
} from './AdaptiveInterruptionDetector.js';
|
|
12
|
+
import { apiConnectDefaults } from './defaults.js';
|
|
13
|
+
import { predictHTTP } from './http_transport.js';
|
|
14
|
+
import {
|
|
15
|
+
InterruptionCacheEntry,
|
|
16
|
+
type InterruptionDetectionError,
|
|
17
|
+
type InterruptionEvent,
|
|
18
|
+
InterruptionEventType,
|
|
19
|
+
} from './interruption.js';
|
|
20
|
+
|
|
21
|
+
/** Sentinel: the agent has started speaking; audio buffering begins. */
export interface AgentSpeechStarted {
  type: 'agent-speech-started';
}

/** Sentinel: the agent has stopped speaking; detection state is reset. */
export interface AgentSpeechEnded {
  type: 'agent-speech-ended';
}

/** Sentinel: the user started speaking while the agent is speaking. */
export interface OverlapSpeechStarted {
  type: 'overlap-speech-started';
  // Duration of user speech already observed before this sentinel was sent.
  // NOTE(review): consumed both as seconds (scaled by sampleRate in the
  // transform) and as a direct offset from Date.now() (ms) in pushFrame() —
  // confirm the intended unit.
  speechDuration: number;
  // Telemetry span covering the user's speech; annotated when an
  // interruption is detected.
  userSpeakingSpan: Span;
}

/** Sentinel: the overlapping user speech has ended. */
export interface OverlapSpeechEnded {
  type: 'overlap-speech-ended';
}

/** Sentinel: request a flush of pending input (currently a no-op downstream). */
export interface Flush {
  type: 'flush';
}

/** Discriminated union of all control messages accepted alongside audio frames. */
export type InterruptionSentinel =
  | AgentSpeechStarted
  | AgentSpeechEnded
  | OverlapSpeechStarted
  | OverlapSpeechEnded
  | Flush;
|
|
49
|
+
|
|
50
|
+
export class InterruptionStreamSentinel {
|
|
51
|
+
static speechStarted(): AgentSpeechStarted {
|
|
52
|
+
return { type: 'agent-speech-started' };
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
static speechEnded(): AgentSpeechEnded {
|
|
56
|
+
return { type: 'agent-speech-ended' };
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
static overlapSpeechStarted(
|
|
60
|
+
speechDuration: number,
|
|
61
|
+
userSpeakingSpan: Span,
|
|
62
|
+
): OverlapSpeechStarted {
|
|
63
|
+
return { type: 'overlap-speech-started', speechDuration, userSpeakingSpan };
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
static overlapSpeechEnded(): OverlapSpeechEnded {
|
|
67
|
+
return { type: 'overlap-speech-ended' };
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
static flush(): Flush {
|
|
71
|
+
return { type: 'flush' };
|
|
72
|
+
}
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
/**
 * Connection settings for the remote inference API.
 * Defaults come from `apiConnectDefaults`.
 */
export interface ApiConnectOptions {
  // Maximum number of retry attempts.
  // NOTE(review): not consumed by the visible HTTP inference path — confirm.
  maxRetries: number;
  // Delay between retries (2_000 in apiConnectDefaults — presumably ms).
  retryInterval: number;
  // Request timeout (10_000 in apiConnectDefaults — presumably ms).
  timeout: number;
}
|
|
80
|
+
|
|
81
|
+
function updateUserSpeakingSpan(span: Span, entry: InterruptionCacheEntry) {
|
|
82
|
+
span.setAttribute(
|
|
83
|
+
traceTypes.ATTR_IS_INTERRUPTION,
|
|
84
|
+
(entry.isInterruption ?? false).toString().toLowerCase(),
|
|
85
|
+
);
|
|
86
|
+
span.setAttribute(traceTypes.ATTR_INTERRUPTION_PROBABILITY, entry.probability);
|
|
87
|
+
span.setAttribute(traceTypes.ATTR_INTERRUPTION_TOTAL_DURATION, entry.totalDuration);
|
|
88
|
+
span.setAttribute(traceTypes.ATTR_INTERRUPTION_PREDICTION_DURATION, entry.predictionDuration);
|
|
89
|
+
span.setAttribute(traceTypes.ATTR_INTERRUPTION_DETECTION_DELAY, entry.detectionDelay);
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
/**
 * A single interruption-detection session.
 *
 * Callers interleave AudioFrames with InterruptionSentinel control messages
 * via pushFrame(). The input is piped through a TransformStream that maintains
 * a rolling 16-bit mono audio window; during overlap speech, each accumulated
 * window is posted to the remote inference endpoint, and positive results are
 * written to the event stream exposed by the `stream` getter.
 */
export class InterruptionStreamBase {
  // Interleaved audio frames and control sentinels fed by pushFrame()/flush().
  private inputStream: StreamChannel<InterruptionSentinel | AudioFrame, InterruptionDetectionError>;

  // Outgoing InterruptionEvents, consumed through the `stream` getter.
  private eventStream: StreamChannel<InterruptionEvent, InterruptionDetectionError>;

  // Created lazily on the first frame whose rate differs from options.sampleRate.
  private resampler?: AudioResampler;

  // Telemetry span covering the user's current speech; set on
  // 'overlap-speech-started' and cleared on 'overlap-speech-ended'.
  private userSpeakingSpan: Span | undefined;

  // Estimated wall-clock start of the current overlap speech.
  // NOTE(review): computed as Date.now() - frame.speechDuration in pushFrame(),
  // which only holds if speechDuration is in milliseconds — confirm.
  private overlapSpeechStartedAt: number | undefined;

  // Snapshot of the parent detector's resolved options.
  private options: InterruptionOptions;

  // Connection/retry settings; merged with apiConnectDefaults.
  private apiOptions: ApiConnectOptions;

  // Parent detector that created this stream.
  private model: AdaptiveInterruptionDetector;

  constructor(model: AdaptiveInterruptionDetector, apiOptions: Partial<ApiConnectOptions>) {
    this.inputStream = createStreamChannel<
      InterruptionSentinel | AudioFrame,
      InterruptionDetectionError
    >();

    this.eventStream = createStreamChannel<InterruptionEvent, InterruptionDetectionError>();

    this.model = model;
    this.options = model.options;
    this.apiOptions = { ...apiConnectDefaults, ...apiOptions };

    this.setupTransform();
  }

  // Wires inputStream -> window-building transform -> HTTP inference sink.
  private setupTransform() {
    // Session state closed over by the transform; reset by the sentinels below.
    let agentSpeechStarted = false;
    let startIdx = 0; // next write position in inferenceS16Data
    let accumulatedSamples = 0; // samples gathered since the last inference send
    let overlapSpeechStarted = false;
    // Predictions keyed by the response's createdAt value.
    const cache = new Map<number, InterruptionCacheEntry>(); // TODO limit cache size
    // Rolling mono s16 PCM window sized for the maximum audio duration.
    const inferenceS16Data = new Int16Array(
      Math.ceil(this.options.maxAudioDuration * this.options.sampleRate),
    ).fill(0);

    const transformer = new TransformStream<InterruptionSentinel | AudioFrame, Int16Array>(
      {
        transform: (chunk, controller) => {
          if (chunk instanceof AudioFrame) {
            // Audio is only buffered while the agent is speaking.
            if (!agentSpeechStarted) {
              return;
            }
            // pushFrame() already resamples, so a mismatch here is a hard error.
            if (this.options.sampleRate !== chunk.sampleRate) {
              controller.error('the sample rate of the input frames must be consistent');
              return;
            }
            const result = writeToInferenceS16Data(
              chunk,
              startIdx,
              inferenceS16Data,
              this.options.maxAudioDuration,
            );
            startIdx = result.startIdx;
            accumulatedSamples += result.samplesWritten;

            // Send data for inference when enough samples accumulated during overlap
            if (
              accumulatedSamples >=
                Math.floor(this.options.detectionInterval * this.options.sampleRate) &&
              overlapSpeechStarted
            ) {
              // Send a copy of the audio data up to startIdx for inference
              const audioSlice = inferenceS16Data.slice(0, startIdx);
              // TODO: send to data channel - dataChan.send(audioSlice);
              accumulatedSamples = 0;
              controller.enqueue(audioSlice);
            }
          } else if (chunk.type === 'agent-speech-started') {
            log().debug('agent speech started');

            // New agent turn: reset the window and any cached predictions.
            agentSpeechStarted = true;
            overlapSpeechStarted = false;
            accumulatedSamples = 0;
            startIdx = 0;
            cache.clear();
          } else if (chunk.type === 'agent-speech-ended') {
            log().debug('agent speech ended');

            // Agent stopped: detection pauses until the next turn starts.
            agentSpeechStarted = false;
            overlapSpeechStarted = false;
            accumulatedSamples = 0;
            startIdx = 0;
            cache.clear();
          } else if (chunk.type === 'overlap-speech-started' && agentSpeechStarted) {
            this.userSpeakingSpan = chunk.userSpeakingSpan;
            log().debug('overlap speech started, starting interruption inference');
            overlapSpeechStarted = true;
            accumulatedSamples = 0;
            // Include both speech duration and audio prefix duration for context
            const shiftSize = Math.min(
              startIdx,
              Math.round(chunk.speechDuration * this.options.sampleRate) +
                Math.round(this.options.audioPrefixDuration * this.options.sampleRate),
            );
            // Shift the buffer: copy the last `shiftSize` samples before startIdx
            // to the beginning of the buffer. This preserves recent audio context
            // (the user's speech that occurred just before overlap was detected).
            inferenceS16Data.copyWithin(0, startIdx - shiftSize, startIdx);
            startIdx = shiftSize;
            cache.clear();
          } else if (chunk.type === 'overlap-speech-ended') {
            log().debug('overlap speech ended');

            if (overlapSpeechStarted) {
              this.userSpeakingSpan = undefined;
              // Report the most recent prediction for this overlap, or a
              // default entry when no request was ever made.
              let latestEntry = Array.from(cache.values()).at(-1);
              if (!latestEntry) {
                log().debug('no request made for overlap speech');
                latestEntry = InterruptionCacheEntry.default();
              } else {
                cache.delete(latestEntry.createdAt);
              }
              const event: InterruptionEvent = {
                type: InterruptionEventType.OVERLAP_SPEECH_ENDED,
                timestamp: Date.now(),
                isInterruption: false,
                overlapSpeechStartedAt: this.overlapSpeechStartedAt,
                speechInput: latestEntry.speechInput,
                probabilities: latestEntry.probabilities,
                totalDuration: latestEntry.totalDuration,
                detectionDelay: latestEntry.detectionDelay,
                predictionDuration: latestEntry.predictionDuration,
                probability: latestEntry.probability,
              };
              this.eventStream.write(event);
            }
          } else if (chunk.type === 'flush') {
            log().debug('flushing');
            // do nothing
          }
        },
      },
      { highWaterMark: Number.MAX_SAFE_INTEGER },
      { highWaterMark: Number.MAX_SAFE_INTEGER },
    );

    const httpPostWriter = new WritableStream<Int16Array>(
      {
        // Implement the sink
        write: async (chunk) => {
          // Ignore windows that arrive outside an overlap episode.
          if (!this.overlapSpeechStartedAt) {
            return;
          }
          const resp = await predictHTTP(
            chunk,
            { threshold: this.options.threshold, minFrames: this.options.minFrames },
            {
              baseUrl: this.options.baseUrl,
              timeout: this.options.inferenceTimeout,
              token: await createAccessToken(this.options.apiKey, this.options.apiSecret),
            },
          );
          console.log('received inference response', resp);
          const { createdAt, isBargein, probabilities, predictionDuration } = resp;
          const entry = new InterruptionCacheEntry({
            createdAt,
            probabilities,
            isInterruption: isBargein,
            speechInput: chunk,
            // NOTE(review): mixes performance.now() with the response's
            // createdAt and divides by 1e9 (ns) — confirm clocks/units agree.
            totalDuration: (performance.now() - createdAt) / 1e9,
            detectionDelay: Date.now() - this.overlapSpeechStartedAt,
            predictionDuration,
          });
          cache.set(createdAt, entry);
          if (overlapSpeechStarted && entry.isInterruption) {
            if (this.userSpeakingSpan) {
              updateUserSpeakingSpan(this.userSpeakingSpan, entry);
            }
            const event: InterruptionEvent = {
              type: InterruptionEventType.INTERRUPTION,
              timestamp: Date.now(),
              overlapSpeechStartedAt: this.overlapSpeechStartedAt,
              isInterruption: entry.isInterruption,
              speechInput: entry.speechInput,
              probabilities: entry.probabilities,
              totalDuration: entry.totalDuration,
              predictionDuration: entry.predictionDuration,
              detectionDelay: entry.detectionDelay,
              probability: entry.probability,
            };
            this.eventStream.write(event);
          }
        },
        close() {
          console.log('closing http writer');
        },
        abort(err) {
          console.log('Sink error:', err);
        },
      },
      { highWaterMark: Number.MAX_SAFE_INTEGER },
    );

    // NOTE(review): the pipe promise is not awaited or caught; transform/sink
    // failures may surface as unhandled rejections — confirm intended.
    this.inputStream.stream().pipeThrough(transformer).pipeTo(httpPostWriter);
  }

  // Throws if the caller keeps writing after endInput()/close().
  private ensureInputNotEnded() {
    if (this.inputStream.closed) {
      throw new Error('input stream is closed');
    }
  }

  private ensureStreamsNotEnded() {
    this.ensureInputNotEnded();
  }

  // Lazily builds a single resampler targeting options.sampleRate. The input
  // rate is fixed by the first mismatching frame; pushFrame() rejects changes.
  private getResamplerFor(inputSampleRate: number): AudioResampler {
    if (!this.resampler) {
      this.resampler = new AudioResampler(inputSampleRate, this.options.sampleRate);
    }
    return this.resampler;
  }

  /** Readable side: InterruptionEvents produced by this session. */
  get stream(): ReadableStream<InterruptionEvent> {
    return this.eventStream.stream();
  }

  /**
   * Pushes an audio frame or control sentinel into the session.
   * Frames with a different sample rate are resampled to options.sampleRate.
   * @throws Error if the input stream is already closed, or if the input
   *   sample rate changes after the resampler has been created.
   */
  async pushFrame(frame: InterruptionSentinel | AudioFrame): Promise<void> {
    this.ensureStreamsNotEnded();
    if (!(frame instanceof AudioFrame)) {
      if (frame.type === 'overlap-speech-started') {
        this.overlapSpeechStartedAt = Date.now() - frame.speechDuration;
      }
      return this.inputStream.write(frame);
    } else if (this.options.sampleRate !== frame.sampleRate) {
      const resampler = this.getResamplerFor(frame.sampleRate);
      if (resampler.inputRate !== frame.sampleRate) {
        throw new Error('the sample rate of the input frames must be consistent');
      }
      for (const resampledFrame of resampler.push(frame)) {
        await this.inputStream.write(resampledFrame);
      }
    } else {
      await this.inputStream.write(frame);
    }
  }

  /** Enqueues a flush sentinel (currently a no-op in the transform). */
  async flush(): Promise<void> {
    this.ensureStreamsNotEnded();
    this.inputStream.write(InterruptionStreamSentinel.flush());
  }

  /** Flushes and closes the input side; already-queued frames still drain. */
  async endInput(): Promise<void> {
    await this.flush();
    await this.inputStream.close();
  }

  /** Closes the input stream if it is still open. */
  async close(): Promise<void> {
    if (!this.inputStream.closed) await this.inputStream.close();
  }
}
|
|
350
|
+
|
|
351
|
+
/**
|
|
352
|
+
* Write the audio frame to the output data array and return the new start index
|
|
353
|
+
* and the number of samples written.
|
|
354
|
+
*/
|
|
355
|
+
function writeToInferenceS16Data(
|
|
356
|
+
frame: AudioFrame,
|
|
357
|
+
startIdx: number,
|
|
358
|
+
outData: Int16Array,
|
|
359
|
+
maxAudioDuration: number,
|
|
360
|
+
): { startIdx: number; samplesWritten: number } {
|
|
361
|
+
const maxWindowSize = Math.floor(maxAudioDuration * frame.sampleRate);
|
|
362
|
+
|
|
363
|
+
if (frame.samplesPerChannel > outData.length) {
|
|
364
|
+
throw new Error('frame samples are greater than the max window size');
|
|
365
|
+
}
|
|
366
|
+
|
|
367
|
+
// Shift the data to the left if the window would overflow
|
|
368
|
+
const shift = startIdx + frame.samplesPerChannel - maxWindowSize;
|
|
369
|
+
if (shift > 0) {
|
|
370
|
+
outData.copyWithin(0, shift, startIdx);
|
|
371
|
+
startIdx -= shift;
|
|
372
|
+
}
|
|
373
|
+
|
|
374
|
+
// Get the frame data as Int16Array
|
|
375
|
+
const frameData = new Int16Array(
|
|
376
|
+
frame.data.buffer,
|
|
377
|
+
frame.data.byteOffset,
|
|
378
|
+
frame.samplesPerChannel * frame.channels,
|
|
379
|
+
);
|
|
380
|
+
|
|
381
|
+
if (frame.channels > 1) {
|
|
382
|
+
// Mix down multiple channels to mono by averaging
|
|
383
|
+
for (let i = 0; i < frame.samplesPerChannel; i++) {
|
|
384
|
+
let sum = 0;
|
|
385
|
+
for (let ch = 0; ch < frame.channels; ch++) {
|
|
386
|
+
sum += frameData[i * frame.channels + ch] ?? 0;
|
|
387
|
+
}
|
|
388
|
+
outData[startIdx + i] = Math.floor(sum / frame.channels);
|
|
389
|
+
}
|
|
390
|
+
} else {
|
|
391
|
+
// Single channel - copy directly
|
|
392
|
+
outData.set(frameData, startIdx);
|
|
393
|
+
}
|
|
394
|
+
|
|
395
|
+
startIdx += frame.samplesPerChannel;
|
|
396
|
+
return { startIdx, samplesWritten: frame.samplesPerChannel };
|
|
397
|
+
}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import type { InterruptionOptions } from './AdaptiveInterruptionDetector.js';
import type { ApiConnectOptions } from './InterruptionStream.js';

// Tuning constants for the interruption (barge-in) detector.
// Durations below are in seconds unless noted otherwise.
export const MIN_INTERRUPTION_DURATION = 0.025 * 2; // 25ms per frame, 2 consecutive frames
// Probability threshold a frame must exceed to count toward an interruption.
export const THRESHOLD = 0.65;
// Longest audio window kept for inference.
export const MAX_AUDIO_DURATION = 3.0;
// Audio retained ahead of the detection window.
export const AUDIO_PREFIX_DURATION = 0.5;
// Interval between detection passes.
export const DETECTION_INTERVAL = 0.1;
// NOTE(review): not referenced in the defaults below — interruptionOptionDefaults
// hardcodes inferenceTimeout to 10_000 instead; confirm which value (and unit)
// is intended for the remote inference timeout.
export const REMOTE_INFERENCE_TIMEOUT = 1.0;
// Sample rate (Hz) expected by the inference model.
export const SAMPLE_RATE = 16000;
// Model frame rate: 40 frames/second, i.e. 25ms frames.
export const FRAMES_PER_SECOND = 40;
export const DEFAULT_BASE_URL = 'https://agent-gateway.livekit.cloud/v1';

// Retry/timeout defaults for the gateway connection — presumably
// milliseconds; verify against ApiConnectOptions consumers.
export const apiConnectDefaults: ApiConnectOptions = {
  maxRetries: 3,
  retryInterval: 2_000,
  timeout: 10_000,
} as const;

// Default detector options. API credentials fall back to the standard
// LiveKit environment variables (empty string when unset).
export const interruptionOptionDefaults: InterruptionOptions = {
  sampleRate: SAMPLE_RATE,
  threshold: THRESHOLD,
  // Minimum consecutive model frames: 0.05s * 40fps = 2 frames.
  minFrames: Math.ceil(MIN_INTERRUPTION_DURATION * FRAMES_PER_SECOND),
  maxAudioDuration: MAX_AUDIO_DURATION,
  audioPrefixDuration: AUDIO_PREFIX_DURATION,
  detectionInterval: DETECTION_INTERVAL,
  inferenceTimeout: 10_000,
  baseUrl: DEFAULT_BASE_URL,
  apiKey: process.env.LIVEKIT_API_KEY || '',
  apiSecret: process.env.LIVEKIT_API_SECRET || '',
  useProxy: false,
  minInterruptionDuration: MIN_INTERRUPTION_DURATION,
} as const;
|
|
File without changes
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
import { ofetch } from 'ofetch';

/** Transport-level options for a single POST to the inference gateway. */
export interface PostOptions {
  baseUrl: string;
  // Bearer token placed in the Authorization header.
  token: string;
  // Optional cancellation signal forwarded to the request.
  signal?: AbortSignal;
  // Request timeout forwarded to ofetch (milliseconds, per ofetch convention).
  timeout?: number;
}

/** Model parameters forwarded to the endpoint as query-string arguments. */
export interface PredictOptions {
  threshold: number;
  minFrames: number;
}

/** Wire format returned by the /bargein endpoint (snake_case fields). */
export interface PredictEndpointResponse {
  created_at: number;
  is_bargein: boolean;
  probabilities: number[];
}

/** Normalized (camelCase) prediction result returned to callers. */
export interface PredictResponse {
  createdAt: number;
  isBargein: boolean;
  probabilities: Float32Array;
  // Wall-clock time spent on the prediction request.
  predictionDuration: number;
}
|
|
27
|
+
|
|
28
|
+
export async function predictHTTP(
|
|
29
|
+
data: Int16Array,
|
|
30
|
+
predictOptions: PredictOptions,
|
|
31
|
+
options: PostOptions,
|
|
32
|
+
): Promise<PredictResponse> {
|
|
33
|
+
const createdAt = performance.now();
|
|
34
|
+
const url = new URL(`/bargein`, options.baseUrl);
|
|
35
|
+
url.searchParams.append('threshold', predictOptions.threshold.toString());
|
|
36
|
+
url.searchParams.append('min_frames', predictOptions.minFrames.toFixed());
|
|
37
|
+
url.searchParams.append('created_at', createdAt.toFixed());
|
|
38
|
+
|
|
39
|
+
const { created_at, is_bargein, probabilities } = await ofetch<PredictEndpointResponse>(
|
|
40
|
+
url.toString(),
|
|
41
|
+
{
|
|
42
|
+
retry: 1,
|
|
43
|
+
retryDelay: 100,
|
|
44
|
+
headers: {
|
|
45
|
+
'Content-Type': 'application/octet-stream',
|
|
46
|
+
Authorization: `Bearer ${options.token}`,
|
|
47
|
+
},
|
|
48
|
+
signal: options.signal,
|
|
49
|
+
timeout: options.timeout,
|
|
50
|
+
method: 'POST',
|
|
51
|
+
body: data,
|
|
52
|
+
},
|
|
53
|
+
);
|
|
54
|
+
|
|
55
|
+
return {
|
|
56
|
+
createdAt: created_at,
|
|
57
|
+
isBargein: is_bargein,
|
|
58
|
+
probabilities: new Float32Array(probabilities),
|
|
59
|
+
predictionDuration: (performance.now() - createdAt) / 1e9,
|
|
60
|
+
};
|
|
61
|
+
}
|