@juspay/neurolink 9.69.3 → 9.70.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +12 -0
- package/dist/browser/neurolink.min.js +355 -347
- package/dist/core/modules/GenerationHandler.js +75 -23
- package/dist/core/modules/structuredOutputPolicy.d.ts +28 -0
- package/dist/core/modules/structuredOutputPolicy.js +50 -0
- package/dist/lib/core/modules/GenerationHandler.js +75 -23
- package/dist/lib/core/modules/structuredOutputPolicy.d.ts +28 -0
- package/dist/lib/core/modules/structuredOutputPolicy.js +51 -0
- package/dist/lib/neurolink.js +58 -0
- package/dist/lib/providers/anthropic.js +34 -7
- package/dist/lib/providers/googleVertex.js +17 -2
- package/dist/lib/types/generate.d.ts +47 -19
- package/dist/lib/types/index.d.ts +1 -0
- package/dist/lib/types/index.js +1 -0
- package/dist/lib/types/livekit.d.ts +369 -0
- package/dist/lib/types/livekit.js +13 -0
- package/dist/lib/types/utilities.d.ts +16 -0
- package/dist/lib/utils/json/coerce.d.ts +10 -0
- package/dist/lib/utils/json/coerce.js +141 -0
- package/dist/lib/utils/json/extract.d.ts +10 -0
- package/dist/lib/utils/json/extract.js +61 -11
- package/dist/lib/utils/tokenLimits.d.ts +20 -0
- package/dist/lib/utils/tokenLimits.js +55 -0
- package/dist/lib/voice/livekit/brain.d.ts +21 -0
- package/dist/lib/voice/livekit/brain.js +75 -0
- package/dist/lib/voice/livekit/config.d.ts +41 -0
- package/dist/lib/voice/livekit/config.js +80 -0
- package/dist/lib/voice/livekit/eventBridge.d.ts +27 -0
- package/dist/lib/voice/livekit/eventBridge.js +360 -0
- package/dist/lib/voice/livekit/index.d.ts +15 -0
- package/dist/lib/voice/livekit/index.js +16 -0
- package/dist/lib/voice/livekit/tokens.d.ts +19 -0
- package/dist/lib/voice/livekit/tokens.js +51 -0
- package/dist/lib/voice/livekit/voiceAgent.d.ts +32 -0
- package/dist/lib/voice/livekit/voiceAgent.js +415 -0
- package/dist/lib/voice/livekit/voiceAgentWorker.d.ts +27 -0
- package/dist/lib/voice/livekit/voiceAgentWorker.js +58 -0
- package/dist/neurolink.js +58 -0
- package/dist/providers/anthropic.js +34 -7
- package/dist/providers/googleVertex.js +17 -2
- package/dist/types/generate.d.ts +47 -19
- package/dist/types/index.d.ts +1 -0
- package/dist/types/index.js +1 -0
- package/dist/types/livekit.d.ts +369 -0
- package/dist/types/livekit.js +12 -0
- package/dist/types/utilities.d.ts +16 -0
- package/dist/utils/json/coerce.d.ts +10 -0
- package/dist/utils/json/coerce.js +140 -0
- package/dist/utils/json/extract.d.ts +10 -0
- package/dist/utils/json/extract.js +61 -11
- package/dist/utils/tokenLimits.d.ts +20 -0
- package/dist/utils/tokenLimits.js +55 -0
- package/dist/voice/livekit/brain.d.ts +21 -0
- package/dist/voice/livekit/brain.js +74 -0
- package/dist/voice/livekit/config.d.ts +41 -0
- package/dist/voice/livekit/config.js +79 -0
- package/dist/voice/livekit/eventBridge.d.ts +27 -0
- package/dist/voice/livekit/eventBridge.js +359 -0
- package/dist/voice/livekit/index.d.ts +15 -0
- package/dist/voice/livekit/index.js +15 -0
- package/dist/voice/livekit/tokens.d.ts +19 -0
- package/dist/voice/livekit/tokens.js +50 -0
- package/dist/voice/livekit/voiceAgent.d.ts +32 -0
- package/dist/voice/livekit/voiceAgent.js +414 -0
- package/dist/voice/livekit/voiceAgentWorker.d.ts +27 -0
- package/dist/voice/livekit/voiceAgentWorker.js +57 -0
- package/package.json +23 -6
|
@@ -0,0 +1,414 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LiveKit Agents agent definition.
|
|
3
|
+
*
|
|
4
|
+
* `defineVoiceAgent` returns the agent object placed as the default export of a
|
|
5
|
+
* worker entry file. The framework runs it as a Job (one per call, in its own
|
|
6
|
+
* process): it connects to the room, builds the NeuroLink brain via the
|
|
7
|
+
* supplied factory, wires Silero VAD + STT/TTS plugins, and overrides `llmNode`
|
|
8
|
+
* so every turn is generated by `neurolink.stream()`.
|
|
9
|
+
*
|
|
10
|
+
* `@livekit/agents` and the plugins are optional dependencies, imported
|
|
11
|
+
* dynamically so the core package does not require them unless the LiveKit
|
|
12
|
+
* voice agent is used. Type-only imports are erased at build time and add no
|
|
13
|
+
* runtime dependency.
|
|
14
|
+
*
|
|
15
|
+
* See docs/features/livekit-voice-agent.md.
|
|
16
|
+
*/
|
|
17
|
+
import { ReadableStream } from "node:stream/web";
|
|
18
|
+
import { logger } from "../../utils/logger.js";
|
|
19
|
+
import { createVoiceBrain } from "./brain.js";
|
|
20
|
+
import { resolveBrainDefaults, resolveEouTurnDetection } from "./config.js";
|
|
21
|
+
import { attachEventBridge } from "./eventBridge.js";
|
|
22
|
+
const DEFAULT_CONVERSATION_PREFIX = "voice";
|
|
23
|
+
// Turn-end timing defaults (approach A: silence tuning). Longer silence +
|
|
24
|
+
// endpointing floor so natural mid-sentence pauses ("...and so, [pause] um")
|
|
25
|
+
// don't end the turn early and split one utterance into two. Overridable via
|
|
26
|
+
// config (vad.minSilenceDuration / turn.minEndpointingDelay).
|
|
27
|
+
const DEFAULT_MIN_SILENCE_DURATION = 1.0; // seconds (Silero VAD)
|
|
28
|
+
const DEFAULT_MIN_ENDPOINTING_DELAY = 1000; // ms (framework endpointing floor)
|
|
29
|
+
/**
 * Return the text of the newest user message in a chat context.
 *
 * Scans `chatCtx.items` from the end and stops at the first entry whose
 * `type` is "message" and whose `role` is "user" (discriminant check only,
 * no type assertions).
 *
 * @param chatCtx - Object exposing an `items` array of chat entries.
 * @returns That entry's `textContent`, or `undefined` when there is no user message.
 */
function latestUserText(chatCtx) {
    const entries = chatCtx.items;
    let index = entries.length;
    while (index > 0) {
        index -= 1;
        const entry = entries[index];
        const isUserMessage = entry.type === "message" && entry.role === "user";
        if (isUserMessage) {
            return entry.textContent;
        }
    }
    return undefined;
}
|
|
43
|
+
/**
 * Build a text stream for a single turn from the brain, abortable on cancel.
 *
 * Each `pull` forwards one chunk from `brain.streamReply(...)` into the
 * returned `ReadableStream`. When the consumer cancels the stream (barge-in),
 * the turn's `AbortSignal` is aborted and the underlying async iterator is
 * released so the generator's `finally` blocks run.
 *
 * @param brain - Object exposing `streamReply({ transcript, conversationId, signal })`,
 *   which returns an async iterable of chunks.
 * @param transcript - User text for this turn.
 * @param conversationId - Conversation identifier passed through to the brain.
 * @param onAbortedBeforeOutput - Optional callback invoked when the stream is
 *   cancelled before any chunk was pulled from the brain.
 * @returns A `ReadableStream` of the brain's chunks.
 */
function brainTurnStream(brain, transcript, conversationId, onAbortedBeforeOutput) {
    const controller = new AbortController();
    const generator = brain.streamReply({
        transcript,
        conversationId,
        signal: controller.signal,
    });
    const iterator = generator[Symbol.asyncIterator]();
    let producedOutput = false;
    return new ReadableStream({
        async pull(streamController) {
            const next = await iterator.next();
            if (next.done === true) {
                streamController.close();
                return;
            }
            producedOutput = true;
            streamController.enqueue(next.value);
        },
        cancel() {
            controller.abort();
            // Aborting the signal alone does not resume a generator that is
            // suspended at a `yield`, so its `finally` cleanup would never run.
            // Release the iterator explicitly. This is deliberately not awaited:
            // a brain that ignores the signal must not be able to stall
            // cancellation. Teardown errors are ignored because the turn has
            // already been abandoned and nothing consumes them.
            Promise.resolve()
                .then(() => iterator.return?.())
                .catch(() => { });
            if (!producedOutput) {
                onAbortedBeforeOutput?.();
            }
        },
    });
}
|
|
74
|
+
/**
 * Create the English semantic end-of-utterance (EOU) turn detector, or return
 * `undefined` when the feature is switched off.
 *
 * The detector sits on top of VAD: VAD reports acoustic silence, and this
 * model then judges whether the user's turn is semantically finished, so a
 * natural mid-sentence pause does not split one utterance in two. It is
 * opt-in via `LIVEKIT_EOU_TURN_DETECTION`. The inference runner itself is
 * registered in the worker process (see `voiceAgentWorker.ts`); this function
 * only builds the model handle.
 *
 * @returns A `turnDetector.EnglishModel` instance, or `undefined` if disabled.
 */
async function loadEouTurnDetector() {
    const eouSettings = resolveEouTurnDetection();
    if (eouSettings.enabled) {
        // Optional dependency: only loaded when EOU detection is enabled.
        const plugin = await import("@livekit/agents-plugin-livekit");
        return new plugin.turnDetector.EnglishModel(eouSettings.unlikelyThreshold);
    }
    return undefined;
}
|
|
92
|
+
/**
 * Load the Silero VAD used by the session.
 *
 * Defaults are stricter than the plugin's own so that background noise is not
 * mistaken for speech: a higher activation threshold plus a minimum speech
 * duration reject short, quiet noise bursts.
 *
 * @param config - Optional overrides: `activationThreshold`,
 *   `minSpeechDuration`, `minSilenceDuration`. May be `undefined`.
 * @returns Whatever `silero.VAD.load(...)` returns for the resolved options.
 */
async function loadVad(config) {
    const vadOptions = {
        activationThreshold: config?.activationThreshold ?? 0.6,
        minSpeechDuration: config?.minSpeechDuration ?? 0.2,
        minSilenceDuration: config?.minSilenceDuration ?? DEFAULT_MIN_SILENCE_DURATION,
    };
    // Optional dependency, imported lazily so the core package works without it.
    const sileroPlugin = await import("@livekit/agents-plugin-silero");
    return sileroPlugin.VAD.load(vadOptions);
}
|
|
107
|
+
/**
 * Construct the STT plugin instance from configuration.
 *
 * Only defined options are passed — passing `undefined` would override the
 * plugin's own defaults (e.g. its default model) with `undefined` and break it.
 *
 * @param config - STT settings. `provider` selects the plugin ("soniox" or
 *   "deepgram"); `model` and `language` are forwarded when defined;
 *   `maxEndpointDelayMs` is forwarded to Soniox only.
 * @returns A new STT instance from the selected plugin.
 * @throws {Error} When `config` is missing or `config.provider` is unsupported.
 */
async function buildStt(config) {
    if (config === undefined || config === null) {
        // Fail with an actionable message instead of a TypeError on `.provider`.
        throw new Error("LiveKit STT configuration is required (supported providers: soniox, deepgram)");
    }
    if (config.provider === "soniox") {
        const soniox = await import("@livekit/agents-plugin-soniox");
        const opts = {};
        if (config.model !== undefined) {
            opts.model = config.model;
        }
        if (config.language !== undefined) {
            // Soft hint only: Soniox biases toward this language but can still
            // auto-detect another (e.g. the user switching to Telugu mid-call).
            // Do NOT set `languageHintsStrict` — forcing the hinted language makes
            // the realtime STT stall/error on other-language audio and the session
            // never recovers (no further transcripts, so no audio at all).
            opts.languageHints = [config.language];
        }
        if (config.maxEndpointDelayMs !== undefined) {
            opts.maxEndpointDelayMs = config.maxEndpointDelayMs;
        }
        return new soniox.STT(opts);
    }
    if (config.provider === "deepgram") {
        const deepgram = await import("@livekit/agents-plugin-deepgram");
        const opts = {};
        // Forward the configured model as the Soniox branch does, so a
        // caller-selected model is not silently ignored.
        if (config.model !== undefined) {
            opts.model = config.model;
        }
        if (config.language !== undefined) {
            opts.language = config.language;
        }
        return new deepgram.STT(opts);
    }
    throw new Error(`Unsupported LiveKit STT provider "${config.provider}" (supported: soniox, deepgram)`);
}
|
|
143
|
+
/**
 * Construct the TTS plugin instance from configuration.
 *
 * Only defined options are passed — passing `undefined` would override the
 * plugin's own defaults (default voice/model) with `undefined` and break it.
 *
 * @param config - TTS settings. `provider` selects the plugin ("cartesia" or
 *   "elevenlabs"); `voice` and `model` are forwarded when defined, under the
 *   option names each plugin is given here (`voice`/`model` for Cartesia,
 *   `voiceId`/`modelID` for ElevenLabs).
 * @returns A new TTS instance from the selected plugin.
 * @throws {Error} When `config` is missing or `config.provider` is unsupported.
 */
async function buildTts(config) {
    if (config === undefined || config === null) {
        // Fail with an actionable message instead of a TypeError on `.provider`.
        throw new Error("LiveKit TTS configuration is required (supported providers: cartesia, elevenlabs)");
    }
    if (config.provider === "cartesia") {
        const cartesia = await import("@livekit/agents-plugin-cartesia");
        const opts = {};
        if (config.voice !== undefined) {
            opts.voice = config.voice;
        }
        if (config.model !== undefined) {
            opts.model = config.model;
        }
        return new cartesia.TTS(opts);
    }
    if (config.provider === "elevenlabs") {
        const elevenlabs = await import("@livekit/agents-plugin-elevenlabs");
        const opts = {};
        if (config.voice !== undefined) {
            opts.voiceId = config.voice;
        }
        if (config.model !== undefined) {
            opts.modelID = config.model;
        }
        return new elevenlabs.TTS(opts);
    }
    throw new Error(`Unsupported LiveKit TTS provider "${config.provider}" (supported: cartesia, elevenlabs)`);
}
|
|
174
|
+
/**
 * Define a LiveKit voice agent backed by NeuroLink.
 *
 * Place the result as the default export of the worker entry file:
 *
 * ```ts
 * export default defineVoiceAgent({
 *   createNeuroLink: async () => buildConfiguredNeuroLink(),
 *   stt: { provider: "deepgram" },
 *   tts: { provider: "elevenlabs" },
 * });
 * ```
 *
 * The returned `entry(ctx)` connects to the room, builds the brain from
 * `config.createNeuroLink()`, loads VAD/STT/TTS (and the optional EOU turn
 * detector), starts an `AgentSession` whose `llmNode` streams replies from the
 * brain, arms an inactivity watchdog, and — when `config.events.enabled` is
 * `true` and the NeuroLink instance exposes `getEventEmitter` — attaches the
 * data-channel event bridge.
 *
 * @param config - Agent configuration. Fields read here: `createNeuroLink`,
 *   `provider`, `model`, `conversationIdPrefix`, `systemPrompt`, `temperature`,
 *   `maxTokens`, `userId`, `vad`, `stt`, `tts`, `events`, `interruption`, `turn`.
 * @returns An object `{ entry }` where `entry` is the async job entry point.
 */
export function defineVoiceAgent(config) {
    const defaults = resolveBrainDefaults();
    const provider = config.provider ?? defaults.provider;
    const model = config.model ?? defaults.model;
    const conversationPrefix = config.conversationIdPrefix ?? DEFAULT_CONVERSATION_PREFIX;
    async function entry(ctx) {
        const entryStartedAt = Date.now();
        await ctx.connect();
        logger.debug(`[LiveKitVoiceAgent] Joined room "${ctx.room.name}" in ${Date.now() - entryStartedAt}ms`);
        // When the user actually stopped speaking (VAD), used to measure how long
        // the agent waited after speech before committing the turn to the LLM.
        let userStoppedSpeakingAt;
        const neurolink = await config.createNeuroLink();
        const brain = createVoiceBrain({
            neurolink,
            provider,
            model,
            systemPrompt: config.systemPrompt,
            temperature: config.temperature,
            maxTokens: config.maxTokens,
            userId: config.userId,
        });
        const conversationId = `${conversationPrefix}-${ctx.room.name ?? ctx.job.id}`;
        const { voice, llm } = await import("@livekit/agents");
        const [vad, stt, tts, eouTurnDetector] = await Promise.all([
            loadVad(config.vad),
            buildStt(config.stt),
            buildTts(config.tts),
            loadEouTurnDetector(),
        ]);
        const transcriptEventsEnabled = config.events?.enabled === true &&
            typeof neurolink.getEventEmitter === "function";
        const transcriptEmitter = transcriptEventsEnabled
            ? neurolink.getEventEmitter?.()
            : undefined;
        let userTranscriptBuffer = "";
        let pendingPrefix = "";
        /**
         * Join the per-turn buffer with a new (already trimmed) segment.
         * Empty segments leave the buffer untouched so no trailing space is
         * accumulated or emitted.
         */
        function appendSegment(buffer, segment) {
            if (segment.length === 0) {
                return buffer;
            }
            return buffer.length > 0 ? `${buffer} ${segment}` : segment;
        }
        /**
         * Emit a live (`final: false`) update of the user bubble. Finalized
         * segments are folded into the per-turn buffer; interim segments are
         * only appended for display.
         */
        function emitUserTranscriptSegment(segmentText, isFinal) {
            if (transcriptEmitter === undefined) {
                return;
            }
            const trimmed = segmentText.trim();
            if (isFinal) {
                userTranscriptBuffer = appendSegment(userTranscriptBuffer, trimmed);
                transcriptEmitter.emit("voice:user-transcript", {
                    text: userTranscriptBuffer,
                    final: false,
                });
                return;
            }
            const live = appendSegment(userTranscriptBuffer, trimmed);
            transcriptEmitter.emit("voice:user-transcript", {
                text: live,
                final: false,
            });
        }
        /**
         * Lock the user bubble at turn-end and reset the buffer for the next turn.
         * `replacesPrevious` tells the client this committed turn absorbed a prior
         * interrupted turn, so it should remove the orphaned previous user bubble.
         */
        function commitUserTranscript(finalText, replacesPrevious = false) {
            if (transcriptEmitter !== undefined) {
                transcriptEmitter.emit("voice:user-transcript", {
                    text: finalText,
                    final: true,
                    replacesPrevious,
                });
            }
            userTranscriptBuffer = "";
        }
        class NeuroLinkVoiceAgent extends voice.Agent {
            async llmNode(chatCtx, _toolCtx, _modelSettings) {
                const transcript = latestUserText(chatCtx);
                if (transcript === undefined || transcript.trim().length === 0) {
                    userTranscriptBuffer = "";
                    return null;
                }
                const hadPrefix = pendingPrefix.length > 0;
                const promptText = hadPrefix
                    ? `${pendingPrefix} ${transcript}`
                    : transcript;
                pendingPrefix = "";
                commitUserTranscript(promptText, hadPrefix);
                if (userStoppedSpeakingAt !== undefined) {
                    logger.debug(`[LiveKitVoiceAgent] Endpointing waited ${Date.now() - userStoppedSpeakingAt}ms before sending turn to LLM`);
                }
                return brainTurnStream(brain, promptText, conversationId, () => {
                    // Interrupted before producing any reply → carry this turn's text
                    // forward; the next turn merges it (prompt + UI).
                    pendingPrefix = promptText;
                });
            }
        }
        class PlaceholderLLM extends llm.LLM {
            label() {
                return "neurolink-placeholder";
            }
            chat() {
                throw new Error("PlaceholderLLM.chat must not be called — llmNode overrides generation");
            }
        }
        const turnHandling = {
            interruption: {
                minWords: config.interruption?.minWords ?? 2,
                minDuration: config.interruption?.minDuration ?? 600,
            },
        };
        if (eouTurnDetector !== undefined) {
            turnHandling.turnDetection = eouTurnDetector;
            logger.info("[LiveKitVoiceAgent] Semantic end-of-utterance turn detection enabled (English)");
        }
        else if (config.turn?.mode) {
            turnHandling.turnDetection = config.turn.mode;
        }
        // `minDelay` always has a value (config or default), so endpointing is
        // always configured; `maxDelay` is only set when explicitly provided.
        const endpointing = {
            minDelay: config.turn?.minEndpointingDelay ?? DEFAULT_MIN_ENDPOINTING_DELAY,
        };
        if (config.turn?.maxEndpointingDelay !== undefined) {
            endpointing.maxDelay = config.turn.maxEndpointingDelay;
        }
        turnHandling.endpointing = endpointing;
        const session = new voice.AgentSession({
            vad,
            stt,
            tts,
            llm: new PlaceholderLLM(),
            turnHandling,
            // Do NOT speculatively call the LLM on preflight transcripts before the
            // turn ends — with NeuroLink as the brain each call is a real LLM request,
            // and it makes the agent feel like it responds while you're still talking.
            preemptiveGeneration: false,
        });
        const agent = new NeuroLinkVoiceAgent({
            instructions: config.systemPrompt ?? "",
        });
        // Inactivity watchdog: shut the per-call Job down after a stretch with no
        // user or agent activity (mirrors Clairvoyance). On timeout `ctx.shutdown`
        // runs the shutdown callbacks (disposing the bridge) and the Job process
        // exits — freeing its RAM and the EOU model — while the browser observes a
        // room disconnect. Reset on every interaction below. Configure via
        // VOICE_INACTIVITY_TIMEOUT_MS (default 10 min); <= 0 disables the watchdog.
        const inactivityTimeoutMs = Number(process.env.VOICE_INACTIVITY_TIMEOUT_MS ?? 600_000);
        const inactivityEnabled = Number.isFinite(inactivityTimeoutMs) && inactivityTimeoutMs > 0;
        // Node coerces any timer delay above 2^31-1 ms to 1 ms, which would shut
        // the job down almost immediately for a very large configured timeout.
        // Clamp to the largest delay `setTimeout` can represent instead.
        const MAX_TIMER_DELAY_MS = 2_147_483_647;
        const inactivityDelayMs = Math.min(inactivityTimeoutMs, MAX_TIMER_DELAY_MS);
        let inactivityTimer;
        let inactivityFired = false;
        function clearInactivityTimer() {
            if (inactivityTimer !== undefined) {
                clearTimeout(inactivityTimer);
                inactivityTimer = undefined;
            }
        }
        function noteActivity() {
            if (!inactivityEnabled || inactivityFired) {
                return;
            }
            clearInactivityTimer();
            inactivityTimer = setTimeout(() => {
                inactivityFired = true;
                logger.info(`[LiveKitVoiceAgent] Inactivity timeout (${Math.round(inactivityDelayMs / 1000)}s) reached — shutting down job for room "${ctx.room.name}"`);
                ctx.shutdown("inactivity timeout");
            }, inactivityDelayMs);
            // The watchdog must not, by itself, keep the event loop alive.
            inactivityTimer.unref?.();
        }
        ctx.addShutdownCallback(async () => {
            clearInactivityTimer();
        });
        // Track when the user actually stops speaking (VAD) so endpointing latency
        // can be measured, and reset the inactivity watchdog on user activity.
        session.on(voice.AgentSessionEventTypes.UserStateChanged, (ev) => {
            noteActivity();
            if (ev.oldState === "speaking" && ev.newState !== "speaking") {
                userStoppedSpeakingAt = Date.now();
            }
        });
        // Reset the inactivity watchdog on any agent speech/processing and on every
        // committed conversation item (user turn or agent reply), so the timeout
        // only fires during a genuine lull in the conversation.
        session.on(voice.AgentSessionEventTypes.AgentStateChanged, () => {
            noteActivity();
        });
        session.on(voice.AgentSessionEventTypes.ConversationItemAdded, () => {
            noteActivity();
        });
        // Forward user STT transcripts to the data-channel bridge as a single
        // live-updating bubble. `UserInputTranscribed` fires `isFinal: true` per
        // finalized SEGMENT (several per turn), so we never forward those as the
        // turn-final; `emitUserTranscriptSegment` accumulates them into the per-turn
        // buffer and emits `final: false`. The lone `final: true` is sent from
        // `llmNode` at the real turn boundary.
        if (transcriptEventsEnabled) {
            session.on(voice.AgentSessionEventTypes.UserInputTranscribed, (ev) => {
                emitUserTranscriptSegment(ev.transcript, ev.isFinal);
            });
        }
        logger.info("[LiveKitVoiceAgent] Session starting", {
            room: ctx.room.name,
            provider,
            model,
        });
        await session.start({ agent, room: ctx.room });
        // Start the inactivity countdown now that the session is live; every
        // interaction handler above re-arms it.
        noteActivity();
        // Data-channel event bridge: forward NeuroLink events (text, tool calls,
        // results, HITL prompts, status) to the browser, and accept HITL responses
        // back. Only when enabled and the instance exposes its event emitter.
        if (config.events?.enabled === true && neurolink.getEventEmitter) {
            const bridge = await attachEventBridge({
                room: ctx.room,
                emitter: neurolink.getEventEmitter(),
                options: config.events,
            });
            ctx.addShutdownCallback(async () => {
                bridge.dispose();
            });
        }
    }
    return { entry };
}
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LiveKit Agents worker launcher.
|
|
3
|
+
*
|
|
4
|
+
* Registers a worker with the LiveKit server (Cloud or self-hosted) for the
|
|
5
|
+
* given agent entry file. LiveKit dispatches one Job per room, each running in
|
|
6
|
+
* its own process, which provides worker-per-call isolation and horizontal
|
|
7
|
+
* scaling. Connection settings are resolved from the environment.
|
|
8
|
+
*
|
|
9
|
+
* `@livekit/agents` is an optional dependency, imported dynamically.
|
|
10
|
+
*
|
|
11
|
+
* See docs/features/livekit-voice-agent.md.
|
|
12
|
+
*/
|
|
13
|
+
import type { LiveKitWorkerLaunchOptions } from "../../types/index.js";
|
|
14
|
+
/**
|
|
15
|
+
* Launch the LiveKit voice agent worker.
|
|
16
|
+
*
|
|
17
|
+
* Call from a small runner script; `agentFile` must point to the file whose
|
|
18
|
+
* default export is the result of `defineVoiceAgent`.
|
|
19
|
+
*
|
|
20
|
+
* ```ts
|
|
21
|
+
* await startVoiceAgentWorker({
|
|
22
|
+
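* // `fileURLToPath` (from "node:url") decodes percent-escapes and yields a
* // valid path on Windows, which `URL#pathname` does not.
* agentFile: fileURLToPath(new URL("./voice-agent-entry.js", import.meta.url)),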
|
|
23
|
+
* agentName: "neurolink-voice",
|
|
24
|
+
* });
|
|
25
|
+
* ```
|
|
26
|
+
*/
|
|
27
|
+
export declare function startVoiceAgentWorker(options: LiveKitWorkerLaunchOptions): Promise<void>;
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LiveKit Agents worker launcher.
|
|
3
|
+
*
|
|
4
|
+
* Registers a worker with the LiveKit server (Cloud or self-hosted) for the
|
|
5
|
+
* given agent entry file. LiveKit dispatches one Job per room, each running in
|
|
6
|
+
* its own process, which provides worker-per-call isolation and horizontal
|
|
7
|
+
* scaling. Connection settings are resolved from the environment.
|
|
8
|
+
*
|
|
9
|
+
* `@livekit/agents` is an optional dependency, imported dynamically.
|
|
10
|
+
*
|
|
11
|
+
* See docs/features/livekit-voice-agent.md.
|
|
12
|
+
*/
|
|
13
|
+
import { resolveEouTurnDetection, resolveLiveKitServerConfig, } from "./config.js";
|
|
14
|
+
const DEFAULT_AGENT_NAME = "neurolink-voice";
|
|
15
|
+
const EOU_METHOD_MULTILINGUAL = "lk_end_of_utterance_multilingual";
|
|
16
|
+
/**
 * Make the English EOU inference runner available in the worker process.
 *
 * This has to happen before `cli.runApp`: the worker only spawns the shared
 * inference executor when `InferenceRunner.registeredRunners` is non-empty at
 * startup, and hands that registry to the executor process. Loading the
 * plugin registers both the English and the multilingual runner, so the
 * multilingual entry is removed afterwards to keep only the English model.
 *
 * @returns A promise that resolves once the registry has been adjusted.
 */
async function registerEouTurnDetectorRunner() {
    const agents = await import("@livekit/agents");
    // Side-effect import: loading the plugin's turn-detector module calls registerRunner().
    await import("@livekit/agents-plugin-livekit");
    const runners = agents.InferenceRunner.registeredRunners;
    delete runners[EOU_METHOD_MULTILINGUAL];
}
|
|
31
|
+
/**
 * Start the LiveKit voice agent worker.
 *
 * Intended to be called from a small runner script. `agentFile` must be the
 * path of the module whose default export is the value returned by
 * `defineVoiceAgent`. Server URL and credentials come from
 * `resolveLiveKitServerConfig()`.
 *
 * ```ts
 * import { fileURLToPath } from "node:url";
 *
 * await startVoiceAgentWorker({
 *   agentFile: fileURLToPath(new URL("./voice-agent-entry.js", import.meta.url)),
 *   agentName: "neurolink-voice",
 * });
 * ```
 *
 * @param options - Launch options: `agentFile` (required) and an optional
 *   `agentName`, which falls back to `DEFAULT_AGENT_NAME`.
 * @returns A promise that resolves after `cli.runApp` has been invoked.
 */
export async function startVoiceAgentWorker(options) {
    const serverConfig = resolveLiveKitServerConfig();
    const agents = await import("@livekit/agents");
    // The EOU runner must be registered before the worker app starts.
    const eouEnabled = resolveEouTurnDetection().enabled;
    if (eouEnabled) {
        await registerEouTurnDetectorRunner();
    }
    const workerOptions = new agents.WorkerOptions({
        agent: options.agentFile,
        agentName: options.agentName ?? DEFAULT_AGENT_NAME,
        wsURL: serverConfig.url,
        apiKey: serverConfig.apiKey,
        apiSecret: serverConfig.apiSecret,
    });
    agents.cli.runApp(workerOptions);
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@juspay/neurolink",
|
|
3
|
-
"version": "9.69.3",
|
|
3
|
+
"version": "9.70.1",
|
|
4
4
|
"packageManager": "pnpm@10.15.1",
|
|
5
5
|
"description": "Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applications with 21+ providers: OpenAI, Anthropic, Google AI Studio, Google Vertex, AWS Bedrock, Azure OpenAI, Mistral, LiteLLM, SageMaker, Hugging Face, Ollama, OpenAI-compatible, OpenRouter, DeepSeek, NVIDIA NIM, LM Studio, llama.cpp, plus voice (OpenAI TTS, ElevenLabs, Deepgram, Azure Speech).",
|
|
6
6
|
"author": {
|
|
@@ -108,6 +108,8 @@
|
|
|
108
108
|
"test:dynamic": "npx tsx test/continuous-test-suite-dynamic.ts",
|
|
109
109
|
"test:proxy": "npx tsx test/continuous-test-suite-proxy.ts",
|
|
110
110
|
"test:bugfixes": "npx tsx test/continuous-test-suite-bugfixes.ts",
|
|
111
|
+
"test:json": "npx tsx test/continuous-test-suite-json.ts",
|
|
112
|
+
"test:json-e2e": "npx tsx test/continuous-test-suite-json-e2e.ts",
|
|
111
113
|
"test:workflow": "npx tsx test/continuous-test-suite-workflow.ts",
|
|
112
114
|
"test:hitl": "npx tsx test/continuous-test-suite-hitl.ts",
|
|
113
115
|
"test:tasks": "npx tsx test/continuous-test-suite-tasks.ts",
|
|
@@ -294,6 +296,11 @@
|
|
|
294
296
|
"types": "./dist/adapters/*.d.ts",
|
|
295
297
|
"import": "./dist/adapters/*.js",
|
|
296
298
|
"default": "./dist/adapters/*.js"
|
|
299
|
+
},
|
|
300
|
+
"./livekit": {
|
|
301
|
+
"types": "./dist/voice/livekit/index.d.ts",
|
|
302
|
+
"import": "./dist/voice/livekit/index.js",
|
|
303
|
+
"default": "./dist/voice/livekit/index.js"
|
|
297
304
|
}
|
|
298
305
|
},
|
|
299
306
|
"dependencies": {
|
|
@@ -332,10 +339,11 @@
|
|
|
332
339
|
"dotenv": "^17.3.1",
|
|
333
340
|
"eventsource-parser": "^3.0.8",
|
|
334
341
|
"google-auth-library": "^10.6.1",
|
|
335
|
-
"hono": "^4.12.
|
|
342
|
+
"hono": "^4.12.21",
|
|
336
343
|
"inquirer": "^13.3.0",
|
|
337
344
|
"jose": "^6.1.3",
|
|
338
345
|
"json-schema-to-zod": "^2.7.0",
|
|
346
|
+
"jsonrepair": "^3.14.0",
|
|
339
347
|
"nanoid": "^5.1.5",
|
|
340
348
|
"open": "^11.0.0",
|
|
341
349
|
"ora": "^9.3.0",
|
|
@@ -343,7 +351,7 @@
|
|
|
343
351
|
"redis": "^5.11.0",
|
|
344
352
|
"tar-stream": "^3.1.8",
|
|
345
353
|
"undici": ">=7.22.0",
|
|
346
|
-
"ws": "^8.
|
|
354
|
+
"ws": "^8.20.1",
|
|
347
355
|
"yargs": "^18.0.0",
|
|
348
356
|
"zod": "^4.3.6",
|
|
349
357
|
"zod-to-json-schema": "^3.25.1"
|
|
@@ -370,10 +378,18 @@
|
|
|
370
378
|
"@aws-sdk/client-sagemaker": "^3.1000.0",
|
|
371
379
|
"@fastify/cors": "^11.2.0",
|
|
372
380
|
"@fastify/rate-limit": "^10.3.0",
|
|
373
|
-
"@hono/node-server": "^1.19.
|
|
381
|
+
"@hono/node-server": "^1.19.13",
|
|
374
382
|
"@koa/cors": "^5.0.0",
|
|
375
383
|
"@koa/router": "^15.3.1",
|
|
376
384
|
"@langfuse/otel": "^5.0.1",
|
|
385
|
+
"@livekit/agents": "^1.4.5",
|
|
386
|
+
"@livekit/agents-plugin-cartesia": "^1.4.5",
|
|
387
|
+
"@livekit/agents-plugin-deepgram": "^1.4.5",
|
|
388
|
+
"@livekit/agents-plugin-elevenlabs": "^1.4.5",
|
|
389
|
+
"@livekit/agents-plugin-livekit": "^1.4.5",
|
|
390
|
+
"@livekit/agents-plugin-silero": "^1.4.5",
|
|
391
|
+
"@livekit/agents-plugin-soniox": "^1.4.5",
|
|
392
|
+
"@livekit/rtc-node": "^0.13.29",
|
|
377
393
|
"@picovoice/cobra-node": "^3.0.2",
|
|
378
394
|
"bullmq": "^5.52.2",
|
|
379
395
|
"cors": "^2.8.5",
|
|
@@ -385,6 +401,7 @@
|
|
|
385
401
|
"fluent-ffmpeg": "^2.1.3",
|
|
386
402
|
"koa": "^3.1.1",
|
|
387
403
|
"koa-bodyparser": "^4.4.1",
|
|
404
|
+
"livekit-server-sdk": "^2.15.4",
|
|
388
405
|
"mammoth": "^1.11.0",
|
|
389
406
|
"mediabunny": "^1.40.1",
|
|
390
407
|
"music-metadata": "^11.11.2",
|
|
@@ -449,7 +466,7 @@
|
|
|
449
466
|
"react-dom": "^19.2.4",
|
|
450
467
|
"semantic-release": "^25.0.3",
|
|
451
468
|
"shell-quote": "^1.8.3",
|
|
452
|
-
"svelte": "^5.
|
|
469
|
+
"svelte": "^5.55.7",
|
|
453
470
|
"svelte-check": "^4.4.4",
|
|
454
471
|
"ts-morph": "^24.0.0",
|
|
455
472
|
"tslib": "^2.8.1",
|
|
@@ -457,7 +474,7 @@
|
|
|
457
474
|
"typedoc": "^0.28.17",
|
|
458
475
|
"typedoc-plugin-markdown": "^4.10.0",
|
|
459
476
|
"typescript": "^5.9.3",
|
|
460
|
-
"vite": "^8.0.
|
|
477
|
+
"vite": "^8.0.5",
|
|
461
478
|
"vitest": "^4.1.0",
|
|
462
479
|
"why-is-node-running": "^3.2.2"
|
|
463
480
|
},
|