@juspay/neurolink 9.69.3 → 9.70.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +12 -0
- package/dist/browser/neurolink.min.js +355 -347
- package/dist/core/modules/GenerationHandler.js +75 -23
- package/dist/core/modules/structuredOutputPolicy.d.ts +28 -0
- package/dist/core/modules/structuredOutputPolicy.js +50 -0
- package/dist/lib/core/modules/GenerationHandler.js +75 -23
- package/dist/lib/core/modules/structuredOutputPolicy.d.ts +28 -0
- package/dist/lib/core/modules/structuredOutputPolicy.js +51 -0
- package/dist/lib/neurolink.js +58 -0
- package/dist/lib/providers/anthropic.js +34 -7
- package/dist/lib/providers/googleVertex.js +17 -2
- package/dist/lib/types/generate.d.ts +47 -19
- package/dist/lib/types/index.d.ts +1 -0
- package/dist/lib/types/index.js +1 -0
- package/dist/lib/types/livekit.d.ts +369 -0
- package/dist/lib/types/livekit.js +13 -0
- package/dist/lib/types/utilities.d.ts +16 -0
- package/dist/lib/utils/json/coerce.d.ts +10 -0
- package/dist/lib/utils/json/coerce.js +141 -0
- package/dist/lib/utils/json/extract.d.ts +10 -0
- package/dist/lib/utils/json/extract.js +61 -11
- package/dist/lib/utils/tokenLimits.d.ts +20 -0
- package/dist/lib/utils/tokenLimits.js +55 -0
- package/dist/lib/voice/livekit/brain.d.ts +21 -0
- package/dist/lib/voice/livekit/brain.js +75 -0
- package/dist/lib/voice/livekit/config.d.ts +41 -0
- package/dist/lib/voice/livekit/config.js +80 -0
- package/dist/lib/voice/livekit/eventBridge.d.ts +27 -0
- package/dist/lib/voice/livekit/eventBridge.js +360 -0
- package/dist/lib/voice/livekit/index.d.ts +15 -0
- package/dist/lib/voice/livekit/index.js +16 -0
- package/dist/lib/voice/livekit/tokens.d.ts +19 -0
- package/dist/lib/voice/livekit/tokens.js +51 -0
- package/dist/lib/voice/livekit/voiceAgent.d.ts +32 -0
- package/dist/lib/voice/livekit/voiceAgent.js +415 -0
- package/dist/lib/voice/livekit/voiceAgentWorker.d.ts +27 -0
- package/dist/lib/voice/livekit/voiceAgentWorker.js +58 -0
- package/dist/neurolink.js +58 -0
- package/dist/providers/anthropic.js +34 -7
- package/dist/providers/googleVertex.js +17 -2
- package/dist/types/generate.d.ts +47 -19
- package/dist/types/index.d.ts +1 -0
- package/dist/types/index.js +1 -0
- package/dist/types/livekit.d.ts +369 -0
- package/dist/types/livekit.js +12 -0
- package/dist/types/utilities.d.ts +16 -0
- package/dist/utils/json/coerce.d.ts +10 -0
- package/dist/utils/json/coerce.js +140 -0
- package/dist/utils/json/extract.d.ts +10 -0
- package/dist/utils/json/extract.js +61 -11
- package/dist/utils/tokenLimits.d.ts +20 -0
- package/dist/utils/tokenLimits.js +55 -0
- package/dist/voice/livekit/brain.d.ts +21 -0
- package/dist/voice/livekit/brain.js +74 -0
- package/dist/voice/livekit/config.d.ts +41 -0
- package/dist/voice/livekit/config.js +79 -0
- package/dist/voice/livekit/eventBridge.d.ts +27 -0
- package/dist/voice/livekit/eventBridge.js +359 -0
- package/dist/voice/livekit/index.d.ts +15 -0
- package/dist/voice/livekit/index.js +15 -0
- package/dist/voice/livekit/tokens.d.ts +19 -0
- package/dist/voice/livekit/tokens.js +50 -0
- package/dist/voice/livekit/voiceAgent.d.ts +32 -0
- package/dist/voice/livekit/voiceAgent.js +414 -0
- package/dist/voice/livekit/voiceAgentWorker.d.ts +27 -0
- package/dist/voice/livekit/voiceAgentWorker.js +57 -0
- package/package.json +23 -6
|
@@ -0,0 +1,415 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LiveKit Agents agent definition.
|
|
3
|
+
*
|
|
4
|
+
* `defineVoiceAgent` returns the agent object placed as the default export of a
|
|
5
|
+
* worker entry file. The framework runs it as a Job (one per call, in its own
|
|
6
|
+
* process): it connects to the room, builds the NeuroLink brain via the
|
|
7
|
+
* supplied factory, wires Silero VAD + STT/TTS plugins, and overrides `llmNode`
|
|
8
|
+
* so every turn is generated by `neurolink.stream()`.
|
|
9
|
+
*
|
|
10
|
+
* `@livekit/agents` and the plugins are optional dependencies, imported
|
|
11
|
+
* dynamically so the core package does not require them unless the LiveKit
|
|
12
|
+
* voice agent is used. Type-only imports are erased at build time and add no
|
|
13
|
+
* runtime dependency.
|
|
14
|
+
*
|
|
15
|
+
* See docs/features/livekit-voice-agent.md.
|
|
16
|
+
*/
|
|
17
|
+
import { ReadableStream } from "node:stream/web";
|
|
18
|
+
import { logger } from "../../utils/logger.js";
|
|
19
|
+
import { createVoiceBrain } from "./brain.js";
|
|
20
|
+
import { resolveBrainDefaults, resolveEouTurnDetection } from "./config.js";
|
|
21
|
+
import { attachEventBridge } from "./eventBridge.js";
|
|
22
|
+
const DEFAULT_CONVERSATION_PREFIX = "voice";
|
|
23
|
+
// Turn-end timing defaults (approach A: silence tuning). Longer silence +
|
|
24
|
+
// endpointing floor so natural mid-sentence pauses ("...and so, [pause] um")
|
|
25
|
+
// don't end the turn early and split one utterance into two. Overridable via
|
|
26
|
+
// config (vad.minSilenceDuration / turn.minEndpointingDelay).
|
|
27
|
+
const DEFAULT_MIN_SILENCE_DURATION = 1.0; // seconds (Silero VAD)
|
|
28
|
+
const DEFAULT_MIN_ENDPOINTING_DELAY = 1000; // ms (framework endpointing floor)
|
|
29
|
+
/**
 * Return the text of the newest user message in a chat context.
 *
 * Walks `chatCtx.items` from the end towards the start and stops at the first
 * entry whose `type` is `"message"` and whose `role` is `"user"`.
 *
 * @param {{ items: Array<{ type: string, role?: string, textContent?: string }> }} chatCtx
 *   Chat context exposing an ordered `items` array.
 * @returns {string | undefined} The matching item's `textContent`, or
 *   `undefined` when no user message exists.
 */
function latestUserText(chatCtx) {
  const { items } = chatCtx;
  let index = items.length;
  while (index > 0) {
    index -= 1;
    const candidate = items[index];
    if (candidate.type !== "message" || candidate.role !== "user") {
      continue;
    }
    return candidate.textContent;
  }
  return undefined;
}
|
|
43
|
+
/**
 * Build a text stream for a single turn from the brain, abortable on cancel.
 *
 * Each `pull` forwards one chunk from `brain.streamReply(...)` into the
 * returned `ReadableStream`. When the consumer cancels the stream (barge-in):
 *   - the `AbortSignal` handed to the brain is aborted;
 *   - `onAbortedBeforeOutput` is invoked if no chunk had been enqueued yet;
 *   - the underlying async iterator is closed via `return()` so a generator
 *     parked at a `yield` still runs its `finally` cleanup.
 *
 * @param {{ streamReply: (args: { transcript: string, conversationId: string, signal: AbortSignal }) => AsyncIterable<string> }} brain
 *   Source of reply chunks for the turn.
 * @param {string} transcript - User text for this turn.
 * @param {string} conversationId - Conversation identifier passed to the brain.
 * @param {(() => void) | undefined} [onAbortedBeforeOutput]
 *   Optional callback fired when the turn is cancelled before any output.
 * @returns {ReadableStream} Stream of reply chunks for the turn.
 */
function brainTurnStream(brain, transcript, conversationId, onAbortedBeforeOutput) {
  const controller = new AbortController();
  const generator = brain.streamReply({
    transcript,
    conversationId,
    signal: controller.signal,
  });
  const iterator = generator[Symbol.asyncIterator]();
  let producedOutput = false;
  let cancelled = false;
  return new ReadableStream({
    async pull(streamController) {
      const next = await iterator.next();
      if (cancelled) {
        // The stream was cancelled while this pull was in flight; enqueueing
        // or closing now would throw because the stream is no longer readable.
        return;
      }
      if (next.done === true) {
        streamController.close();
        return;
      }
      producedOutput = true;
      streamController.enqueue(next.value);
    },
    cancel() {
      cancelled = true;
      controller.abort();
      if (!producedOutput) {
        onAbortedBeforeOutput?.();
      }
      // Aborting the signal does not wake a generator suspended at `yield`, so
      // close the iterator explicitly. This is best-effort cleanup: the turn
      // is already cancelled, so a failure here must not surface to the caller.
      Promise.resolve()
        .then(() => iterator.return?.())
        .catch(() => {});
    },
  });
}
|
|
74
|
+
/**
 * Create the English semantic end-of-utterance (EOU) turn detector handle.
 *
 * The settings come from `resolveEouTurnDetection()`. When detection is not
 * enabled nothing is imported and `undefined` is returned; otherwise the
 * `@livekit/agents-plugin-livekit` package is loaded on demand and an
 * `EnglishModel` is constructed with the configured `unlikelyThreshold`.
 *
 * Per the module's design notes, the inference runner itself is registered in
 * the worker process (see `voiceAgentWorker.ts`); this function only builds
 * the model handle.
 *
 * @returns {Promise<object | undefined>} The turn detector, or `undefined`
 *   when EOU turn detection is disabled.
 */
async function loadEouTurnDetector() {
  const eou = resolveEouTurnDetection();
  if (eou.enabled) {
    const plugin = await import("@livekit/agents-plugin-livekit");
    return new plugin.turnDetector.EnglishModel(eou.unlikelyThreshold);
  }
  return undefined;
}
|
|
92
|
+
/**
 * Load the Silero VAD instance used by the session.
 *
 * Missing options fall back to these defaults: activation threshold `0.6`,
 * minimum speech duration `0.2`, and `DEFAULT_MIN_SILENCE_DURATION` for the
 * minimum silence duration. The thresholds are intentionally stricter than
 * the plugin's own so short, quiet noise bursts are not treated as speech.
 *
 * @param {{ activationThreshold?: number, minSpeechDuration?: number, minSilenceDuration?: number } | undefined} config
 *   Optional VAD overrides.
 * @returns {Promise<object>} Whatever `silero.VAD.load` resolves to.
 */
async function loadVad(config) {
  const { VAD } = await import("@livekit/agents-plugin-silero");
  const vadOptions = {
    activationThreshold: config?.activationThreshold ?? 0.6,
    minSpeechDuration: config?.minSpeechDuration ?? 0.2,
    minSilenceDuration: config?.minSilenceDuration ?? DEFAULT_MIN_SILENCE_DURATION,
  };
  return VAD.load(vadOptions);
}
|
|
107
|
+
/**
 * Instantiate the speech-to-text plugin selected by `config.provider`.
 *
 * Options are forwarded only when they are defined, so an absent setting never
 * overwrites the plugin's own default with `undefined`.
 *
 * @param {{ provider: string, model?: string, language?: string, maxEndpointDelayMs?: number }} config
 *   STT configuration. Supported providers: `"soniox"`, `"deepgram"`.
 * @returns {Promise<object>} The constructed STT plugin instance.
 * @throws {Error} When `config.provider` is not a supported provider.
 */
async function buildStt(config) {
  switch (config.provider) {
    case "soniox": {
      const { STT } = await import("@livekit/agents-plugin-soniox");
      const { model, language, maxEndpointDelayMs } = config;
      return new STT({
        ...(model !== undefined && { model }),
        // Soft hint only: Soniox biases toward this language but can still
        // auto-detect another one mid-call. `languageHintsStrict` is left
        // unset on purpose — forcing the hinted language makes the realtime
        // STT stall on other-language audio and the session never recovers.
        ...(language !== undefined && { languageHints: [language] }),
        ...(maxEndpointDelayMs !== undefined && { maxEndpointDelayMs }),
      });
    }
    case "deepgram": {
      const { STT } = await import("@livekit/agents-plugin-deepgram");
      const { language } = config;
      return new STT({
        ...(language !== undefined && { language }),
      });
    }
    default:
      throw new Error(`Unsupported LiveKit STT provider "${config.provider}" (supported: soniox, deepgram)`);
  }
}
|
|
143
|
+
/**
 * Instantiate the text-to-speech plugin selected by `config.provider`.
 *
 * Options are forwarded only when they are defined, so an absent setting never
 * overwrites the plugin's default voice/model with `undefined`. The option
 * names differ per plugin: Cartesia takes `voice`/`model`, ElevenLabs takes
 * `voiceId`/`modelID`.
 *
 * @param {{ provider: string, voice?: string, model?: string }} config
 *   TTS configuration. Supported providers: `"cartesia"`, `"elevenlabs"`.
 * @returns {Promise<object>} The constructed TTS plugin instance.
 * @throws {Error} When `config.provider` is not a supported provider.
 */
async function buildTts(config) {
  switch (config.provider) {
    case "cartesia": {
      const { TTS } = await import("@livekit/agents-plugin-cartesia");
      const { voice, model } = config;
      return new TTS({
        ...(voice !== undefined && { voice }),
        ...(model !== undefined && { model }),
      });
    }
    case "elevenlabs": {
      const { TTS } = await import("@livekit/agents-plugin-elevenlabs");
      const { voice, model } = config;
      return new TTS({
        ...(voice !== undefined && { voiceId: voice }),
        ...(model !== undefined && { modelID: model }),
      });
    }
    default:
      throw new Error(`Unsupported LiveKit TTS provider "${config.provider}" (supported: cartesia, elevenlabs)`);
  }
}
|
|
174
|
+
/**
 * Define a LiveKit voice agent backed by NeuroLink.
 *
 * Place the result as the default export of the worker entry file:
 *
 * ```ts
 * export default defineVoiceAgent({
 *   createNeuroLink: async () => buildConfiguredNeuroLink(),
 *   stt: { provider: "deepgram" },
 *   tts: { provider: "elevenlabs" },
 * });
 * ```
 *
 * The returned `entry(ctx)` function:
 *   1. connects to the room and builds the NeuroLink brain via
 *      `config.createNeuroLink()`;
 *   2. loads VAD, STT, TTS and (optionally) the EOU turn detector in parallel;
 *   3. starts an `AgentSession` whose `llmNode` streams every turn from the
 *      brain instead of a LiveKit LLM;
 *   4. arms an inactivity watchdog (`VOICE_INACTIVITY_TIMEOUT_MS`, default
 *      10 minutes, `<= 0` or non-numeric disables it);
 *   5. attaches the data-channel event bridge when `config.events.enabled` is
 *      `true` and the NeuroLink instance exposes `getEventEmitter`.
 *
 * @param {object} config - Voice agent configuration (`createNeuroLink`,
 *   `stt`, `tts`, and optional `provider`, `model`, `conversationIdPrefix`,
 *   `systemPrompt`, `temperature`, `maxTokens`, `userId`, `vad`, `turn`,
 *   `interruption`, `events`).
 * @returns {{ entry: (ctx: object) => Promise<void> }} Agent definition.
 */
export function defineVoiceAgent(config) {
  const defaults = resolveBrainDefaults();
  const provider = config.provider ?? defaults.provider;
  const model = config.model ?? defaults.model;
  const conversationPrefix = config.conversationIdPrefix ?? DEFAULT_CONVERSATION_PREFIX;
  async function entry(ctx) {
    const entryStartedAt = Date.now();
    await ctx.connect();
    logger.debug(`[LiveKitVoiceAgent] Joined room "${ctx.room.name}" in ${Date.now() - entryStartedAt}ms`);
    // When the user actually stopped speaking (VAD), used to measure how long
    // the agent waited after speech before committing the turn to the LLM.
    let userStoppedSpeakingAt;
    const neurolink = await config.createNeuroLink();
    const brain = createVoiceBrain({
      neurolink,
      provider,
      model,
      systemPrompt: config.systemPrompt,
      temperature: config.temperature,
      maxTokens: config.maxTokens,
      userId: config.userId,
    });
    const conversationId = `${conversationPrefix}-${ctx.room.name ?? ctx.job.id}`;
    const { voice, llm } = await import("@livekit/agents");
    const [vad, stt, tts, eouTurnDetector] = await Promise.all([
      loadVad(config.vad),
      buildStt(config.stt),
      buildTts(config.tts),
      loadEouTurnDetector(),
    ]);
    const transcriptEventsEnabled = config.events?.enabled === true &&
      typeof neurolink.getEventEmitter === "function";
    const transcriptEmitter = transcriptEventsEnabled
      ? neurolink.getEventEmitter?.()
      : undefined;
    let userTranscriptBuffer = "";
    let pendingPrefix = "";
    /**
     * Join the per-turn buffer with a new (already trimmed) segment. Empty
     * segments are ignored so they never add a trailing space to the text.
     */
    function joinWithBuffer(segment) {
      if (segment.length === 0) {
        return userTranscriptBuffer;
      }
      return userTranscriptBuffer.length > 0
        ? `${userTranscriptBuffer} ${segment}`
        : segment;
    }
    /**
     * Emit the live user bubble. Finalized segments are folded into the
     * per-turn buffer; interim segments are only previewed on top of it.
     * Always emitted with `final: false` — the turn-final event comes from
     * `commitUserTranscript`.
     */
    function emitUserTranscriptSegment(segmentText, isFinal) {
      if (transcriptEmitter === undefined) {
        return;
      }
      const live = joinWithBuffer(segmentText.trim());
      if (isFinal) {
        userTranscriptBuffer = live;
      }
      transcriptEmitter.emit("voice:user-transcript", {
        text: live,
        final: false,
      });
    }
    /**
     * Lock the user bubble at turn-end and reset the buffer for the next turn.
     * `replacesPrevious` tells the client this committed turn absorbed a prior
     * interrupted turn, so it should remove the orphaned previous user bubble.
     */
    function commitUserTranscript(finalText, replacesPrevious = false) {
      if (transcriptEmitter !== undefined) {
        transcriptEmitter.emit("voice:user-transcript", {
          text: finalText,
          final: true,
          replacesPrevious,
        });
      }
      userTranscriptBuffer = "";
    }
    class NeuroLinkVoiceAgent extends voice.Agent {
      async llmNode(chatCtx, _toolCtx, _modelSettings) {
        const transcript = latestUserText(chatCtx);
        if (transcript === undefined || transcript.trim().length === 0) {
          userTranscriptBuffer = "";
          return null;
        }
        const hadPrefix = pendingPrefix.length > 0;
        const promptText = hadPrefix
          ? `${pendingPrefix} ${transcript}`
          : transcript;
        pendingPrefix = "";
        commitUserTranscript(promptText, hadPrefix);
        if (userStoppedSpeakingAt !== undefined) {
          logger.debug(`[LiveKitVoiceAgent] Endpointing waited ${Date.now() - userStoppedSpeakingAt}ms before sending turn to LLM`);
        }
        return brainTurnStream(brain, promptText, conversationId, () => {
          // Interrupted before producing any reply → carry this turn's text
          // forward; the next turn merges it (prompt + UI).
          pendingPrefix = promptText;
        });
      }
    }
    class PlaceholderLLM extends llm.LLM {
      label() {
        return "neurolink-placeholder";
      }
      chat() {
        throw new Error("PlaceholderLLM.chat must not be called — llmNode overrides generation");
      }
    }
    const turnHandling = {
      interruption: {
        minWords: config.interruption?.minWords ?? 2,
        minDuration: config.interruption?.minDuration ?? 600,
      },
    };
    if (eouTurnDetector !== undefined) {
      turnHandling.turnDetection = eouTurnDetector;
      logger.info("[LiveKitVoiceAgent] Semantic end-of-utterance turn detection enabled (English)");
    }
    else if (config.turn?.mode) {
      turnHandling.turnDetection = config.turn.mode;
    }
    // `minDelay` always has a value (config or default), so the endpointing
    // options are always attached; `maxDelay` is added only when configured.
    const endpointing = {
      minDelay: config.turn?.minEndpointingDelay ?? DEFAULT_MIN_ENDPOINTING_DELAY,
    };
    if (config.turn?.maxEndpointingDelay !== undefined) {
      endpointing.maxDelay = config.turn.maxEndpointingDelay;
    }
    turnHandling.endpointing = endpointing;
    const session = new voice.AgentSession({
      vad,
      stt,
      tts,
      llm: new PlaceholderLLM(),
      turnHandling,
      // Do NOT speculatively call the LLM on preflight transcripts before the
      // turn ends — with NeuroLink as the brain each call is a real LLM request,
      // and it makes the agent feel like it responds while you're still talking.
      preemptiveGeneration: false,
    });
    const agent = new NeuroLinkVoiceAgent({
      instructions: config.systemPrompt ?? "",
    });
    // Inactivity watchdog: shut the per-call Job down after a stretch with no
    // user or agent activity. On timeout `ctx.shutdown` is called; the
    // shutdown callbacks registered below clear the timer and dispose the
    // bridge. Reset on every interaction below. Configure via
    // VOICE_INACTIVITY_TIMEOUT_MS (default 10 min); <= 0 disables the watchdog.
    //
    // Node's setTimeout treats any delay above 2^31-1 ms as 1 ms, which would
    // shut the job down almost immediately — clamp to the largest safe delay.
    const MAX_TIMER_DELAY_MS = 2_147_483_647;
    const configuredInactivityMs = Number(process.env.VOICE_INACTIVITY_TIMEOUT_MS ?? 600_000);
    const inactivityEnabled = Number.isFinite(configuredInactivityMs) && configuredInactivityMs > 0;
    const inactivityTimeoutMs = Math.min(configuredInactivityMs, MAX_TIMER_DELAY_MS);
    let inactivityTimer;
    let inactivityFired = false;
    function clearInactivityTimer() {
      if (inactivityTimer !== undefined) {
        clearTimeout(inactivityTimer);
        inactivityTimer = undefined;
      }
    }
    function noteActivity() {
      if (!inactivityEnabled || inactivityFired) {
        return;
      }
      clearInactivityTimer();
      inactivityTimer = setTimeout(() => {
        inactivityFired = true;
        logger.info(`[LiveKitVoiceAgent] Inactivity timeout (${Math.round(inactivityTimeoutMs / 1000)}s) reached — shutting down job for room "${ctx.room.name}"`);
        ctx.shutdown("inactivity timeout");
      }, inactivityTimeoutMs);
      // The watchdog must not, by itself, keep the event loop alive.
      inactivityTimer.unref?.();
    }
    ctx.addShutdownCallback(async () => {
      clearInactivityTimer();
    });
    // Track when the user actually stops speaking (VAD) so endpointing latency
    // can be measured, and reset the inactivity watchdog on user activity.
    session.on(voice.AgentSessionEventTypes.UserStateChanged, (ev) => {
      noteActivity();
      if (ev.oldState === "speaking" && ev.newState !== "speaking") {
        userStoppedSpeakingAt = Date.now();
      }
    });
    // Reset the inactivity watchdog on any agent speech/processing and on every
    // committed conversation item (user turn or agent reply), so the timeout
    // only fires during a genuine lull in the conversation.
    session.on(voice.AgentSessionEventTypes.AgentStateChanged, () => {
      noteActivity();
    });
    session.on(voice.AgentSessionEventTypes.ConversationItemAdded, () => {
      noteActivity();
    });
    // Forward user STT transcripts to the data-channel bridge as a single
    // live-updating bubble. `UserInputTranscribed` fires `isFinal: true` per
    // finalized SEGMENT (several per turn), so we never forward those as the
    // turn-final; `emitUserTranscriptSegment` accumulates them into the per-turn
    // buffer and emits `final: false`. The lone `final: true` is sent from
    // `llmNode` at the real turn boundary.
    if (transcriptEventsEnabled) {
      session.on(voice.AgentSessionEventTypes.UserInputTranscribed, (ev) => {
        emitUserTranscriptSegment(ev.transcript, ev.isFinal);
      });
    }
    logger.info("[LiveKitVoiceAgent] Session starting", {
      room: ctx.room.name,
      provider,
      model,
    });
    await session.start({ agent, room: ctx.room });
    // Start the inactivity countdown now that the session is live; every
    // interaction handler above re-arms it.
    noteActivity();
    // Data-channel event bridge: forward NeuroLink events (text, tool calls,
    // results, HITL prompts, status) to the browser, and accept HITL responses
    // back. Only when enabled and the instance exposes its event emitter.
    if (config.events?.enabled === true && neurolink.getEventEmitter) {
      const bridge = await attachEventBridge({
        room: ctx.room,
        emitter: neurolink.getEventEmitter(),
        options: config.events,
      });
      ctx.addShutdownCallback(async () => {
        bridge.dispose();
      });
    }
  }
  return { entry };
}
|
|
415
|
+
//# sourceMappingURL=voiceAgent.js.map
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LiveKit Agents worker launcher.
|
|
3
|
+
*
|
|
4
|
+
* Registers a worker with the LiveKit server (Cloud or self-hosted) for the
|
|
5
|
+
* given agent entry file. LiveKit dispatches one Job per room, each running in
|
|
6
|
+
* its own process, which provides worker-per-call isolation and horizontal
|
|
7
|
+
* scaling. Connection settings are resolved from the environment.
|
|
8
|
+
*
|
|
9
|
+
* `@livekit/agents` is an optional dependency, imported dynamically.
|
|
10
|
+
*
|
|
11
|
+
* See docs/features/livekit-voice-agent.md.
|
|
12
|
+
*/
|
|
13
|
+
import type { LiveKitWorkerLaunchOptions } from "../../types/index.js";
|
|
14
|
+
/**
|
|
15
|
+
* Launch the LiveKit voice agent worker.
|
|
16
|
+
*
|
|
17
|
+
* Call from a small runner script; `agentFile` must point to the file whose
|
|
18
|
+
* default export is the result of `defineVoiceAgent`.
|
|
19
|
+
*
|
|
20
|
+
* ```ts
|
|
21
|
+
* await startVoiceAgentWorker({
|
|
22
|
+
* agentFile: new URL("./voice-agent-entry.js", import.meta.url).pathname,
|
|
23
|
+
* agentName: "neurolink-voice",
|
|
24
|
+
* });
|
|
25
|
+
* ```
|
|
26
|
+
*/
|
|
27
|
+
export declare function startVoiceAgentWorker(options: LiveKitWorkerLaunchOptions): Promise<void>;
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LiveKit Agents worker launcher.
|
|
3
|
+
*
|
|
4
|
+
* Registers a worker with the LiveKit server (Cloud or self-hosted) for the
|
|
5
|
+
* given agent entry file. LiveKit dispatches one Job per room, each running in
|
|
6
|
+
* its own process, which provides worker-per-call isolation and horizontal
|
|
7
|
+
* scaling. Connection settings are resolved from the environment.
|
|
8
|
+
*
|
|
9
|
+
* `@livekit/agents` is an optional dependency, imported dynamically.
|
|
10
|
+
*
|
|
11
|
+
* See docs/features/livekit-voice-agent.md.
|
|
12
|
+
*/
|
|
13
|
+
import { resolveEouTurnDetection, resolveLiveKitServerConfig, } from "./config.js";
|
|
14
|
+
const DEFAULT_AGENT_NAME = "neurolink-voice";
|
|
15
|
+
const EOU_METHOD_MULTILINGUAL = "lk_end_of_utterance_multilingual";
|
|
16
|
+
/**
 * Register the English EOU inference runner in the worker process.
 *
 * This has to happen before `cli.runApp`: per the module's design notes, the
 * worker only spawns the shared inference executor when
 * `InferenceRunner.registeredRunners` is non-empty at startup, and hands that
 * registry to the executor process. Importing the LiveKit plugin registers
 * both the English and the multilingual runner, so the multilingual entry is
 * removed afterwards to keep only the English model loaded.
 *
 * @returns {Promise<void>}
 */
async function registerEouTurnDetectorRunner() {
  const agents = await import("@livekit/agents");
  // Loaded purely for its side effect: the plugin's turn-detector module
  // calls registerRunner() on import.
  await import("@livekit/agents-plugin-livekit");
  const runners = agents.InferenceRunner.registeredRunners;
  delete runners[EOU_METHOD_MULTILINGUAL];
}
|
|
31
|
+
/**
 * Launch the LiveKit voice agent worker.
 *
 * Intended to be called from a small runner script. `agentFile` must point to
 * the file whose default export is the result of `defineVoiceAgent`.
 *
 * ```ts
 * await startVoiceAgentWorker({
 *   agentFile: new URL("./voice-agent-entry.js", import.meta.url).pathname,
 *   agentName: "neurolink-voice",
 * });
 * ```
 *
 * Server connection settings come from `resolveLiveKitServerConfig()`. When
 * EOU turn detection is enabled, the English inference runner is registered
 * before the app is started.
 *
 * @param {{ agentFile: string, agentName?: string }} options
 *   Launch options; `agentName` falls back to `DEFAULT_AGENT_NAME`.
 * @returns {Promise<void>}
 */
export async function startVoiceAgentWorker(options) {
  const { url, apiKey, apiSecret } = resolveLiveKitServerConfig();
  const agents = await import("@livekit/agents");
  const { enabled: eouEnabled } = resolveEouTurnDetection();
  if (eouEnabled) {
    await registerEouTurnDetectorRunner();
  }
  const workerOptions = new agents.WorkerOptions({
    agent: options.agentFile,
    agentName: options.agentName ?? DEFAULT_AGENT_NAME,
    wsURL: url,
    apiKey,
    apiSecret,
  });
  agents.cli.runApp(workerOptions);
}
|
|
58
|
+
//# sourceMappingURL=voiceAgentWorker.js.map
|
package/dist/neurolink.js
CHANGED
|
@@ -66,6 +66,7 @@ import { CircuitBreaker, ERROR_CODES, ErrorFactory, isAbortError, isRetriableErr
|
|
|
66
66
|
import { hasLifecycleErrorFired, markLifecycleErrorFired, } from "./utils/lifecycleCallbacks.js";
|
|
67
67
|
import { resolveLifecycleTimeoutMs } from "./utils/lifecycleTimeout.js";
|
|
68
68
|
import { cloneOptionsForCallIsolation } from "./utils/cloneOptions.js";
|
|
69
|
+
import { coerceJsonToSchema } from "./utils/json/coerce.js";
|
|
69
70
|
// Factory processing imports
|
|
70
71
|
import { createCleanStreamOptions, enhanceTextGenerationOptions, processFactoryOptions, processStreamingFactoryOptions, validateFactoryConfig, } from "./utils/factoryProcessing.js";
|
|
71
72
|
import { logger, mcpLogger } from "./utils/logger.js";
|
|
@@ -3345,6 +3346,60 @@ Current user's request: ${currentInput}`;
|
|
|
3345
3346
|
}
|
|
3346
3347
|
finalizeGenerateRequestResult(params) {
|
|
3347
3348
|
const { generateSpan, options, textOptions, textResult, factoryResult, originalPrompt, startTime, } = params;
|
|
3349
|
+
// Provider-agnostic JSON coercion for schema requests. Structured-output
|
|
3350
|
+
// enforcement makes valid JSON the overwhelming case; for every other
|
|
3351
|
+
// provider path — including generate() overrides (Vertex, Anthropic,
|
|
3352
|
+
// Bedrock, Google AI Studio) — object/array roots are recovered here via
|
|
3353
|
+
// balanced-scan + jsonrepair and scalar JSON roots via plain JSON.parse,
|
|
3354
|
+
// with the parsed value exposed as `structuredData`. If nothing
|
|
3355
|
+
// JSON-shaped is recoverable (pure prose), the raw text is returned,
|
|
3356
|
+
// `structuredData` stays undefined, and a WARN makes the case observable.
|
|
3357
|
+
// Runs BEFORE the end-of-generation emits below so event consumers see
|
|
3358
|
+
// the same coerced content/structuredData the caller receives.
|
|
3359
|
+
if (textOptions.schema &&
|
|
3360
|
+
textResult.structuredData === undefined &&
|
|
3361
|
+
typeof textResult.content === "string") {
|
|
3362
|
+
const coerced = coerceJsonToSchema(textResult.content, textOptions.schema);
|
|
3363
|
+
if (coerced) {
|
|
3364
|
+
textResult.content = coerced.content;
|
|
3365
|
+
textResult.structuredData = coerced.structuredData;
|
|
3366
|
+
if (coerced.repaired) {
|
|
3367
|
+
textResult.jsonRepaired = true;
|
|
3368
|
+
}
|
|
3369
|
+
if (coerced.truncated) {
|
|
3370
|
+
textResult.jsonTruncated = true;
|
|
3371
|
+
}
|
|
3372
|
+
}
|
|
3373
|
+
else {
|
|
3374
|
+
try {
|
|
3375
|
+
const scalar = JSON.parse(textResult.content);
|
|
3376
|
+
if (scalar !== null && scalar !== undefined) {
|
|
3377
|
+
textResult.structuredData = scalar;
|
|
3378
|
+
}
|
|
3379
|
+
}
|
|
3380
|
+
catch {
|
|
3381
|
+
logger.warn("[NeuroLink] schema requested but no JSON could be recovered from model output; returning raw text", { provider: textResult.provider, model: textResult.model });
|
|
3382
|
+
}
|
|
3383
|
+
}
|
|
3384
|
+
}
|
|
3385
|
+
// Surface truncation when a schema was requested: either the provider
|
|
3386
|
+
// reported finishReason="length" or the recovered JSON came from an
|
|
3387
|
+
// unclosed span. Either way `structuredData` may be incomplete — warn at
|
|
3388
|
+
// WARN level so it is observable in production (not just debug logs).
|
|
3389
|
+
if (textOptions.schema) {
|
|
3390
|
+
if (textResult.finishReason === "length") {
|
|
3391
|
+
textResult.jsonTruncated = true;
|
|
3392
|
+
}
|
|
3393
|
+
if (textResult.jsonTruncated) {
|
|
3394
|
+
logger.warn("[NeuroLink] Structured output may be truncated (finishReason=length or unclosed JSON); " +
|
|
3395
|
+
"increase maxTokens to fit the full response.", {
|
|
3396
|
+
provider: textResult.provider,
|
|
3397
|
+
model: textResult.model,
|
|
3398
|
+
finishReason: textResult.finishReason,
|
|
3399
|
+
outputTokens: textResult.usage?.output,
|
|
3400
|
+
});
|
|
3401
|
+
}
|
|
3402
|
+
}
|
|
3348
3403
|
// Skip the top-level `generation:end` emission when the provider already
|
|
3349
3404
|
// emitted it from its native generate path (Vertex / Google AI Studio).
|
|
3350
3405
|
// Without this guard, native-path providers would surface TWO events
|
|
@@ -3378,7 +3433,10 @@ Current user's request: ${currentInput}`;
|
|
|
3378
3433
|
this.emitter.emit("message", `Generation completed in ${Date.now() - startTime}ms`);
|
|
3379
3434
|
const generateResult = {
|
|
3380
3435
|
content: textResult.content,
|
|
3436
|
+
structuredData: textResult.structuredData,
|
|
3381
3437
|
finishReason: textResult.finishReason,
|
|
3438
|
+
jsonRepaired: textResult.jsonRepaired,
|
|
3439
|
+
jsonTruncated: textResult.jsonTruncated,
|
|
3382
3440
|
provider: textResult.provider,
|
|
3383
3441
|
model: textResult.model,
|
|
3384
3442
|
usage: textResult.usage
|
|
@@ -21,6 +21,7 @@ import { emitToolEndFromStepFinish } from "../utils/toolEndEmitter.js";
|
|
|
21
21
|
import { NoOutputGeneratedError } from "../utils/generationErrors.js";
|
|
22
22
|
import { buildNoOutputSentinel, stampNoOutputSpan, } from "../utils/noOutputSentinel.js";
|
|
23
23
|
import { convertZodToJsonSchema } from "../utils/schemaConversion.js";
|
|
24
|
+
import { resolveClaudeMaxTokens } from "../utils/tokenLimits.js";
|
|
24
25
|
import { createChunkQueue, createDeferredAnalytics, stringifyToolInput, } from "./openaiChatCompletionsClient.js";
|
|
25
26
|
/**
|
|
26
27
|
* Beta headers for Claude Code integration.
|
|
@@ -493,10 +494,19 @@ const mapAnthropicStopReason = (raw) => {
|
|
|
493
494
|
return "stop";
|
|
494
495
|
}
|
|
495
496
|
};
|
|
496
|
-
// Anthropic's Messages API requires max_tokens on every request.
|
|
497
|
-
//
|
|
498
|
-
//
|
|
499
|
-
|
|
497
|
+
// Anthropic's Messages API requires max_tokens on every request. When the
|
|
498
|
+
// caller omits it, default to the model's real output ceiling via
|
|
499
|
+
// resolveClaudeMaxTokens (e.g. 64K for Sonnet 4.x) instead of the legacy 4096,
|
|
500
|
+
// which silently truncated large structured responses mid-JSON.
|
|
501
|
+
//
|
|
502
|
+
// Client-level request timeout. The Anthropic SDK throws "Streaming is required
|
|
503
|
+
// for long requests" from a NON-streaming `messages.create` when `max_tokens`
|
|
504
|
+
// is large AND no client-level timeout is configured (it can't estimate a safe
|
|
505
|
+
// timeout). Setting an explicit client timeout — equal to the SDK's own default
|
|
506
|
+
// for the non-throwing path — suppresses that pre-flight throw so large
|
|
507
|
+
// max_tokens (our model-ceiling default) works. Per-request duration is still
|
|
508
|
+
// bounded by the abort signal NeuroLink composes for each call.
|
|
509
|
+
const ANTHROPIC_CLIENT_TIMEOUT_MS = 10 * 60 * 1000; // 10 minutes, expressed in milliseconds
|
|
500
510
|
/**
|
|
501
511
|
* Anthropic Provider v2 - BaseProvider Implementation
|
|
502
512
|
* Enhanced with OAuth support, subscription tiers, and beta headers for Claude Code integration.
|
|
@@ -602,6 +612,7 @@ export class AnthropicProvider extends BaseProvider {
|
|
|
602
612
|
apiKey: "oauth-authenticated", // Placeholder, actual auth is in fetch wrapper
|
|
603
613
|
// Note: No headers passed - fetch wrapper sets oauth-2025-04-20 beta header
|
|
604
614
|
fetch: oauthFetch,
|
|
615
|
+
timeout: ANTHROPIC_CLIENT_TIMEOUT_MS,
|
|
605
616
|
});
|
|
606
617
|
logger.debug("[AnthropicProvider] Anthropic SDK client created with OAuth fetch wrapper");
|
|
607
618
|
logger.debug("Anthropic Provider initialized with OAuth", {
|
|
@@ -647,6 +658,7 @@ export class AnthropicProvider extends BaseProvider {
|
|
|
647
658
|
defaultHeaders: headers,
|
|
648
659
|
...(normalizedBaseURL && { baseURL: normalizedBaseURL }),
|
|
649
660
|
fetch: createProxyFetch(),
|
|
661
|
+
timeout: ANTHROPIC_CLIENT_TIMEOUT_MS,
|
|
650
662
|
});
|
|
651
663
|
logger.debug("Anthropic Provider initialized with API key", {
|
|
652
664
|
modelName: this.modelName,
|
|
@@ -1122,7 +1134,7 @@ export class AnthropicProvider extends BaseProvider {
|
|
|
1122
1134
|
const params = {
|
|
1123
1135
|
model: modelId,
|
|
1124
1136
|
messages,
|
|
1125
|
-
max_tokens: options.maxOutputTokens
|
|
1137
|
+
max_tokens: resolveClaudeMaxTokens(modelId, options.maxOutputTokens),
|
|
1126
1138
|
...(system ? { system } : {}),
|
|
1127
1139
|
...(options.temperature !== undefined && options.temperature !== null
|
|
1128
1140
|
? { temperature: options.temperature }
|
|
@@ -1137,7 +1149,22 @@ export class AnthropicProvider extends BaseProvider {
|
|
|
1137
1149
|
...(toolChoice ? { tool_choice: toolChoice } : {}),
|
|
1138
1150
|
...(thinking ? { thinking } : {}),
|
|
1139
1151
|
};
|
|
1140
|
-
|
|
1152
|
+
// The 60s anthropic generate default was tuned for the old ~4096
|
|
1153
|
+
// max_tokens. Now that the default ceiling is the model's real max,
|
|
1154
|
+
// a large structured response needs more wall-clock to be produced —
|
|
1155
|
+
// otherwise the inner controller aborts mid-generation (the AI-SDK
|
|
1156
|
+
// doGenerate layer doesn't see the caller's `timeout`). Raise the
|
|
1157
|
+
// floor to 5 min when a large output budget is in play — but only
|
|
1158
|
+
// when the caller did NOT set an explicit timeout: an explicit value
|
|
1159
|
+
// is a contract and must never be silently extended. The abort
|
|
1160
|
+
// signal stays the real bound.
|
|
1161
|
+
const callerTimeout = options
|
|
1162
|
+
.timeout;
|
|
1163
|
+
const callerSpecifiedTimeout = callerTimeout !== undefined && callerTimeout !== null;
|
|
1164
|
+
const generateTimeoutMs = params.max_tokens > 8192 && !callerSpecifiedTimeout
|
|
1165
|
+
? Math.max(getTimeoutForOptions(options), 300_000)
|
|
1166
|
+
: getTimeoutForOptions(options);
|
|
1167
|
+
const timeoutController = createTimeoutController(generateTimeoutMs, providerName, "generate");
|
|
1141
1168
|
let response;
|
|
1142
1169
|
try {
|
|
1143
1170
|
response = await client.messages.create(params, {
|
|
@@ -1356,7 +1383,7 @@ export class AnthropicProvider extends BaseProvider {
|
|
|
1356
1383
|
const params = {
|
|
1357
1384
|
model: modelId,
|
|
1358
1385
|
messages: conversation,
|
|
1359
|
-
max_tokens: options.maxTokens
|
|
1386
|
+
max_tokens: resolveClaudeMaxTokens(modelId, options.maxTokens),
|
|
1360
1387
|
stream: true,
|
|
1361
1388
|
...(payload.system ? { system: payload.system } : {}),
|
|
1362
1389
|
...(options.temperature !== undefined && options.temperature !== null
|