@juspay/neurolink 9.52.0 → 9.54.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +12 -0
- package/README.md +19 -0
- package/dist/adapters/tts/cartesiaHandler.d.ts +12 -0
- package/dist/adapters/tts/cartesiaHandler.js +130 -0
- package/dist/agent/directTools.d.ts +2 -2
- package/dist/auth/errors.d.ts +1 -1
- package/dist/auth/middleware/AuthMiddleware.d.ts +1 -1
- package/dist/auth/providers/BaseAuthProvider.d.ts +1 -1
- package/dist/autoresearch/config.d.ts +11 -0
- package/dist/autoresearch/config.js +108 -0
- package/dist/autoresearch/errors.d.ts +40 -0
- package/dist/autoresearch/errors.js +20 -0
- package/dist/autoresearch/index.d.ts +23 -0
- package/dist/autoresearch/index.js +34 -0
- package/dist/autoresearch/phasePolicy.d.ts +9 -0
- package/dist/autoresearch/phasePolicy.js +69 -0
- package/dist/autoresearch/promptCompiler.d.ts +15 -0
- package/dist/autoresearch/promptCompiler.js +120 -0
- package/dist/autoresearch/repoPolicy.d.ts +32 -0
- package/dist/autoresearch/repoPolicy.js +128 -0
- package/dist/autoresearch/resultRecorder.d.ts +20 -0
- package/dist/autoresearch/resultRecorder.js +130 -0
- package/dist/autoresearch/runner.d.ts +10 -0
- package/dist/autoresearch/runner.js +102 -0
- package/dist/autoresearch/stateStore.d.ts +12 -0
- package/dist/autoresearch/stateStore.js +163 -0
- package/dist/autoresearch/summaryParser.d.ts +16 -0
- package/dist/autoresearch/summaryParser.js +94 -0
- package/dist/autoresearch/tools.d.ts +257 -0
- package/dist/autoresearch/tools.js +617 -0
- package/dist/autoresearch/worker.d.ts +71 -0
- package/dist/autoresearch/worker.js +417 -0
- package/dist/browser/neurolink.min.js +340 -326
- package/dist/cli/commands/autoresearch.d.ts +41 -0
- package/dist/cli/commands/autoresearch.js +487 -0
- package/dist/cli/commands/config.d.ts +1 -1
- package/dist/cli/commands/task.d.ts +2 -0
- package/dist/cli/commands/task.js +32 -3
- package/dist/cli/commands/voiceServer.d.ts +6 -0
- package/dist/cli/commands/voiceServer.js +17 -0
- package/dist/cli/parser.js +7 -1
- package/dist/core/baseProvider.js +18 -0
- package/dist/evaluation/errors/EvaluationError.d.ts +1 -1
- package/dist/lib/adapters/tts/cartesiaHandler.d.ts +12 -0
- package/dist/lib/adapters/tts/cartesiaHandler.js +131 -0
- package/dist/lib/agent/directTools.d.ts +2 -2
- package/dist/lib/auth/errors.d.ts +1 -1
- package/dist/lib/auth/middleware/AuthMiddleware.d.ts +1 -1
- package/dist/lib/auth/providers/BaseAuthProvider.d.ts +1 -1
- package/dist/lib/autoresearch/config.d.ts +11 -0
- package/dist/lib/autoresearch/config.js +109 -0
- package/dist/lib/autoresearch/errors.d.ts +40 -0
- package/dist/lib/autoresearch/errors.js +21 -0
- package/dist/lib/autoresearch/index.d.ts +23 -0
- package/dist/lib/autoresearch/index.js +35 -0
- package/dist/lib/autoresearch/phasePolicy.d.ts +9 -0
- package/dist/lib/autoresearch/phasePolicy.js +70 -0
- package/dist/lib/autoresearch/promptCompiler.d.ts +15 -0
- package/dist/lib/autoresearch/promptCompiler.js +121 -0
- package/dist/lib/autoresearch/repoPolicy.d.ts +32 -0
- package/dist/lib/autoresearch/repoPolicy.js +129 -0
- package/dist/lib/autoresearch/resultRecorder.d.ts +20 -0
- package/dist/lib/autoresearch/resultRecorder.js +131 -0
- package/dist/lib/autoresearch/runner.d.ts +10 -0
- package/dist/lib/autoresearch/runner.js +103 -0
- package/dist/lib/autoresearch/stateStore.d.ts +12 -0
- package/dist/lib/autoresearch/stateStore.js +164 -0
- package/dist/lib/autoresearch/summaryParser.d.ts +16 -0
- package/dist/lib/autoresearch/summaryParser.js +95 -0
- package/dist/lib/autoresearch/tools.d.ts +257 -0
- package/dist/lib/autoresearch/tools.js +618 -0
- package/dist/lib/autoresearch/worker.d.ts +71 -0
- package/dist/lib/autoresearch/worker.js +418 -0
- package/dist/lib/core/baseProvider.js +18 -0
- package/dist/lib/evaluation/errors/EvaluationError.d.ts +1 -1
- package/dist/lib/files/fileTools.d.ts +1 -1
- package/dist/lib/neurolink.js +22 -2
- package/dist/lib/providers/azureOpenai.d.ts +4 -1
- package/dist/lib/providers/azureOpenai.js +9 -3
- package/dist/lib/providers/litellm.js +2 -2
- package/dist/lib/providers/openRouter.js +2 -2
- package/dist/lib/providers/openaiCompatible.js +3 -1
- package/dist/lib/server/voice/frameBus.d.ts +8 -0
- package/dist/lib/server/voice/frameBus.js +25 -0
- package/dist/lib/server/voice/turnManager.d.ts +15 -0
- package/dist/lib/server/voice/turnManager.js +36 -0
- package/dist/lib/server/voice/types.d.ts +20 -0
- package/dist/lib/server/voice/types.js +2 -0
- package/dist/lib/server/voice/voiceServerApp.d.ts +1 -0
- package/dist/lib/server/voice/voiceServerApp.js +118 -0
- package/dist/lib/server/voice/voiceWebSocketHandler.d.ts +11 -0
- package/dist/lib/server/voice/voiceWebSocketHandler.js +536 -0
- package/dist/lib/tasks/autoresearchTaskExecutor.d.ts +32 -0
- package/dist/lib/tasks/autoresearchTaskExecutor.js +303 -0
- package/dist/lib/tasks/errors.d.ts +3 -1
- package/dist/lib/tasks/errors.js +1 -0
- package/dist/lib/tasks/taskExecutor.d.ts +4 -2
- package/dist/lib/tasks/taskExecutor.js +8 -1
- package/dist/lib/tasks/taskManager.js +27 -3
- package/dist/lib/tasks/tools/taskTools.d.ts +1 -1
- package/dist/lib/telemetry/attributes.d.ts +15 -0
- package/dist/lib/telemetry/attributes.js +16 -0
- package/dist/lib/telemetry/tracers.d.ts +1 -0
- package/dist/lib/telemetry/tracers.js +1 -0
- package/dist/lib/types/autoresearchTypes.d.ts +194 -0
- package/dist/lib/types/autoresearchTypes.js +18 -0
- package/dist/lib/types/common.d.ts +11 -0
- package/dist/lib/types/index.d.ts +16 -14
- package/dist/lib/types/index.js +21 -17
- package/dist/lib/types/taskTypes.d.ts +38 -0
- package/dist/lib/workflow/config.d.ts +3 -3
- package/dist/neurolink.js +22 -2
- package/dist/providers/azureOpenai.d.ts +4 -1
- package/dist/providers/azureOpenai.js +9 -3
- package/dist/providers/litellm.js +2 -2
- package/dist/providers/openRouter.js +2 -2
- package/dist/providers/openaiCompatible.js +3 -1
- package/dist/rag/errors/RAGError.d.ts +1 -1
- package/dist/server/voice/frameBus.d.ts +8 -0
- package/dist/server/voice/frameBus.js +24 -0
- package/dist/server/voice/public/app.js +275 -0
- package/dist/server/voice/public/index.html +18 -0
- package/dist/server/voice/public/pcm-worklet.js +67 -0
- package/dist/server/voice/public/styles.css +102 -0
- package/dist/server/voice/turnManager.d.ts +15 -0
- package/dist/server/voice/turnManager.js +35 -0
- package/dist/server/voice/types.d.ts +20 -0
- package/dist/server/voice/types.js +1 -0
- package/dist/server/voice/voiceServerApp.d.ts +1 -0
- package/dist/server/voice/voiceServerApp.js +117 -0
- package/dist/server/voice/voiceWebSocketHandler.d.ts +11 -0
- package/dist/server/voice/voiceWebSocketHandler.js +535 -0
- package/dist/tasks/autoresearchTaskExecutor.d.ts +32 -0
- package/dist/tasks/autoresearchTaskExecutor.js +302 -0
- package/dist/tasks/errors.d.ts +3 -1
- package/dist/tasks/errors.js +1 -0
- package/dist/tasks/taskExecutor.d.ts +4 -2
- package/dist/tasks/taskExecutor.js +8 -1
- package/dist/tasks/taskManager.js +27 -3
- package/dist/tasks/tools/taskTools.d.ts +1 -1
- package/dist/telemetry/attributes.d.ts +15 -0
- package/dist/telemetry/attributes.js +16 -0
- package/dist/telemetry/tracers.d.ts +1 -0
- package/dist/telemetry/tracers.js +1 -0
- package/dist/types/autoresearchTypes.d.ts +194 -0
- package/dist/types/autoresearchTypes.js +17 -0
- package/dist/types/common.d.ts +11 -0
- package/dist/types/index.d.ts +16 -14
- package/dist/types/index.js +21 -17
- package/dist/types/taskTypes.d.ts +38 -0
- package/package.json +2 -1
@@ -0,0 +1,535 @@
+import WebSocket, { WebSocketServer } from "ws";
+import { Cobra } from "@picovoice/cobra-node";
+import { FrameBus } from "./frameBus.js";
+import { TurnManager, TurnState } from "./turnManager.js";
+import { CartesiaStream } from "../../adapters/tts/cartesiaHandler.js";
+import { NeuroLink } from "../../neurolink.js";
+import { logger } from "../../utils/logger.js";
+import { withTimeout } from "../../utils/async/withTimeout.js";
+const SONIOX_URL = process.env.SONIOX_WS_URL ?? "wss://stt-rt.soniox.com/transcribe-websocket";
+function getRequiredEnv(name) {
+    const value = process.env[name];
+    if (!value) {
+        throw new Error(`${name} is not set in environment`);
+    }
+    return value;
+}
+/**
+ * Call from the voice-server command handler BEFORE importing anything else
+ * so the env change is scoped to voice mode only.
+ */
+export function configureVoiceServerEnvironment() {
+    // Disable MCP tools for the voice server — tools add 5-7s of init latency
+    // on every turn and are not needed for real-time voice interaction.
+    process.env.NEUROLINK_DISABLE_MCP_TOOLS = "true";
+}
+let _sonioxApiKey;
+function getSonioxApiKey() {
+    if (!_sonioxApiKey) {
+        _sonioxApiKey = getRequiredEnv("SONIOX_API_KEY");
+    }
+    return _sonioxApiKey;
+}
+// How many consecutive silent Cobra frames (each 32ms) before declaring speech end.
+// 30 x 32ms = 960ms — long enough to distinguish a thinking pause from a real stop.
+const SILENCE_FRAMES_TO_STOP = 30;
+// How many consecutive voice frames (each 32ms) before declaring speech start.
+// 5 x 32ms = 160ms — filters brief noise/echo transients.
+const VOICE_FRAMES_TO_START = 5;
+// Cobra voice probability threshold (0–1)
+const VOICE_THRESHOLD = 0.7;
+// Build a 44-byte WAV header for a streaming PCM connection.
+// Data chunk size set to 0xFFFFFFFF (indefinite length) so Soniox can stream continuously.
+function makeWavHeader(sampleRate, numChannels) {
+    const buf = Buffer.alloc(44);
+    buf.write("RIFF", 0);
+    buf.writeUInt32LE(0xffffffff, 4);
+    buf.write("WAVE", 8);
+    buf.write("fmt ", 12);
+    buf.writeUInt32LE(16, 16);
+    buf.writeUInt16LE(1, 20);
+    buf.writeUInt16LE(numChannels, 22);
+    buf.writeUInt32LE(sampleRate, 24);
+    buf.writeUInt32LE(sampleRate * numChannels * 2, 28);
+    buf.writeUInt16LE(numChannels * 2, 32);
+    buf.writeUInt16LE(16, 34);
+    buf.write("data", 36);
+    buf.writeUInt32LE(0xffffffff, 40);
+    return buf;
+}
+const now = () => Number(process.hrtime.bigint()) / 1e6;
+function parseJson(value) {
+    return JSON.parse(value);
+}
+function parseSonioxMessage(message) {
+    try {
+        return parseJson(message.toString());
+    }
+    catch (error) {
+        logger.warn("[SONIOX] Ignoring invalid JSON message", error);
+        return null;
+    }
+}
+function parseClientControlMessage(data) {
+    try {
+        return parseJson(data);
+    }
+    catch (error) {
+        logger.warn("[WS] Ignoring invalid client control message", error);
+        return null;
+    }
+}
+async function streamAnswer(neurolink, messages, options) {
+    // Last message is the current user turn; everything before it is history.
+    const lastMessage = messages[messages.length - 1];
+    const history = messages.slice(0, -1);
+    const provider = process.env.VOICE_LLM_PROVIDER ?? "azure";
+    const model = process.env.VOICE_LLM_MODEL ?? "gpt-4o-automatic";
+    const result = await neurolink.stream({
+        provider,
+        model,
+        // Current user message as the active input.
+        input: { text: lastMessage.content },
+        // Prior turns passed as structured history so NeuroLink's memory layer
+        // picks them up correctly (fixes "No memory or context" warning).
+        conversationHistory: history.map((m) => ({
+            role: m.role,
+            content: m.content,
+        })),
+        timeout: options?.timeoutMs ?? 30000,
+        // CRITICAL FOR LATENCY
+        temperature: 0.25, // lower = faster + stable
+        maxTokens: 140, // FIXES HALF ANSWERS
+        disableTools: true, // removes orchestration overhead
+        enableAnalytics: false,
+        enableEvaluation: false,
+        // Voice-specific instruction
+        systemPrompt: `You are a real-time voice assistant. Respond naturally and concisely. Use short spoken sentences. Do not write paragraphs.`,
+    });
+    return result.stream;
+}
+export function setupWebSocket(server) {
+    const wss = new WebSocketServer({ server });
+    const accessKey = process.env.PICOVOICE_ACCESS_KEY;
+    if (!accessKey) {
+        throw new Error("PICOVOICE_ACCESS_KEY is not set in environment");
+    }
+    const neurolink = new NeuroLink();
+    wss.on("connection", (clientWs) => {
+        logger.info("[WS] Client connected");
+        // --- Per-session Cobra instance ---
+        let cobra = null;
+        let FRAME_LENGTH = 512;
+        let FRAME_BYTES = FRAME_LENGTH * 2;
+        try {
+            cobra = new Cobra(accessKey);
+            FRAME_LENGTH = cobra.frameLength;
+            FRAME_BYTES = FRAME_LENGTH * 2;
+            logger.info(`[VAD] Cobra ready (frameLength=${FRAME_LENGTH})`);
+        }
+        catch (err) {
+            logger.error("[VAD] Cobra init failed:", err);
+            clientWs.close();
+            return;
+        }
+        // --- Per-session state ---
+        const bus = new FrameBus();
+        const turnManager = new TurnManager(bus);
+        let sonioxWs = null;
+        let keepAliveTimer = null;
+        let sessionClosed = false;
+        let transcriptBuffer = "";
+        let activeTTS = null;
+        const conversation = [];
+        let currentTurnId = 0;
+        let activePipelineTurnId = null;
+        // Safety fallback: if the client never sends playback_done (crash, network drop),
+        // auto-reset the turn state after this many ms so the assistant isn't stuck.
+        let playbackResetTimer = null;
+        // Timestamp (ms) before which barge-in via Soniox is suppressed.
+        // Set when TTS starts playing to prevent TTS echo from triggering immediate re-interrupt.
+        // AEC on the browser needs ~300-400ms to characterise the echo signal before suppressing it.
+        let bargeInLockedUntil = 0;
+        // Cobra VAD state
+        let isSpeaking = false;
+        let silenceFrameCount = 0;
+        let voiceFrameCount = 0;
+        let frameRemainder = Buffer.alloc(0);
+        /* ======= INTERRUPT ======= */
+        function closeTts(stream, reason) {
+            if (!stream) {
+                return;
+            }
+            try {
+                // Close the WS first so that any pending done/error/close listeners
+                // in processTurn() can settle immediately, rather than hanging until
+                // the withTimeout fires.
+                stream.close();
+                stream.removeAllListeners();
+            }
+            catch (error) {
+                logger.warn(reason, error);
+            }
+        }
+        function doInterrupt() {
+            logger.info("[INTERRUPT] Cutting TTS");
+            if (playbackResetTimer) {
+                clearTimeout(playbackResetTimer);
+                playbackResetTimer = null;
+            }
+            bargeInLockedUntil = 0;
+            currentTurnId++;
+            activePipelineTurnId = null;
+            transcriptBuffer = "";
+            isSpeaking = false;
+            silenceFrameCount = 0;
+            voiceFrameCount = 0;
+            if (activeTTS) {
+                closeTts(activeTTS, "[INTERRUPT] Failed to close active TTS stream");
+                activeTTS = null;
+            }
+            turnManager.reset();
+            if (clientWs.readyState === WebSocket.OPEN) {
+                clientWs.send(JSON.stringify({ type: "interrupt" }));
+            }
+        }
+        /* ======= SONIOX ======= */
+        function connectSoniox() {
+            const ws = new WebSocket(SONIOX_URL);
+            sonioxWs = ws;
+            ws.on("open", () => {
+                logger.info("[SONIOX] Connected");
+                ws.send(JSON.stringify({
+                    api_key: getSonioxApiKey(),
+                    model: "stt-rt-preview",
+                    audio_format: "auto",
+                    language_hints: ["en"],
+                    enable_endpoint_detection: true,
+                }));
+                ws.send(makeWavHeader(16000, 1));
+                startKeepAlive();
+            });
+            ws.on("message", handleSonioxMessage);
+            ws.on("close", (code, reason) => {
+                logger.info(`[SONIOX] Closed: code=${code} reason=${reason.toString() || "(none)"}`);
+                stopKeepAlive();
+                if (!sessionClosed) {
+                    setTimeout(() => {
+                        connectSoniox();
+                    }, 500);
+                }
+            });
+            ws.on("error", (err) => {
+                logger.error("[SONIOX] Error:", err.message);
+            });
+        }
+        function startKeepAlive() {
+            keepAliveTimer = setInterval(() => {
+                if (sonioxWs?.readyState === WebSocket.OPEN) {
+                    sonioxWs.send(JSON.stringify({ type: "keepalive" }));
+                }
+            }, 8000);
+        }
+        function stopKeepAlive() {
+            if (keepAliveTimer) {
+                clearInterval(keepAliveTimer);
+                keepAliveTimer = null;
+            }
+        }
+        /* ======= STT HANDLER ======= */
+        async function handleSonioxMessage(msg) {
+            const data = parseSonioxMessage(msg);
+            if (!data) {
+                return;
+            }
+            if (!Array.isArray(data.tokens)) {
+                if (data.error || data.status || data.type) {
+                    if (logger.shouldLog("debug")) {
+                        logger.info("[SONIOX] msg:", JSON.stringify(data));
+                    }
+                }
+                return;
+            }
+            const tokens = data.tokens;
+            // Barge-in detection:
+            // Soniox non-final tokens = real speech is being recognised right now.
+            // Browser AEC (echo cancellation) suppresses TTS playback at the mic, so
+            // non-final tokens can only come from the user's own voice — unlike raw
+            // Cobra probability which can be fooled by speaker echo.
+            // We only fire interrupt when the TurnManager confirms TTS is actually
+            // playing (ASSISTANT_SPEAKING state set by processTurn).
+            // bargeInLockedUntil suppresses the first ~400ms after TTS starts so that
+            // TTS audio picked up by the mic (before AEC locks on) can't re-trigger.
+            if (turnManager.state === TurnState.ASSISTANT_SPEAKING &&
+                Date.now() > bargeInLockedUntil) {
+                const speechPartials = tokens.filter((token) => !token.is_final && token.text && token.text.trim().length > 1);
+                if (speechPartials.length > 0) {
+                    logger.info(`[BARGE-IN] Detected via Soniox: "${speechPartials.map((token) => token.text).join("")}"`);
+                    doInterrupt();
+                    return;
+                }
+            }
+            const finals = tokens.filter((token) => token.is_final && token.text);
+            if (!finals.length) {
+                return;
+            }
+            transcriptBuffer += finals.map((token) => token.text).join("");
+            const hasEnd = finals.some((token) => token.text === "<end>");
+            if (!hasEnd) {
+                return;
+            }
+            const finalText = transcriptBuffer.replace("<end>", "").trim();
+            transcriptBuffer = "";
+            if (!finalText) {
+                return;
+            }
+            logger.info("[STT] Final ->", finalText);
+            try {
+                await processTurn(finalText);
+            }
+            catch (err) {
+                logger.error("[PIPELINE] Unhandled error in processTurn:", err.message);
+                turnManager.reset();
+            }
+        }
+        /* ======= TURN PROCESSOR ======= */
+        async function processTurn(userText) {
+            if (activePipelineTurnId !== null) {
+                logger.info("[PIPELINE] Already running — discarding duplicate STT final");
+                return;
+            }
+            currentTurnId++;
+            const myTurn = currentTurnId;
+            activePipelineTurnId = myTurn;
+            const tSttEnd = now();
+            try {
+                // Build context without mutating `conversation` — only commit on full completion.
+                const stream = await streamAnswer(neurolink, [
+                    ...conversation,
+                    { role: "user", content: userText },
+                ]);
+                if (myTurn !== currentTurnId) {
+                    return;
+                }
+                const tts = new CartesiaStream(`turn-${Date.now()}`);
+                activeTTS = tts;
+                await tts.ready();
+                if (myTurn !== currentTurnId) {
+                    return;
+                }
+                // Register error handler immediately after ready() — before the LLM stream loop —
+                // so Cartesia errors emitted mid-stream (during token sending) are captured.
+                // Without this, errors during the for-await loop have no listener and are swallowed.
+                let ttsError = null;
+                tts.on("error", (err) => {
+                    ttsError = err;
+                    logger.error("[TTS] Mid-stream error:", err.message);
+                });
+                // Pre-lock barge-in BEFORE signaling assistant speaking.
+                // Without this there is a ~700-1000ms gap where TurnState is ASSISTANT_SPEAKING
+                // but bargeInLockedUntil=0, so Soniox residual tokens from the previous TTS echo
+                // immediately trigger an interrupt before any audio has even been sent.
+                bargeInLockedUntil = Date.now() + 1000;
+                // Signal TurnManager that TTS is about to play — barge-in detection is now live.
+                turnManager.assistantSpeaking();
+                let firstAudioSent = false;
+                let assistantReply = "";
+                let tokenBuffer = "";
+                // Sentence/phrase boundaries to flush on — avoids flooding Cartesia with
+                // one tiny message per token, which causes "Service unavailable" errors on
+                // long responses. We flush when we hit natural speech breaks or the buffer
+                // grows large enough to produce a clean TTS chunk.
+                const FLUSH_REGEX = /[.!?,;:]\s/;
+                const FLUSH_MIN_LENGTH = 80;
+                tts.on("audio", (audio) => {
+                    if (myTurn !== currentTurnId) {
+                        return;
+                    }
+                    if (!firstAudioSent) {
+                        firstAudioSent = true;
+                        // Refresh the lock from when audio ACTUALLY hits the client so it covers
+                        // the AEC lock-on window (~300-400ms for browser echo cancellation).
+                        // This extends the protection past the initial 1000ms pre-lock.
+                        bargeInLockedUntil = Date.now() + 400;
+                        logger.info(`[LATENCY] STT -> First Audio: ${(now() - tSttEnd).toFixed(0)}ms`);
+                    }
+                    if (clientWs.readyState === WebSocket.OPEN) {
+                        clientWs.send(audio);
+                    }
+                });
+                for await (const chunk of stream) {
+                    if (myTurn !== currentTurnId) {
+                        logger.info("[PIPELINE] Stale LLM stream — dropping");
+                        break;
+                    }
+                    // If Cartesia errored mid-stream, abort sending more tokens.
+                    if (ttsError) {
+                        logger.info("[PIPELINE] Aborting LLM stream — Cartesia error");
+                        break;
+                    }
+                    if (!chunk || typeof chunk !== "object" || !("content" in chunk)) {
+                        continue;
+                    }
+                    if (typeof chunk.content !== "string") {
+                        continue;
+                    }
+                    assistantReply += chunk.content;
+                    tokenBuffer += chunk.content;
+                    // Flush buffer to Cartesia at sentence/phrase boundaries or when it's
+                    // grown large enough. This batches tokens into meaningful speech chunks
+                    // instead of sending one WebSocket message per token.
+                    if (FLUSH_REGEX.test(tokenBuffer) ||
+                        tokenBuffer.length >= FLUSH_MIN_LENGTH) {
+                        tts.send(tokenBuffer, true);
+                        tokenBuffer = "";
+                    }
+                }
+                // Flush any remaining buffered tokens before the final flush().
+                if (tokenBuffer) {
+                    tts.send(tokenBuffer, true);
+                    tokenBuffer = "";
+                }
+                // If Cartesia errored during the stream, reset and bail out now.
+                if (ttsError) {
+                    logger.error("[TTS] Error during stream — resetting turn so user can retry:", String(ttsError));
+                    closeTts(tts, "[TTS] Failed to close stream after mid-stream error");
+                    turnManager.reset();
+                    return;
+                }
+                if (myTurn !== currentTurnId) {
+                    return;
+                }
+                let ttsSucceeded = false;
+                try {
+                    await withTimeout(new Promise((resolve, reject) => {
+                        tts.once("done", () => {
+                            ttsSucceeded = true;
+                            resolve();
+                        });
+                        // Re-use the persistent error handler: if another error arrives during flush,
+                        // the existing "error" listener fires ttsError; reject via a one-time wrapper.
+                        tts.once("error", reject);
+                        // Reject if the socket closes without emitting done or error.
+                        tts.once("close", () => reject(new Error("Cartesia WS closed before flush completed")));
+                        tts.flush();
+                    }), 10000, "Cartesia flush timed out");
+                }
+                catch (err) {
+                    // Cartesia failed (e.g. "Service unavailable"). The user heard nothing.
+                    // Reset state immediately so they can speak and retry — don't commit
+                    // the turn to conversation history since it was never heard.
+                    logger.error("[TTS] Error during flush — resetting turn so user can retry:", err.message);
+                    closeTts(tts, "[TTS] Failed to close stream after flush error");
+                    turnManager.reset();
+                    return;
+                }
+                closeTts(tts, "[TTS] Failed to close stream after successful playback");
+                if (!ttsSucceeded || myTurn !== currentTurnId) {
+                    return;
+                }
+                // Only commit conversation when the turn completed fully and was heard.
+                conversation.push({ role: "user", content: userText });
+                conversation.push({ role: "assistant", content: assistantReply });
+                // Do NOT reset state here — the client is still playing buffered audio.
+                // The client sends playback_done when the last audio chunk finishes playing,
+                // which is the correct moment to return to IDLE and allow new user speech.
+                // Safety fallback: if the client never sends playback_done (crash, disconnect),
+                // auto-reset after 20 seconds so the assistant doesn't stay stuck.
+                if (playbackResetTimer) {
+                    clearTimeout(playbackResetTimer);
+                }
+                playbackResetTimer = setTimeout(() => {
+                    playbackResetTimer = null;
+                    turnManager.reset();
+                }, 20000);
+            }
+            finally {
+                if (activePipelineTurnId === myTurn) {
+                    activePipelineTurnId = null;
+                }
+            }
+        }
+        /* ======= CLIENT AUDIO + CONTROL ======= */
+        clientWs.on("message", (data) => {
+            if (typeof data === "string") {
+                const msg = parseClientControlMessage(data);
+                if (msg?.type === "playback_done") {
+                    // Client finished playing all audio — now it's safe to listen again.
+                    if (playbackResetTimer) {
+                        clearTimeout(playbackResetTimer);
+                        playbackResetTimer = null;
+                    }
+                    turnManager.reset();
+                }
+                return;
+            }
+            if (!(data instanceof Buffer)) {
+                return;
+            }
+            // Reassemble into exact FRAME_BYTES-sized Cobra frames.
+            const combined = Buffer.concat([frameRemainder, data]);
+            let pos = 0;
+            while (pos + FRAME_BYTES <= combined.length) {
+                const frame = new Int16Array(FRAME_LENGTH);
+                for (let i = 0; i < FRAME_LENGTH; i++) {
+                    frame[i] = combined.readInt16LE(pos + i * 2);
+                }
+                pos += FRAME_BYTES;
+                // Cobra VAD:
+                // Cobra tracks when the user is speaking vs silent. Its output drives
+                // TurnManager state (USER_SPEAKING / PROCESSING) but does NOT trigger
+                // interrupt — that comes from Soniox non-final tokens so echo can't fool it.
+                let voiceProb = 0;
+                try {
+                    if (!cobra) {
+                        continue;
+                    }
+                    voiceProb = cobra.process(frame);
+                }
+                catch (err) {
+                    logger.error("[VAD] Cobra process error:", err);
+                }
+                const isVoice = voiceProb >= VOICE_THRESHOLD;
+                if (isVoice) {
+                    voiceFrameCount++;
+                    silenceFrameCount = 0;
+                    if (!isSpeaking && voiceFrameCount >= VOICE_FRAMES_TO_START) {
+                        isSpeaking = true;
+                        logger.info(`[VAD] Speech start (prob=${voiceProb.toFixed(2)})`);
+                        bus.publish({ type: "vad_start" });
+                    }
+                }
+                else {
+                    voiceFrameCount = 0;
+                    if (isSpeaking) {
+                        silenceFrameCount++;
+                        if (silenceFrameCount >= SILENCE_FRAMES_TO_STOP) {
+                            isSpeaking = false;
+                            silenceFrameCount = 0;
+                            logger.info("[VAD] Speech stop");
+                            bus.publish({ type: "vad_stop" });
+                        }
+                    }
+                }
+                // Always forward every frame to Soniox for continuous transcription.
+                if (sonioxWs?.readyState === WebSocket.OPEN) {
+                    sonioxWs.send(Buffer.from(frame.buffer));
+                }
+            }
+            frameRemainder = combined.subarray(pos);
+        });
+        clientWs.on("close", () => {
+            logger.info("[WS] Client disconnected");
+            sessionClosed = true;
+            if (cobra) {
+                cobra.release();
+            }
+            closeTts(activeTTS, "[WS] Failed to close active TTS on disconnect");
+            stopKeepAlive();
+            if (sonioxWs) {
+                sonioxWs.close();
+            }
+        });
+        connectSoniox();
+    });
+}
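
The handler above implies a small browser-to-server protocol: the client streams raw 16 kHz mono 16-bit PCM as binary WebSocket frames, receives TTS audio back as binary messages, and exchanges JSON control messages ({ type: "interrupt" } from the server, { type: "playback_done" } from the client once buffered audio has finished playing). Below is a minimal client-side sketch of that protocol; the URL, queueing, and playback wiring are illustrative assumptions, not part of the package.

// Hypothetical browser-side sketch of the wire protocol used by the handler above.
// Binary frames carry audio (16-bit PCM upstream, TTS chunks downstream); JSON text
// frames carry control messages. The URL and helper names here are assumptions.
const ws = new WebSocket("ws://localhost:3000"); // illustrative URL
ws.binaryType = "arraybuffer";

const ttsQueue: ArrayBuffer[] = [];

// Upstream audio: little-endian 16-bit PCM, 16 kHz mono, sent as binary frames.
export function sendPcmChunk(pcm: Int16Array): void {
    if (ws.readyState === WebSocket.OPEN) {
        ws.send(pcm.buffer);
    }
}

ws.onmessage = (event: MessageEvent) => {
    if (typeof event.data === "string") {
        const msg = JSON.parse(event.data);
        if (msg.type === "interrupt") {
            ttsQueue.length = 0; // barge-in: drop any queued TTS audio immediately
        }
        return;
    }
    ttsQueue.push(event.data as ArrayBuffer); // queue TTS audio for playback
};

// Once the last queued chunk has actually played out, tell the server it may listen again.
export function notifyPlaybackDone(): void {
    ws.send(JSON.stringify({ type: "playback_done" }));
}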
@@ -0,0 +1,32 @@
+/**
+ * Autoresearch task executor — bridges TaskManager with the
+ * autoresearch experiment loop.
+ *
+ * Each tick:
+ * 1. Loads/creates a ResearchWorker for the task's tag
+ * 2. Gets the phase-appropriate tool filter
+ * 3. Calls NeuroLink.generate() with research tools + prompt
+ * 4. Advances phase based on which tools the AI called
+ * 5. Returns a TaskRunResult
+ *
+ * Workers are cached by tag to avoid re-initialization on each tick.
+ * Forwards the NeuroLink emitter to each worker for lifecycle events.
+ */
+import type { AutoresearchEmitter } from "../types/autoresearchTypes.js";
+import type { NeuroLinkExecutable, Task, TaskRunResult } from "../types/taskTypes.js";
+/**
+ * Clear all cached workers. Called by TaskManager.shutdown().
+ */
+export declare function clearWorkerCache(): void;
+/**
+ * Execute one autoresearch tick for a task.
+ *
+ * Returns a TaskRunResult-shaped object.
+ * If the task is missing autoresearch config, returns an error result
+ * instead of throwing (so the scheduler can record the failure).
+ *
+ * @param emitter - Optional emitter to forward autoresearch lifecycle events
+ */
+export declare function executeAutoresearchTick(task: Task & {
+    autoresearch?: unknown;
+}, neurolink: NeuroLinkExecutable, emitter?: AutoresearchEmitter): Promise<TaskRunResult>;
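
For reference, a minimal caller sketch against the declarations above; the wrapper names and relative import paths are assumptions, and only the executeAutoresearchTick and clearWorkerCache signatures come from the package.

// Hypothetical caller sketch; the wrapper and relative import paths are assumptions.
// Only the executeAutoresearchTick / clearWorkerCache signatures come from the .d.ts above.
import type { AutoresearchEmitter } from "../types/autoresearchTypes.js";
import type { NeuroLinkExecutable, Task, TaskRunResult } from "../types/taskTypes.js";
import { clearWorkerCache, executeAutoresearchTick } from "./autoresearchTaskExecutor.js";

export async function runOneTick(
    task: Task & { autoresearch?: unknown },
    neurolink: NeuroLinkExecutable,
    emitter?: AutoresearchEmitter,
): Promise<TaskRunResult> {
    // One tick: the executor loads or creates the worker for the task's tag, applies the
    // phase-appropriate tool filter, calls the model, and returns a TaskRunResult
    // (an error result rather than a throw when the task has no autoresearch config).
    return executeAutoresearchTick(task, neurolink, emitter);
}

// On scheduler shutdown, drop cached ResearchWorkers so stale state is not reused.
export function shutdownAutoresearch(): void {
    clearWorkerCache();
}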