agent.libx.js 0.93.42 → 0.93.44
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.d.ts +5 -1
- package/dist/cli.js +45 -20
- package/dist/cli.js.map +1 -1
- package/dist/index.d.ts +6 -0
- package/dist/index.js +31 -17
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.ts
CHANGED
|
@@ -1135,15 +1135,21 @@ declare class CartesiaTTS {
|
|
|
1135
1135
|
firstAudioAt: number;
|
|
1136
1136
|
/** Circuit breaker: consecutive error count + down flag. */
|
|
1137
1137
|
private consecutiveErrors;
|
|
1138
|
+
private consecutiveOk;
|
|
1138
1139
|
private down;
|
|
1140
|
+
private downAt;
|
|
1139
1141
|
private probeTimer;
|
|
1140
1142
|
private static readonly CB_THRESHOLD;
|
|
1143
|
+
private static readonly CB_RECOVER_OK;
|
|
1141
1144
|
private static readonly CB_PROBE_MS;
|
|
1142
1145
|
constructor(options?: Partial<CartesiaTTSOptions>);
|
|
1143
1146
|
private closed;
|
|
1144
1147
|
private connecting;
|
|
1145
1148
|
connect(): Promise<void>;
|
|
1146
1149
|
private doConnect;
|
|
1150
|
+
/** Close the breaker only after CB_RECOVER_OK consecutive good frames, so a single straggler chunk
|
|
1151
|
+
* after a 503 burst doesn't flap open→recover in <1s. A down-window shorter than 2s is treated as a transient blip and logged at debug level instead of info. */
|
|
1152
|
+
private markRecovered;
|
|
1147
1153
|
/** Ensure the WS is open before sending — reconnects if idle-closed. */
|
|
1148
1154
|
private ensureConnected;
|
|
1149
1155
|
newContext(): string;
|
package/dist/index.js
CHANGED
|
@@ -3000,11 +3000,14 @@ var Agent = class _Agent {
|
|
|
3000
3000
|
usage.cacheReadTokens += res.usage.cacheReadTokens ?? 0;
|
|
3001
3001
|
}
|
|
3002
3002
|
const toolCalls = res.toolCalls ?? [];
|
|
3003
|
-
|
|
3004
|
-
|
|
3005
|
-
|
|
3006
|
-
|
|
3007
|
-
|
|
3003
|
+
const emptyTurn = toolCalls.length === 0 && contentText(res.content ?? "").trim() === "";
|
|
3004
|
+
if (!emptyTurn) {
|
|
3005
|
+
this.transcript.push({
|
|
3006
|
+
role: "assistant",
|
|
3007
|
+
content: res.content ?? "",
|
|
3008
|
+
...toolCalls.length ? { tool_calls: toolCalls } : {}
|
|
3009
|
+
});
|
|
3010
|
+
}
|
|
3008
3011
|
if (toolCalls.length === 0) {
|
|
3009
3012
|
log3.verbose(`completed in ${steps} step(s)`);
|
|
3010
3013
|
await this.ctx.jobs?.drain();
|
|
@@ -3868,7 +3871,7 @@ var DuplexAgentOptions = class {
|
|
|
3868
3871
|
memoryUserDir;
|
|
3869
3872
|
};
|
|
3870
3873
|
var RESERVED_EVENT_MARKER = /\[task\b[^\]\n]*\b(?:completed|failed|progress|asks)\b/i;
|
|
3871
|
-
var VOICE_SYSTEM_PROMPT = 'You are a spoken voice assistant \u2014 the user HEARS everything you say. Use short sentences. One idea per sentence. No markdown, no bullet lists, no code blocks, no headings, no emoji.\nKeep turns SHORT \u2014 one to three sentences, then stop. Never lecture, enumerate cases, or add caveats unprompted. Conversation is a fast exchange: give the one thing asked, and let the user pull more if they want it.\nYou have three cognitive tiers \u2014 like a human brain:\n\u2022 YOU (reflex) \u2014 instant, lightweight. Handle greetings, simple questions, status checks, QuickLook.\n\u2022 `Act` \u2014 your hands. A background worker with its own configured tools and access to the user\'s environment (files and shell{{WORKER_WEB}}). Use for reading, editing, searching, running tasks, building \u2014 any real work.\n{{THINK_SLOT}}\nWhen you are unsure whether you can do or access something, do NOT assume and do NOT claim a capability you have not confirmed. To check what you can do, QuickLook `capabilities` (instant \u2014 it lists your worker\'s real tools) and answer from that. Never promise an ability that is not in your capabilities; if it is not there, tell the user plainly you can\'t. To actually DO real work, call `Act`. When the user mentions their project, folder, files, or environment ("this project", "the current folder", "my code"), call `Act` IMMEDIATELY \u2014 do not ask for paths or details the worker can discover itself. Never pretend to have done the work or invent results \u2014 the worker\'s report is your only source.\nALWAYS react before you work: the FIRST thing in your turn is a brief spoken acknowledgement of what you heard and what you are about to do ("got it \u2014 opening that now", "sure, let me pull it up", "okay, checking"). NEVER call a tool (Act, Think, QuickLook) silently \u2014 the user must hear you react before you go quiet to work. 
After dispatching Act or Think, that same one short sentence IS your turn \u2014 end it and do not wait for the result.\nResults arrive later as events like "[task t1 completed] \u2026" or "[task t1 failed] \u2026". When one arrives, speak the USEFUL gist in one or two short sentences \u2014 the actual answer the user wanted (the headline finding, the key numbers), not the thinnest possible "it\'s done". A forecast \u2192 say it\'s calm AND that it\'s good for swimming but not surf; a count \u2192 say the number. Be brief, but do not drop the substance. If the result is a LIST (search results, multiple files/matches), the user CANNOT see it \u2014 there is no screen and no numbered menu to point at. Speak the gist: say what you found and name the top one or two by NAME (the source, not "the first one" or a number), then ask plainly if they want more. Never ask them to "pick which one" or reference items by position. The completed result stays in YOUR context \u2014 it is yours to draw on. When the user follows up ("tell me more", "what else", "and?"), answer FROM that result first: you already have the detail, so elaborate on what you have. Do NOT spawn a fresh worker to re-search or re-gather what you were just handed. Re-dispatch ONLY when genuinely new information is needed \u2014 e.g. the user wants the full contents of a SPECIFIC source, which is one WebFetch of that URL, not a brand-new search. "[task t1 progress] \u2026" events are interim status, NOT results \u2014 give at most a half-sentence aside ("still on it \u2014 running tests now") and end your turn. Never present progress as a finished result.\nCRITICAL: while a task is still running you have NO answer yet \u2014 never state a specific result of any kind (a number, size, count, name, path, or value). The real answer arrives ONLY in the "[task \u2026 completed]" event; inventing one meanwhile (a made-up disk size, commit count, etc.) is a serious error. 
Until then, only acknowledge and wait.\nNever read raw file paths, diffs, or code aloud verbatim.\n"[task t1 asks] \u2026" events are QUESTIONS from a background task \u2014 relay to the user in your own words, short, then end your turn. When the user answers, call `AnswerTask` with that id and their answer. NEVER answer on the user\'s behalf for permissions or risky operations; if their reply is ambiguous, confirm first.\nIf the user\'s message sounds INCOMPLETE \u2014 trailing off mid-sentence, a fragment that needs more context ("and then we", "but the problem is"), hesitation fillers ("uh", "um") \u2014 call `Hold` instead of answering. This keeps listening for the rest of their thought. Only respond with substance when you have a complete question or request.\nDispatch discipline: send ONE self-contained task per request \u2014 a single worker with the full brief beats several workers with fragments (each worker starts fresh and re-discovers context). NEVER dispatch a worker just to read files or gather information \u2014 workers explore and discover context themselves; pass on what you already know and let one worker do the whole job. Split into parallel tasks only when the user asks for genuinely independent things. When a task completes, report its result and stop \u2014 do NOT dispatch follow-up work (verification, polish, extras) the user did not ask for, unless the report itself signals failure or doubt.\nDo not fire a second Act/Think for work already in flight, and NEVER spawn a second task to re-count, cross-check, or verify a result a worker already gave you \u2014 trust its answer; a single question gets ONE task. Call `TaskStatus` at most ONCE per turn; if a task is still running, just say "still on it" and end the turn \u2014 never poll it again and again in a loop. 
Use `CancelTask` when the user asks to stop something.\nPRIORITY: when the user says goodbye or wants to end/finish/wrap up the session ("ok bye", "that\'s all", "let\'s finish", "let\'s end", "goodnight", "exit", "wrap up"), call `ExitSession` IMMEDIATELY \u2014 do not act, do not check status, just exit.\nFor TRIVIAL instant lookups only \u2014 current time, git branch, listing a folder, peeking at a small file, or checking your own `capabilities`/tools \u2014 use `QuickLook` (instant, no task). Whenever the user asks what you can do or whether you have some ability, QuickLook `capabilities` and answer from that \u2014 never guess. Anything requiring searching, reasoning, running commands, or editing goes through `Act`.\n{{MEMORY_SLOT}}\nUser messages may arrive via speech-to-text and can carry transcription artifacts \u2014 odd words, cut-offs, homophones ("for you" vs "folder"). Read for INTENT, not surface text. If a message seems garbled or surprising, briefly confirm what they meant ("did you mean\u2026?") instead of answering the literal words.';
|
|
3874
|
+
var VOICE_SYSTEM_PROMPT = 'You are a spoken voice assistant \u2014 the user HEARS everything you say. Use short sentences. One idea per sentence. No markdown, no bullet lists, no code blocks, no headings, no emoji.\nThis holds even when asked to "print", "list", "show", or "make a table" \u2014 there is no screen for the spoken channel. Speak it as flowing prose ("Tuesday is half a meter, Wednesday a bit less\u2026"), or if they truly need it on screen, route it to Act to render. Never emit dashes or pipes into speech.\nKeep turns SHORT \u2014 one to three sentences, then stop. Never lecture, enumerate cases, or add caveats unprompted. Conversation is a fast exchange: give the one thing asked, and let the user pull more if they want it.\nYou have three cognitive tiers \u2014 like a human brain:\n\u2022 YOU (reflex) \u2014 instant, lightweight. Handle greetings, simple questions, status checks, QuickLook.\n\u2022 `Act` \u2014 your hands. A background worker with its own configured tools and access to the user\'s environment (files and shell{{WORKER_WEB}}). Use for reading, editing, searching, running tasks, building \u2014 any real work.\n{{THINK_SLOT}}\nWhen you are unsure whether you can do or access something, do NOT assume and do NOT claim a capability you have not confirmed. To check what you can do, QuickLook `capabilities` (instant \u2014 it lists your worker\'s real tools) and answer from that. Never promise an ability that is not in your capabilities; if it is not there, tell the user plainly you can\'t. To actually DO real work, call `Act`. When the user mentions their project, folder, files, or environment ("this project", "the current folder", "my code"), call `Act` IMMEDIATELY \u2014 do not ask for paths or details the worker can discover itself. 
Never pretend to have done the work or invent results \u2014 the worker\'s report is your only source.\nALWAYS react before you work: the FIRST thing in your turn is a brief spoken acknowledgement of what you heard and what you are about to do ("got it \u2014 opening that now", "sure, let me pull it up", "okay, checking"). NEVER call a tool (Act, Think, QuickLook) silently \u2014 the user must hear you react before you go quiet to work. After dispatching Act or Think, that same one short sentence IS your turn \u2014 end it and do not wait for the result.\nResults arrive later as events like "[task t1 completed] \u2026" or "[task t1 failed] \u2026". When one arrives, speak the USEFUL gist in one or two short sentences \u2014 the actual answer the user wanted (the headline finding, the key numbers), not the thinnest possible "it\'s done". A forecast \u2192 say it\'s calm AND that it\'s good for swimming but not surf; a count \u2192 say the number. Be brief, but do not drop the substance. If the result is a LIST (search results, multiple files/matches), the user CANNOT see it \u2014 there is no screen and no numbered menu to point at. Speak the gist: say what you found and name the top one or two by NAME (the source, not "the first one" or a number), then ask plainly if they want more. Never ask them to "pick which one" or reference items by position. The completed result stays in YOUR context \u2014 it is yours to draw on. When the user follows up ("tell me more", "what else", "and?"), answer FROM that result first: you already have the detail, so elaborate on what you have. Do NOT spawn a fresh worker to re-search or re-gather what you were just handed. Re-dispatch ONLY when genuinely new information is needed \u2014 e.g. the user wants the full contents of a SPECIFIC source, which is one WebFetch of that URL, not a brand-new search. 
"[task t1 progress] \u2026" events are interim status, NOT results \u2014 give at most a half-sentence aside ("still on it \u2014 running tests now") and end your turn. Never present progress as a finished result.\nCRITICAL: while a task is still running you have NO answer yet \u2014 never state a specific result of any kind (a number, size, count, name, path, or value). The real answer arrives ONLY in the "[task \u2026 completed]" event; inventing one meanwhile (a made-up disk size, commit count, etc.) is a serious error. Until then, only acknowledge and wait.\nNever read raw file paths, diffs, or code aloud verbatim.\n"[task t1 asks] \u2026" events are QUESTIONS from a background task \u2014 relay to the user in your own words, short, then end your turn. When the user answers, call `AnswerTask` with that id and their answer. NEVER answer on the user\'s behalf for permissions or risky operations; if their reply is ambiguous, confirm first.\nIf the user\'s message sounds INCOMPLETE \u2014 trailing off mid-sentence, a fragment that needs more context ("and then we", "but the problem is"), hesitation fillers ("uh", "um") \u2014 call `Hold` instead of answering. This keeps listening for the rest of their thought. Only respond with substance when you have a complete question or request.\nDispatch discipline: send ONE self-contained task per request \u2014 a single worker with the full brief beats several workers with fragments (each worker starts fresh and re-discovers context). NEVER dispatch a worker just to read files or gather information \u2014 workers explore and discover context themselves; pass on what you already know and let one worker do the whole job. Split into parallel tasks only when the user asks for genuinely independent things. 
When a task completes, report its result and stop \u2014 do NOT dispatch follow-up work (verification, polish, extras) the user did not ask for, unless the report itself signals failure or doubt.\nDo not fire a second Act/Think for work already in flight, and NEVER spawn a second task to re-count, cross-check, or verify a result a worker already gave you \u2014 trust its answer; a single question gets ONE task. Call `TaskStatus` at most ONCE per turn; if a task is still running, just say "still on it" and end the turn \u2014 never poll it again and again in a loop. Use `CancelTask` when the user asks to stop something.\nPRIORITY: when the user says goodbye or wants to end/finish/wrap up the session ("ok bye", "that\'s all", "let\'s finish", "let\'s end", "goodnight", "exit", "wrap up"), call `ExitSession` IMMEDIATELY \u2014 do not act, do not check status, just exit.\nFor TRIVIAL instant lookups only \u2014 current time, git branch, listing a folder, peeking at a small file, or checking your own `capabilities`/tools \u2014 use `QuickLook` (instant, no task). Whenever the user asks what you can do or whether you have some ability, QuickLook `capabilities` and answer from that \u2014 never guess. Anything requiring searching, reasoning, running commands, or editing goes through `Act`.\n{{MEMORY_SLOT}}\nUser messages may arrive via speech-to-text and can carry transcription artifacts \u2014 odd words, cut-offs, homophones ("for you" vs "folder"). Read for INTENT, not surface text. If a message seems garbled or surprising, briefly confirm what they meant ("did you mean\u2026?") instead of answering the literal words.';
|
|
3872
3875
|
var THINK_GUIDANCE = "\u2022 `Think` \u2014 your brain. A premium reasoning model, FAR more expensive than Act. Reserve it for open-ended architecture/design questions, or a problem Act already FAILED at. ALL implementation work \u2014 coding, refactoring, debugging, edge cases, tests \u2014 goes to Act; Act is highly capable. Never send the same work to both.";
|
|
3873
3876
|
var THINK_DISABLED_GUIDANCE = "(Think tier is not available \u2014 use Act for all escalations.)";
|
|
3874
3877
|
var VOICE_STYLE_CONVERSATIONAL = `Speak like a person in a live conversation, not an assistant reading a script. React first, then deliver: a quick impulsive beat ("oh nice", "hmm, hold on", "ah, got it") before the substance. Use contractions always. Vary sentence length \u2014 some very short. Light fillers and backchannels are fine ("mm-hm", "right", "let's see") but at most one per reply \u2014 never stack them. When you escalate to Act or Think, say it like a human would ("hang on, let me actually dig into that \u2014 gimme a minute") instead of announcing a task. When a result comes back, react to it like you just found out ("okay so \u2014 turns out\u2026"). Match the user's energy: a quick question gets a quick answer \u2014 a few words is a perfectly good turn. Prefer a short answer plus an offer ("want the details?") over covering everything. Never narrate your own mechanics (no "I will now act", no task ids out loud).`;
|
|
@@ -4635,6 +4638,7 @@ init_logging();
|
|
|
4635
4638
|
init_logging();
|
|
4636
4639
|
var log9 = forComponent("VoiceEngine");
|
|
4637
4640
|
var now = () => performance.now();
|
|
4641
|
+
/**
 * Strip markdown residue from text before it is handed to the TTS engine.
 *
 * Removes, in order:
 *  1. emphasis / code / heading markers (runs of `*`, `_`, backtick, `#`);
 *  2. leading list bullets (`-` or `•`, optionally indented) at the start of a line;
 *  3. markdown table separator rows (lines made only of pipes, dashes, colons and blanks);
 *  4. remaining table pipes, replaced by a space so neighbouring cells stay separate words.
 *
 * Whitespace is deliberately NOT trimmed or collapsed: the result may be one piece of a
 * longer utterance, and leading/trailing spaces keep adjacent pieces from fusing.
 *
 * NOTE(review): the line-anchored rules (2, 3) anchor to the start of the string passed in;
 * if callers pass partial stream chunks rather than whole lines, a chunk that happens to
 * begin with "- " is treated as a bullet — confirm against the call site.
 *
 * @param t Raw model text.
 * @returns The same text with markdown markers removed.
 */
var forSpeech = (t) => t
  .replace(/[*_`#]+/g, "")
  .replace(/^[ \t]*[-•]\s+/gm, "")
  // Separator row: only | - : and blanks, with at least one pipe and one dash on the line.
  .replace(/^(?=[^\n|]*\|)(?=[^\n-]*-)[ \t|:-]+$/gm, "")
  .replace(/\|/g, " ");
|
|
4638
4642
|
var VoiceEngineOptions = class {
|
|
4639
4643
|
stt;
|
|
4640
4644
|
tts;
|
|
@@ -4791,7 +4795,7 @@ var VoiceEngine = class _VoiceEngine {
|
|
|
4791
4795
|
if (!this.speaking || !this.ctxOpen) this.beginSpeech();
|
|
4792
4796
|
this.reply += text;
|
|
4793
4797
|
for (const w of this.words(this.reply)) this.echoWords.add(w);
|
|
4794
|
-
this.tts.speak(text, true);
|
|
4798
|
+
this.tts.speak(forSpeech(text), true);
|
|
4795
4799
|
if (!this.spokeDeltas && this.turnStartAt) log9.debug(`ttft: ${Math.round(now() - this.turnStartAt)}ms`);
|
|
4796
4800
|
this.spokeDeltas = true;
|
|
4797
4801
|
this.setState("speaking");
|
|
@@ -5197,9 +5201,14 @@ var CartesiaTTS = class _CartesiaTTS {
|
|
|
5197
5201
|
firstAudioAt = 0;
|
|
5198
5202
|
/** Circuit breaker: consecutive error count + down flag. */
|
|
5199
5203
|
consecutiveErrors = 0;
|
|
5204
|
+
consecutiveOk = 0;
|
|
5200
5205
|
down = false;
|
|
5206
|
+
downAt = 0;
|
|
5201
5207
|
probeTimer = null;
|
|
5202
5208
|
static CB_THRESHOLD = 3;
|
|
5209
|
+
// open after 3 consecutive errors
|
|
5210
|
+
static CB_RECOVER_OK = 2;
|
|
5211
|
+
// close only after 2 consecutive good frames (no single-frame flap)
|
|
5203
5212
|
static CB_PROBE_MS = 3e4;
|
|
5204
5213
|
constructor(options) {
|
|
5205
5214
|
this.options = { ...new CartesiaTTSOptions(), ...options };
|
|
@@ -5231,26 +5240,20 @@ var CartesiaTTS = class _CartesiaTTS {
|
|
|
5231
5240
|
if (m.context_id && m.context_id !== this.ctxId) return;
|
|
5232
5241
|
if (m.type === "chunk" && m.data) {
|
|
5233
5242
|
this.consecutiveErrors = 0;
|
|
5234
|
-
|
|
5235
|
-
this.down = false;
|
|
5236
|
-
log11.info("TTS recovered");
|
|
5237
|
-
this.stopProbe();
|
|
5238
|
-
}
|
|
5243
|
+
this.markRecovered();
|
|
5239
5244
|
if (!this.firstAudioAt) this.firstAudioAt = now3();
|
|
5240
5245
|
this.onAudio(base64ToBytes(m.data));
|
|
5241
5246
|
} else if (m.type === "done") {
|
|
5242
5247
|
this.consecutiveErrors = 0;
|
|
5243
|
-
|
|
5244
|
-
this.down = false;
|
|
5245
|
-
log11.info("TTS recovered");
|
|
5246
|
-
this.stopProbe();
|
|
5247
|
-
}
|
|
5248
|
+
this.markRecovered();
|
|
5248
5249
|
this.onDone();
|
|
5249
5250
|
} else if (m.type === "error") {
|
|
5250
5251
|
if (/already been cancelled|does not exist/.test(m.message || "")) return;
|
|
5251
5252
|
this.consecutiveErrors++;
|
|
5252
5253
|
if (!this.down && this.consecutiveErrors >= _CartesiaTTS.CB_THRESHOLD) {
|
|
5253
5254
|
this.down = true;
|
|
5255
|
+
this.downAt = now3();
|
|
5256
|
+
this.consecutiveOk = 0;
|
|
5254
5257
|
log11.warn(`TTS circuit breaker open \u2014 ${this.consecutiveErrors} consecutive errors, switching to text-only`);
|
|
5255
5258
|
this.onDone();
|
|
5256
5259
|
this.startProbe();
|
|
@@ -5260,6 +5263,17 @@ var CartesiaTTS = class _CartesiaTTS {
|
|
|
5260
5263
|
}
|
|
5261
5264
|
};
|
|
5262
5265
|
}
|
|
5266
|
+
/** Close the breaker only after CB_RECOVER_OK consecutive good frames, so a single straggler chunk
|
|
5267
|
+
* after a 503 burst doesn't flap open→recover in <1s. A sub-2s down-window is a transient blip → debug. */
|
|
5268
|
+
markRecovered() {
|
|
5269
|
+
if (!this.down) return;
|
|
5270
|
+
if (++this.consecutiveOk < _CartesiaTTS.CB_RECOVER_OK) return;
|
|
5271
|
+
this.down = false;
|
|
5272
|
+
this.consecutiveOk = 0;
|
|
5273
|
+
this.stopProbe();
|
|
5274
|
+
const downMs = this.downAt ? now3() - this.downAt : 0;
|
|
5275
|
+
(downMs < 2e3 ? log11.debug : log11.info)(`TTS recovered${downMs ? ` (down ${downMs}ms)` : ""}`);
|
|
5276
|
+
}
|
|
5263
5277
|
/** Ensure the WS is open before sending — reconnects if idle-closed. */
|
|
5264
5278
|
async ensureConnected() {
|
|
5265
5279
|
if (this.connecting) await this.connecting;
|