agent.libx.js 0.93.1 → 0.93.4
This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- package/cli/cli.ts +1 -1
- package/dist/cli.js +35 -4
- package/dist/cli.js.map +1 -1
- package/dist/index.d.ts +7 -0
- package/dist/index.js +34 -3
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/cli/cli.ts
CHANGED
|
@@ -208,7 +208,7 @@ Flags:
|
|
|
208
208
|
impulsive reactions, human pacing (implies --duplex; aliases: --convo, --voice)
|
|
209
209
|
with SONIOX_API_KEY + CARTESIA_API_KEY(+VOICE_ID) set: real voice I/O — mic in,
|
|
210
210
|
spoken replies out (echo-cancelled; speak over it to interrupt)
|
|
211
|
-
--voice-model <id> with --duplex: the fast voice model (default
|
|
211
|
+
--voice-model <id> with --duplex: the fast voice model (default groq/openai/gpt-oss-20b)
|
|
212
212
|
--add-dir <path> mount another directory into the workspace (repeatable; disk mode only)
|
|
213
213
|
--subagents allow the Task tool (spawn child agents)
|
|
214
214
|
--reasoning <e> extended thinking: off|low|medium|high or a token budget (anthropic/openai)
|
package/dist/cli.js
CHANGED
|
@@ -3501,7 +3501,7 @@ var DuplexAgentOptions = class {
|
|
|
3501
3501
|
ai;
|
|
3502
3502
|
/** The WORKER's filesystem. If omitted the worker keeps Agent's jailed-disk-at-cwd default. */
|
|
3503
3503
|
fs;
|
|
3504
|
-
voiceModel = "
|
|
3504
|
+
voiceModel = "groq/openai/gpt-oss-20b";
|
|
3505
3505
|
workerModel = "anthropic/claude-sonnet-4-6";
|
|
3506
3506
|
/** Escape hatches merged over the derived per-agent options. */
|
|
3507
3507
|
voiceOptions;
|
|
@@ -4003,6 +4003,8 @@ var VoiceEngine = class {
|
|
|
4003
4003
|
lastOverlapPartial = "";
|
|
4004
4004
|
// change-detection: only NEW partial text counts as activity
|
|
4005
4005
|
resumeTimer = null;
|
|
4006
|
+
turnStartAt = 0;
|
|
4007
|
+
// timestamp when the current turn began (for TTFT logging)
|
|
4006
4008
|
constructor(options) {
|
|
4007
4009
|
this.options = { ...new VoiceEngineOptions(), ...options };
|
|
4008
4010
|
const o = this.options;
|
|
@@ -4063,6 +4065,7 @@ var VoiceEngine = class {
|
|
|
4063
4065
|
this.spokeDeltas = true;
|
|
4064
4066
|
this.ackAt = now();
|
|
4065
4067
|
}
|
|
4068
|
+
this.turnStartAt = now();
|
|
4066
4069
|
this.setState("thinking");
|
|
4067
4070
|
}
|
|
4068
4071
|
speakDelta(text) {
|
|
@@ -4071,6 +4074,7 @@ var VoiceEngine = class {
|
|
|
4071
4074
|
this.reply += text;
|
|
4072
4075
|
for (const w of this.words(this.reply)) this.echoWords.add(w);
|
|
4073
4076
|
this.tts.speak(text, true);
|
|
4077
|
+
if (!this.spokeDeltas && this.turnStartAt) log7.info(`ttft: ${Math.round(now() - this.turnStartAt)}ms`);
|
|
4074
4078
|
this.spokeDeltas = true;
|
|
4075
4079
|
this.setState("speaking");
|
|
4076
4080
|
}
|
|
@@ -4091,6 +4095,7 @@ var VoiceEngine = class {
|
|
|
4091
4095
|
}
|
|
4092
4096
|
this.drainTimer = null;
|
|
4093
4097
|
this.speaking = false;
|
|
4098
|
+
if (this.turnStartAt) log7.info(`turn: ${Math.round(now() - this.turnStartAt)}ms (incl. playback)`);
|
|
4094
4099
|
this.echoUntil = now() + 2500;
|
|
4095
4100
|
if (!this.usingAec) this.stt.reset();
|
|
4096
4101
|
this.setState("listening");
|
|
@@ -4127,7 +4132,7 @@ var VoiceEngine = class {
|
|
|
4127
4132
|
this.ctxOpen = false;
|
|
4128
4133
|
this.interrupted = true;
|
|
4129
4134
|
this.suspectUntil = 0;
|
|
4130
|
-
this.echoUntil = now() + 2500;
|
|
4135
|
+
this.echoUntil = now() + Math.max(2500, this.player.drainMs() + 3e3);
|
|
4131
4136
|
this.tts.cancel();
|
|
4132
4137
|
this.player.kill();
|
|
4133
4138
|
if (!this.usingAec) this.stt.reset();
|
|
@@ -4322,6 +4327,8 @@ var SonioxSTT = class {
|
|
|
4322
4327
|
lastChangeAt = 0;
|
|
4323
4328
|
lastCombined = "";
|
|
4324
4329
|
endpointTimer = null;
|
|
4330
|
+
firstTokenAt = 0;
|
|
4331
|
+
// first speech token in current utterance
|
|
4325
4332
|
constructor(options) {
|
|
4326
4333
|
this.options = { ...new SonioxSTTOptions(), ...options };
|
|
4327
4334
|
}
|
|
@@ -4361,6 +4368,7 @@ var SonioxSTT = class {
|
|
|
4361
4368
|
this.endpointTimer = setInterval(() => {
|
|
4362
4369
|
const combined = (this.finalText + this.partialText).trim();
|
|
4363
4370
|
if (!combined || now2() - this.lastChangeAt < this.options.silenceEndpointMs) return;
|
|
4371
|
+
if (this.firstTokenAt) log8.info(`stt: ${Math.round(now2() - this.firstTokenAt)}ms first-token\u2192silence-endpoint, "${combined.slice(0, 60)}"`);
|
|
4364
4372
|
this.reset();
|
|
4365
4373
|
this.onUtterance(combined, now2());
|
|
4366
4374
|
}, 120);
|
|
@@ -4388,10 +4396,12 @@ var SonioxSTT = class {
|
|
|
4388
4396
|
if (combined !== this.lastCombined) {
|
|
4389
4397
|
this.lastCombined = combined;
|
|
4390
4398
|
this.lastChangeAt = now2();
|
|
4399
|
+
if (!this.firstTokenAt && combined.trim()) this.firstTokenAt = now2();
|
|
4391
4400
|
}
|
|
4392
4401
|
this.onPartial(combined);
|
|
4393
4402
|
if (endpoint && this.finalText.trim()) {
|
|
4394
4403
|
const utterance = this.finalText.trim();
|
|
4404
|
+
if (this.firstTokenAt) log8.info(`stt: ${Math.round(now2() - this.firstTokenAt)}ms first-token\u2192endpoint, "${utterance.slice(0, 60)}"`);
|
|
4395
4405
|
this.reset();
|
|
4396
4406
|
this.onUtterance(utterance, now2());
|
|
4397
4407
|
}
|
|
@@ -4400,6 +4410,7 @@ var SonioxSTT = class {
|
|
|
4400
4410
|
this.finalText = "";
|
|
4401
4411
|
this.partialText = "";
|
|
4402
4412
|
this.lastCombined = "";
|
|
4413
|
+
this.firstTokenAt = 0;
|
|
4403
4414
|
}
|
|
4404
4415
|
stop() {
|
|
4405
4416
|
this.stopped = true;
|
|
@@ -4434,7 +4445,15 @@ var CartesiaTTS = class {
|
|
|
4434
4445
|
constructor(options) {
|
|
4435
4446
|
this.options = { ...new CartesiaTTSOptions(), ...options };
|
|
4436
4447
|
}
|
|
4448
|
+
closed = false;
|
|
4449
|
+
connecting = null;
|
|
4437
4450
|
async connect() {
|
|
4451
|
+
this.closed = false;
|
|
4452
|
+
this.connecting = this.doConnect();
|
|
4453
|
+
await this.connecting;
|
|
4454
|
+
this.connecting = null;
|
|
4455
|
+
}
|
|
4456
|
+
async doConnect() {
|
|
4438
4457
|
const key = await resolveAuth(this.options.auth);
|
|
4439
4458
|
const param = this.options.authMode === "token" ? "access_token" : "api_key";
|
|
4440
4459
|
this.ws = new WebSocket(`wss://api.cartesia.ai/tts/websocket?cartesia_version=2026-03-01&${param}=${key}`);
|
|
@@ -4442,7 +4461,12 @@ var CartesiaTTS = class {
|
|
|
4442
4461
|
this.ws.onopen = () => res();
|
|
4443
4462
|
this.ws.onerror = (e) => rej(new Error(`cartesia ws: ${e.message || "connect failed"}`));
|
|
4444
4463
|
});
|
|
4445
|
-
this.ws.onclose = (ev) =>
|
|
4464
|
+
this.ws.onclose = (ev) => {
|
|
4465
|
+
log9.warn(`cartesia ws closed (${ev.code} ${ev.reason || ""})`);
|
|
4466
|
+
if (!this.closed) {
|
|
4467
|
+
this.connecting = this.doConnect().catch((e) => log9.error(`cartesia reconnect failed: ${e.message}`));
|
|
4468
|
+
}
|
|
4469
|
+
};
|
|
4446
4470
|
this.ws.onmessage = (ev) => {
|
|
4447
4471
|
const m = JSON.parse(String(ev.data));
|
|
4448
4472
|
if (m.context_id && m.context_id !== this.ctxId) return;
|
|
@@ -4453,6 +4477,11 @@ var CartesiaTTS = class {
|
|
|
4453
4477
|
else if (m.type === "error" && !/already been cancelled|does not exist/.test(m.message || "")) log9.warn(`cartesia: ${JSON.stringify(m)}`);
|
|
4454
4478
|
};
|
|
4455
4479
|
}
|
|
4480
|
+
/** Ensure the WS is open before sending — reconnects if idle-closed. */
|
|
4481
|
+
async ensureConnected() {
|
|
4482
|
+
if (this.connecting) await this.connecting;
|
|
4483
|
+
if (this.ws?.readyState !== WebSocket.OPEN) await this.connect();
|
|
4484
|
+
}
|
|
4456
4485
|
newContext() {
|
|
4457
4486
|
this.ctxId = `ctx-${++this.ctxSeq}`;
|
|
4458
4487
|
this.firstAudioAt = 0;
|
|
@@ -4470,6 +4499,7 @@ var CartesiaTTS = class {
|
|
|
4470
4499
|
}
|
|
4471
4500
|
speak(text, cont) {
|
|
4472
4501
|
if (this.ws?.readyState === WebSocket.OPEN) this.ws.send(this.frame(text, cont));
|
|
4502
|
+
else void this.ensureConnected().then(() => this.ws?.readyState === WebSocket.OPEN && this.ws.send(this.frame(text, cont)));
|
|
4473
4503
|
}
|
|
4474
4504
|
end() {
|
|
4475
4505
|
if (this.ws?.readyState === WebSocket.OPEN) this.ws.send(this.frame("", false));
|
|
@@ -4478,6 +4508,7 @@ var CartesiaTTS = class {
|
|
|
4478
4508
|
if (this.ws?.readyState === WebSocket.OPEN) this.ws.send(JSON.stringify({ context_id: this.ctxId, cancel: true }));
|
|
4479
4509
|
}
|
|
4480
4510
|
close() {
|
|
4511
|
+
this.closed = true;
|
|
4481
4512
|
if (this.ws) this.ws.onclose = null;
|
|
4482
4513
|
this.ws?.close();
|
|
4483
4514
|
}
|
|
@@ -7385,7 +7416,7 @@ Flags:
|
|
|
7385
7416
|
impulsive reactions, human pacing (implies --duplex; aliases: --convo, --voice)
|
|
7386
7417
|
with SONIOX_API_KEY + CARTESIA_API_KEY(+VOICE_ID) set: real voice I/O \u2014 mic in,
|
|
7387
7418
|
spoken replies out (echo-cancelled; speak over it to interrupt)
|
|
7388
|
-
--voice-model <id> with --duplex: the fast voice model (default
|
|
7419
|
+
--voice-model <id> with --duplex: the fast voice model (default groq/openai/gpt-oss-20b)
|
|
7389
7420
|
--add-dir <path> mount another directory into the workspace (repeatable; disk mode only)
|
|
7390
7421
|
--subagents allow the Task tool (spawn child agents)
|
|
7391
7422
|
--reasoning <e> extended thinking: off|low|medium|high or a token budget (anthropic/openai)
|