agent.libx.js 0.93.1 → 0.93.4

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.
package/cli/cli.ts CHANGED
@@ -208,7 +208,7 @@ Flags:
208
208
  impulsive reactions, human pacing (implies --duplex; aliases: --convo, --voice)
209
209
  with SONIOX_API_KEY + CARTESIA_API_KEY(+VOICE_ID) set: real voice I/O — mic in,
210
210
  spoken replies out (echo-cancelled; speak over it to interrupt)
211
- --voice-model <id> with --duplex: the fast voice model (default anthropic/claude-haiku-4-5)
211
+ --voice-model <id> with --duplex: the fast voice model (default groq/openai/gpt-oss-20b)
212
212
  --add-dir <path> mount another directory into the workspace (repeatable; disk mode only)
213
213
  --subagents allow the Task tool (spawn child agents)
214
214
  --reasoning <e> extended thinking: off|low|medium|high or a token budget (anthropic/openai)
package/dist/cli.js CHANGED
@@ -3501,7 +3501,7 @@ var DuplexAgentOptions = class {
3501
3501
  ai;
3502
3502
  /** The WORKER's filesystem. If omitted the worker keeps Agent's jailed-disk-at-cwd default. */
3503
3503
  fs;
3504
- voiceModel = "anthropic/claude-haiku-4-5";
3504
+ voiceModel = "groq/openai/gpt-oss-20b";
3505
3505
  workerModel = "anthropic/claude-sonnet-4-6";
3506
3506
  /** Escape hatches merged over the derived per-agent options. */
3507
3507
  voiceOptions;
@@ -4003,6 +4003,8 @@ var VoiceEngine = class {
4003
4003
  lastOverlapPartial = "";
4004
4004
  // change-detection: only NEW partial text counts as activity
4005
4005
  resumeTimer = null;
4006
+ turnStartAt = 0;
4007
+ // timestamp when the current turn began (for TTFT logging)
4006
4008
  constructor(options) {
4007
4009
  this.options = { ...new VoiceEngineOptions(), ...options };
4008
4010
  const o = this.options;
@@ -4063,6 +4065,7 @@ var VoiceEngine = class {
4063
4065
  this.spokeDeltas = true;
4064
4066
  this.ackAt = now();
4065
4067
  }
4068
+ this.turnStartAt = now();
4066
4069
  this.setState("thinking");
4067
4070
  }
4068
4071
  speakDelta(text) {
@@ -4071,6 +4074,7 @@ var VoiceEngine = class {
4071
4074
  this.reply += text;
4072
4075
  for (const w of this.words(this.reply)) this.echoWords.add(w);
4073
4076
  this.tts.speak(text, true);
4077
+ if (!this.spokeDeltas && this.turnStartAt) log7.info(`ttft: ${Math.round(now() - this.turnStartAt)}ms`);
4074
4078
  this.spokeDeltas = true;
4075
4079
  this.setState("speaking");
4076
4080
  }
@@ -4091,6 +4095,7 @@ var VoiceEngine = class {
4091
4095
  }
4092
4096
  this.drainTimer = null;
4093
4097
  this.speaking = false;
4098
+ if (this.turnStartAt) log7.info(`turn: ${Math.round(now() - this.turnStartAt)}ms (incl. playback)`);
4094
4099
  this.echoUntil = now() + 2500;
4095
4100
  if (!this.usingAec) this.stt.reset();
4096
4101
  this.setState("listening");
@@ -4127,7 +4132,7 @@ var VoiceEngine = class {
4127
4132
  this.ctxOpen = false;
4128
4133
  this.interrupted = true;
4129
4134
  this.suspectUntil = 0;
4130
- this.echoUntil = now() + 2500;
4135
+ this.echoUntil = now() + Math.max(2500, this.player.drainMs() + 3e3);
4131
4136
  this.tts.cancel();
4132
4137
  this.player.kill();
4133
4138
  if (!this.usingAec) this.stt.reset();
@@ -4322,6 +4327,8 @@ var SonioxSTT = class {
4322
4327
  lastChangeAt = 0;
4323
4328
  lastCombined = "";
4324
4329
  endpointTimer = null;
4330
+ firstTokenAt = 0;
4331
+ // first speech token in current utterance
4325
4332
  constructor(options) {
4326
4333
  this.options = { ...new SonioxSTTOptions(), ...options };
4327
4334
  }
@@ -4361,6 +4368,7 @@ var SonioxSTT = class {
4361
4368
  this.endpointTimer = setInterval(() => {
4362
4369
  const combined = (this.finalText + this.partialText).trim();
4363
4370
  if (!combined || now2() - this.lastChangeAt < this.options.silenceEndpointMs) return;
4371
+ if (this.firstTokenAt) log8.info(`stt: ${Math.round(now2() - this.firstTokenAt)}ms first-token\u2192silence-endpoint, "${combined.slice(0, 60)}"`);
4364
4372
  this.reset();
4365
4373
  this.onUtterance(combined, now2());
4366
4374
  }, 120);
@@ -4388,10 +4396,12 @@ var SonioxSTT = class {
4388
4396
  if (combined !== this.lastCombined) {
4389
4397
  this.lastCombined = combined;
4390
4398
  this.lastChangeAt = now2();
4399
+ if (!this.firstTokenAt && combined.trim()) this.firstTokenAt = now2();
4391
4400
  }
4392
4401
  this.onPartial(combined);
4393
4402
  if (endpoint && this.finalText.trim()) {
4394
4403
  const utterance = this.finalText.trim();
4404
+ if (this.firstTokenAt) log8.info(`stt: ${Math.round(now2() - this.firstTokenAt)}ms first-token\u2192endpoint, "${utterance.slice(0, 60)}"`);
4395
4405
  this.reset();
4396
4406
  this.onUtterance(utterance, now2());
4397
4407
  }
@@ -4400,6 +4410,7 @@ var SonioxSTT = class {
4400
4410
  this.finalText = "";
4401
4411
  this.partialText = "";
4402
4412
  this.lastCombined = "";
4413
+ this.firstTokenAt = 0;
4403
4414
  }
4404
4415
  stop() {
4405
4416
  this.stopped = true;
@@ -4434,7 +4445,15 @@ var CartesiaTTS = class {
4434
4445
  constructor(options) {
4435
4446
  this.options = { ...new CartesiaTTSOptions(), ...options };
4436
4447
  }
4448
+ closed = false;
4449
+ connecting = null;
4437
4450
  async connect() {
4451
+ this.closed = false;
4452
+ this.connecting = this.doConnect();
4453
+ await this.connecting;
4454
+ this.connecting = null;
4455
+ }
4456
+ async doConnect() {
4438
4457
  const key = await resolveAuth(this.options.auth);
4439
4458
  const param = this.options.authMode === "token" ? "access_token" : "api_key";
4440
4459
  this.ws = new WebSocket(`wss://api.cartesia.ai/tts/websocket?cartesia_version=2026-03-01&${param}=${key}`);
@@ -4442,7 +4461,12 @@ var CartesiaTTS = class {
4442
4461
  this.ws.onopen = () => res();
4443
4462
  this.ws.onerror = (e) => rej(new Error(`cartesia ws: ${e.message || "connect failed"}`));
4444
4463
  });
4445
- this.ws.onclose = (ev) => log9.warn(`cartesia ws closed (${ev.code} ${ev.reason || ""})`);
4464
+ this.ws.onclose = (ev) => {
4465
+ log9.warn(`cartesia ws closed (${ev.code} ${ev.reason || ""})`);
4466
+ if (!this.closed) {
4467
+ this.connecting = this.doConnect().catch((e) => log9.error(`cartesia reconnect failed: ${e.message}`));
4468
+ }
4469
+ };
4446
4470
  this.ws.onmessage = (ev) => {
4447
4471
  const m = JSON.parse(String(ev.data));
4448
4472
  if (m.context_id && m.context_id !== this.ctxId) return;
@@ -4453,6 +4477,11 @@ var CartesiaTTS = class {
4453
4477
  else if (m.type === "error" && !/already been cancelled|does not exist/.test(m.message || "")) log9.warn(`cartesia: ${JSON.stringify(m)}`);
4454
4478
  };
4455
4479
  }
4480
+ /** Ensure the WS is open before sending — reconnects if idle-closed. */
4481
+ async ensureConnected() {
4482
+ if (this.connecting) await this.connecting;
4483
+ if (this.ws?.readyState !== WebSocket.OPEN) await this.connect();
4484
+ }
4456
4485
  newContext() {
4457
4486
  this.ctxId = `ctx-${++this.ctxSeq}`;
4458
4487
  this.firstAudioAt = 0;
@@ -4470,6 +4499,7 @@ var CartesiaTTS = class {
4470
4499
  }
4471
4500
  speak(text, cont) {
4472
4501
  if (this.ws?.readyState === WebSocket.OPEN) this.ws.send(this.frame(text, cont));
4502
+ else void this.ensureConnected().then(() => this.ws?.readyState === WebSocket.OPEN && this.ws.send(this.frame(text, cont)));
4473
4503
  }
4474
4504
  end() {
4475
4505
  if (this.ws?.readyState === WebSocket.OPEN) this.ws.send(this.frame("", false));
@@ -4478,6 +4508,7 @@ var CartesiaTTS = class {
4478
4508
  if (this.ws?.readyState === WebSocket.OPEN) this.ws.send(JSON.stringify({ context_id: this.ctxId, cancel: true }));
4479
4509
  }
4480
4510
  close() {
4511
+ this.closed = true;
4481
4512
  if (this.ws) this.ws.onclose = null;
4482
4513
  this.ws?.close();
4483
4514
  }
@@ -7385,7 +7416,7 @@ Flags:
7385
7416
  impulsive reactions, human pacing (implies --duplex; aliases: --convo, --voice)
7386
7417
  with SONIOX_API_KEY + CARTESIA_API_KEY(+VOICE_ID) set: real voice I/O \u2014 mic in,
7387
7418
  spoken replies out (echo-cancelled; speak over it to interrupt)
7388
- --voice-model <id> with --duplex: the fast voice model (default anthropic/claude-haiku-4-5)
7419
+ --voice-model <id> with --duplex: the fast voice model (default groq/openai/gpt-oss-20b)
7389
7420
  --add-dir <path> mount another directory into the workspace (repeatable; disk mode only)
7390
7421
  --subagents allow the Task tool (spawn child agents)
7391
7422
  --reasoning <e> extended thinking: off|low|medium|high or a token budget (anthropic/openai)