agent.libx.js 0.92.6 → 0.92.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -3552,12 +3552,14 @@ ${recent}` : brief;
3552
3552
  report.output(chunk);
3553
3553
  }
3554
3554
  } : base;
3555
+ const workerHost = o.host?.ask ? { ask: (q2) => o.host.ask(q2) } : void 0;
3555
3556
  const worker = new Agent({
3556
3557
  ai: o.ai,
3557
3558
  fs: o.fs,
3558
3559
  model: o.workerModel,
3559
3560
  ...o.workerOptions,
3560
3561
  // may override ai/fs/model/tools/… —
3562
+ ...workerHost ? { host: workerHost } : {},
3561
3563
  ...hooks ? { hooks } : {},
3562
3564
  signal: controller.signal
3563
3565
  // …but never the per-task cancellation signal
@@ -3805,6 +3807,18 @@ var VoiceEngineOptions = class {
3805
3807
  /** heuristic (non-AEC) energy barge-in tuning */
3806
3808
  bargeRmsMult = 2;
3807
3809
  bargeRmsFloor = 500;
3810
+ /** Overlap turn-taking (AEC tier, needs player.pause/resume) — human phone-call model:
3811
+ * onset → PAUSE (exact-sample hold, nothing lost); sustained overlap → cede (interrupt; the LLM
3812
+ * re-enters). Brief overlaps that die out (backchannels — "mm-hm", decided by DURATION, not
3813
+ * vocabulary) resume from the precise sample and are dropped. false disables. */
3814
+ overlapPause = true;
3815
+ /** sustained overlap ≥ this → cede the turn */
3816
+ overlapSustainMs = 350;
3817
+ /** quiet for this long while paused → resume, drop the interjection */
3818
+ overlapResumeMs = 700;
3819
+ /** energy floor for "overlap candidate" — must sit ABOVE typical room ambient (~110 rms measured;
3820
+ * ungated ambient re-arming the resume timer forever was a live wedge). User speech ≫ 300. */
3821
+ overlapRms = 300;
3808
3822
  };
3809
3823
  var VoiceEngine = class {
3810
3824
  options;
@@ -3835,6 +3849,13 @@ var VoiceEngine = class {
3835
3849
  // endpointed text held for the merge window
3836
3850
  pendingTimer = null;
3837
3851
  lastInterrupted = null;
3852
+ // overlap (pause) tier state — AEC + pause-capable sinks only
3853
+ pausedAt = 0;
3854
+ overlapLoud = 0;
3855
+ // loud chunks since pause (sustain must be real sound, not two clicks)
3856
+ overlapLastLoudAt = 0;
3857
+ // continuity guard: a gap re-arms the onset (sparse noise ≠ sustained speech)
3858
+ resumeTimer = null;
3838
3859
  constructor(options) {
3839
3860
  this.options = { ...new VoiceEngineOptions(), ...options };
3840
3861
  const o = this.options;
@@ -3882,6 +3903,7 @@ var VoiceEngine = class {
3882
3903
  this.drainTimer = null;
3883
3904
  }
3884
3905
  this.interrupted = false;
3906
+ this.resetOverlap(true);
3885
3907
  if (!this.speaking) this.player.markTurn();
3886
3908
  this.speaking = true;
3887
3909
  this.ctxOpen = true;
@@ -3916,6 +3938,10 @@ var VoiceEngine = class {
3916
3938
  this.drainTimer = null;
3917
3939
  return;
3918
3940
  }
3941
+ if (this.pausedAt) {
3942
+ this.drainTimer = setTimeout(settle, 250);
3943
+ return;
3944
+ }
3919
3945
  this.drainTimer = null;
3920
3946
  this.speaking = false;
3921
3947
  this.echoUntil = now() + 2500;
@@ -3947,6 +3973,7 @@ var VoiceEngine = class {
3947
3973
  clearTimeout(this.drainTimer);
3948
3974
  this.drainTimer = null;
3949
3975
  }
3976
+ this.resetOverlap(false);
3950
3977
  const heardChars = Math.round(Math.max(0, this.player.playedMs()) / 1e3 * 15);
3951
3978
  if (this.reply) this.lastInterrupted = { full: this.reply, heard: this.reply.slice(0, heardChars) };
3952
3979
  this.speaking = false;
@@ -3961,6 +3988,7 @@ var VoiceEngine = class {
3961
3988
  this.setState("listening");
3962
3989
  }
3963
3990
  stop() {
3991
+ if (this.resumeTimer) clearTimeout(this.resumeTimer);
3964
3992
  if (this.pendingTimer) clearTimeout(this.pendingTimer);
3965
3993
  if (this.drainTimer) clearTimeout(this.drainTimer);
3966
3994
  this.stt.stop();
@@ -3986,12 +4014,11 @@ var VoiceEngine = class {
3986
4014
  genuine(text) {
3987
4015
  const total = this.words(text).length;
3988
4016
  const novel = this.novelWords(text).length;
3989
- if (!novel) return false;
3990
- return novel >= 2 || novel / Math.max(1, total) > 0.5;
4017
+ return novel > 0 && novel / Math.max(1, total) > 0.5;
3991
4018
  }
3992
4019
  handlePartial(text) {
3993
4020
  if (this.speaking) {
3994
- const barge = this.usingAec ? this.genuine(text) : this.novelWords(text).length >= (this.suspectUntil ? 1 : 2);
4021
+ const barge = this.overlapCapable ? false : this.usingAec ? this.genuine(text) : this.novelWords(text).length >= (this.suspectUntil ? 1 : 2);
3995
4022
  if (barge) {
3996
4023
  const phase = this.ctxOpen ? "speaking" : "drain";
3997
4024
  this.interrupt();
@@ -4008,6 +4035,10 @@ var VoiceEngine = class {
4008
4035
  if (!this.echoActive() || (this.usingAec ? this.genuine(text) : this.novelWords(text).length >= 1)) this.options.onPartial(text);
4009
4036
  }
4010
4037
  handleUtterance(text) {
4038
+ if (this.speaking && this.ctxOpen && this.overlapCapable) {
4039
+ this.stt.reset();
4040
+ return;
4041
+ }
4011
4042
  if (this.echoActive() && (this.usingAec ? !this.genuine(text) : this.novelWords(text).length < 2)) {
4012
4043
  this.stt.reset();
4013
4044
  return;
@@ -4031,9 +4062,62 @@ var VoiceEngine = class {
4031
4062
  this.pendingUtt = "";
4032
4063
  if (text) this.options.onUtterance(text);
4033
4064
  }
4065
+ get overlapCapable() {
4066
+ return this.usingAec && this.options.overlapPause && !!this.player.pause && !!this.player.resume;
4067
+ }
4068
+ /** Overlap turn-taking (AEC tier): onset → pause (exact-sample hold); sustained → cede; died out
4069
+ * → resume. No vocabulary anywhere — duration and persistence decide (backchannels are short
4070
+ * and stop). Nothing is lost across a pause, so a false positive costs only a brief hold. */
4071
+ handleOverlap(rms) {
4072
+ const o = this.options;
4073
+ if (!this.speaking || !this.overlapCapable) return;
4074
+ if (rms < o.overlapRms) return;
4075
+ const t = now();
4076
+ if (!this.pausedAt) {
4077
+ this.pausedAt = t;
4078
+ this.overlapLoud = 1;
4079
+ this.overlapLastLoudAt = t;
4080
+ this.player.pause();
4081
+ this.armResume();
4082
+ return;
4083
+ }
4084
+ if (t - this.overlapLastLoudAt > 300) {
4085
+ this.pausedAt = t;
4086
+ this.overlapLoud = 1;
4087
+ this.overlapLastLoudAt = t;
4088
+ this.armResume();
4089
+ return;
4090
+ }
4091
+ this.overlapLastLoudAt = t;
4092
+ this.overlapLoud++;
4093
+ if (t - this.pausedAt >= o.overlapSustainMs && this.overlapLoud >= 4) {
4094
+ const phase = this.ctxOpen ? "speaking" : "drain";
4095
+ this.interrupt();
4096
+ this.options.onBargeIn(phase);
4097
+ return;
4098
+ }
4099
+ this.armResume();
4100
+ }
4101
+ armResume() {
4102
+ if (this.resumeTimer) clearTimeout(this.resumeTimer);
4103
+ this.resumeTimer = setTimeout(() => {
4104
+ this.resumeTimer = null;
4105
+ if (!this.pausedAt) return;
4106
+ this.resetOverlap(true);
4107
+ }, this.options.overlapResumeMs);
4108
+ }
4109
+ resetOverlap(resume) {
4110
+ if (this.resumeTimer) {
4111
+ clearTimeout(this.resumeTimer);
4112
+ this.resumeTimer = null;
4113
+ }
4114
+ if (this.pausedAt && resume) this.player.resume?.();
4115
+ this.pausedAt = 0;
4116
+ this.overlapLoud = 0;
4117
+ }
4034
4118
  /** energy two-stage barge-in (heuristic tier only): spike over echo baseline → pause + confirm via STT */
4035
4119
  handleLevel(rms) {
4036
- if (this.usingAec) return;
4120
+ if (this.usingAec) return this.handleOverlap(rms);
4037
4121
  if (!this.speaking) {
4038
4122
  this.baseline = 0;
4039
4123
  this.hot = 0;
@@ -4983,6 +5067,102 @@ var NodeMicSource = class {
4983
5067
  }, 500).unref?.();
4984
5068
  }
4985
5069
  };
5070
+ var AecDuplexAudio = class {
5071
+ constructor(bin) {
5072
+ this.bin = bin;
5073
+ }
5074
+ bin;
5075
+ aec = true;
5076
+ proc = null;
5077
+ stopped = false;
5078
+ bytesWritten = 0;
5079
+ startedAt = 0;
5080
+ // --- AudioSource ---
5081
+ start(onChunk) {
5082
+ this.proc = spawn2(this.bin, [], { stdio: ["pipe", "pipe", "ignore"] });
5083
+ this.proc.stdin.on("error", () => {
5084
+ });
5085
+ this.proc.on("exit", (c) => {
5086
+ if (c && !this.stopped) log12.error(`aec duplex audio exited (${c}) \u2014 check mic permission / MIC_AEC=0`);
5087
+ });
5088
+ this.proc.stdout.on("data", (chunk) => onChunk(chunk));
5089
+ }
5090
+ stop() {
5091
+ this.stopped = true;
5092
+ const p = this.proc;
5093
+ this.proc = null;
5094
+ if (!p) return;
5095
+ p.kill("SIGTERM");
5096
+ setTimeout(() => {
5097
+ try {
5098
+ p.kill("SIGKILL");
5099
+ } catch {
5100
+ }
5101
+ }, 500).unref?.();
5102
+ }
5103
+ // --- AudioSink (frame writer; same played/drain byte-math as the ffplay Player) ---
5104
+ frame(payload) {
5105
+ const stdin = this.proc?.stdin;
5106
+ if (!stdin || stdin.destroyed) return;
5107
+ const hdr = Buffer.alloc(4);
5108
+ hdr.writeUInt32LE(payload ? payload.length : 0);
5109
+ stdin.write(hdr);
5110
+ if (payload?.length) stdin.write(payload);
5111
+ }
5112
+ markTurn() {
5113
+ this.frame(null);
5114
+ this.bytesWritten = 0;
5115
+ this.startedAt = 0;
5116
+ this.pausedSince = 0;
5117
+ this.pausedAccum = 0;
5118
+ }
5119
+ write(chunk) {
5120
+ if (!this.startedAt) this.startedAt = now4();
5121
+ this.bytesWritten += chunk.length;
5122
+ this.frame(chunk);
5123
+ }
5124
+ playedMs() {
5125
+ return this.startedAt ? now4() - this.startedAt - this.pausedMs() : 0;
5126
+ }
5127
+ drainMs() {
5128
+ if (!this.startedAt) return 0;
5129
+ const queuedMs = this.bytesWritten / (TTS_SAMPLE_RATE * 2) * 1e3;
5130
+ return Math.max(0, queuedMs - (now4() - this.startedAt - this.pausedMs()));
5131
+ }
5132
+ /** barge-in: silence NOW (in-band flush) — the capture side keeps running */
5133
+ kill() {
5134
+ this.frame(null);
5135
+ this.bytesWritten = 0;
5136
+ this.startedAt = 0;
5137
+ this.pausedSince = 0;
5138
+ this.pausedAccum = 0;
5139
+ }
5140
+ /** overlap trail-off: exact-sample PAUSE (len==0xFFFFFFFF) / RESUME (len==0xFFFFFFFE) frames */
5141
+ pausedSince = 0;
5142
+ pausedAccum = 0;
5143
+ ctl(code) {
5144
+ const stdin = this.proc?.stdin;
5145
+ if (!stdin || stdin.destroyed) return;
5146
+ const f = Buffer.alloc(4);
5147
+ f.writeUInt32LE(code, 0);
5148
+ stdin.write(f);
5149
+ }
5150
+ pause() {
5151
+ if (this.pausedSince) return;
5152
+ this.pausedSince = now4();
5153
+ this.ctl(4294967295);
5154
+ }
5155
+ resume() {
5156
+ if (!this.pausedSince) return;
5157
+ this.pausedAccum += now4() - this.pausedSince;
5158
+ this.pausedSince = 0;
5159
+ this.ctl(4294967294);
5160
+ }
5161
+ /** total paused time this turn — excluded from played/drain math (the tape held still) */
5162
+ pausedMs() {
5163
+ return this.pausedAccum + (this.pausedSince ? now4() - this.pausedSince : 0);
5164
+ }
5165
+ };
4986
5166
  var VoiceIOOptions = class extends VoiceEngineOptions {
4987
5167
  sonioxApiKey = process.env.SONIOX_API_KEY ?? "";
4988
5168
  cartesiaApiKey = process.env.CARTESIA_API_KEY ?? "";
@@ -4991,11 +5171,13 @@ var VoiceIOOptions = class extends VoiceEngineOptions {
4991
5171
  var VoiceIO = class extends VoiceEngine {
4992
5172
  constructor(options) {
4993
5173
  const o = { ...new VoiceIOOptions(), ...options };
5174
+ const bin = !o.stt || !o.player ? resolveAecBinary() : null;
5175
+ const duplex = bin ? new AecDuplexAudio(bin) : null;
4994
5176
  super({
4995
5177
  ...o,
4996
- stt: o.stt ?? new SonioxSTT({ auth: o.sonioxApiKey, source: new NodeMicSource() }),
5178
+ stt: o.stt ?? new SonioxSTT({ auth: o.sonioxApiKey, source: duplex ?? new NodeMicSource() }),
4997
5179
  tts: o.tts ?? new CartesiaTTS({ auth: o.cartesiaApiKey, voiceId: o.cartesiaVoiceId }),
4998
- player: o.player ?? new Player(),
5180
+ player: o.player ?? duplex ?? new Player(),
4999
5181
  bargeRmsMult: Number(process.env.BARGE_RMS_MULT || o.bargeRmsMult),
5000
5182
  bargeRmsFloor: Number(process.env.BARGE_RMS_FLOOR || o.bargeRmsFloor)
5001
5183
  });