agent.libx.js 0.92.5 → 0.92.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -3552,12 +3552,14 @@ ${recent}` : brief;
3552
3552
  report.output(chunk);
3553
3553
  }
3554
3554
  } : base;
3555
+ const workerHost = o.host?.ask ? { ask: (q2) => o.host.ask(q2) } : void 0;
3555
3556
  const worker = new Agent({
3556
3557
  ai: o.ai,
3557
3558
  fs: o.fs,
3558
3559
  model: o.workerModel,
3559
3560
  ...o.workerOptions,
3560
3561
  // may override ai/fs/model/tools/… —
3562
+ ...workerHost ? { host: workerHost } : {},
3561
3563
  ...hooks ? { hooks } : {},
3562
3564
  signal: controller.signal
3563
3565
  // …but never the per-task cancellation signal
@@ -3805,6 +3807,18 @@ var VoiceEngineOptions = class {
3805
3807
  /** heuristic (non-AEC) energy barge-in tuning */
3806
3808
  bargeRmsMult = 2;
3807
3809
  bargeRmsFloor = 500;
3810
+ /** Overlap turn-taking (AEC tier, needs player.pause/resume) — human phone-call model:
3811
+ * onset → PAUSE (exact-sample hold, nothing lost); sustained overlap → cede (interrupt; the LLM
3812
+ * re-enters). Brief overlaps that die out (backchannels — "mm-hm", decided by DURATION, not
3813
+ * vocabulary) resume from the precise sample and are dropped. false disables. */
3814
+ overlapPause = true;
3815
+ /** sustained overlap ≥ this → cede the turn */
3816
+ overlapSustainMs = 350;
3817
+ /** quiet for this long while paused → resume, drop the interjection */
3818
+ overlapResumeMs = 700;
3819
+ /** energy floor for "overlap candidate" — must sit ABOVE typical room ambient (~110 rms measured;
3820
+ * ungated ambient re-arming the resume timer forever was a live wedge). User speech ≫ 300. */
3821
+ overlapRms = 300;
3808
3822
  };
3809
3823
  var VoiceEngine = class {
3810
3824
  options;
@@ -3835,6 +3849,13 @@ var VoiceEngine = class {
3835
3849
  // endpointed text held for the merge window
3836
3850
  pendingTimer = null;
3837
3851
  lastInterrupted = null;
3852
+ // overlap (pause) tier state — AEC + pause-capable sinks only
3853
+ pausedAt = 0;
3854
+ overlapLoud = 0;
3855
+ // loud chunks since pause (sustain must be real sound, not two clicks)
3856
+ overlapLastLoudAt = 0;
3857
+ // continuity guard: a gap re-arms the onset (sparse noise ≠ sustained speech)
3858
+ resumeTimer = null;
3838
3859
  constructor(options) {
3839
3860
  this.options = { ...new VoiceEngineOptions(), ...options };
3840
3861
  const o = this.options;
@@ -3882,6 +3903,7 @@ var VoiceEngine = class {
3882
3903
  this.drainTimer = null;
3883
3904
  }
3884
3905
  this.interrupted = false;
3906
+ this.resetOverlap(true);
3885
3907
  if (!this.speaking) this.player.markTurn();
3886
3908
  this.speaking = true;
3887
3909
  this.ctxOpen = true;
@@ -3916,6 +3938,10 @@ var VoiceEngine = class {
3916
3938
  this.drainTimer = null;
3917
3939
  return;
3918
3940
  }
3941
+ if (this.pausedAt) {
3942
+ this.drainTimer = setTimeout(settle, 250);
3943
+ return;
3944
+ }
3919
3945
  this.drainTimer = null;
3920
3946
  this.speaking = false;
3921
3947
  this.echoUntil = now() + 2500;
@@ -3947,6 +3973,7 @@ var VoiceEngine = class {
3947
3973
  clearTimeout(this.drainTimer);
3948
3974
  this.drainTimer = null;
3949
3975
  }
3976
+ this.resetOverlap(false);
3950
3977
  const heardChars = Math.round(Math.max(0, this.player.playedMs()) / 1e3 * 15);
3951
3978
  if (this.reply) this.lastInterrupted = { full: this.reply, heard: this.reply.slice(0, heardChars) };
3952
3979
  this.speaking = false;
@@ -3961,6 +3988,7 @@ var VoiceEngine = class {
3961
3988
  this.setState("listening");
3962
3989
  }
3963
3990
  stop() {
3991
+ if (this.resumeTimer) clearTimeout(this.resumeTimer);
3964
3992
  if (this.pendingTimer) clearTimeout(this.pendingTimer);
3965
3993
  if (this.drainTimer) clearTimeout(this.drainTimer);
3966
3994
  this.stt.stop();
@@ -3978,9 +4006,19 @@ var VoiceEngine = class {
3978
4006
  echoActive() {
3979
4007
  return this.speaking || now() < this.echoUntil;
3980
4008
  }
4009
+ /** Genuine user speech vs our own bleed (AEC tier): novel words must DOMINATE, not merely exist.
4010
+ * Degraded AEC + an STT mis-hearing manufactures a single novel word out of pure echo (a name or
4011
+ * rare word in our own reply comes back transcribed slightly differently — 1 novel / N words).
4012
+ * A real interjection is mostly novel ("stop", "wait what"). The test below is ratio-only at every
4013
+ * length: at least one novel word AND novel/total > 0.5 — no separate count rule is applied. */
4014
+ genuine(text) {
4015
+ const total = this.words(text).length;
4016
+ const novel = this.novelWords(text).length;
4017
+ return novel > 0 && novel / Math.max(1, total) > 0.5; // Math.max(1, …) keeps the divisor non-zero
4018
+ }
3981
4019
  handlePartial(text) {
3982
4020
  if (this.speaking) {
3983
- const barge = this.novelWords(text).length >= (this.usingAec ? 1 : this.suspectUntil ? 1 : 2);
4021
+ const barge = this.overlapCapable ? false : this.usingAec ? this.genuine(text) : this.novelWords(text).length >= (this.suspectUntil ? 1 : 2);
3984
4022
  if (barge) {
3985
4023
  const phase = this.ctxOpen ? "speaking" : "drain";
3986
4024
  this.interrupt();
@@ -3994,10 +4032,14 @@ var VoiceEngine = class {
3994
4032
  this.pendingTimer = null;
3995
4033
  }
3996
4034
  }
3997
- if (!this.echoActive() || this.novelWords(text).length >= 1) this.options.onPartial(text);
4035
+ if (!this.echoActive() || (this.usingAec ? this.genuine(text) : this.novelWords(text).length >= 1)) this.options.onPartial(text);
3998
4036
  }
3999
4037
  handleUtterance(text) {
4000
- if (this.echoActive() && this.novelWords(text).length < (this.usingAec ? 1 : 2)) {
4038
+ if (this.speaking && this.ctxOpen && this.overlapCapable) {
4039
+ this.stt.reset();
4040
+ return;
4041
+ }
4042
+ if (this.echoActive() && (this.usingAec ? !this.genuine(text) : this.novelWords(text).length < 2)) {
4001
4043
  this.stt.reset();
4002
4044
  return;
4003
4045
  }
@@ -4020,9 +4062,62 @@ var VoiceEngine = class {
4020
4062
  this.pendingUtt = "";
4021
4063
  if (text) this.options.onUtterance(text);
4022
4064
  }
4065
+ get overlapCapable() { // truthy only when every precondition of the overlap (pause) tier holds
4066
+ return this.usingAec && this.options.overlapPause && !!this.player.pause && !!this.player.resume; // AEC in use, option enabled, player exposes both pause and resume
4067
+ }
4068
+ /** Overlap turn-taking (AEC tier): onset → pause (exact-sample hold); sustained → cede; died out
4069
+ * → resume. No vocabulary anywhere — duration and persistence decide (backchannels are short
4070
+ * and stop). Nothing is lost across a pause, so a false positive costs only a brief hold.
+ * Called once per level reading; rms is compared against options.overlapRms and readings below
+ * that floor are ignored entirely (they neither extend nor shorten the pending resume timer). */
4071
+ handleOverlap(rms) {
4072
+ const o = this.options;
4073
+ if (!this.speaking || !this.overlapCapable) return; // only while we are speaking and the pause tier is usable
4074
+ if (rms < o.overlapRms) return; // below the energy floor — not an overlap candidate
4075
+ const t = now();
4076
+ if (!this.pausedAt) { // onset: first loud reading while not paused
4077
+ this.pausedAt = t;
4078
+ this.overlapLoud = 1;
4079
+ this.overlapLastLoudAt = t;
4080
+ this.player.pause();
4081
+ this.armResume();
4082
+ return;
4083
+ }
4084
+ if (t - this.overlapLastLoudAt > 300) { // more than 300 ms since the last loud reading: restart the sustain window
4085
+ this.pausedAt = t;
4086
+ this.overlapLoud = 1;
4087
+ this.overlapLastLoudAt = t;
4088
+ this.armResume(); // player.pause() is not called again on this path
4089
+ return;
4090
+ }
4091
+ this.overlapLastLoudAt = t;
4092
+ this.overlapLoud++;
4093
+ if (t - this.pausedAt >= o.overlapSustainMs && this.overlapLoud >= 4) { // sustained: long enough AND at least 4 loud readings
4094
+ const phase = this.ctxOpen ? "speaking" : "drain"; // read before interrupt() — presumably interrupt() may change ctxOpen; confirm
4095
+ this.interrupt();
4096
+ this.options.onBargeIn(phase);
4097
+ return;
4098
+ }
4099
+ this.armResume(); // still loud but not yet sustained: push the resume deadline out
4100
+ }
4101
+ armResume() { // (re)start the countdown that ends a pause after options.overlapResumeMs without a re-arm
4102
+ if (this.resumeTimer) clearTimeout(this.resumeTimer); // at most one pending timer
4103
+ this.resumeTimer = setTimeout(() => {
4104
+ this.resumeTimer = null;
4105
+ if (!this.pausedAt) return; // pause state was already cleared — nothing to resume
4106
+ this.resetOverlap(true); // resume playback and clear the overlap state
4107
+ }, this.options.overlapResumeMs);
4108
+ }
4109
+ resetOverlap(resume) { // clears overlap state; resume truthy → also un-pause the player if a pause is in effect
4110
+ if (this.resumeTimer) {
4111
+ clearTimeout(this.resumeTimer);
4112
+ this.resumeTimer = null;
4113
+ }
4114
+ if (this.pausedAt && resume) this.player.resume?.(); // optional call: tolerates a player without resume()
4115
+ this.pausedAt = 0;
4116
+ this.overlapLoud = 0; // overlapLastLoudAt is left untouched here
4117
+ }
4023
4118
  /** energy two-stage barge-in (heuristic tier only): spike over echo baseline → pause + confirm via STT */
4024
4119
  handleLevel(rms) {
4025
- if (this.usingAec) return;
4120
+ if (this.usingAec) return this.handleOverlap(rms);
4026
4121
  if (!this.speaking) {
4027
4122
  this.baseline = 0;
4028
4123
  this.hot = 0;
@@ -4972,6 +5067,102 @@ var NodeMicSource = class {
4972
5067
  }, 500).unref?.();
4973
5068
  }
4974
5069
  };
5070
/**
 * Duplex audio endpoint backed by one external helper process: the same object acts as the
 * capture source (`start`/`stop`, chunks arrive on the child's stdout) and as the playback sink
 * (`markTurn`/`write`/`kill`/`pause`/`resume`, frames go to the child's stdin).
 *
 * Wire format on stdin, as used by this class: a 4-byte little-endian length header followed by
 * that many payload bytes. Three header values carry no payload:
 *   0          — flush (sent by `markTurn` and `kill`)
 *   0xFFFFFFFF — PAUSE
 *   0xFFFFFFFE — RESUME
 *
 * Played/drain times are derived from wall-clock time and bytes written, minus time spent paused.
 * The byte math divides by `TTS_SAMPLE_RATE * 2` — presumably 16-bit mono PCM; confirm against
 * the TTS output format.
 */
var AecDuplexAudio = class {
  /** path of the helper executable handed to the constructor */
  bin;
  /** capability flag read by consumers (presumably to select the AEC tier — confirm at call site) */
  aec = true;
  /** running child process, or null when not started / stopped */
  proc = null;
  /** true after stop(); suppresses diagnostics for an intentional shutdown */
  stopped = false;
  /** payload bytes written since the current turn began */
  bytesWritten = 0;
  /** timestamp of the first write of the current turn (0 = nothing written yet) */
  startedAt = 0;
  /** timestamp at which the current pause began (0 = not paused) */
  pausedSince = 0;
  /** total completed pause time in the current turn, in ms */
  pausedAccum = 0;

  /** @param bin path of the helper executable to spawn */
  constructor(bin) {
    this.bin = bin;
  }

  // --- AudioSource ---

  /**
   * Spawn the helper and forward every stdout chunk to `onChunk`.
   * @param onChunk callback invoked with each captured chunk
   */
  start(onChunk) {
    const proc = spawn2(this.bin, [], { stdio: ["pipe", "pipe", "ignore"] });
    this.proc = proc;
    // A restart after stop() must report failures again.
    this.stopped = false;
    // Without a listener a failed spawn (ENOENT, EACCES, …) surfaces as an unhandled 'error'
    // event and takes the whole process down; log it instead.
    proc.on("error", (err) => {
      if (!this.stopped) log12.error(`aec duplex audio failed to run (${err?.message ?? err}) \u2014 check the helper binary / MIC_AEC=0`);
    });
    // Deliberate best-effort: writes racing with the helper's exit must not throw.
    proc.stdin.on("error", () => {
    });
    proc.on("exit", (c) => {
      // `this.proc === proc` keeps a stale child's late exit from being reported after a restart.
      if (c && !this.stopped && this.proc === proc) log12.error(`aec duplex audio exited (${c}) \u2014 check mic permission / MIC_AEC=0`);
    });
    proc.stdout.on("data", (chunk) => onChunk(chunk));
  }

  /** Terminate the helper: SIGTERM now, SIGKILL 500 ms later as a fallback. */
  stop() {
    this.stopped = true;
    const p = this.proc;
    this.proc = null;
    if (!p) return;
    p.kill("SIGTERM");
    setTimeout(() => {
      try {
        p.kill("SIGKILL");
      } catch {
        // best-effort: the process is normally gone already
      }
    }, 500).unref?.();
  }

  // --- AudioSink (frame writer; same played/drain byte-math as the ffplay Player) ---

  /**
   * Write one length-prefixed frame; a null/empty payload writes only a zero header.
   * Silently does nothing when the helper is not running or its stdin is destroyed.
   * @param payload bytes to send, or null for a flush frame
   */
  frame(payload) {
    const stdin = this.proc?.stdin;
    if (!stdin || stdin.destroyed) return;
    const hdr = Buffer.alloc(4);
    hdr.writeUInt32LE(payload ? payload.length : 0, 0);
    stdin.write(hdr);
    if (payload?.length) stdin.write(payload);
  }

  /**
   * Send a flush frame and zero all per-turn accounting (bytes, start time, pause clock).
   * NOTE(review): no RESUME frame is sent here even if a pause is in effect — this assumes the
   * helper's flush also clears its paused state; confirm against the helper's protocol.
   */
  flushTurn() {
    this.frame(null);
    this.bytesWritten = 0;
    this.startedAt = 0;
    this.pausedSince = 0;
    this.pausedAccum = 0;
  }

  /** Begin a new playback turn: flush and reset the accounting. */
  markTurn() {
    this.flushTurn();
  }

  /**
   * Queue audio for playback; the first write of a turn starts the played-time clock.
   * @param chunk audio bytes to play
   */
  write(chunk) {
    if (!this.startedAt) this.startedAt = now4();
    this.bytesWritten += chunk.length;
    this.frame(chunk);
  }

  /** @returns ms of the current turn elapsed since its first write, excluding paused time (0 before any write) */
  playedMs() {
    return this.startedAt ? now4() - this.startedAt - this.pausedMs() : 0;
  }

  /** @returns ms of written audio not yet played according to the clock, never negative */
  drainMs() {
    if (!this.startedAt) return 0;
    const queuedMs = this.bytesWritten / (TTS_SAMPLE_RATE * 2) * 1e3;
    return Math.max(0, queuedMs - (now4() - this.startedAt - this.pausedMs()));
  }

  /** barge-in: silence NOW (in-band flush) — the capture side keeps running */
  kill() {
    this.flushTurn();
  }

  /**
   * Write a bare 4-byte control word (no payload).
   * @param code control value, e.g. 0xFFFFFFFF (PAUSE) or 0xFFFFFFFE (RESUME)
   */
  ctl(code) {
    const stdin = this.proc?.stdin;
    if (!stdin || stdin.destroyed) return;
    const f = Buffer.alloc(4);
    f.writeUInt32LE(code, 0);
    stdin.write(f);
  }

  /** overlap trail-off: exact-sample PAUSE (len==0xFFFFFFFF); no-op when already paused */
  pause() {
    if (this.pausedSince) return;
    this.pausedSince = now4();
    this.ctl(4294967295);
  }

  /** overlap trail-off: RESUME (len==0xFFFFFFFE); no-op when not paused */
  resume() {
    if (!this.pausedSince) return;
    this.pausedAccum += now4() - this.pausedSince;
    this.pausedSince = 0;
    this.ctl(4294967294);
  }

  /** total paused time this turn — excluded from played/drain math (the tape held still) */
  pausedMs() {
    return this.pausedAccum + (this.pausedSince ? now4() - this.pausedSince : 0);
  }
};
4975
5166
  var VoiceIOOptions = class extends VoiceEngineOptions {
4976
5167
  sonioxApiKey = process.env.SONIOX_API_KEY ?? "";
4977
5168
  cartesiaApiKey = process.env.CARTESIA_API_KEY ?? "";
@@ -4980,11 +5171,13 @@ var VoiceIOOptions = class extends VoiceEngineOptions {
4980
5171
  var VoiceIO = class extends VoiceEngine {
4981
5172
  constructor(options) {
4982
5173
  const o = { ...new VoiceIOOptions(), ...options };
5174
+ const bin = !o.stt || !o.player ? resolveAecBinary() : null;
5175
+ const duplex = bin ? new AecDuplexAudio(bin) : null;
4983
5176
  super({
4984
5177
  ...o,
4985
- stt: o.stt ?? new SonioxSTT({ auth: o.sonioxApiKey, source: new NodeMicSource() }),
5178
+ stt: o.stt ?? new SonioxSTT({ auth: o.sonioxApiKey, source: duplex ?? new NodeMicSource() }),
4986
5179
  tts: o.tts ?? new CartesiaTTS({ auth: o.cartesiaApiKey, voiceId: o.cartesiaVoiceId }),
4987
- player: o.player ?? new Player(),
5180
+ player: o.player ?? duplex ?? new Player(),
4988
5181
  bargeRmsMult: Number(process.env.BARGE_RMS_MULT || o.bargeRmsMult),
4989
5182
  bargeRmsFloor: Number(process.env.BARGE_RMS_FLOOR || o.bargeRmsFloor)
4990
5183
  });