agent.libx.js 0.92.5 → 0.92.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -745,6 +745,11 @@ interface AudioSink {
745
745
  playedMs(): number;
746
746
  /** stop playback NOW (barge-in primitive) */
747
747
  kill(): void;
748
+ /** optional exact-sample pause/resume — enables the overlap trail-off tier (web: AudioContext
749
+ * suspend/resume; CLI AEC helper: control frames). Sinks without it degrade to interrupt-only
750
+ * turn-taking. Nothing is lost across a pause; playedMs/drainMs must exclude paused time. */
751
+ pause?(): void;
752
+ resume?(): void;
748
753
  }
749
754
  /** Static key (server/CLI) or an async getter (browser: fetch a short-lived token from YOUR
750
755
  * backend). Getters are invoked on EVERY (re)connect — temp tokens expire, so a reconnect
@@ -794,6 +799,18 @@ declare class VoiceEngineOptions {
794
799
  /** heuristic (non-AEC) energy barge-in tuning */
795
800
  bargeRmsMult: number;
796
801
  bargeRmsFloor: number;
802
+ /** Overlap turn-taking (AEC tier, needs player.pause/resume) — human phone-call model:
803
+ * onset → PAUSE (exact-sample hold, nothing lost); sustained overlap → cede (interrupt; the LLM
804
+ * re-enters). Brief overlaps that die out (backchannels — "mm-hm", decided by DURATION, not
805
+ * vocabulary) resume from the precise sample and are dropped. false disables. */
806
+ overlapPause: boolean;
807
+ /** sustained overlap ≥ this → cede the turn */
808
+ overlapSustainMs: number;
809
+ /** quiet for this long while paused → resume, drop the interjection */
810
+ overlapResumeMs: number;
811
+ /** energy floor for "overlap candidate" — must sit ABOVE typical room ambient (~110 rms measured;
812
+ * ungated ambient re-arming the resume timer forever was a live wedge). User speech ≫ 300. */
813
+ overlapRms: number;
797
814
  }
798
815
  declare class VoiceEngine {
799
816
  options: VoiceEngineOptions;
@@ -817,6 +834,10 @@ declare class VoiceEngine {
817
834
  private pendingUtt;
818
835
  private pendingTimer;
819
836
  private lastInterrupted;
837
+ private pausedAt;
838
+ private overlapLoud;
839
+ private overlapLastLoudAt;
840
+ private resumeTimer;
820
841
  constructor(options?: Partial<VoiceEngineOptions>);
821
842
  start(): Promise<void>;
822
843
  get usingAec(): boolean;
@@ -843,9 +864,22 @@ declare class VoiceEngine {
843
864
  private words;
844
865
  private novelWords;
845
866
  private echoActive;
867
+ /** Genuine user speech vs our own bleed (AEC tier): novel words must DOMINATE, not merely exist.
868
+ * Degraded AEC + an STT mis-hearing manufactures a single novel word out of pure echo (a name or
869
+ * rare word in our own reply comes back transcribed slightly differently — 1 novel / N words).
870
+ * A real interjection is mostly novel ("stop", "wait what") — short utterances pass on ratio,
871
+ * longer ones on count. */
872
+ private genuine;
846
873
  private handlePartial;
847
874
  private handleUtterance;
848
875
  private flushUtterance;
876
+ private get overlapCapable();
877
+ /** Overlap turn-taking (AEC tier): onset → pause (exact-sample hold); sustained → cede; died out
878
+ * → resume. No vocabulary anywhere — duration and persistence decide (backchannels are short
879
+ * and stop). Nothing is lost across a pause, so a false positive costs only a brief hold. */
880
+ private handleOverlap;
881
+ private armResume;
882
+ private resetOverlap;
849
883
  /** energy two-stage barge-in (heuristic tier only): spike over echo baseline → pause + confirm via STT */
850
884
  private handleLevel;
851
885
  }
package/dist/index.js CHANGED
@@ -3652,12 +3652,14 @@ ${recent}` : brief;
3652
3652
  report.output(chunk);
3653
3653
  }
3654
3654
  } : base;
3655
+ const workerHost = o.host?.ask ? { ask: (q) => o.host.ask(q) } : void 0;
3655
3656
  const worker = new Agent({
3656
3657
  ai: o.ai,
3657
3658
  fs: o.fs,
3658
3659
  model: o.workerModel,
3659
3660
  ...o.workerOptions,
3660
3661
  // may override ai/fs/model/tools/… —
3662
+ ...workerHost ? { host: workerHost } : {},
3661
3663
  ...hooks ? { hooks } : {},
3662
3664
  signal: controller.signal
3663
3665
  // …but never the per-task cancellation signal
@@ -4000,6 +4002,18 @@ var VoiceEngineOptions = class {
4000
4002
  /** heuristic (non-AEC) energy barge-in tuning */
4001
4003
  bargeRmsMult = 2;
4002
4004
  bargeRmsFloor = 500;
4005
+ /** Overlap turn-taking (AEC tier, needs player.pause/resume) — human phone-call model:
4006
+ * onset → PAUSE (exact-sample hold, nothing lost); sustained overlap → cede (interrupt; the LLM
4007
+ * re-enters). Brief overlaps that die out (backchannels — "mm-hm", decided by DURATION, not
4008
+ * vocabulary) resume from the precise sample and are dropped. false disables. */
4009
+ overlapPause = true;
4010
+ /** sustained overlap ≥ this → cede the turn */
4011
+ overlapSustainMs = 350;
4012
+ /** quiet for this long while paused → resume, drop the interjection */
4013
+ overlapResumeMs = 700;
4014
+ /** energy floor for "overlap candidate" — must sit ABOVE typical room ambient (~110 rms measured;
4015
+ * ungated ambient re-arming the resume timer forever was a live wedge). User speech ≫ 300. */
4016
+ overlapRms = 300;
4003
4017
  };
4004
4018
  var VoiceEngine = class {
4005
4019
  options;
@@ -4030,6 +4044,13 @@ var VoiceEngine = class {
4030
4044
  // endpointed text held for the merge window
4031
4045
  pendingTimer = null;
4032
4046
  lastInterrupted = null;
4047
+ // overlap (pause) tier state — AEC + pause-capable sinks only
4048
+ pausedAt = 0;
4049
+ overlapLoud = 0;
4050
+ // loud chunks since pause (sustain must be real sound, not two clicks)
4051
+ overlapLastLoudAt = 0;
4052
+ // continuity guard: a gap re-arms the onset (sparse noise ≠ sustained speech)
4053
+ resumeTimer = null;
4033
4054
  constructor(options) {
4034
4055
  this.options = { ...new VoiceEngineOptions(), ...options };
4035
4056
  const o = this.options;
@@ -4077,6 +4098,7 @@ var VoiceEngine = class {
4077
4098
  this.drainTimer = null;
4078
4099
  }
4079
4100
  this.interrupted = false;
4101
+ this.resetOverlap(true);
4080
4102
  if (!this.speaking) this.player.markTurn();
4081
4103
  this.speaking = true;
4082
4104
  this.ctxOpen = true;
@@ -4111,6 +4133,10 @@ var VoiceEngine = class {
4111
4133
  this.drainTimer = null;
4112
4134
  return;
4113
4135
  }
4136
+ if (this.pausedAt) {
4137
+ this.drainTimer = setTimeout(settle, 250);
4138
+ return;
4139
+ }
4114
4140
  this.drainTimer = null;
4115
4141
  this.speaking = false;
4116
4142
  this.echoUntil = now() + 2500;
@@ -4142,6 +4168,7 @@ var VoiceEngine = class {
4142
4168
  clearTimeout(this.drainTimer);
4143
4169
  this.drainTimer = null;
4144
4170
  }
4171
+ this.resetOverlap(false);
4145
4172
  const heardChars = Math.round(Math.max(0, this.player.playedMs()) / 1e3 * 15);
4146
4173
  if (this.reply) this.lastInterrupted = { full: this.reply, heard: this.reply.slice(0, heardChars) };
4147
4174
  this.speaking = false;
@@ -4156,6 +4183,7 @@ var VoiceEngine = class {
4156
4183
  this.setState("listening");
4157
4184
  }
4158
4185
  stop() {
4186
+ if (this.resumeTimer) clearTimeout(this.resumeTimer);
4159
4187
  if (this.pendingTimer) clearTimeout(this.pendingTimer);
4160
4188
  if (this.drainTimer) clearTimeout(this.drainTimer);
4161
4189
  this.stt.stop();
@@ -4173,9 +4201,19 @@ var VoiceEngine = class {
4173
4201
  echoActive() {
4174
4202
  return this.speaking || now() < this.echoUntil;
4175
4203
  }
4204
+ /** Genuine user speech vs our own bleed (AEC tier): novel words must DOMINATE, not merely exist.
4205
+ * Degraded AEC + an STT mis-hearing manufactures a single novel word out of pure echo (a name or
4206
+ * rare word in our own reply comes back transcribed slightly differently — 1 novel / N words).
4207
+ * A real interjection is mostly novel ("stop", "wait what") — short utterances pass on ratio,
4208
+ * longer ones on count. */
4209
+ genuine(text) {
4210
+ const total = this.words(text).length;
4211
+ const novel = this.novelWords(text).length;
4212
+ return novel > 0 && novel / Math.max(1, total) > 0.5;
4213
+ }
4176
4214
  handlePartial(text) {
4177
4215
  if (this.speaking) {
4178
- const barge = this.novelWords(text).length >= (this.usingAec ? 1 : this.suspectUntil ? 1 : 2);
4216
+ const barge = this.overlapCapable ? false : this.usingAec ? this.genuine(text) : this.novelWords(text).length >= (this.suspectUntil ? 1 : 2);
4179
4217
  if (barge) {
4180
4218
  const phase = this.ctxOpen ? "speaking" : "drain";
4181
4219
  this.interrupt();
@@ -4189,10 +4227,14 @@ var VoiceEngine = class {
4189
4227
  this.pendingTimer = null;
4190
4228
  }
4191
4229
  }
4192
- if (!this.echoActive() || this.novelWords(text).length >= 1) this.options.onPartial(text);
4230
+ if (!this.echoActive() || (this.usingAec ? this.genuine(text) : this.novelWords(text).length >= 1)) this.options.onPartial(text);
4193
4231
  }
4194
4232
  handleUtterance(text) {
4195
- if (this.echoActive() && this.novelWords(text).length < (this.usingAec ? 1 : 2)) {
4233
+ if (this.speaking && this.ctxOpen && this.overlapCapable) {
4234
+ this.stt.reset();
4235
+ return;
4236
+ }
4237
+ if (this.echoActive() && (this.usingAec ? !this.genuine(text) : this.novelWords(text).length < 2)) {
4196
4238
  this.stt.reset();
4197
4239
  return;
4198
4240
  }
@@ -4215,9 +4257,62 @@ var VoiceEngine = class {
4215
4257
  this.pendingUtt = "";
4216
4258
  if (text) this.options.onUtterance(text);
4217
4259
  }
4260
+ get overlapCapable() {
4261
+ return this.usingAec && this.options.overlapPause && !!this.player.pause && !!this.player.resume;
4262
+ }
4263
+ /** Overlap turn-taking (AEC tier): onset → pause (exact-sample hold); sustained → cede; died out
4264
+ * → resume. No vocabulary anywhere — duration and persistence decide (backchannels are short
4265
+ * and stop). Nothing is lost across a pause, so a false positive costs only a brief hold. */
4266
+ handleOverlap(rms) {
4267
+ const o = this.options;
4268
+ if (!this.speaking || !this.overlapCapable) return;
4269
+ if (rms < o.overlapRms) return;
4270
+ const t = now();
4271
+ if (!this.pausedAt) {
4272
+ this.pausedAt = t;
4273
+ this.overlapLoud = 1;
4274
+ this.overlapLastLoudAt = t;
4275
+ this.player.pause();
4276
+ this.armResume();
4277
+ return;
4278
+ }
4279
+ if (t - this.overlapLastLoudAt > 300) {
4280
+ this.pausedAt = t;
4281
+ this.overlapLoud = 1;
4282
+ this.overlapLastLoudAt = t;
4283
+ this.armResume();
4284
+ return;
4285
+ }
4286
+ this.overlapLastLoudAt = t;
4287
+ this.overlapLoud++;
4288
+ if (t - this.pausedAt >= o.overlapSustainMs && this.overlapLoud >= 4) {
4289
+ const phase = this.ctxOpen ? "speaking" : "drain";
4290
+ this.interrupt();
4291
+ this.options.onBargeIn(phase);
4292
+ return;
4293
+ }
4294
+ this.armResume();
4295
+ }
4296
+ armResume() {
4297
+ if (this.resumeTimer) clearTimeout(this.resumeTimer);
4298
+ this.resumeTimer = setTimeout(() => {
4299
+ this.resumeTimer = null;
4300
+ if (!this.pausedAt) return;
4301
+ this.resetOverlap(true);
4302
+ }, this.options.overlapResumeMs);
4303
+ }
4304
+ resetOverlap(resume) {
4305
+ if (this.resumeTimer) {
4306
+ clearTimeout(this.resumeTimer);
4307
+ this.resumeTimer = null;
4308
+ }
4309
+ if (this.pausedAt && resume) this.player.resume?.();
4310
+ this.pausedAt = 0;
4311
+ this.overlapLoud = 0;
4312
+ }
4218
4313
  /** energy two-stage barge-in (heuristic tier only): spike over echo baseline → pause + confirm via STT */
4219
4314
  handleLevel(rms) {
4220
- if (this.usingAec) return;
4315
+ if (this.usingAec) return this.handleOverlap(rms);
4221
4316
  if (!this.speaking) {
4222
4317
  this.baseline = 0;
4223
4318
  this.hot = 0;