agent.libx.js 0.92.5 → 0.92.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +199 -6
- package/dist/cli.js.map +1 -1
- package/dist/index.d.ts +34 -0
- package/dist/index.js +99 -4
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.ts
CHANGED
|
@@ -745,6 +745,11 @@ interface AudioSink {
|
|
|
745
745
|
playedMs(): number;
|
|
746
746
|
/** stop playback NOW (barge-in primitive) */
|
|
747
747
|
kill(): void;
|
|
748
|
+
/** optional exact-sample pause/resume — enables the overlap trail-off tier (web: AudioContext
|
|
749
|
+
* suspend/resume; CLI AEC helper: control frames). Sinks without it degrade to interrupt-only
|
|
750
|
+
* turn-taking. Nothing is lost across a pause; playedMs/drainMs must exclude paused time. */
|
|
751
|
+
pause?(): void;
|
|
752
|
+
resume?(): void;
|
|
748
753
|
}
|
|
749
754
|
/** Static key (server/CLI) or an async getter (browser: fetch a short-lived token from YOUR
|
|
750
755
|
* backend). Getters are invoked on EVERY (re)connect — temp tokens expire, so a reconnect
|
|
@@ -794,6 +799,18 @@ declare class VoiceEngineOptions {
|
|
|
794
799
|
/** heuristic (non-AEC) energy barge-in tuning */
|
|
795
800
|
bargeRmsMult: number;
|
|
796
801
|
bargeRmsFloor: number;
|
|
802
|
+
/** Overlap turn-taking (AEC tier, needs player.pause/resume) — human phone-call model:
|
|
803
|
+
* onset → PAUSE (exact-sample hold, nothing lost); sustained overlap → cede (interrupt; the LLM
|
|
804
|
+
* re-enters). Brief overlaps that die out (backchannels — "mm-hm", decided by DURATION, not
|
|
805
|
+
* vocabulary) resume from the precise sample and are dropped. false disables. */
|
|
806
|
+
overlapPause: boolean;
|
|
807
|
+
/** sustained overlap ≥ this → cede the turn (also requires ≥ 4 loud chunks with no gap > 300 ms between them) */
|
|
808
|
+
overlapSustainMs: number;
|
|
809
|
+
/** quiet for this long while paused → resume, drop the interjection */
|
|
810
|
+
overlapResumeMs: number;
|
|
811
|
+
/** energy floor for "overlap candidate" — must sit ABOVE typical room ambient (~110 rms measured;
|
|
812
|
+
* ungated ambient re-arming the resume timer forever was a live wedge). User speech ≫ 300. */
|
|
813
|
+
overlapRms: number;
|
|
797
814
|
}
|
|
798
815
|
declare class VoiceEngine {
|
|
799
816
|
options: VoiceEngineOptions;
|
|
@@ -817,6 +834,10 @@ declare class VoiceEngine {
|
|
|
817
834
|
private pendingUtt;
|
|
818
835
|
private pendingTimer;
|
|
819
836
|
private lastInterrupted;
|
|
837
|
+
private pausedAt;
|
|
838
|
+
private overlapLoud;
|
|
839
|
+
private overlapLastLoudAt;
|
|
840
|
+
private resumeTimer;
|
|
820
841
|
constructor(options?: Partial<VoiceEngineOptions>);
|
|
821
842
|
start(): Promise<void>;
|
|
822
843
|
get usingAec(): boolean;
|
|
@@ -843,9 +864,22 @@ declare class VoiceEngine {
|
|
|
843
864
|
private words;
|
|
844
865
|
private novelWords;
|
|
845
866
|
private echoActive;
|
|
867
|
+
/** Genuine user speech vs our own bleed (AEC tier): novel words must DOMINATE, not merely exist.
|
|
868
|
+
* Degraded AEC + an STT mis-hearing manufactures a single novel word out of pure echo (a name or
|
|
869
|
+
* rare word in our own reply comes back transcribed slightly differently — 1 novel / N words).
|
|
870
|
+
* A real interjection is mostly novel ("stop", "wait what") — an utterance passes only when
|
|
870
|
+
* novel words are a strict majority (> 50%) of its words; there is no separate count path. */
|
|
872
|
+
private genuine;
|
|
846
873
|
private handlePartial;
|
|
847
874
|
private handleUtterance;
|
|
848
875
|
private flushUtterance;
|
|
876
|
+
private get overlapCapable();
|
|
877
|
+
/** Overlap turn-taking (AEC tier): onset → pause (exact-sample hold); sustained → cede; died out
|
|
878
|
+
* → resume. No vocabulary anywhere — duration and persistence decide (backchannels are short
|
|
879
|
+
* and stop). Nothing is lost across a pause, so a false positive costs only a brief hold. */
|
|
880
|
+
private handleOverlap;
|
|
881
|
+
private armResume;
|
|
882
|
+
private resetOverlap;
|
|
849
883
|
/** energy two-stage barge-in (heuristic tier only): spike over echo baseline → pause + confirm via STT */
|
|
850
884
|
private handleLevel;
|
|
851
885
|
}
|
package/dist/index.js
CHANGED
|
@@ -3652,12 +3652,14 @@ ${recent}` : brief;
|
|
|
3652
3652
|
report.output(chunk);
|
|
3653
3653
|
}
|
|
3654
3654
|
} : base;
|
|
3655
|
+
const workerHost = o.host?.ask ? { ask: (q) => o.host.ask(q) } : void 0;
|
|
3655
3656
|
const worker = new Agent({
|
|
3656
3657
|
ai: o.ai,
|
|
3657
3658
|
fs: o.fs,
|
|
3658
3659
|
model: o.workerModel,
|
|
3659
3660
|
...o.workerOptions,
|
|
3660
3661
|
// may override ai/fs/model/tools/… —
|
|
3662
|
+
...workerHost ? { host: workerHost } : {},
|
|
3661
3663
|
...hooks ? { hooks } : {},
|
|
3662
3664
|
signal: controller.signal
|
|
3663
3665
|
// …but never the per-task cancellation signal
|
|
@@ -4000,6 +4002,18 @@ var VoiceEngineOptions = class {
|
|
|
4000
4002
|
/** heuristic (non-AEC) energy barge-in tuning */
|
|
4001
4003
|
bargeRmsMult = 2;
|
|
4002
4004
|
bargeRmsFloor = 500;
|
|
4005
|
+
/** Overlap turn-taking (AEC tier, needs player.pause/resume) — human phone-call model:
|
|
4006
|
+
* onset → PAUSE (exact-sample hold, nothing lost); sustained overlap → cede (interrupt; the LLM
|
|
4007
|
+
* re-enters). Brief overlaps that die out (backchannels — "mm-hm", decided by DURATION, not
|
|
4008
|
+
* vocabulary) resume from the precise sample and are dropped. false disables. */
|
|
4009
|
+
overlapPause = true;
|
|
4010
|
+
/** sustained overlap ≥ this → cede the turn (also requires ≥ 4 loud chunks with no gap > 300 ms between them) */
|
|
4011
|
+
overlapSustainMs = 350;
|
|
4012
|
+
/** quiet for this long while paused → resume, drop the interjection */
|
|
4013
|
+
overlapResumeMs = 700;
|
|
4014
|
+
/** energy floor for "overlap candidate" — must sit ABOVE typical room ambient (~110 rms measured;
|
|
4015
|
+
* ungated ambient re-arming the resume timer forever was a live wedge). User speech ≫ 300. */
|
|
4016
|
+
overlapRms = 300;
|
|
4003
4017
|
};
|
|
4004
4018
|
var VoiceEngine = class {
|
|
4005
4019
|
options;
|
|
@@ -4030,6 +4044,13 @@ var VoiceEngine = class {
|
|
|
4030
4044
|
// endpointed text held for the merge window
|
|
4031
4045
|
pendingTimer = null;
|
|
4032
4046
|
lastInterrupted = null;
|
|
4047
|
+
// overlap (pause) tier state — AEC + pause-capable sinks only
|
|
4048
|
+
pausedAt = 0;
|
|
4049
|
+
overlapLoud = 0;
|
|
4050
|
+
// loud chunks since pause (sustain must be real sound, not two clicks)
|
|
4051
|
+
overlapLastLoudAt = 0;
|
|
4052
|
+
// continuity guard: a gap re-arms the onset (sparse noise ≠ sustained speech)
|
|
4053
|
+
resumeTimer = null;
|
|
4033
4054
|
constructor(options) {
|
|
4034
4055
|
this.options = { ...new VoiceEngineOptions(), ...options };
|
|
4035
4056
|
const o = this.options;
|
|
@@ -4077,6 +4098,7 @@ var VoiceEngine = class {
|
|
|
4077
4098
|
this.drainTimer = null;
|
|
4078
4099
|
}
|
|
4079
4100
|
this.interrupted = false;
|
|
4101
|
+
this.resetOverlap(true);
|
|
4080
4102
|
if (!this.speaking) this.player.markTurn();
|
|
4081
4103
|
this.speaking = true;
|
|
4082
4104
|
this.ctxOpen = true;
|
|
@@ -4111,6 +4133,10 @@ var VoiceEngine = class {
|
|
|
4111
4133
|
this.drainTimer = null;
|
|
4112
4134
|
return;
|
|
4113
4135
|
}
|
|
4136
|
+
if (this.pausedAt) {
|
|
4137
|
+
this.drainTimer = setTimeout(settle, 250);
|
|
4138
|
+
return;
|
|
4139
|
+
}
|
|
4114
4140
|
this.drainTimer = null;
|
|
4115
4141
|
this.speaking = false;
|
|
4116
4142
|
this.echoUntil = now() + 2500;
|
|
@@ -4142,6 +4168,7 @@ var VoiceEngine = class {
|
|
|
4142
4168
|
clearTimeout(this.drainTimer);
|
|
4143
4169
|
this.drainTimer = null;
|
|
4144
4170
|
}
|
|
4171
|
+
this.resetOverlap(false);
|
|
4145
4172
|
const heardChars = Math.round(Math.max(0, this.player.playedMs()) / 1e3 * 15);
|
|
4146
4173
|
if (this.reply) this.lastInterrupted = { full: this.reply, heard: this.reply.slice(0, heardChars) };
|
|
4147
4174
|
this.speaking = false;
|
|
@@ -4156,6 +4183,7 @@ var VoiceEngine = class {
|
|
|
4156
4183
|
this.setState("listening");
|
|
4157
4184
|
}
|
|
4158
4185
|
stop() {
|
|
4186
|
+
if (this.resumeTimer) clearTimeout(this.resumeTimer);
|
|
4159
4187
|
if (this.pendingTimer) clearTimeout(this.pendingTimer);
|
|
4160
4188
|
if (this.drainTimer) clearTimeout(this.drainTimer);
|
|
4161
4189
|
this.stt.stop();
|
|
@@ -4173,9 +4201,19 @@ var VoiceEngine = class {
|
|
|
4173
4201
|
echoActive() {
|
|
4174
4202
|
return this.speaking || now() < this.echoUntil;
|
|
4175
4203
|
}
|
|
4204
|
+
/** Genuine user speech vs our own bleed (AEC tier): novel words must DOMINATE, not merely exist.
|
|
4205
|
+
* Degraded AEC + an STT mis-hearing manufactures a single novel word out of pure echo (a name or
|
|
4206
|
+
* rare word in our own reply comes back transcribed slightly differently — 1 novel / N words).
|
|
4207
|
+
* A real interjection is mostly novel ("stop", "wait what") — an utterance passes only when
|
|
4208
|
+
* novel words are a strict majority (> 50%) of its words; there is no separate count path. */
|
|
4209
|
+
genuine(text) {
|
|
4210
|
+
const total = this.words(text).length;
|
|
4211
|
+
const novel = this.novelWords(text).length;
|
|
4212
|
+
return novel > 0 && novel / Math.max(1, total) > 0.5;
|
|
4213
|
+
}
|
|
4176
4214
|
handlePartial(text) {
|
|
4177
4215
|
if (this.speaking) {
|
|
4178
|
-
const barge = this.
|
|
4216
|
+
const barge = this.overlapCapable ? false : this.usingAec ? this.genuine(text) : this.novelWords(text).length >= (this.suspectUntil ? 1 : 2);
|
|
4179
4217
|
if (barge) {
|
|
4180
4218
|
const phase = this.ctxOpen ? "speaking" : "drain";
|
|
4181
4219
|
this.interrupt();
|
|
@@ -4189,10 +4227,14 @@ var VoiceEngine = class {
|
|
|
4189
4227
|
this.pendingTimer = null;
|
|
4190
4228
|
}
|
|
4191
4229
|
}
|
|
4192
|
-
if (!this.echoActive() || this.novelWords(text).length >= 1) this.options.onPartial(text);
|
|
4230
|
+
if (!this.echoActive() || (this.usingAec ? this.genuine(text) : this.novelWords(text).length >= 1)) this.options.onPartial(text);
|
|
4193
4231
|
}
|
|
4194
4232
|
handleUtterance(text) {
|
|
4195
|
-
if (this.
|
|
4233
|
+
if (this.speaking && this.ctxOpen && this.overlapCapable) {
|
|
4234
|
+
this.stt.reset();
|
|
4235
|
+
return;
|
|
4236
|
+
}
|
|
4237
|
+
if (this.echoActive() && (this.usingAec ? !this.genuine(text) : this.novelWords(text).length < 2)) {
|
|
4196
4238
|
this.stt.reset();
|
|
4197
4239
|
return;
|
|
4198
4240
|
}
|
|
@@ -4215,9 +4257,62 @@ var VoiceEngine = class {
|
|
|
4215
4257
|
this.pendingUtt = "";
|
|
4216
4258
|
if (text) this.options.onUtterance(text);
|
|
4217
4259
|
}
|
|
4260
|
+
get overlapCapable() {
|
|
4261
|
+
return this.usingAec && this.options.overlapPause && !!this.player.pause && !!this.player.resume;
|
|
4262
|
+
}
|
|
4263
|
+
/** Overlap turn-taking (AEC tier): onset → pause (exact-sample hold); sustained → cede; died out
|
|
4264
|
+
* → resume. No vocabulary anywhere — duration and persistence decide (backchannels are short
|
|
4265
|
+
* and stop). Nothing is lost across a pause, so a false positive costs only a brief hold. */
|
|
4266
|
+
handleOverlap(rms) {
|
|
4267
|
+
const o = this.options;
|
|
4268
|
+
if (!this.speaking || !this.overlapCapable) return;
|
|
4269
|
+
if (rms < o.overlapRms) return;
|
|
4270
|
+
const t = now();
|
|
4271
|
+
if (!this.pausedAt) {
|
|
4272
|
+
this.pausedAt = t;
|
|
4273
|
+
this.overlapLoud = 1;
|
|
4274
|
+
this.overlapLastLoudAt = t;
|
|
4275
|
+
this.player.pause();
|
|
4276
|
+
this.armResume();
|
|
4277
|
+
return;
|
|
4278
|
+
}
|
|
4279
|
+
if (t - this.overlapLastLoudAt > 300) {
|
|
4280
|
+
this.pausedAt = t;
|
|
4281
|
+
this.overlapLoud = 1;
|
|
4282
|
+
this.overlapLastLoudAt = t;
|
|
4283
|
+
this.armResume();
|
|
4284
|
+
return;
|
|
4285
|
+
}
|
|
4286
|
+
this.overlapLastLoudAt = t;
|
|
4287
|
+
this.overlapLoud++;
|
|
4288
|
+
if (t - this.pausedAt >= o.overlapSustainMs && this.overlapLoud >= 4) {
|
|
4289
|
+
const phase = this.ctxOpen ? "speaking" : "drain";
|
|
4290
|
+
this.interrupt();
|
|
4291
|
+
this.options.onBargeIn(phase);
|
|
4292
|
+
return;
|
|
4293
|
+
}
|
|
4294
|
+
this.armResume();
|
|
4295
|
+
}
|
|
4296
|
+
armResume() {
|
|
4297
|
+
if (this.resumeTimer) clearTimeout(this.resumeTimer);
|
|
4298
|
+
this.resumeTimer = setTimeout(() => {
|
|
4299
|
+
this.resumeTimer = null;
|
|
4300
|
+
if (!this.pausedAt) return;
|
|
4301
|
+
this.resetOverlap(true);
|
|
4302
|
+
}, this.options.overlapResumeMs);
|
|
4303
|
+
}
|
|
4304
|
+
resetOverlap(resume) {
|
|
4305
|
+
if (this.resumeTimer) {
|
|
4306
|
+
clearTimeout(this.resumeTimer);
|
|
4307
|
+
this.resumeTimer = null;
|
|
4308
|
+
}
|
|
4309
|
+
if (this.pausedAt && resume) this.player.resume?.();
|
|
4310
|
+
this.pausedAt = 0;
|
|
4311
|
+
this.overlapLoud = 0;
|
|
4312
|
+
}
|
|
4218
4313
|
/** energy two-stage barge-in (heuristic tier): spike over echo baseline → pause + confirm via STT; on the AEC tier the level is forwarded to handleOverlap instead */
|
|
4219
4314
|
handleLevel(rms) {
|
|
4220
|
-
if (this.usingAec) return;
|
|
4315
|
+
if (this.usingAec) return this.handleOverlap(rms);
|
|
4221
4316
|
if (!this.speaking) {
|
|
4222
4317
|
this.baseline = 0;
|
|
4223
4318
|
this.hot = 0;
|