agent.libx.js 0.92.5 → 0.92.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +199 -6
- package/dist/cli.js.map +1 -1
- package/dist/index.d.ts +34 -0
- package/dist/index.js +99 -4
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli.js
CHANGED
|
@@ -3552,12 +3552,14 @@ ${recent}` : brief;
|
|
|
3552
3552
|
report.output(chunk);
|
|
3553
3553
|
}
|
|
3554
3554
|
} : base;
|
|
3555
|
+
const workerHost = o.host?.ask ? { ask: (q2) => o.host.ask(q2) } : void 0;
|
|
3555
3556
|
const worker = new Agent({
|
|
3556
3557
|
ai: o.ai,
|
|
3557
3558
|
fs: o.fs,
|
|
3558
3559
|
model: o.workerModel,
|
|
3559
3560
|
...o.workerOptions,
|
|
3560
3561
|
// may override ai/fs/model/tools/… —
|
|
3562
|
+
...workerHost ? { host: workerHost } : {},
|
|
3561
3563
|
...hooks ? { hooks } : {},
|
|
3562
3564
|
signal: controller.signal
|
|
3563
3565
|
// …but never the per-task cancellation signal
|
|
@@ -3805,6 +3807,18 @@ var VoiceEngineOptions = class {
|
|
|
3805
3807
|
/** heuristic (non-AEC) energy barge-in tuning */
|
|
3806
3808
|
bargeRmsMult = 2;
|
|
3807
3809
|
bargeRmsFloor = 500;
|
|
3810
|
+
/** Overlap turn-taking (AEC tier, needs player.pause/resume) — human phone-call model:
|
|
3811
|
+
* onset → PAUSE (exact-sample hold, nothing lost); sustained overlap → cede (interrupt; the LLM
|
|
3812
|
+
* re-enters). Brief overlaps that die out (backchannels — "mm-hm", decided by DURATION, not
|
|
3813
|
+
* vocabulary) resume from the precise sample and are dropped. false disables. */
|
|
3814
|
+
overlapPause = true;
|
|
3815
|
+
/** sustained overlap ≥ this → cede the turn */
|
|
3816
|
+
overlapSustainMs = 350;
|
|
3817
|
+
/** quiet for this long while paused → resume, drop the interjection */
|
|
3818
|
+
overlapResumeMs = 700;
|
|
3819
|
+
/** energy floor for "overlap candidate" — must sit ABOVE typical room ambient (~110 rms measured;
|
|
3820
|
+
* ungated ambient re-arming the resume timer forever was a live wedge). User speech ≫ 300. */
|
|
3821
|
+
overlapRms = 300;
|
|
3808
3822
|
};
|
|
3809
3823
|
var VoiceEngine = class {
|
|
3810
3824
|
options;
|
|
@@ -3835,6 +3849,13 @@ var VoiceEngine = class {
|
|
|
3835
3849
|
// endpointed text held for the merge window
|
|
3836
3850
|
pendingTimer = null;
|
|
3837
3851
|
lastInterrupted = null;
|
|
3852
|
+
// overlap (pause) tier state — AEC + pause-capable sinks only
|
|
3853
|
+
pausedAt = 0;
|
|
3854
|
+
overlapLoud = 0;
|
|
3855
|
+
// loud chunks since pause (sustain must be real sound, not two clicks)
|
|
3856
|
+
overlapLastLoudAt = 0;
|
|
3857
|
+
// continuity guard: a gap re-arms the onset (sparse noise ≠ sustained speech)
|
|
3858
|
+
resumeTimer = null;
|
|
3838
3859
|
constructor(options) {
|
|
3839
3860
|
this.options = { ...new VoiceEngineOptions(), ...options };
|
|
3840
3861
|
const o = this.options;
|
|
@@ -3882,6 +3903,7 @@ var VoiceEngine = class {
|
|
|
3882
3903
|
this.drainTimer = null;
|
|
3883
3904
|
}
|
|
3884
3905
|
this.interrupted = false;
|
|
3906
|
+
this.resetOverlap(true);
|
|
3885
3907
|
if (!this.speaking) this.player.markTurn();
|
|
3886
3908
|
this.speaking = true;
|
|
3887
3909
|
this.ctxOpen = true;
|
|
@@ -3916,6 +3938,10 @@ var VoiceEngine = class {
|
|
|
3916
3938
|
this.drainTimer = null;
|
|
3917
3939
|
return;
|
|
3918
3940
|
}
|
|
3941
|
+
if (this.pausedAt) {
|
|
3942
|
+
this.drainTimer = setTimeout(settle, 250);
|
|
3943
|
+
return;
|
|
3944
|
+
}
|
|
3919
3945
|
this.drainTimer = null;
|
|
3920
3946
|
this.speaking = false;
|
|
3921
3947
|
this.echoUntil = now() + 2500;
|
|
@@ -3947,6 +3973,7 @@ var VoiceEngine = class {
|
|
|
3947
3973
|
clearTimeout(this.drainTimer);
|
|
3948
3974
|
this.drainTimer = null;
|
|
3949
3975
|
}
|
|
3976
|
+
this.resetOverlap(false);
|
|
3950
3977
|
const heardChars = Math.round(Math.max(0, this.player.playedMs()) / 1e3 * 15);
|
|
3951
3978
|
if (this.reply) this.lastInterrupted = { full: this.reply, heard: this.reply.slice(0, heardChars) };
|
|
3952
3979
|
this.speaking = false;
|
|
@@ -3961,6 +3988,7 @@ var VoiceEngine = class {
|
|
|
3961
3988
|
this.setState("listening");
|
|
3962
3989
|
}
|
|
3963
3990
|
stop() {
|
|
3991
|
+
if (this.resumeTimer) clearTimeout(this.resumeTimer);
|
|
3964
3992
|
if (this.pendingTimer) clearTimeout(this.pendingTimer);
|
|
3965
3993
|
if (this.drainTimer) clearTimeout(this.drainTimer);
|
|
3966
3994
|
this.stt.stop();
|
|
@@ -3978,9 +4006,19 @@ var VoiceEngine = class {
|
|
|
3978
4006
|
echoActive() {
|
|
3979
4007
|
return this.speaking || now() < this.echoUntil;
|
|
3980
4008
|
}
|
|
4009
|
+
/** Genuine user speech vs our own bleed (AEC tier): novel words must DOMINATE, not merely exist.
|
|
4010
|
+
* Degraded AEC + an STT mis-hearing manufactures a single novel word out of pure echo (a name or
|
|
4011
|
+
* rare word in our own reply comes back transcribed slightly differently — 1 novel / N words).
|
|
4012
|
+
* A real interjection is mostly novel ("stop", "wait what") — short utterances pass on ratio,
|
|
4013
|
+
* longer ones on count. */
|
|
4014
|
+
genuine(text) {
|
|
4015
|
+
const total = this.words(text).length;
|
|
4016
|
+
const novel = this.novelWords(text).length;
|
|
4017
|
+
return novel > 0 && novel / Math.max(1, total) > 0.5;
|
|
4018
|
+
}
|
|
3981
4019
|
handlePartial(text) {
|
|
3982
4020
|
if (this.speaking) {
|
|
3983
|
-
const barge = this.
|
|
4021
|
+
const barge = this.overlapCapable ? false : this.usingAec ? this.genuine(text) : this.novelWords(text).length >= (this.suspectUntil ? 1 : 2);
|
|
3984
4022
|
if (barge) {
|
|
3985
4023
|
const phase = this.ctxOpen ? "speaking" : "drain";
|
|
3986
4024
|
this.interrupt();
|
|
@@ -3994,10 +4032,14 @@ var VoiceEngine = class {
|
|
|
3994
4032
|
this.pendingTimer = null;
|
|
3995
4033
|
}
|
|
3996
4034
|
}
|
|
3997
|
-
if (!this.echoActive() || this.novelWords(text).length >= 1) this.options.onPartial(text);
|
|
4035
|
+
if (!this.echoActive() || (this.usingAec ? this.genuine(text) : this.novelWords(text).length >= 1)) this.options.onPartial(text);
|
|
3998
4036
|
}
|
|
3999
4037
|
handleUtterance(text) {
|
|
4000
|
-
if (this.
|
|
4038
|
+
if (this.speaking && this.ctxOpen && this.overlapCapable) {
|
|
4039
|
+
this.stt.reset();
|
|
4040
|
+
return;
|
|
4041
|
+
}
|
|
4042
|
+
if (this.echoActive() && (this.usingAec ? !this.genuine(text) : this.novelWords(text).length < 2)) {
|
|
4001
4043
|
this.stt.reset();
|
|
4002
4044
|
return;
|
|
4003
4045
|
}
|
|
@@ -4020,9 +4062,62 @@ var VoiceEngine = class {
|
|
|
4020
4062
|
this.pendingUtt = "";
|
|
4021
4063
|
if (text) this.options.onUtterance(text);
|
|
4022
4064
|
}
|
|
4065
|
+
get overlapCapable() {
|
|
4066
|
+
return this.usingAec && this.options.overlapPause && !!this.player.pause && !!this.player.resume;
|
|
4067
|
+
}
|
|
4068
|
+
/** Overlap turn-taking (AEC tier): onset → pause (exact-sample hold); sustained → cede; died out
|
|
4069
|
+
* → resume. No vocabulary anywhere — duration and persistence decide (backchannels are short
|
|
4070
|
+
* and stop). Nothing is lost across a pause, so a false positive costs only a brief hold. */
|
|
4071
|
+
handleOverlap(rms) {
|
|
4072
|
+
const o = this.options;
|
|
4073
|
+
if (!this.speaking || !this.overlapCapable) return;
|
|
4074
|
+
if (rms < o.overlapRms) return;
|
|
4075
|
+
const t = now();
|
|
4076
|
+
if (!this.pausedAt) {
|
|
4077
|
+
this.pausedAt = t;
|
|
4078
|
+
this.overlapLoud = 1;
|
|
4079
|
+
this.overlapLastLoudAt = t;
|
|
4080
|
+
this.player.pause();
|
|
4081
|
+
this.armResume();
|
|
4082
|
+
return;
|
|
4083
|
+
}
|
|
4084
|
+
if (t - this.overlapLastLoudAt > 300) {
|
|
4085
|
+
this.pausedAt = t;
|
|
4086
|
+
this.overlapLoud = 1;
|
|
4087
|
+
this.overlapLastLoudAt = t;
|
|
4088
|
+
this.armResume();
|
|
4089
|
+
return;
|
|
4090
|
+
}
|
|
4091
|
+
this.overlapLastLoudAt = t;
|
|
4092
|
+
this.overlapLoud++;
|
|
4093
|
+
if (t - this.pausedAt >= o.overlapSustainMs && this.overlapLoud >= 4) {
|
|
4094
|
+
const phase = this.ctxOpen ? "speaking" : "drain";
|
|
4095
|
+
this.interrupt();
|
|
4096
|
+
this.options.onBargeIn(phase);
|
|
4097
|
+
return;
|
|
4098
|
+
}
|
|
4099
|
+
this.armResume();
|
|
4100
|
+
}
|
|
4101
|
+
armResume() {
|
|
4102
|
+
if (this.resumeTimer) clearTimeout(this.resumeTimer);
|
|
4103
|
+
this.resumeTimer = setTimeout(() => {
|
|
4104
|
+
this.resumeTimer = null;
|
|
4105
|
+
if (!this.pausedAt) return;
|
|
4106
|
+
this.resetOverlap(true);
|
|
4107
|
+
}, this.options.overlapResumeMs);
|
|
4108
|
+
}
|
|
4109
|
+
resetOverlap(resume) {
|
|
4110
|
+
if (this.resumeTimer) {
|
|
4111
|
+
clearTimeout(this.resumeTimer);
|
|
4112
|
+
this.resumeTimer = null;
|
|
4113
|
+
}
|
|
4114
|
+
if (this.pausedAt && resume) this.player.resume?.();
|
|
4115
|
+
this.pausedAt = 0;
|
|
4116
|
+
this.overlapLoud = 0;
|
|
4117
|
+
}
|
|
4023
4118
|
/** energy two-stage barge-in (heuristic tier only): spike over echo baseline → pause + confirm via STT */
|
|
4024
4119
|
handleLevel(rms) {
|
|
4025
|
-
if (this.usingAec) return;
|
|
4120
|
+
if (this.usingAec) return this.handleOverlap(rms);
|
|
4026
4121
|
if (!this.speaking) {
|
|
4027
4122
|
this.baseline = 0;
|
|
4028
4123
|
this.hot = 0;
|
|
@@ -4972,6 +5067,102 @@ var NodeMicSource = class {
|
|
|
4972
5067
|
}, 500).unref?.();
|
|
4973
5068
|
}
|
|
4974
5069
|
};
|
|
5070
|
+
/**
 * Duplex audio endpoint backed by a single helper process (`bin`). Data read from the child's
 * stdout is handed to the capture callback; playback audio is written to the child's stdin as
 * frames. One instance serves as both the audio source (start/stop) and the audio sink
 * (markTurn/write/playedMs/drainMs/kill/pause/resume).
 *
 * stdin wire format: a 4-byte little-endian unsigned length header, followed by that many payload
 * bytes. Reserved header values carry no payload:
 *   0          → flush (sent by markTurn() and kill())
 *   0xFFFFFFFF → pause
 *   0xFFFFFFFE → resume
 */
var AecDuplexAudio = class {
  /** path of the helper executable passed to spawn */
  bin;
  /** NOTE(review): presumably read by the engine to select the AEC tier — confirm against caller */
  aec = true;
  /** child process handle; null until start() and again after stop() */
  proc = null;
  /** set by stop() so an expected exit is not logged as an error */
  stopped = false;
  /** payload bytes written since the last markTurn()/kill() */
  bytesWritten = 0;
  /** timestamp of the first write of the current turn; 0 = nothing written yet */
  startedAt = 0;
  /** timestamp at which the current pause began; 0 = not paused */
  pausedSince = 0;
  /** total of completed pauses this turn, in ms */
  pausedAccum = 0;

  /** @param {string} bin path of the helper executable */
  constructor(bin) {
    this.bin = bin;
  }

  // --- AudioSource ---

  /**
   * Spawn the helper and forward every stdout chunk to `onChunk`.
   * @param {(chunk: Buffer) => void} onChunk receives captured audio chunks
   */
  start(onChunk) {
    this.proc = spawn2(this.bin, [], { stdio: ["pipe", "pipe", "ignore"] });
    // A failed spawn (missing / non-executable binary) is reported through the child's 'error'
    // event; with no listener that event is thrown and takes the whole process down.
    this.proc.on("error", (err) => {
      if (!this.stopped) log12.error(`aec duplex audio failed to start: ${err?.message ?? err}`);
    });
    // Best-effort sink: stdin write errors are ignored on purpose
    // (presumably EPIPE when the helper exits while frames are in flight).
    this.proc.stdin.on("error", () => {
    });
    this.proc.on("exit", (c) => {
      if (c && !this.stopped) log12.error(`aec duplex audio exited (${c}) \u2014 check mic permission / MIC_AEC=0`);
    });
    this.proc.stdout.on("data", (chunk) => onChunk(chunk));
  }

  /** Terminate the helper: SIGTERM now, SIGKILL after 500 ms as a fallback. */
  stop() {
    this.stopped = true;
    const p = this.proc;
    this.proc = null;
    if (!p) return;
    p.kill("SIGTERM");
    setTimeout(() => {
      try {
        p.kill("SIGKILL");
      } catch {
        // already gone — nothing left to kill
      }
    }, 500).unref?.();
  }

  // --- AudioSink (frame writer; same played/drain byte-math as the ffplay Player) ---

  /**
   * Write one frame to the helper's stdin. A null/empty payload produces a bare zero-length
   * header, i.e. the flush frame. Silently does nothing when the helper is not running.
   * @param {Buffer|null} payload audio bytes, or null for a flush
   */
  frame(payload) {
    const stdin = this.proc?.stdin;
    if (!stdin || stdin.destroyed) return;
    const hdr = Buffer.alloc(4);
    hdr.writeUInt32LE(payload ? payload.length : 0);
    stdin.write(hdr);
    if (payload?.length) stdin.write(payload);
  }

  /** Send a flush frame and zero the per-turn byte/time/pause accounting. */
  flushTurn() {
    this.frame(null);
    this.bytesWritten = 0;
    this.startedAt = 0;
    // NOTE(review): no resume frame is sent here even if a pause is active — presumably the
    // helper clears its paused state on flush; confirm against the helper's protocol.
    this.pausedSince = 0;
    this.pausedAccum = 0;
  }

  /** Start of a new turn: flush and reset accounting. */
  markTurn() {
    this.flushTurn();
  }

  /**
   * Queue playback audio for the current turn.
   * @param {Buffer} chunk audio bytes; empty or missing chunks are ignored
   */
  write(chunk) {
    // A zero-length header IS the flush frame, so an empty chunk must never reach frame():
    // it would wipe queued playback and start the played/drain clock with nothing queued.
    if (!chunk?.length) return;
    if (!this.startedAt) this.startedAt = now4();
    this.bytesWritten += chunk.length;
    this.frame(chunk);
  }

  /** @returns {number} ms elapsed since the first write of this turn, excluding paused time; 0 before any write */
  playedMs() {
    return this.startedAt ? now4() - this.startedAt - this.pausedMs() : 0;
  }

  /** @returns {number} estimated ms of queued audio not yet played (never negative) */
  drainMs() {
    if (!this.startedAt) return 0;
    // bytes → ms at TTS_SAMPLE_RATE with 2 bytes per sample (presumably 16-bit mono PCM)
    const queuedMs = this.bytesWritten / (TTS_SAMPLE_RATE * 2) * 1e3;
    return Math.max(0, queuedMs - this.playedMs());
  }

  /** barge-in: silence NOW (in-band flush) — the capture side keeps running */
  kill() {
    this.flushTurn();
  }

  /**
   * Write a payload-less control header to the helper's stdin.
   * @param {number} code reserved 32-bit header value
   */
  ctl(code) {
    const stdin = this.proc?.stdin;
    if (!stdin || stdin.destroyed) return;
    const f = Buffer.alloc(4);
    f.writeUInt32LE(code, 0);
    stdin.write(f);
  }

  /** overlap trail-off: exact-sample PAUSE (len==0xFFFFFFFF); no-op when already paused */
  pause() {
    if (this.pausedSince) return;
    this.pausedSince = now4();
    this.ctl(4294967295);
  }

  /** overlap trail-off: RESUME (len==0xFFFFFFFE); no-op when not paused */
  resume() {
    if (!this.pausedSince) return;
    this.pausedAccum += now4() - this.pausedSince;
    this.pausedSince = 0;
    this.ctl(4294967294);
  }

  /** total paused time this turn — excluded from played/drain math (the tape held still) */
  pausedMs() {
    return this.pausedAccum + (this.pausedSince ? now4() - this.pausedSince : 0);
  }
};
|
|
4975
5166
|
var VoiceIOOptions = class extends VoiceEngineOptions {
|
|
4976
5167
|
sonioxApiKey = process.env.SONIOX_API_KEY ?? "";
|
|
4977
5168
|
cartesiaApiKey = process.env.CARTESIA_API_KEY ?? "";
|
|
@@ -4980,11 +5171,13 @@ var VoiceIOOptions = class extends VoiceEngineOptions {
|
|
|
4980
5171
|
var VoiceIO = class extends VoiceEngine {
|
|
4981
5172
|
constructor(options) {
|
|
4982
5173
|
const o = { ...new VoiceIOOptions(), ...options };
|
|
5174
|
+
const bin = !o.stt || !o.player ? resolveAecBinary() : null;
|
|
5175
|
+
const duplex = bin ? new AecDuplexAudio(bin) : null;
|
|
4983
5176
|
super({
|
|
4984
5177
|
...o,
|
|
4985
|
-
stt: o.stt ?? new SonioxSTT({ auth: o.sonioxApiKey, source: new NodeMicSource() }),
|
|
5178
|
+
stt: o.stt ?? new SonioxSTT({ auth: o.sonioxApiKey, source: duplex ?? new NodeMicSource() }),
|
|
4986
5179
|
tts: o.tts ?? new CartesiaTTS({ auth: o.cartesiaApiKey, voiceId: o.cartesiaVoiceId }),
|
|
4987
|
-
player: o.player ?? new Player(),
|
|
5180
|
+
player: o.player ?? duplex ?? new Player(),
|
|
4988
5181
|
bargeRmsMult: Number(process.env.BARGE_RMS_MULT || o.bargeRmsMult),
|
|
4989
5182
|
bargeRmsFloor: Number(process.env.BARGE_RMS_FLOOR || o.bargeRmsFloor)
|
|
4990
5183
|
});
|