agent.libx.js 0.92.6 → 0.92.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +188 -6
- package/dist/cli.js.map +1 -1
- package/dist/index.d.ts +28 -0
- package/dist/index.js +88 -4
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli.js
CHANGED
|
@@ -3552,12 +3552,14 @@ ${recent}` : brief;
|
|
|
3552
3552
|
report.output(chunk);
|
|
3553
3553
|
}
|
|
3554
3554
|
} : base;
|
|
3555
|
+
const workerHost = o.host?.ask ? { ask: (q2) => o.host.ask(q2) } : void 0;
|
|
3555
3556
|
const worker = new Agent({
|
|
3556
3557
|
ai: o.ai,
|
|
3557
3558
|
fs: o.fs,
|
|
3558
3559
|
model: o.workerModel,
|
|
3559
3560
|
...o.workerOptions,
|
|
3560
3561
|
// may override ai/fs/model/tools/… —
|
|
3562
|
+
...workerHost ? { host: workerHost } : {},
|
|
3561
3563
|
...hooks ? { hooks } : {},
|
|
3562
3564
|
signal: controller.signal
|
|
3563
3565
|
// …but never the per-task cancellation signal
|
|
@@ -3805,6 +3807,18 @@ var VoiceEngineOptions = class {
|
|
|
3805
3807
|
/** heuristic (non-AEC) energy barge-in tuning */
|
|
3806
3808
|
bargeRmsMult = 2;
|
|
3807
3809
|
bargeRmsFloor = 500;
|
|
3810
|
+
/** Overlap turn-taking (AEC tier, needs player.pause/resume) — human phone-call model:
|
|
3811
|
+
* onset → PAUSE (exact-sample hold, nothing lost); sustained overlap → cede (interrupt; the LLM
|
|
3812
|
+
* re-enters). Brief overlaps that die out (backchannels — "mm-hm", decided by DURATION, not
|
|
3813
|
+
* vocabulary) resume from the precise sample and are dropped. false disables. */
|
|
3814
|
+
overlapPause = true;
|
|
3815
|
+
/** sustained overlap ≥ this → cede the turn */
|
|
3816
|
+
overlapSustainMs = 350;
|
|
3817
|
+
/** quiet for this long while paused → resume, drop the interjection */
|
|
3818
|
+
overlapResumeMs = 700;
|
|
3819
|
+
/** energy floor for "overlap candidate" — must sit ABOVE typical room ambient (~110 rms measured;
|
|
3820
|
+
* ungated ambient re-arming the resume timer forever was a live wedge). User speech ≫ 300. */
|
|
3821
|
+
overlapRms = 300;
|
|
3808
3822
|
};
|
|
3809
3823
|
var VoiceEngine = class {
|
|
3810
3824
|
options;
|
|
@@ -3835,6 +3849,13 @@ var VoiceEngine = class {
|
|
|
3835
3849
|
// endpointed text held for the merge window
|
|
3836
3850
|
pendingTimer = null;
|
|
3837
3851
|
lastInterrupted = null;
|
|
3852
|
+
// overlap (pause) tier state — AEC + pause-capable sinks only
|
|
3853
|
+
pausedAt = 0;
|
|
3854
|
+
overlapLoud = 0;
|
|
3855
|
+
// loud chunks since pause (sustain must be real sound, not two clicks)
|
|
3856
|
+
overlapLastLoudAt = 0;
|
|
3857
|
+
// continuity guard: a gap re-arms the onset (sparse noise ≠ sustained speech)
|
|
3858
|
+
resumeTimer = null;
|
|
3838
3859
|
constructor(options) {
|
|
3839
3860
|
this.options = { ...new VoiceEngineOptions(), ...options };
|
|
3840
3861
|
const o = this.options;
|
|
@@ -3882,6 +3903,7 @@ var VoiceEngine = class {
|
|
|
3882
3903
|
this.drainTimer = null;
|
|
3883
3904
|
}
|
|
3884
3905
|
this.interrupted = false;
|
|
3906
|
+
this.resetOverlap(true);
|
|
3885
3907
|
if (!this.speaking) this.player.markTurn();
|
|
3886
3908
|
this.speaking = true;
|
|
3887
3909
|
this.ctxOpen = true;
|
|
@@ -3916,6 +3938,10 @@ var VoiceEngine = class {
|
|
|
3916
3938
|
this.drainTimer = null;
|
|
3917
3939
|
return;
|
|
3918
3940
|
}
|
|
3941
|
+
if (this.pausedAt) {
|
|
3942
|
+
this.drainTimer = setTimeout(settle, 250);
|
|
3943
|
+
return;
|
|
3944
|
+
}
|
|
3919
3945
|
this.drainTimer = null;
|
|
3920
3946
|
this.speaking = false;
|
|
3921
3947
|
this.echoUntil = now() + 2500;
|
|
@@ -3947,6 +3973,7 @@ var VoiceEngine = class {
|
|
|
3947
3973
|
clearTimeout(this.drainTimer);
|
|
3948
3974
|
this.drainTimer = null;
|
|
3949
3975
|
}
|
|
3976
|
+
this.resetOverlap(false);
|
|
3950
3977
|
const heardChars = Math.round(Math.max(0, this.player.playedMs()) / 1e3 * 15);
|
|
3951
3978
|
if (this.reply) this.lastInterrupted = { full: this.reply, heard: this.reply.slice(0, heardChars) };
|
|
3952
3979
|
this.speaking = false;
|
|
@@ -3961,6 +3988,7 @@ var VoiceEngine = class {
|
|
|
3961
3988
|
this.setState("listening");
|
|
3962
3989
|
}
|
|
3963
3990
|
stop() {
|
|
3991
|
+
if (this.resumeTimer) clearTimeout(this.resumeTimer);
|
|
3964
3992
|
if (this.pendingTimer) clearTimeout(this.pendingTimer);
|
|
3965
3993
|
if (this.drainTimer) clearTimeout(this.drainTimer);
|
|
3966
3994
|
this.stt.stop();
|
|
@@ -3986,12 +4014,11 @@ var VoiceEngine = class {
|
|
|
3986
4014
|
genuine(text) {
|
|
3987
4015
|
const total = this.words(text).length;
|
|
3988
4016
|
const novel = this.novelWords(text).length;
|
|
3989
|
-
|
|
3990
|
-
return novel >= 2 || novel / Math.max(1, total) > 0.5;
|
|
4017
|
+
return novel > 0 && novel / Math.max(1, total) > 0.5;
|
|
3991
4018
|
}
|
|
3992
4019
|
handlePartial(text) {
|
|
3993
4020
|
if (this.speaking) {
|
|
3994
|
-
const barge = this.usingAec ? this.genuine(text) : this.novelWords(text).length >= (this.suspectUntil ? 1 : 2);
|
|
4021
|
+
const barge = this.overlapCapable ? false : this.usingAec ? this.genuine(text) : this.novelWords(text).length >= (this.suspectUntil ? 1 : 2);
|
|
3995
4022
|
if (barge) {
|
|
3996
4023
|
const phase = this.ctxOpen ? "speaking" : "drain";
|
|
3997
4024
|
this.interrupt();
|
|
@@ -4008,6 +4035,10 @@ var VoiceEngine = class {
|
|
|
4008
4035
|
if (!this.echoActive() || (this.usingAec ? this.genuine(text) : this.novelWords(text).length >= 1)) this.options.onPartial(text);
|
|
4009
4036
|
}
|
|
4010
4037
|
handleUtterance(text) {
|
|
4038
|
+
if (this.speaking && this.ctxOpen && this.overlapCapable) {
|
|
4039
|
+
this.stt.reset();
|
|
4040
|
+
return;
|
|
4041
|
+
}
|
|
4011
4042
|
if (this.echoActive() && (this.usingAec ? !this.genuine(text) : this.novelWords(text).length < 2)) {
|
|
4012
4043
|
this.stt.reset();
|
|
4013
4044
|
return;
|
|
@@ -4031,9 +4062,62 @@ var VoiceEngine = class {
|
|
|
4031
4062
|
this.pendingUtt = "";
|
|
4032
4063
|
if (text) this.options.onUtterance(text);
|
|
4033
4064
|
}
|
|
4065
|
+
get overlapCapable() {
|
|
4066
|
+
return this.usingAec && this.options.overlapPause && !!this.player.pause && !!this.player.resume;
|
|
4067
|
+
}
|
|
4068
|
+
/** Overlap turn-taking (AEC tier): onset → pause (exact-sample hold); sustained → cede; died out
|
|
4069
|
+
* → resume. No vocabulary anywhere — duration and persistence decide (backchannels are short
|
|
4070
|
+
* and stop). Nothing is lost across a pause, so a false positive costs only a brief hold. */
|
|
4071
|
+
handleOverlap(rms) {
|
|
4072
|
+
const o = this.options;
|
|
4073
|
+
if (!this.speaking || !this.overlapCapable) return;
|
|
4074
|
+
if (rms < o.overlapRms) return;
|
|
4075
|
+
const t = now();
|
|
4076
|
+
if (!this.pausedAt) {
|
|
4077
|
+
this.pausedAt = t;
|
|
4078
|
+
this.overlapLoud = 1;
|
|
4079
|
+
this.overlapLastLoudAt = t;
|
|
4080
|
+
this.player.pause();
|
|
4081
|
+
this.armResume();
|
|
4082
|
+
return;
|
|
4083
|
+
}
|
|
4084
|
+
if (t - this.overlapLastLoudAt > 300) {
|
|
4085
|
+
this.pausedAt = t;
|
|
4086
|
+
this.overlapLoud = 1;
|
|
4087
|
+
this.overlapLastLoudAt = t;
|
|
4088
|
+
this.armResume();
|
|
4089
|
+
return;
|
|
4090
|
+
}
|
|
4091
|
+
this.overlapLastLoudAt = t;
|
|
4092
|
+
this.overlapLoud++;
|
|
4093
|
+
if (t - this.pausedAt >= o.overlapSustainMs && this.overlapLoud >= 4) {
|
|
4094
|
+
const phase = this.ctxOpen ? "speaking" : "drain";
|
|
4095
|
+
this.interrupt();
|
|
4096
|
+
this.options.onBargeIn(phase);
|
|
4097
|
+
return;
|
|
4098
|
+
}
|
|
4099
|
+
this.armResume();
|
|
4100
|
+
}
|
|
4101
|
+
armResume() {
|
|
4102
|
+
if (this.resumeTimer) clearTimeout(this.resumeTimer);
|
|
4103
|
+
this.resumeTimer = setTimeout(() => {
|
|
4104
|
+
this.resumeTimer = null;
|
|
4105
|
+
if (!this.pausedAt) return;
|
|
4106
|
+
this.resetOverlap(true);
|
|
4107
|
+
}, this.options.overlapResumeMs);
|
|
4108
|
+
}
|
|
4109
|
+
resetOverlap(resume) {
|
|
4110
|
+
if (this.resumeTimer) {
|
|
4111
|
+
clearTimeout(this.resumeTimer);
|
|
4112
|
+
this.resumeTimer = null;
|
|
4113
|
+
}
|
|
4114
|
+
if (this.pausedAt && resume) this.player.resume?.();
|
|
4115
|
+
this.pausedAt = 0;
|
|
4116
|
+
this.overlapLoud = 0;
|
|
4117
|
+
}
|
|
4034
4118
|
/** energy two-stage barge-in (heuristic tier only): spike over echo baseline → pause + confirm via STT */
|
|
4035
4119
|
handleLevel(rms) {
|
|
4036
|
-
if (this.usingAec) return;
|
|
4120
|
+
if (this.usingAec) return this.handleOverlap(rms);
|
|
4037
4121
|
if (!this.speaking) {
|
|
4038
4122
|
this.baseline = 0;
|
|
4039
4123
|
this.hot = 0;
|
|
@@ -4983,6 +5067,102 @@ var NodeMicSource = class {
|
|
|
4983
5067
|
}, 500).unref?.();
|
|
4984
5068
|
}
|
|
4985
5069
|
};
|
|
5070
|
+
/**
 * Duplex audio endpoint backed by ONE external helper process: the helper's
 * stdout is the capture stream (AudioSource side) and its stdin receives
 * playback frames (AudioSink side).
 *
 * Wire format written to the helper's stdin (all headers are 4-byte
 * little-endian unsigned integers):
 *   - `len` followed by `len` payload bytes  → audio to play
 *   - `0`                                    → in-band flush (used by markTurn/kill)
 *   - `0xFFFFFFFF`                           → PAUSE
 *   - `0xFFFFFFFE`                           → RESUME
 *
 * Playback position is estimated from wall-clock time since the first write
 * of the turn, minus the time spent paused.
 */
var AecDuplexAudio = class {
  /**
   * @param {string} bin path of the helper executable passed to spawn
   */
  constructor(bin) {
    this.bin = bin;
  }
  bin;
  // Marker flag; presumably read by the voice engine to select the AEC tier — confirm against the consumer.
  aec = true;
  /** the running helper process, or null when not started / stopped */
  proc = null;
  /** set by stop() so an expected exit is not reported as an error */
  stopped = false;
  /** payload bytes queued for playback in the current turn */
  bytesWritten = 0;
  /** timestamp of the first write of the current turn (0 = nothing written yet) */
  startedAt = 0;
  /** timestamp at which the current pause began (0 = not paused) */
  pausedSince = 0;
  /** total completed pause time within the current turn, in ms */
  pausedAccum = 0;

  // --- AudioSource ---

  /**
   * Spawn the helper and forward every stdout chunk to `onChunk`.
   * @param {(chunk: Buffer) => void} onChunk receives captured audio chunks
   */
  start(onChunk) {
    // A restart after stop() must report crashes again.
    this.stopped = false;
    const child = spawn2(this.bin, [], { stdio: ["pipe", "pipe", "ignore"] });
    this.proc = child;
    // Best-effort sink: writes racing the helper's exit (EPIPE) are deliberately ignored.
    child.stdin.on("error", () => {
    });
    // A spawn failure (missing / non-executable binary) is delivered as an
    // 'error' event; without a listener it would be thrown as an uncaught
    // exception and take the whole process down.
    child.on("error", (err) => {
      if (!this.stopped) log12.error(`aec duplex audio failed to start (${err?.message ?? err}) \u2014 check mic permission / MIC_AEC=0`);
    });
    child.on("exit", (c) => {
      // Only report a non-zero exit of the process we still consider current.
      if (c && !this.stopped && this.proc === child) log12.error(`aec duplex audio exited (${c}) \u2014 check mic permission / MIC_AEC=0`);
    });
    child.stdout.on("data", (chunk) => onChunk(chunk));
  }

  /** Terminate the helper: SIGTERM now, SIGKILL after 500 ms as a fallback. */
  stop() {
    this.stopped = true;
    const child = this.proc;
    this.proc = null;
    if (!child) return;
    child.kill("SIGTERM");
    setTimeout(() => {
      try {
        child.kill("SIGKILL");
      } catch {
        // best-effort: the process is usually gone already
      }
    }, 500).unref?.();
  }

  // --- AudioSink (frame writer; same played/drain byte-math as the ffplay Player) ---

  /**
   * Write one length-prefixed frame. A null/empty payload writes only a zero
   * header (the in-band flush). Silently does nothing when the helper is not
   * running.
   * @param {Buffer|null} payload audio bytes, or null for a flush
   */
  frame(payload) {
    const stdin = this.proc?.stdin;
    if (!stdin || stdin.destroyed) return;
    const hdr = Buffer.alloc(4);
    hdr.writeUInt32LE(payload ? payload.length : 0);
    stdin.write(hdr);
    if (payload?.length) stdin.write(payload);
  }

  /** Flush the helper's queue and zero all per-turn accounting. */
  flushAndReset() {
    this.frame(null);
    this.bytesWritten = 0;
    this.startedAt = 0;
    this.pausedSince = 0;
    this.pausedAccum = 0;
  }

  /** Start a fresh turn: flush whatever is queued and reset the counters. */
  markTurn() {
    this.flushAndReset();
  }

  /**
   * Queue audio for playback. The first write of a turn starts the playback
   * clock.
   * @param {Buffer} chunk audio bytes
   */
  write(chunk) {
    if (!this.startedAt) {
      this.startedAt = now4();
      // Pause time that accrued BEFORE the playback clock started must not be
      // subtracted from it: otherwise playedMs() under-reports (or goes
      // negative) and drainMs() over-reports by the pre-start pause duration.
      this.pausedAccum = 0;
      if (this.pausedSince) this.pausedSince = this.startedAt;
    }
    this.bytesWritten += chunk.length;
    this.frame(chunk);
  }

  /** @returns {number} estimated ms of audio played this turn (paused time excluded) */
  playedMs() {
    return this.startedAt ? now4() - this.startedAt - this.pausedMs() : 0;
  }

  /** @returns {number} estimated ms of queued audio not yet played (never negative) */
  drainMs() {
    if (!this.startedAt) return 0;
    // 2 bytes per sample at TTS_SAMPLE_RATE — presumably 16-bit mono PCM; confirm against the TTS output format.
    const queuedMs = this.bytesWritten / (TTS_SAMPLE_RATE * 2) * 1e3;
    return Math.max(0, queuedMs - (now4() - this.startedAt - this.pausedMs()));
  }

  /** barge-in: silence NOW (in-band flush) — the capture side keeps running */
  kill() {
    this.flushAndReset();
  }

  /**
   * Write a bare 4-byte control header (no payload).
   * @param {number} code unsigned 32-bit control code
   */
  ctl(code) {
    const stdin = this.proc?.stdin;
    if (!stdin || stdin.destroyed) return;
    const f = Buffer.alloc(4);
    f.writeUInt32LE(code, 0);
    stdin.write(f);
  }

  /** overlap trail-off: exact-sample PAUSE (len==0xFFFFFFFF); no-op when already paused */
  pause() {
    if (this.pausedSince) return;
    this.pausedSince = now4();
    this.ctl(0xffffffff);
  }

  /** overlap trail-off: RESUME (len==0xFFFFFFFE); no-op when not paused */
  resume() {
    if (!this.pausedSince) return;
    this.pausedAccum += now4() - this.pausedSince;
    this.pausedSince = 0;
    this.ctl(0xfffffffe);
  }

  /** total paused time this turn — excluded from played/drain math (the tape held still) */
  pausedMs() {
    return this.pausedAccum + (this.pausedSince ? now4() - this.pausedSince : 0);
  }
};
|
|
4986
5166
|
var VoiceIOOptions = class extends VoiceEngineOptions {
|
|
4987
5167
|
sonioxApiKey = process.env.SONIOX_API_KEY ?? "";
|
|
4988
5168
|
cartesiaApiKey = process.env.CARTESIA_API_KEY ?? "";
|
|
@@ -4991,11 +5171,13 @@ var VoiceIOOptions = class extends VoiceEngineOptions {
|
|
|
4991
5171
|
var VoiceIO = class extends VoiceEngine {
|
|
4992
5172
|
constructor(options) {
|
|
4993
5173
|
const o = { ...new VoiceIOOptions(), ...options };
|
|
5174
|
+
const bin = !o.stt || !o.player ? resolveAecBinary() : null;
|
|
5175
|
+
const duplex = bin ? new AecDuplexAudio(bin) : null;
|
|
4994
5176
|
super({
|
|
4995
5177
|
...o,
|
|
4996
|
-
stt: o.stt ?? new SonioxSTT({ auth: o.sonioxApiKey, source: new NodeMicSource() }),
|
|
5178
|
+
stt: o.stt ?? new SonioxSTT({ auth: o.sonioxApiKey, source: duplex ?? new NodeMicSource() }),
|
|
4997
5179
|
tts: o.tts ?? new CartesiaTTS({ auth: o.cartesiaApiKey, voiceId: o.cartesiaVoiceId }),
|
|
4998
|
-
player: o.player ?? new Player(),
|
|
5180
|
+
player: o.player ?? duplex ?? new Player(),
|
|
4999
5181
|
bargeRmsMult: Number(process.env.BARGE_RMS_MULT || o.bargeRmsMult),
|
|
5000
5182
|
bargeRmsFloor: Number(process.env.BARGE_RMS_FLOOR || o.bargeRmsFloor)
|
|
5001
5183
|
});
|