agent.libx.js 0.92.6 → 0.92.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cli/cli.ts +18 -3
- package/dist/cli.js +266 -10
- package/dist/cli.js.map +1 -1
- package/dist/index.d.ts +46 -0
- package/dist/index.js +150 -6
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.ts
CHANGED
|
@@ -602,6 +602,12 @@ declare class DuplexAgentOptions {
|
|
|
602
602
|
progressUpdates: boolean;
|
|
603
603
|
/** Min ms between progress re-voices per task. */
|
|
604
604
|
progressIntervalMs: number;
|
|
605
|
+
/** Relay worker questions (AskUserQuestion + permission asks via parkQuestion) through the VOICE:
|
|
606
|
+
* the question re-voices as '[task <id> asks] …', the user answers conversationally, and the
|
|
607
|
+
* voice model resolves it with the AnswerTask tool. Off → host.ask passthrough (text menus). */
|
|
608
|
+
askRelay: boolean;
|
|
609
|
+
/** Parked questions auto-resolve empty after this long (callers map '' to deny/best-judgment). */
|
|
610
|
+
askTimeoutMs: number;
|
|
605
611
|
/** Host overrides for QuickLook lookups (keyed by `what`). The engine's defaults go through the
|
|
606
612
|
* (possibly jailed) fs — e.g. `.git/**` is deny-listed, so the CLI supplies 'branch' itself. */
|
|
607
613
|
quickLook?: Record<string, (path?: string) => string | Promise<string>>;
|
|
@@ -621,6 +627,11 @@ declare class DuplexAgent {
|
|
|
621
627
|
private seq;
|
|
622
628
|
private pendingEvents;
|
|
623
629
|
private flushQueued;
|
|
630
|
+
/** Parked worker questions awaiting a (voice-relayed) user answer, keyed by ask id. */
|
|
631
|
+
readonly pendingAsks: Map<string, {
|
|
632
|
+
question: string;
|
|
633
|
+
resolve: (answer: string) => void;
|
|
634
|
+
}>;
|
|
624
635
|
constructor(options?: Partial<DuplexAgentOptions>);
|
|
625
636
|
/** One user turn: the voice agent streams the reply (and may Delegate). Serialized with re-voice turns. */
|
|
626
637
|
send(content: MessageContent): Promise<RunResult>;
|
|
@@ -640,6 +651,12 @@ declare class DuplexAgent {
|
|
|
640
651
|
* (pre records the in-flight call; a self-cleaning timer narrates "still inside Bash — 70s").
|
|
641
652
|
* Completion supersedes: nothing is emitted once the task has settled. */
|
|
642
653
|
private progressReporter;
|
|
654
|
+
/** Park a question under `askId` (a task id, or any unique key for permission asks): re-voices
|
|
655
|
+
* '[task <id> asks] …' and resolves with the user's answer via AnswerTask — or '' on timeout/
|
|
656
|
+
* task settle (callers map '' to deny / best-judgment). Workers never block forever. */
|
|
657
|
+
parkQuestion(askId: string, question: string): Promise<string>;
|
|
658
|
+
/** Resolve any question a settling/cancelled task left parked (its answer can no longer matter). */
|
|
659
|
+
private dropAsk;
|
|
643
660
|
private onWorkerSettled;
|
|
644
661
|
private onWorkerFailed;
|
|
645
662
|
private failTask;
|
|
@@ -649,6 +666,7 @@ declare class DuplexAgent {
|
|
|
649
666
|
* fs-only (no shell; the engine is VFS-abstracted): time, git branch (.git/HEAD read), ls, file
|
|
650
667
|
* head. Output is hard-capped so a lookup can never bloat the skinny voice context. */
|
|
651
668
|
private quickLookTool;
|
|
669
|
+
private answerTaskTool;
|
|
652
670
|
private cancelTaskTool;
|
|
653
671
|
}
|
|
654
672
|
|
|
@@ -745,6 +763,11 @@ interface AudioSink {
|
|
|
745
763
|
playedMs(): number;
|
|
746
764
|
/** stop playback NOW (barge-in primitive) */
|
|
747
765
|
kill(): void;
|
|
766
|
+
/** optional exact-sample pause/resume — enables the overlap trail-off tier (web: AudioContext
|
|
767
|
+
* suspend/resume; CLI AEC helper: control frames). Sinks without it degrade to interrupt-only
|
|
768
|
+
* turn-taking. Nothing is lost across a pause; playedMs/drainMs must exclude paused time. */
|
|
769
|
+
pause?(): void;
|
|
770
|
+
resume?(): void;
|
|
748
771
|
}
|
|
749
772
|
/** Static key (server/CLI) or an async getter (browser: fetch a short-lived token from YOUR
|
|
750
773
|
* backend). Getters are invoked on EVERY (re)connect — temp tokens expire, so a reconnect
|
|
@@ -794,6 +817,18 @@ declare class VoiceEngineOptions {
|
|
|
794
817
|
/** heuristic (non-AEC) energy barge-in tuning */
|
|
795
818
|
bargeRmsMult: number;
|
|
796
819
|
bargeRmsFloor: number;
|
|
820
|
+
/** Overlap turn-taking (AEC tier, needs player.pause/resume) — human phone-call model:
|
|
821
|
+
* onset → PAUSE (exact-sample hold, nothing lost); sustained overlap → cede (interrupt; the LLM
|
|
822
|
+
* re-enters). Brief overlaps that die out (backchannels — "mm-hm", decided by DURATION, not
|
|
823
|
+
* vocabulary) resume from the precise sample and are dropped. false disables. */
|
|
824
|
+
overlapPause: boolean;
|
|
825
|
+
/** sustained overlap ≥ this → cede the turn */
|
|
826
|
+
overlapSustainMs: number;
|
|
827
|
+
/** quiet for this long while paused → resume, drop the interjection */
|
|
828
|
+
overlapResumeMs: number;
|
|
829
|
+
/** energy floor for "overlap candidate" — must sit ABOVE typical room ambient (~110 rms measured;
|
|
830
|
+
* ungated ambient re-arming the resume timer forever was a live wedge). User speech ≫ 300. */
|
|
831
|
+
overlapRms: number;
|
|
797
832
|
}
|
|
798
833
|
declare class VoiceEngine {
|
|
799
834
|
options: VoiceEngineOptions;
|
|
@@ -817,6 +852,10 @@ declare class VoiceEngine {
|
|
|
817
852
|
private pendingUtt;
|
|
818
853
|
private pendingTimer;
|
|
819
854
|
private lastInterrupted;
|
|
855
|
+
private pausedAt;
|
|
856
|
+
private overlapLoud;
|
|
857
|
+
private overlapLastLoudAt;
|
|
858
|
+
private resumeTimer;
|
|
820
859
|
constructor(options?: Partial<VoiceEngineOptions>);
|
|
821
860
|
start(): Promise<void>;
|
|
822
861
|
get usingAec(): boolean;
|
|
@@ -852,6 +891,13 @@ declare class VoiceEngine {
|
|
|
852
891
|
private handlePartial;
|
|
853
892
|
private handleUtterance;
|
|
854
893
|
private flushUtterance;
|
|
894
|
+
private get overlapCapable();
|
|
895
|
+
/** Overlap turn-taking (AEC tier): onset → pause (exact-sample hold); sustained → cede; died out
|
|
896
|
+
* → resume. No vocabulary anywhere — duration and persistence decide (backchannels are short
|
|
897
|
+
* and stop). Nothing is lost across a pause, so a false positive costs only a brief hold. */
|
|
898
|
+
private handleOverlap;
|
|
899
|
+
private armResume;
|
|
900
|
+
private resetOverlap;
|
|
855
901
|
/** energy two-stage barge-in (heuristic tier only): spike over echo baseline → pause + confirm via STT */
|
|
856
902
|
private handleLevel;
|
|
857
903
|
}
|
package/dist/index.js
CHANGED
|
@@ -3545,11 +3545,17 @@ var DuplexAgentOptions = class {
|
|
|
3545
3545
|
progressUpdates = false;
|
|
3546
3546
|
/** Min ms between progress re-voices per task. */
|
|
3547
3547
|
progressIntervalMs = 25e3;
|
|
3548
|
+
/** Relay worker questions (AskUserQuestion + permission asks via parkQuestion) through the VOICE:
|
|
3549
|
+
* the question re-voices as '[task <id> asks] …', the user answers conversationally, and the
|
|
3550
|
+
* voice model resolves it with the AnswerTask tool. Off → host.ask passthrough (text menus). */
|
|
3551
|
+
askRelay = false;
|
|
3552
|
+
/** Parked questions auto-resolve empty after this long (callers map '' to deny/best-judgment). */
|
|
3553
|
+
askTimeoutMs = 12e4;
|
|
3548
3554
|
/** Host overrides for QuickLook lookups (keyed by `what`). The engine's defaults go through the
|
|
3549
3555
|
* (possibly jailed) fs — e.g. `.git/**` is deny-listed, so the CLI supplies 'branch' itself. */
|
|
3550
3556
|
quickLook;
|
|
3551
3557
|
};
|
|
3552
|
-
var VOICE_SYSTEM_PROMPT = 'You are a spoken voice assistant \u2014 the user HEARS everything you say. Use short sentences. One idea per sentence. No markdown, no bullet lists, no code blocks, no headings, no emoji.\nKeep turns SHORT \u2014 one to three sentences, then stop. Never lecture, enumerate cases, or add caveats unprompted. Conversation is a fast exchange: give the one thing asked, and let the user pull more if they want it.\nYou work in a pair: you talk, and a background worker with FULL access to the user\'s environment (files, shell, web) does the hands-on work. You can find out or do ANYTHING by calling `Delegate` with a clear, self-contained brief \u2014 so NEVER tell the user you can\'t see, access, or do something. Delegate and find out. When the user mentions their project, folder, files, or environment ("this project", "the current folder", "my code"), delegate IMMEDIATELY \u2014 do not ask for paths or details the worker can discover itself. Never pretend to have done the work or invent results \u2014 the worker\'s report is your only source.\nAfter calling Delegate, tell the user you are on it in one short sentence, then end your turn. Do not wait for the result.\nResults arrive later as events like "[task t1 completed] \u2026" or "[task t1 failed] \u2026". When one arrives, summarize it for the ear in one or two short sentences. "[task t1 progress] \u2026" events are interim status, NOT results \u2014 give at most a half-sentence aside ("still on it \u2014 running tests now") and end your turn. Never present progress as a finished result.\nNever read raw file paths, diffs, or code aloud verbatim.\nDo not fire a second Delegate for work already in flight \u2014 check `TaskStatus` first. Use `CancelTask` when the user asks to stop something.\nPRIORITY: when the user says goodbye or wants to end/finish/wrap up the session ("ok bye", "that\'s all", "let\'s finish", "let\'s end", "goodnight", "exit", "wrap up"), call `ExitSession` IMMEDIATELY \u2014 do not delegate, do not check status, just exit.\nFor TRIVIAL instant lookups only \u2014 current time, git branch, listing a folder, peeking at a small file \u2014 use `QuickLook` (instant, no task). Anything requiring searching, reasoning, running commands, or editing still goes through `Delegate`.\nNEVER claim to have stored, saved, or remembered something durably \u2014 you cannot. Anything the user wants persisted (their name, preferences, notes) must be Delegated so a worker writes it to memory.\nUser messages may arrive via speech-to-text and can carry transcription artifacts \u2014 odd words, cut-offs, homophones ("for you" vs "folder"). Read for INTENT, not surface text. If a message seems garbled or surprising, briefly confirm what they meant ("did you mean\u2026?") instead of answering the literal words.';
|
|
3558
|
+
var VOICE_SYSTEM_PROMPT = 'You are a spoken voice assistant \u2014 the user HEARS everything you say. Use short sentences. One idea per sentence. No markdown, no bullet lists, no code blocks, no headings, no emoji.\nKeep turns SHORT \u2014 one to three sentences, then stop. Never lecture, enumerate cases, or add caveats unprompted. Conversation is a fast exchange: give the one thing asked, and let the user pull more if they want it.\nYou work in a pair: you talk, and a background worker with FULL access to the user\'s environment (files, shell, web) does the hands-on work. You can find out or do ANYTHING by calling `Delegate` with a clear, self-contained brief \u2014 so NEVER tell the user you can\'t see, access, or do something. Delegate and find out. When the user mentions their project, folder, files, or environment ("this project", "the current folder", "my code"), delegate IMMEDIATELY \u2014 do not ask for paths or details the worker can discover itself. Never pretend to have done the work or invent results \u2014 the worker\'s report is your only source.\nAfter calling Delegate, tell the user you are on it in one short sentence, then end your turn. Do not wait for the result.\nResults arrive later as events like "[task t1 completed] \u2026" or "[task t1 failed] \u2026". When one arrives, summarize it for the ear in one or two short sentences. "[task t1 progress] \u2026" events are interim status, NOT results \u2014 give at most a half-sentence aside ("still on it \u2014 running tests now") and end your turn. Never present progress as a finished result.\nNever read raw file paths, diffs, or code aloud verbatim.\n"[task t1 asks] \u2026" events are QUESTIONS from a background task \u2014 relay to the user in your own words, short, then end your turn. When the user answers, call `AnswerTask` with that id and their answer. NEVER answer on the user\'s behalf for permissions or risky operations; if their reply is ambiguous, confirm first.\nDo not fire a second Delegate for work already in flight \u2014 check `TaskStatus` first. Use `CancelTask` when the user asks to stop something.\nPRIORITY: when the user says goodbye or wants to end/finish/wrap up the session ("ok bye", "that\'s all", "let\'s finish", "let\'s end", "goodnight", "exit", "wrap up"), call `ExitSession` IMMEDIATELY \u2014 do not delegate, do not check status, just exit.\nFor TRIVIAL instant lookups only \u2014 current time, git branch, listing a folder, peeking at a small file \u2014 use `QuickLook` (instant, no task). Anything requiring searching, reasoning, running commands, or editing still goes through `Delegate`.\nNEVER claim to have stored, saved, or remembered something durably \u2014 you cannot. Anything the user wants persisted (their name, preferences, notes) must be Delegated so a worker writes it to memory.\nUser messages may arrive via speech-to-text and can carry transcription artifacts \u2014 odd words, cut-offs, homophones ("for you" vs "folder"). Read for INTENT, not surface text. If a message seems garbled or surprising, briefly confirm what they meant ("did you mean\u2026?") instead of answering the literal words.';
|
|
3553
3559
|
var VOICE_STYLE_CONVERSATIONAL = `Speak like a person in a live conversation, not an assistant reading a script. React first, then deliver: a quick impulsive beat ("oh nice", "hmm, hold on", "ah, got it") before the substance. Use contractions always. Vary sentence length \u2014 some very short. Light fillers and backchannels are fine ("mm-hm", "right", "let's see") but at most one per reply \u2014 never stack them. When you delegate, say it like a human would ("hang on, let me actually dig into that \u2014 gimme a minute") instead of announcing a task. When a result comes back, react to it like you just found out ("okay so \u2014 turns out\u2026"). Match the user's energy: a quick question gets a quick answer \u2014 a few words is a perfectly good turn. Prefer a short answer plus an offer ("want the details?") over covering everything. Never narrate your own mechanics (no "I will now delegate", no task ids out loud).`;
|
|
3554
3560
|
var DuplexAgent = class {
|
|
3555
3561
|
options;
|
|
@@ -3559,6 +3565,8 @@ var DuplexAgent = class {
|
|
|
3559
3565
|
seq = 0;
|
|
3560
3566
|
pendingEvents = [];
|
|
3561
3567
|
flushQueued = false;
|
|
3568
|
+
/** Parked worker questions awaiting a (voice-relayed) user answer, keyed by ask id. */
|
|
3569
|
+
pendingAsks = /* @__PURE__ */ new Map();
|
|
3562
3570
|
constructor(options) {
|
|
3563
3571
|
this.options = { ...new DuplexAgentOptions(), ...options };
|
|
3564
3572
|
const o = this.options;
|
|
@@ -3581,7 +3589,7 @@ Today's date: ${(/* @__PURE__ */ new Date()).toDateString()}.`,
|
|
|
3581
3589
|
// no defaultTools() — the voice can only Delegate, never touch files itself. Set AFTER the
|
|
3582
3590
|
// voiceOptions spread (addTools() would be clobbered by the first prepare()); extra voice
|
|
3583
3591
|
// tools come in via voiceOptions.tools and are merged here.
|
|
3584
|
-
tools: [...o.voiceOptions?.tools ?? [], this.delegateTool(), this.taskStatusTool(), this.cancelTaskTool(), this.quickLookTool()]
|
|
3592
|
+
tools: [...o.voiceOptions?.tools ?? [], this.delegateTool(), this.taskStatusTool(), this.cancelTaskTool(), this.quickLookTool(), this.answerTaskTool()]
|
|
3585
3593
|
});
|
|
3586
3594
|
}
|
|
3587
3595
|
/** One user turn: the voice agent streams the reply (and may Delegate). Serialized with re-voice turns. */
|
|
@@ -3652,12 +3660,19 @@ ${recent}` : brief;
|
|
|
3652
3660
|
report.output(chunk);
|
|
3653
3661
|
}
|
|
3654
3662
|
} : base;
|
|
3663
|
+
const relayAsk = async (q) => {
|
|
3664
|
+
const opts = q.options?.length ? ` Options: ${q.options.map((x) => x.label).join(", ")}.` : "";
|
|
3665
|
+
const a = await this.parkQuestion(id, `${q.question}${opts}`);
|
|
3666
|
+
return a || "(no answer from the user \u2014 use your best judgment and note the assumption)";
|
|
3667
|
+
};
|
|
3668
|
+
const workerHost = o.askRelay ? { ask: relayAsk } : o.host?.ask ? { ask: (q) => o.host.ask(q) } : void 0;
|
|
3655
3669
|
const worker = new Agent({
|
|
3656
3670
|
ai: o.ai,
|
|
3657
3671
|
fs: o.fs,
|
|
3658
3672
|
model: o.workerModel,
|
|
3659
3673
|
...o.workerOptions,
|
|
3660
3674
|
// may override ai/fs/model/tools/… —
|
|
3675
|
+
...workerHost ? { host: workerHost } : {},
|
|
3661
3676
|
...hooks ? { hooks } : {},
|
|
3662
3677
|
signal: controller.signal
|
|
3663
3678
|
// …but never the per-task cancellation signal
|
|
@@ -3706,7 +3721,35 @@ ${recent}` : brief;
|
|
|
3706
3721
|
}
|
|
3707
3722
|
};
|
|
3708
3723
|
}
|
|
3724
|
+
/** Park a question under `askId` (a task id, or any unique key for permission asks): re-voices
|
|
3725
|
+
* '[task <id> asks] …' and resolves with the user's answer via AnswerTask — or '' on timeout/
|
|
3726
|
+
* task settle (callers map '' to deny / best-judgment). Workers never block forever. */
|
|
3727
|
+
parkQuestion(askId, question) {
|
|
3728
|
+
return new Promise((resolve) => {
|
|
3729
|
+
let settled = false;
|
|
3730
|
+
const finish = (answer) => {
|
|
3731
|
+
if (settled) return;
|
|
3732
|
+
settled = true;
|
|
3733
|
+
clearTimeout(timer);
|
|
3734
|
+
this.pendingAsks.delete(askId);
|
|
3735
|
+
resolve(answer);
|
|
3736
|
+
};
|
|
3737
|
+
const timer = setTimeout(() => {
|
|
3738
|
+
this.notify("task_ask_timeout", `task ${askId}: question timed out \u2014 proceeding without an answer`);
|
|
3739
|
+
finish("");
|
|
3740
|
+
}, this.options.askTimeoutMs);
|
|
3741
|
+
this.pendingAsks.set(askId, { question, resolve: finish });
|
|
3742
|
+
this.notify("task_ask", `task ${askId} asks: ${question}`, { id: askId, question });
|
|
3743
|
+
this.queueRevoice(`[task ${askId} asks] ${question}
|
|
3744
|
+
(Relay this to the user in your own words. When they answer, call AnswerTask with id "${askId}" and their answer.)`);
|
|
3745
|
+
});
|
|
3746
|
+
}
|
|
3747
|
+
/** Resolve any question a settling/cancelled task left parked (its answer can no longer matter). */
|
|
3748
|
+
dropAsk(id) {
|
|
3749
|
+
this.pendingAsks.get(id)?.resolve("");
|
|
3750
|
+
}
|
|
3709
3751
|
onWorkerSettled(id, res) {
|
|
3752
|
+
this.dropAsk(id);
|
|
3710
3753
|
const rec = this.tasks.get(id);
|
|
3711
3754
|
if (res.finishReason === "aborted" || rec.status === "cancelled") {
|
|
3712
3755
|
rec.status = "cancelled";
|
|
@@ -3726,6 +3769,7 @@ ${recent}` : brief;
|
|
|
3726
3769
|
this.failTask(this.tasks.get(id), err instanceof Error ? err.message : String(err));
|
|
3727
3770
|
}
|
|
3728
3771
|
failTask(rec, msg) {
|
|
3772
|
+
this.dropAsk(rec.id);
|
|
3729
3773
|
rec.status = "error";
|
|
3730
3774
|
log7.warn(`task ${rec.id} failed: ${msg}`);
|
|
3731
3775
|
this.notify("task_error", `task ${rec.id} (${rec.label}) failed: ${msg}`);
|
|
@@ -3818,6 +3862,23 @@ ${recent}` : brief;
|
|
|
3818
3862
|
}
|
|
3819
3863
|
};
|
|
3820
3864
|
}
|
|
3865
|
+
answerTaskTool() {
|
|
3866
|
+
return {
|
|
3867
|
+
name: "AnswerTask",
|
|
3868
|
+
description: `Relay the user's answer to a pending question from a background task (the "[task <id> asks]" events). Pass the id from the event and the user's answer.`,
|
|
3869
|
+
parameters: {
|
|
3870
|
+
type: "object",
|
|
3871
|
+
required: ["id", "answer"],
|
|
3872
|
+
properties: { id: { type: "string" }, answer: { type: "string", description: "the user's answer, verbatim or faithfully summarized" } }
|
|
3873
|
+
},
|
|
3874
|
+
run: async ({ id, answer }) => {
|
|
3875
|
+
const ask = this.pendingAsks.get(String(id));
|
|
3876
|
+
if (!ask) return `No pending question for '${id}' \u2014 it may have been answered already or timed out.`;
|
|
3877
|
+
ask.resolve(String(answer ?? ""));
|
|
3878
|
+
return `Answer relayed \u2014 task ${id} resumes.`;
|
|
3879
|
+
}
|
|
3880
|
+
};
|
|
3881
|
+
}
|
|
3821
3882
|
cancelTaskTool() {
|
|
3822
3883
|
return {
|
|
3823
3884
|
name: "CancelTask",
|
|
@@ -4000,6 +4061,18 @@ var VoiceEngineOptions = class {
|
|
|
4000
4061
|
/** heuristic (non-AEC) energy barge-in tuning */
|
|
4001
4062
|
bargeRmsMult = 2;
|
|
4002
4063
|
bargeRmsFloor = 500;
|
|
4064
|
+
/** Overlap turn-taking (AEC tier, needs player.pause/resume) — human phone-call model:
|
|
4065
|
+
* onset → PAUSE (exact-sample hold, nothing lost); sustained overlap → cede (interrupt; the LLM
|
|
4066
|
+
* re-enters). Brief overlaps that die out (backchannels — "mm-hm", decided by DURATION, not
|
|
4067
|
+
* vocabulary) resume from the precise sample and are dropped. false disables. */
|
|
4068
|
+
overlapPause = true;
|
|
4069
|
+
/** sustained overlap ≥ this → cede the turn */
|
|
4070
|
+
overlapSustainMs = 350;
|
|
4071
|
+
/** quiet for this long while paused → resume, drop the interjection */
|
|
4072
|
+
overlapResumeMs = 700;
|
|
4073
|
+
/** energy floor for "overlap candidate" — must sit ABOVE typical room ambient (~110 rms measured;
|
|
4074
|
+
* ungated ambient re-arming the resume timer forever was a live wedge). User speech ≫ 300. */
|
|
4075
|
+
overlapRms = 300;
|
|
4003
4076
|
};
|
|
4004
4077
|
var VoiceEngine = class {
|
|
4005
4078
|
options;
|
|
@@ -4030,6 +4103,13 @@ var VoiceEngine = class {
|
|
|
4030
4103
|
// endpointed text held for the merge window
|
|
4031
4104
|
pendingTimer = null;
|
|
4032
4105
|
lastInterrupted = null;
|
|
4106
|
+
// overlap (pause) tier state — AEC + pause-capable sinks only
|
|
4107
|
+
pausedAt = 0;
|
|
4108
|
+
overlapLoud = 0;
|
|
4109
|
+
// loud chunks since pause (sustain must be real sound, not two clicks)
|
|
4110
|
+
overlapLastLoudAt = 0;
|
|
4111
|
+
// continuity guard: a gap re-arms the onset (sparse noise ≠ sustained speech)
|
|
4112
|
+
resumeTimer = null;
|
|
4033
4113
|
constructor(options) {
|
|
4034
4114
|
this.options = { ...new VoiceEngineOptions(), ...options };
|
|
4035
4115
|
const o = this.options;
|
|
@@ -4077,6 +4157,7 @@ var VoiceEngine = class {
|
|
|
4077
4157
|
this.drainTimer = null;
|
|
4078
4158
|
}
|
|
4079
4159
|
this.interrupted = false;
|
|
4160
|
+
this.resetOverlap(true);
|
|
4080
4161
|
if (!this.speaking) this.player.markTurn();
|
|
4081
4162
|
this.speaking = true;
|
|
4082
4163
|
this.ctxOpen = true;
|
|
@@ -4111,6 +4192,10 @@ var VoiceEngine = class {
|
|
|
4111
4192
|
this.drainTimer = null;
|
|
4112
4193
|
return;
|
|
4113
4194
|
}
|
|
4195
|
+
if (this.pausedAt) {
|
|
4196
|
+
this.drainTimer = setTimeout(settle, 250);
|
|
4197
|
+
return;
|
|
4198
|
+
}
|
|
4114
4199
|
this.drainTimer = null;
|
|
4115
4200
|
this.speaking = false;
|
|
4116
4201
|
this.echoUntil = now() + 2500;
|
|
@@ -4142,6 +4227,7 @@ var VoiceEngine = class {
|
|
|
4142
4227
|
clearTimeout(this.drainTimer);
|
|
4143
4228
|
this.drainTimer = null;
|
|
4144
4229
|
}
|
|
4230
|
+
this.resetOverlap(false);
|
|
4145
4231
|
const heardChars = Math.round(Math.max(0, this.player.playedMs()) / 1e3 * 15);
|
|
4146
4232
|
if (this.reply) this.lastInterrupted = { full: this.reply, heard: this.reply.slice(0, heardChars) };
|
|
4147
4233
|
this.speaking = false;
|
|
@@ -4156,6 +4242,7 @@ var VoiceEngine = class {
|
|
|
4156
4242
|
this.setState("listening");
|
|
4157
4243
|
}
|
|
4158
4244
|
stop() {
|
|
4245
|
+
if (this.resumeTimer) clearTimeout(this.resumeTimer);
|
|
4159
4246
|
if (this.pendingTimer) clearTimeout(this.pendingTimer);
|
|
4160
4247
|
if (this.drainTimer) clearTimeout(this.drainTimer);
|
|
4161
4248
|
this.stt.stop();
|
|
@@ -4181,12 +4268,11 @@ var VoiceEngine = class {
|
|
|
4181
4268
|
genuine(text) {
|
|
4182
4269
|
const total = this.words(text).length;
|
|
4183
4270
|
const novel = this.novelWords(text).length;
|
|
4184
|
-
|
|
4185
|
-
return novel >= 2 || novel / Math.max(1, total) > 0.5;
|
|
4271
|
+
return novel > 0 && novel / Math.max(1, total) > 0.5;
|
|
4186
4272
|
}
|
|
4187
4273
|
handlePartial(text) {
|
|
4188
4274
|
if (this.speaking) {
|
|
4189
|
-
const barge = this.usingAec ? this.genuine(text) : this.novelWords(text).length >= (this.suspectUntil ? 1 : 2);
|
|
4275
|
+
const barge = this.overlapCapable ? false : this.usingAec ? this.genuine(text) : this.novelWords(text).length >= (this.suspectUntil ? 1 : 2);
|
|
4190
4276
|
if (barge) {
|
|
4191
4277
|
const phase = this.ctxOpen ? "speaking" : "drain";
|
|
4192
4278
|
this.interrupt();
|
|
@@ -4203,6 +4289,10 @@ var VoiceEngine = class {
|
|
|
4203
4289
|
if (!this.echoActive() || (this.usingAec ? this.genuine(text) : this.novelWords(text).length >= 1)) this.options.onPartial(text);
|
|
4204
4290
|
}
|
|
4205
4291
|
handleUtterance(text) {
|
|
4292
|
+
if (this.speaking && this.ctxOpen && this.overlapCapable) {
|
|
4293
|
+
this.stt.reset();
|
|
4294
|
+
return;
|
|
4295
|
+
}
|
|
4206
4296
|
if (this.echoActive() && (this.usingAec ? !this.genuine(text) : this.novelWords(text).length < 2)) {
|
|
4207
4297
|
this.stt.reset();
|
|
4208
4298
|
return;
|
|
@@ -4226,9 +4316,63 @@ var VoiceEngine = class {
|
|
|
4226
4316
|
this.pendingUtt = "";
|
|
4227
4317
|
if (text) this.options.onUtterance(text);
|
|
4228
4318
|
}
|
|
4319
|
+
get overlapCapable() {
|
|
4320
|
+
return this.usingAec && this.options.overlapPause && !!this.player.pause && !!this.player.resume;
|
|
4321
|
+
}
|
|
4322
|
+
/** Overlap turn-taking (AEC tier): onset → pause (exact-sample hold); sustained → cede; died out
|
|
4323
|
+
* → resume. No vocabulary anywhere — duration and persistence decide (backchannels are short
|
|
4324
|
+
* and stop). Nothing is lost across a pause, so a false positive costs only a brief hold. */
|
|
4325
|
+
handleOverlap(rms) {
|
|
4326
|
+
const o = this.options;
|
|
4327
|
+
if (!this.speaking || !this.overlapCapable) return;
|
|
4328
|
+
if (rms < o.overlapRms) return;
|
|
4329
|
+
const t = now();
|
|
4330
|
+
if (!this.pausedAt) {
|
|
4331
|
+
this.overlapLoud = t - this.overlapLastLoudAt <= 60 ? this.overlapLoud + 1 : 1;
|
|
4332
|
+
this.overlapLastLoudAt = t;
|
|
4333
|
+
if (this.overlapLoud < 3) return;
|
|
4334
|
+
this.pausedAt = t;
|
|
4335
|
+
this.player.pause();
|
|
4336
|
+
this.armResume();
|
|
4337
|
+
return;
|
|
4338
|
+
}
|
|
4339
|
+
if (t - this.overlapLastLoudAt > 300) {
|
|
4340
|
+
this.pausedAt = t;
|
|
4341
|
+
this.overlapLoud = 1;
|
|
4342
|
+
this.overlapLastLoudAt = t;
|
|
4343
|
+
this.armResume();
|
|
4344
|
+
return;
|
|
4345
|
+
}
|
|
4346
|
+
this.overlapLastLoudAt = t;
|
|
4347
|
+
this.overlapLoud++;
|
|
4348
|
+
if (t - this.pausedAt >= o.overlapSustainMs && this.overlapLoud >= 4) {
|
|
4349
|
+
const phase = this.ctxOpen ? "speaking" : "drain";
|
|
4350
|
+
this.interrupt();
|
|
4351
|
+
this.options.onBargeIn(phase);
|
|
4352
|
+
return;
|
|
4353
|
+
}
|
|
4354
|
+
this.armResume();
|
|
4355
|
+
}
|
|
4356
|
+
armResume() {
|
|
4357
|
+
if (this.resumeTimer) clearTimeout(this.resumeTimer);
|
|
4358
|
+
this.resumeTimer = setTimeout(() => {
|
|
4359
|
+
this.resumeTimer = null;
|
|
4360
|
+
if (!this.pausedAt) return;
|
|
4361
|
+
this.resetOverlap(true);
|
|
4362
|
+
}, this.options.overlapResumeMs);
|
|
4363
|
+
}
|
|
4364
|
+
resetOverlap(resume) {
|
|
4365
|
+
if (this.resumeTimer) {
|
|
4366
|
+
clearTimeout(this.resumeTimer);
|
|
4367
|
+
this.resumeTimer = null;
|
|
4368
|
+
}
|
|
4369
|
+
if (this.pausedAt && resume) this.player.resume?.();
|
|
4370
|
+
this.pausedAt = 0;
|
|
4371
|
+
this.overlapLoud = 0;
|
|
4372
|
+
}
|
|
4229
4373
|
/** energy two-stage barge-in (heuristic tier only): spike over echo baseline → pause + confirm via STT */
|
|
4230
4374
|
handleLevel(rms) {
|
|
4231
|
-
if (this.usingAec) return;
|
|
4375
|
+
if (this.usingAec) return this.handleOverlap(rms);
|
|
4232
4376
|
if (!this.speaking) {
|
|
4233
4377
|
this.baseline = 0;
|
|
4234
4378
|
this.hot = 0;
|