omnius 1.0.211 → 1.0.213
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +551 -311
- package/npm-shrinkwrap.json +2 -2
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -551582,28 +551582,38 @@ var init_personality = __esm({
|
|
|
551582
551582
|
});
|
|
551583
551583
|
|
|
551584
551584
|
// packages/orchestrator/dist/critic.js
|
|
551585
|
-
function
|
|
551585
|
+
/**
 * Build the non-blocking adversary guidance text shown when a tool call
 * repeats one that was already observed.
 *
 * The message is advisory, so building it must never throw: argument
 * serialization failures are replaced by a placeholder instead of
 * propagating.
 *
 * @param {object} call - Proposed tool call; `call.tool` and `call.args` are read.
 * @param {number} hits - Ordinal of this exact repeat; only used when
 *   `opts.adversaryFlag` is falsy.
 * @param {object} [opts] - Optional settings.
 * @param {string} [opts.cachedResult] - Prior result; the first 700 characters
 *   are appended as a preview when truthy.
 * @param {boolean} [opts.adversaryFlag] - When truthy, the observation line
 *   attributes the detection to the adversary instead of the repeat counter.
 * @returns {string} Multi-line guidance message.
 */
function buildCriticGuidanceMessage(call, hits, opts = {}) {
  // JSON.stringify throws on circular structures and BigInt values, and
  // returns undefined for function/symbol roots; both cases would otherwise
  // crash on `.slice`.
  let serializedArgs;
  try {
    serializedArgs = JSON.stringify(call.args ?? {});
  } catch {
    serializedArgs = void 0;
  }
  const argPreview = (typeof serializedArgs === "string" ? serializedArgs : "[unserializable args]").slice(0, 200);
  // String() keeps string inputs unchanged and protects against a truthy
  // non-string cachedResult that has no `.slice`.
  const cached = opts.cachedResult ? `\nPrior evidence preview:\n${String(opts.cachedResult).slice(0, 700)}` : "";
  const source = opts.adversaryFlag
    ? "The adversary recognized this exact tool call as already observed earlier."
    : `This is exact repeat #${hits} for the same ${call.tool} arguments.`;
  return [
    "[ADVERSARY GUIDANCE — non-blocking]",
    `Observation: ${source}`,
    `Call: ${call.tool}(${argPreview})`,
    "Root cause hypothesis: the run is losing track of already-observed evidence, usually after path confusion, compaction, or an over-broad discovery loop.",
    "Corrective action: let this call's result inform the next step once, then pivot to a concrete action.",
    `Suggested next actions: edit/write the implicated file, run verification, read a different specific file, or complete with evidence. Prefer not to repeat this exact call again unless the filesystem, browser, or page state changed.${cached}`
  ].join("\n");
}
|
|
551595
551598
|
/**
 * Prefix a previously observed tool result with a banner marking it as
 * already seen by an identical earlier call.
 *
 * @param {*} result - Prior result; interpolated with template-string coercion.
 * @returns {string} Banner line followed by the result on the next line.
 */
function buildCachedResultEnvelope(result) {
  const banner = "[PRIOR RESULT — already observed by a prior identical call]";
  return `${banner}\n${result}`;
}
|
|
551599
551602
|
function evaluate2(inputs) {
|
|
551600
|
-
const { proposedCall, fingerprint, isReadLike, recentToolResults, dedupHitCount,
|
|
551601
|
-
if (
|
|
551603
|
+
const { proposedCall, fingerprint, isReadLike, recentToolResults, dedupHitCount, adversaryRedundantSignal } = inputs;
|
|
551604
|
+
if (adversaryRedundantSignal) {
|
|
551602
551605
|
const cached = recentToolResults.get(fingerprint);
|
|
551606
|
+
const cachedResult = cached ? buildCachedResultEnvelope(cached.result) : void 0;
|
|
551603
551607
|
return {
|
|
551604
|
-
decision: "
|
|
551605
|
-
reason: "
|
|
551606
|
-
|
|
551608
|
+
decision: "guidance",
|
|
551609
|
+
reason: "Adversary flagged this fingerprint as redundant",
|
|
551610
|
+
hitNumber: (dedupHitCount.get(fingerprint) ?? 0) + 1,
|
|
551611
|
+
guidanceMessage: buildCriticGuidanceMessage(proposedCall, (dedupHitCount.get(fingerprint) ?? 0) + 1, {
|
|
551612
|
+
cachedResult,
|
|
551613
|
+
adversaryFlag: true
|
|
551614
|
+
}),
|
|
551615
|
+
cachedResult,
|
|
551616
|
+
compacted: cached?.compacted
|
|
551607
551617
|
};
|
|
551608
551618
|
}
|
|
551609
551619
|
const cacheEligible = isReadLike || proposedCall.tool === "shell";
|
|
@@ -551611,24 +551621,16 @@ function evaluate2(inputs) {
|
|
|
551611
551621
|
const cached = recentToolResults.get(fingerprint);
|
|
551612
551622
|
if (cached !== void 0) {
|
|
551613
551623
|
const hits = (dedupHitCount.get(fingerprint) ?? 0) + 1;
|
|
551614
|
-
const threshold = proposedCall.tool === "shell" ? SHELL_THRESHOLD : FS_THRESHOLD;
|
|
551615
|
-
if (hits >= threshold) {
|
|
551616
|
-
return {
|
|
551617
|
-
decision: "force_progress_block",
|
|
551618
|
-
reason: `${proposedCall.tool} fingerprint hit count ${hits} >= ${threshold}`,
|
|
551619
|
-
hitNumber: hits,
|
|
551620
|
-
blockMessage: buildForceProgressBlockMessage(proposedCall, hits),
|
|
551621
|
-
cachedResult: buildCachedResultEnvelope(cached.result),
|
|
551622
|
-
compacted: cached.compacted
|
|
551623
|
-
};
|
|
551624
|
-
}
|
|
551625
551624
|
const cachedEnvelope = buildCachedResultEnvelope(cached.result);
|
|
551626
551625
|
return {
|
|
551627
|
-
decision: "
|
|
551628
|
-
reason: cached.compacted ? "post-compaction
|
|
551626
|
+
decision: "guidance",
|
|
551627
|
+
reason: cached.compacted ? "post-compaction duplicate evidence" : `duplicate call #${hits}`,
|
|
551629
551628
|
cachedResult: cachedEnvelope,
|
|
551630
551629
|
compacted: cached.compacted,
|
|
551631
|
-
hitNumber: hits
|
|
551630
|
+
hitNumber: hits,
|
|
551631
|
+
guidanceMessage: buildCriticGuidanceMessage(proposedCall, hits, {
|
|
551632
|
+
cachedResult: cachedEnvelope
|
|
551633
|
+
})
|
|
551632
551634
|
};
|
|
551633
551635
|
}
|
|
551634
551636
|
}
|
|
@@ -551680,12 +551682,9 @@ function isStagnant(signals, opts) {
|
|
|
551680
551682
|
return false;
|
|
551681
551683
|
return signals.completedDelta <= 0 && signals.filesDelta < filesDeltaMin && signals.failureSum >= failureThreshold && signals.variantCount >= variantThreshold;
|
|
551682
551684
|
}
|
|
551683
|
-
var SHELL_THRESHOLD, FS_THRESHOLD;
|
|
551684
551685
|
// Initializer entry for "packages/orchestrator/dist/critic.js".
// The body only declares strict mode; it assigns no module-level state.
// NOTE(review): __esm is defined outside this view — presumably the bundler's
// lazy module-init wrapper; confirm before relying on call-order semantics.
var init_critic = __esm({
  "packages/orchestrator/dist/critic.js"() {
    "use strict";
  }
});
|
|
551691
551690
|
|
|
@@ -558656,8 +558655,8 @@ var init_agenticRunner = __esm({
|
|
|
558656
558655
|
// WO-KG-15
|
|
558657
558656
|
_retrievalContextCache = null;
|
|
558658
558657
|
// WO-KG-15: cache per-run
|
|
558659
|
-
//
|
|
558660
|
-
|
|
558658
|
+
// Adversary world-model and cohort stats
|
|
558659
|
+
_adversaryMode = "both";
|
|
558661
558660
|
_worldFacts = { files: /* @__PURE__ */ new Map(), lastTest: {}, lastLists: /* @__PURE__ */ new Map() };
|
|
558662
558661
|
// REG-7-root: Track file writes since last todo_write call. When this
|
|
558663
558662
|
// counter climbs without a todo update, the agent has likely batched
|
|
@@ -559006,6 +559005,8 @@ var init_agenticRunner = __esm({
|
|
|
559006
559005
|
_sessionId = `session-${Date.now()}`;
|
|
559007
559006
|
_workingDirectory = "";
|
|
559008
559007
|
constructor(backend, options2) {
|
|
559008
|
+
const adversaryMode = options2?.adversaryMode ?? options2?.observerMode ?? "both";
|
|
559009
|
+
const disableAdversaryCritic = options2?.disableAdversaryCritic ?? options2?.disableStepCritic ?? false;
|
|
559009
559010
|
this.backend = backend;
|
|
559010
559011
|
this.options = {
|
|
559011
559012
|
maxTurns: options2?.maxTurns ?? 60,
|
|
@@ -559029,19 +559030,23 @@ var init_agenticRunner = __esm({
|
|
|
559029
559030
|
bruteForce: options2?.bruteForce ?? true,
|
|
559030
559031
|
bruteForceMaxCycles: options2?.bruteForceMaxCycles ?? 100,
|
|
559031
559032
|
allowTurnExtension: options2?.allowTurnExtension ?? true,
|
|
559033
|
+
completionProvenanceGuard: options2?.completionProvenanceGuard ?? true,
|
|
559034
|
+
disableAdversaryCritic,
|
|
559035
|
+
disableStepCritic: disableAdversaryCritic,
|
|
559032
559036
|
modelTier: options2?.modelTier ?? "large",
|
|
559033
559037
|
contextWindowSize: options2?.contextWindowSize ?? 0,
|
|
559034
559038
|
personality: options2?.personality ?? PERSONALITY_PRESETS.balanced,
|
|
559035
559039
|
personalityName: options2?.personalityName ?? "",
|
|
559036
559040
|
finalVarResolver: options2?.finalVarResolver ?? void 0,
|
|
559037
|
-
|
|
559041
|
+
adversaryMode,
|
|
559042
|
+
observerMode: adversaryMode,
|
|
559038
559043
|
// Phase 4 — sub-agent isolation flag (defaults false). When true, this
|
|
559039
559044
|
// runner skips cross-task handoff inheritance from the parent's
|
|
559040
559045
|
// session.
|
|
559041
559046
|
subAgent: options2?.subAgent ?? false,
|
|
559042
559047
|
skipCrossTaskHandoff: options2?.skipCrossTaskHandoff ?? false
|
|
559043
559048
|
};
|
|
559044
|
-
this.
|
|
559049
|
+
this._adversaryMode = this.options.adversaryMode;
|
|
559045
559050
|
}
|
|
559046
559051
|
/** Update context window size (e.g. after querying Ollama /api/show) */
|
|
559047
559052
|
setContextWindowSize(size) {
|
|
@@ -559049,7 +559054,10 @@ var init_agenticRunner = __esm({
|
|
|
559049
559054
|
}
|
|
559050
559055
|
/** Set the working directory for session checkpointing */
|
|
559051
559056
|
setWorkingDirectory(dir) {
|
|
559052
|
-
this._workingDirectory = dir;
|
|
559057
|
+
this._workingDirectory = _pathResolve(dir);
|
|
559058
|
+
}
|
|
559059
|
+
authoritativeWorkingDirectory() {
|
|
559060
|
+
return _pathResolve(this._workingDirectory || process.cwd());
|
|
559053
559061
|
}
|
|
559054
559062
|
/** State root for runner-owned memory/artifacts. Defaults to cwd/.omnius. */
|
|
559055
559063
|
omniusStateDir() {
|
|
@@ -559822,7 +559830,7 @@ ${result.output ?? ""}`;
|
|
|
559822
559830
|
* checklist via todo_write, and only then call task_complete.
|
|
559823
559831
|
*/
|
|
559824
559832
|
/**
|
|
559825
|
-
* REG-39c: tag a SYNTHETIC failure (FORCED PROGRESS BLOCK /
|
|
559833
|
+
* REG-39c: tag a SYNTHETIC failure (FORCED PROGRESS BLOCK / adversary
|
|
559826
559834
|
* block / budget exhausted). These paths return early from
|
|
559827
559835
|
* executeSingle BEFORE the main result-handling code, so the normal
|
|
559828
559836
|
* MAST tagging miss them. This helper lets each return-early site
|
|
@@ -559898,6 +559906,198 @@ Do NOT call task_complete until all items are marked completed via todo_write.`;
|
|
|
559898
559906
|
`Continue the work loop: inspect the failed evidence, make the smallest targeted fix, then rerun the relevant verification. Use the full available verification spectrum for the artifact you changed: static syntax, build/typecheck, tests, service startup, runtime logs, browser/page errors, console output, network failures, screenshots, accessibility/DOM state, and end-to-end user flow checks where applicable. The exact tools are stack-dependent; the standard is objective runtime evidence, not self-report.`
|
|
559899
559907
|
].join("\n");
|
|
559900
559908
|
}
|
|
559909
|
+
buildMissionCompletionContract(task, context2) {
|
|
559910
|
+
if (process.env["OMNIUS_DISABLE_MISSION_COMPLETION_CONTRACT"] === "1")
|
|
559911
|
+
return "";
|
|
559912
|
+
const profile = this._inferCompletionProfile(`${task}
|
|
559913
|
+
${context2 ?? ""}`);
|
|
559914
|
+
const requirements = [];
|
|
559915
|
+
if (profile.browser)
|
|
559916
|
+
requirements.push("browser/UI state must be proven by a post-action screenshot/DOM/observe_bundle pass");
|
|
559917
|
+
if (profile.desktop)
|
|
559918
|
+
requirements.push("desktop state must be proven by vision_action_loop observe or desktop_describe after the final action");
|
|
559919
|
+
if (profile.code)
|
|
559920
|
+
requirements.push("code/file changes must be proven by a relevant build/test/typecheck/runtime command after the last edit");
|
|
559921
|
+
if (profile.research)
|
|
559922
|
+
requirements.push("research/root-cause claims must cite concrete inspected files, commands, logs, or source artifacts");
|
|
559923
|
+
if (requirements.length === 0)
|
|
559924
|
+
requirements.push("final claims must name the concrete evidence used or state that the task required no external action");
|
|
559925
|
+
return [
|
|
559926
|
+
`[MISSION COMPLETION CONTRACT]`,
|
|
559927
|
+
`Current ask: ${task.slice(0, 500)}`,
|
|
559928
|
+
``,
|
|
559929
|
+
`Before claiming success or calling task_complete, satisfy the mission-specific evidence requirements:`,
|
|
559930
|
+
...requirements.map((line) => `- ${line}.`),
|
|
559931
|
+
``,
|
|
559932
|
+
`The final task_complete summary for any action-heavy task must include a compact Provenance/Evidence note naming the validating tool output, command, screenshot, DOM state, file path, or blocker. Self-confidence is not evidence.`,
|
|
559933
|
+
`For browser/form/account/send flows: after the last click/type/navigate/submit action, capture a fresh browser observation and verify the visible final state before completion.`,
|
|
559934
|
+
`If completion is impossible, use a summary beginning BLOCKED: and name the exact blocker plus the evidence already collected.`
|
|
559935
|
+
].join("\n");
|
|
559936
|
+
}
|
|
559937
|
+
_inferCompletionProfile(text) {
|
|
559938
|
+
const t2 = text.toLowerCase();
|
|
559939
|
+
const browser3 = /\b(browser|web\s*page|website|page|playwright|selenium|chromedriver|chrome|headless|gui|proton|login|captcha|form|account|compose|mail|submit|click|type|fill)\b/.test(t2);
|
|
559940
|
+
const desktop = /\b(desktop|screen|application|app\b|window|file manager|open a file|laptop|screenshot|vision_action_loop|desktop_describe|desktop_click)\b/.test(t2);
|
|
559941
|
+
const code8 = /\b(implement|fix|patch|refactor|rewrite|build|compile|typecheck|test suite|unit test|integration test|source file|codebase|package|typescript|javascript|python|rust|golang)\b/.test(t2);
|
|
559942
|
+
const research = /\b(discover|root cause|triage|deep dive|review|audit|investigate|prove|validate|forensics|diagnostic|failure mode)\b/.test(t2);
|
|
559943
|
+
const formLike = /\b(form|fill|submit|signup|sign up|login|log in|account|compose|send|sent|mail|captcha|checkout|payment|upload)\b/.test(t2);
|
|
559944
|
+
return { browser: browser3, desktop, code: code8, research, formLike };
|
|
559945
|
+
}
|
|
559946
|
+
_completionSummaryHasProvenance(summary) {
|
|
559947
|
+
return /\b(provenance|evidence|verified|validated|confirmed|observed|screenshot|dom|console|network|log|test|typecheck|build|passed|opened|sent|created|submitted|blocked)\b/i.test(summary);
|
|
559948
|
+
}
|
|
559949
|
+
_isBlockedCompletionSummary(summary) {
|
|
559950
|
+
return /^\s*(?:BLOCKED|PARTIAL|NO FILE CHANGES REQUIRED)\b/i.test(summary);
|
|
559951
|
+
}
|
|
559952
|
+
_browserActionKind(entry) {
|
|
559953
|
+
if (!/^(browser_action|playwright_browser|carbonyl_browser)$/.test(entry.name))
|
|
559954
|
+
return "other";
|
|
559955
|
+
const args = this._parseExactArgsKey(entry.argsKey);
|
|
559956
|
+
const action = String(args.get("action") ?? args.get("tool") ?? args.get("command") ?? "").toLowerCase();
|
|
559957
|
+
if (/^(screenshot|dom|dom_summary|observe|observe_bundle|page_errors|console_logs|network_log|accessibility|snapshot|state|url|title)$/.test(action)) {
|
|
559958
|
+
return "observe";
|
|
559959
|
+
}
|
|
559960
|
+
if (/^(navigate|click|click_xy|vision_click|visual_click|type|fill|press|select|submit|evaluate|scroll|scroll_up|scroll_down|back|forward)$/.test(action)) {
|
|
559961
|
+
return "state";
|
|
559962
|
+
}
|
|
559963
|
+
return entry.name === "carbonyl_browser" ? "state" : "other";
|
|
559964
|
+
}
|
|
559965
|
+
_desktopActionKind(entry) {
|
|
559966
|
+
if (/^(desktop_describe|screenshot)$/.test(entry.name))
|
|
559967
|
+
return "observe";
|
|
559968
|
+
if (entry.name === "desktop_click")
|
|
559969
|
+
return "state";
|
|
559970
|
+
if (entry.name !== "vision_action_loop")
|
|
559971
|
+
return "other";
|
|
559972
|
+
const args = this._parseExactArgsKey(entry.argsKey);
|
|
559973
|
+
const action = String(args.get("action") ?? "").toLowerCase();
|
|
559974
|
+
if (/^(observe|screenshot|describe|ocr|state)$/.test(action))
|
|
559975
|
+
return "observe";
|
|
559976
|
+
if (/^(click|type|key|press|open|focus|drag|scroll)$/.test(action))
|
|
559977
|
+
return "state";
|
|
559978
|
+
return "other";
|
|
559979
|
+
}
|
|
559980
|
+
_isVerificationShell(entry) {
|
|
559981
|
+
if (entry.name !== "shell" && entry.name !== "background_run")
|
|
559982
|
+
return false;
|
|
559983
|
+
if (entry.success !== true)
|
|
559984
|
+
return false;
|
|
559985
|
+
const args = this._parseExactArgsKey(entry.argsKey);
|
|
559986
|
+
const command = String(args.get("command") ?? args.get("cmd") ?? "").toLowerCase();
|
|
559987
|
+
return /\b(test|typecheck|check|build|compile|verify|lint|pytest|vitest|jest|playwright|cypress|tsc|cargo\s+test|go\s+test|npm\s+run|pnpm\s+run|yarn\s+run)\b/.test(command);
|
|
559988
|
+
}
|
|
559989
|
+
_evaluateCompletionProvenanceGate(input) {
|
|
559990
|
+
if (this.options.completionProvenanceGuard === false)
|
|
559991
|
+
return { proceed: true };
|
|
559992
|
+
if (process.env["OMNIUS_DISABLE_COMPLETION_PROVENANCE_GUARD"] === "1")
|
|
559993
|
+
return { proceed: true };
|
|
559994
|
+
const summary = input.summary || "";
|
|
559995
|
+
const blockedSummary = this._isBlockedCompletionSummary(summary);
|
|
559996
|
+
const profile = this._inferCompletionProfile(input.taskGoal);
|
|
559997
|
+
const log22 = input.toolCallLog.filter((entry) => entry.name !== "task_complete");
|
|
559998
|
+
const browserUsed = log22.some((entry) => /^(browser_action|playwright_browser|carbonyl_browser)$/.test(entry.name));
|
|
559999
|
+
const desktopUsed = log22.some((entry) => /^(desktop_describe|desktop_click|vision_action_loop|screenshot)$/.test(entry.name));
|
|
560000
|
+
const mutated = log22.some((entry) => entry.mutated === true);
|
|
560001
|
+
const issues = [];
|
|
560002
|
+
const actionHeavy = profile.browser || profile.desktop || profile.code || profile.research || browserUsed || desktopUsed || mutated || this._fileWritesThisRun > 0;
|
|
560003
|
+
if (!actionHeavy)
|
|
560004
|
+
return { proceed: true };
|
|
560005
|
+
if (blockedSummary)
|
|
560006
|
+
return { proceed: true };
|
|
560007
|
+
const successfulNonCompletion = log22.filter((entry) => entry.success === true);
|
|
560008
|
+
if (successfulNonCompletion.length === 0) {
|
|
560009
|
+
issues.push("No successful objective tool result is recorded for this action-oriented task.");
|
|
560010
|
+
}
|
|
560011
|
+
const requiresBrowser = profile.browser || browserUsed;
|
|
560012
|
+
const requiresDesktop = profile.desktop || desktopUsed;
|
|
560013
|
+
if (requiresBrowser) {
|
|
560014
|
+
let lastStateIdx = -1;
|
|
560015
|
+
let lastObserveIdx = -1;
|
|
560016
|
+
log22.forEach((entry, idx) => {
|
|
560017
|
+
if (entry.success !== true)
|
|
560018
|
+
return;
|
|
560019
|
+
const kind = this._browserActionKind(entry);
|
|
560020
|
+
if (kind === "state")
|
|
560021
|
+
lastStateIdx = idx;
|
|
560022
|
+
if (kind === "observe")
|
|
560023
|
+
lastObserveIdx = idx;
|
|
560024
|
+
});
|
|
560025
|
+
if (lastStateIdx >= 0 && lastObserveIdx <= lastStateIdx) {
|
|
560026
|
+
issues.push("Browser state changed after the last browser observation. Capture a fresh screenshot/DOM/observe_bundle after the final click/type/navigate/submit before completion.");
|
|
560027
|
+
} else if (profile.formLike && lastObserveIdx < 0) {
|
|
560028
|
+
issues.push("This looks like a form/account/send flow, but no successful post-action browser observation is recorded.");
|
|
560029
|
+
}
|
|
560030
|
+
}
|
|
560031
|
+
if (requiresDesktop) {
|
|
560032
|
+
let lastStateIdx = -1;
|
|
560033
|
+
let lastObserveIdx = -1;
|
|
560034
|
+
log22.forEach((entry, idx) => {
|
|
560035
|
+
if (entry.success !== true)
|
|
560036
|
+
return;
|
|
560037
|
+
const kind = this._desktopActionKind(entry);
|
|
560038
|
+
if (kind === "state")
|
|
560039
|
+
lastStateIdx = idx;
|
|
560040
|
+
if (kind === "observe")
|
|
560041
|
+
lastObserveIdx = idx;
|
|
560042
|
+
});
|
|
560043
|
+
if (lastStateIdx >= 0 && lastObserveIdx <= lastStateIdx) {
|
|
560044
|
+
issues.push("Desktop state changed after the last visual observation. Run vision_action_loop observe or desktop_describe after the final desktop action before completion.");
|
|
560045
|
+
}
|
|
560046
|
+
}
|
|
560047
|
+
const mutatedEntries = log22.map((entry, idx) => ({ entry, idx })).filter(({ entry }) => entry.mutated === true);
|
|
560048
|
+
if (profile.code || mutatedEntries.length > 0 || this._fileWritesThisRun > 0) {
|
|
560049
|
+
const lastMutationIdx = mutatedEntries.length > 0 ? Math.max(...mutatedEntries.map(({ idx }) => idx)) : -1;
|
|
560050
|
+
const verifiedAfterMutation = log22.some((entry, idx) => {
|
|
560051
|
+
if (lastMutationIdx >= 0 && idx <= lastMutationIdx)
|
|
560052
|
+
return false;
|
|
560053
|
+
if (this._isVerificationShell(entry))
|
|
560054
|
+
return true;
|
|
560055
|
+
if (entry.success === true && this._browserActionKind(entry) === "observe")
|
|
560056
|
+
return true;
|
|
560057
|
+
if (entry.success === true && this._desktopActionKind(entry) === "observe")
|
|
560058
|
+
return true;
|
|
560059
|
+
return false;
|
|
560060
|
+
});
|
|
560061
|
+
if (lastMutationIdx >= 0 && !verifiedAfterMutation) {
|
|
560062
|
+
issues.push("Files were changed but no successful verification command or runtime observation appears after the last mutation.");
|
|
560063
|
+
}
|
|
560064
|
+
}
|
|
560065
|
+
const lastNonCompletion = [...log22].reverse().find(Boolean);
|
|
560066
|
+
if (lastNonCompletion?.success === false) {
|
|
560067
|
+
issues.push(`The most recent non-completion tool result failed (${lastNonCompletion.name}); resolve or explicitly report BLOCKED before completing.`);
|
|
560068
|
+
}
|
|
560069
|
+
if (!this._completionSummaryHasProvenance(summary)) {
|
|
560070
|
+
issues.push("The completion summary does not include an explicit Evidence/Provenance note.");
|
|
560071
|
+
}
|
|
560072
|
+
if (issues.length === 0)
|
|
560073
|
+
return { proceed: true };
|
|
560074
|
+
const recentEvidence = successfulNonCompletion.slice(-6).map((entry) => {
|
|
560075
|
+
const preview = (entry.outputPreview ?? "").replace(/\s+/g, " ").slice(0, 160);
|
|
560076
|
+
return `- ${entry.name}(${entry.argsKey.slice(0, 120)})${preview ? ` -> ${preview}` : ""}`;
|
|
560077
|
+
}).join("\n");
|
|
560078
|
+
return {
|
|
560079
|
+
proceed: false,
|
|
560080
|
+
reason: issues[0] ?? "missing provenance",
|
|
560081
|
+
feedback: [
|
|
560082
|
+
`[COMPLETION PROVENANCE REQUIRED]`,
|
|
560083
|
+
``,
|
|
560084
|
+
`You attempted to finish, but the completion claim is not yet proven against the current mission.`,
|
|
560085
|
+
``,
|
|
560086
|
+
`Blocking issues:`,
|
|
560087
|
+
...issues.map((issue, index) => `${index + 1}. ${issue}`),
|
|
560088
|
+
``,
|
|
560089
|
+
recentEvidence ? `Recent successful evidence already available:
|
|
560090
|
+
${recentEvidence}` : `Recent successful evidence already available: none recorded.`,
|
|
560091
|
+
``,
|
|
560092
|
+
`Do the smallest missing verification step now. For browser/UI work, take a fresh screenshot/DOM/observe_bundle after the final action. For desktop work, run vision_action_loop observe or desktop_describe after the final action. For code/file changes, run the relevant build/test/typecheck/runtime check after the last edit.`,
|
|
560093
|
+
``,
|
|
560094
|
+
`Only then call task_complete with this shape:`,
|
|
560095
|
+
`Summary: <what changed or what final state was reached>`,
|
|
560096
|
+
`Provenance: <tool/command/screenshot/DOM/log/file evidence proving it>`,
|
|
560097
|
+
`If impossible, call task_complete with summary starting BLOCKED: and name the blocker plus evidence.`
|
|
560098
|
+
].join("\n")
|
|
560099
|
+
};
|
|
560100
|
+
}
|
|
559901
560101
|
/**
|
|
559902
560102
|
* REG-47: post-implementation backward-pass review.
|
|
559903
560103
|
*
|
|
@@ -561174,7 +561374,7 @@ ${latest.output || ""}`.trim();
|
|
|
561174
561374
|
}
|
|
561175
561375
|
}
|
|
561176
561376
|
const sections = [
|
|
561177
|
-
"[KNOWLEDGE — cached tool results already known to the runtime.
|
|
561377
|
+
"[KNOWLEDGE — cached tool results already known to the runtime. Repeating an exact read/list/search/shell call is a wasted action and will be blocked or served from cache:]"
|
|
561178
561378
|
];
|
|
561179
561379
|
if (compactedCount > 0) {
|
|
561180
561380
|
sections.push(`Compacted cached entries still count as already-known results (${compactedCount}); an exact repeat will be served from cache or skipped, not produce new information.`);
|
|
@@ -561186,6 +561386,7 @@ ${latest.output || ""}`.trim();
|
|
|
561186
561386
|
if (dirsListed.length > 0) {
|
|
561187
561387
|
const unique2 = [...new Set(dirsListed)].slice(0, 15);
|
|
561188
561388
|
sections.push(`Directories already listed (${unique2.length}): ${unique2.join(", ")}`);
|
|
561389
|
+
sections.push(`Do not call list_directory again on these exact directories unless you changed their contents. Use the listed child paths directly with file_read/edit/delegation.`);
|
|
561189
561390
|
}
|
|
561190
561391
|
if (searches.length > 0) {
|
|
561191
561392
|
const unique2 = [...new Set(searches)].slice(0, 15);
|
|
@@ -561199,6 +561400,23 @@ ${latest.output || ""}`.trim();
|
|
|
561199
561400
|
return null;
|
|
561200
561401
|
return sections.join("\n");
|
|
561201
561402
|
}
|
|
561403
|
+
_renderRuntimeRootBlock() {
|
|
561404
|
+
const authoritative = this.authoritativeWorkingDirectory();
|
|
561405
|
+
const proc = _pathResolve(process.cwd());
|
|
561406
|
+
const lines = [
|
|
561407
|
+
`[RUNTIME ROOT — authoritative]`,
|
|
561408
|
+
`Current working directory for this run: ${authoritative}`,
|
|
561409
|
+
`All relative file/tool paths resolve under this directory unless the tool call uses an absolute path.`,
|
|
561410
|
+
`Do not infer cwd from old tasks, shell transcripts, memory, or prior browser sessions.`
|
|
561411
|
+
];
|
|
561412
|
+
if (proc !== authoritative) {
|
|
561413
|
+
lines.push(`Process cwd differs (${proc}); treat the run cwd above as authoritative for repo/project work.`);
|
|
561414
|
+
}
|
|
561415
|
+
if (this._worldFacts.lastCwd && this._worldFacts.lastCwd !== authoritative) {
|
|
561416
|
+
lines.push(`Last shell cd target was command-local only: ${this._worldFacts.lastCwd}. It does not change the run cwd.`);
|
|
561417
|
+
}
|
|
561418
|
+
return lines.join("\n");
|
|
561419
|
+
}
|
|
561202
561420
|
_insertContextFrame(messages2, frame) {
|
|
561203
561421
|
if (!frame)
|
|
561204
561422
|
return;
|
|
@@ -561236,7 +561454,7 @@ ${latest.output || ""}`.trim();
|
|
|
561236
561454
|
add2(this._activeContextItem("task_state", "todo-state", "turn.todos", "Todo state", input.todoBlock, 80));
|
|
561237
561455
|
add2(this._activeContextItem("recent_failure", "recent-failures", "turn.failures", "Recent failures", input.failureBlock, 95));
|
|
561238
561456
|
add2(this._activeContextItem("recent_failure", "write-churn", "turn.churn", "Write churn", input.churnBlock, 75));
|
|
561239
|
-
add2(this._activeContextItem("tool_cache", "tool-cache", "turn.tool-cache", "Tool cache", input.toolCacheBlock,
|
|
561457
|
+
add2(this._activeContextItem("tool_cache", "tool-cache", "turn.tool-cache", "Tool cache", input.toolCacheBlock, 92));
|
|
561240
561458
|
add2(this._activeContextItem("anchor", "anchors", "turn.anchors", "Relevant anchors", input.anchorsBlock, 50));
|
|
561241
561459
|
add2(this._activeContextItem("environment", "environment", "turn.environment", "Environment", input.environmentBlock, 35));
|
|
561242
561460
|
if (this._lastPprMemoryLines.length > 0) {
|
|
@@ -561491,7 +561709,10 @@ ${chunk.content}`, {
|
|
|
561491
561709
|
async _buildTurnContextFrame(turn, messages2, recentToolResults, environmentBlock) {
|
|
561492
561710
|
this._contextLedger.clearSources("turn.");
|
|
561493
561711
|
this._contextLedger.prune(turn);
|
|
561494
|
-
const goalBlock =
|
|
561712
|
+
const goalBlock = [
|
|
561713
|
+
this._renderRuntimeRootBlock(),
|
|
561714
|
+
this._taskState.goal ? `Active task: ${this._taskState.goal}` : null
|
|
561715
|
+
].filter(Boolean).join("\n\n");
|
|
561495
561716
|
const filesystemBlock = this._renderFilesystemStateBlock(turn);
|
|
561496
561717
|
const todoBlock = this._renderTodoStateBlock(turn);
|
|
561497
561718
|
const failureBlock = this._renderRecentFailuresBlock(turn);
|
|
@@ -561557,7 +561778,7 @@ ${this._lastPprMemoryLines.slice(0, 5).join("\n")}` : null;
|
|
|
561557
561778
|
signalFromBlock("tool_cache", "turn.tool-cache", toolCacheBlock, {
|
|
561558
561779
|
id: "tool-cache",
|
|
561559
561780
|
dedupeKey: "turn.tool-cache",
|
|
561560
|
-
priority:
|
|
561781
|
+
priority: 92,
|
|
561561
561782
|
createdTurn: turn,
|
|
561562
561783
|
ttlTurns: 1
|
|
561563
561784
|
}),
|
|
@@ -562409,8 +562630,8 @@ ${notice}`;
|
|
|
562409
562630
|
const window2 = recentToolCalls.slice(-repetitionWindow);
|
|
562410
562631
|
const uniqueKeys = new Set(window2.map((tc) => `${tc.name}:${tc.argsKey}`));
|
|
562411
562632
|
const ratio = 1 - uniqueKeys.size / window2.length;
|
|
562412
|
-
if (ratio > 0.4 && this.
|
|
562413
|
-
const recentOutcomes = this.
|
|
562633
|
+
if (ratio > 0.4 && this._adversaryToolOutcomes.length >= 3) {
|
|
562634
|
+
const recentOutcomes = this._adversaryToolOutcomes.slice(-6);
|
|
562414
562635
|
const uniquePreviews = new Set(recentOutcomes.map((o2) => o2.preview.slice(0, 40)));
|
|
562415
562636
|
if (uniquePreviews.size >= 3) {
|
|
562416
562637
|
return Math.max(0, ratio - 0.4);
|
|
@@ -562508,6 +562729,9 @@ Respond with your assessment, then take action.`;
|
|
|
562508
562729
|
this._lastActiveForgettingReport = null;
|
|
562509
562730
|
this._lastContextConsolidationTurn = -1e3;
|
|
562510
562731
|
this._contextFrameBuilder = new ContextFrameBuilder();
|
|
562732
|
+
if (!this._workingDirectory) {
|
|
562733
|
+
this._workingDirectory = _pathResolve(process.cwd());
|
|
562734
|
+
}
|
|
562511
562735
|
if (!this.options.disablePersistentMemory && !this._memoryInitialized) {
|
|
562512
562736
|
try {
|
|
562513
562737
|
const path12 = await import("node:path");
|
|
@@ -562728,7 +562952,6 @@ Respond with your assessment, then take action.`;
|
|
|
562728
562952
|
const contextComposition = await this.assembleContext(task, context2);
|
|
562729
562953
|
const systemPrompt = contextComposition.assembled;
|
|
562730
562954
|
this._contextTree = new ContextTree(`sys-${systemPrompt.length}`, cleanedTask.slice(0, 200));
|
|
562731
|
-
this._phaseMessageStartIdx = 2;
|
|
562732
562955
|
this.emit({
|
|
562733
562956
|
type: "status",
|
|
562734
562957
|
content: `Context assembled: ${contextComposition.sections.map((s2) => `${s2.label}(${s2.tokenEstimate}t)`).join(" + ")} = ~${contextComposition.totalTokenEstimate}t`,
|
|
@@ -562772,10 +562995,13 @@ TASK: ${scrubbedTask}` : scrubbedTask;
|
|
|
562772
562995
|
}
|
|
562773
562996
|
});
|
|
562774
562997
|
}
|
|
562998
|
+
const missionCompletionContract = this.buildMissionCompletionContract(cleanedTask, context2);
|
|
562775
562999
|
const messages2 = [
|
|
562776
563000
|
{ role: "system", content: systemPrompt },
|
|
563001
|
+
...missionCompletionContract ? [{ role: "system", content: missionCompletionContract }] : [],
|
|
562777
563002
|
{ role: "user", content: userContent }
|
|
562778
563003
|
];
|
|
563004
|
+
this._phaseMessageStartIdx = messages2.length;
|
|
562779
563005
|
if (process.env["OMNIUS_DISABLE_DECOMP1"] !== "1") {
|
|
562780
563006
|
try {
|
|
562781
563007
|
const _taskBodyForDecomp = typeof userContent === "string" ? userContent : "";
|
|
@@ -562939,10 +563165,10 @@ TASK: ${scrubbedTask}` : scrubbedTask;
|
|
|
562939
563165
|
this._hookDenyHintCount = 0;
|
|
562940
563166
|
this._selfConsistencyVotes = 0;
|
|
562941
563167
|
this._retrievalContextCache = null;
|
|
562942
|
-
this.
|
|
563168
|
+
this._adversaryMode = this.options.adversaryMode ?? "both";
|
|
562943
563169
|
this._worldFacts = { files: /* @__PURE__ */ new Map(), lastTest: {}, lastLists: /* @__PURE__ */ new Map() };
|
|
562944
563170
|
this._argCohorts.clear();
|
|
562945
|
-
this.
|
|
563171
|
+
this._adversaryRedundantSignals.clear();
|
|
562946
563172
|
this._lastTodoWriteTurn = -1;
|
|
562947
563173
|
this._lastTodoReminderTurn = -1;
|
|
562948
563174
|
let pendingConstraintWarnings = [];
|
|
@@ -563033,6 +563259,54 @@ TASK: ${scrubbedTask}` : scrubbedTask;
|
|
|
563033
563259
|
});
|
|
563034
563260
|
return true;
|
|
563035
563261
|
};
|
|
563262
|
+
// Run the completion provenance gate for a task_complete attempt.
// Advisory only: when the gate reports issues, the critique is appended as a
// system message and two events are emitted, but the function still returns
// false, so it never holds the completion.
const holdProvenanceTaskComplete = (args, turn) => {
  const gate = this._evaluateCompletionProvenanceGate({
    summary: extractTaskCompleteSummary(args),
    taskGoal: cleanedTask,
    toolCallLog
  });
  if (!gate.proceed) {
    const advisory = `${gate.feedback}\n\n[ADVISORY ONLY] This critique does not block task_complete; use it to improve the next run or visible evidence if the task continues.`;
    messages2.push({ role: "system", content: advisory });
    this.emit({
      type: "status",
      content: `completion provenance critique emitted without blocking: ${gate.reason}`,
      turn,
      timestamp: (/* @__PURE__ */ new Date()).toISOString()
    });
    this.emit({
      type: "adversary_reaction",
      adversary: {
        class: "guidance",
        shortText: "Completion provenance critique emitted",
        confidence: 0.9,
        details: gate.reason
      },
      turn,
      timestamp: (/* @__PURE__ */ new Date()).toISOString()
    });
  }
  return false;
};
|
|
563296
|
+
const emitBackwardPassAdvisory = (feedback, turn) => {
|
|
563297
|
+
messages2.push({
|
|
563298
|
+
role: "system",
|
|
563299
|
+
content: `${feedback}
|
|
563300
|
+
|
|
563301
|
+
[ADVISORY ONLY] Backward-pass critique is non-blocking; do not treat this as a tool failure or completion refusal.`
|
|
563302
|
+
});
|
|
563303
|
+
this.emit({
|
|
563304
|
+
type: "status",
|
|
563305
|
+
content: "backward-pass critique emitted without blocking completion",
|
|
563306
|
+
turn,
|
|
563307
|
+
timestamp: (/* @__PURE__ */ new Date()).toISOString()
|
|
563308
|
+
});
|
|
563309
|
+
};
|
|
563036
563310
|
const turnCap = this.options.maxTurns && this.options.maxTurns > 0 ? this.options.maxTurns : Number.MAX_SAFE_INTEGER;
|
|
563037
563311
|
for (let turn = 0; turn < turnCap; turn++) {
|
|
563038
563312
|
clearTurnState(this._appState);
|
|
@@ -564017,8 +564291,8 @@ ${_staleSamples.join("\n")}` : ``,
|
|
|
564017
564291
|
nextSelfEval = now + selfEvalInterval;
|
|
564018
564292
|
}
|
|
564019
564293
|
const turnsRemaining = this.options.maxTurns - turn;
|
|
564020
|
-
if (this.options.allowTurnExtension && turnsRemaining <= 3 && turnsRemaining > 0 && this.
|
|
564021
|
-
const recentOutcomes = this.
|
|
564294
|
+
if (this.options.allowTurnExtension && turnsRemaining <= 3 && turnsRemaining > 0 && this._adversaryToolOutcomes.length >= 2) {
|
|
564295
|
+
const recentOutcomes = this._adversaryToolOutcomes.slice(-6);
|
|
564022
564296
|
const recentSuccesses = recentOutcomes.filter((o2) => o2.succeeded).length;
|
|
564023
564297
|
const uniqueResults = new Set(recentOutcomes.map((o2) => o2.preview.slice(0, 40))).size;
|
|
564024
564298
|
const isActive = recentSuccesses >= 2 && uniqueResults >= 2;
|
|
@@ -564027,16 +564301,16 @@ ${_staleSamples.join("\n")}` : ``,
|
|
|
564027
564301
|
this.options.maxTurns += extension3;
|
|
564028
564302
|
this.emit({
|
|
564029
564303
|
type: "status",
|
|
564030
|
-
content: `
|
|
564304
|
+
content: `Adversary triage: activity detected (${recentSuccesses} recent successes, ${uniqueResults} unique results) — extending turn limit by ${extension3} (now ${this.options.maxTurns})`,
|
|
564031
564305
|
timestamp: (/* @__PURE__ */ new Date()).toISOString()
|
|
564032
564306
|
});
|
|
564033
564307
|
const detailsLines = recentOutcomes.map((o2) => `- ${o2.tool}: ${o2.succeeded ? "OK" : "ERR"} — ${o2.preview}`);
|
|
564034
564308
|
this.emit({
|
|
564035
|
-
type: "
|
|
564309
|
+
type: "debug_adversary",
|
|
564036
564310
|
turn,
|
|
564037
564311
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
564038
564312
|
content: `Timeout triage: EXTENDED by ${extension3} turns (active session detected)`,
|
|
564039
|
-
|
|
564313
|
+
adversaryAction: {
|
|
564040
564314
|
detection: "none",
|
|
564041
564315
|
recentSuccesses,
|
|
564042
564316
|
recentFailures: recentOutcomes.length - recentSuccesses,
|
|
@@ -564369,6 +564643,9 @@ ${memoryLines.join("\n")}`
|
|
|
564369
564643
|
maxTokens: effectiveMaxTokens,
|
|
564370
564644
|
timeoutMs: this.options.requestTimeoutMs
|
|
564371
564645
|
};
|
|
564646
|
+
if ((this.options.contextWindowSize ?? 0) > 0) {
|
|
564647
|
+
chatRequest.numCtx = this.options.contextWindowSize;
|
|
564648
|
+
}
|
|
564372
564649
|
if (this.options.memoryPrefix)
|
|
564373
564650
|
chatRequest.memoryPrefix = this.options.memoryPrefix;
|
|
564374
564651
|
if (this.options.memoryPrefixHash)
|
|
@@ -564410,7 +564687,7 @@ ${memoryLines.join("\n")}`
|
|
|
564410
564687
|
compactionThreshold: limits.compactionThreshold,
|
|
564411
564688
|
toolCallCount,
|
|
564412
564689
|
keepRecent: limits.keepRecent,
|
|
564413
|
-
|
|
564690
|
+
adversaryOutcomes: this._adversaryToolOutcomes.length,
|
|
564414
564691
|
headroom: limits.compactionThreshold - estTokens
|
|
564415
564692
|
}
|
|
564416
564693
|
});
|
|
@@ -564773,16 +565050,19 @@ ${memoryLines.join("\n")}`
|
|
|
564773
565050
|
const cohort = this._argCohorts.get(cohortKey);
|
|
564774
565051
|
if (cohort && cohort.failure >= 3 && cohort.success === 0) {
|
|
564775
565052
|
this.emit({
|
|
564776
|
-
type: "
|
|
565053
|
+
type: "adversary_reaction",
|
|
564777
565054
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
564778
|
-
|
|
565055
|
+
adversary: {
|
|
564779
565056
|
class: "arg_cohort_risk",
|
|
564780
565057
|
shortText: `${tc.name} with similar args has failed ${cohort.failure}× recently`,
|
|
564781
565058
|
confidence: 0.85
|
|
564782
565059
|
}
|
|
564783
565060
|
});
|
|
564784
|
-
if (this.
|
|
564785
|
-
this.pendingUserMessages.push(
|
|
565061
|
+
if (this._adversaryMode === "skillcoach" || this._adversaryMode === "both") {
|
|
565062
|
+
this.pendingUserMessages.push(`[ADVERSARY CRITIQUE — non-blocking]
|
|
565063
|
+
Evidence: ${tc.name} with similar arguments has failed ${cohort.failure}× recently.
|
|
565064
|
+
Root cause hypothesis: the argument family may be wrong, a prerequisite may be missing, or the tool is being used before enough state is known.
|
|
565065
|
+
Corrective action: try a different approach first: read relevant files, adjust arguments, or verify prerequisites.`);
|
|
564786
565066
|
}
|
|
564787
565067
|
}
|
|
564788
565068
|
if (this._errorPatterns.size > 0) {
|
|
@@ -565064,19 +565344,11 @@ ${memoryLines.join("\n")}`
|
|
|
565064
565344
|
].includes(tc.name);
|
|
565065
565345
|
const isStatefulBrowserTool = this._isStatefulBrowserTool(tc.name);
|
|
565066
565346
|
const isReadLike = !isStatefulBrowserTool && (baseIsReadLike || tc.name === "shell" && this._isShellCommandReadOnly(tc.arguments?.["command"] ?? tc.arguments?.["cmd"] ?? ""));
|
|
565067
|
-
const
|
|
565068
|
-
if (
|
|
565069
|
-
this.
|
|
565347
|
+
const adversaryRedundantSignal = this._adversaryRedundantSignals.has(toolFingerprint);
|
|
565348
|
+
if (adversaryRedundantSignal) {
|
|
565349
|
+
this._adversaryRedundantSignals.delete(toolFingerprint);
|
|
565070
565350
|
}
|
|
565071
|
-
|
|
565072
|
-
const lastLog = toolCallLog[_toolLogTailIdx];
|
|
565073
|
-
if (!lastLog)
|
|
565074
|
-
return;
|
|
565075
|
-
lastLog.success = true;
|
|
565076
|
-
lastLog.mutated = false;
|
|
565077
|
-
lastLog.mutatedFiles = [];
|
|
565078
|
-
lastLog.outputPreview = outputPreview.slice(0, 100);
|
|
565079
|
-
};
|
|
565351
|
+
let criticGuidance = null;
|
|
565080
565352
|
{
|
|
565081
565353
|
const _reflStem = buildStem(tc.name, tc.arguments ?? {});
|
|
565082
565354
|
if (!this._reflectionsInjectedThisTurn.has(_reflStem)) {
|
|
@@ -565118,7 +565390,10 @@ ${memoryLines.join("\n")}`
|
|
|
565118
565390
|
}
|
|
565119
565391
|
}
|
|
565120
565392
|
}
|
|
565121
|
-
const criticDecision =
|
|
565393
|
+
const criticDecision = this.options.disableAdversaryCritic === true ? {
|
|
565394
|
+
decision: "pass",
|
|
565395
|
+
reason: "adversary critic disabled for isolated evaluation"
|
|
565396
|
+
} : evaluate2({
|
|
565122
565397
|
proposedCall: { tool: tc.name, args: tc.arguments ?? {} },
|
|
565123
565398
|
fingerprint: toolFingerprint,
|
|
565124
565399
|
isReadLike,
|
|
@@ -565132,116 +565407,33 @@ ${memoryLines.join("\n")}`
|
|
|
565132
565407
|
stagnationSignals: null,
|
|
565133
565408
|
// stagnation gate handled at top-of-turn
|
|
565134
565409
|
stagnationGateActive: false,
|
|
565135
|
-
|
|
565410
|
+
adversaryRedundantSignal
|
|
565136
565411
|
});
|
|
565137
|
-
if (criticDecision.decision === "
|
|
565138
|
-
this.emit({
|
|
565139
|
-
type: "tool_call",
|
|
565140
|
-
toolName: tc.name,
|
|
565141
|
-
toolArgs: tc.arguments,
|
|
565142
|
-
turn,
|
|
565143
|
-
timestamp: (/* @__PURE__ */ new Date()).toISOString()
|
|
565144
|
-
});
|
|
565145
|
-
const blockMsg = criticDecision.cachedResult ? `[BLOCKED — this tool+args already succeeded. Re-served from cache:]
|
|
565146
|
-
|
|
565147
|
-
${criticDecision.cachedResult.slice(0, 500)}` : `[BLOCKED — the observer confirmed this tool already succeeded with these arguments on a prior turn. Do NOT re-run. Use your prior findings to proceed.]`;
|
|
565148
|
-
markSyntheticToolLog(blockMsg);
|
|
565149
|
-
this.emit({
|
|
565150
|
-
type: "tool_result",
|
|
565151
|
-
toolName: tc.name,
|
|
565152
|
-
success: true,
|
|
565153
|
-
content: blockMsg.slice(0, 100),
|
|
565154
|
-
turn,
|
|
565155
|
-
timestamp: (/* @__PURE__ */ new Date()).toISOString()
|
|
565156
|
-
});
|
|
565157
|
-
this._tagSyntheticFailure({
|
|
565158
|
-
mode: "step_repetition",
|
|
565159
|
-
rationale: `observer-block on ${tc.name} fingerprint flagged redundant`
|
|
565160
|
-
});
|
|
565161
|
-
return { tc, output: blockMsg, success: true };
|
|
565162
|
-
}
|
|
565163
|
-
if (criticDecision.decision === "force_progress_block") {
|
|
565164
|
-
dedupHitCount.set(toolFingerprint, criticDecision.hitNumber);
|
|
565165
|
-
const _existingFp = recentToolResults.get(toolFingerprint);
|
|
565166
|
-
if (_existingFp !== void 0) {
|
|
565167
|
-
recentToolResults.delete(toolFingerprint);
|
|
565168
|
-
recentToolResults.set(toolFingerprint, _existingFp);
|
|
565169
|
-
}
|
|
565170
|
-
this.emit({
|
|
565171
|
-
type: "tool_call",
|
|
565172
|
-
toolName: tc.name,
|
|
565173
|
-
toolArgs: tc.arguments,
|
|
565174
|
-
turn,
|
|
565175
|
-
timestamp: (/* @__PURE__ */ new Date()).toISOString()
|
|
565176
|
-
});
|
|
565177
|
-
this.emit({
|
|
565178
|
-
type: "tool_result",
|
|
565179
|
-
toolName: tc.name,
|
|
565180
|
-
success: true,
|
|
565181
|
-
content: `[SKIPPED DUPLICATE — exact ${tc.name} call not re-run; cached result returned.]`.slice(0, 120),
|
|
565182
|
-
turn,
|
|
565183
|
-
timestamp: (/* @__PURE__ */ new Date()).toISOString()
|
|
565184
|
-
});
|
|
565185
|
-
this._tagSyntheticFailure({
|
|
565186
|
-
mode: "step_repetition",
|
|
565187
|
-
rationale: `force_progress_block on ${tc.name} after ${criticDecision.hitNumber} identical calls`
|
|
565188
|
-
});
|
|
565189
|
-
const generationCompletionHint = isGenerationArtifactSuccess(tc.name, criticDecision.cachedResult) ? `
|
|
565190
|
-
|
|
565191
|
-
[GENERATION ALREADY COMPLETE] This exact ${tc.name} call already succeeded. Do not call it again. Use the cached artifact/path above; if delivery is needed, send it, otherwise call task_complete.` : "";
|
|
565192
|
-
const header = criticDecision.compacted ? `[RE-SERVED FROM CACHE — the original result was compacted from context. Here is the data again. Do not retry this exact call.]
|
|
565193
|
-
|
|
565194
|
-
` : `[SKIPPED DUPLICATE — exact ${tc.name} call not re-run. The cached result below is from the prior successful call. Do not retry this exact call.]
|
|
565195
|
-
|
|
565196
|
-
`;
|
|
565197
|
-
const truncatedCache = criticDecision.cachedResult.length > 500 ? criticDecision.cachedResult.slice(0, 500) + `
|
|
565198
|
-
... [${criticDecision.cachedResult.length - 500} chars omitted — same as before]` : criticDecision.cachedResult;
|
|
565199
|
-
markSyntheticToolLog(`${criticDecision.blockMessage}
|
|
565200
|
-
|
|
565201
|
-
${truncatedCache}`);
|
|
565202
|
-
return {
|
|
565203
|
-
tc,
|
|
565204
|
-
output: `${criticDecision.blockMessage}
|
|
565205
|
-
|
|
565206
|
-
${header}${truncatedCache}${generationCompletionHint}`,
|
|
565207
|
-
success: true
|
|
565208
|
-
};
|
|
565209
|
-
}
|
|
565210
|
-
if (criticDecision.decision === "serve_cached") {
|
|
565412
|
+
if (criticDecision.decision === "guidance") {
|
|
565211
565413
|
dedupHitCount.set(toolFingerprint, criticDecision.hitNumber);
|
|
565212
565414
|
const _existingFp = recentToolResults.get(toolFingerprint);
|
|
565213
565415
|
if (_existingFp !== void 0) {
|
|
565214
565416
|
recentToolResults.delete(toolFingerprint);
|
|
565215
565417
|
recentToolResults.set(toolFingerprint, _existingFp);
|
|
565216
565418
|
}
|
|
565419
|
+
criticGuidance = criticDecision.guidanceMessage;
|
|
565217
565420
|
this.emit({
|
|
565218
|
-
type: "
|
|
565219
|
-
|
|
565220
|
-
|
|
565221
|
-
|
|
565421
|
+
type: "adversary_reaction",
|
|
565422
|
+
adversary: {
|
|
565423
|
+
class: "guidance",
|
|
565424
|
+
shortText: `Adversary guidance for repeated ${tc.name} call`,
|
|
565425
|
+
confidence: 0.9,
|
|
565426
|
+
details: criticDecision.reason
|
|
565427
|
+
},
|
|
565222
565428
|
timestamp: (/* @__PURE__ */ new Date()).toISOString()
|
|
565223
565429
|
});
|
|
565224
|
-
const generationCompletionHint = isGenerationArtifactSuccess(tc.name, criticDecision.cachedResult) ? `
|
|
565225
|
-
|
|
565226
|
-
[GENERATION ALREADY COMPLETE] This exact ${tc.name} call already succeeded. Do not call it again. Use the cached artifact/path above; if delivery is needed, send it, otherwise call task_complete.` : "";
|
|
565227
|
-
const header = criticDecision.compacted ? `[RE-SERVED FROM CACHE — the original result was compacted from context. Here is the data again. No need to call this tool again.]
|
|
565228
|
-
|
|
565229
|
-
` : `[DUPLICATE CALL #${criticDecision.hitNumber} — you already called ${tc.name} with these exact arguments. The result is identical. Do NOT call this again. Use the data you already have to make progress. One more identical call will trigger a hard progress block.]
|
|
565230
|
-
|
|
565231
|
-
`;
|
|
565232
|
-
const truncatedCache = criticDecision.cachedResult.length > 500 ? criticDecision.cachedResult.slice(0, 500) + `
|
|
565233
|
-
... [${criticDecision.cachedResult.length - 500} chars omitted — same as before]` : criticDecision.cachedResult;
|
|
565234
|
-
const dedupOutput = header + truncatedCache + generationCompletionHint;
|
|
565235
|
-
markSyntheticToolLog(dedupOutput);
|
|
565236
565430
|
this.emit({
|
|
565237
|
-
type: "
|
|
565431
|
+
type: "status",
|
|
565238
565432
|
toolName: tc.name,
|
|
565239
|
-
|
|
565240
|
-
content: header.slice(0, 100),
|
|
565433
|
+
content: `Adversary guidance emitted for ${tc.name}; tool call will still execute`,
|
|
565241
565434
|
turn,
|
|
565242
565435
|
timestamp: (/* @__PURE__ */ new Date()).toISOString()
|
|
565243
565436
|
});
|
|
565244
|
-
return { tc, output: dedupOutput, success: true };
|
|
565245
565437
|
}
|
|
565246
565438
|
this.emit({
|
|
565247
565439
|
type: "tool_call",
|
|
@@ -566242,6 +566434,11 @@ Respond with EXACTLY this structure before your next tool call:
|
|
|
566242
566434
|
result = await this.offloadEmbeddedImageResult(result, tc.name, turn);
|
|
566243
566435
|
}
|
|
566244
566436
|
let output = this.normalizeToolOutput(result, tc.name, tc.arguments, turn);
|
|
566437
|
+
if (criticGuidance) {
|
|
566438
|
+
output += `
|
|
566439
|
+
|
|
566440
|
+
${criticGuidance}`;
|
|
566441
|
+
}
|
|
566245
566442
|
if (!result.success && (this.options.modelTier === "small" || this.options.modelTier === "medium")) {
|
|
566246
566443
|
const recovery = this.buildRecoveryGuidance(tc.name, result.error ?? "", tc.arguments);
|
|
566247
566444
|
if (recovery)
|
|
@@ -566251,6 +566448,13 @@ Respond with EXACTLY this structure before your next tool call:
|
|
|
566251
566448
|
editFeedbackRequiredBeforeMoreEdits = this._buildBatchEditAtomicAbortGuidance(tc.arguments);
|
|
566252
566449
|
this.pendingUserMessages.push(editFeedbackRequiredBeforeMoreEdits);
|
|
566253
566450
|
}
|
|
566451
|
+
const currentLogEntry = toolCallLog[_toolLogTailIdx];
|
|
566452
|
+
if (currentLogEntry) {
|
|
566453
|
+
currentLogEntry.success = result.success;
|
|
566454
|
+
currentLogEntry.mutated = realFileMutation;
|
|
566455
|
+
currentLogEntry.mutatedFiles = realMutationPaths;
|
|
566456
|
+
currentLogEntry.outputPreview = (result.success ? result.llmContent ?? result.output ?? output : result.error ?? result.output ?? output).toString().slice(0, 500);
|
|
566457
|
+
}
|
|
566254
566458
|
this.emit({
|
|
566255
566459
|
type: "tool_result",
|
|
566256
566460
|
toolName: tc.name,
|
|
@@ -566640,27 +566844,26 @@ ${sr.result.output}`;
|
|
|
566640
566844
|
timestamp: (/* @__PURE__ */ new Date()).toISOString()
|
|
566641
566845
|
});
|
|
566642
566846
|
} else {
|
|
566643
|
-
if (holdNoProgressTaskComplete(matchTc.arguments, turn)) {
|
|
566847
|
+
if (holdNoProgressTaskComplete(matchTc.arguments, turn) || holdProvenanceTaskComplete(matchTc.arguments, turn)) {
|
|
566644
566848
|
continue;
|
|
566645
566849
|
}
|
|
566646
566850
|
const _bp1 = await this._runBackwardPassReview(turn);
|
|
566647
566851
|
if (_bp1 && !_bp1.proceed && _bp1.feedback) {
|
|
566648
|
-
|
|
566649
|
-
}
|
|
566650
|
-
|
|
566651
|
-
|
|
566652
|
-
|
|
566653
|
-
|
|
566654
|
-
|
|
566655
|
-
|
|
566656
|
-
|
|
566657
|
-
|
|
566658
|
-
|
|
566659
|
-
|
|
566660
|
-
|
|
566661
|
-
}
|
|
566662
|
-
break;
|
|
566852
|
+
emitBackwardPassAdvisory(_bp1.feedback, turn);
|
|
566853
|
+
}
|
|
566854
|
+
completed = true;
|
|
566855
|
+
summary = extractTaskCompleteSummary(matchTc.arguments);
|
|
566856
|
+
if (summary && !this._assistantTextEmitted) {
|
|
566857
|
+
this.emit({
|
|
566858
|
+
type: "assistant_text",
|
|
566859
|
+
content: summary,
|
|
566860
|
+
source: "task_complete_summary",
|
|
566861
|
+
turn,
|
|
566862
|
+
timestamp: (/* @__PURE__ */ new Date()).toISOString()
|
|
566863
|
+
});
|
|
566864
|
+
this._assistantTextEmitted = true;
|
|
566663
566865
|
}
|
|
566866
|
+
break;
|
|
566664
566867
|
}
|
|
566665
566868
|
}
|
|
566666
566869
|
}
|
|
@@ -566696,27 +566899,26 @@ ${sr.result.output}`;
|
|
|
566696
566899
|
timestamp: (/* @__PURE__ */ new Date()).toISOString()
|
|
566697
566900
|
});
|
|
566698
566901
|
} else {
|
|
566699
|
-
if (holdNoProgressTaskComplete(r2.tc.arguments, turn)) {
|
|
566902
|
+
if (holdNoProgressTaskComplete(r2.tc.arguments, turn) || holdProvenanceTaskComplete(r2.tc.arguments, turn)) {
|
|
566700
566903
|
continue;
|
|
566701
566904
|
}
|
|
566702
566905
|
const _bp2 = await this._runBackwardPassReview(turn);
|
|
566703
566906
|
if (_bp2 && !_bp2.proceed && _bp2.feedback) {
|
|
566704
|
-
|
|
566705
|
-
}
|
|
566706
|
-
|
|
566707
|
-
|
|
566708
|
-
|
|
566709
|
-
|
|
566710
|
-
|
|
566711
|
-
|
|
566712
|
-
|
|
566713
|
-
|
|
566714
|
-
|
|
566715
|
-
|
|
566716
|
-
|
|
566717
|
-
}
|
|
566718
|
-
break;
|
|
566907
|
+
emitBackwardPassAdvisory(_bp2.feedback, turn);
|
|
566908
|
+
}
|
|
566909
|
+
completed = true;
|
|
566910
|
+
summary = extractTaskCompleteSummary(r2.tc.arguments);
|
|
566911
|
+
if (summary && !this._assistantTextEmitted) {
|
|
566912
|
+
this.emit({
|
|
566913
|
+
type: "assistant_text",
|
|
566914
|
+
content: summary,
|
|
566915
|
+
source: "task_complete_summary",
|
|
566916
|
+
turn,
|
|
566917
|
+
timestamp: (/* @__PURE__ */ new Date()).toISOString()
|
|
566918
|
+
});
|
|
566919
|
+
this._assistantTextEmitted = true;
|
|
566719
566920
|
}
|
|
566921
|
+
break;
|
|
566720
566922
|
}
|
|
566721
566923
|
}
|
|
566722
566924
|
}
|
|
@@ -566788,27 +566990,26 @@ ${sr.result.output}`;
|
|
|
566788
566990
|
timestamp: (/* @__PURE__ */ new Date()).toISOString()
|
|
566789
566991
|
});
|
|
566790
566992
|
} else {
|
|
566791
|
-
if (holdNoProgressTaskComplete(r2.tc.arguments, turn)) {
|
|
566993
|
+
if (holdNoProgressTaskComplete(r2.tc.arguments, turn) || holdProvenanceTaskComplete(r2.tc.arguments, turn)) {
|
|
566792
566994
|
continue;
|
|
566793
566995
|
}
|
|
566794
566996
|
const _bp3 = await this._runBackwardPassReview(turn);
|
|
566795
566997
|
if (_bp3 && !_bp3.proceed && _bp3.feedback) {
|
|
566796
|
-
|
|
566797
|
-
} else {
|
|
566798
|
-
completed = true;
|
|
566799
|
-
summary = extractTaskCompleteSummary(r2.tc.arguments);
|
|
566800
|
-
if (summary && !this._assistantTextEmitted) {
|
|
566801
|
-
this.emit({
|
|
566802
|
-
type: "assistant_text",
|
|
566803
|
-
content: summary,
|
|
566804
|
-
source: "task_complete_summary",
|
|
566805
|
-
turn,
|
|
566806
|
-
timestamp: (/* @__PURE__ */ new Date()).toISOString()
|
|
566807
|
-
});
|
|
566808
|
-
this._assistantTextEmitted = true;
|
|
566809
|
-
}
|
|
566810
|
-
break;
|
|
566998
|
+
emitBackwardPassAdvisory(_bp3.feedback, turn);
|
|
566811
566999
|
}
|
|
567000
|
+
completed = true;
|
|
567001
|
+
summary = extractTaskCompleteSummary(r2.tc.arguments);
|
|
567002
|
+
if (summary && !this._assistantTextEmitted) {
|
|
567003
|
+
this.emit({
|
|
567004
|
+
type: "assistant_text",
|
|
567005
|
+
content: summary,
|
|
567006
|
+
source: "task_complete_summary",
|
|
567007
|
+
turn,
|
|
567008
|
+
timestamp: (/* @__PURE__ */ new Date()).toISOString()
|
|
567009
|
+
});
|
|
567010
|
+
this._assistantTextEmitted = true;
|
|
567011
|
+
}
|
|
567012
|
+
break;
|
|
566812
567013
|
}
|
|
566813
567014
|
}
|
|
566814
567015
|
}
|
|
@@ -566819,7 +567020,7 @@ ${sr.result.output}`;
|
|
|
566819
567020
|
}
|
|
566820
567021
|
if (completed)
|
|
566821
567022
|
break;
|
|
566822
|
-
this.
|
|
567023
|
+
this.adversaryObserve(messages2, turn);
|
|
566823
567024
|
const currentRepScore = this.detectRepetition(toolCallLog);
|
|
566824
567025
|
if (currentRepScore > 0.4 && toolCallLog.length >= 4) {
|
|
566825
567026
|
const { repetitionWindow } = this.contextLimits();
|
|
@@ -567006,13 +567207,17 @@ Call task_complete(summary="...") NOW with whatever you have.`
|
|
|
567006
567207
|
});
|
|
567007
567208
|
}
|
|
567008
567209
|
if (/task.?complete|all tests pass/i.test(content)) {
|
|
567210
|
+
const completionArgs = { summary: content };
|
|
567211
|
+
if (holdNoProgressTaskComplete(completionArgs, turn) || holdProvenanceTaskComplete(completionArgs, turn)) {
|
|
567212
|
+
continue;
|
|
567213
|
+
}
|
|
567009
567214
|
completed = true;
|
|
567010
567215
|
summary = content;
|
|
567011
567216
|
break;
|
|
567012
567217
|
}
|
|
567013
567218
|
if (isThinkOnly) {
|
|
567014
567219
|
if (consecutiveThinkOnly >= MAX_CONSECUTIVE_THINK_ONLY) {
|
|
567015
|
-
const recentSuccesses = this.
|
|
567220
|
+
const recentSuccesses = this._adversaryToolOutcomes.slice(-3).filter((o2) => o2.succeeded);
|
|
567016
567221
|
const hasRecentSuccess = recentSuccesses.length > 0;
|
|
567017
567222
|
const successHint = hasRecentSuccess ? `
|
|
567018
567223
|
|
|
@@ -567263,7 +567468,8 @@ ${this.options.maxTurns && this.options.maxTurns > 0 ? `You have ${this.options.
|
|
|
567263
567468
|
tools: toolDefs,
|
|
567264
567469
|
temperature: this.options.temperature,
|
|
567265
567470
|
maxTokens: this.options.maxTokens,
|
|
567266
|
-
timeoutMs: this.options.requestTimeoutMs
|
|
567471
|
+
timeoutMs: this.options.requestTimeoutMs,
|
|
567472
|
+
numCtx: this.options.contextWindowSize || void 0
|
|
567267
567473
|
};
|
|
567268
567474
|
let response;
|
|
567269
567475
|
try {
|
|
@@ -567568,13 +567774,12 @@ Full content available via: repl_exec(code="data = retrieve('${handleId}')") or
|
|
|
567568
567774
|
timestamp: (/* @__PURE__ */ new Date()).toISOString()
|
|
567569
567775
|
});
|
|
567570
567776
|
} else {
|
|
567571
|
-
if (holdNoProgressTaskComplete(tc.arguments, turn)) {
|
|
567777
|
+
if (holdNoProgressTaskComplete(tc.arguments, turn) || holdProvenanceTaskComplete(tc.arguments, turn)) {
|
|
567572
567778
|
continue;
|
|
567573
567779
|
}
|
|
567574
567780
|
const _bp4 = await this._runBackwardPassReview(turn);
|
|
567575
567781
|
if (_bp4 && !_bp4.proceed && _bp4.feedback) {
|
|
567576
|
-
|
|
567577
|
-
continue;
|
|
567782
|
+
emitBackwardPassAdvisory(_bp4.feedback, turn);
|
|
567578
567783
|
}
|
|
567579
567784
|
completed = true;
|
|
567580
567785
|
summary = extractTaskCompleteSummary(tc.arguments);
|
|
@@ -567621,22 +567826,9 @@ Full content available via: repl_exec(code="data = retrieve('${handleId}')") or
|
|
|
567621
567826
|
timestamp: (/* @__PURE__ */ new Date()).toISOString()
|
|
567622
567827
|
});
|
|
567623
567828
|
} else {
|
|
567624
|
-
|
|
567625
|
-
|
|
567626
|
-
|
|
567627
|
-
toolCallLog,
|
|
567628
|
-
taskState: this._taskState
|
|
567629
|
-
});
|
|
567630
|
-
if (gate.shouldInject && gate.content) {
|
|
567631
|
-
messages2.push({ role: "system", content: gate.content });
|
|
567632
|
-
this.emit({
|
|
567633
|
-
type: "status",
|
|
567634
|
-
content: "text completion held: discovery happened but no deliverable or explicit blocker is recorded",
|
|
567635
|
-
turn,
|
|
567636
|
-
timestamp: (/* @__PURE__ */ new Date()).toISOString()
|
|
567637
|
-
});
|
|
567638
|
-
continue;
|
|
567639
|
-
}
|
|
567829
|
+
const completionArgs = { summary: content };
|
|
567830
|
+
if (holdNoProgressTaskComplete(completionArgs, turn) || holdProvenanceTaskComplete(completionArgs, turn)) {
|
|
567831
|
+
continue;
|
|
567640
567832
|
}
|
|
567641
567833
|
completed = true;
|
|
567642
567834
|
summary = content;
|
|
@@ -567660,7 +567852,7 @@ Full content available via: repl_exec(code="data = retrieve('${handleId}')") or
|
|
|
567660
567852
|
}
|
|
567661
567853
|
if (isThinkOnlyBF) {
|
|
567662
567854
|
if (consecutiveThinkOnly >= MAX_CONSECUTIVE_THINK_ONLY) {
|
|
567663
|
-
const recentSucc = this.
|
|
567855
|
+
const recentSucc = this._adversaryToolOutcomes.slice(-3).filter((o2) => o2.succeeded);
|
|
567664
567856
|
const succHint = recentSucc.length > 0 ? "\n\nYour most recent tool calls SUCCEEDED. If the task is complete, call task_complete now with a summary." : "";
|
|
567665
567857
|
messages2.push({
|
|
567666
567858
|
role: "user",
|
|
@@ -569748,36 +569940,35 @@ ${newerSummary}`;
|
|
|
569748
569940
|
${trimmedNew}`;
|
|
569749
569941
|
}
|
|
569750
569942
|
// -------------------------------------------------------------------------
|
|
569751
|
-
//
|
|
569943
|
+
// Adversary — parallel meta-analysis of the main loop
|
|
569752
569944
|
// -------------------------------------------------------------------------
|
|
569753
|
-
// Inspired by Hannover's fireCompanionObserver (src/buddy/observer.ts).
|
|
569754
569945
|
// Runs after each tool turn to detect when the model has lost track of
|
|
569755
569946
|
// what happened and inject corrections before the next inference.
|
|
569756
569947
|
//
|
|
569757
569948
|
// This is the architectural fix for the "I see both tools have been failing"
|
|
569758
569949
|
// regression: instead of only fixing the data the model sees (mask/summary),
|
|
569759
569950
|
// we add a second analysis path that catches mismatches in real-time.
|
|
569760
|
-
/** Track recent tool outcomes for the
|
|
569761
|
-
|
|
569762
|
-
/** WO-FIX-C: Tool fingerprints the
|
|
569763
|
-
* Checked in executeSingle to
|
|
569764
|
-
|
|
569951
|
+
/** Track recent tool outcomes for the adversary */
|
|
569952
|
+
_adversaryToolOutcomes = [];
|
|
569953
|
+
/** WO-FIX-C: Tool fingerprints the adversary has flagged as redundant.
|
|
569954
|
+
* Checked in executeSingle to attach advisory guidance before dispatch. */
|
|
569955
|
+
_adversaryRedundantSignals = /* @__PURE__ */ new Set();
|
|
569765
569956
|
/** Reflexion pattern: task-local failure-indexed reflection buffer.
|
|
569766
569957
|
* Generates typed self-reflections on task failure and injects them
|
|
569767
569958
|
* into the next attempt's context for active learning. */
|
|
569768
569959
|
_reflectionBuffer = null;
|
|
569769
569960
|
/**
|
|
569770
|
-
*
|
|
569961
|
+
* Adversary: post-turn meta-analysis.
|
|
569771
569962
|
*
|
|
569772
569963
|
* Examines the last few messages looking for contradictions between
|
|
569773
569964
|
* actual tool outcomes and the model's stated understanding. When it
|
|
569774
569965
|
* detects the model claiming failure after success (or vice versa),
|
|
569775
|
-
* it injects a corrective
|
|
569966
|
+
* it injects a corrective non-blocking critique.
|
|
569776
569967
|
*
|
|
569777
569968
|
* Also detects repeated actions — when the model re-does something
|
|
569778
|
-
* that already succeeded, the
|
|
569969
|
+
* that already succeeded, the adversary nudges it to move on.
|
|
569779
569970
|
*/
|
|
569780
|
-
|
|
569971
|
+
adversaryObserve(messages2, turn) {
|
|
569781
569972
|
if (this.options.modelTier === "large")
|
|
569782
569973
|
return;
|
|
569783
569974
|
const recent = messages2.slice(-6);
|
|
@@ -569806,8 +569997,8 @@ ${trimmedNew}`;
|
|
|
569806
569997
|
}
|
|
569807
569998
|
const argsKey = toolArgs ? this._buildExactArgsKey(toolArgs) : void 0;
|
|
569808
569999
|
const fingerprint = toolArgs ? this._buildToolFingerprint(toolName, toolArgs) : void 0;
|
|
569809
|
-
if (!this.
|
|
569810
|
-
this.
|
|
570000
|
+
if (!this._adversaryToolOutcomes.some((o2) => o2.turn === turn && o2.tool === toolName && o2.fingerprint === fingerprint)) {
|
|
570001
|
+
this._adversaryToolOutcomes.push({
|
|
569811
570002
|
turn,
|
|
569812
570003
|
tool: toolName,
|
|
569813
570004
|
argsKey,
|
|
@@ -569818,27 +570009,47 @@ ${trimmedNew}`;
|
|
|
569818
570009
|
}
|
|
569819
570010
|
}
|
|
569820
570011
|
}
|
|
569821
|
-
while (this.
|
|
569822
|
-
this.
|
|
570012
|
+
while (this._adversaryToolOutcomes.length > 20)
|
|
570013
|
+
this._adversaryToolOutcomes.shift();
|
|
569823
570014
|
const emitReaction = (cls, shortText, confidence2, details2) => {
|
|
569824
570015
|
this.emit({
|
|
569825
|
-
type: "
|
|
570016
|
+
type: "adversary_reaction",
|
|
569826
570017
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
569827
|
-
|
|
570018
|
+
adversary: { class: cls, shortText, confidence: confidence2, details: details2 }
|
|
569828
570019
|
});
|
|
569829
570020
|
};
|
|
570021
|
+
const buildAdversaryCritique = (input) => {
|
|
570022
|
+
const alternatives = input.alternatives && input.alternatives.length > 0 ? `
|
|
570023
|
+
Alternatives:
|
|
570024
|
+
${input.alternatives.map((item) => `- ${item}`).join("\n")}` : "";
|
|
570025
|
+
return [
|
|
570026
|
+
`[ADVERSARY CRITIQUE — non-blocking]`,
|
|
570027
|
+
`Evidence: ${input.evidence}`,
|
|
570028
|
+
`Root cause hypothesis: ${input.hypothesis}`,
|
|
570029
|
+
`Corrective action: ${input.correctiveAction}${alternatives}`
|
|
570030
|
+
].join("\n");
|
|
570031
|
+
};
|
|
569830
570032
|
const lastAssistant = [...recent].reverse().find((m2) => m2.role === "assistant" && typeof m2.content === "string");
|
|
569831
570033
|
if (lastAssistant && typeof lastAssistant.content === "string") {
|
|
569832
570034
|
const text = lastAssistant.content.toLowerCase();
|
|
569833
570035
|
const claimsFailure = /(?:fail|error|didn't work|not working|unable to|cannot|couldn't|both .* fail|tools? (?:have |has )?been fail)/i.test(text);
|
|
569834
570036
|
if (claimsFailure) {
|
|
569835
|
-
const recentOutcomes = this.
|
|
570037
|
+
const recentOutcomes = this._adversaryToolOutcomes.slice(-4);
|
|
569836
570038
|
const successes = recentOutcomes.filter((o2) => o2.succeeded);
|
|
569837
570039
|
if (successes.length >= 1) {
|
|
569838
570040
|
const successList = successes.map((o2) => `${o2.tool}: ${o2.preview.slice(0, 60)}`).join("; ");
|
|
569839
570041
|
emitReaction("false_failure", `Claimed failure, but recent tools succeeded (${successes.length})`, 0.9, successList);
|
|
569840
|
-
if (this.
|
|
569841
|
-
this.pendingUserMessages.push(
|
|
570042
|
+
if (this._adversaryMode === "skillcoach" || this._adversaryMode === "both") {
|
|
570043
|
+
this.pendingUserMessages.push(buildAdversaryCritique({
|
|
570044
|
+
evidence: `Recent tools succeeded: ${successList}.`,
|
|
570045
|
+
hypothesis: "The main loop is interpreting uncertainty or partial progress as failure and may be about to discard usable evidence.",
|
|
570046
|
+
correctiveAction: "Use the successful results to advance the task, then verify the next concrete step.",
|
|
570047
|
+
alternatives: [
|
|
570048
|
+
"Edit or run the next verification step that follows from the successful output.",
|
|
570049
|
+
"Read a different targeted file if the successful result exposed a new path or symbol.",
|
|
570050
|
+
"Complete only if the successful output is sufficient evidence for the user's request."
|
|
570051
|
+
]
|
|
570052
|
+
}));
|
|
569842
570053
|
}
|
|
569843
570054
|
this.emit({
|
|
569844
570055
|
type: "status",
|
|
@@ -569852,47 +570063,67 @@ ${trimmedNew}`;
|
|
|
569852
570063
|
const text = lastAssistant.content.toLowerCase();
|
|
569853
570064
|
const claimsSuccess = /(done|fixed|success|passed|complete)/i.test(text);
|
|
569854
570065
|
if (claimsSuccess) {
|
|
569855
|
-
const recentOutcomes = this.
|
|
570066
|
+
const recentOutcomes = this._adversaryToolOutcomes.slice(-4);
|
|
569856
570067
|
const failures = recentOutcomes.filter((o2) => !o2.succeeded);
|
|
569857
570068
|
const successes = recentOutcomes.filter((o2) => o2.succeeded);
|
|
569858
570069
|
if (failures.length > 0 && successes.length === 0) {
|
|
569859
570070
|
const failList = failures.map((o2) => `${o2.tool}: ${o2.preview.slice(0, 60)}`).join("; ");
|
|
569860
570071
|
emitReaction("false_success", `Claimed success, but recent tools failed (${failures.length})`, 0.9, failList);
|
|
569861
|
-
if (this.
|
|
569862
|
-
this.pendingUserMessages.push(
|
|
570072
|
+
if (this._adversaryMode === "skillcoach" || this._adversaryMode === "both") {
|
|
570073
|
+
this.pendingUserMessages.push(buildAdversaryCritique({
|
|
570074
|
+
evidence: `Recent tools show errors (${failures.length}): ${failList}.`,
|
|
570075
|
+
hypothesis: "The main loop is prematurely compressing intent into success language before the verifier produced evidence.",
|
|
570076
|
+
correctiveAction: "Inspect the failed output, identify the implicated path/symbol/command, and run one focused corrective step before claiming success.",
|
|
570077
|
+
alternatives: [
|
|
570078
|
+
"Read the smallest relevant source region around the failed symbol.",
|
|
570079
|
+
"Patch the implicated code or configuration.",
|
|
570080
|
+
"Run the same verifier only after a state-changing fix."
|
|
570081
|
+
]
|
|
570082
|
+
}));
|
|
569863
570083
|
}
|
|
569864
570084
|
}
|
|
569865
570085
|
}
|
|
569866
570086
|
}
|
|
569867
|
-
|
|
569868
|
-
|
|
569869
|
-
const
|
|
569870
|
-
|
|
569871
|
-
|
|
569872
|
-
|
|
569873
|
-
|
|
569874
|
-
|
|
569875
|
-
|
|
569876
|
-
|
|
569877
|
-
|
|
569878
|
-
|
|
569879
|
-
|
|
569880
|
-
|
|
569881
|
-
|
|
569882
|
-
|
|
569883
|
-
|
|
569884
|
-
this.
|
|
570087
|
+
if (this.options.disableAdversaryCritic !== true) {
|
|
570088
|
+
const lastToolCalls = recent.filter((m2) => m2.role === "assistant" && m2.tool_calls?.length).flatMap((m2) => m2.tool_calls ?? []);
|
|
570089
|
+
for (const tc of lastToolCalls) {
|
|
570090
|
+
const name10 = tc.function.name;
|
|
570091
|
+
if (this._isStatefulBrowserTool(name10))
|
|
570092
|
+
continue;
|
|
570093
|
+
let args = {};
|
|
570094
|
+
try {
|
|
570095
|
+
args = JSON.parse(tc.function.arguments);
|
|
570096
|
+
} catch {
|
|
570097
|
+
}
|
|
570098
|
+
const argsKey = this._buildExactArgsKey(args);
|
|
570099
|
+
const fingerprint = this._buildToolFingerprint(name10, args);
|
|
570100
|
+
const prior = this._adversaryToolOutcomes.find((o2) => o2.succeeded && o2.tool === name10 && o2.fingerprint === fingerprint && o2.turn < turn);
|
|
570101
|
+
if (prior) {
|
|
570102
|
+
this._adversaryRedundantSignals.add(fingerprint);
|
|
570103
|
+
emitReaction("redundant_action", `Already ran ${name10} successfully on turn ${prior.turn}`, 0.8, prior.preview);
|
|
570104
|
+
if (this._adversaryMode === "skillcoach" || this._adversaryMode === "both") {
|
|
570105
|
+
this.pendingUserMessages.push(buildAdversaryCritique({
|
|
570106
|
+
evidence: `${name10} already succeeded on turn ${prior.turn} with exact arguments (${argsKey.slice(0, 120)}). Prior preview: ${prior.preview}`,
|
|
570107
|
+
hypothesis: "The main loop may have lost track of previously observed evidence because of context pressure, path confusion, or repeated discovery.",
|
|
570108
|
+
correctiveAction: "Let this duplicate run execute if needed, but treat the prior result as evidence and pivot afterward unless state has changed.",
|
|
570109
|
+
alternatives: [
|
|
570110
|
+
"Use the prior result to edit/write, verify, or finish with evidence.",
|
|
570111
|
+
"Read a different specific file or selector if the current evidence is insufficient.",
|
|
570112
|
+
"Repeat exact arguments only when filesystem, browser, or page state changed."
|
|
570113
|
+
]
|
|
570114
|
+
}));
|
|
570115
|
+
}
|
|
570116
|
+
this.emit({
|
|
570117
|
+
type: "status",
|
|
570118
|
+
content: `\x1B[38;5;178m⚠ Adversary noted redundant ${name10} call (succeeded on turn ${prior.turn}); action remains allowed\x1B[0m`,
|
|
570119
|
+
timestamp: (/* @__PURE__ */ new Date()).toISOString()
|
|
570120
|
+
});
|
|
570121
|
+
break;
|
|
569885
570122
|
}
|
|
569886
|
-
this.emit({
|
|
569887
|
-
type: "status",
|
|
569888
|
-
content: `\x1B[38;5;178m⚠ Prevented redundant ${name10} call (succeeded on turn ${prior.turn})\x1B[0m`,
|
|
569889
|
-
timestamp: (/* @__PURE__ */ new Date()).toISOString()
|
|
569890
|
-
});
|
|
569891
|
-
break;
|
|
569892
570123
|
}
|
|
569893
570124
|
}
|
|
569894
570125
|
{
|
|
569895
|
-
const recentCalls = this.
|
|
570126
|
+
const recentCalls = this._adversaryToolOutcomes.slice(-5);
|
|
569896
570127
|
if (recentCalls.length >= 3) {
|
|
569897
570128
|
let consecutiveShortResults = 0;
|
|
569898
570129
|
for (let i2 = recentCalls.length - 1; i2 >= 0; i2--) {
|
|
@@ -569905,30 +570136,39 @@ ${trimmedNew}`;
|
|
|
569905
570136
|
}
|
|
569906
570137
|
if (consecutiveShortResults >= 3) {
|
|
569907
570138
|
emitReaction("idle_think", `Consecutive output without input: ${consecutiveShortResults}`, 0.7);
|
|
569908
|
-
if (this.
|
|
569909
|
-
this.pendingUserMessages.push(
|
|
570139
|
+
if (this._adversaryMode === "skillcoach" || this._adversaryMode === "both") {
|
|
570140
|
+
this.pendingUserMessages.push(buildAdversaryCritique({
|
|
570141
|
+
evidence: `${consecutiveShortResults} consecutive output-like calls occurred without an input-like observation.`,
|
|
570142
|
+
hypothesis: "The loop may be acting from stale state instead of re-observing the environment.",
|
|
570143
|
+
correctiveAction: "Take one input/observation step before another output step.",
|
|
570144
|
+
alternatives: [
|
|
570145
|
+
"Call the input/listen/poll tool for the current environment.",
|
|
570146
|
+
"Read the current UI/page state before clicking or typing again.",
|
|
570147
|
+
"If the task is already complete, finish with the concrete evidence already observed."
|
|
570148
|
+
]
|
|
570149
|
+
}));
|
|
569910
570150
|
}
|
|
569911
570151
|
this.emit({
|
|
569912
570152
|
type: "status",
|
|
569913
|
-
content: `\x1B[38;5;178m⚠
|
|
570153
|
+
content: `\x1B[38;5;178m⚠ Adversary flagged runaway-output risk (${consecutiveShortResults} consecutive sends without receive); action remains allowed\x1B[0m`,
|
|
569914
570154
|
timestamp: (/* @__PURE__ */ new Date()).toISOString()
|
|
569915
570155
|
});
|
|
569916
570156
|
}
|
|
569917
570157
|
}
|
|
569918
570158
|
}
|
|
569919
|
-
const succCount = this.
|
|
569920
|
-
const failCount = this.
|
|
569921
|
-
const lastFour = this.
|
|
570159
|
+
const succCount = this._adversaryToolOutcomes.filter((o2) => o2.succeeded).length;
|
|
570160
|
+
const failCount = this._adversaryToolOutcomes.filter((o2) => !o2.succeeded).length;
|
|
570161
|
+
const lastFour = this._adversaryToolOutcomes.slice(-4);
|
|
569922
570162
|
const details = [
|
|
569923
570163
|
`Recent tool outcomes:`,
|
|
569924
570164
|
...lastFour.map((o2) => `- ${o2.tool}: ${o2.succeeded ? "OK" : "ERR"} — ${o2.preview}`)
|
|
569925
570165
|
].join("\n");
|
|
569926
570166
|
this.emit({
|
|
569927
|
-
type: "
|
|
570167
|
+
type: "debug_adversary",
|
|
569928
570168
|
turn,
|
|
569929
570169
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
569930
|
-
content: `
|
|
569931
|
-
|
|
570170
|
+
content: `Adversary: ${this._adversaryToolOutcomes.length} tracked outcomes (${succCount} ok, ${failCount} err)`,
|
|
570171
|
+
adversaryAction: {
|
|
569932
570172
|
detection: "none",
|
|
569933
570173
|
recentSuccesses: succCount,
|
|
569934
570174
|
recentFailures: failCount,
|
|
@@ -650870,7 +651110,7 @@ ${conversationStream}`
|
|
|
650870
651110
|
// off default rather than the global config's value.
|
|
650871
651111
|
thinking: false,
|
|
650872
651112
|
// Telegram sub-agent runs must be bounded. Brute-force re-engagement and
|
|
650873
|
-
// the
|
|
651113
|
+
// the Adversary near-cap turn extension are appropriate for the full TUI
|
|
650874
651114
|
// session but cause Telegram to silently outgrow its nominal maxTurns,
|
|
650875
651115
|
// which is how the Snow Crash PDF loop reached 60+ turns of self-talk.
|
|
650876
651116
|
...TELEGRAM_SUB_AGENT_BOUNDED_OPTIONS
|
|
@@ -683233,8 +683473,8 @@ ${entry.fullContent}`
|
|
|
683233
683473
|
let streamTextBuffer = "";
|
|
683234
683474
|
let lastAssistantText = "";
|
|
683235
683475
|
let lastProvenancePath = null;
|
|
683236
|
-
let
|
|
683237
|
-
const
|
|
683476
|
+
let showAdversary = false;
|
|
683477
|
+
const adversaryBuffer = [];
|
|
683238
683478
|
const contentWrite = (fn) => {
|
|
683239
683479
|
if (isNeovimActive()) {
|
|
683240
683480
|
const origWrite = process.stdout.write;
|
|
@@ -683718,24 +683958,24 @@ ${entry.fullContent}`
|
|
|
683718
683958
|
if (snap) {
|
|
683719
683959
|
contentWrite(
|
|
683720
683960
|
() => renderInfo(
|
|
683721
|
-
`\x1B[38;5;243m[ctx] ${snap.messageCount} msgs | ~${snap.estimatedTokens} tok | headroom: ${snap.headroom} | tools: ${snap.toolCallCount} |
|
|
683961
|
+
`\x1B[38;5;243m[ctx] ${snap.messageCount} msgs | ~${snap.estimatedTokens} tok | headroom: ${snap.headroom} | tools: ${snap.toolCallCount} | adversary: ${snap.adversaryOutcomes} tracked\x1B[0m`
|
|
683722
683962
|
)
|
|
683723
683963
|
);
|
|
683724
683964
|
}
|
|
683725
683965
|
}
|
|
683726
683966
|
break;
|
|
683727
|
-
case "
|
|
683728
|
-
if (event.
|
|
683729
|
-
const lm = event.
|
|
683967
|
+
case "debug_adversary":
|
|
683968
|
+
if (event.adversaryAction) {
|
|
683969
|
+
const lm = event.adversaryAction;
|
|
683730
683970
|
if (lm.intervention) {
|
|
683731
683971
|
const simple = `⚠ ${lm.intervention}`;
|
|
683732
683972
|
contentWrite(() => renderInfo(simple));
|
|
683733
683973
|
}
|
|
683734
683974
|
if (lm.details) {
|
|
683735
|
-
|
|
683736
|
-
if (
|
|
683737
|
-
|
|
683738
|
-
if (
|
|
683975
|
+
adversaryBuffer.push(lm.details);
|
|
683976
|
+
if (adversaryBuffer.length > 50)
|
|
683977
|
+
adversaryBuffer.splice(0, adversaryBuffer.length - 50);
|
|
683978
|
+
if (showAdversary) {
|
|
683739
683979
|
const det = String(lm.details);
|
|
683740
683980
|
contentWrite(() => {
|
|
683741
683981
|
process.stdout.write(c3.dim(det) + "\n");
|
|
@@ -685477,8 +685717,8 @@ This is an independent background session started from /background.`
|
|
|
685477
685717
|
origTtyWriteRef = null;
|
|
685478
685718
|
statusBar.setNeovimFocusChecker(() => isNeovimFocused());
|
|
685479
685719
|
let _escapeHandler = null;
|
|
685480
|
-
let
|
|
685481
|
-
const
|
|
685720
|
+
let showAdversary = false;
|
|
685721
|
+
const adversaryBuffer = [];
|
|
685482
685722
|
statusBar.hookDirectInput(
|
|
685483
685723
|
rl,
|
|
685484
685724
|
() => {
|
|
@@ -685511,26 +685751,26 @@ This is an independent background session started from /background.`
|
|
|
685511
685751
|
}
|
|
685512
685752
|
},
|
|
685513
685753
|
() => {
|
|
685514
|
-
|
|
685754
|
+
showAdversary = !showAdversary;
|
|
685515
685755
|
if (statusBar.isActive) {
|
|
685516
685756
|
try {
|
|
685517
685757
|
statusBar.jumpToLive();
|
|
685518
685758
|
} catch {
|
|
685519
685759
|
}
|
|
685520
685760
|
statusBar.beginContentWrite();
|
|
685521
|
-
if (
|
|
685522
|
-
renderInfo("
|
|
685523
|
-
const dump =
|
|
685761
|
+
if (showAdversary) {
|
|
685762
|
+
renderInfo("Adversary details: shown");
|
|
685763
|
+
const dump = adversaryBuffer.slice(-10).join("\n");
|
|
685524
685764
|
if (dump.trim()) {
|
|
685525
685765
|
process.stdout.write(`
|
|
685526
|
-
${c3.dim("[
|
|
685766
|
+
${c3.dim("[adversary recap]")}
|
|
685527
685767
|
`);
|
|
685528
685768
|
for (const line of dump.split("\n")) {
|
|
685529
685769
|
process.stdout.write(" " + c3.dim(line) + "\n");
|
|
685530
685770
|
}
|
|
685531
685771
|
}
|
|
685532
685772
|
} else {
|
|
685533
|
-
renderInfo("
|
|
685773
|
+
renderInfo("Adversary details: hidden");
|
|
685534
685774
|
}
|
|
685535
685775
|
statusBar.endContentWrite();
|
|
685536
685776
|
}
|