create-walle 0.9.13 → 0.9.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -3
- package/bin/create-walle.js +232 -32
- package/bin/mcp-inject.js +18 -53
- package/package.json +3 -1
- package/template/claude-task-manager/api-prompts.js +11 -2
- package/template/claude-task-manager/approval-agent.js +7 -0
- package/template/claude-task-manager/db.js +94 -75
- package/template/claude-task-manager/docs/session-standup-command-center-design.md +242 -0
- package/template/claude-task-manager/docs/session-tooltip-freshness-design.md +224 -0
- package/template/claude-task-manager/docs/session-ux-issue-review-2026-05-01.md +369 -0
- package/template/claude-task-manager/fuzzy-utils.js +10 -2
- package/template/claude-task-manager/git-utils.js +140 -10
- package/template/claude-task-manager/lib/agent-capabilities.js +1 -1
- package/template/claude-task-manager/lib/agent-presets.js +38 -5
- package/template/claude-task-manager/lib/codex-terminal-final.js +53 -0
- package/template/claude-task-manager/lib/ctm-session-context-api.js +222 -0
- package/template/claude-task-manager/lib/session-diagnostics.js +56 -0
- package/template/claude-task-manager/lib/session-history.js +309 -16
- package/template/claude-task-manager/lib/session-standup.js +409 -0
- package/template/claude-task-manager/lib/session-stream.js +253 -20
- package/template/claude-task-manager/lib/standup-attention.js +200 -0
- package/template/claude-task-manager/lib/status-hooks.js +8 -2
- package/template/claude-task-manager/lib/update-telemetry.js +114 -0
- package/template/claude-task-manager/lib/walle-ctm-history.js +49 -6
- package/template/claude-task-manager/lib/walle-default-model.js +55 -0
- package/template/claude-task-manager/lib/walle-mcp-auto-config.js +66 -0
- package/template/claude-task-manager/lib/walle-supervisor.js +86 -19
- package/template/claude-task-manager/lib/walle-transcript.js +1 -3
- package/template/claude-task-manager/lib/worktree-cwd.js +82 -0
- package/template/claude-task-manager/package.json +1 -0
- package/template/claude-task-manager/providers/codex-mcp.js +104 -0
- package/template/claude-task-manager/providers/index.js +2 -0
- package/template/claude-task-manager/public/css/setup.css +2 -1
- package/template/claude-task-manager/public/css/walle.css +71 -0
- package/template/claude-task-manager/public/index.html +2388 -429
- package/template/claude-task-manager/public/js/message-renderer.js +314 -35
- package/template/claude-task-manager/public/js/session-search-utils.js +185 -3
- package/template/claude-task-manager/public/js/session-status-precedence.js +125 -0
- package/template/claude-task-manager/public/js/setup.js +62 -19
- package/template/claude-task-manager/public/js/stream-view.js +396 -55
- package/template/claude-task-manager/public/js/terminal-restore-state.js +57 -0
- package/template/claude-task-manager/public/js/walle-session.js +234 -26
- package/template/claude-task-manager/public/js/walle.js +143 -2
- package/template/claude-task-manager/server.js +1402 -433
- package/template/claude-task-manager/session-integrity.js +77 -28
- package/template/claude-task-manager/workers/approval-widget-validator.js +15 -5
- package/template/claude-task-manager/workers/scrollback-worker.js +5 -6
- package/template/claude-task-manager/workers/state-detectors/codex.js +6 -0
- package/template/package.json +1 -1
- package/template/wall-e/agent-runners/claude-code.js +2 -0
- package/template/wall-e/agent.js +63 -8
- package/template/wall-e/api-walle.js +330 -52
- package/template/wall-e/brain.js +291 -42
- package/template/wall-e/chat.js +172 -15
- package/template/wall-e/coding/compaction-service.js +19 -5
- package/template/wall-e/coding/stream-processor.js +22 -2
- package/template/wall-e/coding/workspace-replay.js +1 -4
- package/template/wall-e/coding-orchestrator.js +250 -80
- package/template/wall-e/compat.js +0 -28
- package/template/wall-e/context/context-builder.js +3 -1
- package/template/wall-e/embeddings.js +2 -7
- package/template/wall-e/eval/agent-runner.js +30 -9
- package/template/wall-e/eval/benchmark-generator.js +21 -1
- package/template/wall-e/eval/benchmarks/chat-eval.json +66 -6
- package/template/wall-e/eval/benchmarks/coding-agent.json +0 -596
- package/template/wall-e/eval/cc-replay.js +1 -0
- package/template/wall-e/eval/codex-cli-baseline.js +633 -0
- package/template/wall-e/eval/debug-agent003.js +1 -0
- package/template/wall-e/eval/eval-orchestrator.js +3 -3
- package/template/wall-e/eval/run-agent-benchmarks.js +11 -3
- package/template/wall-e/eval/run-codex-cli-baseline.js +177 -0
- package/template/wall-e/eval/run-model-comparison.js +1 -0
- package/template/wall-e/eval/swebench-adapter.js +1 -0
- package/template/wall-e/evaluation/quorum-evaluator.js +0 -1
- package/template/wall-e/extraction/knowledge-extractor.js +1 -2
- package/template/wall-e/lib/mcp-integration.js +336 -0
- package/template/wall-e/llm/ollama.js +47 -8
- package/template/wall-e/llm/ollama.plugin.json +1 -1
- package/template/wall-e/llm/tool-adapter.js +1 -0
- package/template/wall-e/loops/ingest.js +42 -8
- package/template/wall-e/loops/initiative.js +87 -2
- package/template/wall-e/mcp-server.js +872 -19
- package/template/wall-e/memory/ctm-context-client.js +230 -0
- package/template/wall-e/memory/ctm-session-context.js +1376 -0
- package/template/wall-e/prompts/coding/memory-protocol.md +6 -0
- package/template/wall-e/server.js +30 -1
- package/template/wall-e/skills/_bundled/memory-search/SKILL.md +8 -0
- package/template/wall-e/skills/_bundled/scan-ctm-sessions/SKILL.md +20 -0
- package/template/wall-e/skills/_bundled/scan-ctm-sessions/run.js +43 -0
- package/template/wall-e/skills/_bundled/slack-mentions/run.js +471 -188
- package/template/wall-e/skills/skill-planner.js +86 -4
- package/template/wall-e/slack/socket-mode-listener.js +276 -0
- package/template/wall-e/telemetry.js +70 -2
- package/template/wall-e/tools/builtin-middleware.js +55 -2
- package/template/wall-e/tools/shell-policy.js +1 -1
- package/template/wall-e/tools/slack-owner.js +104 -0
- package/template/website/index.html +4 -4
- package/template/builder-journal.md +0 -17
|
@@ -54,32 +54,17 @@ const {
|
|
|
54
54
|
shouldUseStreamProcessor,
|
|
55
55
|
} = require('./coding/runtime-mode');
|
|
56
56
|
const { createCodingTranscript } = require('./coding/transcript-writer');
|
|
57
|
+
const {
|
|
58
|
+
CompactionService,
|
|
59
|
+
DEFAULT_CONTEXT_WINDOW,
|
|
60
|
+
} = require('./coding/compaction-service');
|
|
61
|
+
const { estimateTokens, estimateMessagesTokens } = require('./context/token-counter');
|
|
57
62
|
|
|
58
63
|
const MAX_CUMULATIVE_CONTEXT = 4000;
|
|
59
64
|
const MAX_DIFF_SIZE = 50 * 1024; // 50KB
|
|
60
65
|
const MAX_AGENT_TURNS = 50;
|
|
61
66
|
const CHECKPOINT_INTERVAL = 5;
|
|
62
67
|
|
|
63
|
-
// ANSI-safe truncation: avoid cutting inside CSI escape sequences.
|
|
64
|
-
// Inspired by cmux SessionPersistence.swift scrollback truncation.
|
|
65
|
-
function ansiSafeTruncate(text, maxLen) {
|
|
66
|
-
if (text.length <= maxLen) return text;
|
|
67
|
-
let end = maxLen;
|
|
68
|
-
// If we're inside an ANSI escape sequence (ESC[...m), advance to its end
|
|
69
|
-
// Look back up to 20 chars for an unclosed ESC[
|
|
70
|
-
for (let i = end; i > Math.max(0, end - 20); i--) {
|
|
71
|
-
if (text[i] === '\x1b' || (text[i] === '\x1B')) {
|
|
72
|
-
// Found ESC — check if the sequence closes before our cut point
|
|
73
|
-
const closeIdx = text.indexOf('m', i);
|
|
74
|
-
if (closeIdx > end && closeIdx < end + 20) {
|
|
75
|
-
end = closeIdx + 1; // include the closing 'm'
|
|
76
|
-
}
|
|
77
|
-
break;
|
|
78
|
-
}
|
|
79
|
-
}
|
|
80
|
-
return text.slice(0, end);
|
|
81
|
-
}
|
|
82
|
-
|
|
83
68
|
// Coding-focused tool definitions (subset of local-tools)
|
|
84
69
|
const CODING_TOOLS = [
|
|
85
70
|
{
|
|
@@ -493,6 +478,156 @@ function providerSupportsToolCalls(provider) {
|
|
|
493
478
|
return true;
|
|
494
479
|
}
|
|
495
480
|
|
|
481
|
+
function positiveNumber(value) {
|
|
482
|
+
const n = Number(value);
|
|
483
|
+
return Number.isFinite(n) && n > 0 ? n : null;
|
|
484
|
+
}
|
|
485
|
+
|
|
486
|
+
function resolveCodingContextWindow(provider, opts = {}) {
|
|
487
|
+
const candidates = [
|
|
488
|
+
opts.compactionContextWindow,
|
|
489
|
+
opts.contextWindow,
|
|
490
|
+
opts.maxContextTokens,
|
|
491
|
+
opts.modelContextWindow,
|
|
492
|
+
provider?.maxContextTokens,
|
|
493
|
+
provider?.max_context_tokens,
|
|
494
|
+
provider?.contextWindow,
|
|
495
|
+
provider?.context_window,
|
|
496
|
+
provider?.metadata?.maxContextTokens,
|
|
497
|
+
provider?.metadata?.max_context_tokens,
|
|
498
|
+
provider?.metadata?.contextWindow,
|
|
499
|
+
provider?.modelInfo?.maxContextTokens,
|
|
500
|
+
provider?.modelInfo?.max_context_tokens,
|
|
501
|
+
];
|
|
502
|
+
for (const candidate of candidates) {
|
|
503
|
+
const n = positiveNumber(candidate);
|
|
504
|
+
if (n) return n;
|
|
505
|
+
}
|
|
506
|
+
return DEFAULT_CONTEXT_WINDOW;
|
|
507
|
+
}
|
|
508
|
+
|
|
509
|
+
function createCodingCompactionService(provider, modelId, opts = {}) {
|
|
510
|
+
if (opts.autoCompact === false || opts.compaction === false || opts.disableCompaction === true) return null;
|
|
511
|
+
if (String(process.env.WALLE_CODING_AUTO_COMPACT || '').trim() === '0') return null;
|
|
512
|
+
if (opts.compactionService) return opts.compactionService;
|
|
513
|
+
return new CompactionService({
|
|
514
|
+
provider,
|
|
515
|
+
model: modelId,
|
|
516
|
+
contextWindow: resolveCodingContextWindow(provider, opts),
|
|
517
|
+
threshold: opts.compactionThreshold,
|
|
518
|
+
tailTokenBudget: opts.compactionTailTokenBudget,
|
|
519
|
+
keepRecentUserTurns: opts.compactionKeepRecentUserTurns,
|
|
520
|
+
});
|
|
521
|
+
}
|
|
522
|
+
|
|
523
|
+
async function maybeCompactCodingContext({
|
|
524
|
+
messages,
|
|
525
|
+
compactionService,
|
|
526
|
+
systemPrompt = '',
|
|
527
|
+
sessionId,
|
|
528
|
+
cwd,
|
|
529
|
+
transcript,
|
|
530
|
+
events,
|
|
531
|
+
emitProgress,
|
|
532
|
+
mode,
|
|
533
|
+
step = -1,
|
|
534
|
+
sessionMemory,
|
|
535
|
+
reason = 'context_threshold',
|
|
536
|
+
opts = {},
|
|
537
|
+
} = {}) {
|
|
538
|
+
if (!compactionService || !Array.isArray(messages) || messages.length < 2) return null;
|
|
539
|
+
const systemTokens = estimateTokens(systemPrompt || '');
|
|
540
|
+
const estimatedInputTokens = systemTokens + estimateMessagesTokens(messages);
|
|
541
|
+
if (!compactionService.shouldCompact({ messages, systemTokens })) return null;
|
|
542
|
+
|
|
543
|
+
emitProgress?.({
|
|
544
|
+
phase: mode || 'executing',
|
|
545
|
+
step,
|
|
546
|
+
message: 'Compacting coding context...',
|
|
547
|
+
});
|
|
548
|
+
|
|
549
|
+
const result = await compactionService.compact(messages, {
|
|
550
|
+
sessionId,
|
|
551
|
+
cwd,
|
|
552
|
+
reason,
|
|
553
|
+
transcript,
|
|
554
|
+
sessionMemory,
|
|
555
|
+
tailMode: opts.compactionTailMode || 'continue',
|
|
556
|
+
tailTokenBudget: opts.compactionTailTokenBudget,
|
|
557
|
+
keepRecentUserTurns: opts.compactionKeepRecentUserTurns,
|
|
558
|
+
continuePrompt: opts.compactionContinuePrompt,
|
|
559
|
+
});
|
|
560
|
+
if (!result?.compacted || !Array.isArray(result.messages)) return result;
|
|
561
|
+
|
|
562
|
+
messages.splice(0, messages.length, ...result.messages);
|
|
563
|
+
const detail = {
|
|
564
|
+
compactionId: result.metadata?.compactionId || '',
|
|
565
|
+
reason,
|
|
566
|
+
estimatedInputTokens,
|
|
567
|
+
tokensBefore: result.tokensBeforeCompaction,
|
|
568
|
+
tokensAfter: result.tokensAfterCompaction,
|
|
569
|
+
compactedMessages: result.metadata?.compacted_message_count || 0,
|
|
570
|
+
retainedMessages: result.metadata?.retained_message_count || 0,
|
|
571
|
+
tailMode: result.metadata?.tail_mode || '',
|
|
572
|
+
};
|
|
573
|
+
events?.emit?.('context.overflow', { tokens: result.tokensBeforeCompaction, sessionId });
|
|
574
|
+
events?.emit?.('context.compacted', { sessionId, ...detail });
|
|
575
|
+
emitProgress?.({
|
|
576
|
+
phase: mode || 'executing',
|
|
577
|
+
step,
|
|
578
|
+
message: `Context compacted (${detail.compactedMessages} messages summarized)`,
|
|
579
|
+
detail,
|
|
580
|
+
});
|
|
581
|
+
return result;
|
|
582
|
+
}
|
|
583
|
+
|
|
584
|
+
function shouldAutoFallbackToCli({ opts = {}, explicitProvider = false, requestedTools = [] } = {}) {
|
|
585
|
+
if (opts._cliFallbackAttempt) return false;
|
|
586
|
+
if (opts.allowCliFallback === false) return false;
|
|
587
|
+
if (process.env.WALLE_CODING_AUTO_CLI_FALLBACK === '0') return false;
|
|
588
|
+
if (explicitProvider && opts.allowCliFallback !== true) return false;
|
|
589
|
+
if (Array.isArray(requestedTools) && requestedTools.length === 0) return false;
|
|
590
|
+
return true;
|
|
591
|
+
}
|
|
592
|
+
|
|
593
|
+
function isProviderFailureRecoverableByCli(message) {
|
|
594
|
+
const text = String(message || '');
|
|
595
|
+
return /oauth_proxy_error|OAuth token not found|Invalid bearer token|authentication_error|API key not valid|exceeded your current quota|does not support tool calls|No LLM provider configured/i.test(text);
|
|
596
|
+
}
|
|
597
|
+
|
|
598
|
+
async function runCliFallback(prompt, opts = {}, { sid, cwd, reason, fromProvider, model, runtimeMode } = {}) {
|
|
599
|
+
const runnerId = opts.agentRunner || opts.agent_runner || 'claude-code';
|
|
600
|
+
if (opts.onProgress) {
|
|
601
|
+
opts.onProgress({
|
|
602
|
+
type: 'cli_fallback',
|
|
603
|
+
phase: opts.mode || 'executing',
|
|
604
|
+
step: -1,
|
|
605
|
+
message: `Falling back to ${runnerId}`,
|
|
606
|
+
detail: { reason, fromProvider },
|
|
607
|
+
});
|
|
608
|
+
}
|
|
609
|
+
const result = await runHeadless(prompt, {
|
|
610
|
+
cwd,
|
|
611
|
+
sessionId: sid,
|
|
612
|
+
timeoutMs: opts.timeoutMs,
|
|
613
|
+
budgetUsd: opts.budgetUsd,
|
|
614
|
+
runnerId,
|
|
615
|
+
model,
|
|
616
|
+
mode: opts.mode || 'build',
|
|
617
|
+
});
|
|
618
|
+
return {
|
|
619
|
+
...result,
|
|
620
|
+
provider: result.provider || result.providerType || fromProvider,
|
|
621
|
+
model: result.model || model,
|
|
622
|
+
runtimeMode: runtimeMode?.id || runtimeMode,
|
|
623
|
+
fallback: {
|
|
624
|
+
runnerId,
|
|
625
|
+
fromProvider: fromProvider || null,
|
|
626
|
+
reason: String(reason || '').slice(0, 500),
|
|
627
|
+
},
|
|
628
|
+
};
|
|
629
|
+
}
|
|
630
|
+
|
|
496
631
|
/**
|
|
497
632
|
* Writes state object to JSON file.
|
|
498
633
|
*/
|
|
@@ -611,6 +746,7 @@ function saveCheckpointToBrain(sid, turn, messages, opts, totalInput, totalOutpu
|
|
|
611
746
|
*/
|
|
612
747
|
async function runAgentLoop(prompt, opts = {}) {
|
|
613
748
|
const { cwd, timeoutMs, maxTurns, provider, model, tools, onProgress } = opts;
|
|
749
|
+
const explicitProvider = !!provider;
|
|
614
750
|
const sid = opts._resumeSessionId || crypto.randomUUID();
|
|
615
751
|
|
|
616
752
|
// Persist activity start (Phase 2: Activity History)
|
|
@@ -677,12 +813,23 @@ async function runAgentLoop(prompt, opts = {}) {
|
|
|
677
813
|
}
|
|
678
814
|
if (requestedTools.length > 0 && !providerSupportsToolCalls(llm)) {
|
|
679
815
|
const providerType = llm.type || 'unknown';
|
|
816
|
+
const message = `Provider ${providerType} does not support tool calls`;
|
|
817
|
+
if (shouldAutoFallbackToCli({ opts, explicitProvider, requestedTools }) && isProviderFailureRecoverableByCli(message)) {
|
|
818
|
+
return runCliFallback(prompt, opts, {
|
|
819
|
+
sid,
|
|
820
|
+
cwd: resolvedCwd,
|
|
821
|
+
reason: message,
|
|
822
|
+
fromProvider: providerType,
|
|
823
|
+
model,
|
|
824
|
+
runtimeMode,
|
|
825
|
+
});
|
|
826
|
+
}
|
|
680
827
|
if (transcript?.appendPart) {
|
|
681
828
|
transcript.appendPart({
|
|
682
829
|
sessionId: sid,
|
|
683
830
|
cwd: resolvedCwd,
|
|
684
831
|
partType: 'error',
|
|
685
|
-
data: { message
|
|
832
|
+
data: { message },
|
|
686
833
|
});
|
|
687
834
|
}
|
|
688
835
|
return {
|
|
@@ -750,7 +897,7 @@ async function runAgentLoop(prompt, opts = {}) {
|
|
|
750
897
|
|
|
751
898
|
const mw = opts.middleware || (() => {
|
|
752
899
|
const m = new CodingMiddleware();
|
|
753
|
-
registerBuiltinMiddleware(m, { cwd, provider: llm?.type, model: modelId, claudeMd: opts.claudeMd, mode: opts.mode, taskEnv: opts.env });
|
|
900
|
+
registerBuiltinMiddleware(m, { cwd, provider: llm?.type, model: modelId, claudeMd: opts.claudeMd, mode: opts.mode, taskEnv: opts.env, benchmark: opts.benchmark });
|
|
754
901
|
return m;
|
|
755
902
|
})();
|
|
756
903
|
const events = opts.events || new CodingEvents();
|
|
@@ -810,8 +957,10 @@ async function runAgentLoop(prompt, opts = {}) {
|
|
|
810
957
|
// ── Interactive Questions (B1) ──
|
|
811
958
|
// Inspired by OpenCode Question service (packages/opencode/src/question/index.ts)
|
|
812
959
|
const questionManager = opts.questionManager || new QuestionManager(events);
|
|
960
|
+
const compactionService = createCodingCompactionService(llm, modelId, opts);
|
|
813
961
|
|
|
814
962
|
// projectInfo already detected above (before system prompt)
|
|
963
|
+
const llmCtxRef = { current: null }; // populated each turn (see llmCtx below)
|
|
815
964
|
|
|
816
965
|
// Stream-native runtime: model deltas, tool states, snapshots, permissions,
|
|
817
966
|
// and step boundaries are persisted as typed transcript parts while the loop
|
|
@@ -835,9 +984,15 @@ async function runAgentLoop(prompt, opts = {}) {
|
|
|
835
984
|
if (call.name === 'list_directory' && input.directory && !path.isAbsolute(input.directory)) {
|
|
836
985
|
input.directory = path.join(resolvedCwd, input.directory);
|
|
837
986
|
}
|
|
987
|
+
if (call.name === 'run_shell' && !input.cwd) {
|
|
988
|
+
input.cwd = resolvedCwd;
|
|
989
|
+
}
|
|
838
990
|
input.sessionId = sid;
|
|
839
991
|
input.projectRoot = resolvedCwd;
|
|
840
|
-
|
|
992
|
+
const toolCtx = { sessionId: sid, cwd: resolvedCwd, model: modelId, provider: llm.type, runtimeMode: runtimeMode.id };
|
|
993
|
+
const finalInput = await mw.run('tool.before', toolCtx, call.name, input);
|
|
994
|
+
const result = await toolRegistry.execute(call.name, finalInput, toolCtx);
|
|
995
|
+
return mw.run('tool.after', toolCtx, call.name, finalInput, result);
|
|
841
996
|
},
|
|
842
997
|
});
|
|
843
998
|
processor.on('event', (evt) => emitProgress({
|
|
@@ -851,6 +1006,7 @@ async function runAgentLoop(prompt, opts = {}) {
|
|
|
851
1006
|
let streamStopReason = '';
|
|
852
1007
|
let streamModel = modelId;
|
|
853
1008
|
const streamErrors = [];
|
|
1009
|
+
let streamHadEdit = false;
|
|
854
1010
|
for (let turnIndex = opts._resumeTurn || 0; turnIndex < turns; turnIndex++) {
|
|
855
1011
|
const remaining = deadline - Date.now();
|
|
856
1012
|
if (remaining <= 0) {
|
|
@@ -878,14 +1034,39 @@ async function runAgentLoop(prompt, opts = {}) {
|
|
|
878
1034
|
runtimeMode: runtimeMode.id,
|
|
879
1035
|
cwd: resolvedCwd,
|
|
880
1036
|
});
|
|
1037
|
+
const llmCtx = { params: { maxTokens: taskFileHints.length >= 4 ? 8192 : 4096 }, system: systemPrompt, cwd: resolvedCwd,
|
|
1038
|
+
provider: llm.type, model: modelId, mode: opts.mode, runtimeMode: runtimeMode.id, claudeMd: opts.claudeMd, log: {},
|
|
1039
|
+
toolsAvailable: toolsForTurn.length > 0 };
|
|
1040
|
+
llmCtxRef.current = llmCtx;
|
|
1041
|
+
await mw.run('llm.before', llmCtx);
|
|
1042
|
+
await maybeCompactCodingContext({
|
|
1043
|
+
messages,
|
|
1044
|
+
compactionService,
|
|
1045
|
+
systemPrompt: llmCtx.system,
|
|
1046
|
+
sessionId: sid,
|
|
1047
|
+
cwd: resolvedCwd,
|
|
1048
|
+
transcript,
|
|
1049
|
+
events,
|
|
1050
|
+
emitProgress,
|
|
1051
|
+
mode: opts.mode || 'executing',
|
|
1052
|
+
step: turnIndex,
|
|
1053
|
+
sessionMemory: opts.sessionMemory,
|
|
1054
|
+
reason: 'stream_pre_turn',
|
|
1055
|
+
opts,
|
|
1056
|
+
});
|
|
881
1057
|
turn = await processor.runTurn({
|
|
882
1058
|
sessionId: sid,
|
|
883
1059
|
cwd: resolvedCwd,
|
|
884
|
-
system:
|
|
1060
|
+
system: llmCtx.system,
|
|
885
1061
|
messages,
|
|
886
1062
|
tools: toolsForTurn,
|
|
887
1063
|
maxTokens: taskFileHints.length >= 4 ? 8192 : 4096,
|
|
888
1064
|
signal: ac.signal,
|
|
1065
|
+
maxTokens: llmCtx.params.maxTokens,
|
|
1066
|
+
temperature: llmCtx.params.temperature,
|
|
1067
|
+
thinking: llmCtx.params.thinking,
|
|
1068
|
+
reasoningEffort: llmCtx.params.reasoningEffort,
|
|
1069
|
+
options: llmCtx.params.options,
|
|
889
1070
|
});
|
|
890
1071
|
} finally {
|
|
891
1072
|
clearTimeout(timer);
|
|
@@ -911,6 +1092,7 @@ async function runAgentLoop(prompt, opts = {}) {
|
|
|
911
1092
|
content: turn.text,
|
|
912
1093
|
stopReason: turn.stopReason,
|
|
913
1094
|
});
|
|
1095
|
+
if (turn.hadEdit) streamHadEdit = true;
|
|
914
1096
|
|
|
915
1097
|
if (turn.status === 'error') break;
|
|
916
1098
|
if ((turn.toolCalls || []).length === 0) {
|
|
@@ -931,9 +1113,24 @@ async function runAgentLoop(prompt, opts = {}) {
|
|
|
931
1113
|
}
|
|
932
1114
|
if (turn.assistantMessage) messages.push(turn.assistantMessage);
|
|
933
1115
|
if (turn.toolResultMessage) messages.push(turn.toolResultMessage);
|
|
1116
|
+
if (turn.verified && streamHadEdit) break;
|
|
934
1117
|
if (turn.next !== 'continue') break;
|
|
935
1118
|
}
|
|
936
1119
|
|
|
1120
|
+
if (streamStatus === 'error') {
|
|
1121
|
+
const errorText = streamErrors.join('\n');
|
|
1122
|
+
if (shouldAutoFallbackToCli({ opts, explicitProvider, requestedTools }) && isProviderFailureRecoverableByCli(errorText)) {
|
|
1123
|
+
return runCliFallback(prompt, opts, {
|
|
1124
|
+
sid,
|
|
1125
|
+
cwd: resolvedCwd,
|
|
1126
|
+
reason: errorText,
|
|
1127
|
+
fromProvider: llm.type || '',
|
|
1128
|
+
model,
|
|
1129
|
+
runtimeMode,
|
|
1130
|
+
});
|
|
1131
|
+
}
|
|
1132
|
+
}
|
|
1133
|
+
|
|
937
1134
|
if (streamStatus === 'error' && transcript?.appendPart) {
|
|
938
1135
|
transcript.appendPart({
|
|
939
1136
|
sessionId: sid,
|
|
@@ -971,7 +1168,6 @@ async function runAgentLoop(prompt, opts = {}) {
|
|
|
971
1168
|
// ── Bridge: event bus → middleware (A2) ──
|
|
972
1169
|
// When the event bus fires, propagate to middleware's onEvent hook so
|
|
973
1170
|
// registered middleware can react to file edits, reads, and context overflow.
|
|
974
|
-
const llmCtxRef = { current: null }; // populated each turn (see llmCtx below)
|
|
975
1171
|
const _bridgeHandlers = {};
|
|
976
1172
|
for (const evtType of ['file.edited', 'file.read', 'context.overflow']) {
|
|
977
1173
|
const handler = (data) => {
|
|
@@ -1073,70 +1269,33 @@ async function runAgentLoop(prompt, opts = {}) {
|
|
|
1073
1269
|
const timer = setTimeout(() => ac.abort(), Math.min(remaining, perTurnCap));
|
|
1074
1270
|
|
|
1075
1271
|
// Middleware: prepare LLM call
|
|
1272
|
+
const turnsRemaining = turns - turn;
|
|
1076
1273
|
const llmCtx = { params: { maxTokens: taskFileHints.length >= 4 ? 8192 : 4096 }, system: systemPrompt, cwd: resolvedCwd,
|
|
1077
|
-
provider: llm.type, model: modelId, mode: opts.mode, runtimeMode: runtimeMode.id, claudeMd: opts.claudeMd, log: {}
|
|
1274
|
+
provider: llm.type, model: modelId, mode: opts.mode, runtimeMode: runtimeMode.id, claudeMd: opts.claudeMd, log: {},
|
|
1275
|
+
toolsAvailable: turnsRemaining > 1 };
|
|
1078
1276
|
llmCtxRef.current = llmCtx; // expose to event bridge (A2)
|
|
1079
1277
|
await mw.run('llm.before', llmCtx);
|
|
1080
1278
|
let adaptedTools = await toolRegistry.getDefinitions(llmCtx);
|
|
1081
1279
|
|
|
1082
|
-
|
|
1083
|
-
|
|
1084
|
-
|
|
1085
|
-
|
|
1086
|
-
|
|
1087
|
-
|
|
1088
|
-
|
|
1089
|
-
|
|
1090
|
-
|
|
1091
|
-
|
|
1092
|
-
|
|
1093
|
-
|
|
1094
|
-
|
|
1095
|
-
|
|
1096
|
-
|
|
1097
|
-
const contextLimit = 200000; // conservative for most models
|
|
1098
|
-
const reservedBuffer = 20000;
|
|
1099
|
-
const totalTokens = estimateTokens(messages);
|
|
1100
|
-
if (totalTokens >= contextLimit - reservedBuffer) {
|
|
1101
|
-
events.emit('context.overflow', { tokens: totalTokens, sessionId: sid });
|
|
1102
|
-
// Prune oldest tool results, keep last ~40K tokens worth
|
|
1103
|
-
const protectChars = 160000; // ~40K tokens * 4 chars/token
|
|
1104
|
-
let charsSeen = 0;
|
|
1105
|
-
for (let m = messages.length - 1; m >= 1; m--) { // never prune first user msg
|
|
1106
|
-
const msg = messages[m];
|
|
1107
|
-
if (typeof msg.content === 'string') charsSeen += msg.content.length;
|
|
1108
|
-
else if (Array.isArray(msg.content)) {
|
|
1109
|
-
for (const part of msg.content) {
|
|
1110
|
-
charsSeen += (part.text || part.content || '').length;
|
|
1111
|
-
}
|
|
1112
|
-
}
|
|
1113
|
-
if (charsSeen >= protectChars) {
|
|
1114
|
-
// Prune everything older than index m
|
|
1115
|
-
for (let j = 1; j < m; j++) {
|
|
1116
|
-
const old = messages[j];
|
|
1117
|
-
if (Array.isArray(old.content)) {
|
|
1118
|
-
old.content = old.content.map(part => {
|
|
1119
|
-
if (part.type === 'tool_result' && part.content) {
|
|
1120
|
-
const text = typeof part.content === 'string' ? part.content
|
|
1121
|
-
: Array.isArray(part.content) ? part.content.map(c => c.text || '').join('')
|
|
1122
|
-
: String(part.content);
|
|
1123
|
-
if (text.length > 200) {
|
|
1124
|
-
return { ...part, content: ansiSafeTruncate(text, 200) + '\n[compacted]' };
|
|
1125
|
-
}
|
|
1126
|
-
}
|
|
1127
|
-
return part;
|
|
1128
|
-
});
|
|
1129
|
-
}
|
|
1130
|
-
}
|
|
1131
|
-
break;
|
|
1132
|
-
}
|
|
1133
|
-
}
|
|
1134
|
-
}
|
|
1280
|
+
await maybeCompactCodingContext({
|
|
1281
|
+
messages,
|
|
1282
|
+
compactionService,
|
|
1283
|
+
systemPrompt: llmCtx.system,
|
|
1284
|
+
sessionId: sid,
|
|
1285
|
+
cwd: resolvedCwd,
|
|
1286
|
+
transcript,
|
|
1287
|
+
events,
|
|
1288
|
+
emitProgress,
|
|
1289
|
+
mode: opts.mode || 'executing',
|
|
1290
|
+
step: turn,
|
|
1291
|
+
sessionMemory: opts.sessionMemory,
|
|
1292
|
+
reason: 'legacy_pre_turn',
|
|
1293
|
+
opts,
|
|
1294
|
+
});
|
|
1135
1295
|
|
|
1136
1296
|
// Graceful max-steps degradation (6n)
|
|
1137
1297
|
// Note: warnings are appended to the LAST message's content (not as separate
|
|
1138
1298
|
// user messages) to avoid consecutive user messages which the API rejects.
|
|
1139
|
-
const turnsRemaining = turns - turn;
|
|
1140
1299
|
if (turnsRemaining <= 1) {
|
|
1141
1300
|
// Final turn: disable tools, force structured summary
|
|
1142
1301
|
adaptedTools = [];
|
|
@@ -1519,6 +1678,17 @@ async function runAgentLoop(prompt, opts = {}) {
|
|
|
1519
1678
|
if (questionManager) questionManager.clear();
|
|
1520
1679
|
try { require('./tools/file-tracker').clearSession(sid); } catch {}
|
|
1521
1680
|
|
|
1681
|
+
if (shouldAutoFallbackToCli({ opts, explicitProvider, requestedTools }) && isProviderFailureRecoverableByCli(err.message)) {
|
|
1682
|
+
return runCliFallback(prompt, opts, {
|
|
1683
|
+
sid,
|
|
1684
|
+
cwd: resolvedCwd,
|
|
1685
|
+
reason: err.message,
|
|
1686
|
+
fromProvider: llm?.type || '',
|
|
1687
|
+
model,
|
|
1688
|
+
runtimeMode,
|
|
1689
|
+
});
|
|
1690
|
+
}
|
|
1691
|
+
|
|
1522
1692
|
return {
|
|
1523
1693
|
success: false,
|
|
1524
1694
|
output: finalOutput,
|
|
@@ -5,27 +5,6 @@
|
|
|
5
5
|
// Called at boot to log warnings; queried by /telemetry skill for removal candidates.
|
|
6
6
|
|
|
7
7
|
const COMPAT_REGISTRY = {
|
|
8
|
-
embedding_v1_table: {
|
|
9
|
-
addedIn: '0.8.0',
|
|
10
|
-
deprecatedIn: '0.12.0',
|
|
11
|
-
removeAfter: '1.0.0',
|
|
12
|
-
replacedBy: 'Per-model embedding_vec_<key> tables',
|
|
13
|
-
telemetryKey: 'embedding_v1_migration',
|
|
14
|
-
},
|
|
15
|
-
legacy_knowledge_array: {
|
|
16
|
-
addedIn: '0.5.0',
|
|
17
|
-
deprecatedIn: '0.12.0',
|
|
18
|
-
removeAfter: '1.0.0',
|
|
19
|
-
replacedBy: '{ knowledge: [...], classifications: [...] } format',
|
|
20
|
-
telemetryKey: 'legacy_knowledge_format',
|
|
21
|
-
},
|
|
22
|
-
legacy_quorum_consensus: {
|
|
23
|
-
addedIn: '0.7.0',
|
|
24
|
-
deprecatedIn: '0.12.0',
|
|
25
|
-
removeAfter: '1.0.0',
|
|
26
|
-
replacedBy: 'Evaluator-scored quorum (workerResponse flow)',
|
|
27
|
-
telemetryKey: 'legacy_quorum_consensus',
|
|
28
|
-
},
|
|
29
8
|
chat_json_mode: {
|
|
30
9
|
addedIn: '0.5.0',
|
|
31
10
|
deprecatedIn: null,
|
|
@@ -33,13 +12,6 @@ const COMPAT_REGISTRY = {
|
|
|
33
12
|
replacedBy: '?stream=1 SSE mode',
|
|
34
13
|
telemetryKey: 'chat_json_mode',
|
|
35
14
|
},
|
|
36
|
-
old_env_gemini_key: {
|
|
37
|
-
addedIn: '0.6.0',
|
|
38
|
-
deprecatedIn: '0.12.0',
|
|
39
|
-
removeAfter: '1.0.0',
|
|
40
|
-
replacedBy: 'GOOGLE_API_KEY environment variable',
|
|
41
|
-
telemetryKey: 'old_env_gemini_key',
|
|
42
|
-
},
|
|
43
15
|
devbox_gateway: {
|
|
44
16
|
addedIn: '0.4.0',
|
|
45
17
|
deprecatedIn: null,
|
|
@@ -283,6 +283,7 @@ Relevant memories and knowledge are provided above. If they answer the question,
|
|
|
283
283
|
### Step 2: SEARCH — only if the context above is insufficient
|
|
284
284
|
Call search_memories to find additional evidence. Batch multiple searches in ONE turn.
|
|
285
285
|
Use different query angles: English keywords, Chinese terms, source filters.
|
|
286
|
+
For private, remembered, or work-context questions, use Wall-E memory before public web_fetch. This includes prior conversations, decisions, preferences, people, teams, projects, tools, Slack/email/calendar context, and "last time" / "do you know" / "what did we discuss" prompts. Use public web only for public/current facts or after memory misses.
|
|
286
287
|
|
|
287
288
|
### Step 3: THINK — reason through the evidence
|
|
288
289
|
Use the **think** tool before responding to:
|
|
@@ -308,7 +309,8 @@ function buildToolRefBlock(ownerName, intent) {
|
|
|
308
309
|
const lines = ['### Tools'];
|
|
309
310
|
if (intent === 'knowledge') {
|
|
310
311
|
lines.push(`- **think**: Internal scratchpad (${ownerName} won't see). Use BEFORE every substantive response.`);
|
|
311
|
-
lines.push('- **search_memories**: Hybrid search (BM25 + vector). source:"slack" for Slack only. Batch multiple searches in one turn.');
|
|
312
|
+
lines.push('- **search_memories**: Hybrid search (BM25 + vector). Use for private/user/work memory: prior conversations, decisions, preferences, projects, people, and Slack/email/calendar context. source:"slack" for Slack only. Batch multiple searches in one turn.');
|
|
313
|
+
lines.push('- **lookup_person**: Person profile lookup. Use alongside search_memories for colleague/role/team questions.');
|
|
312
314
|
lines.push('- **remember_fact**: Store facts the user teaches you.');
|
|
313
315
|
}
|
|
314
316
|
lines.push('- **run_skill / mcp_call / list_mcp_tools**: Actions and external services.');
|
|
@@ -101,7 +101,7 @@ function getEmbeddingModel() {
|
|
|
101
101
|
function _hasApiKey(provider) {
|
|
102
102
|
// Check process.env first, then fall back to model_providers table in brain DB
|
|
103
103
|
switch (provider) {
|
|
104
|
-
case 'google': return !!(process.env.GOOGLE_API_KEY ||
|
|
104
|
+
case 'google': return !!(process.env.GOOGLE_API_KEY || _hasProviderKey('google'));
|
|
105
105
|
case 'voyage': return !!process.env.VOYAGE_API_KEY;
|
|
106
106
|
case 'openai': return !!(process.env.OPENAI_API_KEY || _hasProviderKey('openai'));
|
|
107
107
|
case 'ollama': return _isOllamaAvailable();
|
|
@@ -278,11 +278,8 @@ async function _googleEmbed(texts, config) {
|
|
|
278
278
|
*/
|
|
279
279
|
function _resolveGoogleCredential() {
|
|
280
280
|
// 1. Static API key from env
|
|
281
|
-
const envKey = process.env.GOOGLE_API_KEY
|
|
281
|
+
const envKey = process.env.GOOGLE_API_KEY;
|
|
282
282
|
if (envKey && !envKey.startsWith('ya29.')) {
|
|
283
|
-
if (!process.env.GOOGLE_API_KEY && process.env.GEMINI_API_KEY) {
|
|
284
|
-
try { require('./compat').recordCompatUsage('old_env_gemini_key'); } catch {}
|
|
285
|
-
}
|
|
286
283
|
return { type: 'api_key', token: envKey, expired: false };
|
|
287
284
|
}
|
|
288
285
|
|
|
@@ -475,8 +472,6 @@ function _migrateOldTable(db, config) {
|
|
|
475
472
|
const oldExists = db.prepare("SELECT name FROM sqlite_master WHERE type='table' AND name='embedding_vec'").get();
|
|
476
473
|
if (!oldExists) return;
|
|
477
474
|
|
|
478
|
-
try { require('./compat').recordCompatUsage('embedding_v1_migration'); } catch {}
|
|
479
|
-
|
|
480
475
|
const oldCount = db.prepare('SELECT count(*) as c FROM embedding_map WHERE model = ?').get(config.name)?.c || 0;
|
|
481
476
|
if (oldCount === 0) {
|
|
482
477
|
// Old table has data but model column might be from a different model — just drop
|
|
@@ -100,10 +100,13 @@ async function runAgentBenchmark(benchmark, options = {}) {
|
|
|
100
100
|
}
|
|
101
101
|
|
|
102
102
|
// Run the agent loop with hard timeout safety net
|
|
103
|
-
const
|
|
103
|
+
const maxTurns = expectations.maxTurns || 20;
|
|
104
|
+
const turnBudgetTimeout = maxTurns * 30000;
|
|
105
|
+
const effectiveTimeout = Math.min(timeoutMs || turnBudgetTimeout, turnBudgetTimeout);
|
|
104
106
|
const agentPromise = runAgentLoop(benchmark.prompt, {
|
|
105
107
|
cwd: sandboxDir,
|
|
106
108
|
timeoutMs: effectiveTimeout,
|
|
109
|
+
maxTurns,
|
|
107
110
|
provider,
|
|
108
111
|
model,
|
|
109
112
|
mode: 'build',
|
|
@@ -111,6 +114,7 @@ async function runAgentBenchmark(benchmark, options = {}) {
|
|
|
111
114
|
headless: true,
|
|
112
115
|
headlessPolicy: 'allow',
|
|
113
116
|
permissionTimeoutMs: 0,
|
|
117
|
+
persistTranscript: false,
|
|
114
118
|
});
|
|
115
119
|
let timeoutHandle;
|
|
116
120
|
const timeoutPromise = new Promise((_, reject) => {
|
|
@@ -131,7 +135,9 @@ async function runAgentBenchmark(benchmark, options = {}) {
|
|
|
131
135
|
const actualToolCalls = extractToolCalls(result);
|
|
132
136
|
const toolCallDetails = extractToolCallDetails(result);
|
|
133
137
|
const actualFileChanges = await getModifiedFiles(sandboxDir);
|
|
134
|
-
const
|
|
138
|
+
const externalRunnerId = result.runnerId || result.fallback?.runnerId || null;
|
|
139
|
+
const externalRunnerWork = Boolean(externalRunnerId && actualFileChanges.length > 0);
|
|
140
|
+
const actualTurns = (result.log || []).length || actualToolCalls.length || (externalRunnerId ? 1 : 0);
|
|
135
141
|
|
|
136
142
|
// Run test command if specified (validate against allowlist)
|
|
137
143
|
let testsPassed = null;
|
|
@@ -156,6 +162,10 @@ async function runAgentBenchmark(benchmark, options = {}) {
|
|
|
156
162
|
const inputTokens = usage.inputTokens ?? usage.input ?? 0;
|
|
157
163
|
const expectedFileChanges = expectations.expectedFileChanges || [];
|
|
158
164
|
const missingExpectedWork = expectedFileChanges.length > 0 && actualFileChanges.length === 0;
|
|
165
|
+
const attemptedFileChange = actualToolCalls.some((call) => {
|
|
166
|
+
const name = typeof call === 'string' ? call : call?.name;
|
|
167
|
+
return /edit|write|patch|create|delete|modify/i.test(String(name || ''));
|
|
168
|
+
});
|
|
159
169
|
const testRegression = (expectations.testCommand && testsPassed === false);
|
|
160
170
|
const rawError = result.stderr || result.error || null;
|
|
161
171
|
const validatedByTests = Boolean(
|
|
@@ -164,9 +174,11 @@ async function runAgentBenchmark(benchmark, options = {}) {
|
|
|
164
174
|
actualFileChanges.length > 0
|
|
165
175
|
);
|
|
166
176
|
const fatalError = rawError && !validatedByTests ? rawError : null;
|
|
167
|
-
const noEffort = (actualToolCalls.length === 0
|
|
177
|
+
const noEffort = (actualToolCalls.length === 0 && !externalRunnerWork) ||
|
|
178
|
+
(inputTokens === 0 && !externalRunnerWork) ||
|
|
179
|
+
missingExpectedWork;
|
|
168
180
|
const hadError = !!fatalError;
|
|
169
|
-
const validatedSuccess = Boolean(result.success || validatedByTests) && !hadError && !noEffort && !testRegression;
|
|
181
|
+
const validatedSuccess = Boolean(result.success || validatedByTests || externalRunnerWork) && !hadError && !noEffort && !testRegression;
|
|
170
182
|
|
|
171
183
|
// Score the result
|
|
172
184
|
let score = scoreAgentResult(benchmark, {
|
|
@@ -199,7 +211,7 @@ async function runAgentBenchmark(benchmark, options = {}) {
|
|
|
199
211
|
: testRegression
|
|
200
212
|
? 'tests_failed'
|
|
201
213
|
: missingExpectedWork
|
|
202
|
-
? 'no_file_changes'
|
|
214
|
+
? attemptedFileChange ? 'missing_expected_changes' : 'no_file_changes'
|
|
203
215
|
: 'no_effort' },
|
|
204
216
|
};
|
|
205
217
|
}
|
|
@@ -296,6 +308,10 @@ function scoreAgentResult(benchmark, actual) {
|
|
|
296
308
|
});
|
|
297
309
|
}
|
|
298
310
|
|
|
311
|
+
function isTrustedAgentResult(result = {}) {
|
|
312
|
+
return result.success === true && !result.error && result.testsPassed === true;
|
|
313
|
+
}
|
|
314
|
+
|
|
299
315
|
/**
|
|
300
316
|
* Run a multi-turn benchmark — sends each turn's prompt sequentially,
|
|
301
317
|
* accumulating conversation context. Scores after the final turn.
|
|
@@ -333,6 +349,7 @@ async function runMultiTurnBenchmark(benchmark, options = {}) {
|
|
|
333
349
|
headless: true,
|
|
334
350
|
headlessPolicy: 'allow',
|
|
335
351
|
permissionTimeoutMs: 0,
|
|
352
|
+
persistTranscript: false,
|
|
336
353
|
messages, // pass accumulated conversation
|
|
337
354
|
});
|
|
338
355
|
|
|
@@ -353,7 +370,9 @@ async function runMultiTurnBenchmark(benchmark, options = {}) {
|
|
|
353
370
|
const costDollars = estimateCost(totalUsage, provider?.type || provider || 'anthropic', model);
|
|
354
371
|
|
|
355
372
|
const actualFileChanges = await getModifiedFiles(sandboxDir);
|
|
356
|
-
const
|
|
373
|
+
const externalRunnerId = lastResult?.runnerId || lastResult?.fallback?.runnerId || null;
|
|
374
|
+
const externalRunnerWork = Boolean(externalRunnerId && actualFileChanges.length > 0);
|
|
375
|
+
const actualTurns = totalTurns || (externalRunnerId ? 1 : 0);
|
|
357
376
|
|
|
358
377
|
let testsPassed = null;
|
|
359
378
|
let testsAfter = null;
|
|
@@ -390,7 +409,8 @@ async function runMultiTurnBenchmark(benchmark, options = {}) {
|
|
|
390
409
|
// Same hard-zero floor as single-turn — see runAgentBenchmark for rationale.
|
|
391
410
|
const inputTokens = totalUsage.inputTokens ?? 0;
|
|
392
411
|
const hadError = !!(lastResult?.stderr || lastResult?.error);
|
|
393
|
-
const noEffort = (allToolCalls.length === 0) ||
|
|
412
|
+
const noEffort = (allToolCalls.length === 0 && !externalRunnerWork) ||
|
|
413
|
+
(inputTokens === 0 && !externalRunnerWork);
|
|
394
414
|
const testRegression = (expectations.testCommand && testsPassed === false);
|
|
395
415
|
if (hadError || noEffort || testRegression) {
|
|
396
416
|
score = {
|
|
@@ -507,7 +527,7 @@ async function runAgentBenchmarkSuite(options = {}) {
|
|
|
507
527
|
outputTokens: result.outputTokens ?? null,
|
|
508
528
|
scorerVersion: DEFAULT_SCORER_VERSION,
|
|
509
529
|
scoringMethod,
|
|
510
|
-
trusted:
|
|
530
|
+
trusted: isTrustedAgentResult(result),
|
|
511
531
|
runConfig: { timeoutMs, scoringMethod },
|
|
512
532
|
}, {
|
|
513
533
|
suite: 'coding-agent',
|
|
@@ -517,7 +537,7 @@ async function runAgentBenchmarkSuite(options = {}) {
|
|
|
517
537
|
model: resolveModelName(model),
|
|
518
538
|
scoringMethod,
|
|
519
539
|
scorerVersion: DEFAULT_SCORER_VERSION,
|
|
520
|
-
trusted:
|
|
540
|
+
trusted: isTrustedAgentResult(result),
|
|
521
541
|
runConfig: { timeoutMs, scoringMethod },
|
|
522
542
|
}));
|
|
523
543
|
} catch { /* non-fatal */ }
|
|
@@ -666,6 +686,7 @@ module.exports = {
|
|
|
666
686
|
runMultiTurnBenchmark,
|
|
667
687
|
runAgentBenchmarkSuite,
|
|
668
688
|
scoreAgentResult,
|
|
689
|
+
isTrustedAgentResult,
|
|
669
690
|
extractToolCalls,
|
|
670
691
|
extractToolCallDetails,
|
|
671
692
|
countTests,
|