@librechat/agents 3.1.77 → 3.1.78
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cjs/common/enum.cjs +54 -0
- package/dist/cjs/common/enum.cjs.map +1 -1
- package/dist/cjs/graphs/Graph.cjs +155 -4
- package/dist/cjs/graphs/Graph.cjs.map +1 -1
- package/dist/cjs/hooks/createWorkspacePolicyHook.cjs +291 -0
- package/dist/cjs/hooks/createWorkspacePolicyHook.cjs.map +1 -0
- package/dist/cjs/main.cjs +90 -0
- package/dist/cjs/main.cjs.map +1 -1
- package/dist/cjs/messages/anthropicToolCache.cjs +102 -0
- package/dist/cjs/messages/anthropicToolCache.cjs.map +1 -0
- package/dist/cjs/messages/prune.cjs +27 -0
- package/dist/cjs/messages/prune.cjs.map +1 -1
- package/dist/cjs/messages/recency.cjs +99 -0
- package/dist/cjs/messages/recency.cjs.map +1 -0
- package/dist/cjs/run.cjs +30 -0
- package/dist/cjs/run.cjs.map +1 -1
- package/dist/cjs/summarization/node.cjs +100 -6
- package/dist/cjs/summarization/node.cjs.map +1 -1
- package/dist/cjs/tools/ToolNode.cjs +635 -23
- package/dist/cjs/tools/ToolNode.cjs.map +1 -1
- package/dist/cjs/tools/local/CompileCheckTool.cjs +227 -0
- package/dist/cjs/tools/local/CompileCheckTool.cjs.map +1 -0
- package/dist/cjs/tools/local/FileCheckpointer.cjs +90 -0
- package/dist/cjs/tools/local/FileCheckpointer.cjs.map +1 -0
- package/dist/cjs/tools/local/LocalCodingTools.cjs +1098 -0
- package/dist/cjs/tools/local/LocalCodingTools.cjs.map +1 -0
- package/dist/cjs/tools/local/LocalExecutionEngine.cjs +1042 -0
- package/dist/cjs/tools/local/LocalExecutionEngine.cjs.map +1 -0
- package/dist/cjs/tools/local/LocalExecutionTools.cjs +122 -0
- package/dist/cjs/tools/local/LocalExecutionTools.cjs.map +1 -0
- package/dist/cjs/tools/local/LocalProgrammaticToolCalling.cjs +453 -0
- package/dist/cjs/tools/local/LocalProgrammaticToolCalling.cjs.map +1 -0
- package/dist/cjs/tools/local/attachments.cjs +183 -0
- package/dist/cjs/tools/local/attachments.cjs.map +1 -0
- package/dist/cjs/tools/local/bashAst.cjs +129 -0
- package/dist/cjs/tools/local/bashAst.cjs.map +1 -0
- package/dist/cjs/tools/local/editStrategies.cjs +188 -0
- package/dist/cjs/tools/local/editStrategies.cjs.map +1 -0
- package/dist/cjs/tools/local/resolveLocalExecutionTools.cjs +141 -0
- package/dist/cjs/tools/local/resolveLocalExecutionTools.cjs.map +1 -0
- package/dist/cjs/tools/local/syntaxCheck.cjs +182 -0
- package/dist/cjs/tools/local/syntaxCheck.cjs.map +1 -0
- package/dist/cjs/tools/local/textEncoding.cjs +30 -0
- package/dist/cjs/tools/local/textEncoding.cjs.map +1 -0
- package/dist/cjs/tools/local/workspaceFS.cjs +51 -0
- package/dist/cjs/tools/local/workspaceFS.cjs.map +1 -0
- package/dist/cjs/tools/subagent/SubagentExecutor.cjs +31 -0
- package/dist/cjs/tools/subagent/SubagentExecutor.cjs.map +1 -1
- package/dist/esm/common/enum.mjs +53 -1
- package/dist/esm/common/enum.mjs.map +1 -1
- package/dist/esm/graphs/Graph.mjs +156 -5
- package/dist/esm/graphs/Graph.mjs.map +1 -1
- package/dist/esm/hooks/createWorkspacePolicyHook.mjs +289 -0
- package/dist/esm/hooks/createWorkspacePolicyHook.mjs.map +1 -0
- package/dist/esm/main.mjs +17 -2
- package/dist/esm/main.mjs.map +1 -1
- package/dist/esm/messages/anthropicToolCache.mjs +99 -0
- package/dist/esm/messages/anthropicToolCache.mjs.map +1 -0
- package/dist/esm/messages/prune.mjs +26 -1
- package/dist/esm/messages/prune.mjs.map +1 -1
- package/dist/esm/messages/recency.mjs +97 -0
- package/dist/esm/messages/recency.mjs.map +1 -0
- package/dist/esm/run.mjs +30 -0
- package/dist/esm/run.mjs.map +1 -1
- package/dist/esm/summarization/node.mjs +100 -6
- package/dist/esm/summarization/node.mjs.map +1 -1
- package/dist/esm/tools/ToolNode.mjs +635 -23
- package/dist/esm/tools/ToolNode.mjs.map +1 -1
- package/dist/esm/tools/local/CompileCheckTool.mjs +223 -0
- package/dist/esm/tools/local/CompileCheckTool.mjs.map +1 -0
- package/dist/esm/tools/local/FileCheckpointer.mjs +87 -0
- package/dist/esm/tools/local/FileCheckpointer.mjs.map +1 -0
- package/dist/esm/tools/local/LocalCodingTools.mjs +1075 -0
- package/dist/esm/tools/local/LocalCodingTools.mjs.map +1 -0
- package/dist/esm/tools/local/LocalExecutionEngine.mjs +1022 -0
- package/dist/esm/tools/local/LocalExecutionEngine.mjs.map +1 -0
- package/dist/esm/tools/local/LocalExecutionTools.mjs +117 -0
- package/dist/esm/tools/local/LocalExecutionTools.mjs.map +1 -0
- package/dist/esm/tools/local/LocalProgrammaticToolCalling.mjs +448 -0
- package/dist/esm/tools/local/LocalProgrammaticToolCalling.mjs.map +1 -0
- package/dist/esm/tools/local/attachments.mjs +180 -0
- package/dist/esm/tools/local/attachments.mjs.map +1 -0
- package/dist/esm/tools/local/bashAst.mjs +126 -0
- package/dist/esm/tools/local/bashAst.mjs.map +1 -0
- package/dist/esm/tools/local/editStrategies.mjs +185 -0
- package/dist/esm/tools/local/editStrategies.mjs.map +1 -0
- package/dist/esm/tools/local/resolveLocalExecutionTools.mjs +137 -0
- package/dist/esm/tools/local/resolveLocalExecutionTools.mjs.map +1 -0
- package/dist/esm/tools/local/syntaxCheck.mjs +179 -0
- package/dist/esm/tools/local/syntaxCheck.mjs.map +1 -0
- package/dist/esm/tools/local/textEncoding.mjs +27 -0
- package/dist/esm/tools/local/textEncoding.mjs.map +1 -0
- package/dist/esm/tools/local/workspaceFS.mjs +49 -0
- package/dist/esm/tools/local/workspaceFS.mjs.map +1 -0
- package/dist/esm/tools/subagent/SubagentExecutor.mjs +31 -0
- package/dist/esm/tools/subagent/SubagentExecutor.mjs.map +1 -1
- package/dist/types/common/enum.d.ts +39 -1
- package/dist/types/graphs/Graph.d.ts +34 -0
- package/dist/types/hooks/createWorkspacePolicyHook.d.ts +95 -0
- package/dist/types/hooks/index.d.ts +2 -0
- package/dist/types/index.d.ts +1 -0
- package/dist/types/messages/anthropicToolCache.d.ts +51 -0
- package/dist/types/messages/index.d.ts +2 -0
- package/dist/types/messages/prune.d.ts +11 -0
- package/dist/types/messages/recency.d.ts +64 -0
- package/dist/types/run.d.ts +21 -0
- package/dist/types/tools/ToolNode.d.ts +145 -2
- package/dist/types/tools/local/CompileCheckTool.d.ts +31 -0
- package/dist/types/tools/local/FileCheckpointer.d.ts +39 -0
- package/dist/types/tools/local/LocalCodingTools.d.ts +57 -0
- package/dist/types/tools/local/LocalExecutionEngine.d.ts +149 -0
- package/dist/types/tools/local/LocalExecutionTools.d.ts +9 -0
- package/dist/types/tools/local/LocalProgrammaticToolCalling.d.ts +21 -0
- package/dist/types/tools/local/attachments.d.ts +84 -0
- package/dist/types/tools/local/bashAst.d.ts +11 -0
- package/dist/types/tools/local/editStrategies.d.ts +28 -0
- package/dist/types/tools/local/index.d.ts +12 -0
- package/dist/types/tools/local/resolveLocalExecutionTools.d.ts +38 -0
- package/dist/types/tools/local/syntaxCheck.d.ts +42 -0
- package/dist/types/tools/local/textEncoding.d.ts +21 -0
- package/dist/types/tools/local/workspaceFS.d.ts +49 -0
- package/dist/types/tools/subagent/SubagentExecutor.d.ts +29 -0
- package/dist/types/types/hitl.d.ts +56 -27
- package/dist/types/types/run.d.ts +8 -1
- package/dist/types/types/summarize.d.ts +30 -0
- package/dist/types/types/tools.d.ts +341 -6
- package/package.json +21 -2
- package/src/common/enum.ts +54 -0
- package/src/graphs/Graph.ts +173 -6
- package/src/hooks/__tests__/compactHooks.test.ts +38 -2
- package/src/hooks/__tests__/createWorkspacePolicyHook.test.ts +393 -0
- package/src/hooks/createWorkspacePolicyHook.ts +355 -0
- package/src/hooks/index.ts +6 -0
- package/src/index.ts +1 -0
- package/src/messages/__tests__/anthropicToolCache.test.ts +125 -0
- package/src/messages/__tests__/recency.test.ts +267 -0
- package/src/messages/anthropicToolCache.ts +116 -0
- package/src/messages/index.ts +2 -0
- package/src/messages/prune.ts +27 -1
- package/src/messages/recency.ts +155 -0
- package/src/run.ts +31 -0
- package/src/scripts/compare_pi_vs_ours.ts +840 -0
- package/src/scripts/local_engine.ts +166 -0
- package/src/scripts/local_engine_checkpointer.ts +205 -0
- package/src/scripts/local_engine_compile.ts +263 -0
- package/src/scripts/local_engine_hooks.ts +226 -0
- package/src/scripts/local_engine_image.ts +201 -0
- package/src/scripts/local_engine_ptc.ts +151 -0
- package/src/scripts/local_engine_workspace.ts +258 -0
- package/src/scripts/subagent-configurable-inheritance.ts +252 -0
- package/src/scripts/summarization-recency.ts +462 -0
- package/src/specs/prune.test.ts +39 -0
- package/src/summarization/__tests__/node.test.ts +499 -3
- package/src/summarization/node.ts +124 -7
- package/src/tools/ToolNode.ts +769 -20
- package/src/tools/__tests__/LocalExecutionTools.test.ts +2647 -0
- package/src/tools/__tests__/ProgrammaticToolCalling.test.ts +175 -0
- package/src/tools/__tests__/SubagentExecutor.test.ts +148 -0
- package/src/tools/__tests__/ToolNode.outputReferences.test.ts +114 -0
- package/src/tools/__tests__/ToolNode.session.test.ts +84 -0
- package/src/tools/__tests__/directToolHITLResumeScope.test.ts +467 -0
- package/src/tools/__tests__/directToolHooks.test.ts +411 -0
- package/src/tools/__tests__/localToolNames.test.ts +73 -0
- package/src/tools/__tests__/workspaceSeam.test.ts +134 -0
- package/src/tools/local/CompileCheckTool.ts +278 -0
- package/src/tools/local/FileCheckpointer.ts +93 -0
- package/src/tools/local/LocalCodingTools.ts +1342 -0
- package/src/tools/local/LocalExecutionEngine.ts +1329 -0
- package/src/tools/local/LocalExecutionTools.ts +167 -0
- package/src/tools/local/LocalProgrammaticToolCalling.ts +594 -0
- package/src/tools/local/__tests__/FileCheckpointer.test.ts +120 -0
- package/src/tools/local/__tests__/editStrategies.test.ts +134 -0
- package/src/tools/local/attachments.ts +251 -0
- package/src/tools/local/bashAst.ts +151 -0
- package/src/tools/local/editStrategies.ts +188 -0
- package/src/tools/local/index.ts +12 -0
- package/src/tools/local/resolveLocalExecutionTools.ts +208 -0
- package/src/tools/local/syntaxCheck.ts +243 -0
- package/src/tools/local/textEncoding.ts +37 -0
- package/src/tools/local/workspaceFS.ts +89 -0
- package/src/tools/subagent/SubagentExecutor.ts +60 -0
- package/src/types/hitl.ts +56 -27
- package/src/types/run.ts +12 -1
- package/src/types/summarize.ts +31 -0
- package/src/types/tools.ts +359 -7
|
@@ -0,0 +1,840 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* src/scripts/compare_pi_vs_ours.ts
|
|
3
|
+
*
|
|
4
|
+
* Side-by-side runs: pi-mono's `pi` CLI vs our local engine, same
|
|
5
|
+
* task, same model, two parallel temp workspaces. We track:
|
|
6
|
+
*
|
|
7
|
+
* - tool calls (name + args length, ordered)
|
|
8
|
+
* - wall-clock time
|
|
9
|
+
* - total Anthropic input/output tokens (when reported)
|
|
10
|
+
* - whether the final on-disk state matches the expected outcome
|
|
11
|
+
*
|
|
12
|
+
* The tasks intentionally probe areas where we expect the local
|
|
13
|
+
* engine to behave differently:
|
|
14
|
+
*
|
|
15
|
+
* T1 simple-edit — both should one-shot
|
|
16
|
+
* T2 fuzzy-edit — model emits an `oldText` with off-by-
|
|
17
|
+
* whitespace; our `editStrategies` chain
|
|
18
|
+
* should recover without re-reading;
|
|
19
|
+
* pi should also handle it (its edit tool
|
|
20
|
+
* has a similar fallback chain)
|
|
21
|
+
* T3 syntax-error-fix — pre-seed broken JS; ours surfaces the
|
|
22
|
+
* parse error in the write_file tool result
|
|
23
|
+
* via post-edit syntax check; pi has to read
|
|
24
|
+
* the file (or run bash node --check) to
|
|
25
|
+
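* notice
* T4 type-error-fix-loop — pre-seeded TS type error in a tiny tsconfig
* project; ours can call `compile_check`, pi can
* run `npx tsc --noEmit` via bash
* T5 multi-file-rename — rename a function across three files
* T6 image-read-and-describe — read a PNG and describe it; pi is skipped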
|
|
26
|
+
*
|
|
27
|
+
* Run: PI_BIN=path/to/cli.js npm run compare:pi
|
|
28
|
+
* Defaults to ~/Projects/pi-mono/packages/coding-agent/dist/cli.js.
|
|
29
|
+
*/
|
|
30
|
+
import { config } from 'dotenv';
|
|
31
|
+
config();
|
|
32
|
+
import { spawn } from 'child_process';
|
|
33
|
+
import { homedir, tmpdir } from 'os';
|
|
34
|
+
import { join, resolve } from 'path';
|
|
35
|
+
import { mkdtemp, readFile, rm, writeFile } from 'fs/promises';
|
|
36
|
+
import { performance } from 'perf_hooks';
|
|
37
|
+
import { HumanMessage, ToolMessage } from '@langchain/core/messages';
|
|
38
|
+
import type { BaseMessage } from '@langchain/core/messages';
|
|
39
|
+
import type * as t from '@/types';
|
|
40
|
+
import { ChatModelStreamHandler, createContentAggregator } from '@/stream';
|
|
41
|
+
import { ToolEndHandler, ModelEndHandler } from '@/events';
|
|
42
|
+
import { getLLMConfig } from '@/utils/llmConfig';
|
|
43
|
+
import { GraphEvents, Providers } from '@/common';
|
|
44
|
+
import { Run } from '@/run';
|
|
45
|
+
|
|
46
|
+
/** Provider shared by both runners: pi's `--provider` flag and our LLM config. */
const PROVIDER = Providers.ANTHROPIC;

/** Model id shared by both runners: pi's `--model` flag and our `llmConfig.model`. */
const MODEL = 'claude-sonnet-4-5';

/**
 * Path to pi's CLI entry script. The `PI_BIN` environment variable wins when
 * it is set; otherwise the pi-mono checkout under the user's home directory
 * is assumed.
 */
const PI_BIN =
  process.env.PI_BIN ??
  resolve(
    homedir(),
    'Projects',
    'pi-mono',
    'packages',
    'coding-agent',
    'dist',
    'cli.js'
  );
|
|
54
|
+
|
|
55
|
+
/** One benchmark scenario that is handed to both agents. */
interface Task {
  /** Short label for the task (e.g. `T1 simple-edit`). */
  name: string;
  /** Human-readable explanation of what the task probes. */
  description: string;
  /** Text files written into the workspace before the run (key = path, value = contents). */
  seed: Record<string, string>;
  /** Binary files written into the workspace before the run (key = path, value = bytes). */
  seedBinary?: Record<string, Buffer>;
  /** The prompt both agents receive. */
  prompt: string;
  /**
   * Inspects the workspace after the run. `ok` says whether it ended in the
   * expected state; `detail` carries diagnostic text.
   */
  verify: (cwd: string) => Promise<{ ok: boolean; detail: string }>;
  /** Pre-run hook, e.g. to symlink node_modules so `tsc` is available. */
  setup?: (cwd: string) => Promise<void>;
  /**
   * Extra `local.*` config knobs applied only to our local engine. Lets a
   * single task toggle e.g. `attachReadAttachments` without making the
   * default surface noisier than necessary.
   */
  oursLocalConfigOverrides?: Partial<t.LocalExecutionConfig>;
  /**
   * Names the runner to skip when a task isn't realistically supportable on
   * that side; the skipped side is reported as N/A in the table.
   */
  skip?: 'pi' | 'ours';
}
|
|
80
|
+
|
|
81
|
+
/** A single tool invocation observed during a run. */
interface ToolCallObservation {
  /** Tool name as reported by the agent. */
  name: string;
  /**
   * Size of the call's arguments, measured as the `.length` of their
   * JSON-serialized form (a string length, not an encoded byte count).
   */
  argsBytes: number;
  /** Set to true when a subsequent tool result reports an error. */
  isError: boolean;
}
|
|
86
|
+
|
|
87
|
+
/** Metrics collected from one agent run; both runners return this shape. */
interface RunOutcome {
  /** Tool calls in the order they were observed. */
  toolCalls: ToolCallObservation[];
  /** Wall-clock duration in milliseconds, measured with `performance.now()`. */
  wallMs: number;
  /** Input tokens summed over assistant turns. */
  inputTokens: number;
  /** Output tokens summed over assistant turns. */
  outputTokens: number;
  /** Cache-read tokens summed over assistant turns. */
  cacheReadTokens: number;
  /** Cache-write (cache-creation) tokens summed over assistant turns. */
  cacheWriteTokens: number;
  /** Total cost of the run — presumably USD; confirm against the pricing code. */
  cost: number;
  /** Text of the last assistant message seen, or '' when there was none. */
  finalAssistant: string;
  /** True when the run itself failed (as opposed to the task's verify step). */
  errored: boolean;
  /** Truncated failure description; only set when `errored` is true. */
  errorMessage?: string;
}
|
|
99
|
+
|
|
100
|
+
/**
 * Reads a workspace file as UTF-8 text, yielding '' when the read fails
 * (e.g. the agent deleted or never created the file) so `verify` callbacks
 * can treat a missing file as a plain mismatch.
 */
async function readSeedFileOrEmpty(
  cwd: string,
  relPath: string
): Promise<string> {
  try {
    return await readFile(join(cwd, relPath), 'utf8');
  } catch {
    return '';
  }
}

/** Benchmark tasks, listed in the order they appear in this array. */
const TASKS: Task[] = [
  {
    name: 'T1 simple-edit',
    description: 'Single literal substitution in an existing file.',
    seed: {
      'greet.py': ['def greet(name):', ' return f"Hello, {name}!"', ''].join(
        '\n'
      ),
    },
    prompt: [
      'Edit greet.py: change the greeting from "Hello" to "Hi".',
      'Keep the rest of the file identical. Reply with "done" when finished.',
    ].join(' '),
    verify: async (cwd) => {
      const text = await readSeedFileOrEmpty(cwd, 'greet.py');
      const hasNewGreeting = text.includes('"Hi, {name}!"');
      const hasOldGreeting = text.includes('Hello,');
      const ok = hasNewGreeting && !hasOldGreeting;
      return { ok, detail: ok ? '' : `actual: ${JSON.stringify(text)}` };
    },
  },
  {
    name: 'T2 fuzzy-edit',
    description:
      'Original file has trailing whitespace + tabs; the model is asked to ' +
      'make a literal change without seeing the trailing whitespace.',
    seed: {
      // The trailing spaces on the first and third lines are intentional.
      'config.ts': [
        'export const config = { ',
        '\tport: 3000,',
        '\thost: "localhost", ',
        '};',
        '',
      ].join('\n'),
    },
    prompt: [
      'In config.ts, change the port from 3000 to 4242. The file may have',
      'unusual whitespace; do the smallest correct change. Reply with "done".',
    ].join(' '),
    verify: async (cwd) => {
      const text = await readSeedFileOrEmpty(cwd, 'config.ts');
      const ok = /port:\s*4242/.test(text) && !text.includes('3000');
      return { ok, detail: ok ? '' : `actual:\n${text}` };
    },
  },
  {
    name: 'T4 type-error-fix-loop',
    description:
      'Pre-seeded TS file with a type error in a tiny tsconfig project. ' +
      'Ours can call `compile_check`; pi can run `npx tsc --noEmit` via bash.',
    seed: {
      'tsconfig.json': JSON.stringify(
        {
          compilerOptions: {
            target: 'ES2020',
            module: 'commonjs',
            strict: true,
            noEmit: true,
            skipLibCheck: true,
          },
          include: ['*.ts'],
        },
        null,
        2
      ),
      'package.json': JSON.stringify(
        { name: 'lc-compare-t4', private: true },
        null,
        2
      ),
      'broken.ts': 'export const port: number = "not a number";\n',
    },
    prompt: [
      'broken.ts has a type error. Fix it so the project typechecks cleanly.',
      'After fixing, verify by running the project\'s typecheck (or `compile_check` if available).',
      'Reply with "done".',
    ].join(' '),
    verify: async (cwd) => {
      const text = await readSeedFileOrEmpty(cwd, 'broken.ts');
      const assignsNumber = /port:\s*number\s*=\s*\d/.test(text);
      const ok = assignsNumber && !text.includes('"not a number"');
      return { ok, detail: ok ? '' : `actual: ${text}` };
    },
    setup: symlinkRepoNodeModules,
  },
  {
    name: 'T3 syntax-error-fix',
    description:
      'Pre-seeded broken JS file. Ours surfaces the parse error in the ' +
      'write_file/edit_file tool result; pi has to discover it via bash/read.',
    seed: {
      'broken.js': [
        'function add(a, b) {',
        ' return a + ;',
        '}',
        'console.log(add(1, 2));',
        '',
      ].join('\n'),
    },
    prompt: [
      'broken.js is syntactically invalid. Fix it so `node --check broken.js` passes.',
      'The intended behaviour is that add(1, 2) prints 3. Reply with "done".',
    ].join(' '),
    verify: async (cwd) => {
      const text = await readSeedFileOrEmpty(cwd, 'broken.js');
      // The dangling `a + ;` expression must be gone...
      const brokenTokenGone = !/return\s+a\s*\+\s*;/.test(text);
      // ...while the file still logs the result.
      const stillLogs = text.includes('console.log');
      const ok = brokenTokenGone && stillLogs;
      return { ok, detail: ok ? '' : `actual:\n${text}` };
    },
  },
  {
    name: 'T5 multi-file-rename',
    description:
      'Rename a function across three files. ' +
      'Tests how the agent finds + applies the rename.',
    seed: {
      'src/lib.ts': [
        'export function calc_total(a: number, b: number): number {',
        ' return a + b;',
        '}',
        '',
      ].join('\n'),
      'src/index.ts': [
        'import { calc_total } from "./lib";',
        'console.log(calc_total(2, 3));',
        '',
      ].join('\n'),
      'src/index.test.ts': [
        'import { calc_total } from "./lib";',
        'if (calc_total(1, 1) !== 2) throw new Error("fail");',
        'console.log("ok");',
        '',
      ].join('\n'),
    },
    prompt: [
      'Rename the exported function `calc_total` to `calculateTotal` across src/lib.ts,',
      'src/index.ts, and src/index.test.ts. Update every reference. Reply "done" when finished.',
    ].join(' '),
    verify: async (cwd) => {
      const [lib, idx, tst] = await Promise.all([
        readSeedFileOrEmpty(cwd, 'src/lib.ts'),
        readSeedFileOrEmpty(cwd, 'src/index.ts'),
        readSeedFileOrEmpty(cwd, 'src/index.test.ts'),
      ]);
      const allRenamed =
        /function\s+calculateTotal/.test(lib) &&
        idx.includes('calculateTotal(') &&
        tst.includes('calculateTotal(');
      const oldNameRemains = [lib, idx, tst].some((source) =>
        source.includes('calc_total')
      );
      const ok = allRenamed && !oldNameRemains;
      const detail = ok ? '' : `lib:\n${lib}\nindex:\n${idx}\ntest:\n${tst}`;
      return { ok, detail };
    },
  },
  {
    name: 'T6 image-read-and-describe',
    description:
      'Reads a PNG and describes it. Ours embeds via attachReadAttachments + ' +
      'image_url block; pi has no equivalent and is skipped.',
    seed: {},
    setup: async (cwd) => {
      const fsp = await import('fs/promises');
      const destination = join(cwd, 'sample.png');
      // Use a real PNG (Anthropic refuses tiny 1x1 PNGs with "Could not
      // process image"). Only these fixed macOS system images are tried, in
      // order; when none can be copied the setup fails.
      const candidates = [
        '/System/Library/CoreServices/Certificate Assistant.app/Contents/Resources/droppedImage.png',
        '/System/Library/CoreServices/Certificate Assistant.app/Contents/Resources/shapeimage_1.png',
        '/System/Library/CoreServices/BluetoothUIServer.app/Contents/Resources/handoff.png',
      ];
      for (const source of candidates) {
        const copied = await fsp.copyFile(source, destination).then(
          () => true,
          () => false
        );
        if (copied) {
          return;
        }
      }
      throw new Error('No system PNG available for T6 image task');
    },
    prompt:
      'Read sample.png and briefly describe what the image shows. Reply with "done" at the end.',
    verify: async (cwd) => {
      // Soft check: only confirm the image is still on disk (the agent
      // shouldn't have deleted it). A refused request is expected to be
      // caught by the script-level error tracking instead.
      const fsp = await import('fs/promises');
      const stillPresent = await fsp.stat(join(cwd, 'sample.png')).then(
        () => true,
        () => false
      );
      return stillPresent
        ? { ok: true, detail: '' }
        : { ok: false, detail: 'sample.png missing' };
    },
    oursLocalConfigOverrides: { attachReadAttachments: 'images-only' },
    skip: 'pi',
  },
];
|
|
292
|
+
|
|
293
|
+
/* ------------------------------------------------------------------ */
|
|
294
|
+
/* pi runner */
|
|
295
|
+
/* ------------------------------------------------------------------ */
|
|
296
|
+
|
|
297
|
+
/**
 * Runs the pi CLI on `task` inside `cwd` and summarizes its JSON event stream.
 *
 * The CLI is started as `node PI_BIN --print --mode json --no-session
 * --provider PROVIDER --model MODEL <prompt>`. Its stdout is read as one JSON
 * document per line; lines that do not parse are ignored. Only `message_end`
 * events are inspected:
 *  - assistant messages contribute token usage, cost, `toolCall` blocks and
 *    the most recent text block (kept as `finalAssistant`);
 *  - `toolResult` messages flagged `isError` mark the most recently recorded
 *    tool call as failed.
 *
 * @param task Task whose `prompt` is passed to the CLI.
 * @param cwd  Working directory the CLI is spawned in.
 * @returns The collected metrics. A spawn failure or a non-zero exit code is
 *          reported through `errored` / `errorMessage` with zeroed counters.
 */
async function runPi(task: Task, cwd: string): Promise<RunOutcome> {
  type PiUsage = {
    input?: number;
    output?: number;
    cacheRead?: number;
    cacheWrite?: number;
    cost?: { total?: number };
  };
  type PiMessage = {
    role?: string;
    isError?: boolean;
    content?: Array<{
      type?: string;
      name?: string;
      arguments?: unknown;
      text?: string;
    }>;
    usage?: PiUsage;
  };

  const start = performance.now();
  const args = [
    PI_BIN,
    '--print',
    '--mode',
    'json',
    '--no-session',
    '--provider',
    PROVIDER,
    '--model',
    MODEL,
    task.prompt,
  ];

  /** Builds the zeroed outcome used for every failure path. */
  const failedOutcome = (errorMessage: string): RunOutcome => ({
    toolCalls: [],
    wallMs: performance.now() - start,
    inputTokens: 0,
    outputTokens: 0,
    cacheReadTokens: 0,
    cacheWriteTokens: 0,
    cost: 0,
    finalAssistant: '',
    errored: true,
    errorMessage,
  });

  return new Promise<RunOutcome>((resolveOutcome) => {
    const child = spawn('node', args, {
      cwd,
      env: { ...process.env, FORCE_COLOR: '0', NO_COLOR: '1' },
      stdio: ['ignore', 'pipe', 'pipe'],
    });

    // Decode at the stream level: converting each Buffer chunk on its own
    // corrupts multi-byte UTF-8 characters that straddle a chunk boundary.
    child.stdout.setEncoding('utf8');
    child.stderr.setEncoding('utf8');

    let stdout = '';
    let stderr = '';
    child.stdout.on('data', (chunk: string) => {
      stdout += chunk;
    });
    child.stderr.on('data', (chunk: string) => {
      stderr += chunk;
    });

    // Without this listener a spawn failure (e.g. `node` not found) is an
    // unhandled 'error' event and takes down the whole harness. If 'close'
    // also fires afterwards, its resolve call is a no-op on the settled promise.
    child.on('error', (err) => {
      resolveOutcome(
        failedOutcome(`spawn failed: ${err.message}`.slice(0, 500))
      );
    });

    child.on('close', (code) => {
      if (code !== 0) {
        resolveOutcome(
          failedOutcome(
            stderr.trim().slice(-500) || `exit ${code ?? 'unknown'}`
          )
        );
        return;
      }

      const wallMs = performance.now() - start;
      const toolCalls: ToolCallObservation[] = [];
      let inputTokens = 0;
      let outputTokens = 0;
      let cacheReadTokens = 0;
      let cacheWriteTokens = 0;
      let cost = 0;
      let finalAssistant = '';

      for (const line of stdout.split('\n')) {
        if (line === '') continue;
        let event: { type?: string; message?: PiMessage } | null;
        try {
          event = JSON.parse(line);
        } catch {
          continue;
        }
        // `null` is valid JSON, and an event may lack a message; skip both
        // rather than throwing inside this handler.
        if (event == null || event.type !== 'message_end') continue;
        const m = event.message;
        if (m == null) continue;

        if (m.role === 'assistant') {
          const usage = m.usage;
          inputTokens += usage?.input ?? 0;
          outputTokens += usage?.output ?? 0;
          cacheReadTokens += usage?.cacheRead ?? 0;
          cacheWriteTokens += usage?.cacheWrite ?? 0;
          cost += usage?.cost?.total ?? 0;
          for (const block of m.content ?? []) {
            if (block.type === 'toolCall' && block.name != null) {
              toolCalls.push({
                name: block.name,
                argsBytes: JSON.stringify(block.arguments ?? {}).length,
                isError: false,
              });
            }
            if (block.type === 'text' && block.text != null) {
              // Later text blocks overwrite earlier ones, so the last wins.
              finalAssistant = block.text;
            }
          }
        }

        if (m.role === 'toolResult') {
          // Heuristic: the error is attributed to the latest recorded call.
          if (m.isError && toolCalls.length > 0) {
            toolCalls[toolCalls.length - 1].isError = true;
          }
        }
      }

      resolveOutcome({
        toolCalls,
        wallMs,
        inputTokens,
        outputTokens,
        cacheReadTokens,
        cacheWriteTokens,
        cost,
        finalAssistant,
        errored: false,
      });
    });
  });
}
|
|
428
|
+
|
|
429
|
+
/* ------------------------------------------------------------------ */
|
|
430
|
+
/* Our local-engine runner */
|
|
431
|
+
/* ------------------------------------------------------------------ */
|
|
432
|
+
|
|
433
|
+
async function runOurs(
|
|
434
|
+
task: Task,
|
|
435
|
+
cwd: string,
|
|
436
|
+
overrides: Partial<t.LocalExecutionConfig> = {}
|
|
437
|
+
): Promise<RunOutcome> {
|
|
438
|
+
const start = performance.now();
|
|
439
|
+
const conversation: BaseMessage[] = [];
|
|
440
|
+
const observedToolCalls: ToolCallObservation[] = [];
|
|
441
|
+
let inputTokens = 0;
|
|
442
|
+
let outputTokens = 0;
|
|
443
|
+
let cacheReadTokens = 0;
|
|
444
|
+
let cacheWriteTokens = 0;
|
|
445
|
+
|
|
446
|
+
const { aggregateContent } = createContentAggregator();
|
|
447
|
+
const customHandlers = {
|
|
448
|
+
[GraphEvents.TOOL_END]: new ToolEndHandler(),
|
|
449
|
+
[GraphEvents.CHAT_MODEL_END]: new ModelEndHandler(),
|
|
450
|
+
[GraphEvents.CHAT_MODEL_STREAM]: new ChatModelStreamHandler(),
|
|
451
|
+
// ON_RUN_STEP must be forwarded too — without it the aggregator's
|
|
452
|
+
// `stepMap` is empty when ON_RUN_STEP_COMPLETED arrives and you
|
|
453
|
+
// get a "No run step or runId found for completed step event"
|
|
454
|
+
// warn for every tool call. The harness doesn't actually use the
|
|
455
|
+
// aggregated content, but feeding both events keeps logs clean.
|
|
456
|
+
[GraphEvents.ON_RUN_STEP]: {
|
|
457
|
+
handle: (
|
|
458
|
+
event: GraphEvents.ON_RUN_STEP,
|
|
459
|
+
data: t.StreamEventData
|
|
460
|
+
): void => {
|
|
461
|
+
aggregateContent({ event, data: data as t.RunStep });
|
|
462
|
+
},
|
|
463
|
+
},
|
|
464
|
+
[GraphEvents.ON_RUN_STEP_COMPLETED]: {
|
|
465
|
+
handle: (
|
|
466
|
+
event: GraphEvents.ON_RUN_STEP_COMPLETED,
|
|
467
|
+
data: t.StreamEventData
|
|
468
|
+
): void => {
|
|
469
|
+
aggregateContent({
|
|
470
|
+
event,
|
|
471
|
+
data: data as unknown as { result: t.ToolEndEvent },
|
|
472
|
+
});
|
|
473
|
+
},
|
|
474
|
+
},
|
|
475
|
+
};
|
|
476
|
+
|
|
477
|
+
const llmConfig = getLLMConfig(PROVIDER);
|
|
478
|
+
const runConfig: t.RunConfig = {
|
|
479
|
+
runId: `compare-${Date.now()}`,
|
|
480
|
+
graphConfig: {
|
|
481
|
+
type: 'standard',
|
|
482
|
+
// NB: in the legacy path Run.createLegacyGraph rebuilds
|
|
483
|
+
// `clientOptions` from llmConfig (it ignores graphConfig.clientOptions),
|
|
484
|
+
// so promptCache lives here and not on a separate clientOptions field.
|
|
485
|
+
llmConfig: { ...llmConfig, model: MODEL, promptCache: true },
|
|
486
|
+
instructions:
|
|
487
|
+
'You are a coding assistant with local file tools. Use read_file, ' +
|
|
488
|
+
'edit_file, write_file, bash. Be concise.',
|
|
489
|
+
},
|
|
490
|
+
toolExecution: {
|
|
491
|
+
engine: 'local',
|
|
492
|
+
local: {
|
|
493
|
+
cwd,
|
|
494
|
+
postEditSyntaxCheck: 'auto',
|
|
495
|
+
timeoutMs: 30_000,
|
|
496
|
+
...overrides,
|
|
497
|
+
},
|
|
498
|
+
},
|
|
499
|
+
returnContent: true,
|
|
500
|
+
skipCleanup: true,
|
|
501
|
+
customHandlers,
|
|
502
|
+
};
|
|
503
|
+
|
|
504
|
+
let errored = false;
|
|
505
|
+
let errorMessage: string | undefined;
|
|
506
|
+
try {
|
|
507
|
+
const run = await Run.create<t.IState>(runConfig);
|
|
508
|
+
conversation.push(new HumanMessage(task.prompt));
|
|
509
|
+
const streamConfig = {
|
|
510
|
+
configurable: { provider: PROVIDER, thread_id: `compare-${Date.now()}` },
|
|
511
|
+
streamMode: 'values',
|
|
512
|
+
version: 'v2' as const,
|
|
513
|
+
};
|
|
514
|
+
await run.processStream(
|
|
515
|
+
{ messages: conversation },
|
|
516
|
+
streamConfig as Parameters<typeof run.processStream>[1]
|
|
517
|
+
);
|
|
518
|
+
const finalMessages = run.getRunMessages();
|
|
519
|
+
if (finalMessages) {
|
|
520
|
+
conversation.push(...finalMessages);
|
|
521
|
+
}
|
|
522
|
+
} catch (err) {
|
|
523
|
+
errored = true;
|
|
524
|
+
errorMessage = (err as Error).message.slice(0, 500);
|
|
525
|
+
}
|
|
526
|
+
|
|
527
|
+
// Walk the conversation: tool calls live on AIMessage as `tool_calls`,
|
|
528
|
+
// tool results are ToolMessage entries (already chronologically next to them).
|
|
529
|
+
for (const msg of conversation) {
|
|
530
|
+
if (msg._getType() === 'ai') {
|
|
531
|
+
const ai = msg as unknown as {
|
|
532
|
+
tool_calls?: Array<{ name?: string; args?: unknown }>;
|
|
533
|
+
usage_metadata?: { input_tokens?: number; output_tokens?: number };
|
|
534
|
+
};
|
|
535
|
+
if (ai.tool_calls != null) {
|
|
536
|
+
for (const tc of ai.tool_calls) {
|
|
537
|
+
observedToolCalls.push({
|
|
538
|
+
name: tc.name ?? '?',
|
|
539
|
+
argsBytes: JSON.stringify(tc.args ?? {}).length,
|
|
540
|
+
isError: false,
|
|
541
|
+
});
|
|
542
|
+
}
|
|
543
|
+
}
|
|
544
|
+
if (ai.usage_metadata != null) {
|
|
545
|
+
const reportedInput = ai.usage_metadata.input_tokens ?? 0;
|
|
546
|
+
outputTokens += ai.usage_metadata.output_tokens ?? 0;
|
|
547
|
+
const idu =
|
|
548
|
+
(ai.usage_metadata as unknown as {
|
|
549
|
+
input_token_details?: {
|
|
550
|
+
cache_read?: number;
|
|
551
|
+
cache_creation?: number;
|
|
552
|
+
};
|
|
553
|
+
}).input_token_details;
|
|
554
|
+
const cacheRead = idu?.cache_read ?? 0;
|
|
555
|
+
const cacheCreate = idu?.cache_creation ?? 0;
|
|
556
|
+
cacheReadTokens += cacheRead;
|
|
557
|
+
cacheWriteTokens += cacheCreate;
|
|
558
|
+
// The Anthropic adapter at src/llm/anthropic/utils/message_outputs.ts:31
|
|
559
|
+
// reports usage_metadata.input_tokens as the TOTAL prompt
|
|
560
|
+
// (input + cache_creation + cache_read), not just the uncached
|
|
561
|
+
// portion. Subtract cached fields so `inputTokens` here is
|
|
562
|
+
// apples-to-apples with pi's `input` field (uncached only).
|
|
563
|
+
const trulyUncached = Math.max(
|
|
564
|
+
0,
|
|
565
|
+
reportedInput - cacheRead - cacheCreate
|
|
566
|
+
);
|
|
567
|
+
inputTokens += trulyUncached;
|
|
568
|
+
}
|
|
569
|
+
}
|
|
570
|
+
if (msg instanceof ToolMessage) {
|
|
571
|
+
if (msg.status === 'error' && observedToolCalls.length > 0) {
|
|
572
|
+
observedToolCalls[observedToolCalls.length - 1].isError = true;
|
|
573
|
+
}
|
|
574
|
+
}
|
|
575
|
+
}
|
|
576
|
+
|
|
577
|
+
const lastAssistant = [...conversation]
|
|
578
|
+
.reverse()
|
|
579
|
+
.find((m) => m._getType() === 'ai');
|
|
580
|
+
let finalAssistant = '';
|
|
581
|
+
if (lastAssistant) {
|
|
582
|
+
const c = lastAssistant.content;
|
|
583
|
+
finalAssistant =
|
|
584
|
+
typeof c === 'string'
|
|
585
|
+
? c
|
|
586
|
+
: Array.isArray(c)
|
|
587
|
+
? c
|
|
588
|
+
.map((b) => ('text' in b ? b.text : ''))
|
|
589
|
+
.filter(Boolean)
|
|
590
|
+
.join(' ')
|
|
591
|
+
: '';
|
|
592
|
+
}
|
|
593
|
+
|
|
594
|
+
// Sonnet 4.5 pricing (USD per 1M tokens). Pi computes its own cost; we
|
|
595
|
+
// compute ours from the same per-turn breakdown so the cost columns are
|
|
596
|
+
// comparable. Source: anthropic.com/pricing as of model ship.
|
|
597
|
+
const PRICE_INPUT = 3.0 / 1_000_000;
|
|
598
|
+
const PRICE_OUTPUT = 15.0 / 1_000_000;
|
|
599
|
+
const PRICE_CACHE_WRITE = 3.75 / 1_000_000;
|
|
600
|
+
const PRICE_CACHE_READ = 0.3 / 1_000_000;
|
|
601
|
+
const cost =
|
|
602
|
+
inputTokens * PRICE_INPUT +
|
|
603
|
+
outputTokens * PRICE_OUTPUT +
|
|
604
|
+
cacheWriteTokens * PRICE_CACHE_WRITE +
|
|
605
|
+
cacheReadTokens * PRICE_CACHE_READ;
|
|
606
|
+
|
|
607
|
+
return {
|
|
608
|
+
toolCalls: observedToolCalls,
|
|
609
|
+
wallMs: performance.now() - start,
|
|
610
|
+
inputTokens,
|
|
611
|
+
outputTokens,
|
|
612
|
+
cacheReadTokens,
|
|
613
|
+
cacheWriteTokens,
|
|
614
|
+
cost,
|
|
615
|
+
finalAssistant: finalAssistant.slice(0, 500),
|
|
616
|
+
errored,
|
|
617
|
+
errorMessage,
|
|
618
|
+
};
|
|
619
|
+
}
|
|
620
|
+
|
|
621
|
+
/* ------------------------------------------------------------------ */
|
|
622
|
+
/* Harness */
|
|
623
|
+
/* ------------------------------------------------------------------ */
|
|
624
|
+
|
|
625
|
+
/**
 * Create a fresh temporary workspace for one run of `task` and populate it.
 *
 * Steps, in order:
 *   1. create a unique directory under the OS temp dir (prefix `lc-compare-`);
 *   2. write every `task.seed` entry as a UTF-8 text file;
 *   3. write every `task.seedBinary` entry (if any) as raw bytes;
 *   4. invoke the optional `task.setup(dir)` hook.
 *
 * Parent directories of seeded files are created on demand.
 *
 * @param task - Task whose `seed`, `seedBinary` and `setup` describe the workspace.
 * @returns Absolute path of the populated workspace directory.
 * @throws Re-throws any error from seeding or from `task.setup`. The partially
 *   built directory is removed first, because the caller never receives the
 *   path and therefore could not clean it up itself.
 */
async function setupWorkspace(task: Task): Promise<string> {
  const { mkdir, rm } = await import('fs/promises');
  const dir = await mkdtemp(join(tmpdir(), 'lc-compare-'));

  // Shared by the text and binary seeding passes: ensure the parent directory
  // exists, then hand the payload to the supplied writer.
  const seedFile = async (
    relPath: string,
    write: (abs: string) => Promise<void>
  ): Promise<void> => {
    const abs = join(dir, relPath);
    await mkdir(join(abs, '..'), { recursive: true });
    await write(abs);
  };

  try {
    for (const [relPath, content] of Object.entries(task.seed)) {
      await seedFile(relPath, (abs) => writeFile(abs, content, 'utf8'));
    }
    for (const [relPath, bytes] of Object.entries(task.seedBinary ?? {})) {
      await seedFile(relPath, (abs) => writeFile(abs, bytes));
    }
    if (task.setup != null) {
      await task.setup(dir);
    }
    return dir;
  } catch (err) {
    // Best-effort cleanup; the original failure is the error worth reporting,
    // so a secondary failure while removing the directory is ignored.
    await rm(dir, { recursive: true, force: true }).catch(() => {
      /* ignore cleanup failure */
    });
    throw err;
  }
}
|
|
643
|
+
|
|
644
|
+
/**
 * Link `<process.cwd()>/node_modules` into the workspace as
 * `<cwd>/node_modules` (directory symlink).
 *
 * The link is best-effort: any failure while creating it is deliberately
 * ignored, in which case tsc simply is not available inside the workspace.
 *
 * @param cwd - Workspace directory that should receive the symlink.
 */
async function symlinkRepoNodeModules(cwd: string): Promise<void> {
  const fsPromises = await import('fs/promises');
  const target = resolve(process.cwd(), 'node_modules');
  const linkPath = join(cwd, 'node_modules');
  try {
    await fsPromises.symlink(target, linkPath, 'dir');
  } catch {
    /* fall through; tsc just won't be available */
  }
}
|
|
651
|
+
|
|
652
|
+
/**
 * Render a one-line summary of the observed tool calls.
 *
 * Format: `<total> call(s) [<name>×<count>, ...]`, with tools listed in order
 * of first appearance, followed by ` (<n> errored)` when at least one call is
 * flagged `isError`. An empty list is rendered as `<none>`.
 *
 * @param calls - Tool calls observed during one run.
 * @returns The human-readable summary string.
 */
function summariseToolCalls(calls: ToolCallObservation[]): string {
  const total = calls.length;
  if (total === 0) {
    return '<none>';
  }

  // A Map keeps insertion order, so tools appear in first-seen order.
  const perTool = new Map<string, number>();
  let failed = 0;
  calls.forEach((call) => {
    perTool.set(call.name, (perTool.get(call.name) ?? 0) + 1);
    if (call.isError) {
      failed += 1;
    }
  });

  const parts: string[] = [];
  perTool.forEach((count, name) => {
    parts.push(`${name}×${count}`);
  });

  const suffix = failed > 0 ? ` (${failed} errored)` : '';
  return `${total} call(s) [${parts.join(', ')}]${suffix}`;
}
|
|
664
|
+
|
|
665
|
+
/**
 * Format a duration given in milliseconds.
 *
 * Values of one second or more are shown in seconds with one decimal
 * (`1.5s`); anything else is shown as whole, rounded milliseconds (`250ms`).
 *
 * @param ms - Duration in milliseconds.
 * @returns The formatted duration.
 */
function fmtMs(ms: number): string {
  if (ms >= 1000) {
    const seconds = ms / 1000;
    return `${seconds.toFixed(1)}s`;
  }
  return `${Math.round(ms)}ms`;
}
|
|
668
|
+
|
|
669
|
+
/**
 * Per-task accumulator for one side of the comparison (pi or ours).
 * One entry is appended to each array for every run that was not skipped.
 */
interface AggregatedSide {
  /** Outcome of each recorded run, in execution order. */
  outcomes: Array<RunOutcome>;
  /** Verification pass/fail flag of each recorded run, in execution order. */
  verifies: Array<boolean>;
}
|
|
673
|
+
/**
 * Build a new accumulator with no recorded runs.
 *
 * @returns An `AggregatedSide` whose arrays are fresh and empty, so separate
 *   calls never share state.
 */
function emptySide(): AggregatedSide {
  const outcomes: RunOutcome[] = [];
  const verifies: boolean[] = [];
  return { outcomes, verifies };
}
|
|
676
|
+
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param xs - Values to average.
 * @returns The mean, or 0 for an empty list (avoids a 0/0 NaN).
 */
function avg(xs: number[]): number {
  const count = xs.length;
  if (count === 0) {
    return 0;
  }
  let total = 0;
  xs.forEach((x) => {
    total += x;
  });
  return total / count;
}
|
|
679
|
+
|
|
680
|
+
/**
 * Execute `task` once on the requested side inside a throw-away workspace.
 *
 * A new workspace is created with `setupWorkspace`, the matching runner
 * (`runPi` or `runOurs`) is invoked, and `task.verify` inspects the resulting
 * directory. If the runner reported an error, the verification result is
 * replaced by a forced failure so a lenient verifier cannot hide it.
 *
 * The workspace is removed in a `finally` block, so it is cleaned up even
 * when the runner or the verifier throws.
 *
 * @param task - Task to execute.
 * @param side - Which implementation to run.
 * @returns The run outcome and verification result, or `null` when the task
 *   is marked to skip this side.
 */
async function runOnce(
  task: Task,
  side: 'pi' | 'ours'
): Promise<{ outcome: RunOutcome; verify: { ok: boolean; detail: string } } | null> {
  if (task.skip === side) return null;
  const cwd = await setupWorkspace(task);
  try {
    const outcome =
      side === 'pi'
        ? await runPi(task, cwd)
        : await runOurs(task, cwd, task.oursLocalConfigOverrides ?? {});
    let verify = await task.verify(cwd);
    if (outcome.errored) {
      // Force-fail verify when the runner errored — otherwise a soft
      // verify can mask a real provider rejection or a crash.
      verify = {
        ok: false,
        detail: `runner errored: ${outcome.errorMessage ?? 'unknown'}`,
      };
    }
    return { outcome, verify };
  } finally {
    // Always discard the temp workspace, including on exceptions.
    await rm(cwd, { recursive: true, force: true });
  }
}
|
|
702
|
+
|
|
703
|
+
/**
 * Entry point of the comparison harness.
 *
 * For every task in `TASKS`, runs the pi side and then our side `ITERS` times
 * (`COMPARE_ITERS` environment variable, default 1), printing one line per
 * run. Afterwards prints a per-task summary table (means over the recorded
 * runs) and the overall verification counts. Sets `process.exitCode = 1` when
 * any recorded run failed verification.
 */
async function main(): Promise<void> {
  // COMPARE_ITERS may be unset, empty, fractional or not a number at all.
  // `Number('abc')` is NaN and `Math.max(1, NaN)` is NaN too, which would make
  // the iteration loop run zero times and report a vacuous success. Validate
  // explicitly and fall back to a single iteration; floor fractional values
  // so the "iter i/N" tag stays meaningful.
  const requestedIters = Number(process.env.COMPARE_ITERS ?? '1');
  const ITERS =
    Number.isFinite(requestedIters) && requestedIters >= 1
      ? Math.floor(requestedIters)
      : 1;
  console.log(`pi binary: ${PI_BIN}`);
  console.log(`model: ${MODEL}`);
  console.log(`provider: ${PROVIDER}`);
  console.log(`iters: ${ITERS}`);

  const results: Array<{
    task: Task;
    pi: AggregatedSide;
    ours: AggregatedSide;
  }> = [];

  /**
   * Append one run to its side's accumulator and print its report line.
   * Both sides share this formatter so their columns (including cost) match.
   * A `null` result means the task skips this side.
   */
  function recordRun(
    label: 'pi' | 'ours',
    tag: string,
    side: AggregatedSide,
    res: { outcome: RunOutcome; verify: { ok: boolean; detail: string } } | null
  ): void {
    if (res == null) {
      console.log(`[${label}]${tag} (skipped)`);
      return;
    }
    const { outcome, verify } = res;
    side.outcomes.push(outcome);
    side.verifies.push(verify.ok);
    const status = outcome.errored ? 'ERROR' : verify.ok ? 'ok' : 'fail';
    console.log(
      `[${label}]${tag} ${status} ` +
        `${fmtMs(outcome.wallMs)} ${summariseToolCalls(outcome.toolCalls)} ` +
        `in=${outcome.inputTokens} out=${outcome.outputTokens} ` +
        `cacheR=${outcome.cacheReadTokens} cacheW=${outcome.cacheWriteTokens} ` +
        `$${outcome.cost.toFixed(4)}`
    );
    if (outcome.errored) console.log(` err: ${outcome.errorMessage}`);
  }

  for (const task of TASKS) {
    console.log(`\n========== ${task.name} ==========`);
    console.log(task.description);

    const pi = emptySide();
    const ours = emptySide();

    for (let i = 0; i < ITERS; i++) {
      const tag = ITERS > 1 ? ` (iter ${i + 1}/${ITERS})` : '';
      // Runs are strictly sequential: pi first, then ours.
      recordRun('pi', tag, pi, await runOnce(task, 'pi'));
      recordRun('ours', tag, ours, await runOnce(task, 'ours'));
    }

    results.push({ task, pi, ours });
  }

  /* Summary table ---------------------------------------------------- */
  console.log('\n\n================ SUMMARY ================');
  if (ITERS > 1) {
    console.log(`(metrics are mean over ${ITERS} iterations)\n`);
  } else {
    console.log();
  }

  /** Rounded mean of a numeric outcome field, or 'N/A' if nothing was run. */
  function fmtSide(side: AggregatedSide, key: keyof RunOutcome): string {
    if (side.outcomes.length === 0) return 'N/A';
    const vals = side.outcomes.map((o) => Number(o[key] ?? 0));
    return Math.round(avg(vals)).toString();
  }
  /** Mean wall-clock time, formatted, or 'N/A' if nothing was run. */
  function fmtSideMs(side: AggregatedSide): string {
    if (side.outcomes.length === 0) return 'N/A';
    return fmtMs(avg(side.outcomes.map((o) => o.wallMs)));
  }
  /** Mean number of tool calls with one decimal, or 'N/A'. */
  function fmtSideCalls(side: AggregatedSide): string {
    if (side.outcomes.length === 0) return 'N/A';
    return avg(side.outcomes.map((o) => o.toolCalls.length)).toFixed(1);
  }
  /** '✔' when every run verified, otherwise 'passed/total'; 'N/A' if none. */
  function fmtVerify(side: AggregatedSide): string {
    if (side.verifies.length === 0) return 'N/A';
    const passed = side.verifies.filter(Boolean).length;
    return passed === side.verifies.length
      ? '✔'
      : `${passed}/${side.verifies.length}`;
  }
  /** Mean cost in dollars, '-' when the mean is exactly zero, or 'N/A'. */
  function fmtCost(side: AggregatedSide): string {
    if (side.outcomes.length === 0) return 'N/A';
    const c = avg(side.outcomes.map((o) => o.cost));
    return c === 0 ? '-' : `$${c.toFixed(4)}`;
  }

  // First row is the header; each task then contributes one row per metric.
  const cols: Array<[string, string, string, string]> = [
    ['task', 'metric', 'pi', 'ours'],
  ];
  for (const r of results) {
    cols.push([r.task.name, 'verify', fmtVerify(r.pi), fmtVerify(r.ours)]);
    cols.push(['', 'wall', fmtSideMs(r.pi), fmtSideMs(r.ours)]);
    cols.push(['', 'tool calls', fmtSideCalls(r.pi), fmtSideCalls(r.ours)]);
    cols.push(['', 'input new', fmtSide(r.pi, 'inputTokens'), fmtSide(r.ours, 'inputTokens')]);
    cols.push(['', 'cache read', fmtSide(r.pi, 'cacheReadTokens'), fmtSide(r.ours, 'cacheReadTokens')]);
    cols.push(['', 'cache write', fmtSide(r.pi, 'cacheWriteTokens'), fmtSide(r.ours, 'cacheWriteTokens')]);
    cols.push(['', 'output tok', fmtSide(r.pi, 'outputTokens'), fmtSide(r.ours, 'outputTokens')]);
    cols.push(['', 'cost', fmtCost(r.pi), fmtCost(r.ours)]);
  }

  // Pad every column to the width of its longest cell.
  const widths = [0, 0, 0, 0].map((_, i) =>
    Math.max(...cols.map((row) => row[i].length))
  );
  for (const row of cols) {
    console.log(
      row
        .map((cell, i) => cell.padEnd(widths[i]))
        .join(' ')
    );
  }

  // Aggregate verify counts across all iters of all non-skipped tasks.
  const piVerifies = results.flatMap((r) => r.pi.verifies);
  const oursVerifies = results.flatMap((r) => r.ours.verifies);
  const piPassed = piVerifies.filter(Boolean).length;
  const oursPassed = oursVerifies.filter(Boolean).length;
  console.log(
    `\nOverall: pi ${piPassed}/${piVerifies.length}, ours ${oursPassed}/${oursVerifies.length}.`
  );
  if (piPassed < piVerifies.length || oursPassed < oursVerifies.length) {
    process.exitCode = 1;
  }
}
|
|
831
|
+
|
|
832
|
+
// Registered before main() starts: a promise rejection that nothing handled
// is reported and terminates the process with a non-zero status.
process.on('unhandledRejection', (reason) => {
  console.error('Unhandled Rejection:', reason);
  process.exit(1);
});

// Start the harness. A rejection from main() is printed and the process
// exits with status 1.
main().then(undefined, (err) => {
  console.error(err);
  process.exit(1);
});
|