@librechat/agents 3.1.86 → 3.1.88
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +69 -0
- package/dist/cjs/events.cjs +23 -0
- package/dist/cjs/events.cjs.map +1 -1
- package/dist/cjs/graphs/Graph.cjs +133 -18
- package/dist/cjs/graphs/Graph.cjs.map +1 -1
- package/dist/cjs/graphs/MultiAgentGraph.cjs +1 -1
- package/dist/cjs/graphs/MultiAgentGraph.cjs.map +1 -1
- package/dist/cjs/llm/anthropic/index.cjs +251 -53
- package/dist/cjs/llm/anthropic/index.cjs.map +1 -1
- package/dist/cjs/llm/init.cjs +1 -5
- package/dist/cjs/llm/init.cjs.map +1 -1
- package/dist/cjs/llm/openai/index.cjs +113 -24
- package/dist/cjs/llm/openai/index.cjs.map +1 -1
- package/dist/cjs/llm/openai/utils/index.cjs.map +1 -1
- package/dist/cjs/llm/openrouter/index.cjs +3 -1
- package/dist/cjs/llm/openrouter/index.cjs.map +1 -1
- package/dist/cjs/main.cjs +18 -5
- package/dist/cjs/main.cjs.map +1 -1
- package/dist/cjs/openai/index.cjs +253 -0
- package/dist/cjs/openai/index.cjs.map +1 -0
- package/dist/cjs/responses/index.cjs +448 -0
- package/dist/cjs/responses/index.cjs.map +1 -0
- package/dist/cjs/run.cjs +108 -7
- package/dist/cjs/run.cjs.map +1 -1
- package/dist/cjs/session/AgentSession.cjs +1057 -0
- package/dist/cjs/session/AgentSession.cjs.map +1 -0
- package/dist/cjs/session/JsonlSessionStore.cjs +425 -0
- package/dist/cjs/session/JsonlSessionStore.cjs.map +1 -0
- package/dist/cjs/session/handlers.cjs +221 -0
- package/dist/cjs/session/handlers.cjs.map +1 -0
- package/dist/cjs/session/ids.cjs +22 -0
- package/dist/cjs/session/ids.cjs.map +1 -0
- package/dist/cjs/session/messageSerialization.cjs +179 -0
- package/dist/cjs/session/messageSerialization.cjs.map +1 -0
- package/dist/cjs/stream.cjs +475 -11
- package/dist/cjs/stream.cjs.map +1 -1
- package/dist/cjs/summarization/node.cjs +1 -1
- package/dist/cjs/summarization/node.cjs.map +1 -1
- package/dist/cjs/tools/ToolNode.cjs +177 -59
- package/dist/cjs/tools/ToolNode.cjs.map +1 -1
- package/dist/cjs/tools/eagerEventExecution.cjs +113 -0
- package/dist/cjs/tools/eagerEventExecution.cjs.map +1 -0
- package/dist/cjs/tools/handlers.cjs +1 -1
- package/dist/cjs/tools/handlers.cjs.map +1 -1
- package/dist/cjs/tools/streamedToolCallSeals.cjs +42 -0
- package/dist/cjs/tools/streamedToolCallSeals.cjs.map +1 -0
- package/dist/esm/events.mjs +23 -1
- package/dist/esm/events.mjs.map +1 -1
- package/dist/esm/graphs/Graph.mjs +133 -18
- package/dist/esm/graphs/Graph.mjs.map +1 -1
- package/dist/esm/graphs/MultiAgentGraph.mjs +1 -1
- package/dist/esm/graphs/MultiAgentGraph.mjs.map +1 -1
- package/dist/esm/llm/anthropic/index.mjs +251 -53
- package/dist/esm/llm/anthropic/index.mjs.map +1 -1
- package/dist/esm/llm/init.mjs +1 -5
- package/dist/esm/llm/init.mjs.map +1 -1
- package/dist/esm/llm/openai/index.mjs +113 -25
- package/dist/esm/llm/openai/index.mjs.map +1 -1
- package/dist/esm/llm/openai/utils/index.mjs.map +1 -1
- package/dist/esm/llm/openrouter/index.mjs +4 -2
- package/dist/esm/llm/openrouter/index.mjs.map +1 -1
- package/dist/esm/main.mjs +5 -1
- package/dist/esm/main.mjs.map +1 -1
- package/dist/esm/openai/index.mjs +246 -0
- package/dist/esm/openai/index.mjs.map +1 -0
- package/dist/esm/responses/index.mjs +440 -0
- package/dist/esm/responses/index.mjs.map +1 -0
- package/dist/esm/run.mjs +108 -7
- package/dist/esm/run.mjs.map +1 -1
- package/dist/esm/session/AgentSession.mjs +1054 -0
- package/dist/esm/session/AgentSession.mjs.map +1 -0
- package/dist/esm/session/JsonlSessionStore.mjs +422 -0
- package/dist/esm/session/JsonlSessionStore.mjs.map +1 -0
- package/dist/esm/session/handlers.mjs +219 -0
- package/dist/esm/session/handlers.mjs.map +1 -0
- package/dist/esm/session/ids.mjs +17 -0
- package/dist/esm/session/ids.mjs.map +1 -0
- package/dist/esm/session/messageSerialization.mjs +173 -0
- package/dist/esm/session/messageSerialization.mjs.map +1 -0
- package/dist/esm/stream.mjs +476 -12
- package/dist/esm/stream.mjs.map +1 -1
- package/dist/esm/summarization/node.mjs +1 -1
- package/dist/esm/summarization/node.mjs.map +1 -1
- package/dist/esm/tools/ToolNode.mjs +177 -59
- package/dist/esm/tools/ToolNode.mjs.map +1 -1
- package/dist/esm/tools/eagerEventExecution.mjs +107 -0
- package/dist/esm/tools/eagerEventExecution.mjs.map +1 -0
- package/dist/esm/tools/handlers.mjs +1 -1
- package/dist/esm/tools/handlers.mjs.map +1 -1
- package/dist/esm/tools/streamedToolCallSeals.mjs +36 -0
- package/dist/esm/tools/streamedToolCallSeals.mjs.map +1 -0
- package/dist/types/events.d.ts +1 -0
- package/dist/types/graphs/Graph.d.ts +24 -9
- package/dist/types/index.d.ts +1 -0
- package/dist/types/llm/openai/index.d.ts +1 -0
- package/dist/types/openai/index.d.ts +75 -0
- package/dist/types/responses/index.d.ts +97 -0
- package/dist/types/run.d.ts +2 -0
- package/dist/types/session/AgentSession.d.ts +32 -0
- package/dist/types/session/JsonlSessionStore.d.ts +67 -0
- package/dist/types/session/handlers.d.ts +8 -0
- package/dist/types/session/ids.d.ts +4 -0
- package/dist/types/session/index.d.ts +5 -0
- package/dist/types/session/messageSerialization.d.ts +7 -0
- package/dist/types/session/types.d.ts +191 -0
- package/dist/types/tools/ToolNode.d.ts +12 -1
- package/dist/types/tools/eagerEventExecution.d.ts +23 -0
- package/dist/types/tools/streamedToolCallSeals.d.ts +13 -0
- package/dist/types/types/hitl.d.ts +4 -0
- package/dist/types/types/run.d.ts +11 -1
- package/dist/types/types/tools.d.ts +36 -0
- package/package.json +19 -2
- package/src/__tests__/stream.eagerEventExecution.test.ts +2571 -0
- package/src/events.ts +29 -0
- package/src/graphs/Graph.ts +224 -50
- package/src/graphs/MultiAgentGraph.ts +1 -1
- package/src/graphs/__tests__/composition.smoke.test.ts +30 -0
- package/src/index.ts +3 -0
- package/src/llm/anthropic/index.ts +356 -84
- package/src/llm/anthropic/llm.spec.ts +64 -0
- package/src/llm/custom-chat-models.smoke.test.ts +175 -4
- package/src/llm/openai/contentBlocks.test.ts +35 -0
- package/src/llm/openai/deepseek.test.ts +201 -2
- package/src/llm/openai/index.ts +171 -26
- package/src/llm/openai/utils/index.ts +22 -0
- package/src/llm/openrouter/index.ts +4 -2
- package/src/openai/__tests__/openai.test.ts +337 -0
- package/src/openai/index.ts +404 -0
- package/src/responses/__tests__/responses.test.ts +652 -0
- package/src/responses/index.ts +677 -0
- package/src/run.ts +158 -8
- package/src/scripts/compare_pi_vs_ours.ts +592 -173
- package/src/scripts/session_live.ts +548 -0
- package/src/session/AgentSession.ts +1432 -0
- package/src/session/JsonlSessionStore.ts +572 -0
- package/src/session/__tests__/JsonlSessionStore.test.ts +1410 -0
- package/src/session/__tests__/handlers.test.ts +161 -0
- package/src/session/handlers.ts +272 -0
- package/src/session/ids.ts +17 -0
- package/src/session/index.ts +44 -0
- package/src/session/messageSerialization.ts +207 -0
- package/src/session/types.ts +275 -0
- package/src/specs/custom-event-await.test.ts +89 -0
- package/src/specs/summarization.test.ts +1 -1
- package/src/stream.ts +756 -48
- package/src/summarization/node.ts +1 -1
- package/src/tools/ToolNode.ts +299 -126
- package/src/tools/__tests__/ToolNode.eagerEventExecution.test.ts +373 -0
- package/src/tools/__tests__/handlers.test.ts +2 -1
- package/src/tools/__tests__/hitl.test.ts +206 -110
- package/src/tools/eagerEventExecution.ts +153 -0
- package/src/tools/handlers.ts +8 -4
- package/src/tools/streamedToolCallSeals.ts +57 -0
- package/src/types/hitl.ts +4 -0
- package/src/types/run.ts +11 -0
- package/src/types/tools.ts +36 -0
- package/dist/cjs/llm/text.cjs +0 -69
- package/dist/cjs/llm/text.cjs.map +0 -1
- package/dist/esm/llm/text.mjs +0 -67
- package/dist/esm/llm/text.mjs.map +0 -1
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* src/scripts/compare_pi_vs_ours.ts
|
|
3
3
|
*
|
|
4
|
-
* Side-by-side runs: pi-mono's `pi` CLI vs our
|
|
4
|
+
* Side-by-side runs: pi-mono's `pi` CLI vs our AgentSession facade, same
|
|
5
5
|
* task, same model, two parallel temp workspaces. We track:
|
|
6
6
|
*
|
|
7
7
|
* - tool calls (name + args length, ordered)
|
|
@@ -10,7 +10,8 @@
|
|
|
10
10
|
* - whether the final on-disk state matches the expected outcome
|
|
11
11
|
*
|
|
12
12
|
* The tasks intentionally probe areas where we expect the local
|
|
13
|
-
* engine to behave differently
|
|
13
|
+
* engine to behave differently, while the preflight probes compare the
|
|
14
|
+
* programmatic session DX now exposed by the SDK:
|
|
14
15
|
*
|
|
15
16
|
* T1 simple-edit — both should one-shot
|
|
16
17
|
* T2 fuzzy-edit — model emits an `oldText` with off-by-
|
|
@@ -32,25 +33,35 @@ config();
|
|
|
32
33
|
import { spawn } from 'child_process';
|
|
33
34
|
import { homedir, tmpdir } from 'os';
|
|
34
35
|
import { join, resolve } from 'path';
|
|
35
|
-
import {
|
|
36
|
+
import {
|
|
37
|
+
copyFile,
|
|
38
|
+
mkdir,
|
|
39
|
+
mkdtemp,
|
|
40
|
+
readFile,
|
|
41
|
+
rm,
|
|
42
|
+
stat,
|
|
43
|
+
symlink,
|
|
44
|
+
writeFile,
|
|
45
|
+
} from 'fs/promises';
|
|
36
46
|
import { performance } from 'perf_hooks';
|
|
37
|
-
import { HumanMessage, ToolMessage } from '@langchain/core/messages';
|
|
47
|
+
import { AIMessage, HumanMessage, ToolMessage } from '@langchain/core/messages';
|
|
48
|
+
import { MemorySaver } from '@langchain/langgraph';
|
|
38
49
|
import type { BaseMessage } from '@langchain/core/messages';
|
|
50
|
+
import type {
|
|
51
|
+
AgentSessionCheckpointing,
|
|
52
|
+
AgentSessionConfig,
|
|
53
|
+
AgentSessionRunResult,
|
|
54
|
+
} from '@/session';
|
|
39
55
|
import type * as t from '@/types';
|
|
40
|
-
import { ChatModelStreamHandler, createContentAggregator } from '@/stream';
|
|
41
|
-
import { ToolEndHandler, ModelEndHandler } from '@/events';
|
|
42
56
|
import { getLLMConfig } from '@/utils/llmConfig';
|
|
43
|
-
import {
|
|
44
|
-
import {
|
|
57
|
+
import { Providers, StepTypes } from '@/common';
|
|
58
|
+
import { createAgentSession } from '@/session';
|
|
45
59
|
|
|
46
60
|
const PROVIDER = Providers.ANTHROPIC;
|
|
47
61
|
const MODEL = 'claude-sonnet-4-5';
|
|
48
62
|
const PI_BIN =
|
|
49
63
|
process.env.PI_BIN ??
|
|
50
|
-
resolve(
|
|
51
|
-
homedir(),
|
|
52
|
-
'Projects/pi-mono/packages/coding-agent/dist/cli.js'
|
|
53
|
-
);
|
|
64
|
+
resolve(homedir(), 'Projects/pi-mono/packages/coding-agent/dist/cli.js');
|
|
54
65
|
|
|
55
66
|
interface Task {
|
|
56
67
|
name: string;
|
|
@@ -95,6 +106,21 @@ interface RunOutcome {
|
|
|
95
106
|
finalAssistant: string;
|
|
96
107
|
errored: boolean;
|
|
97
108
|
errorMessage?: string;
|
|
109
|
+
sessionEvents?: number;
|
|
110
|
+
sessionEntries?: number;
|
|
111
|
+
sessionPath?: string;
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
interface DxProbeResult {
|
|
115
|
+
ok: boolean;
|
|
116
|
+
detail: string;
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
interface DxProbe {
|
|
120
|
+
feature: string;
|
|
121
|
+
pi: string;
|
|
122
|
+
ours: string;
|
|
123
|
+
run: () => Promise<DxProbeResult>;
|
|
98
124
|
}
|
|
99
125
|
|
|
100
126
|
const TASKS: Task[] = [
|
|
@@ -102,8 +128,7 @@ const TASKS: Task[] = [
|
|
|
102
128
|
name: 'T1 simple-edit',
|
|
103
129
|
description: 'Single literal substitution in an existing file.',
|
|
104
130
|
seed: {
|
|
105
|
-
'greet.py':
|
|
106
|
-
'def greet(name):\n return f"Hello, {name}!"\n',
|
|
131
|
+
'greet.py': 'def greet(name):\n return f"Hello, {name}!"\n',
|
|
107
132
|
},
|
|
108
133
|
prompt:
|
|
109
134
|
'Edit greet.py: change the greeting from "Hello" to "Hi". ' +
|
|
@@ -163,12 +188,11 @@ const TASKS: Task[] = [
|
|
|
163
188
|
null,
|
|
164
189
|
2
|
|
165
190
|
),
|
|
166
|
-
'broken.ts':
|
|
167
|
-
'export const port: number = "not a number";\n',
|
|
191
|
+
'broken.ts': 'export const port: number = "not a number";\n',
|
|
168
192
|
},
|
|
169
193
|
prompt:
|
|
170
194
|
'broken.ts has a type error. Fix it so the project typechecks cleanly. ' +
|
|
171
|
-
|
|
195
|
+
"After fixing, verify by running the project's typecheck (or `compile_check` if available). " +
|
|
172
196
|
'Reply with "done".',
|
|
173
197
|
verify: async (cwd) => {
|
|
174
198
|
const text = await readFile(join(cwd, 'broken.ts'), 'utf8').catch(
|
|
@@ -224,8 +248,12 @@ const TASKS: Task[] = [
|
|
|
224
248
|
'Rename the exported function `calc_total` to `calculateTotal` across src/lib.ts, ' +
|
|
225
249
|
'src/index.ts, and src/index.test.ts. Update every reference. Reply "done" when finished.',
|
|
226
250
|
verify: async (cwd) => {
|
|
227
|
-
const lib = await readFile(join(cwd, 'src/lib.ts'), 'utf8').catch(
|
|
228
|
-
|
|
251
|
+
const lib = await readFile(join(cwd, 'src/lib.ts'), 'utf8').catch(
|
|
252
|
+
() => ''
|
|
253
|
+
);
|
|
254
|
+
const idx = await readFile(join(cwd, 'src/index.ts'), 'utf8').catch(
|
|
255
|
+
() => ''
|
|
256
|
+
);
|
|
229
257
|
const tst = await readFile(join(cwd, 'src/index.test.ts'), 'utf8').catch(
|
|
230
258
|
() => ''
|
|
231
259
|
);
|
|
@@ -240,9 +268,7 @@ const TASKS: Task[] = [
|
|
|
240
268
|
const ok = allRenamed && noOldName;
|
|
241
269
|
return {
|
|
242
270
|
ok,
|
|
243
|
-
detail: ok
|
|
244
|
-
? ''
|
|
245
|
-
: `lib:\n${lib}\nindex:\n${idx}\ntest:\n${tst}`,
|
|
271
|
+
detail: ok ? '' : `lib:\n${lib}\nindex:\n${idx}\ntest:\n${tst}`,
|
|
246
272
|
};
|
|
247
273
|
},
|
|
248
274
|
},
|
|
@@ -252,7 +278,6 @@ const TASKS: Task[] = [
|
|
|
252
278
|
'Reads a PNG and describes it. Ours embeds via attachReadAttachments + image_url block; pi has no equivalent and is skipped.',
|
|
253
279
|
seed: {},
|
|
254
280
|
setup: async (cwd) => {
|
|
255
|
-
const { copyFile } = await import('fs/promises');
|
|
256
281
|
// Use a real PNG (Anthropic refuses tiny 1x1 PNGs with "Could not
|
|
257
282
|
// process image"). Try a few well-known macOS app icons; fall back to
|
|
258
283
|
// any *.png we can find under /System.
|
|
@@ -277,7 +302,6 @@ const TASKS: Task[] = [
|
|
|
277
302
|
// The verify step is soft — we just check the file is still on disk
|
|
278
303
|
// (the agent shouldn't have deleted it) and the script-level error
|
|
279
304
|
// tracking will fail this task if Anthropic refused the request.
|
|
280
|
-
const { stat } = await import('fs/promises');
|
|
281
305
|
try {
|
|
282
306
|
await stat(join(cwd, 'sample.png'));
|
|
283
307
|
return { ok: true, detail: '' };
|
|
@@ -430,58 +454,23 @@ async function runPi(task: Task, cwd: string): Promise<RunOutcome> {
|
|
|
430
454
|
/* Our local-engine runner */
|
|
431
455
|
/* ------------------------------------------------------------------ */
|
|
432
456
|
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
overrides
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
let outputTokens = 0;
|
|
443
|
-
let cacheReadTokens = 0;
|
|
444
|
-
let cacheWriteTokens = 0;
|
|
445
|
-
|
|
446
|
-
const { aggregateContent } = createContentAggregator();
|
|
447
|
-
const customHandlers = {
|
|
448
|
-
[GraphEvents.TOOL_END]: new ToolEndHandler(),
|
|
449
|
-
[GraphEvents.CHAT_MODEL_END]: new ModelEndHandler(),
|
|
450
|
-
[GraphEvents.CHAT_MODEL_STREAM]: new ChatModelStreamHandler(),
|
|
451
|
-
// ON_RUN_STEP must be forwarded too — without it the aggregator's
|
|
452
|
-
// `stepMap` is empty when ON_RUN_STEP_COMPLETED arrives and you
|
|
453
|
-
// get a "No run step or runId found for completed step event"
|
|
454
|
-
// warn for every tool call. The harness doesn't actually use the
|
|
455
|
-
// aggregated content, but feeding both events keeps logs clean.
|
|
456
|
-
[GraphEvents.ON_RUN_STEP]: {
|
|
457
|
-
handle: (
|
|
458
|
-
event: GraphEvents.ON_RUN_STEP,
|
|
459
|
-
data: t.StreamEventData
|
|
460
|
-
): void => {
|
|
461
|
-
aggregateContent({ event, data: data as t.RunStep });
|
|
462
|
-
},
|
|
463
|
-
},
|
|
464
|
-
[GraphEvents.ON_RUN_STEP_COMPLETED]: {
|
|
465
|
-
handle: (
|
|
466
|
-
event: GraphEvents.ON_RUN_STEP_COMPLETED,
|
|
467
|
-
data: t.StreamEventData
|
|
468
|
-
): void => {
|
|
469
|
-
aggregateContent({
|
|
470
|
-
event,
|
|
471
|
-
data: data as unknown as { result: t.ToolEndEvent },
|
|
472
|
-
});
|
|
473
|
-
},
|
|
474
|
-
},
|
|
475
|
-
};
|
|
476
|
-
|
|
457
|
+
function createOursSessionConfig(params: {
|
|
458
|
+
cwd: string;
|
|
459
|
+
sessionPath?: string;
|
|
460
|
+
overrides?: Partial<t.LocalExecutionConfig>;
|
|
461
|
+
checkpointing?: AgentSessionCheckpointing;
|
|
462
|
+
ephemeral?: boolean;
|
|
463
|
+
humanInTheLoop?: t.HumanInTheLoopConfig;
|
|
464
|
+
graphConfig?: t.RunConfig['graphConfig'];
|
|
465
|
+
}): AgentSessionConfig {
|
|
477
466
|
const llmConfig = getLLMConfig(PROVIDER);
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
467
|
+
return {
|
|
468
|
+
cwd: params.cwd,
|
|
469
|
+
sessionPath: params.sessionPath,
|
|
470
|
+
ephemeral: params.ephemeral,
|
|
471
|
+
checkpointing: params.checkpointing ?? false,
|
|
472
|
+
graphConfig: params.graphConfig ?? {
|
|
481
473
|
type: 'standard',
|
|
482
|
-
// NB: in the legacy path Run.createLegacyGraph rebuilds
|
|
483
|
-
// `clientOptions` from llmConfig (it ignores graphConfig.clientOptions),
|
|
484
|
-
// so promptCache lives here and not on a separate clientOptions field.
|
|
485
474
|
llmConfig: { ...llmConfig, model: MODEL, promptCache: true },
|
|
486
475
|
instructions:
|
|
487
476
|
'You are a coding assistant with local file tools. Use read_file, ' +
|
|
@@ -490,106 +479,199 @@ async function runOurs(
|
|
|
490
479
|
toolExecution: {
|
|
491
480
|
engine: 'local',
|
|
492
481
|
local: {
|
|
493
|
-
cwd,
|
|
482
|
+
cwd: params.cwd,
|
|
494
483
|
postEditSyntaxCheck: 'auto',
|
|
495
484
|
timeoutMs: 30_000,
|
|
496
|
-
...overrides,
|
|
485
|
+
...params.overrides,
|
|
497
486
|
},
|
|
498
487
|
},
|
|
499
|
-
|
|
488
|
+
humanInTheLoop: params.humanInTheLoop,
|
|
500
489
|
skipCleanup: true,
|
|
501
|
-
customHandlers,
|
|
502
490
|
};
|
|
491
|
+
}
|
|
503
492
|
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
)
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
493
|
+
function getToolCallName(toolCall: t.AgentToolCall): string {
|
|
494
|
+
return 'function' in toolCall
|
|
495
|
+
? toolCall.function.name
|
|
496
|
+
: (toolCall.name ?? '?');
|
|
497
|
+
}
|
|
498
|
+
|
|
499
|
+
function getToolCallArgs(toolCall: t.AgentToolCall): unknown {
|
|
500
|
+
return 'function' in toolCall ? toolCall.function.arguments : toolCall.args;
|
|
501
|
+
}
|
|
502
|
+
|
|
503
|
+
function collectToolCallsFromSteps(steps: t.RunStep[]): ToolCallObservation[] {
|
|
504
|
+
const calls: ToolCallObservation[] = [];
|
|
505
|
+
for (const step of steps) {
|
|
506
|
+
if (step.stepDetails.type !== StepTypes.TOOL_CALLS) {
|
|
507
|
+
continue;
|
|
508
|
+
}
|
|
509
|
+
for (const toolCall of step.stepDetails.tool_calls ?? []) {
|
|
510
|
+
calls.push({
|
|
511
|
+
name: getToolCallName(toolCall),
|
|
512
|
+
argsBytes: JSON.stringify(getToolCallArgs(toolCall) ?? {}).length,
|
|
513
|
+
isError: false,
|
|
514
|
+
});
|
|
521
515
|
}
|
|
522
|
-
} catch (err) {
|
|
523
|
-
errored = true;
|
|
524
|
-
errorMessage = (err as Error).message.slice(0, 500);
|
|
525
516
|
}
|
|
517
|
+
return calls;
|
|
518
|
+
}
|
|
526
519
|
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
520
|
+
function collectToolCallsFromMessages(
|
|
521
|
+
messages: BaseMessage[]
|
|
522
|
+
): ToolCallObservation[] {
|
|
523
|
+
const calls: ToolCallObservation[] = [];
|
|
524
|
+
for (const msg of messages) {
|
|
530
525
|
if (msg._getType() === 'ai') {
|
|
531
526
|
const ai = msg as unknown as {
|
|
532
527
|
tool_calls?: Array<{ name?: string; args?: unknown }>;
|
|
533
|
-
usage_metadata?: { input_tokens?: number; output_tokens?: number };
|
|
534
528
|
};
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
});
|
|
542
|
-
}
|
|
543
|
-
}
|
|
544
|
-
if (ai.usage_metadata != null) {
|
|
545
|
-
const reportedInput = ai.usage_metadata.input_tokens ?? 0;
|
|
546
|
-
outputTokens += ai.usage_metadata.output_tokens ?? 0;
|
|
547
|
-
const idu =
|
|
548
|
-
(ai.usage_metadata as unknown as {
|
|
549
|
-
input_token_details?: {
|
|
550
|
-
cache_read?: number;
|
|
551
|
-
cache_creation?: number;
|
|
552
|
-
};
|
|
553
|
-
}).input_token_details;
|
|
554
|
-
const cacheRead = idu?.cache_read ?? 0;
|
|
555
|
-
const cacheCreate = idu?.cache_creation ?? 0;
|
|
556
|
-
cacheReadTokens += cacheRead;
|
|
557
|
-
cacheWriteTokens += cacheCreate;
|
|
558
|
-
// The Anthropic adapter at src/llm/anthropic/utils/message_outputs.ts:31
|
|
559
|
-
// reports usage_metadata.input_tokens as the TOTAL prompt
|
|
560
|
-
// (input + cache_creation + cache_read), not just the uncached
|
|
561
|
-
// portion. Subtract cached fields so `inputTokens` here is
|
|
562
|
-
// apples-to-apples with pi's `input` field (uncached only).
|
|
563
|
-
const trulyUncached = Math.max(
|
|
564
|
-
0,
|
|
565
|
-
reportedInput - cacheRead - cacheCreate
|
|
566
|
-
);
|
|
567
|
-
inputTokens += trulyUncached;
|
|
529
|
+
for (const toolCall of ai.tool_calls ?? []) {
|
|
530
|
+
calls.push({
|
|
531
|
+
name: toolCall.name ?? '?',
|
|
532
|
+
argsBytes: JSON.stringify(toolCall.args ?? {}).length,
|
|
533
|
+
isError: false,
|
|
534
|
+
});
|
|
568
535
|
}
|
|
536
|
+
continue;
|
|
569
537
|
}
|
|
570
|
-
if (
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
538
|
+
if (
|
|
539
|
+
msg instanceof ToolMessage &&
|
|
540
|
+
msg.status === 'error' &&
|
|
541
|
+
calls.length > 0
|
|
542
|
+
) {
|
|
543
|
+
calls[calls.length - 1].isError = true;
|
|
574
544
|
}
|
|
575
545
|
}
|
|
546
|
+
return calls;
|
|
547
|
+
}
|
|
576
548
|
|
|
577
|
-
|
|
549
|
+
function collectTokenUsage(messages: BaseMessage[]): {
|
|
550
|
+
inputTokens: number;
|
|
551
|
+
outputTokens: number;
|
|
552
|
+
cacheReadTokens: number;
|
|
553
|
+
cacheWriteTokens: number;
|
|
554
|
+
} {
|
|
555
|
+
let inputTokens = 0;
|
|
556
|
+
let outputTokens = 0;
|
|
557
|
+
let cacheReadTokens = 0;
|
|
558
|
+
let cacheWriteTokens = 0;
|
|
559
|
+
for (const msg of messages) {
|
|
560
|
+
if (msg._getType() !== 'ai') {
|
|
561
|
+
continue;
|
|
562
|
+
}
|
|
563
|
+
const ai = msg as unknown as {
|
|
564
|
+
usage_metadata?: { input_tokens?: number; output_tokens?: number };
|
|
565
|
+
};
|
|
566
|
+
if (ai.usage_metadata == null) {
|
|
567
|
+
continue;
|
|
568
|
+
}
|
|
569
|
+
const reportedInput = ai.usage_metadata.input_tokens ?? 0;
|
|
570
|
+
outputTokens += ai.usage_metadata.output_tokens ?? 0;
|
|
571
|
+
const inputTokenDetails = (
|
|
572
|
+
ai.usage_metadata as unknown as {
|
|
573
|
+
input_token_details?: {
|
|
574
|
+
cache_read?: number;
|
|
575
|
+
cache_creation?: number;
|
|
576
|
+
};
|
|
577
|
+
}
|
|
578
|
+
).input_token_details;
|
|
579
|
+
const cacheRead = inputTokenDetails?.cache_read ?? 0;
|
|
580
|
+
const cacheCreate = inputTokenDetails?.cache_creation ?? 0;
|
|
581
|
+
cacheReadTokens += cacheRead;
|
|
582
|
+
cacheWriteTokens += cacheCreate;
|
|
583
|
+
inputTokens += Math.max(0, reportedInput - cacheRead - cacheCreate);
|
|
584
|
+
}
|
|
585
|
+
return {
|
|
586
|
+
inputTokens,
|
|
587
|
+
outputTokens,
|
|
588
|
+
cacheReadTokens,
|
|
589
|
+
cacheWriteTokens,
|
|
590
|
+
};
|
|
591
|
+
}
|
|
592
|
+
|
|
593
|
+
function getFinalAssistant(messages: BaseMessage[], fallback: string): string {
|
|
594
|
+
const lastAssistant = [...messages]
|
|
578
595
|
.reverse()
|
|
579
|
-
.find((
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
596
|
+
.find((message) => message._getType() === 'ai');
|
|
597
|
+
if (!lastAssistant) {
|
|
598
|
+
return fallback;
|
|
599
|
+
}
|
|
600
|
+
const content = lastAssistant.content;
|
|
601
|
+
if (typeof content === 'string') {
|
|
602
|
+
return content;
|
|
603
|
+
}
|
|
604
|
+
if (!Array.isArray(content)) {
|
|
605
|
+
return fallback;
|
|
606
|
+
}
|
|
607
|
+
return content
|
|
608
|
+
.map((block) => ('text' in block ? block.text : ''))
|
|
609
|
+
.filter(Boolean)
|
|
610
|
+
.join(' ');
|
|
611
|
+
}
|
|
612
|
+
|
|
613
|
+
async function runOurs(
|
|
614
|
+
task: Task,
|
|
615
|
+
cwd: string,
|
|
616
|
+
overrides: Partial<t.LocalExecutionConfig> = {}
|
|
617
|
+
): Promise<RunOutcome> {
|
|
618
|
+
const start = performance.now();
|
|
619
|
+
let errored = false;
|
|
620
|
+
let errorMessage: string | undefined;
|
|
621
|
+
let result: AgentSessionRunResult | undefined;
|
|
622
|
+
let finalResultPromise: Promise<AgentSessionRunResult> | undefined;
|
|
623
|
+
let sessionEvents = 0;
|
|
624
|
+
let sessionEntries = 0;
|
|
625
|
+
const sessionPath = join(cwd, '.librechat-agent-session.jsonl');
|
|
626
|
+
try {
|
|
627
|
+
const session = await createAgentSession(
|
|
628
|
+
createOursSessionConfig({
|
|
629
|
+
cwd,
|
|
630
|
+
sessionPath,
|
|
631
|
+
overrides,
|
|
632
|
+
checkpointing: false,
|
|
633
|
+
})
|
|
634
|
+
);
|
|
635
|
+
const stream = session.stream(task.prompt, {
|
|
636
|
+
runId: `compare-${Date.now()}`,
|
|
637
|
+
threadId: `compare-${Date.now()}`,
|
|
638
|
+
config: {
|
|
639
|
+
configurable: { provider: PROVIDER },
|
|
640
|
+
},
|
|
641
|
+
});
|
|
642
|
+
finalResultPromise = stream.finalResult();
|
|
643
|
+
for await (const event of stream) {
|
|
644
|
+
sessionEvents = Math.max(sessionEvents, event.sequence + 1);
|
|
645
|
+
}
|
|
646
|
+
result = await finalResultPromise;
|
|
647
|
+
sessionEntries = session.getSessionStore()?.getEntries().length ?? 0;
|
|
648
|
+
} catch (err) {
|
|
649
|
+
await finalResultPromise?.catch(() => undefined);
|
|
650
|
+
errored = true;
|
|
651
|
+
errorMessage = (err as Error).message.slice(0, 500);
|
|
652
|
+
}
|
|
653
|
+
|
|
654
|
+
const messages = result?.messages ?? [];
|
|
655
|
+
const usage = collectTokenUsage(messages);
|
|
656
|
+
let observedToolCalls = collectToolCallsFromSteps(result?.steps ?? []);
|
|
657
|
+
const messageToolCalls = collectToolCallsFromMessages(messages);
|
|
658
|
+
if (observedToolCalls.length === 0) {
|
|
659
|
+
observedToolCalls = messageToolCalls;
|
|
660
|
+
}
|
|
661
|
+
for (let i = 0; i < observedToolCalls.length; i++) {
|
|
662
|
+
observedToolCalls[i].isError = messageToolCalls[i]?.isError ?? false;
|
|
592
663
|
}
|
|
664
|
+
const inputTokens =
|
|
665
|
+
usage.inputTokens === 0 && result != null
|
|
666
|
+
? result.usage.inputTokens
|
|
667
|
+
: usage.inputTokens;
|
|
668
|
+
const outputTokens =
|
|
669
|
+
usage.outputTokens === 0 && result != null
|
|
670
|
+
? result.usage.outputTokens
|
|
671
|
+
: usage.outputTokens;
|
|
672
|
+
const cacheReadTokens = usage.cacheReadTokens;
|
|
673
|
+
const cacheWriteTokens = usage.cacheWriteTokens;
|
|
674
|
+
const finalAssistant = getFinalAssistant(messages, result?.text ?? '');
|
|
593
675
|
|
|
594
676
|
// Sonnet 4.5 pricing (USD per 1M tokens). Pi computes its own cost; we
|
|
595
677
|
// compute ours from the same per-turn breakdown so the cost columns are
|
|
@@ -615,15 +697,292 @@ async function runOurs(
|
|
|
615
697
|
finalAssistant: finalAssistant.slice(0, 500),
|
|
616
698
|
errored,
|
|
617
699
|
errorMessage,
|
|
700
|
+
sessionEvents,
|
|
701
|
+
sessionEntries,
|
|
702
|
+
sessionPath,
|
|
618
703
|
};
|
|
619
704
|
}
|
|
620
705
|
|
|
706
|
+
/* ------------------------------------------------------------------ */
|
|
707
|
+
/* Programmatic DX probes */
|
|
708
|
+
/* ------------------------------------------------------------------ */
|
|
709
|
+
|
|
710
|
+
async function withDxWorkspace<T>(
|
|
711
|
+
name: string,
|
|
712
|
+
run: (cwd: string) => Promise<T>
|
|
713
|
+
): Promise<T> {
|
|
714
|
+
const cwd = await mkdtemp(join(tmpdir(), `lc-dx-${name}-`));
|
|
715
|
+
try {
|
|
716
|
+
return await run(cwd);
|
|
717
|
+
} finally {
|
|
718
|
+
await rm(cwd, { recursive: true, force: true });
|
|
719
|
+
}
|
|
720
|
+
}
|
|
721
|
+
|
|
722
|
+
async function probeSessionFacade(): Promise<DxProbeResult> {
|
|
723
|
+
return withDxWorkspace('facade', async (cwd) => {
|
|
724
|
+
const session = await createAgentSession(
|
|
725
|
+
createOursSessionConfig({
|
|
726
|
+
cwd,
|
|
727
|
+
sessionPath: join(cwd, 'facade.jsonl'),
|
|
728
|
+
checkpointing: false,
|
|
729
|
+
})
|
|
730
|
+
);
|
|
731
|
+
const methods: Array<keyof typeof session> = [
|
|
732
|
+
'run',
|
|
733
|
+
'stream',
|
|
734
|
+
'clone',
|
|
735
|
+
'fork',
|
|
736
|
+
'branch',
|
|
737
|
+
'compact',
|
|
738
|
+
'resumeInterrupt',
|
|
739
|
+
];
|
|
740
|
+
const missing = methods.filter(
|
|
741
|
+
(method) => typeof session[method] !== 'function'
|
|
742
|
+
);
|
|
743
|
+
return {
|
|
744
|
+
ok: missing.length === 0,
|
|
745
|
+
detail:
|
|
746
|
+
missing.length === 0
|
|
747
|
+
? 'session exposes run/stream plus lifecycle methods'
|
|
748
|
+
: `missing: ${missing.join(', ')}`,
|
|
749
|
+
};
|
|
750
|
+
});
|
|
751
|
+
}
|
|
752
|
+
|
|
753
|
+
async function probeJsonlTree(): Promise<DxProbeResult> {
|
|
754
|
+
return withDxWorkspace('jsonl', async (cwd) => {
|
|
755
|
+
const session = await createAgentSession(
|
|
756
|
+
createOursSessionConfig({
|
|
757
|
+
cwd,
|
|
758
|
+
sessionPath: join(cwd, 'tree.jsonl'),
|
|
759
|
+
checkpointing: false,
|
|
760
|
+
})
|
|
761
|
+
);
|
|
762
|
+
const store = session.getSessionStore();
|
|
763
|
+
if (!store) {
|
|
764
|
+
return { ok: false, detail: 'store was not created' };
|
|
765
|
+
}
|
|
766
|
+
const prompt = await store.appendMessage(
|
|
767
|
+
new HumanMessage('rename calc_total')
|
|
768
|
+
);
|
|
769
|
+
const reply = await store.appendMessage(new AIMessage('done'));
|
|
770
|
+
await store.setLabel(prompt.id, 'coding prompt');
|
|
771
|
+
const path = store.getPath(reply.id);
|
|
772
|
+
const entries = store.getEntries();
|
|
773
|
+
const ok =
|
|
774
|
+
path.length === 2 &&
|
|
775
|
+
store.getTree().length === 1 &&
|
|
776
|
+
store.getLabel(prompt.id) === 'coding prompt' &&
|
|
777
|
+
entries.some((entry) => entry.type === 'session_state');
|
|
778
|
+
return {
|
|
779
|
+
ok,
|
|
780
|
+
detail: `${entries.length} JSONL entries, active path length ${path.length}`,
|
|
781
|
+
};
|
|
782
|
+
});
|
|
783
|
+
}
|
|
784
|
+
|
|
785
|
+
async function probeForkCloneBranch(): Promise<DxProbeResult> {
|
|
786
|
+
return withDxWorkspace('branch', async (cwd) => {
|
|
787
|
+
const session = await createAgentSession(
|
|
788
|
+
createOursSessionConfig({
|
|
789
|
+
cwd,
|
|
790
|
+
sessionPath: join(cwd, 'branch.jsonl'),
|
|
791
|
+
checkpointing: false,
|
|
792
|
+
})
|
|
793
|
+
);
|
|
794
|
+
const store = session.getSessionStore();
|
|
795
|
+
if (!store) {
|
|
796
|
+
return { ok: false, detail: 'store was not created' };
|
|
797
|
+
}
|
|
798
|
+
const prompt = await store.appendMessage(new HumanMessage('turn one'));
|
|
799
|
+
const reply = await store.appendMessage(new AIMessage('reply one'));
|
|
800
|
+
const clone = await session.clone({ name: 'clone' });
|
|
801
|
+
const fork = await session.fork(reply.id, {
|
|
802
|
+
position: 'before',
|
|
803
|
+
name: 'fork-before-reply',
|
|
804
|
+
});
|
|
805
|
+
await session.branch(prompt.id, { position: 'at' });
|
|
806
|
+
const clonePathLength = clone.getSessionStore()?.getPath().length ?? 0;
|
|
807
|
+
const forkLeafId = fork.getSessionStore()?.getLeafEntry()?.id;
|
|
808
|
+
const activeLeafId = store.getLeafEntry()?.id;
|
|
809
|
+
const ok =
|
|
810
|
+
clonePathLength === 2 &&
|
|
811
|
+
forkLeafId === prompt.id &&
|
|
812
|
+
activeLeafId === prompt.id;
|
|
813
|
+
return {
|
|
814
|
+
ok,
|
|
815
|
+
detail:
|
|
816
|
+
`clone path ${clonePathLength}, fork leaf ${forkLeafId ?? 'none'}, ` +
|
|
817
|
+
`active leaf ${activeLeafId ?? 'none'}`,
|
|
818
|
+
};
|
|
819
|
+
});
|
|
820
|
+
}
|
|
821
|
+
|
|
822
|
+
/**
 * DX probe: opens a session at an explicit JSONL path, appends one message,
 * then opens a second session with the same configuration and path.
 *
 * The probe passes when the second session reports the same `threadId` as the
 * first and its store yields exactly one message.
 *
 * @returns the probe verdict plus the resumed thread id and message count.
 */
async function probeResumeByPath(): Promise<DxProbeResult> {
  return withDxWorkspace('resume', async (cwd) => {
    const sessionPath = join(cwd, 'resume.jsonl');
    // Both sessions are opened with exactly the same configuration.
    const openAtPath = () =>
      createAgentSession(
        createOursSessionConfig({ cwd, sessionPath, checkpointing: false })
      );

    const first = await openAtPath();
    const firstStore = first.getSessionStore();
    if (!firstStore) {
      return { ok: false, detail: 'store was not created' };
    }
    await firstStore.appendMessage(new HumanMessage('persist me'));

    const resumed = await openAtPath();
    const resumedMessages = resumed.getSessionStore()?.getMessages() ?? [];

    const sameThread = resumed.threadId === first.threadId;
    const ok = sameThread && resumedMessages.length === 1;
    const detail = `thread ${resumed.threadId}, messages ${resumedMessages.length}`;
    return { ok, detail };
  });
}
|
|
845
|
+
|
|
846
|
+
/**
 * DX probe: checks that the session log and the checkpointer can be configured
 * independently.
 *
 * Two ephemeral sessions are created: one with a caller-supplied `MemorySaver`
 * as checkpointer, and one with checkpointing set to `false` (and
 * human-in-the-loop enabled). The probe passes when the first session hands
 * back the very same checkpointer instance, the second has none, and neither
 * session exposes a session store.
 *
 * @returns the probe verdict and a fixed success/failure description.
 */
async function probeCheckpointComposition(): Promise<DxProbeResult> {
  return withDxWorkspace('checkpointing', async (cwd) => {
    const checkpointer = new MemorySaver();

    const withInjected = await createAgentSession(
      createOursSessionConfig({
        cwd,
        ephemeral: true,
        checkpointing: { checkpointer },
      })
    );
    const withNothing = await createAgentSession(
      createOursSessionConfig({
        cwd,
        ephemeral: true,
        checkpointing: false,
        humanInTheLoop: { enabled: true },
      })
    );

    // Lazy predicates: `every` stops at the first failure, so the getters are
    // consulted in the same order and no further than an `&&` chain would.
    const expectations: Array<() => boolean> = [
      () => withInjected.getCheckpointer() === checkpointer,
      () => withInjected.getSessionStore() == null,
      () => withNothing.getCheckpointer() == null,
      () => withNothing.getSessionStore() == null,
    ];
    const ok = expectations.every((holds) => holds());

    if (ok) {
      return {
        ok,
        detail:
          'custom checkpointer injected; JSONL/checkpointing can both be disabled',
      };
    }
    return { ok, detail: 'composition check failed' };
  });
}
|
|
877
|
+
|
|
878
|
+
/**
 * DX probe: builds an `AgentSession` around a two-agent handoff graph
 * (`supervisor` -> `coder`) backed by a JSONL file, with checkpointing off.
 *
 * No model call is made here; the probe only inspects the constructed
 * session. It passes when the session exposes a session store and both
 * `stream` and `run` are functions.
 *
 * On failure the detail names the missing surface(s) instead of repeating the
 * success message, so a `[fail]` line in the report is never accompanied by a
 * "accepted ..." description.
 *
 * @returns the probe verdict and a description of what was (or was not) found.
 */
async function probeMultiAgentWrapping(): Promise<DxProbeResult> {
  return withDxWorkspace('multi', async (cwd) => {
    // Shared by both agents in the graph.
    const clientOptions = {
      modelName: MODEL,
      apiKey: process.env.ANTHROPIC_API_KEY,
    };
    const session = await createAgentSession(
      createOursSessionConfig({
        cwd,
        sessionPath: join(cwd, 'multi.jsonl'),
        checkpointing: false,
        graphConfig: {
          type: 'multi-agent',
          agents: [
            {
              agentId: 'supervisor',
              provider: PROVIDER,
              clientOptions,
              instructions: 'Route coding work to the specialist.',
            },
            {
              agentId: 'coder',
              provider: PROVIDER,
              clientOptions,
              instructions: 'Make precise code changes.',
            },
          ],
          edges: [
            {
              from: 'supervisor',
              to: 'coder',
              description: 'Delegate implementation work',
              edgeType: 'handoff',
            },
          ],
        },
      })
    );

    // Collect every missing piece so the failure detail is actionable.
    const missing: string[] = [];
    if (session.getSessionStore() == null) {
      missing.push('session store');
    }
    if (typeof session.stream !== 'function') {
      missing.push('stream()');
    }
    if (typeof session.run !== 'function') {
      missing.push('run()');
    }

    const ok = missing.length === 0;
    return {
      ok,
      detail: ok
        ? 'AgentSession accepted a multi-agent handoff graph without special harness code'
        : `multi-agent AgentSession is missing: ${missing.join(', ')}`,
    };
  });
}
|
|
927
|
+
|
|
928
|
+
/**
 * Registry of DX probes, in the order they are executed and reported.
 *
 * Each row is `[feature, pi, ours, run]`: the feature label, the one-line
 * description printed for the "pi" side, the one-line description printed for
 * the "ours" side, and the probe function that produces the live result.
 */
const DX_PROBES: DxProbe[] = (
  [
    [
      'Session facade',
      'SDK sessions expose a high-level run loop',
      'createAgentSession().run/stream wraps existing Run',
      probeSessionFacade,
    ],
    [
      'Append-only JSONL tree',
      'Native session JSONL tree',
      'JsonlSessionStore v1 with entries, labels, state',
      probeJsonlTree,
    ],
    [
      'Clone/fork/branch',
      'Clone, fork, and branch from tree positions',
      'clone(), fork(before/at), branch(before/at)',
      probeForkCloneBranch,
    ],
    [
      'Resume',
      'Resume by session id/path',
      'resume/open by exact JSONL path with stable threadId',
      probeResumeByPath,
    ],
    [
      'Composable state',
      'Session log is separate from execution provider state',
      'JSONL optional; LangGraph checkpointer injectable or off',
      probeCheckpointComposition,
    ],
    [
      'Multi-agent/subagent surface',
      'Coding-agent session loop',
      'Same AgentSession facade wraps multi-agent graphs and tools',
      probeMultiAgentWrapping,
    ],
  ] as Array<[string, string, string, DxProbe['run']]>
).map(([feature, pi, ours, run]) => ({ feature, pi, ours, run }));
|
|
966
|
+
|
|
967
|
+
/**
 * Runs every entry of `DX_PROBES` one after another and prints a short report
 * per probe: a status line followed by the "pi", "ours" and "live" lines.
 *
 * @returns `true` only when every probe reported `ok`.
 */
async function runDxProbes(): Promise<boolean> {
  console.log('\n================ DX SESSION PROBES ================');
  let allPassed = true;
  for (const entry of DX_PROBES) {
    // Probes are awaited sequentially so their output is not interleaved.
    const outcome = await entry.run();
    allPassed = allPassed && outcome.ok;

    const status = outcome.ok ? 'ok' : 'fail';
    console.log(`\n[${status}] ${entry.feature}`);
    console.log(` pi: ${entry.pi}`);
    console.log(` ours: ${entry.ours}`);
    console.log(` live: ${outcome.detail}`);
  }
  return allPassed;
}
|
|
980
|
+
|
|
621
981
|
/* ------------------------------------------------------------------ */
|
|
622
982
|
/* Harness */
|
|
623
983
|
/* ------------------------------------------------------------------ */
|
|
624
984
|
|
|
625
985
|
async function setupWorkspace(task: Task): Promise<string> {
|
|
626
|
-
const { mkdir } = await import('fs/promises');
|
|
627
986
|
const dir = await mkdtemp(join(tmpdir(), 'lc-compare-'));
|
|
628
987
|
for (const [relPath, content] of Object.entries(task.seed)) {
|
|
629
988
|
const abs = join(dir, relPath);
|
|
@@ -642,7 +1001,6 @@ async function setupWorkspace(task: Task): Promise<string> {
|
|
|
642
1001
|
}
|
|
643
1002
|
|
|
644
1003
|
async function symlinkRepoNodeModules(cwd: string): Promise<void> {
|
|
645
|
-
const { symlink } = await import('fs/promises');
|
|
646
1004
|
const repo = resolve(process.cwd(), 'node_modules');
|
|
647
1005
|
await symlink(repo, join(cwd, 'node_modules'), 'dir').catch(() => {
|
|
648
1006
|
/* fall through; tsc just won't be available */
|
|
@@ -655,9 +1013,7 @@ function summariseToolCalls(calls: ToolCallObservation[]): string {
|
|
|
655
1013
|
for (const c of calls) {
|
|
656
1014
|
grouped.set(c.name, (grouped.get(c.name) ?? 0) + 1);
|
|
657
1015
|
}
|
|
658
|
-
const inline = [...grouped.entries()]
|
|
659
|
-
.map(([n, c]) => `${n}×${c}`)
|
|
660
|
-
.join(', ');
|
|
1016
|
+
const inline = [...grouped.entries()].map(([n, c]) => `${n}×${c}`).join(', ');
|
|
661
1017
|
const errors = calls.filter((c) => c.isError).length;
|
|
662
1018
|
return `${calls.length} call(s) [${inline}]${errors > 0 ? ` (${errors} errored)` : ''}`;
|
|
663
1019
|
}
|
|
@@ -677,10 +1033,28 @@ function avg(xs: number[]): number {
|
|
|
677
1033
|
return xs.length === 0 ? 0 : xs.reduce((a, b) => a + b, 0) / xs.length;
|
|
678
1034
|
}
|
|
679
1035
|
|
|
1036
|
+
/**
 * Narrows the task list according to the `COMPARE_TASKS` environment variable.
 *
 * The variable is a comma-separated list of tokens. Tokens are trimmed and
 * lower-cased, and empty tokens are ignored. A task is kept when its
 * lower-cased name starts with any token (an exact match being the trivial
 * prefix case). The original order of `tasks` is preserved.
 *
 * @param tasks every task the harness knows about.
 * @returns `tasks` itself when the variable is unset or blank; otherwise a new
 *          array holding only the matching tasks (possibly empty).
 */
function selectTasks(tasks: Task[]): Task[] {
  const spec = process.env.COMPARE_TASKS?.trim() ?? '';
  if (spec === '') {
    return tasks;
  }

  const wanted: string[] = [];
  for (const piece of spec.split(',')) {
    const token = piece.trim().toLowerCase();
    if (token.length > 0) {
      wanted.push(token);
    }
  }

  return tasks.filter((task) => {
    const lowered = task.name.toLowerCase();
    return wanted.some((token) => lowered.startsWith(token));
  });
}
|
|
1050
|
+
|
|
680
1051
|
async function runOnce(
|
|
681
1052
|
task: Task,
|
|
682
1053
|
side: 'pi' | 'ours'
|
|
683
|
-
): Promise<{
|
|
1054
|
+
): Promise<{
|
|
1055
|
+
outcome: RunOutcome;
|
|
1056
|
+
verify: { ok: boolean; detail: string };
|
|
1057
|
+
} | null> {
|
|
684
1058
|
if (task.skip === side) return null;
|
|
685
1059
|
const cwd = await setupWorkspace(task);
|
|
686
1060
|
const outcome =
|
|
@@ -702,10 +1076,28 @@ async function runOnce(
|
|
|
702
1076
|
|
|
703
1077
|
async function main(): Promise<void> {
|
|
704
1078
|
const ITERS = Math.max(1, Number(process.env.COMPARE_ITERS ?? '1'));
|
|
1079
|
+
const DX_ONLY = process.env.COMPARE_DX_ONLY === '1';
|
|
1080
|
+
const selectedTasks = selectTasks(TASKS);
|
|
705
1081
|
console.log(`pi binary: ${PI_BIN}`);
|
|
706
1082
|
console.log(`model: ${MODEL}`);
|
|
707
1083
|
console.log(`provider: ${PROVIDER}`);
|
|
708
1084
|
console.log(`iters: ${ITERS}`);
|
|
1085
|
+
console.log(`dx only: ${DX_ONLY ? 'yes' : 'no'}`);
|
|
1086
|
+
console.log(
|
|
1087
|
+
`tasks: ${
|
|
1088
|
+
DX_ONLY ? 'DX probes' : selectedTasks.map((task) => task.name).join(', ')
|
|
1089
|
+
}`
|
|
1090
|
+
);
|
|
1091
|
+
const dxPassed = await runDxProbes();
|
|
1092
|
+
if (DX_ONLY) {
|
|
1093
|
+
process.exitCode = dxPassed ? 0 : 1;
|
|
1094
|
+
return;
|
|
1095
|
+
}
|
|
1096
|
+
if (selectedTasks.length === 0) {
|
|
1097
|
+
throw new Error(
|
|
1098
|
+
`No tasks matched COMPARE_TASKS=${process.env.COMPARE_TASKS}`
|
|
1099
|
+
);
|
|
1100
|
+
}
|
|
709
1101
|
|
|
710
1102
|
const results: Array<{
|
|
711
1103
|
task: Task;
|
|
@@ -713,7 +1105,7 @@ async function main(): Promise<void> {
|
|
|
713
1105
|
ours: AggregatedSide;
|
|
714
1106
|
}> = [];
|
|
715
1107
|
|
|
716
|
-
for (const task of
|
|
1108
|
+
for (const task of selectedTasks) {
|
|
717
1109
|
console.log(`\n========== ${task.name} ==========`);
|
|
718
1110
|
console.log(task.description);
|
|
719
1111
|
|
|
@@ -733,7 +1125,9 @@ async function main(): Promise<void> {
|
|
|
733
1125
|
`cacheR=${piRes.outcome.cacheReadTokens} cacheW=${piRes.outcome.cacheWriteTokens} ` +
|
|
734
1126
|
`$${piRes.outcome.cost.toFixed(4)}`
|
|
735
1127
|
);
|
|
736
|
-
if (piRes.outcome.errored)
|
|
1128
|
+
if (piRes.outcome.errored) {
|
|
1129
|
+
console.log(` err: ${piRes.outcome.errorMessage}`);
|
|
1130
|
+
}
|
|
737
1131
|
} else {
|
|
738
1132
|
console.log(`[pi]${tag} (skipped)`);
|
|
739
1133
|
}
|
|
@@ -746,9 +1140,12 @@ async function main(): Promise<void> {
|
|
|
746
1140
|
`[ours]${tag} ${oursRes.outcome.errored ? 'ERROR' : oursRes.verify.ok ? 'ok' : 'fail'} ` +
|
|
747
1141
|
`${fmtMs(oursRes.outcome.wallMs)} ${summariseToolCalls(oursRes.outcome.toolCalls)} ` +
|
|
748
1142
|
`in=${oursRes.outcome.inputTokens} out=${oursRes.outcome.outputTokens} ` +
|
|
749
|
-
`cacheR=${oursRes.outcome.cacheReadTokens} cacheW=${oursRes.outcome.cacheWriteTokens}`
|
|
1143
|
+
`cacheR=${oursRes.outcome.cacheReadTokens} cacheW=${oursRes.outcome.cacheWriteTokens} ` +
|
|
1144
|
+
`events=${oursRes.outcome.sessionEvents ?? 0} jsonl=${oursRes.outcome.sessionEntries ?? 0}`
|
|
750
1145
|
);
|
|
751
|
-
if (oursRes.outcome.errored)
|
|
1146
|
+
if (oursRes.outcome.errored) {
|
|
1147
|
+
console.log(` err: ${oursRes.outcome.errorMessage}`);
|
|
1148
|
+
}
|
|
752
1149
|
} else {
|
|
753
1150
|
console.log(`[ours]${tag} (skipped)`);
|
|
754
1151
|
}
|
|
@@ -798,22 +1195,40 @@ async function main(): Promise<void> {
|
|
|
798
1195
|
cols.push([r.task.name, 'verify', fmtVerify(r.pi), fmtVerify(r.ours)]);
|
|
799
1196
|
cols.push(['', 'wall', fmtSideMs(r.pi), fmtSideMs(r.ours)]);
|
|
800
1197
|
cols.push(['', 'tool calls', fmtSideCalls(r.pi), fmtSideCalls(r.ours)]);
|
|
801
|
-
cols.push([
|
|
802
|
-
|
|
803
|
-
|
|
804
|
-
|
|
1198
|
+
cols.push([
|
|
1199
|
+
'',
|
|
1200
|
+
'input new',
|
|
1201
|
+
fmtSide(r.pi, 'inputTokens'),
|
|
1202
|
+
fmtSide(r.ours, 'inputTokens'),
|
|
1203
|
+
]);
|
|
1204
|
+
cols.push([
|
|
1205
|
+
'',
|
|
1206
|
+
'cache read',
|
|
1207
|
+
fmtSide(r.pi, 'cacheReadTokens'),
|
|
1208
|
+
fmtSide(r.ours, 'cacheReadTokens'),
|
|
1209
|
+
]);
|
|
1210
|
+
cols.push([
|
|
1211
|
+
'',
|
|
1212
|
+
'cache write',
|
|
1213
|
+
fmtSide(r.pi, 'cacheWriteTokens'),
|
|
1214
|
+
fmtSide(r.ours, 'cacheWriteTokens'),
|
|
1215
|
+
]);
|
|
1216
|
+
cols.push([
|
|
1217
|
+
'',
|
|
1218
|
+
'output tok',
|
|
1219
|
+
fmtSide(r.pi, 'outputTokens'),
|
|
1220
|
+
fmtSide(r.ours, 'outputTokens'),
|
|
1221
|
+
]);
|
|
805
1222
|
cols.push(['', 'cost', fmtCost(r.pi), fmtCost(r.ours)]);
|
|
1223
|
+
cols.push(['', 'session events', 'N/A', fmtSide(r.ours, 'sessionEvents')]);
|
|
1224
|
+
cols.push(['', 'jsonl entries', 'N/A', fmtSide(r.ours, 'sessionEntries')]);
|
|
806
1225
|
}
|
|
807
1226
|
|
|
808
1227
|
const widths = [0, 0, 0, 0].map((_, i) =>
|
|
809
1228
|
Math.max(...cols.map((row) => row[i].length))
|
|
810
1229
|
);
|
|
811
1230
|
for (const row of cols) {
|
|
812
|
-
console.log(
|
|
813
|
-
row
|
|
814
|
-
.map((cell, i) => cell.padEnd(widths[i]))
|
|
815
|
-
.join(' ')
|
|
816
|
-
);
|
|
1231
|
+
console.log(row.map((cell, i) => cell.padEnd(widths[i])).join(' '));
|
|
817
1232
|
}
|
|
818
1233
|
|
|
819
1234
|
// Aggregate verify counts across all iters of all non-skipped tasks.
|
|
@@ -824,7 +1239,11 @@ async function main(): Promise<void> {
|
|
|
824
1239
|
console.log(
|
|
825
1240
|
`\nOverall: pi ${piPassed}/${piVerifies.length}, ours ${oursPassed}/${oursVerifies.length}.`
|
|
826
1241
|
);
|
|
827
|
-
if (
|
|
1242
|
+
if (
|
|
1243
|
+
!dxPassed ||
|
|
1244
|
+
piPassed < piVerifies.length ||
|
|
1245
|
+
oursPassed < oursVerifies.length
|
|
1246
|
+
) {
|
|
828
1247
|
process.exitCode = 1;
|
|
829
1248
|
}
|
|
830
1249
|
}
|