@librechat/agents 3.1.86 → 3.1.88

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (160) hide show
  1. package/README.md +69 -0
  2. package/dist/cjs/events.cjs +23 -0
  3. package/dist/cjs/events.cjs.map +1 -1
  4. package/dist/cjs/graphs/Graph.cjs +133 -18
  5. package/dist/cjs/graphs/Graph.cjs.map +1 -1
  6. package/dist/cjs/graphs/MultiAgentGraph.cjs +1 -1
  7. package/dist/cjs/graphs/MultiAgentGraph.cjs.map +1 -1
  8. package/dist/cjs/llm/anthropic/index.cjs +251 -53
  9. package/dist/cjs/llm/anthropic/index.cjs.map +1 -1
  10. package/dist/cjs/llm/init.cjs +1 -5
  11. package/dist/cjs/llm/init.cjs.map +1 -1
  12. package/dist/cjs/llm/openai/index.cjs +113 -24
  13. package/dist/cjs/llm/openai/index.cjs.map +1 -1
  14. package/dist/cjs/llm/openai/utils/index.cjs.map +1 -1
  15. package/dist/cjs/llm/openrouter/index.cjs +3 -1
  16. package/dist/cjs/llm/openrouter/index.cjs.map +1 -1
  17. package/dist/cjs/main.cjs +18 -5
  18. package/dist/cjs/main.cjs.map +1 -1
  19. package/dist/cjs/openai/index.cjs +253 -0
  20. package/dist/cjs/openai/index.cjs.map +1 -0
  21. package/dist/cjs/responses/index.cjs +448 -0
  22. package/dist/cjs/responses/index.cjs.map +1 -0
  23. package/dist/cjs/run.cjs +108 -7
  24. package/dist/cjs/run.cjs.map +1 -1
  25. package/dist/cjs/session/AgentSession.cjs +1057 -0
  26. package/dist/cjs/session/AgentSession.cjs.map +1 -0
  27. package/dist/cjs/session/JsonlSessionStore.cjs +425 -0
  28. package/dist/cjs/session/JsonlSessionStore.cjs.map +1 -0
  29. package/dist/cjs/session/handlers.cjs +221 -0
  30. package/dist/cjs/session/handlers.cjs.map +1 -0
  31. package/dist/cjs/session/ids.cjs +22 -0
  32. package/dist/cjs/session/ids.cjs.map +1 -0
  33. package/dist/cjs/session/messageSerialization.cjs +179 -0
  34. package/dist/cjs/session/messageSerialization.cjs.map +1 -0
  35. package/dist/cjs/stream.cjs +475 -11
  36. package/dist/cjs/stream.cjs.map +1 -1
  37. package/dist/cjs/summarization/node.cjs +1 -1
  38. package/dist/cjs/summarization/node.cjs.map +1 -1
  39. package/dist/cjs/tools/ToolNode.cjs +177 -59
  40. package/dist/cjs/tools/ToolNode.cjs.map +1 -1
  41. package/dist/cjs/tools/eagerEventExecution.cjs +113 -0
  42. package/dist/cjs/tools/eagerEventExecution.cjs.map +1 -0
  43. package/dist/cjs/tools/handlers.cjs +1 -1
  44. package/dist/cjs/tools/handlers.cjs.map +1 -1
  45. package/dist/cjs/tools/streamedToolCallSeals.cjs +42 -0
  46. package/dist/cjs/tools/streamedToolCallSeals.cjs.map +1 -0
  47. package/dist/esm/events.mjs +23 -1
  48. package/dist/esm/events.mjs.map +1 -1
  49. package/dist/esm/graphs/Graph.mjs +133 -18
  50. package/dist/esm/graphs/Graph.mjs.map +1 -1
  51. package/dist/esm/graphs/MultiAgentGraph.mjs +1 -1
  52. package/dist/esm/graphs/MultiAgentGraph.mjs.map +1 -1
  53. package/dist/esm/llm/anthropic/index.mjs +251 -53
  54. package/dist/esm/llm/anthropic/index.mjs.map +1 -1
  55. package/dist/esm/llm/init.mjs +1 -5
  56. package/dist/esm/llm/init.mjs.map +1 -1
  57. package/dist/esm/llm/openai/index.mjs +113 -25
  58. package/dist/esm/llm/openai/index.mjs.map +1 -1
  59. package/dist/esm/llm/openai/utils/index.mjs.map +1 -1
  60. package/dist/esm/llm/openrouter/index.mjs +4 -2
  61. package/dist/esm/llm/openrouter/index.mjs.map +1 -1
  62. package/dist/esm/main.mjs +5 -1
  63. package/dist/esm/main.mjs.map +1 -1
  64. package/dist/esm/openai/index.mjs +246 -0
  65. package/dist/esm/openai/index.mjs.map +1 -0
  66. package/dist/esm/responses/index.mjs +440 -0
  67. package/dist/esm/responses/index.mjs.map +1 -0
  68. package/dist/esm/run.mjs +108 -7
  69. package/dist/esm/run.mjs.map +1 -1
  70. package/dist/esm/session/AgentSession.mjs +1054 -0
  71. package/dist/esm/session/AgentSession.mjs.map +1 -0
  72. package/dist/esm/session/JsonlSessionStore.mjs +422 -0
  73. package/dist/esm/session/JsonlSessionStore.mjs.map +1 -0
  74. package/dist/esm/session/handlers.mjs +219 -0
  75. package/dist/esm/session/handlers.mjs.map +1 -0
  76. package/dist/esm/session/ids.mjs +17 -0
  77. package/dist/esm/session/ids.mjs.map +1 -0
  78. package/dist/esm/session/messageSerialization.mjs +173 -0
  79. package/dist/esm/session/messageSerialization.mjs.map +1 -0
  80. package/dist/esm/stream.mjs +476 -12
  81. package/dist/esm/stream.mjs.map +1 -1
  82. package/dist/esm/summarization/node.mjs +1 -1
  83. package/dist/esm/summarization/node.mjs.map +1 -1
  84. package/dist/esm/tools/ToolNode.mjs +177 -59
  85. package/dist/esm/tools/ToolNode.mjs.map +1 -1
  86. package/dist/esm/tools/eagerEventExecution.mjs +107 -0
  87. package/dist/esm/tools/eagerEventExecution.mjs.map +1 -0
  88. package/dist/esm/tools/handlers.mjs +1 -1
  89. package/dist/esm/tools/handlers.mjs.map +1 -1
  90. package/dist/esm/tools/streamedToolCallSeals.mjs +36 -0
  91. package/dist/esm/tools/streamedToolCallSeals.mjs.map +1 -0
  92. package/dist/types/events.d.ts +1 -0
  93. package/dist/types/graphs/Graph.d.ts +24 -9
  94. package/dist/types/index.d.ts +1 -0
  95. package/dist/types/llm/openai/index.d.ts +1 -0
  96. package/dist/types/openai/index.d.ts +75 -0
  97. package/dist/types/responses/index.d.ts +97 -0
  98. package/dist/types/run.d.ts +2 -0
  99. package/dist/types/session/AgentSession.d.ts +32 -0
  100. package/dist/types/session/JsonlSessionStore.d.ts +67 -0
  101. package/dist/types/session/handlers.d.ts +8 -0
  102. package/dist/types/session/ids.d.ts +4 -0
  103. package/dist/types/session/index.d.ts +5 -0
  104. package/dist/types/session/messageSerialization.d.ts +7 -0
  105. package/dist/types/session/types.d.ts +191 -0
  106. package/dist/types/tools/ToolNode.d.ts +12 -1
  107. package/dist/types/tools/eagerEventExecution.d.ts +23 -0
  108. package/dist/types/tools/streamedToolCallSeals.d.ts +13 -0
  109. package/dist/types/types/hitl.d.ts +4 -0
  110. package/dist/types/types/run.d.ts +11 -1
  111. package/dist/types/types/tools.d.ts +36 -0
  112. package/package.json +19 -2
  113. package/src/__tests__/stream.eagerEventExecution.test.ts +2571 -0
  114. package/src/events.ts +29 -0
  115. package/src/graphs/Graph.ts +224 -50
  116. package/src/graphs/MultiAgentGraph.ts +1 -1
  117. package/src/graphs/__tests__/composition.smoke.test.ts +30 -0
  118. package/src/index.ts +3 -0
  119. package/src/llm/anthropic/index.ts +356 -84
  120. package/src/llm/anthropic/llm.spec.ts +64 -0
  121. package/src/llm/custom-chat-models.smoke.test.ts +175 -4
  122. package/src/llm/openai/contentBlocks.test.ts +35 -0
  123. package/src/llm/openai/deepseek.test.ts +201 -2
  124. package/src/llm/openai/index.ts +171 -26
  125. package/src/llm/openai/utils/index.ts +22 -0
  126. package/src/llm/openrouter/index.ts +4 -2
  127. package/src/openai/__tests__/openai.test.ts +337 -0
  128. package/src/openai/index.ts +404 -0
  129. package/src/responses/__tests__/responses.test.ts +652 -0
  130. package/src/responses/index.ts +677 -0
  131. package/src/run.ts +158 -8
  132. package/src/scripts/compare_pi_vs_ours.ts +592 -173
  133. package/src/scripts/session_live.ts +548 -0
  134. package/src/session/AgentSession.ts +1432 -0
  135. package/src/session/JsonlSessionStore.ts +572 -0
  136. package/src/session/__tests__/JsonlSessionStore.test.ts +1410 -0
  137. package/src/session/__tests__/handlers.test.ts +161 -0
  138. package/src/session/handlers.ts +272 -0
  139. package/src/session/ids.ts +17 -0
  140. package/src/session/index.ts +44 -0
  141. package/src/session/messageSerialization.ts +207 -0
  142. package/src/session/types.ts +275 -0
  143. package/src/specs/custom-event-await.test.ts +89 -0
  144. package/src/specs/summarization.test.ts +1 -1
  145. package/src/stream.ts +756 -48
  146. package/src/summarization/node.ts +1 -1
  147. package/src/tools/ToolNode.ts +299 -126
  148. package/src/tools/__tests__/ToolNode.eagerEventExecution.test.ts +373 -0
  149. package/src/tools/__tests__/handlers.test.ts +2 -1
  150. package/src/tools/__tests__/hitl.test.ts +206 -110
  151. package/src/tools/eagerEventExecution.ts +153 -0
  152. package/src/tools/handlers.ts +8 -4
  153. package/src/tools/streamedToolCallSeals.ts +57 -0
  154. package/src/types/hitl.ts +4 -0
  155. package/src/types/run.ts +11 -0
  156. package/src/types/tools.ts +36 -0
  157. package/dist/cjs/llm/text.cjs +0 -69
  158. package/dist/cjs/llm/text.cjs.map +0 -1
  159. package/dist/esm/llm/text.mjs +0 -67
  160. package/dist/esm/llm/text.mjs.map +0 -1
@@ -1,7 +1,7 @@
1
1
  /**
2
2
  * src/scripts/compare_pi_vs_ours.ts
3
3
  *
4
- * Side-by-side runs: pi-mono's `pi` CLI vs our local engine, same
4
+ * Side-by-side runs: pi-mono's `pi` CLI vs our AgentSession facade, same
5
5
  * task, same model, two parallel temp workspaces. We track:
6
6
  *
7
7
  * - tool calls (name + args length, ordered)
@@ -10,7 +10,8 @@
10
10
  * - whether the final on-disk state matches the expected outcome
11
11
  *
12
12
  * The tasks intentionally probe areas where we expect the local
13
- * engine to behave differently:
13
+ * engine to behave differently, while the preflight probes compare the
14
+ * programmatic session DX now exposed by the SDK:
14
15
  *
15
16
  * T1 simple-edit — both should one-shot
16
17
  * T2 fuzzy-edit — model emits an `oldText` with off-by-
@@ -32,25 +33,35 @@ config();
32
33
  import { spawn } from 'child_process';
33
34
  import { homedir, tmpdir } from 'os';
34
35
  import { join, resolve } from 'path';
35
- import { mkdtemp, readFile, rm, writeFile } from 'fs/promises';
36
+ import {
37
+ copyFile,
38
+ mkdir,
39
+ mkdtemp,
40
+ readFile,
41
+ rm,
42
+ stat,
43
+ symlink,
44
+ writeFile,
45
+ } from 'fs/promises';
36
46
  import { performance } from 'perf_hooks';
37
- import { HumanMessage, ToolMessage } from '@langchain/core/messages';
47
+ import { AIMessage, HumanMessage, ToolMessage } from '@langchain/core/messages';
48
+ import { MemorySaver } from '@langchain/langgraph';
38
49
  import type { BaseMessage } from '@langchain/core/messages';
50
+ import type {
51
+ AgentSessionCheckpointing,
52
+ AgentSessionConfig,
53
+ AgentSessionRunResult,
54
+ } from '@/session';
39
55
  import type * as t from '@/types';
40
- import { ChatModelStreamHandler, createContentAggregator } from '@/stream';
41
- import { ToolEndHandler, ModelEndHandler } from '@/events';
42
56
  import { getLLMConfig } from '@/utils/llmConfig';
43
- import { GraphEvents, Providers } from '@/common';
44
- import { Run } from '@/run';
57
+ import { Providers, StepTypes } from '@/common';
58
+ import { createAgentSession } from '@/session';
45
59
 
46
60
  const PROVIDER = Providers.ANTHROPIC;
47
61
  const MODEL = 'claude-sonnet-4-5';
48
62
  const PI_BIN =
49
63
  process.env.PI_BIN ??
50
- resolve(
51
- homedir(),
52
- 'Projects/pi-mono/packages/coding-agent/dist/cli.js'
53
- );
64
+ resolve(homedir(), 'Projects/pi-mono/packages/coding-agent/dist/cli.js');
54
65
 
55
66
  interface Task {
56
67
  name: string;
@@ -95,6 +106,21 @@ interface RunOutcome {
95
106
  finalAssistant: string;
96
107
  errored: boolean;
97
108
  errorMessage?: string;
109
+ sessionEvents?: number;
110
+ sessionEntries?: number;
111
+ sessionPath?: string;
112
+ }
113
+
114
+ interface DxProbeResult {
115
+ ok: boolean;
116
+ detail: string;
117
+ }
118
+
119
+ interface DxProbe {
120
+ feature: string;
121
+ pi: string;
122
+ ours: string;
123
+ run: () => Promise<DxProbeResult>;
98
124
  }
99
125
 
100
126
  const TASKS: Task[] = [
@@ -102,8 +128,7 @@ const TASKS: Task[] = [
102
128
  name: 'T1 simple-edit',
103
129
  description: 'Single literal substitution in an existing file.',
104
130
  seed: {
105
- 'greet.py':
106
- 'def greet(name):\n return f"Hello, {name}!"\n',
131
+ 'greet.py': 'def greet(name):\n return f"Hello, {name}!"\n',
107
132
  },
108
133
  prompt:
109
134
  'Edit greet.py: change the greeting from "Hello" to "Hi". ' +
@@ -163,12 +188,11 @@ const TASKS: Task[] = [
163
188
  null,
164
189
  2
165
190
  ),
166
- 'broken.ts':
167
- 'export const port: number = "not a number";\n',
191
+ 'broken.ts': 'export const port: number = "not a number";\n',
168
192
  },
169
193
  prompt:
170
194
  'broken.ts has a type error. Fix it so the project typechecks cleanly. ' +
171
- 'After fixing, verify by running the project\'s typecheck (or `compile_check` if available). ' +
195
+ "After fixing, verify by running the project's typecheck (or `compile_check` if available). " +
172
196
  'Reply with "done".',
173
197
  verify: async (cwd) => {
174
198
  const text = await readFile(join(cwd, 'broken.ts'), 'utf8').catch(
@@ -224,8 +248,12 @@ const TASKS: Task[] = [
224
248
  'Rename the exported function `calc_total` to `calculateTotal` across src/lib.ts, ' +
225
249
  'src/index.ts, and src/index.test.ts. Update every reference. Reply "done" when finished.',
226
250
  verify: async (cwd) => {
227
- const lib = await readFile(join(cwd, 'src/lib.ts'), 'utf8').catch(() => '');
228
- const idx = await readFile(join(cwd, 'src/index.ts'), 'utf8').catch(() => '');
251
+ const lib = await readFile(join(cwd, 'src/lib.ts'), 'utf8').catch(
252
+ () => ''
253
+ );
254
+ const idx = await readFile(join(cwd, 'src/index.ts'), 'utf8').catch(
255
+ () => ''
256
+ );
229
257
  const tst = await readFile(join(cwd, 'src/index.test.ts'), 'utf8').catch(
230
258
  () => ''
231
259
  );
@@ -240,9 +268,7 @@ const TASKS: Task[] = [
240
268
  const ok = allRenamed && noOldName;
241
269
  return {
242
270
  ok,
243
- detail: ok
244
- ? ''
245
- : `lib:\n${lib}\nindex:\n${idx}\ntest:\n${tst}`,
271
+ detail: ok ? '' : `lib:\n${lib}\nindex:\n${idx}\ntest:\n${tst}`,
246
272
  };
247
273
  },
248
274
  },
@@ -252,7 +278,6 @@ const TASKS: Task[] = [
252
278
  'Reads a PNG and describes it. Ours embeds via attachReadAttachments + image_url block; pi has no equivalent and is skipped.',
253
279
  seed: {},
254
280
  setup: async (cwd) => {
255
- const { copyFile } = await import('fs/promises');
256
281
  // Use a real PNG (Anthropic refuses tiny 1x1 PNGs with "Could not
257
282
  // process image"). Try a few well-known macOS app icons; fall back to
258
283
  // any *.png we can find under /System.
@@ -277,7 +302,6 @@ const TASKS: Task[] = [
277
302
  // The verify step is soft — we just check the file is still on disk
278
303
  // (the agent shouldn't have deleted it) and the script-level error
279
304
  // tracking will fail this task if Anthropic refused the request.
280
- const { stat } = await import('fs/promises');
281
305
  try {
282
306
  await stat(join(cwd, 'sample.png'));
283
307
  return { ok: true, detail: '' };
@@ -430,58 +454,23 @@ async function runPi(task: Task, cwd: string): Promise<RunOutcome> {
430
454
  /* Our local-engine runner */
431
455
  /* ------------------------------------------------------------------ */
432
456
 
433
- async function runOurs(
434
- task: Task,
435
- cwd: string,
436
- overrides: Partial<t.LocalExecutionConfig> = {}
437
- ): Promise<RunOutcome> {
438
- const start = performance.now();
439
- const conversation: BaseMessage[] = [];
440
- const observedToolCalls: ToolCallObservation[] = [];
441
- let inputTokens = 0;
442
- let outputTokens = 0;
443
- let cacheReadTokens = 0;
444
- let cacheWriteTokens = 0;
445
-
446
- const { aggregateContent } = createContentAggregator();
447
- const customHandlers = {
448
- [GraphEvents.TOOL_END]: new ToolEndHandler(),
449
- [GraphEvents.CHAT_MODEL_END]: new ModelEndHandler(),
450
- [GraphEvents.CHAT_MODEL_STREAM]: new ChatModelStreamHandler(),
451
- // ON_RUN_STEP must be forwarded too — without it the aggregator's
452
- // `stepMap` is empty when ON_RUN_STEP_COMPLETED arrives and you
453
- // get a "No run step or runId found for completed step event"
454
- // warn for every tool call. The harness doesn't actually use the
455
- // aggregated content, but feeding both events keeps logs clean.
456
- [GraphEvents.ON_RUN_STEP]: {
457
- handle: (
458
- event: GraphEvents.ON_RUN_STEP,
459
- data: t.StreamEventData
460
- ): void => {
461
- aggregateContent({ event, data: data as t.RunStep });
462
- },
463
- },
464
- [GraphEvents.ON_RUN_STEP_COMPLETED]: {
465
- handle: (
466
- event: GraphEvents.ON_RUN_STEP_COMPLETED,
467
- data: t.StreamEventData
468
- ): void => {
469
- aggregateContent({
470
- event,
471
- data: data as unknown as { result: t.ToolEndEvent },
472
- });
473
- },
474
- },
475
- };
476
-
457
+ function createOursSessionConfig(params: {
458
+ cwd: string;
459
+ sessionPath?: string;
460
+ overrides?: Partial<t.LocalExecutionConfig>;
461
+ checkpointing?: AgentSessionCheckpointing;
462
+ ephemeral?: boolean;
463
+ humanInTheLoop?: t.HumanInTheLoopConfig;
464
+ graphConfig?: t.RunConfig['graphConfig'];
465
+ }): AgentSessionConfig {
477
466
  const llmConfig = getLLMConfig(PROVIDER);
478
- const runConfig: t.RunConfig = {
479
- runId: `compare-${Date.now()}`,
480
- graphConfig: {
467
+ return {
468
+ cwd: params.cwd,
469
+ sessionPath: params.sessionPath,
470
+ ephemeral: params.ephemeral,
471
+ checkpointing: params.checkpointing ?? false,
472
+ graphConfig: params.graphConfig ?? {
481
473
  type: 'standard',
482
- // NB: in the legacy path Run.createLegacyGraph rebuilds
483
- // `clientOptions` from llmConfig (it ignores graphConfig.clientOptions),
484
- // so promptCache lives here and not on a separate clientOptions field.
485
474
  llmConfig: { ...llmConfig, model: MODEL, promptCache: true },
486
475
  instructions:
487
476
  'You are a coding assistant with local file tools. Use read_file, ' +
@@ -490,106 +479,199 @@ async function runOurs(
490
479
  toolExecution: {
491
480
  engine: 'local',
492
481
  local: {
493
- cwd,
482
+ cwd: params.cwd,
494
483
  postEditSyntaxCheck: 'auto',
495
484
  timeoutMs: 30_000,
496
- ...overrides,
485
+ ...params.overrides,
497
486
  },
498
487
  },
499
- returnContent: true,
488
+ humanInTheLoop: params.humanInTheLoop,
500
489
  skipCleanup: true,
501
- customHandlers,
502
490
  };
491
+ }
503
492
 
504
- let errored = false;
505
- let errorMessage: string | undefined;
506
- try {
507
- const run = await Run.create<t.IState>(runConfig);
508
- conversation.push(new HumanMessage(task.prompt));
509
- const streamConfig = {
510
- configurable: { provider: PROVIDER, thread_id: `compare-${Date.now()}` },
511
- streamMode: 'values',
512
- version: 'v2' as const,
513
- };
514
- await run.processStream(
515
- { messages: conversation },
516
- streamConfig as Parameters<typeof run.processStream>[1]
517
- );
518
- const finalMessages = run.getRunMessages();
519
- if (finalMessages) {
520
- conversation.push(...finalMessages);
493
+ function getToolCallName(toolCall: t.AgentToolCall): string {
494
+ return 'function' in toolCall
495
+ ? toolCall.function.name
496
+ : (toolCall.name ?? '?');
497
+ }
498
+
499
+ function getToolCallArgs(toolCall: t.AgentToolCall): unknown {
500
+ return 'function' in toolCall ? toolCall.function.arguments : toolCall.args;
501
+ }
502
+
503
+ function collectToolCallsFromSteps(steps: t.RunStep[]): ToolCallObservation[] {
504
+ const calls: ToolCallObservation[] = [];
505
+ for (const step of steps) {
506
+ if (step.stepDetails.type !== StepTypes.TOOL_CALLS) {
507
+ continue;
508
+ }
509
+ for (const toolCall of step.stepDetails.tool_calls ?? []) {
510
+ calls.push({
511
+ name: getToolCallName(toolCall),
512
+ argsBytes: JSON.stringify(getToolCallArgs(toolCall) ?? {}).length,
513
+ isError: false,
514
+ });
521
515
  }
522
- } catch (err) {
523
- errored = true;
524
- errorMessage = (err as Error).message.slice(0, 500);
525
516
  }
517
+ return calls;
518
+ }
526
519
 
527
- // Walk the conversation: tool calls live on AIMessage as `tool_calls`,
528
- // tool results are ToolMessage entries (already chronologically next to them).
529
- for (const msg of conversation) {
520
+ function collectToolCallsFromMessages(
521
+ messages: BaseMessage[]
522
+ ): ToolCallObservation[] {
523
+ const calls: ToolCallObservation[] = [];
524
+ for (const msg of messages) {
530
525
  if (msg._getType() === 'ai') {
531
526
  const ai = msg as unknown as {
532
527
  tool_calls?: Array<{ name?: string; args?: unknown }>;
533
- usage_metadata?: { input_tokens?: number; output_tokens?: number };
534
528
  };
535
- if (ai.tool_calls != null) {
536
- for (const tc of ai.tool_calls) {
537
- observedToolCalls.push({
538
- name: tc.name ?? '?',
539
- argsBytes: JSON.stringify(tc.args ?? {}).length,
540
- isError: false,
541
- });
542
- }
543
- }
544
- if (ai.usage_metadata != null) {
545
- const reportedInput = ai.usage_metadata.input_tokens ?? 0;
546
- outputTokens += ai.usage_metadata.output_tokens ?? 0;
547
- const idu =
548
- (ai.usage_metadata as unknown as {
549
- input_token_details?: {
550
- cache_read?: number;
551
- cache_creation?: number;
552
- };
553
- }).input_token_details;
554
- const cacheRead = idu?.cache_read ?? 0;
555
- const cacheCreate = idu?.cache_creation ?? 0;
556
- cacheReadTokens += cacheRead;
557
- cacheWriteTokens += cacheCreate;
558
- // The Anthropic adapter at src/llm/anthropic/utils/message_outputs.ts:31
559
- // reports usage_metadata.input_tokens as the TOTAL prompt
560
- // (input + cache_creation + cache_read), not just the uncached
561
- // portion. Subtract cached fields so `inputTokens` here is
562
- // apples-to-apples with pi's `input` field (uncached only).
563
- const trulyUncached = Math.max(
564
- 0,
565
- reportedInput - cacheRead - cacheCreate
566
- );
567
- inputTokens += trulyUncached;
529
+ for (const toolCall of ai.tool_calls ?? []) {
530
+ calls.push({
531
+ name: toolCall.name ?? '?',
532
+ argsBytes: JSON.stringify(toolCall.args ?? {}).length,
533
+ isError: false,
534
+ });
568
535
  }
536
+ continue;
569
537
  }
570
- if (msg instanceof ToolMessage) {
571
- if (msg.status === 'error' && observedToolCalls.length > 0) {
572
- observedToolCalls[observedToolCalls.length - 1].isError = true;
573
- }
538
+ if (
539
+ msg instanceof ToolMessage &&
540
+ msg.status === 'error' &&
541
+ calls.length > 0
542
+ ) {
543
+ calls[calls.length - 1].isError = true;
574
544
  }
575
545
  }
546
+ return calls;
547
+ }
576
548
 
577
- const lastAssistant = [...conversation]
549
+ function collectTokenUsage(messages: BaseMessage[]): {
550
+ inputTokens: number;
551
+ outputTokens: number;
552
+ cacheReadTokens: number;
553
+ cacheWriteTokens: number;
554
+ } {
555
+ let inputTokens = 0;
556
+ let outputTokens = 0;
557
+ let cacheReadTokens = 0;
558
+ let cacheWriteTokens = 0;
559
+ for (const msg of messages) {
560
+ if (msg._getType() !== 'ai') {
561
+ continue;
562
+ }
563
+ const ai = msg as unknown as {
564
+ usage_metadata?: { input_tokens?: number; output_tokens?: number };
565
+ };
566
+ if (ai.usage_metadata == null) {
567
+ continue;
568
+ }
569
+ const reportedInput = ai.usage_metadata.input_tokens ?? 0;
570
+ outputTokens += ai.usage_metadata.output_tokens ?? 0;
571
+ const inputTokenDetails = (
572
+ ai.usage_metadata as unknown as {
573
+ input_token_details?: {
574
+ cache_read?: number;
575
+ cache_creation?: number;
576
+ };
577
+ }
578
+ ).input_token_details;
579
+ const cacheRead = inputTokenDetails?.cache_read ?? 0;
580
+ const cacheCreate = inputTokenDetails?.cache_creation ?? 0;
581
+ cacheReadTokens += cacheRead;
582
+ cacheWriteTokens += cacheCreate;
583
+ inputTokens += Math.max(0, reportedInput - cacheRead - cacheCreate);
584
+ }
585
+ return {
586
+ inputTokens,
587
+ outputTokens,
588
+ cacheReadTokens,
589
+ cacheWriteTokens,
590
+ };
591
+ }
592
+
593
+ function getFinalAssistant(messages: BaseMessage[], fallback: string): string {
594
+ const lastAssistant = [...messages]
578
595
  .reverse()
579
- .find((m) => m._getType() === 'ai');
580
- let finalAssistant = '';
581
- if (lastAssistant) {
582
- const c = lastAssistant.content;
583
- finalAssistant =
584
- typeof c === 'string'
585
- ? c
586
- : Array.isArray(c)
587
- ? c
588
- .map((b) => ('text' in b ? b.text : ''))
589
- .filter(Boolean)
590
- .join(' ')
591
- : '';
596
+ .find((message) => message._getType() === 'ai');
597
+ if (!lastAssistant) {
598
+ return fallback;
599
+ }
600
+ const content = lastAssistant.content;
601
+ if (typeof content === 'string') {
602
+ return content;
603
+ }
604
+ if (!Array.isArray(content)) {
605
+ return fallback;
606
+ }
607
+ return content
608
+ .map((block) => ('text' in block ? block.text : ''))
609
+ .filter(Boolean)
610
+ .join(' ');
611
+ }
612
+
613
+ async function runOurs(
614
+ task: Task,
615
+ cwd: string,
616
+ overrides: Partial<t.LocalExecutionConfig> = {}
617
+ ): Promise<RunOutcome> {
618
+ const start = performance.now();
619
+ let errored = false;
620
+ let errorMessage: string | undefined;
621
+ let result: AgentSessionRunResult | undefined;
622
+ let finalResultPromise: Promise<AgentSessionRunResult> | undefined;
623
+ let sessionEvents = 0;
624
+ let sessionEntries = 0;
625
+ const sessionPath = join(cwd, '.librechat-agent-session.jsonl');
626
+ try {
627
+ const session = await createAgentSession(
628
+ createOursSessionConfig({
629
+ cwd,
630
+ sessionPath,
631
+ overrides,
632
+ checkpointing: false,
633
+ })
634
+ );
635
+ const stream = session.stream(task.prompt, {
636
+ runId: `compare-${Date.now()}`,
637
+ threadId: `compare-${Date.now()}`,
638
+ config: {
639
+ configurable: { provider: PROVIDER },
640
+ },
641
+ });
642
+ finalResultPromise = stream.finalResult();
643
+ for await (const event of stream) {
644
+ sessionEvents = Math.max(sessionEvents, event.sequence + 1);
645
+ }
646
+ result = await finalResultPromise;
647
+ sessionEntries = session.getSessionStore()?.getEntries().length ?? 0;
648
+ } catch (err) {
649
+ await finalResultPromise?.catch(() => undefined);
650
+ errored = true;
651
+ errorMessage = (err as Error).message.slice(0, 500);
652
+ }
653
+
654
+ const messages = result?.messages ?? [];
655
+ const usage = collectTokenUsage(messages);
656
+ let observedToolCalls = collectToolCallsFromSteps(result?.steps ?? []);
657
+ const messageToolCalls = collectToolCallsFromMessages(messages);
658
+ if (observedToolCalls.length === 0) {
659
+ observedToolCalls = messageToolCalls;
660
+ }
661
+ for (let i = 0; i < observedToolCalls.length; i++) {
662
+ observedToolCalls[i].isError = messageToolCalls[i]?.isError ?? false;
592
663
  }
664
+ const inputTokens =
665
+ usage.inputTokens === 0 && result != null
666
+ ? result.usage.inputTokens
667
+ : usage.inputTokens;
668
+ const outputTokens =
669
+ usage.outputTokens === 0 && result != null
670
+ ? result.usage.outputTokens
671
+ : usage.outputTokens;
672
+ const cacheReadTokens = usage.cacheReadTokens;
673
+ const cacheWriteTokens = usage.cacheWriteTokens;
674
+ const finalAssistant = getFinalAssistant(messages, result?.text ?? '');
593
675
 
594
676
  // Sonnet 4.5 pricing (USD per 1M tokens). Pi computes its own cost; we
595
677
  // compute ours from the same per-turn breakdown so the cost columns are
@@ -615,15 +697,292 @@ async function runOurs(
615
697
  finalAssistant: finalAssistant.slice(0, 500),
616
698
  errored,
617
699
  errorMessage,
700
+ sessionEvents,
701
+ sessionEntries,
702
+ sessionPath,
618
703
  };
619
704
  }
620
705
 
706
+ /* ------------------------------------------------------------------ */
707
+ /* Programmatic DX probes */
708
+ /* ------------------------------------------------------------------ */
709
+
710
/**
 * Runs `run` inside a throwaway workspace directory.
 *
 * A directory whose name starts with `lc-dx-<name>-` is created under
 * `tmpdir()` via `mkdtemp`, handed to `run`, and then removed with
 * `rm(..., { recursive: true, force: true })` whether `run` resolves or
 * rejects.
 *
 * @param name - Short label embedded in the directory prefix.
 * @param run - Async callback that receives the workspace path.
 * @returns The value `run` resolves to.
 */
async function withDxWorkspace<T>(
  name: string,
  run: (cwd: string) => Promise<T>
): Promise<T> {
  const prefix = join(tmpdir(), `lc-dx-${name}-`);
  const workspace = await mkdtemp(prefix);
  const removeWorkspace = (): Promise<void> =>
    rm(workspace, { recursive: true, force: true });

  let outcome: T;
  try {
    outcome = await run(workspace);
  } finally {
    // Cleanup runs on both the success and the failure path.
    await removeWorkspace();
  }
  return outcome;
}
721
+
722
/**
 * DX probe: checks that a session built by `createAgentSession` exposes the
 * expected facade methods (`run`, `stream`, `clone`, `fork`, `branch`,
 * `compact`, `resumeInterrupt`) as functions.
 *
 * @returns `ok: true` when every method is present; otherwise `ok: false`
 *   with the missing method names listed in `detail`.
 */
async function probeSessionFacade(): Promise<DxProbeResult> {
  return withDxWorkspace('facade', async (cwd) => {
    const config = createOursSessionConfig({
      cwd,
      sessionPath: join(cwd, 'facade.jsonl'),
      checkpointing: false,
    });
    const session = await createAgentSession(config);

    const expected: Array<keyof typeof session> = [
      'run',
      'stream',
      'clone',
      'fork',
      'branch',
      'compact',
      'resumeInterrupt',
    ];
    const absent: Array<keyof typeof session> = [];
    for (const method of expected) {
      if (typeof session[method] !== 'function') {
        absent.push(method);
      }
    }

    if (absent.length === 0) {
      return {
        ok: true,
        detail: 'session exposes run/stream plus lifecycle methods',
      };
    }
    return { ok: false, detail: `missing: ${absent.join(', ')}` };
  });
}
752
+
753
/**
 * DX probe: appends a human and an AI message to the session store, labels
 * the human entry, and then checks the resulting JSONL tree.
 *
 * The probe passes when the path to the AI entry has two entries, the tree
 * has a single root, the label reads back unchanged, and at least one entry
 * of type `session_state` exists.
 *
 * @returns Probe result; `detail` reports the entry count and path length.
 */
async function probeJsonlTree(): Promise<DxProbeResult> {
  return withDxWorkspace('jsonl', async (cwd) => {
    const config = createOursSessionConfig({
      cwd,
      sessionPath: join(cwd, 'tree.jsonl'),
      checkpointing: false,
    });
    const session = await createAgentSession(config);
    const store = session.getSessionStore();
    if (!store) {
      return { ok: false, detail: 'store was not created' };
    }

    const humanEntry = await store.appendMessage(
      new HumanMessage('rename calc_total')
    );
    const aiEntry = await store.appendMessage(new AIMessage('done'));
    await store.setLabel(humanEntry.id, 'coding prompt');

    const activePath = store.getPath(aiEntry.id);
    const allEntries = store.getEntries();

    // Checks are evaluated lazily and in order; `every` stops at the first
    // failing check, so later store calls are skipped after a failure.
    const checks: Array<() => boolean> = [
      () => activePath.length === 2,
      () => store.getTree().length === 1,
      () => store.getLabel(humanEntry.id) === 'coding prompt',
      () => allEntries.some((entry) => entry.type === 'session_state'),
    ];

    return {
      ok: checks.every((check) => check()),
      detail: `${allEntries.length} JSONL entries, active path length ${activePath.length}`,
    };
  });
}
784
+
785
/**
 * DX probe: exercises `clone`, `fork` and `branch` on a session that holds
 * one human message and one AI reply.
 *
 * The probe passes when the clone's path has two entries, the fork taken
 * `before` the reply has the prompt as its leaf, and branching `at` the
 * prompt leaves the prompt as the original store's leaf.
 *
 * @returns Probe result; `detail` reports the clone path length and both
 *   leaf ids (or `none` when a leaf is absent).
 */
async function probeForkCloneBranch(): Promise<DxProbeResult> {
  return withDxWorkspace('branch', async (cwd) => {
    const config = createOursSessionConfig({
      cwd,
      sessionPath: join(cwd, 'branch.jsonl'),
      checkpointing: false,
    });
    const session = await createAgentSession(config);
    const store = session.getSessionStore();
    if (!store) {
      return { ok: false, detail: 'store was not created' };
    }

    const promptEntry = await store.appendMessage(new HumanMessage('turn one'));
    const replyEntry = await store.appendMessage(new AIMessage('reply one'));

    // Order matters: clone and fork are taken before the active branch moves.
    const cloned = await session.clone({ name: 'clone' });
    const forked = await session.fork(replyEntry.id, {
      position: 'before',
      name: 'fork-before-reply',
    });
    await session.branch(promptEntry.id, { position: 'at' });

    const clonePathLength = cloned.getSessionStore()?.getPath().length ?? 0;
    const forkLeafId = forked.getSessionStore()?.getLeafEntry()?.id;
    const activeLeafId = store.getLeafEntry()?.id;

    const expectations = [
      clonePathLength === 2,
      forkLeafId === promptEntry.id,
      activeLeafId === promptEntry.id,
    ];
    const summary = [
      `clone path ${clonePathLength}`,
      `fork leaf ${forkLeafId ?? 'none'}`,
      `active leaf ${activeLeafId ?? 'none'}`,
    ];

    return {
      ok: expectations.every(Boolean),
      detail: summary.join(', '),
    };
  });
}
821
+
822
/**
 * DX probe: opens a session on a JSONL path, appends one message, then opens
 * a second session on the same path.
 *
 * The probe passes when the second session reports the same `threadId` as
 * the first and its store returns exactly one message.
 *
 * @returns Probe result; `detail` reports the resumed thread id and the
 *   number of messages read back.
 */
async function probeResumeByPath(): Promise<DxProbeResult> {
  return withDxWorkspace('resume', async (cwd) => {
    const sessionPath = join(cwd, 'resume.jsonl');
    // Both sessions are opened with an identical configuration.
    const openSession = () =>
      createAgentSession(
        createOursSessionConfig({ cwd, sessionPath, checkpointing: false })
      );

    const first = await openSession();
    const store = first.getSessionStore();
    if (!store) {
      return { ok: false, detail: 'store was not created' };
    }
    await store.appendMessage(new HumanMessage('persist me'));

    const second = await openSession();
    const restored = second.getSessionStore()?.getMessages() ?? [];

    const sameThread = second.threadId === first.threadId;
    const ok = sameThread && restored.length === 1;
    return {
      ok,
      detail: `thread ${second.threadId}, messages ${restored.length}`,
    };
  });
}
845
+
846
/**
 * DX probe: checks that the session store and the checkpointer can be
 * configured independently.
 *
 * Two ephemeral sessions are created: one with an injected `MemorySaver`,
 * one with `checkpointing: false` and human-in-the-loop enabled. The probe
 * passes when the first returns the injected checkpointer and no session
 * store, and the second returns neither a checkpointer nor a session store.
 *
 * @returns Probe result with a fixed success or failure `detail`.
 */
async function probeCheckpointComposition(): Promise<DxProbeResult> {
  return withDxWorkspace('checkpointing', async (cwd) => {
    const checkpointer = new MemorySaver();

    const withCheckpointer = await createAgentSession(
      createOursSessionConfig({
        cwd,
        ephemeral: true,
        checkpointing: { checkpointer },
      })
    );
    const withoutCheckpointer = await createAgentSession(
      createOursSessionConfig({
        cwd,
        ephemeral: true,
        checkpointing: false,
        humanInTheLoop: { enabled: true },
      })
    );

    // Lazy checks keep the accessor calls in order and stop at the first
    // failing one.
    const checks: Array<() => boolean> = [
      () => withCheckpointer.getCheckpointer() === checkpointer,
      () => withCheckpointer.getSessionStore() == null,
      () => withoutCheckpointer.getCheckpointer() == null,
      () => withoutCheckpointer.getSessionStore() == null,
    ];
    const ok = checks.every((check) => check());

    if (!ok) {
      return { ok, detail: 'composition check failed' };
    }
    return {
      ok,
      detail:
        'custom checkpointer injected; JSONL/checkpointing can both be disabled',
    };
  });
}
877
+
878
/**
 * DX probe: builds an `AgentSession` around a two-agent handoff graph
 * (`supervisor` -> `coder`) and checks that the session still exposes a
 * session store plus `stream` and `run`.
 *
 * Only session construction is exercised here; this function never calls
 * `run` or `stream`.
 *
 * @returns `ok: true` with a fixed success message when all three surfaces
 *   are present; otherwise `ok: false` with `detail` naming what is missing,
 *   so a failing probe never prints the success text.
 */
async function probeMultiAgentWrapping(): Promise<DxProbeResult> {
  return withDxWorkspace('multi', async (cwd) => {
    // Both agents share the same model/client settings.
    const clientOptions = {
      modelName: MODEL,
      apiKey: process.env.ANTHROPIC_API_KEY,
    };
    const session = await createAgentSession(
      createOursSessionConfig({
        cwd,
        sessionPath: join(cwd, 'multi.jsonl'),
        checkpointing: false,
        graphConfig: {
          type: 'multi-agent',
          agents: [
            {
              agentId: 'supervisor',
              provider: PROVIDER,
              clientOptions,
              instructions: 'Route coding work to the specialist.',
            },
            {
              agentId: 'coder',
              provider: PROVIDER,
              clientOptions,
              instructions: 'Make precise code changes.',
            },
          ],
          edges: [
            {
              from: 'supervisor',
              to: 'coder',
              description: 'Delegate implementation work',
              edgeType: 'handoff',
            },
          ],
        },
      })
    );

    const missing: string[] = [];
    if (session.getSessionStore() == null) {
      missing.push('session store');
    }
    if (typeof session.stream !== 'function') {
      missing.push('stream');
    }
    if (typeof session.run !== 'function') {
      missing.push('run');
    }

    if (missing.length > 0) {
      return {
        ok: false,
        detail: `multi-agent session is missing: ${missing.join(', ')}`,
      };
    }
    return {
      ok: true,
      detail:
        'AgentSession accepted a multi-agent handoff graph without special harness code',
    };
  });
}
927
+
928
/**
 * Ordered table of DX probes printed by the harness.
 *
 * Each row is `[feature, pi, ours, run]`: the feature label, a one-line
 * description of the pi side, a one-line description of our side, and the
 * probe function that produces the live result. Rows are expanded into
 * `DxProbe` objects in the same order.
 */
const DX_PROBES: DxProbe[] = (
  [
    [
      'Session facade',
      'SDK sessions expose a high-level run loop',
      'createAgentSession().run/stream wraps existing Run',
      probeSessionFacade,
    ],
    [
      'Append-only JSONL tree',
      'Native session JSONL tree',
      'JsonlSessionStore v1 with entries, labels, state',
      probeJsonlTree,
    ],
    [
      'Clone/fork/branch',
      'Clone, fork, and branch from tree positions',
      'clone(), fork(before/at), branch(before/at)',
      probeForkCloneBranch,
    ],
    [
      'Resume',
      'Resume by session id/path',
      'resume/open by exact JSONL path with stable threadId',
      probeResumeByPath,
    ],
    [
      'Composable state',
      'Session log is separate from execution provider state',
      'JSONL optional; LangGraph checkpointer injectable or off',
      probeCheckpointComposition,
    ],
    [
      'Multi-agent/subagent surface',
      'Coding-agent session loop',
      'Same AgentSession facade wraps multi-agent graphs and tools',
      probeMultiAgentWrapping,
    ],
  ] as Array<
    [
      feature: DxProbe['feature'],
      pi: DxProbe['pi'],
      ours: DxProbe['ours'],
      run: DxProbe['run'],
    ]
  >
).map(([feature, pi, ours, run]) => ({ feature, pi, ours, run }));
966
+
967
/**
 * Runs every entry of `DX_PROBES` in order and prints a short report for
 * each one (status, pi description, our description, live detail).
 *
 * A probe that rejects is recorded as a failed probe instead of aborting
 * the loop, so the remaining probes still run and the caller still gets a
 * boolean to turn into an exit code.
 *
 * @returns `true` only when every probe reported `ok: true`.
 */
async function runDxProbes(): Promise<boolean> {
  console.log('\n================ DX SESSION PROBES ================');
  let allPassed = true;
  for (const probe of DX_PROBES) {
    let result: DxProbeResult;
    try {
      result = await probe.run();
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      // Truncated to keep the report readable when a provider returns a
      // very long error message.
      result = { ok: false, detail: `probe threw: ${reason.slice(0, 500)}` };
    }
    allPassed &&= result.ok;
    console.log(`\n[${result.ok ? 'ok' : 'fail'}] ${probe.feature}`);
    console.log(` pi: ${probe.pi}`);
    console.log(` ours: ${probe.ours}`);
    console.log(` live: ${result.detail}`);
  }
  return allPassed;
}
980
+
621
981
  /* ------------------------------------------------------------------ */
622
982
  /* Harness */
623
983
  /* ------------------------------------------------------------------ */
624
984
 
625
985
  async function setupWorkspace(task: Task): Promise<string> {
626
- const { mkdir } = await import('fs/promises');
627
986
  const dir = await mkdtemp(join(tmpdir(), 'lc-compare-'));
628
987
  for (const [relPath, content] of Object.entries(task.seed)) {
629
988
  const abs = join(dir, relPath);
@@ -642,7 +1001,6 @@ async function setupWorkspace(task: Task): Promise<string> {
642
1001
  }
643
1002
 
644
1003
  async function symlinkRepoNodeModules(cwd: string): Promise<void> {
645
- const { symlink } = await import('fs/promises');
646
1004
  const repo = resolve(process.cwd(), 'node_modules');
647
1005
  await symlink(repo, join(cwd, 'node_modules'), 'dir').catch(() => {
648
1006
  /* fall through; tsc just won't be available */
@@ -655,9 +1013,7 @@ function summariseToolCalls(calls: ToolCallObservation[]): string {
655
1013
  for (const c of calls) {
656
1014
  grouped.set(c.name, (grouped.get(c.name) ?? 0) + 1);
657
1015
  }
658
- const inline = [...grouped.entries()]
659
- .map(([n, c]) => `${n}×${c}`)
660
- .join(', ');
1016
+ const inline = [...grouped.entries()].map(([n, c]) => `${n}×${c}`).join(', ');
661
1017
  const errors = calls.filter((c) => c.isError).length;
662
1018
  return `${calls.length} call(s) [${inline}]${errors > 0 ? ` (${errors} errored)` : ''}`;
663
1019
  }
@@ -677,10 +1033,28 @@ function avg(xs: number[]): number {
677
1033
  return xs.length === 0 ? 0 : xs.reduce((a, b) => a + b, 0) / xs.length;
678
1034
  }
679
1035
 
1036
/**
 * Filters the task list using the `COMPARE_TASKS` environment variable.
 *
 * The variable is a comma-separated list of tokens. Tokens are trimmed and
 * lower-cased, and empty tokens are dropped. A task is kept when its
 * lower-cased name starts with any token; an exact match is the special
 * case where the token is the whole name.
 *
 * @param tasks - All available tasks.
 * @returns The same `tasks` array when the variable is unset or blank;
 *   otherwise a new array of matching tasks in their original order, which
 *   may be empty.
 */
function selectTasks(tasks: Task[]): Task[] {
  const raw = process.env.COMPARE_TASKS;
  if (raw == null || raw.trim() === '') {
    return tasks;
  }

  const prefixes: string[] = [];
  for (const piece of raw.split(',')) {
    const token = piece.trim().toLowerCase();
    if (token !== '') {
      prefixes.push(token);
    }
  }

  const selected: Task[] = [];
  for (const task of tasks) {
    const lowered = task.name.toLowerCase();
    if (prefixes.some((prefix) => lowered.startsWith(prefix))) {
      selected.push(task);
    }
  }
  return selected;
}
1050
+
680
1051
  async function runOnce(
681
1052
  task: Task,
682
1053
  side: 'pi' | 'ours'
683
- ): Promise<{ outcome: RunOutcome; verify: { ok: boolean; detail: string } } | null> {
1054
+ ): Promise<{
1055
+ outcome: RunOutcome;
1056
+ verify: { ok: boolean; detail: string };
1057
+ } | null> {
684
1058
  if (task.skip === side) return null;
685
1059
  const cwd = await setupWorkspace(task);
686
1060
  const outcome =
@@ -702,10 +1076,28 @@ async function runOnce(
702
1076
 
703
1077
  async function main(): Promise<void> {
704
1078
  const ITERS = Math.max(1, Number(process.env.COMPARE_ITERS ?? '1'));
1079
+ const DX_ONLY = process.env.COMPARE_DX_ONLY === '1';
1080
+ const selectedTasks = selectTasks(TASKS);
705
1081
  console.log(`pi binary: ${PI_BIN}`);
706
1082
  console.log(`model: ${MODEL}`);
707
1083
  console.log(`provider: ${PROVIDER}`);
708
1084
  console.log(`iters: ${ITERS}`);
1085
+ console.log(`dx only: ${DX_ONLY ? 'yes' : 'no'}`);
1086
+ console.log(
1087
+ `tasks: ${
1088
+ DX_ONLY ? 'DX probes' : selectedTasks.map((task) => task.name).join(', ')
1089
+ }`
1090
+ );
1091
+ const dxPassed = await runDxProbes();
1092
+ if (DX_ONLY) {
1093
+ process.exitCode = dxPassed ? 0 : 1;
1094
+ return;
1095
+ }
1096
+ if (selectedTasks.length === 0) {
1097
+ throw new Error(
1098
+ `No tasks matched COMPARE_TASKS=${process.env.COMPARE_TASKS}`
1099
+ );
1100
+ }
709
1101
 
710
1102
  const results: Array<{
711
1103
  task: Task;
@@ -713,7 +1105,7 @@ async function main(): Promise<void> {
713
1105
  ours: AggregatedSide;
714
1106
  }> = [];
715
1107
 
716
- for (const task of TASKS) {
1108
+ for (const task of selectedTasks) {
717
1109
  console.log(`\n========== ${task.name} ==========`);
718
1110
  console.log(task.description);
719
1111
 
@@ -733,7 +1125,9 @@ async function main(): Promise<void> {
733
1125
  `cacheR=${piRes.outcome.cacheReadTokens} cacheW=${piRes.outcome.cacheWriteTokens} ` +
734
1126
  `$${piRes.outcome.cost.toFixed(4)}`
735
1127
  );
736
- if (piRes.outcome.errored) console.log(` err: ${piRes.outcome.errorMessage}`);
1128
+ if (piRes.outcome.errored) {
1129
+ console.log(` err: ${piRes.outcome.errorMessage}`);
1130
+ }
737
1131
  } else {
738
1132
  console.log(`[pi]${tag} (skipped)`);
739
1133
  }
@@ -746,9 +1140,12 @@ async function main(): Promise<void> {
746
1140
  `[ours]${tag} ${oursRes.outcome.errored ? 'ERROR' : oursRes.verify.ok ? 'ok' : 'fail'} ` +
747
1141
  `${fmtMs(oursRes.outcome.wallMs)} ${summariseToolCalls(oursRes.outcome.toolCalls)} ` +
748
1142
  `in=${oursRes.outcome.inputTokens} out=${oursRes.outcome.outputTokens} ` +
749
- `cacheR=${oursRes.outcome.cacheReadTokens} cacheW=${oursRes.outcome.cacheWriteTokens}`
1143
+ `cacheR=${oursRes.outcome.cacheReadTokens} cacheW=${oursRes.outcome.cacheWriteTokens} ` +
1144
+ `events=${oursRes.outcome.sessionEvents ?? 0} jsonl=${oursRes.outcome.sessionEntries ?? 0}`
750
1145
  );
751
- if (oursRes.outcome.errored) console.log(` err: ${oursRes.outcome.errorMessage}`);
1146
+ if (oursRes.outcome.errored) {
1147
+ console.log(` err: ${oursRes.outcome.errorMessage}`);
1148
+ }
752
1149
  } else {
753
1150
  console.log(`[ours]${tag} (skipped)`);
754
1151
  }
@@ -798,22 +1195,40 @@ async function main(): Promise<void> {
798
1195
  cols.push([r.task.name, 'verify', fmtVerify(r.pi), fmtVerify(r.ours)]);
799
1196
  cols.push(['', 'wall', fmtSideMs(r.pi), fmtSideMs(r.ours)]);
800
1197
  cols.push(['', 'tool calls', fmtSideCalls(r.pi), fmtSideCalls(r.ours)]);
801
- cols.push(['', 'input new', fmtSide(r.pi, 'inputTokens'), fmtSide(r.ours, 'inputTokens')]);
802
- cols.push(['', 'cache read', fmtSide(r.pi, 'cacheReadTokens'), fmtSide(r.ours, 'cacheReadTokens')]);
803
- cols.push(['', 'cache write', fmtSide(r.pi, 'cacheWriteTokens'), fmtSide(r.ours, 'cacheWriteTokens')]);
804
- cols.push(['', 'output tok', fmtSide(r.pi, 'outputTokens'), fmtSide(r.ours, 'outputTokens')]);
1198
+ cols.push([
1199
+ '',
1200
+ 'input new',
1201
+ fmtSide(r.pi, 'inputTokens'),
1202
+ fmtSide(r.ours, 'inputTokens'),
1203
+ ]);
1204
+ cols.push([
1205
+ '',
1206
+ 'cache read',
1207
+ fmtSide(r.pi, 'cacheReadTokens'),
1208
+ fmtSide(r.ours, 'cacheReadTokens'),
1209
+ ]);
1210
+ cols.push([
1211
+ '',
1212
+ 'cache write',
1213
+ fmtSide(r.pi, 'cacheWriteTokens'),
1214
+ fmtSide(r.ours, 'cacheWriteTokens'),
1215
+ ]);
1216
+ cols.push([
1217
+ '',
1218
+ 'output tok',
1219
+ fmtSide(r.pi, 'outputTokens'),
1220
+ fmtSide(r.ours, 'outputTokens'),
1221
+ ]);
805
1222
  cols.push(['', 'cost', fmtCost(r.pi), fmtCost(r.ours)]);
1223
+ cols.push(['', 'session events', 'N/A', fmtSide(r.ours, 'sessionEvents')]);
1224
+ cols.push(['', 'jsonl entries', 'N/A', fmtSide(r.ours, 'sessionEntries')]);
806
1225
  }
807
1226
 
808
1227
  const widths = [0, 0, 0, 0].map((_, i) =>
809
1228
  Math.max(...cols.map((row) => row[i].length))
810
1229
  );
811
1230
  for (const row of cols) {
812
- console.log(
813
- row
814
- .map((cell, i) => cell.padEnd(widths[i]))
815
- .join(' ')
816
- );
1231
+ console.log(row.map((cell, i) => cell.padEnd(widths[i])).join(' '));
817
1232
  }
818
1233
 
819
1234
  // Aggregate verify counts across all iters of all non-skipped tasks.
@@ -824,7 +1239,11 @@ async function main(): Promise<void> {
824
1239
  console.log(
825
1240
  `\nOverall: pi ${piPassed}/${piVerifies.length}, ours ${oursPassed}/${oursVerifies.length}.`
826
1241
  );
827
- if (piPassed < piVerifies.length || oursPassed < oursVerifies.length) {
1242
+ if (
1243
+ !dxPassed ||
1244
+ piPassed < piVerifies.length ||
1245
+ oursPassed < oursVerifies.length
1246
+ ) {
828
1247
  process.exitCode = 1;
829
1248
  }
830
1249
  }