@inixiative/agent-session 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,947 @@
1
+ // ---------------------------------------------------------------------------
2
+ // CodexSession — long-lived OpenAI Codex CLI process with full event capture
3
+ // ---------------------------------------------------------------------------
4
+ //
5
+ // Two implementations of the SAME HarnessSession interface, mirroring
6
+ // ClaudeCodeSession. Both keep one persistent `codex` process alive and stream
7
+ // turns over a JSON-RPC channel on stdin/stdout, classifying every event into
8
+ // the shared SessionEvent taxonomy and resolving a turn on completion.
9
+ //
10
+ // CodexMcpSession (default) — `codex mcp-server` (stdio MCP JSON-RPC)
11
+ // CodexAppServerSession (experimental) — `codex app-server` (WebSocket JSON-RPC)
12
+ //
13
+ // Both DISABLE codex's own approvals + sandbox (the equivalent of
14
+ // `--dangerously-bypass-approvals-and-sandbox`) so OUR container is the only
15
+ // jail — identical intent to ClaudeCodeSession's `bypassPermissions`.
16
+ //
17
+ // Architecture (mirrors claude-code-session.ts):
18
+ // start() → spawn one process, start background stdout reader, MCP handshake
19
+ // send() → JSON-RPC tools/call (codex / codex-reply), resolve on completion
20
+ // fork() → new (unstarted) session resuming from the captured threadId
21
+ // kill() → close stdin, kill process
22
+ //
23
+ // codex's native "session ID" is the threadId returned by the `codex` tool's
24
+ // structuredContent — captured into externalSessionId and used for multi-turn
25
+ // (`codex-reply`) and fork.
26
+ //
27
+ // Verified against codex 0.140.
28
+ // ---------------------------------------------------------------------------
29
+
30
+ import type {
31
+ BeforeSendHook,
32
+ HarnessSession,
33
+ SessionEvent,
34
+ SessionEventHandler,
35
+ SessionResult,
36
+ SessionArtifact,
37
+ } from "./harness-session";
38
+
39
+ // Re-export types so importers can stay on one path (mirrors claude-code-session).
40
+ export type {
41
+ SessionEvent,
42
+ SessionEventKind,
43
+ SessionResult,
44
+ SessionArtifact,
45
+ } from "./harness-session";
46
+
47
+ // ---------------------------------------------------------------------------
48
+ // Shared subprocess shape (mirrors ClaudeCodeSession.PipedSubprocess)
49
+ // ---------------------------------------------------------------------------
50
+
51
/**
 * Minimal view of a spawned child process with all three pipes attached.
 * This is what a `Bun.spawn` result is cast to, and also the shape a
 * test double has to provide.
 */
export interface PipedSubprocess {
  /** Writable side: JSON-RPC frames are written, flushed, and finally ended. */
  stdin: {
    write(data: string): void;
    flush(): void;
    end(): void;
  };
  /** Byte stream that is decoded and split into newline-delimited messages. */
  stdout: ReadableStream<Uint8Array>;
  /** Byte stream accumulated as diagnostic text. */
  stderr: ReadableStream<Uint8Array>;
  /** Settles with the process exit code. */
  exited: Promise<number>;
  /** Terminate the process. */
  kill(): void;
}
59
+
60
/**
 * Pluggable process launcher.
 *
 * @param cmd  Full argv: the binary followed by its arguments.
 * @param opts Working directory and environment for the child.
 * @returns A handle exposing stdin/stdout/stderr pipes, exit promise and kill.
 */
export type CodexSpawn = (
  cmd: string[],
  opts: {
    cwd: string;
    env: Record<string, string | undefined>;
  },
) => PipedSubprocess;
64
+
65
+ // ---------------------------------------------------------------------------
66
+ // Configuration (mirrors ClaudeCodeSessionConfig)
67
+ // ---------------------------------------------------------------------------
68
+
69
/** Options accepted by the codex session constructors. Every field is optional. */
export interface CodexSessionConfig {
  /** Path to the codex CLI binary. Falls back to "codex". */
  bin?: string;

  /** Model name. Falls back to "gpt-5.5". */
  model?: string;

  /**
   * Reasoning-effort level (`model_reasoning_effort`): one of
   * minimal | low | medium | high | xhigh. When omitted, no effort setting is
   * passed and codex uses its own default.
   */
  effort?: string;

  /** Working directory for the session. Falls back to `process.cwd()`. */
  cwd?: string;

  /** Default per-send timeout in milliseconds. Falls back to 600000 (10 min). */
  timeout?: number;

  /**
   * Base context to pre-load. Codex has no `--append-system-prompt`, so the
   * text is prepended to the first turn's prompt; fork() carries it over to
   * the forked session.
   */
  baseContext?: string;

  /**
   * codex's native thread ID (the value the `codex` tool returns). When set,
   * the first send() continues that thread instead of opening a new one —
   * used for fork and crash recovery. Also set by a SessionAdapter resuming a
   * mapped Foundry thread.
   */
  externalSessionId?: string;

  /**
   * Replacement for the process spawner (default: `Bun.spawn`). Tests inject a
   * fake subprocess that emulates the codex JSON-RPC protocol; the
   * docker-spawn helper wraps the CLI in `docker run`.
   */
  spawn?: CodexSpawn;
}
102
+
103
+ // ---------------------------------------------------------------------------
104
+ // Internal turn queue entry (identical shape to ClaudeCodeSession)
105
+ // ---------------------------------------------------------------------------
106
+
107
/** One pending send(): the prompt plus the settle callbacks of its promise. */
interface QueuedTurn {
  /** Prompt text after before-send hooks and base-context injection. */
  message: string;
  /** Per-turn timeout in ms; a value of 0 or less arms no timer. */
  timeout: number;
  /** Fulfils the promise returned by send(). */
  resolve: (result: SessionResult) => void;
  /** Rejects the promise returned by send(). */
  reject: (error: Error) => void;
}
113
+
114
// Accepted values for codex's `model_reasoning_effort`, ordered low → high.
const VALID_EFFORTS: readonly string[] = [
  "minimal",
  "low",
  "medium",
  "high",
  "xhigh",
];
116
+
117
+ // ---------------------------------------------------------------------------
118
+ // Base class — shared queue, event log, classification helpers, lifecycle
119
+ // ---------------------------------------------------------------------------
120
+ //
121
+ // CodexMcpSession and CodexAppServerSession differ only in the wire protocol
122
+ // (how start() handshakes, how a turn is sent, and how a raw message maps to
123
+ // SessionEvents). Everything else — the turn queue, token accounting, the
124
+ // stdout read loop, event emission, fork, artifact — is shared here, exactly as
125
+ // ClaudeCodeSession structures it.
126
+
127
+ abstract class BaseCodexSession implements HarnessSession {
128
+ // -- Config --
129
+ protected _bin: string;
130
+ protected _model: string;
131
+ protected _effort?: string;
132
+ protected _cwd: string;
133
+ protected _defaultTimeout: number;
134
+ protected _baseContext?: string;
135
+ protected _spawn?: CodexSpawn;
136
+
137
+ // -- Process --
138
+ protected _proc: PipedSubprocess | null = null;
139
+ protected _stderr = "";
140
+
141
+ // -- Session state --
142
+ protected _externalSessionId?: string;
143
+ protected _alive = false;
144
+ protected _eventLog: SessionEvent[] = [];
145
+ protected _handlers: SessionEventHandler[] = [];
146
+ protected _beforeSendHooks: BeforeSendHook[] = [];
147
+ protected _turns = 0;
148
+ protected _totalTokens = { input: 0, output: 0 };
149
+ protected _startedAt: number;
150
+ /** Whether baseContext has been prepended onto a sent turn yet. */
151
+ protected _injectedBaseContext = false;
152
+ /** Whether this session was created via fork() (continues a thread). */
153
+ protected _forking = false;
154
+
155
+ // -- JSON-RPC --
156
+ /** Monotonic JSON-RPC request id. */
157
+ protected _rpcId = 0;
158
+ /** Pending JSON-RPC responses, keyed by request id. */
159
+ protected _pending = new Map<
160
+ number,
161
+ { resolve: (result: unknown) => void; reject: (e: Error) => void }
162
+ >();
163
+
164
+ // -- Turn queue (identical to ClaudeCodeSession) --
165
+ protected _queue: QueuedTurn[] = [];
166
+ protected _inflight: QueuedTurn | null = null;
167
+ protected _turnEvents: SessionEvent[] = [];
168
+ protected _resultText = "";
169
+ protected _turnTimer: ReturnType<typeof setTimeout> | null = null;
170
+
171
/**
 * Validate the configuration and copy it onto the instance. No process is
 * spawned here.
 *
 * @param config Optional settings; each missing field gets its default.
 * @throws Error if `bin` contains anything other than letters, digits,
 *   underscore, dot, slash, backslash or hyphen, or if a non-empty `effort`
 *   is not one of VALID_EFFORTS.
 */
constructor(config?: CodexSessionConfig) {
  const cfg: CodexSessionConfig = config ?? {};
  const binary = cfg.bin ?? "codex";
  const effort = cfg.effort;

  // Only plain path-like tokens are accepted as the executable.
  const binaryLooksSafe = /^[a-zA-Z0-9_.\/\\-]+$/.test(binary);
  if (!binaryLooksSafe) {
    throw new Error(`Invalid codex CLI binary path: "${binary}"`);
  }

  if (effort && !VALID_EFFORTS.includes(effort)) {
    throw new Error(
      `Invalid codex effort "${effort}". Valid: ${VALID_EFFORTS.join(", ")}`,
    );
  }

  this._bin = binary;
  this._model = cfg.model ?? "gpt-5.5";
  this._effort = effort;
  this._cwd = cfg.cwd ?? process.cwd();
  this._defaultTimeout = cfg.timeout ?? 600_000;
  this._baseContext = cfg.baseContext;
  this._externalSessionId = cfg.externalSessionId;
  this._spawn = cfg.spawn;
  this._startedAt = Date.now();
}
191
+
192
+ // ---------------------------------------------------------------------------
193
+ // Accessors (identical to ClaudeCodeSession)
194
+ // ---------------------------------------------------------------------------
195
+
196
/** True from a successful spawn until the session is killed or the process exits. */
get alive(): boolean {
  return this._alive;
}

/** codex thread id, once configured or captured; otherwise undefined. */
get externalSessionId(): string | undefined {
  return this._externalSessionId;
}

/** Every event emitted so far (the live log, exposed read-only). */
get events(): readonly SessionEvent[] {
  return this._eventLog;
}

/** Number of turns that have resolved. */
get turns(): number {
  return this._turns;
}

/** Copy of the running token totals. */
get totalTokens(): Readonly<{ input: number; output: number }> {
  const { input, output } = this._totalTokens;
  return { input, output };
}
203
+
204
+ // ---------------------------------------------------------------------------
205
+ // Event subscription (identical to ClaudeCodeSession)
206
+ // ---------------------------------------------------------------------------
207
+
208
/**
 * Subscribe to every event the session emits.
 *
 * @param handler Callback invoked once per emitted event.
 * @returns A function that removes this handler; calling it again is a no-op.
 */
onEvent(handler: SessionEventHandler): () => void {
  const handlers = this._handlers;
  handlers.push(handler);

  const unsubscribe = (): void => {
    const at = handlers.indexOf(handler);
    if (at === -1) return;
    handlers.splice(at, 1);
  };
  return unsubscribe;
}
215
+
216
/**
 * Register a hook that send() applies to the outgoing message, in
 * registration order.
 *
 * @param hook Message transformer.
 * @returns A function that removes this hook; calling it again is a no-op.
 */
onBeforeSend(hook: BeforeSendHook): () => void {
  const hooks = this._beforeSendHooks;
  hooks.push(hook);

  const unsubscribe = (): void => {
    const at = hooks.indexOf(hook);
    if (at === -1) return;
    hooks.splice(at, 1);
  };
  return unsubscribe;
}
223
+
224
/**
 * Mid-turn push. Nothing is written to the codex process: the call only
 * records a "push_ignored" error event so callers can observe the attempt.
 * The payload is attached to that event as `raw`.
 *
 * @param payload The message that could not be delivered out-of-band.
 */
async push(payload: { kind: string; text: string }): Promise<void> {
  const notice: SessionEvent = {
    kind: "error",
    timestamp: Date.now(),
    text: `push_ignored: kind=${payload.kind} — codex turn has no OOB channel`,
    raw: payload,
  };
  this._emit(notice);
}
238
+
239
+ // ---------------------------------------------------------------------------
240
+ // interrupt / kill / artifact (identical to ClaudeCodeSession)
241
+ // ---------------------------------------------------------------------------
242
+
243
/**
 * Abandon the in-flight turn: its send() promise rejects with
 * "Turn interrupted". No message is sent to the codex process, so a late
 * response for the abandoned turn may still arrive and is ignored by the
 * dispatcher.
 *
 * After rejecting, the next queued turn (if any) is dispatched. Without this
 * the queue would stall: queued send() promises would stay pending and a
 * later send() would be dispatched ahead of them.
 */
interrupt(): void {
  if (!this._inflight) return;
  this._rejectInflight(new Error("Turn interrupted"));
  this._processNextTurn();
}
247
+
248
/**
 * Tear the session down: reject the in-flight turn, every queued turn and
 * every pending JSON-RPC request with "Session killed", close stdin, kill the
 * process, and emit `session_end`. Does nothing when no process is attached.
 */
kill(): void {
  const proc = this._proc;
  if (!proc) return;

  // Flip liveness first: the exit watcher and the queue pump both check it.
  this._alive = false;

  if (this._turnTimer) {
    clearTimeout(this._turnTimer);
    this._turnTimer = null;
  }

  const killed = (): Error => new Error("Session killed");
  this._rejectInflight(killed());
  this._rejectQueue(killed());
  this._pending.forEach((waiter) => waiter.reject(killed()));
  this._pending.clear();

  try {
    proc.stdin.end();
  } catch {
    /* already closed */
  }
  try {
    proc.kill();
  } catch {
    /* already dead */
  }
  this._proc = null;

  this._emit({ kind: "session_end", timestamp: Date.now() });
}
268
+
269
/**
 * Snapshot of the session for persistence or reporting: thread id, a copy of
 * the event log, timing, turn count, token totals and per-kind event counts.
 * `endedAt` is left undefined while the session is alive; otherwise it is the
 * time of this call.
 */
artifact(): SessionArtifact {
  const log = this._eventLog;
  const countOf = (kind: SessionEvent["kind"]): number =>
    log.reduce((n, e) => (e.kind === kind ? n + 1 : n), 0);

  const endedAt = this._alive ? undefined : Date.now();
  const { input, output } = this._totalTokens;

  return {
    externalSessionId: this._externalSessionId,
    events: log.slice(),
    startedAt: this._startedAt,
    endedAt,
    turns: this._turns,
    totalTokens: { input, output },
    toolCalls: countOf("tool_use"),
    toolResults: countOf("tool_result"),
    errors: countOf("error"),
  };
}
282
+
283
+ // ---------------------------------------------------------------------------
284
+ // start() — spawn the persistent process + handshake (subclass-specific)
285
+ // ---------------------------------------------------------------------------
286
+
287
/**
 * Spawn the persistent codex process, start the stdout/stderr readers, install
 * the exit watcher, then run the subclass handshake.
 *
 * The child inherits `process.env` plus `DISABLE_AUTOUPDATER=1`. When a custom
 * spawner was configured it is used; otherwise `Bun.spawn` with all three
 * streams piped.
 *
 * If the handshake fails while the process is still alive, the session is
 * killed before the error is rethrown, so a failed start() neither leaks the
 * child process nor leaves the instance stuck in "already started".
 *
 * @throws Error "Session already started" if a process is already attached.
 * @throws whatever the handshake rejects with.
 */
async start(): Promise<void> {
  if (this._proc) throw new Error("Session already started");

  const cmd = [this._bin, ...this._buildSpawnArgs()];
  const env: Record<string, string | undefined> = {
    ...process.env,
    DISABLE_AUTOUPDATER: "1",
  };

  const proc: PipedSubprocess = this._spawn
    ? this._spawn(cmd, { cwd: this._cwd, env })
    : (Bun.spawn(cmd, {
        cwd: this._cwd,
        stdin: "pipe",
        stdout: "pipe",
        stderr: "pipe",
        env,
      }) as unknown as PipedSubprocess);
  this._proc = proc;

  this._alive = true;
  this._emit({ kind: "session_start", timestamp: Date.now() });

  // Background readers; each handles its own errors internally.
  this._readStdout();
  this._readStderr();

  proc.exited.then((code) => {
    // kill() already cleaned up and emitted session_end.
    if (!this._alive) return;
    this._alive = false;
    const stderrText = this._stderr.trim();
    const errMsg = stderrText
      ? `Process exited (code ${code}): ${stderrText.slice(0, 500)}`
      : `Process exited with code ${code}`;
    this._rejectInflight(new Error(errMsg));
    this._rejectQueue(new Error("Session ended"));
    for (const p of this._pending.values()) p.reject(new Error("Session ended"));
    this._pending.clear();
    this._emit({ kind: "session_end", timestamp: Date.now() });
  });

  try {
    await this._handshake();
  } catch (err) {
    // If the process already exited, the watcher above did the cleanup and
    // emitted session_end; only tear down a process that is still running.
    if (this._alive) this.kill();
    throw err;
  }
}
329
+
330
+ // ---------------------------------------------------------------------------
331
+ // send() — queue a turn, resolve on completion (identical control flow)
332
+ // ---------------------------------------------------------------------------
333
+
334
/**
 * Queue one turn and resolve when it completes.
 *
 * A session that has a thread id but no process yet is started on demand.
 * The message is passed through the before-send hooks in order; on the first
 * send the base context (if any) is prepended, since codex has no
 * system-prompt flag. The turn is dispatched immediately when nothing is in
 * flight, otherwise appended to the queue.
 *
 * @param message Prompt text.
 * @param opts    `timeout` overrides the session default for this turn (ms).
 * @throws Error when the session was never started or has already ended.
 */
async send(
  message: string,
  opts?: { timeout?: number },
): Promise<SessionResult> {
  if (!this._proc && this._externalSessionId) {
    await this.start();
  }
  if (!this._proc) throw new Error("Session not started — call start() first");
  if (!this._alive) throw new Error("Session ended");

  const timeout = opts?.timeout ?? this._defaultTimeout;

  let outgoing = message;
  for (const hook of this._beforeSendHooks) {
    outgoing = await hook(outgoing);
  }

  const needsBaseContext = Boolean(this._baseContext) && !this._injectedBaseContext;
  if (needsBaseContext) {
    outgoing = `${this._baseContext}\n\n${outgoing}`;
    this._injectedBaseContext = true;
  }

  const prompt = outgoing;
  return new Promise<SessionResult>((resolve, reject) => {
    const turn: QueuedTurn = { message: prompt, timeout, resolve, reject };
    if (this._inflight) {
      this._queue.push(turn);
      return;
    }
    this._dispatchTurn(turn);
  });
}
366
+
367
+ // ---------------------------------------------------------------------------
368
+ // Private — turn dispatch + queue (mirrors ClaudeCodeSession)
369
+ // ---------------------------------------------------------------------------
370
+
371
/**
 * Make `turn` the in-flight turn, reset the per-turn buffers, hand the prompt
 * to the subclass wire protocol and arm the timeout.
 *
 * Completion, failure and timeout all check that `turn` is still the
 * in-flight turn before acting, so a late settlement of an abandoned turn is
 * ignored. Failure and timeout both advance the queue afterwards; previously
 * the timeout path did not, which left queued turns pending forever.
 *
 * @param turn The turn to run now.
 */
private _dispatchTurn(turn: QueuedTurn): void {
  this._inflight = turn;
  this._turnEvents = [];
  this._resultText = "";

  // Fire-and-forget: the promise settles when the tools/call (or turn) ends.
  this._sendTurn(turn.message)
    .then((tokens) => {
      if (this._inflight !== turn) return; // interrupted / timed out
      if (tokens) {
        this._totalTokens.input += tokens.input;
        this._totalTokens.output += tokens.output;
      }
      this._resolveTurn(tokens);
    })
    .catch((err: Error) => {
      if (this._inflight !== turn) return;
      this._rejectInflight(err);
      this._processNextTurn();
    });

  if (turn.timeout > 0) {
    this._turnTimer = setTimeout(() => {
      if (this._inflight !== turn) return;
      this._turnTimer = null;
      this._rejectInflight(new Error(`Turn timed out after ${turn.timeout}ms`));
      // Keep the queue moving once the timed-out turn has been rejected.
      this._processNextTurn();
    }, turn.timeout);
  }
}
400
+
401
/**
 * Complete the in-flight turn: cancel its timer, bump the turn counter,
 * resolve its promise with the collected text and events, then start the next
 * queued turn.
 *
 * @param tokens Usage reported by the wire protocol. When absent, the tokens
 *   of the first event in this turn that carries any are used instead.
 */
private _resolveTurn(tokens?: { input: number; output: number }): void {
  const finished = this._inflight;
  if (!finished) return;

  if (this._turnTimer) {
    clearTimeout(this._turnTimer);
    this._turnTimer = null;
  }

  this._turns += 1;

  const firstWithTokens = this._turnEvents.find((e) => e.tokens);
  const result: SessionResult = {
    content: this._resultText,
    events: this._turnEvents.slice(),
    tokens: tokens ?? firstWithTokens?.tokens,
    externalSessionId: this._externalSessionId,
  };

  finished.resolve(result);
  this._inflight = null;

  this._processNextTurn();
}
421
+
422
/** Dispatch the oldest queued turn, provided the session is still alive. */
private _processNextTurn(): void {
  if (!this._alive) return;
  const next = this._queue.shift();
  if (next === undefined) return;
  this._dispatchTurn(next);
}
428
+
429
/**
 * Fail the in-flight turn with `err` and cancel its timer. Does nothing when
 * no turn is in flight. The queue is not advanced here; callers decide.
 *
 * @param err Rejection reason for the turn's send() promise.
 */
protected _rejectInflight(err: Error): void {
  const failed = this._inflight;
  if (!failed) return;

  const timer = this._turnTimer;
  if (timer) {
    clearTimeout(timer);
    this._turnTimer = null;
  }

  failed.reject(err);
  this._inflight = null;
}
438
+
439
/**
 * Reject every queued (not yet dispatched) turn with `err` and empty the queue.
 *
 * @param err Rejection reason shared by all queued turns.
 */
protected _rejectQueue(err: Error): void {
  const abandoned = this._queue;
  this._queue = [];
  abandoned.forEach((turn) => turn.reject(err));
}
443
+
444
+ // ---------------------------------------------------------------------------
445
+ // Private — stdout/stderr readers (mirrors ClaudeCodeSession)
446
+ // ---------------------------------------------------------------------------
447
+
448
/**
 * Background loop over the child's stdout: decode chunks, cut the text at
 * newlines and feed every non-blank line to the line processor. Any trailing
 * text without a final newline is processed when the stream ends. A read or
 * processing error rejects the in-flight turn and ends the loop.
 */
private async _readStdout(): Promise<void> {
  const source = this._proc!.stdout.getReader();
  const utf8 = new TextDecoder();
  let carry = "";
  try {
    for (;;) {
      const chunk = await source.read();
      if (chunk.done) break;
      carry += utf8.decode(chunk.value, { stream: true });

      // Consume every complete line; keep the unterminated tail in `carry`.
      let newline = carry.indexOf("\n");
      while (newline !== -1) {
        const line = carry.slice(0, newline);
        carry = carry.slice(newline + 1);
        if (line.trim()) this._processLine(line);
        newline = carry.indexOf("\n");
      }
    }
    if (carry.trim()) this._processLine(carry);
  } catch (err) {
    this._rejectInflight(err as Error);
  }
}
469
+
470
/**
 * Background loop that appends everything the child writes to stderr onto
 * `_stderr` (used later in the process-exit error message). Read errors are
 * deliberately ignored.
 */
private async _readStderr(): Promise<void> {
  const source = this._proc!.stderr.getReader();
  const utf8 = new TextDecoder();
  try {
    for (;;) {
      const chunk = await source.read();
      if (chunk.done) return;
      this._stderr += utf8.decode(chunk.value, { stream: true });
    }
  } catch {
    /* ignore */
  }
}
481
+
482
+ // ---------------------------------------------------------------------------
483
+ // Private — JSON-RPC line processor (shared envelope; payload is per-protocol)
484
+ // ---------------------------------------------------------------------------
485
+
486
/**
 * Handle one line of stdout.
 *
 * Lines that are not JSON, or that parse to something other than an object
 * (e.g. `null` or a bare number), are dropped; previously such a value threw
 * on property access and terminated the stdout read loop.
 *
 * A message with a numeric `id` plus `result` or `error` that matches a
 * pending request settles that request. Everything else is handed to the
 * subclass classifier; each resulting event is emitted, recorded for the
 * current turn, used to update the turn's result text (for `result` / `text`
 * events with non-empty text) and added to the token totals.
 *
 * @param line One newline-delimited frame from the child process.
 */
private _processLine(line: string): void {
  let parsed: unknown;
  try {
    parsed = JSON.parse(line);
  } catch {
    return;
  }
  if (typeof parsed !== "object" || parsed === null) return;
  const raw = parsed as Record<string, unknown>;

  // JSON-RPC response to one of our requests (has matching `id` + result/error).
  if (typeof raw.id === "number" && ("result" in raw || "error" in raw)) {
    const id = raw.id;
    const pending = this._pending.get(id);
    if (pending) {
      this._pending.delete(id);
      const rpcError = raw.error;
      if (rpcError) {
        const detail =
          typeof rpcError === "object"
            ? (rpcError as Record<string, unknown>).message
            : undefined;
        // Fall back to the serialized error when `message` is missing or not text.
        pending.reject(
          new Error(typeof detail === "string" ? detail : JSON.stringify(rpcError)),
        );
      } else {
        pending.resolve(raw.result);
      }
      return;
    }
  }

  // Otherwise it's a notification (streamed event) — classify it.
  for (const event of this._classify(raw)) {
    this._emit(event);
    this._turnEvents.push(event);
    if ((event.kind === "result" || event.kind === "text") && event.text) {
      this._resultText = event.text;
    }
    if (event.tokens) {
      this._totalTokens.input += event.tokens.input;
      this._totalTokens.output += event.tokens.output;
    }
  }
}
524
+
525
+ // ---------------------------------------------------------------------------
526
+ // Private — emit (identical to ClaudeCodeSession)
527
+ // ---------------------------------------------------------------------------
528
+
529
/**
 * Append `event` to the session log and deliver it to every subscriber.
 *
 * Delivery iterates over a snapshot of the handler list, so a handler that
 * unsubscribes (itself or another) while being called cannot cause a
 * neighbouring handler to be skipped for this event. A throwing handler is
 * logged with `console.warn` and does not stop delivery to the others.
 *
 * @param event The event to record and broadcast.
 */
protected _emit(event: SessionEvent): void {
  this._eventLog.push(event);
  for (const handler of [...this._handlers]) {
    try {
      handler(event);
    } catch (err) {
      // Handlers may throw anything, not only Error instances.
      const detail = err instanceof Error ? err.message : String(err);
      console.warn(`[${this.constructor.name}] handler error:`, detail);
    }
  }
}
542
+
543
+ // ---------------------------------------------------------------------------
544
+ // Private — JSON-RPC request helper (resolves on the matching response)
545
+ // ---------------------------------------------------------------------------
546
+
547
/**
 * Send a JSON-RPC 2.0 request as one newline-terminated frame on stdin.
 *
 * @param method JSON-RPC method name.
 * @param params Optional parameters.
 * @returns A promise settled by the line processor when a response with the
 *   same id arrives; rejected immediately if writing to stdin throws.
 */
protected _rpcRequest(method: string, params?: unknown): Promise<unknown> {
  this._rpcId += 1;
  const id = this._rpcId;
  const frame = `${JSON.stringify({ jsonrpc: "2.0", id, method, params })}\n`;

  return new Promise<unknown>((resolve, reject) => {
    // Register before writing so the waiter is in place when the reply arrives.
    this._pending.set(id, { resolve, reject });
    try {
      const stdin = this._proc!.stdin;
      stdin.write(frame);
      stdin.flush();
    } catch (err) {
      this._pending.delete(id);
      reject(err as Error);
    }
  });
}
561
+
562
/**
 * Send a JSON-RPC 2.0 notification (no id, so no response is tracked).
 * Write errors propagate to the caller.
 *
 * @param method JSON-RPC method name.
 * @param params Optional parameters.
 */
protected _rpcNotify(method: string, params?: unknown): void {
  const frame = `${JSON.stringify({ jsonrpc: "2.0", method, params })}\n`;
  const stdin = this._proc!.stdin;
  stdin.write(frame);
  stdin.flush();
}
567
+
568
+ // ---------------------------------------------------------------------------
569
+ // Subclass hooks — the only protocol-specific surface
570
+ // ---------------------------------------------------------------------------
571
+
572
/** Arguments placed after the binary when the persistent process is spawned. */
protected abstract _buildSpawnArgs(): string[];

/** Protocol handshake, awaited by start() right after the process is spawned. */
protected abstract _handshake(): Promise<void>;

/**
 * Send one turn over the wire.
 *
 * @param message Prompt text for the turn.
 * @returns Token usage for the turn once it completes, or undefined when
 *   none is reported.
 */
protected abstract _sendTurn(
  message: string,
): Promise<{ input: number; output: number } | undefined>;

/**
 * Translate one incoming message that did not settle a pending request into
 * zero or more SessionEvents.
 */
protected abstract _classify(msg: Record<string, unknown>): SessionEvent[];
582
+
583
/**
 * Create a new, not-yet-started session of the same concrete class that
 * continues this session's codex thread. All settings are inherited; `cwd`
 * and `baseContext` may be overridden.
 *
 * @param opts Optional overrides for the forked session.
 * @throws Error when no thread id is known yet.
 */
fork(opts?: { cwd?: string; baseContext?: string }): HarnessSession {
  const threadId = this._externalSessionId;
  if (!threadId) {
    throw new Error(
      "Cannot fork — no thread ID yet (send at least one message first)",
    );
  }

  const inherited: CodexSessionConfig = {
    bin: this._bin,
    model: this._model,
    effort: this._effort,
    cwd: opts?.cwd ?? this._cwd,
    timeout: this._defaultTimeout,
    baseContext: opts?.baseContext ?? this._baseContext,
    externalSessionId: threadId,
    spawn: this._spawn,
  };

  // Instantiate through the runtime constructor so the subclass is preserved.
  const SameClass = this.constructor as new (
    c?: CodexSessionConfig,
  ) => BaseCodexSession;
  const child = new SameClass(inherited);
  child._forking = true;
  return child;
}
604
+ }
605
+
606
+ // ---------------------------------------------------------------------------
607
+ // CodexMcpSession (DEFAULT) — `codex mcp-server` over stdio MCP JSON-RPC
608
+ // ---------------------------------------------------------------------------
609
+ //
610
+ // Handshake: initialize → notifications/initialized → tools/list.
611
+ // A turn is an MCP `tools/call`:
612
+ // - first turn → tool "codex" (params: prompt, model, cwd, ...)
613
+ // - subsequent turns → tool "codex-reply" (params: prompt, threadId)
614
+ // During the call, codex streams `codex/event` notifications (agent message,
615
+ // reasoning, command execution, token usage) — mapped to SessionEvents. The
616
+ // threadId comes back in the call result's structuredContent (captured for
617
+ // multi-turn + fork).
618
+
619
/**
 * Session backed by `codex mcp-server`, speaking MCP JSON-RPC over stdio.
 *
 * The first turn calls the `codex` tool; once a thread id is known, turns call
 * `codex-reply` with that id. Streamed `codex/event` notifications are mapped
 * to SessionEvents by the shared classifier.
 */
export class CodexMcpSession extends BaseCodexSession {
  /**
   * Spawn arguments: the `mcp-server` subcommand with codex's own sandbox and
   * approvals disabled via `-c` overrides, plus the reasoning effort when set.
   */
  protected _buildSpawnArgs(): string[] {
    // Disable codex's own approvals + sandbox so OUR container is the jail.
    const args = [
      "mcp-server",
      "-c", `sandbox_mode="danger-full-access"`,
      "-c", `approval_policy="never"`,
    ];
    if (this._effort) {
      args.push("-c", `model_reasoning_effort="${this._effort}"`);
    }
    return args;
  }

  /** MCP handshake: initialize → notifications/initialized → tools/list. */
  protected async _handshake(): Promise<void> {
    await this._rpcRequest("initialize", {
      protocolVersion: "2025-06-18",
      capabilities: {},
      clientInfo: { name: "inixiative-bench", version: "0.1.0" },
    });
    this._rpcNotify("notifications/initialized");
    // tools/list confirms the `codex` + `codex-reply` tools are present.
    await this._rpcRequest("tools/list", {});
  }

  /**
   * Run one turn as an MCP `tools/call` and fold the call result into the
   * session state.
   *
   * - The thread id from `structuredContent.threadId` is captured if none is
   *   known yet.
   * - If the turn that issued this call is no longer in flight (it timed out
   *   or was interrupted), the late result is otherwise discarded, so it
   *   cannot overwrite the result text of a newer turn.
   * - A result flagged `isError` is surfaced as an "error" event; the turn
   *   still resolves with the returned text.
   * - Fields of `structuredContent` are type-checked instead of cast.
   *
   * @param message Prompt text for the turn.
   * @returns Usage from the call result, but only when no streamed event in
   *   this turn already carried tokens (avoids double counting).
   */
  protected async _sendTurn(
    message: string,
  ): Promise<{ input: number; output: number } | undefined> {
    // Remember which turn issued this call so a late reply can be recognised.
    const issuingTurn = this._inflight;

    const isReply = this._externalSessionId !== undefined;
    const name = isReply ? "codex-reply" : "codex";
    const args: Record<string, unknown> = isReply
      ? { prompt: message, threadId: this._externalSessionId }
      : {
          prompt: message,
          model: this._model,
          cwd: this._cwd,
          // Belt-and-suspenders: also disable per-call (matches spawn `-c` flags).
          sandbox: "danger-full-access",
          "approval-policy": "never",
          ...(this._effort
            ? { config: { model_reasoning_effort: this._effort } }
            : {}),
        };

    const result = this._asRecord(
      await this._rpcRequest("tools/call", { name, arguments: args }),
    );
    const structured = this._asRecord(result?.structuredContent);

    // Capture the threadId for multi-turn (codex-reply) + fork.
    const threadId = structured?.threadId;
    if (typeof threadId === "string" && threadId && !this._externalSessionId) {
      this._externalSessionId = threadId;
    }

    // Stale reply for an abandoned turn: leave the current turn's state alone.
    if (this._inflight !== issuingTurn) return undefined;

    // Final text: prefer structuredContent.content, else the tool result content.
    const structuredText = structured?.content;
    const finalText =
      typeof structuredText === "string"
        ? structuredText
        : this._extractToolText(result?.content);
    if (finalText) this._resultText = finalText;

    if (result?.isError === true) {
      const failure: SessionEvent = {
        kind: "error",
        timestamp: Date.now(),
        text: finalText ?? `codex tool "${name}" returned isError`,
        raw: result,
      };
      this._emit(failure);
      this._turnEvents.push(failure);
    }

    const sawStreamedTokens = this._turnEvents.some((e) => e.tokens);
    const usage = this._asRecord(structured?.usage);
    if (usage && !sawStreamedTokens) {
      return {
        input: this._readCount(usage, "input_tokens", "inputTokens"),
        output: this._readCount(usage, "output_tokens", "outputTokens"),
      };
    }
    return undefined;
  }

  /** Narrow an unknown value to a plain record, or undefined if it is not an object. */
  private _asRecord(value: unknown): Record<string, unknown> | undefined {
    return typeof value === "object" && value !== null
      ? (value as Record<string, unknown>)
      : undefined;
  }

  /** First numeric value among `keys` in `usage`, or 0 when none is a number. */
  private _readCount(usage: Record<string, unknown>, ...keys: string[]): number {
    for (const key of keys) {
      const value = usage[key];
      if (typeof value === "number") return value;
    }
    return 0;
  }

  /** Join the `text` of every `{ type: "text" }` block; undefined when there is none. */
  private _extractToolText(content: unknown): string | undefined {
    if (!Array.isArray(content)) return undefined;
    const parts: string[] = [];
    for (const block of content) {
      const b = this._asRecord(block);
      if (b && b.type === "text" && typeof b.text === "string") parts.push(b.text);
    }
    return parts.length ? parts.join("\n") : undefined;
  }

  /**
   * Map a `codex/event` notification to SessionEvents. The event payload is
   * `params.msg` when present, otherwise `params` itself. Any other method
   * yields no events.
   */
  protected _classify(msg: Record<string, unknown>): SessionEvent[] {
    if (msg.method !== "codex/event") return [];
    const params = msg.params as Record<string, unknown> | undefined;
    const ev = (params?.msg ?? params) as Record<string, unknown> | undefined;
    if (!ev) return [];
    return classifyCodexEvent(ev, "type");
  }
}
720
+
721
+ // ---------------------------------------------------------------------------
722
+ // CodexAppServerSession (EXPERIMENTAL) — `codex app-server` over WebSocket
723
+ // ---------------------------------------------------------------------------
724
+ //
725
+ // EXPERIMENTAL: the app-server protocol is newer and less battle-tested than
726
+ // mcp-server. Prefer CodexMcpSession unless you specifically need app-server.
727
+ //
728
+ // `codex app-server --listen ws://127.0.0.1:<port>` — localhost needs no auth
729
+ // token. JSON-RPC 2.0 with slash-delimited methods:
730
+ // initialize → initialized → thread/start (returns thread.id) → turn/start
731
+ // Consume turn/started, item/started, item/completed, turn/completed
732
+ // notifications (ThreadItem types agentMessage / reasoning / commandExecution;
733
+ // ThreadTokenUsage). Mapped to SessionEvents.
734
+ //
735
+ // We connect over the spawned process's stdin/stdout JSON-RPC for parity with
736
+ // the rest of the harness (the WebSocket listen address is for external
737
+ // clients; the stdio channel carries the same JSON-RPC frames). The base
738
+ // class's read/write loop is reused unchanged.
739
+
740
/**
 * EXPERIMENTAL session backed by `codex app-server`.
 *
 * JSON-RPC frames are exchanged over the spawned process's stdin/stdout using
 * the base class's read/write loop. A turn is `thread/start` (once) followed
 * by `turn/start`; the turn completes when a `turn/completed` notification
 * arrives.
 */
export class CodexAppServerSession extends BaseCodexSession {
  /** Settle callbacks for the turn currently waiting on `turn/completed`. */
  private _turnDone?: {
    resolve: (t: { input: number; output: number } | undefined) => void;
    reject: (e: Error) => void;
  };

  /**
   * Spawn arguments for app-server. Approvals/sandbox are set per-thread in
   * thread/start rather than here.
   */
  protected _buildSpawnArgs(): string[] {
    // NOTE(review): the process is asked to listen on a WebSocket address while
    // this class talks over stdio — confirm codex serves both.
    return ["app-server", "--listen", "ws://127.0.0.1:0"];
  }

  /** Handshake: initialize → initialized. */
  protected async _handshake(): Promise<void> {
    await this._rpcRequest("initialize", {
      protocolVersion: "2025-06-18",
      capabilities: {},
      clientInfo: { name: "inixiative-bench", version: "0.1.0" },
    });
    this._rpcNotify("initialized");
  }

  /**
   * Run one turn: open a thread if none is known, send `turn/start`, then wait
   * for `turn/completed`.
   *
   * Fails fast when `thread/start` yields no thread id (instead of sending a
   * `turn/start` without one), and disarms the completion slot when
   * `turn/start` itself is rejected so it cannot be triggered on behalf of a
   * turn that never started.
   *
   * @param message Prompt text for the turn.
   * @returns Token usage carried by `turn/completed`, if any.
   * @throws Error when no thread id could be obtained, or when a request is rejected.
   */
  protected async _sendTurn(
    message: string,
  ): Promise<{ input: number; output: number } | undefined> {
    // Start (or reuse) a thread. Disable codex's approvals + sandbox so OUR
    // container is the only jail.
    if (!this._externalSessionId) {
      const started = (await this._rpcRequest("thread/start", {
        model: this._model,
        cwd: this._cwd,
        approvalPolicy: "never",
        sandbox: "danger-full-access",
        ...(this._effort ? { modelReasoningEffort: this._effort } : {}),
      })) as Record<string, unknown> | undefined;
      const thread = started?.thread as Record<string, unknown> | undefined;
      const id = thread?.id ?? started?.threadId;
      if (typeof id === "string" && id) this._externalSessionId = id;
    }
    if (!this._externalSessionId) {
      throw new Error("codex app-server: thread/start returned no thread id");
    }

    // Arm the completion slot BEFORE turn/start so an early turn/completed
    // notification is not missed.
    let slot: NonNullable<CodexAppServerSession["_turnDone"]> | undefined;
    const completed = new Promise<{ input: number; output: number } | undefined>(
      (resolve, reject) => {
        slot = { resolve, reject };
        this._turnDone = slot;
      },
    );

    try {
      // turn/start streams turn/started, item/*, turn/completed back as
      // notifications, consumed in _classify; turn/completed resolves the turn.
      await this._rpcRequest("turn/start", {
        threadId: this._externalSessionId,
        input: message,
      });
    } catch (err) {
      if (this._turnDone === slot) this._turnDone = undefined;
      throw err;
    }

    return completed;
  }

  /**
   * Map app-server notifications to SessionEvents.
   *
   * - `turn/completed` settles the waiting turn with its usage (if present)
   *   and yields a `result` event carrying the current result text. The event
   *   has no `tokens`: they are returned through _sendTurn and counted once
   *   by the dispatcher.
   * - `item/started` / `item/completed` are passed to the shared classifier,
   *   using `params.item` when present, otherwise `params`.
   * - Anything else yields no events.
   */
  protected _classify(msg: Record<string, unknown>): SessionEvent[] {
    const method = msg.method as string | undefined;
    if (!method) return [];
    const params = (msg.params ?? {}) as Record<string, unknown>;

    if (method === "turn/completed") {
      const usage = (params.usage ?? params.tokenUsage) as
        | Record<string, unknown>
        | undefined;
      const count = (camel: string, snake: string): number => {
        const value = usage?.[camel] ?? usage?.[snake];
        return typeof value === "number" ? value : 0;
      };
      const tokens = usage
        ? {
            input: count("inputTokens", "input_tokens"),
            output: count("outputTokens", "output_tokens"),
          }
        : undefined;
      this._turnDone?.resolve(tokens);
      this._turnDone = undefined;
      return [{ kind: "result", timestamp: Date.now(), text: this._resultText, raw: msg }];
    }

    if (method === "item/completed" || method === "item/started") {
      const item = (params.item ?? params) as Record<string, unknown>;
      return classifyCodexEvent(item, "type");
    }

    // turn/started and other lifecycle notifications carry no turn content.
    return [];
  }
}
827
+
828
+ // ---------------------------------------------------------------------------
829
+ // Shared event mapping — codex item/event → SessionEvent
830
+ // ---------------------------------------------------------------------------
831
+ //
832
+ // Both protocols carry the same ThreadItem / event shapes (agent message,
833
+ // reasoning, command execution, token usage). `tag` is the discriminant field
834
+ // ("type" for both mcp `codex/event.msg.type` and app-server `item.type`).
835
+ //
836
+ // agent_message / agentMessage → text
837
+ // reasoning / agent_reasoning → thinking
838
+ // command_execution / commandExecution → tool_use (+ tool_result when done)
839
+ // token_count / usage → tokens (attached to a result event)
840
+
841
/**
 * Normalises a codex text payload to a string.
 *
 * Strings pass through unchanged; arrays are flattened (empty parts dropped)
 * and joined with newlines; objects contribute their string `text` or
 * `message` field when they have one and are JSON-encoded otherwise; other
 * primitives are stringified.
 *
 * @param value - The raw payload taken from an event field.
 * @returns The text, or `undefined` when the payload is null/undefined.
 */
function coerceCodexText(value: unknown): string | undefined {
  if (value == null) return undefined;
  if (typeof value === "string") return value;
  if (Array.isArray(value)) {
    return value
      .map((part) => coerceCodexText(part))
      .filter((part): part is string => typeof part === "string" && part.length > 0)
      .join("\n");
  }
  if (typeof value === "object") {
    const record = value as Record<string, unknown>;
    if (typeof record.text === "string") return record.text;
    if (typeof record.message === "string") return record.message;
    return JSON.stringify(value);
  }
  return String(value);
}

/**
 * Maps a single codex item/event onto zero or more SessionEvents.
 *
 * @param ev - The raw codex item or event object.
 * @param tag - Name of the field on `ev` holding the discriminant
 *   (compared case-insensitively).
 * @returns The derived events; empty for unrecognised or content-less input.
 */
function classifyCodexEvent(
  ev: Record<string, unknown>,
  tag: string,
): SessionEvent[] {
  const ts = Date.now();
  const type = String(ev[tag] ?? "").toLowerCase();
  const events: SessionEvent[] = [];

  // Agent message → text.
  if (type === "agent_message" || type === "agentmessage" || type === "agent_message_delta") {
    const text = coerceCodexText(ev.message ?? ev.text ?? ev.delta);
    if (text) events.push({ kind: "text", timestamp: ts, text, raw: ev });
    return events;
  }

  // Reasoning → thinking. The payload may be a list of parts rather than a
  // single string, hence the coercion.
  if (
    type === "reasoning" ||
    type === "agent_reasoning" ||
    type === "agentreasoning" ||
    type === "agent_reasoning_delta"
  ) {
    const text = coerceCodexText(ev.text ?? ev.reasoning ?? ev.delta ?? ev.summary);
    if (text) events.push({ kind: "thinking", timestamp: ts, text, raw: ev });
    return events;
  }

  // Command execution → tool_use, plus tool_result once output or an exit
  // code is actually present.
  if (
    type === "command_execution" ||
    type === "commandexecution" ||
    type === "exec_command_begin" ||
    type === "exec_command_end"
  ) {
    const command = ev.command ?? ev.cmd;
    const cmdStr = Array.isArray(command)
      ? command.join(" ")
      : typeof command === "string"
        ? command
        : undefined;
    events.push({
      kind: "tool_use",
      timestamp: ts,
      toolName: "shell",
      toolInput: cmdStr ? { command: cmdStr } : (ev as Record<string, unknown>),
      raw: ev,
    });

    // Accept both snake_case and camelCase field names.
    const output = ev.output ?? ev.stdout ?? ev.aggregated_output ?? ev.aggregatedOutput;
    const exitCode = ev.exit_code ?? ev.exitCode;
    // `null` means "not available yet" (command still running), exactly like
    // an absent field — it must not be reported as a failed result.
    const hasOutput = output != null;
    const hasExitCode = exitCode != null;
    if (hasOutput || hasExitCode) {
      events.push({
        kind: "tool_result",
        timestamp: ts,
        toolOutput: coerceCodexText(output) ?? "",
        toolError: hasExitCode && exitCode !== 0,
        raw: ev,
      });
    }
    return events;
  }

  // Token usage → a text-less `result` event carrying `tokens`, so that
  // accounting picks it up.
  if (
    type === "token_count" ||
    type === "token_usage" ||
    type === "usage" ||
    type === "tokenusage"
  ) {
    const u = (ev.info ?? ev.usage ?? ev) as Record<string, unknown>;
    const input =
      (u.input_tokens ?? u.inputTokens ?? u.total_input_tokens ?? 0) as number;
    const output =
      (u.output_tokens ?? u.outputTokens ?? u.total_output_tokens ?? 0) as number;
    // All-zero usage carries no information; skip it.
    if (input || output) {
      events.push({
        kind: "result",
        timestamp: ts,
        tokens: { input, output },
        raw: ev,
      });
    }
    return events;
  }

  // Errors. Fall back to the serialised event when no message is available.
  if (type === "error" || type === "stream_error") {
    events.push({
      kind: "error",
      timestamp: ts,
      text: coerceCodexText(ev.message ?? ev.error) ?? JSON.stringify(ev),
      raw: ev,
    });
    return events;
  }

  // Unrecognised event types produce no SessionEvent and so stay out of the
  // turn content.
  return events;
}
938
+
939
+ // ---------------------------------------------------------------------------
940
+ // Default export selection
941
+ // ---------------------------------------------------------------------------
942
+
943
+ /**
944
+ * Default CodexSession alias: resolves to CodexMcpSession, the stdio `codex mcp-server` variant.
945
+ * Import CodexAppServerSession explicitly to use the experimental `codex app-server` variant.
946
+ */
947
+ export const CodexSession = CodexMcpSession;