optio-codex 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
optio_codex/session.py ADDED
@@ -0,0 +1,731 @@
1
+ """State machine for one optio-codex session (Stage 0: iframe/ttyd, local)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import inspect
7
+ import logging
8
+ import mimetypes
9
+ import os
10
+ import re
11
+ import secrets
12
+ import shlex
13
+ from typing import AsyncIterator
14
+
15
+ from optio_core.context import ProcessContext
16
+ from optio_core.models import BasicAuth, TaskInstance
17
+
18
+ from optio_agents import HookContext, RESUME_NOTICE, SYSTEM_MESSAGE_PREFIX, get_protocol
19
+ from optio_agents import seeds as _seeds
20
+ from optio_agents.protocol.session import _SessionFailed, run_log_protocol_session
21
+ from optio_host.host import Host, LocalHost, ProcessHandle, proc_wait
22
+ from optio_host.paths import task_dir
23
+
24
+ from optio_codex import cred_watcher, host_actions
25
+ from optio_codex import models as codex_models
26
+ from optio_codex.conversation import CodexConversation
27
+ from optio_codex.conversation_listener import ConversationListener
28
+ from optio_codex.fs_allowlist import (
29
+ SandboxSettings,
30
+ build_sandbox_cli_args,
31
+ build_sandbox_config_overrides,
32
+ resolve_sandbox_settings,
33
+ )
34
+ from optio_codex.prompt import compose_agents_md
35
+ from optio_codex.seed_manifest import CODEX_SEED_MANIFEST, CODEX_SEED_SUFFIX
36
+ from optio_codex.snapshots import (
37
+ effective_workdir_exclude,
38
+ insert_snapshot,
39
+ load_latest_snapshot,
40
+ prune_snapshots,
41
+ )
42
+ from optio_codex.types import CodexTaskConfig
43
+
44
+
45
+ _LOG = logging.getLogger(__name__)
46
+ READY_TIMEOUT_S = 30.0
47
+
48
+
49
def _build_host(config: CodexTaskConfig, process_id: str) -> Host:
    """Build the Host for one task from its SSH settings and task directory.

    The task directory is derived by ``task_dir`` from the SSH settings, the
    process id and the fixed consumer name ``optio-codex``.
    """
    ssh_settings = config.ssh
    return host_actions.build_host(
        ssh_settings,
        task_dir(
            ssh=ssh_settings,
            process_id=process_id,
            consumer_name="optio-codex",
        ),
    )
54
+
55
+
56
+ async def _call_maybe_async(fn, *args) -> None:
57
+ """Invoke a callback that may be sync or async."""
58
+ result = fn(*args)
59
+ if inspect.isawaitable(result):
60
+ await result
61
+
62
+
63
+ def _teardown_aggressive(*, cancelled: bool, seeded: bool) -> bool:
64
+ """Whether to SIGKILL codex immediately on teardown vs SIGTERM-and-wait.
65
+
66
+ A **seeded** session is torn down GRACEFULLY even on cancel: codex's
67
+ single-use ChatGPT refresh token may have rotated this session, and codex's
68
+ auth.json write is best-effort — an aggressive SIGKILL can beat the flush,
69
+ stranding the rotation so the credential save-back persists the now-spent
70
+ token and the next launch demands re-auth. SIGTERM-and-wait lets codex
71
+ flush first. A non-seeded session keeps the fast aggressive kill on cancel.
72
+ """
73
+ return cancelled and not seeded
74
+
75
+
76
async def run_codex_session(ctx: ProcessContext, config: CodexTaskConfig) -> None:
    """Execute function body for one optio-codex task instance.

    Connects the host, then hands ``_prepare`` plus the mode-specific body
    (``_conversation_body`` when ``config.mode == "conversation"``, otherwise
    ``_codex_body``) to ``run_log_protocol_session``. Teardown runs in a
    ``finally`` block in a fixed order: conversation listener, codex process
    (or tmux/ttyd tree), credential watcher, final credential save-back,
    lease release, seed capture, resume snapshot capture, task-dir cleanup,
    host disconnect. Every teardown step is individually guarded so one
    failing step cannot skip the ones after it.

    Raises:
        RuntimeError: when the protocol session reports ``_SessionFailed``.
    """
    host: Host = _build_host(config, ctx.process_id)
    # redirect (not suppress): codex's first-launch `codex login` opens the
    # loopback OAuth URL via xdg-open; the redirect shim captures it as a
    # BROWSER: marker so the driver surfaces it to the operator (who completes
    # the sign-in), instead of silently swallowing it. Matches claudecode/grok.
    protocol = get_protocol(browser="redirect")
    launched_handle: ProcessHandle | None = None
    tmux_path: str | None = None
    tmux_socket: str | None = None
    tmux_session: str | None = None
    codex_path: str | None = None
    ttyd_path: str | None = None
    # Stage 8: the task's resolved native-sandbox posture (mode + writable
    # roots + network), computed ONCE in _prepare from config + the real host
    # home, then rendered into every launch surface (iframe/exec argv via
    # build_sandbox_cli_args; the codex app-server launch via thread/start's
    # `sandbox` mode + build_sandbox_config_overrides in the conversation body).
    sandbox_settings: SandboxSettings | None = None
    cancelled = False
    # Whether a snapshot was restored this run (suppresses the auto-start
    # positional). Set by _prepare, read by the body.
    resuming = False
    # The session/rollout id recorded in the restored snapshot; drives the
    # `codex resume <id>` relaunch. None ⇒ fresh codex session even when the
    # workdir was restored (the snapshot predates any rollout).
    resume_session_id: str | None = None
    # Resolved seed id for a fresh, seeded launch (Stage 3). Set by _prepare
    # (str seed_id → itself; SeedProvider callable → awaited). Stays None on
    # resume and when no seed_id is configured.
    resolved_seed_id: str | None = None
    # Stage 4 lease + credential save-back. ``lease_holder`` is the task's
    # process_id when the seed came from a lease-holding SeedProvider
    # (renewed by the watcher, released at teardown). ``cred_baseline`` is
    # the post-merge auth.json fingerprint the watcher/backstop diff against.
    lease_holder: str | None = None
    cred_baseline: str | None = None
    cred_watch_task: "asyncio.Task | None" = None
    # Conversation mode (Stage 6). The CodexConversation is constructed inside
    # _conversation_body (it needs resume_thread_id, resolved by _prepare) and
    # published via ctx.publish_result; the per-task SSE listener (conversation_ui
    # only) is started after publish and torn down first in the finally block.
    conversation: CodexConversation | None = None
    conv_listener: ConversationListener | None = None

    await host.connect()

    async def _prepare(host: Host, hook_ctx: HookContext) -> None:
        """Restore a resume snapshot, provision codex + ttyd, plant AGENTS.md.

        Handed to run_log_protocol_session, which runs it AFTER
        host.setup_workdir() has wiped the workdir and BEFORE it subscribes
        the optio.log tail. That ordering is why the restore belongs here:
        the restored optio.log is rotated away below before the tail can
        replay its stale DONE/ERROR, and AGENTS.md is planted AFTER the
        restore so the restore cannot wipe it.

        Restore runs BEFORE ensure_codex_installed — a deliberate divergence
        from the grok template (which ensures first): codex's launch path is
        the per-task symlink INSIDE the workdir
        (<workdir>/home/.local/bin/codex), and restore_workdir empties the
        workdir before extracting. Provisioning after the restore re-creates
        the home tree and re-points the symlink (mkdir -p / ln -sfn are
        idempotent), so the launch path can never dangle.
        """
        nonlocal codex_path, ttyd_path, resuming, resume_session_id
        nonlocal resolved_seed_id, lease_holder, cred_baseline
        nonlocal sandbox_settings

        resume_requested = bool(getattr(ctx, "resume", False))
        snapshot = None
        if resume_requested:
            snapshot = await load_latest_snapshot(
                ctx._db, ctx._prefix, ctx.process_id,
            )
        resuming = snapshot is not None
        if resuming:
            # Restore the workdir tar (carries home/.codex — sessions/,
            # auth, config). A present snapshot that fails to restore is
            # fatal — the call is intentionally outside any except so it
            # surfaces to the caller (no silent fresh-start).
            await host.restore_workdir(_stream_blob(ctx, snapshot["workdirBlobId"]))
            await host_actions._rotate_optio_log(host)
            resume_session_id = snapshot.get("sessionId")
            if resume_session_id is None:
                _LOG.warning(
                    "resume: snapshot for %s carries no sessionId (codex never "
                    "persisted a rollout in that run); the workdir is restored "
                    "but codex starts a FRESH session — explicit-id resume "
                    "only, never `resume --last` (it silently mints a new "
                    "session on a miss).",
                    ctx.process_id,
                )

        codex_path = await host_actions.ensure_codex_installed(
            hook_ctx,
            install_if_missing=config.install_if_missing,
            install_dir=config.codex_install_dir,
        )
        # Stage 8: resolve the native-sandbox posture ONCE. ``~/`` grants
        # expand against the REAL host home (codex runs under an isolated
        # $HOME), so the settings need it up front; every launch surface
        # renders from this single object.
        host_home = await host.resolve_host_home()
        sandbox_settings = resolve_sandbox_settings(config, host_home=host_home)
        # Conversation mode is headless (codex app-server stdio) — no ttyd.
        if config.mode != "conversation":
            ttyd_path = await host_actions.ensure_ttyd_installed(
                hook_ctx,
                install_if_missing=config.install_ttyd_if_missing,
                install_dir=config.ttyd_install_dir,
            )
        if not resuming and config.seed_id is not None:
            # Seeded FRESH start: resolve the seed id (str → itself; a
            # SeedProvider callable → awaited, may raise
            # SeedUnavailableError) and overlay the stored codex identity
            # (auth.json + config.toml) into the fresh workdir BEFORE
            # AGENTS.md, so codex launches already-authed. Codex auth/config
            # are cwd-independent, so no rekey is needed — but the new
            # workdir must be pre-trusted (cwd-dependent, hence a post-merge
            # edit here rather than a manifest transform).
            if callable(config.seed_id):
                # A SeedProvider leases a seed from the pool (holder =
                # process_id); the watcher renews the lease, teardown
                # releases it. A plain string carries no lease.
                resolved_seed_id = await config.seed_id(ctx.process_id)
                lease_holder = ctx.process_id
            else:
                resolved_seed_id = config.seed_id
            await _seeds.merge_seed(
                ctx, host,
                seed_id=resolved_seed_id,
                manifest=CODEX_SEED_MANIFEST,
                suffix=CODEX_SEED_SUFFIX,
                decrypt=None,
            )
            await host_actions.ensure_workdir_trusted(host)
            # Baseline the merged auth.json so the in-session watcher and
            # the teardown backstop only save back a genuinely rotated token.
            cred_baseline = await cred_watcher.cred_fingerprint(host)
        await host.write_text(
            "AGENTS.md",
            compose_agents_md(
                config.consumer_instructions,
                documentation=protocol.documentation if config.host_protocol else None,
                host_protocol=config.host_protocol,
                workdir_exclude=config.workdir_exclude,
                supports_resume=config.supports_resume,
                file_download=config.file_download,
            ),
        )
        if config.supports_resume:
            await host_actions._append_resume_log_entry(host)
        if config.before_execute is not None:
            # End-of-prepare placement matches claudecode (its
            # _plant_session_content ends with before_execute, inside its
            # _prepare); opencode fires it inside the body instead.
            await config.before_execute(hook_ctx)

    async def _codex_body(host: Host, hook_ctx: HookContext) -> None:
        """Iframe-mode body: launch codex under ttyd/tmux and wait for it.

        Publishes the ttyd port through a host tunnel as the widget
        upstream, starts the credential watcher for a seeded session, then
        polls once per second until the task is asked to stop or the tmux
        session is no longer alive.
        """
        nonlocal launched_handle, tmux_path, tmux_socket, tmux_session
        nonlocal cred_watch_task

        bind_addr = os.environ.get("OPTIO_WIDGET_TUNNEL_BIND", "127.0.0.1")
        upstream_host = os.environ.get("OPTIO_WIDGET_TUNNEL_HOST", "127.0.0.1")
        ttyd_iface = bind_addr if isinstance(host, LocalHost) else "127.0.0.1"

        codex_flags = [
            # `codex resume <id>` is a SUBCOMMAND — it must precede the flags.
            *host_actions.build_resume_args(resume_session_id),
            *host_actions.build_codex_flags(
                model=config.model,
                ask_for_approval=config.ask_for_approval,
                sandbox_args=build_sandbox_cli_args(sandbox_settings),
            ),
            # Positional kickoff prompt: fresh launches only (suppressed when
            # a snapshot was restored — re-kicking would duplicate the task).
            *host_actions.build_auto_start_args(
                auto_start=config.auto_start, resuming=resuming,
            ),
            # PUSH resume awareness (Gap 1): a System: notice positional appended
            # after `resume <id>` + flags so the resumed TUI session gets a "you
            # have been resumed" turn (mutually exclusive with the fresh-launch
            # kickoff above). Parity with claudecode/opencode/grok; resume.log
            # stays the pull-based backstop.
            *host_actions.build_resume_notice_args(resuming=resuming),
        ]
        launch_env = {
            **(config.env or {}),
            **(hook_ctx.browser_launch_env or {}),
        }
        ctx.report_progress(None, "Launching Codex…")
        handle, tmux_path_local, ttyd_port, tmux_socket, tmux_session = await host_actions.launch_ttyd_with_codex(
            host,
            ttyd_path=ttyd_path,
            codex_path=codex_path,
            bind_iface=ttyd_iface,
            extra_env=launch_env,
            codex_flags=codex_flags,
            ready_timeout_s=READY_TIMEOUT_S,
            env_remove=config.scrub_env,
        )
        launched_handle = handle
        tmux_path = tmux_path_local

        worker_port = await host.establish_tunnel(ttyd_port, bind_addr=bind_addr)
        await ctx.set_widget_upstream(f"http://{upstream_host}:{worker_port}")
        await ctx.set_widget_data({
            "iframeSrc": "{widgetProxyUrl}/",
        })
        ctx.report_progress(None, "Codex is live")

        # Start the in-session credential watcher for a seeded session: it
        # saves back the rotated auth.json, and (when the seed is leased)
        # renews the lease and aborts the session on lease loss.
        if resolved_seed_id is not None:
            cred_watch_task = asyncio.create_task(
                cred_watcher.run_credential_watcher(
                    ctx, host,
                    seed_id=resolved_seed_id,
                    baseline=cred_baseline,
                    encrypt=None,
                    decrypt=None,
                    lease_holder=lease_holder,
                )
            )

        while ctx.should_continue() and await host_actions.tmux_session_alive(
            host, tmux_path, tmux_socket, tmux_session,
        ):
            await asyncio.sleep(1.0)

    async def _conversation_body(host: Host, hook_ctx: HookContext) -> None:
        """Conversation-mode body: run ``codex app-server`` headless.

        Launches the subprocess, bootstraps and publishes the
        CodexConversation, optionally starts the dashboard chat listener,
        sends the kickoff (or resume notice), then waits until either the
        subprocess exits or the caller requests a close.

        Raises:
            RuntimeError: the subprocess exited while no close was requested
                and the task was still supposed to continue.
        """
        nonlocal launched_handle, conversation, conv_listener

        # Launch `codex app-server` directly (no tmux/ttyd). The sandbox MODE
        # and approval policy travel in thread/start params (the app-server has
        # no --sandbox flag); the writable_roots/network_access ride the same
        # `-c sandbox_workspace_write.*` overrides the iframe uses, on the
        # app-server command line (Stage 8, one SandboxSettings SSOT).
        # merge_stderr=False keeps codex diagnostics off the JSONL stdout.
        conversation = CodexConversation(
            cwd=host.workdir,
            permission_gate=config.permission_gate,
            model=config.model,
            sandbox=sandbox_settings.mode,
            # Plan B: on resume, continue the stored thread (thread/resume)
            # instead of starting a fresh one. resume_session_id is the codex
            # thread id recorded in the restored snapshot.
            resume_thread_id=resume_session_id if resuming else None,
        )
        argv = [
            codex_path, "app-server",
            *build_sandbox_config_overrides(sandbox_settings),
        ]
        cmd = " ".join(shlex.quote(a) for a in argv)
        env = {
            **host_actions._isolation_env(host.workdir),
            **(config.env or {}),
            **(hook_ctx.browser_launch_env or {}),
        }
        ctx.report_progress(None, "Launching Codex (conversation)…")
        handle = await host.launch_subprocess(
            cmd, env=env, cwd=host.workdir,
            env_remove=config.scrub_env, stdin=True, merge_stderr=False,
        )
        launched_handle = handle
        conversation.attach(handle)
        reader_task = asyncio.create_task(conversation.run_reader())
        try:
            await conversation.bootstrap()
        except Exception:
            reader_task.cancel()
            raise

        ctx.publish_result(conversation)
        ctx.report_progress(None, "Codex conversation is live")

        # Opt-in dashboard chat widget: per-task SSE listener over the
        # published conversation, reached via the widget proxy (which injects
        # the basic-auth credential).
        if config.conversation_ui:
            listener_password = secrets.token_urlsafe(32)
            bind_addr = os.environ.get("OPTIO_WIDGET_TUNNEL_BIND", "127.0.0.1")
            upstream_host = os.environ.get("OPTIO_WIDGET_TUNNEL_HOST", "127.0.0.1")
            # File upload: bytes land under <workdir>/uploads with a sanitized
            # name; the view injects a System: path reference so codex reads
            # them with its own tools.
            uploads_dir = f"{host.workdir}/uploads"

            async def _write_upload(name: str, data: bytes) -> str:
                """Store an upload under ``uploads/`` and return its relative path."""
                safe = re.sub(
                    r"[^A-Za-z0-9._-]", "_", (name.split("/")[-1] or "file"),
                )[:200] or "file"
                if safe in (".", ".."):
                    # "." and ".." survive the character filter but name the
                    # uploads directory / the workdir itself, not a file.
                    safe = "file"
                await host.put_file_to_host(data, f"{uploads_dir}/{safe}")
                return f"uploads/{safe}"

            # File download: serve workdir-confined bytes for the optio-file:
            # sentinel links codex emits. realpath guards against ../ escapes.
            async def _read_download(relpath: str) -> tuple[bytes, str]:
                """Return ``(bytes, mime type)`` for a path inside the workdir.

                Raises:
                    ValueError: ``"forbidden"`` when the resolved path leaves
                        the workdir, ``"too-large"`` when the fetched content
                        exceeds ``config.max_download_bytes``.
                """
                workdir = host.workdir.rstrip("/")
                # NOTE(review): os.path.realpath resolves against the
                # filesystem of the process running this code; for a
                # non-local Host, symlinks inside the remote workdir are
                # presumably not resolved here — confirm
                # fetch_bytes_from_host confines reads on its side.
                real = os.path.realpath(os.path.join(workdir, relpath))
                if real != workdir and not real.startswith(workdir + os.sep):
                    raise ValueError("forbidden")  # outside the workdir
                # The size limit is enforced only after the whole file has
                # been fetched.
                data = await host.fetch_bytes_from_host(real)
                if len(data) > config.max_download_bytes:
                    raise ValueError("too-large")
                mime = mimetypes.guess_type(real)[0] or "application/octet-stream"
                return data, mime

            conv_listener = ConversationListener(
                conversation, password=listener_password,
                upload_writer=_write_upload,
                max_upload_bytes=config.max_upload_bytes,
                download_reader=_read_download,
                max_download_bytes=config.max_download_bytes,
            )
            # In-process aiohttp app: binds directly on the widget-tunnel
            # interface, no host tunnel needed.
            listener_port = await conv_listener.start(bind_addr)
            await ctx.set_widget_upstream(
                f"http://{upstream_host}:{listener_port}",
                inner_auth=BasicAuth(username="optio", password=listener_password),
            )
            # Model picker options come from the model/list captured at
            # bootstrap (authed, exact ids), else the static fallback.
            model_list = codex_models.parse_model_list(conversation.model_list)
            current_model = (
                config.default_model
                or conversation.current_model_id
                or model_list.get("default")
            )
            await ctx.set_widget_data({
                "protocol": "codex",
                "toolVerbosity": config.tool_verbosity,
                "thinkingVerbosity": config.thinking_verbosity,
                "showModelSelector": config.show_model_selector,
                "models": model_list["models"],
                "currentModel": current_model,
                "showFileUpload": config.show_file_upload,
                "maxUploadBytes": config.max_upload_bytes,
                "fileDownload": config.file_download,
                "maxDownloadBytes": config.max_download_bytes,
            })
            ctx.report_progress(None, "Conversation UI is live")

        # Kickoff prompt as the first turn (headless: no positional prompt
        # path). Suppressed on resume — re-kicking would duplicate the task.
        # On resume, PUSH a System: resume notice instead so the resumed thread
        # notices promptly (parity; resume.log stays the pull-based backstop).
        if config.auto_start and not resuming:
            await conversation.send(host_actions.AUTO_START_PROMPT)
        elif resuming:
            await conversation.send(f"{SYSTEM_MESSAGE_PREFIX}{RESUME_NOTICE}")

        try:
            while True:
                wait_task = asyncio.create_task(proc_wait(handle))
                close_task = asyncio.create_task(
                    conversation.close_requested.wait())
                try:
                    done, _ = await asyncio.wait(
                        {wait_task, close_task},
                        return_when=asyncio.FIRST_COMPLETED,
                    )
                finally:
                    # asyncio.wait never cancels the awaitables it was given,
                    # so cancel whatever is still pending here — including
                    # when this body itself is cancelled mid-wait; otherwise
                    # both helper tasks would be left running.
                    for t in (wait_task, close_task):
                        if not t.done():
                            t.cancel()

                if close_task in done and wait_task not in done:
                    # Caller asked to close: cooperative clean end.
                    if config.host_protocol:
                        # The keyword driver treats a body return without DONE
                        # as a premature exit; a caller-requested close IS the
                        # clean end, so emit DONE ourselves and park until the
                        # driver observes it and cancels this body.
                        log_path = f"{host.workdir}/optio.log"
                        await host.run_command(
                            f"echo DONE >> {shlex.quote(log_path)}"
                        )
                        await asyncio.Event().wait()  # cancelled by the driver
                    break

                # Subprocess exited on its own.
                try:
                    rc = wait_task.result()
                except Exception:
                    rc = None
                if (
                    not conversation.close_requested.is_set()
                    and ctx.should_continue()
                ):
                    raise RuntimeError(f"codex exited unexpectedly (exit {rc})")
                break
        finally:
            reader_task.cancel()
            try:
                await reader_task
            except asyncio.CancelledError:
                pass

    async def _agent_sender(message: str) -> None:
        """Deliver a message to the agent over the channel of the active mode."""
        if config.mode == "conversation":
            await conversation.send(message)
            return
        await host_actions.send_text_to_codex(
            host, tmux_path, tmux_socket, tmux_session, message,
        )

    body = _conversation_body if config.mode == "conversation" else _codex_body
    try:
        await run_log_protocol_session(
            host, ctx,
            body=body,
            prepare=_prepare,
            on_deliverable=config.on_deliverable,
            after_execute=config.after_execute,
            protocol=protocol,
            agent_sender=_agent_sender,
            keywords=config.host_protocol,
        )
    except _SessionFailed as fail:
        raise RuntimeError(str(fail)) from None
    finally:
        if not ctx.should_continue():
            cancelled = True
        # Codex authenticates (ChatGPT mode) with a SINGLE-USE rotating refresh
        # token. If codex rotated it this session, the new auth.json must reach
        # the seed via the backstop below — but an aggressive SIGKILL can beat
        # codex's flush, stranding the rotation (the seed keeps the now-spent
        # token → the next launch demands re-auth). So when a SEED is in use,
        # tear codex down GRACEFULLY (SIGTERM + wait) even on cancel, giving it
        # time to persist auth.json before the final save-back reads it. Only a
        # non-seeded session keeps the fast aggressive kill on cancel.
        codex_aggressive = _teardown_aggressive(
            cancelled=cancelled, seeded=resolved_seed_id is not None,
        )
        # Stop the conversation listener first so its long-lived SSE loops are
        # woken (bounded shutdown) before the subprocess teardown below.
        if conv_listener is not None:
            try:
                await conv_listener.stop()
            except Exception:
                _LOG.exception("conversation listener cleanup failed")
        # Conversation mode has no tmux/ttyd tree — terminate the app-server
        # subprocess directly. Its EOF drives the conversation to closed.
        if config.mode == "conversation" and launched_handle is not None:
            try:
                await host.terminate_subprocess(
                    launched_handle, aggressive=codex_aggressive)
            except Exception:
                _LOG.exception("terminate codex conversation subprocess failed")
        if (
            tmux_path is not None
            and tmux_socket is not None
            and tmux_session is not None
            and codex_path
        ):
            try:
                await host_actions.teardown_session_tree(
                    host,
                    tmux_path=tmux_path,
                    tmux_socket=tmux_socket,
                    tmux_session=tmux_session,
                    codex_path=codex_path,
                    ttyd_handle=launched_handle,
                    aggressive=codex_aggressive,
                )
            except Exception:
                _LOG.exception("teardown_session_tree failed")

        # Stop the credential watcher before the final save-back so the two
        # never race on the same seed blob.
        if cred_watch_task is not None:
            cred_watch_task.cancel()
            try:
                await cred_watch_task
            except asyncio.CancelledError:
                pass
            except Exception:
                # The watcher had already died with its own error; awaiting
                # it re-raises that error. Log it instead of letting it abort
                # the remaining teardown (save-back, lease release, snapshot,
                # cleanup, disconnect).
                _LOG.exception("credential watcher failed")

        # Final backstop save-back — LOAD-BEARING, not defensive: codex's
        # refresh already consumed the old refresh token server-side
        # (single-use, openai/codex#15410); a rotation in the last poll
        # window is persisted ONLY here. Runs after codex terminated so
        # auth.json is final.
        if resolved_seed_id is not None:
            try:
                cred_baseline = await cred_watcher.save_back_if_changed(
                    ctx, host,
                    seed_id=resolved_seed_id,
                    baseline=cred_baseline,
                    encrypt=None,
                    decrypt=None,
                )
            except Exception:
                _LOG.exception("final credential save-back failed")

        # Release the lease AFTER the final save-back (opencode's deliberate
        # ordering, ported via grok): a new acquirer must never merge the
        # pre-save-back blob.
        if lease_holder is not None and resolved_seed_id is not None:
            try:
                await _seeds.release(
                    ctx._db, prefix=ctx._prefix, suffix=CODEX_SEED_SUFFIX,
                    seed_id=resolved_seed_id, holder=lease_holder,
                )
            except Exception:
                _LOG.exception("lease release failed (TTL will reclaim)")

        # Seed capture (fresh only): store this session's codex identity as
        # a reusable seed so a later fresh task can start already-authed.
        # Reached-live gate: launched_handle is assigned strictly after a
        # successful launch — an interrupt before launch leaves it None.
        # Guarded on a VALID auth.json (capture_gate_ok) — never seed a
        # login-less identity. Ignored on resume.
        if (
            not resuming
            and config.on_seed_saved is not None
            and launched_handle is not None
        ):
            try:
                if not await cred_watcher.capture_gate_ok(host):
                    _LOG.warning(
                        "seed capture skipped: home/.codex/auth.json absent "
                        "or invalid (login-less session)",
                    )
                else:
                    seed_id = await _seeds.capture_seed(
                        ctx, host,
                        manifest=CODEX_SEED_MANIFEST,
                        suffix=CODEX_SEED_SUFFIX,
                        encrypt=None,
                    )
                    # 2nd arg (account summary) is resolved in a later
                    # stage; None for now.
                    await _call_maybe_async(config.on_seed_saved, seed_id, None)
            except Exception:
                _LOG.exception(
                    "seed capture failed; callback not fired, teardown continues",
                )

        # Reached-live gate: only capture if codex actually came up
        # (launched_handle is assigned strictly after a successful ttyd/codex
        # launch). An interrupt before launch leaves it None — skip capture
        # so any prior good snapshot survives and hasSavedState is untouched.
        if config.supports_resume and launched_handle is not None:
            try:
                await _capture_snapshot(
                    ctx, host,
                    end_state="cancelled" if cancelled else "done",
                    workdir_exclude=config.workdir_exclude,
                    # Iframe mode: scan the newest rollout filename. The
                    # conversation body records the live thread id captured at
                    # thread/start (thread/resume's resume source) instead.
                    session_id=(
                        conversation.thread_id
                        if config.mode == "conversation" and conversation is not None
                        else await host_actions.read_latest_session_id(host)
                    ),
                )
            except Exception:
                _LOG.exception(
                    "snapshot capture failed; proceeding with workdir wipe",
                )

        try:
            await host.cleanup_taskdir(aggressive=cancelled)
        except Exception:
            _LOG.exception("cleanup_taskdir failed")
        try:
            await host.disconnect()
        except Exception:
            _LOG.exception("host.disconnect failed")
649
+
650
+
651
+ async def _stream_blob(ctx: ProcessContext, blob_id) -> "AsyncIterator[bytes]":
652
+ async with ctx.load_blob(blob_id) as reader:
653
+ while True:
654
+ chunk = await reader.read(1 << 20)
655
+ if not chunk:
656
+ break
657
+ yield chunk
658
+
659
+
660
async def _capture_snapshot(
    ctx: ProcessContext,
    host: Host,
    *,
    end_state: str,
    workdir_exclude: list[str] | None,
    session_id: str | None,
) -> None:
    """Store the (now static) workdir as a single-blob resume snapshot.

    The codex rollout store sits under ``home/.codex/sessions`` INSIDE the
    workdir, so a single workdir tar holds everything ``codex resume <id>``
    needs, and ``session_id`` names WHICH session to resume.

    Steps, in order: stream the workdir archive (minus the effective exclude
    list) into a blob, record the snapshot row, prune older snapshots and
    delete their blobs, then mark the process as having saved state. A blob
    that fails to delete is logged and skipped.
    """
    excluded = effective_workdir_exclude(workdir_exclude)
    async with ctx.store_blob("workdir") as blob_writer:
        async for piece in host.archive_workdir(excluded):
            await blob_writer.write(piece)
    new_blob_id = blob_writer.file_id

    await insert_snapshot(
        ctx._db,
        ctx._prefix,
        process_id=ctx.process_id,
        end_state=end_state,
        workdir_blob_id=new_blob_id,
        session_id=session_id,
    )

    # Retention: drop the blobs of every snapshot the prune step retired.
    retired = await prune_snapshots(ctx._db, ctx._prefix, ctx.process_id)
    for old_blob_id in retired:
        try:
            await ctx.delete_blob(old_blob_id)
        except Exception:
            _LOG.exception("delete_blob(workdir) failed")

    await ctx.mark_has_saved_state()
699
+
700
+
701
def create_codex_task(
    process_id: str,
    name: str,
    config: CodexTaskConfig,
    description: str | None = None,
    metadata: dict | None = None,
) -> TaskInstance:
    """Build the TaskInstance whose execution runs one optio-codex session.

    Widget selection: ``conversation_ui`` on → the live chat widget
    (``"conversation"``); otherwise conversation mode has no widget (the
    published Conversation is driven programmatically) and every other mode
    gets the ttyd TUI widget (``"iframe"``).
    """
    ui_widget: str | None
    if config.conversation_ui:
        ui_widget = "conversation"
    else:
        ui_widget = None if config.mode == "conversation" else "iframe"

    async def _execute(ctx: ProcessContext) -> None:
        await run_codex_session(ctx, config)

    return TaskInstance(
        execute=_execute,
        process_id=process_id,
        name=name,
        description=description,
        ui_widget=ui_widget,
        supports_resume=config.supports_resume,
        metadata=metadata if metadata else {},
    )