codex-autorunner 1.1.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. codex_autorunner/agents/opencode/client.py +113 -4
  2. codex_autorunner/agents/opencode/supervisor.py +4 -0
  3. codex_autorunner/agents/registry.py +17 -7
  4. codex_autorunner/bootstrap.py +219 -1
  5. codex_autorunner/core/__init__.py +17 -1
  6. codex_autorunner/core/about_car.py +114 -1
  7. codex_autorunner/core/app_server_threads.py +6 -0
  8. codex_autorunner/core/config.py +236 -1
  9. codex_autorunner/core/context_awareness.py +38 -0
  10. codex_autorunner/core/docs.py +0 -122
  11. codex_autorunner/core/filebox.py +265 -0
  12. codex_autorunner/core/flows/controller.py +71 -1
  13. codex_autorunner/core/flows/reconciler.py +4 -1
  14. codex_autorunner/core/flows/runtime.py +22 -0
  15. codex_autorunner/core/flows/store.py +61 -9
  16. codex_autorunner/core/flows/transition.py +23 -16
  17. codex_autorunner/core/flows/ux_helpers.py +18 -3
  18. codex_autorunner/core/flows/worker_process.py +32 -6
  19. codex_autorunner/core/hub.py +198 -41
  20. codex_autorunner/core/lifecycle_events.py +253 -0
  21. codex_autorunner/core/path_utils.py +2 -1
  22. codex_autorunner/core/pma_audit.py +224 -0
  23. codex_autorunner/core/pma_context.py +496 -0
  24. codex_autorunner/core/pma_dispatch_interceptor.py +284 -0
  25. codex_autorunner/core/pma_lifecycle.py +527 -0
  26. codex_autorunner/core/pma_queue.py +367 -0
  27. codex_autorunner/core/pma_safety.py +221 -0
  28. codex_autorunner/core/pma_state.py +115 -0
  29. codex_autorunner/core/ports/agent_backend.py +2 -5
  30. codex_autorunner/core/ports/run_event.py +1 -4
  31. codex_autorunner/core/prompt.py +0 -80
  32. codex_autorunner/core/prompts.py +56 -172
  33. codex_autorunner/core/redaction.py +0 -4
  34. codex_autorunner/core/review_context.py +11 -9
  35. codex_autorunner/core/runner_controller.py +35 -33
  36. codex_autorunner/core/runner_state.py +147 -0
  37. codex_autorunner/core/runtime.py +829 -0
  38. codex_autorunner/core/sqlite_utils.py +13 -4
  39. codex_autorunner/core/state.py +7 -10
  40. codex_autorunner/core/state_roots.py +5 -0
  41. codex_autorunner/core/templates/__init__.py +39 -0
  42. codex_autorunner/core/templates/git_mirror.py +234 -0
  43. codex_autorunner/core/templates/provenance.py +56 -0
  44. codex_autorunner/core/templates/scan_cache.py +120 -0
  45. codex_autorunner/core/ticket_linter_cli.py +17 -0
  46. codex_autorunner/core/ticket_manager_cli.py +154 -92
  47. codex_autorunner/core/time_utils.py +11 -0
  48. codex_autorunner/core/types.py +18 -0
  49. codex_autorunner/core/utils.py +34 -6
  50. codex_autorunner/flows/review/service.py +23 -25
  51. codex_autorunner/flows/ticket_flow/definition.py +43 -1
  52. codex_autorunner/integrations/agents/__init__.py +2 -0
  53. codex_autorunner/integrations/agents/backend_orchestrator.py +18 -0
  54. codex_autorunner/integrations/agents/codex_backend.py +19 -8
  55. codex_autorunner/integrations/agents/runner.py +3 -8
  56. codex_autorunner/integrations/agents/wiring.py +8 -0
  57. codex_autorunner/integrations/telegram/doctor.py +228 -6
  58. codex_autorunner/integrations/telegram/handlers/commands/execution.py +236 -74
  59. codex_autorunner/integrations/telegram/handlers/commands/files.py +314 -75
  60. codex_autorunner/integrations/telegram/handlers/commands/flows.py +346 -58
  61. codex_autorunner/integrations/telegram/handlers/commands/workspace.py +498 -37
  62. codex_autorunner/integrations/telegram/handlers/commands_runtime.py +202 -45
  63. codex_autorunner/integrations/telegram/handlers/commands_spec.py +18 -7
  64. codex_autorunner/integrations/telegram/handlers/messages.py +26 -1
  65. codex_autorunner/integrations/telegram/helpers.py +1 -3
  66. codex_autorunner/integrations/telegram/runtime.py +9 -4
  67. codex_autorunner/integrations/telegram/service.py +30 -0
  68. codex_autorunner/integrations/telegram/state.py +38 -0
  69. codex_autorunner/integrations/telegram/ticket_flow_bridge.py +10 -4
  70. codex_autorunner/integrations/telegram/transport.py +10 -3
  71. codex_autorunner/integrations/templates/__init__.py +27 -0
  72. codex_autorunner/integrations/templates/scan_agent.py +312 -0
  73. codex_autorunner/server.py +2 -2
  74. codex_autorunner/static/agentControls.js +21 -5
  75. codex_autorunner/static/app.js +115 -11
  76. codex_autorunner/static/chatUploads.js +137 -0
  77. codex_autorunner/static/docChatCore.js +185 -13
  78. codex_autorunner/static/fileChat.js +68 -40
  79. codex_autorunner/static/fileboxUi.js +159 -0
  80. codex_autorunner/static/hub.js +46 -81
  81. codex_autorunner/static/index.html +303 -24
  82. codex_autorunner/static/messages.js +82 -4
  83. codex_autorunner/static/notifications.js +255 -0
  84. codex_autorunner/static/pma.js +1167 -0
  85. codex_autorunner/static/settings.js +3 -0
  86. codex_autorunner/static/streamUtils.js +57 -0
  87. codex_autorunner/static/styles.css +9125 -6742
  88. codex_autorunner/static/templateReposSettings.js +225 -0
  89. codex_autorunner/static/ticketChatActions.js +165 -3
  90. codex_autorunner/static/ticketChatStream.js +17 -119
  91. codex_autorunner/static/ticketEditor.js +41 -13
  92. codex_autorunner/static/ticketTemplates.js +798 -0
  93. codex_autorunner/static/tickets.js +69 -19
  94. codex_autorunner/static/turnEvents.js +27 -0
  95. codex_autorunner/static/turnResume.js +33 -0
  96. codex_autorunner/static/utils.js +28 -0
  97. codex_autorunner/static/workspace.js +258 -44
  98. codex_autorunner/static/workspaceFileBrowser.js +6 -4
  99. codex_autorunner/surfaces/cli/cli.py +1465 -155
  100. codex_autorunner/surfaces/cli/pma_cli.py +817 -0
  101. codex_autorunner/surfaces/web/app.py +253 -49
  102. codex_autorunner/surfaces/web/routes/__init__.py +4 -0
  103. codex_autorunner/surfaces/web/routes/analytics.py +29 -22
  104. codex_autorunner/surfaces/web/routes/file_chat.py +317 -36
  105. codex_autorunner/surfaces/web/routes/filebox.py +227 -0
  106. codex_autorunner/surfaces/web/routes/flows.py +219 -29
  107. codex_autorunner/surfaces/web/routes/messages.py +70 -39
  108. codex_autorunner/surfaces/web/routes/pma.py +1652 -0
  109. codex_autorunner/surfaces/web/routes/repos.py +1 -1
  110. codex_autorunner/surfaces/web/routes/shared.py +0 -3
  111. codex_autorunner/surfaces/web/routes/templates.py +634 -0
  112. codex_autorunner/surfaces/web/runner_manager.py +2 -2
  113. codex_autorunner/surfaces/web/schemas.py +70 -18
  114. codex_autorunner/tickets/agent_pool.py +27 -0
  115. codex_autorunner/tickets/files.py +33 -16
  116. codex_autorunner/tickets/lint.py +50 -0
  117. codex_autorunner/tickets/models.py +3 -0
  118. codex_autorunner/tickets/outbox.py +41 -5
  119. codex_autorunner/tickets/runner.py +350 -69
  120. {codex_autorunner-1.1.0.dist-info → codex_autorunner-1.2.0.dist-info}/METADATA +15 -19
  121. {codex_autorunner-1.1.0.dist-info → codex_autorunner-1.2.0.dist-info}/RECORD +125 -94
  122. codex_autorunner/core/adapter_utils.py +0 -21
  123. codex_autorunner/core/engine.py +0 -3302
  124. {codex_autorunner-1.1.0.dist-info → codex_autorunner-1.2.0.dist-info}/WHEEL +0 -0
  125. {codex_autorunner-1.1.0.dist-info → codex_autorunner-1.2.0.dist-info}/entry_points.txt +0 -0
  126. {codex_autorunner-1.1.0.dist-info → codex_autorunner-1.2.0.dist-info}/licenses/LICENSE +0 -0
  127. {codex_autorunner-1.1.0.dist-info → codex_autorunner-1.2.0.dist-info}/top_level.txt +0 -0
@@ -1,3302 +0,0 @@
1
- import asyncio
2
- import contextlib
3
- import dataclasses
4
- import hashlib
5
- import importlib
6
- import inspect
7
- import json
8
- import logging
9
- import os
10
- import signal
11
- import threading
12
- import time
13
- import traceback
14
- import uuid
15
- from collections import Counter
16
- from datetime import datetime, timezone
17
- from logging.handlers import RotatingFileHandler
18
- from pathlib import Path
19
- from typing import IO, Any, Awaitable, Callable, Iterator, Optional
20
-
21
- import yaml
22
-
23
- from ..agents.registry import validate_agent_id
24
- from ..manifest import MANIFEST_VERSION
25
- from ..tickets.files import list_ticket_paths, ticket_is_done
26
- from .about_car import ensure_about_car_file
27
- from .adapter_utils import handle_agent_output
28
- from .app_server_ids import (
29
- extract_thread_id,
30
- extract_thread_id_for_turn,
31
- extract_turn_id,
32
- )
33
- from .app_server_logging import AppServerEventFormatter
34
- from .app_server_prompts import build_autorunner_prompt
35
- from .app_server_threads import AppServerThreadRegistry, default_app_server_threads_path
36
- from .config import (
37
- CONFIG_FILENAME,
38
- CONFIG_VERSION,
39
- DEFAULT_REPO_CONFIG,
40
- ConfigError,
41
- RepoConfig,
42
- _build_repo_config,
43
- _is_loopback_host,
44
- _load_yaml_dict,
45
- _merge_defaults,
46
- _validate_repo_config,
47
- derive_repo_config,
48
- load_hub_config,
49
- load_repo_config,
50
- )
51
- from .docs import DocsManager, parse_todos
52
- from .flows.models import FlowEventType
53
- from .git_utils import GitError, run_git
54
- from .locks import (
55
- DEFAULT_RUNNER_CMD_HINTS,
56
- FileLock,
57
- FileLockBusy,
58
- assess_lock,
59
- process_alive,
60
- read_lock_info,
61
- write_lock_info,
62
- )
63
- from .notifications import NotificationManager
64
- from .optional_dependencies import missing_optional_dependencies
65
- from .ports.agent_backend import AgentBackend
66
- from .ports.run_event import (
67
- ApprovalRequested,
68
- Completed,
69
- Failed,
70
- OutputDelta,
71
- RunEvent,
72
- RunNotice,
73
- Started,
74
- TokenUsage,
75
- ToolCall,
76
- )
77
- from .prompt import build_final_summary_prompt
78
- from .redaction import redact_text
79
- from .review_context import build_spec_progress_review_context
80
- from .run_index import RunIndexStore
81
- from .state import RunnerState, load_state, now_iso, save_state, state_lock
82
- from .state_roots import resolve_global_state_root, resolve_repo_state_root
83
- from .ticket_linter_cli import ensure_ticket_linter
84
- from .ticket_manager_cli import ensure_ticket_manager
85
- from .utils import (
86
- RepoNotFoundError,
87
- atomic_write,
88
- ensure_executable,
89
- find_repo_root,
90
- )
91
-
92
-
93
- class LockError(Exception):
94
- pass
95
-
96
-
97
- def timestamp() -> str:
98
- return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
99
-
100
-
101
- SUMMARY_FINALIZED_MARKER = "CAR:SUMMARY_FINALIZED"
102
- SUMMARY_FINALIZED_MARKER_PREFIX = f"<!-- {SUMMARY_FINALIZED_MARKER}"
103
- AUTORUNNER_APP_SERVER_MESSAGE = (
104
- "Continue working through TODO items from top to bottom."
105
- )
106
- AUTORUNNER_STOP_POLL_SECONDS = 1.0
107
- AUTORUNNER_INTERRUPT_GRACE_SECONDS = 30.0
108
-
109
-
110
- @dataclasses.dataclass
111
- class RunTelemetry:
112
- run_id: int
113
- thread_id: Optional[str] = None
114
- turn_id: Optional[str] = None
115
- token_total: Optional[dict[str, Any]] = None
116
- plan: Optional[Any] = None
117
- diff: Optional[Any] = None
118
-
119
-
120
- NotificationHandler = Callable[[dict[str, Any]], Awaitable[None]]
121
- BackendFactory = Callable[
122
- [str, RunnerState, Optional[NotificationHandler]], AgentBackend
123
- ]
124
- AppServerSupervisorFactory = Callable[[str, Optional[NotificationHandler]], Any]
125
-
126
-
127
- class Engine:
128
- def __init__(
129
- self,
130
- repo_root: Path,
131
- *,
132
- config: Optional[RepoConfig] = None,
133
- hub_path: Optional[Path] = None,
134
- backend_factory: Optional[BackendFactory] = None,
135
- app_server_supervisor_factory: Optional[AppServerSupervisorFactory] = None,
136
- backend_orchestrator: Optional[Any] = None,
137
- agent_id_validator: Optional[Callable[[str], str]] = None,
138
- ):
139
- if config is None:
140
- config = load_repo_config(repo_root, hub_path=hub_path)
141
- self.config = config
142
- self.repo_root = self.config.root
143
- self.docs = DocsManager(self.config)
144
- self.notifier = NotificationManager(self.config)
145
- self.state_path = self.repo_root / ".codex-autorunner" / "state.sqlite3"
146
- self.log_path = self.config.log.path
147
- self._run_index_store = RunIndexStore(self.state_path)
148
- self.lock_path = self.repo_root / ".codex-autorunner" / "lock"
149
- self.stop_path = self.repo_root / ".codex-autorunner" / "stop"
150
- self._hub_path = hub_path
151
- self._active_global_handler: Optional[RotatingFileHandler] = None
152
- self._active_run_log: Optional[IO[str]] = None
153
- self._app_server_threads = AppServerThreadRegistry(
154
- default_app_server_threads_path(self.repo_root)
155
- )
156
- self._app_server_threads_lock = threading.Lock()
157
- self._backend_factory = backend_factory
158
- self._app_server_supervisor_factory = app_server_supervisor_factory
159
- self._app_server_supervisor: Optional[Any] = None
160
- self._backend_orchestrator: Optional[Any] = None
161
- self._app_server_logger = logging.getLogger("codex_autorunner.app_server")
162
- self._agent_id_validator = agent_id_validator or validate_agent_id
163
- redact_enabled = self.config.security.get("redact_run_logs", True)
164
- self._app_server_event_formatter = AppServerEventFormatter(
165
- redact_enabled=redact_enabled
166
- )
167
- self._opencode_supervisor: Optional[Any] = None
168
-
169
- # Backend orchestrator for protocol-agnostic backend management
170
- # Use provided orchestrator if available (for testing), otherwise create it
171
- self._backend_orchestrator = None
172
- if backend_orchestrator is not None:
173
- self._backend_orchestrator = backend_orchestrator
174
- elif backend_factory is None and app_server_supervisor_factory is None:
175
- self._backend_orchestrator = self._build_backend_orchestrator()
176
- else:
177
- self._app_server_logger.debug(
178
- "Skipping BackendOrchestrator creation because backend_factory or app_server_supervisor_factory is set",
179
- )
180
- self._backend_orchestrator = None
181
- self._run_telemetry_lock = threading.Lock()
182
- self._run_telemetry: Optional[RunTelemetry] = None
183
- self._last_telemetry_update_time: float = 0.0
184
- self._canonical_event_lock = threading.Lock()
185
- self._canonical_event_seq: dict[int, int] = {}
186
- self._last_run_interrupted = False
187
- self._lock_handle: Optional[FileLock] = None
188
- # Ensure the interactive TUI briefing doc exists (for web Terminal "New").
189
- try:
190
- ensure_about_car_file(self.config)
191
- except (OSError, IOError) as exc:
192
- # Never fail Engine creation due to a best-effort helper doc.
193
- self._app_server_logger.debug(
194
- "Best-effort ABOUT_CAR.md creation failed: %s", exc
195
- )
196
- try:
197
- ensure_ticket_linter(self.config.root)
198
- except (OSError, IOError) as exc:
199
- self._app_server_logger.debug(
200
- "Best-effort lint_tickets.py creation failed: %s", exc
201
- )
202
- try:
203
- ensure_ticket_manager(self.config.root)
204
- except (OSError, IOError) as exc:
205
- self._app_server_logger.debug(
206
- "Best-effort ticket_tool.py creation failed: %s", exc
207
- )
208
-
209
- def _build_backend_orchestrator(self) -> Optional[Any]:
210
- """
211
- Dynamically construct BackendOrchestrator without introducing a core -> integrations
212
- import-time dependency. Keeps import-boundary checks satisfied.
213
- """
214
- try:
215
- module = importlib.import_module(
216
- "codex_autorunner.integrations.agents.backend_orchestrator"
217
- )
218
- orchestrator_cls = getattr(module, "BackendOrchestrator", None)
219
- if orchestrator_cls is None:
220
- raise AttributeError("BackendOrchestrator not found in module")
221
- return orchestrator_cls(
222
- repo_root=self.repo_root,
223
- config=self.config,
224
- notification_handler=self._handle_app_server_notification,
225
- logger=self._app_server_logger,
226
- )
227
- except Exception as exc:
228
- self._app_server_logger.warning(
229
- "Failed to create BackendOrchestrator: %s\n%s",
230
- exc,
231
- traceback.format_exc(),
232
- )
233
- return None
234
-
235
- @staticmethod
236
- def from_cwd(repo: Optional[Path] = None) -> "Engine":
237
- root = find_repo_root(repo or Path.cwd())
238
- return Engine(root)
239
-
240
- def acquire_lock(self, force: bool = False) -> None:
241
- self._lock_handle = FileLock(self.lock_path)
242
- try:
243
- self._lock_handle.acquire(blocking=False)
244
- except FileLockBusy as exc:
245
- info = read_lock_info(self.lock_path)
246
- pid = info.pid
247
- if pid and process_alive(pid):
248
- raise LockError(
249
- f"Another autorunner is active (pid={pid}); stop it before continuing"
250
- ) from exc
251
- raise LockError(
252
- "Another autorunner is active; stop it before continuing"
253
- ) from exc
254
- info = read_lock_info(self.lock_path)
255
- pid = info.pid
256
- if pid and process_alive(pid) and not force:
257
- self._lock_handle.release()
258
- self._lock_handle = None
259
- raise LockError(
260
- f"Another autorunner is active (pid={pid}); use --force to override"
261
- )
262
- write_lock_info(
263
- self.lock_path,
264
- os.getpid(),
265
- started_at=now_iso(),
266
- lock_file=self._lock_handle.file,
267
- )
268
-
269
- def release_lock(self) -> None:
270
- if self._lock_handle is not None:
271
- self._lock_handle.release()
272
- self._lock_handle = None
273
- if self.lock_path.exists():
274
- self.lock_path.unlink()
275
-
276
- def repo_busy_reason(self) -> Optional[str]:
277
- if self.lock_path.exists():
278
- assessment = assess_lock(
279
- self.lock_path,
280
- expected_cmd_substrings=DEFAULT_RUNNER_CMD_HINTS,
281
- )
282
- if assessment.freeable:
283
- return "Autorunner lock is stale; clear it before continuing."
284
- pid = assessment.pid
285
- if pid and process_alive(pid):
286
- host = f" on {assessment.host}" if assessment.host else ""
287
- return f"Autorunner is running (pid={pid}{host}); try again later."
288
- return "Autorunner lock present; clear or resume before continuing."
289
-
290
- state = load_state(self.state_path)
291
- if state.status == "running":
292
- return "Autorunner is currently running; try again later."
293
- return None
294
-
295
- def request_stop(self) -> None:
296
- self.stop_path.parent.mkdir(parents=True, exist_ok=True)
297
- atomic_write(self.stop_path, f"{now_iso()}\n")
298
-
299
- def clear_stop_request(self) -> None:
300
- self.stop_path.unlink(missing_ok=True)
301
-
302
- def stop_requested(self) -> bool:
303
- return self.stop_path.exists()
304
-
305
- def _should_stop(self, external_stop_flag: Optional[threading.Event]) -> bool:
306
- if external_stop_flag and external_stop_flag.is_set():
307
- return True
308
- return self.stop_requested()
309
-
310
- def kill_running_process(self) -> Optional[int]:
311
- """Force-kill the process holding the lock, if any. Returns pid if killed."""
312
- if not self.lock_path.exists():
313
- return None
314
- info = read_lock_info(self.lock_path)
315
- pid = info.pid
316
- if pid and process_alive(pid):
317
- try:
318
- os.kill(pid, signal.SIGTERM)
319
- return pid
320
- except OSError:
321
- return None
322
- # stale lock
323
- self.lock_path.unlink(missing_ok=True)
324
- return None
325
-
326
- def runner_pid(self) -> Optional[int]:
327
- state = load_state(self.state_path)
328
- pid = state.runner_pid
329
- if pid and process_alive(pid):
330
- return pid
331
- info = read_lock_info(self.lock_path)
332
- if info.pid and process_alive(info.pid):
333
- return info.pid
334
- return None
335
-
336
- def todos_done(self) -> bool:
337
- # Ticket-first mode: completion is determined by ticket files, not TODO.md.
338
- ticket_dir = self.repo_root / ".codex-autorunner" / "tickets"
339
- ticket_paths = list_ticket_paths(ticket_dir)
340
- if not ticket_paths:
341
- return False
342
- return all(ticket_is_done(path) for path in ticket_paths)
343
-
344
- def summary_finalized(self) -> bool:
345
- # Legacy docs finalization no longer applies (no SUMMARY doc).
346
- return True
347
-
348
- def _stamp_summary_finalized(self, run_id: int) -> None:
349
- # No-op: summary file no longer exists.
350
- _ = run_id
351
- return
352
-
353
- async def _execute_run_step(
354
- self,
355
- prompt: str,
356
- run_id: int,
357
- *,
358
- external_stop_flag: Optional[threading.Event] = None,
359
- ) -> int:
360
- """
361
- Execute a single run step:
362
- 1. Update state to 'running'
363
- 2. Log start
364
- 3. Run Codex CLI
365
- 4. Log end
366
- 5. Update state to 'idle' or 'error'
367
- 6. Commit if successful and auto-commit is enabled
368
- """
369
- try:
370
- todo_before = self.docs.read_doc("todo")
371
- except (FileNotFoundError, OSError) as exc:
372
- self._app_server_logger.debug(
373
- "Failed to read TODO.md before run %s: %s", run_id, exc
374
- )
375
- todo_before = ""
376
- state = load_state(self.state_path)
377
- try:
378
- validated_agent = self._agent_id_validator(
379
- state.autorunner_agent_override or "codex"
380
- )
381
- except ValueError:
382
- validated_agent = "codex"
383
- self.log_line(
384
- run_id,
385
- f"info: unknown agent '{state.autorunner_agent_override}', defaulting to codex",
386
- )
387
- self._update_state("running", run_id, None, started=True)
388
- self._last_run_interrupted = False
389
- self._start_run_telemetry(run_id)
390
-
391
- actor: dict[str, Any] = {
392
- "backend": "codex_app_server",
393
- "agent_id": validated_agent,
394
- "surface": "hub" if self._hub_path else "cli",
395
- }
396
- mode: dict[str, Any] = {
397
- "approval_policy": state.autorunner_approval_policy or "never",
398
- "sandbox": state.autorunner_sandbox_mode or "dangerFullAccess",
399
- }
400
- runner_cfg = self.config.raw.get("runner") or {}
401
- review_cfg = runner_cfg.get("review")
402
- if isinstance(review_cfg, dict):
403
- mode["review_enabled"] = bool(review_cfg.get("enabled"))
404
-
405
- with self._run_log_context(run_id):
406
- self._write_run_marker(run_id, "start", actor=actor, mode=mode)
407
- exit_code = await self._run_agent_async(
408
- agent_id=validated_agent,
409
- prompt=prompt,
410
- run_id=run_id,
411
- state=state,
412
- external_stop_flag=external_stop_flag,
413
- )
414
- self._write_run_marker(run_id, "end", exit_code=exit_code)
415
-
416
- try:
417
- todo_after = self.docs.read_doc("todo")
418
- except (FileNotFoundError, OSError) as exc:
419
- self._app_server_logger.debug(
420
- "Failed to read TODO.md after run %s: %s", run_id, exc
421
- )
422
- todo_after = ""
423
- todo_delta = self._compute_todo_attribution(todo_before, todo_after)
424
- todo_snapshot = self._build_todo_snapshot(todo_before, todo_after)
425
- run_updates: dict[str, Any] = {
426
- "todo": todo_delta,
427
- "todo_snapshot": todo_snapshot,
428
- }
429
- telemetry = self._snapshot_run_telemetry(run_id)
430
- usage_payload: Optional[dict[str, Any]] = None
431
- if (
432
- telemetry
433
- and telemetry.thread_id
434
- and isinstance(telemetry.token_total, dict)
435
- ):
436
- baseline = None
437
- # OpenCode reports per-turn totals, so skip cross-run deltas.
438
- if validated_agent != "opencode":
439
- baseline = self._find_thread_token_baseline(
440
- thread_id=telemetry.thread_id, run_id=run_id
441
- )
442
- delta = self._compute_token_delta(baseline, telemetry.token_total)
443
- token_usage_payload = {
444
- "delta": delta,
445
- "thread_total_before": baseline,
446
- "thread_total_after": telemetry.token_total,
447
- }
448
- run_updates["token_usage"] = token_usage_payload
449
- usage_payload = {
450
- "run_id": run_id,
451
- "captured_at": timestamp(),
452
- "agent": validated_agent,
453
- "thread_id": telemetry.thread_id,
454
- "turn_id": telemetry.turn_id,
455
- "token_usage": token_usage_payload,
456
- # Use getattr() for optional config attributes that may not exist in all config versions
457
- "cache_scope": getattr(self.config.usage, "cache_scope", "global"),
458
- }
459
- artifacts: dict[str, str] = {}
460
- if usage_payload is not None:
461
- usage_path = self._write_run_usage_artifact(run_id, usage_payload)
462
- if usage_path is not None:
463
- artifacts["usage_path"] = str(usage_path)
464
- redact_enabled = self.config.security.get("redact_run_logs", True)
465
- if telemetry and telemetry.plan is not None:
466
- plan_content = self._serialize_plan_content(
467
- telemetry.plan, redact_enabled=redact_enabled, run_id=run_id
468
- )
469
- plan_path = self._write_run_artifact(run_id, "plan.json", plan_content)
470
- artifacts["plan_path"] = str(plan_path)
471
- if telemetry and telemetry.diff is not None:
472
- diff_content = self._serialize_diff_content(
473
- telemetry.diff, redact_enabled=redact_enabled
474
- )
475
- if diff_content is not None:
476
- diff_path = self._write_run_artifact(run_id, "diff.patch", diff_content)
477
- artifacts["diff_path"] = str(diff_path)
478
- if artifacts:
479
- run_updates["artifacts"] = artifacts
480
- if redact_enabled:
481
- from .redaction import get_redaction_patterns
482
-
483
- run_updates["security"] = {
484
- "redaction_enabled": True,
485
- "redaction_version": "1.0",
486
- "redaction_patterns": get_redaction_patterns(),
487
- }
488
- if run_updates:
489
- self._merge_run_index_entry(run_id, run_updates)
490
- self._clear_run_telemetry(run_id)
491
- self._update_state(
492
- "error" if exit_code != 0 else "idle",
493
- run_id,
494
- exit_code,
495
- finished=True,
496
- )
497
- if exit_code != 0:
498
- self.notifier.notify_run_finished(run_id=run_id, exit_code=exit_code)
499
-
500
- if exit_code == 0 and self.config.git_auto_commit:
501
- if self._last_run_interrupted:
502
- return exit_code
503
- self.maybe_git_commit(run_id)
504
-
505
- return exit_code
506
-
507
- async def _run_final_summary_job(
508
- self, run_id: int, *, external_stop_flag: Optional[threading.Event] = None
509
- ) -> int:
510
- """
511
- Run a dedicated Codex invocation that produces/updates SUMMARY.md as the final user report.
512
- """
513
- prev_output = self.extract_prev_output(run_id - 1)
514
- prompt = build_final_summary_prompt(self.config, self.docs, prev_output)
515
-
516
- exit_code = await self._execute_run_step(
517
- prompt, run_id, external_stop_flag=external_stop_flag
518
- )
519
-
520
- if exit_code == 0:
521
- self._stamp_summary_finalized(run_id)
522
- self.notifier.notify_run_finished(run_id=run_id, exit_code=exit_code)
523
- # Commit is already handled by _execute_run_step if auto-commit is enabled.
524
- return exit_code
525
-
526
- def extract_prev_output(self, run_id: int) -> Optional[str]:
527
- if run_id <= 0:
528
- return None
529
- run_log = self._run_log_path(run_id)
530
- if run_log.exists():
531
- try:
532
- text = run_log.read_text(encoding="utf-8")
533
- except (FileNotFoundError, OSError) as exc:
534
- self._app_server_logger.debug(
535
- "Failed to read previous run log for run %s: %s", run_id, exc
536
- )
537
- text = ""
538
- if text:
539
- lines = [
540
- line
541
- for line in text.splitlines()
542
- if not line.startswith("=== run ")
543
- ]
544
- text = _strip_log_prefixes("\n".join(lines))
545
- max_chars = self.config.prompt_prev_run_max_chars
546
- return text[-max_chars:] if text else None
547
- if not self.log_path.exists():
548
- return None
549
- start = f"=== run {run_id} start ==="
550
- end = f"=== run {run_id} end"
551
- # NOTE: do NOT read the full log file into memory. Logs can be very large
552
- # (especially with verbose Codex output) and this can OOM the server/runner.
553
- text = _read_tail_text(self.log_path, max_bytes=250_000)
554
- lines = text.splitlines()
555
- collecting = False
556
- collected = []
557
- for line in lines:
558
- if line.strip() == start:
559
- collecting = True
560
- continue
561
- if collecting and line.startswith(end):
562
- break
563
- if collecting:
564
- collected.append(line)
565
- if not collected:
566
- return None
567
- text = "\n".join(collected)
568
- text = _strip_log_prefixes(text)
569
- max_chars = self.config.prompt_prev_run_max_chars
570
- return text[-max_chars:]
571
-
572
- def read_run_block(self, run_id: int) -> Optional[str]:
573
- """Return a single run block from the log."""
574
- index_entry = self._load_run_index().get(str(run_id))
575
- run_log = None
576
- if index_entry:
577
- run_log_raw = index_entry.get("run_log_path")
578
- if isinstance(run_log_raw, str) and run_log_raw:
579
- run_log = Path(run_log_raw)
580
- if run_log is None:
581
- run_log = self._run_log_path(run_id)
582
- if run_log.exists():
583
- try:
584
- return run_log.read_text(encoding="utf-8")
585
- except (FileNotFoundError, OSError) as exc:
586
- self._app_server_logger.debug(
587
- "Failed to read run log block for run %s: %s", run_id, exc
588
- )
589
- return None
590
- if index_entry:
591
- block = self._read_log_range(run_id, index_entry)
592
- if block is not None:
593
- return block
594
- if not self.log_path.exists():
595
- return None
596
- start = f"=== run {run_id} start"
597
- end = f"=== run {run_id} end"
598
- # Avoid reading entire log into memory; prefer tail scan.
599
- max_bytes = 1_000_000
600
- text = _read_tail_text(self.log_path, max_bytes=max_bytes)
601
- lines = text.splitlines()
602
- buf = []
603
- printing = False
604
- for line in lines:
605
- if line.startswith(start):
606
- printing = True
607
- buf.append(line)
608
- continue
609
- if printing and line.startswith(end):
610
- buf.append(line)
611
- break
612
- if printing:
613
- buf.append(line)
614
- if buf:
615
- return "\n".join(buf)
616
- # If file is small, fall back to full read (safe).
617
- try:
618
- if self.log_path.stat().st_size <= max_bytes:
619
- lines = self.log_path.read_text(encoding="utf-8").splitlines()
620
- buf = []
621
- printing = False
622
- for line in lines:
623
- if line.startswith(start):
624
- printing = True
625
- buf.append(line)
626
- continue
627
- if printing and line.startswith(end):
628
- buf.append(line)
629
- break
630
- if printing:
631
- buf.append(line)
632
- return "\n".join(buf) if buf else None
633
- except (FileNotFoundError, OSError, ValueError) as exc:
634
- self._app_server_logger.debug(
635
- "Failed to read full log for run %s block: %s", run_id, exc
636
- )
637
- return None
638
- return None
639
-
640
- def tail_log(self, tail: int) -> str:
641
- if not self.log_path.exists():
642
- return ""
643
- # Bound memory usage: only read a chunk from the end.
644
- text = _read_tail_text(self.log_path, max_bytes=400_000)
645
- lines = text.splitlines()
646
- return "\n".join(lines[-tail:])
647
-
648
- def log_line(self, run_id: int, message: str) -> None:
649
- line = f"[{timestamp()}] run={run_id} {message}\n"
650
- if self._active_global_handler is not None:
651
- self._emit_global_line(line.rstrip("\n"))
652
- else:
653
- self._ensure_log_path()
654
- with self.log_path.open("a", encoding="utf-8") as f:
655
- f.write(line)
656
- if self._active_run_log is not None:
657
- try:
658
- self._active_run_log.write(line)
659
- self._active_run_log.flush()
660
- except (OSError, IOError) as exc:
661
- self._app_server_logger.warning(
662
- "Failed to write to active run log for run %s: %s", run_id, exc
663
- )
664
- else:
665
- run_log = self._run_log_path(run_id)
666
- self._ensure_run_log_dir()
667
- with run_log.open("a", encoding="utf-8") as f:
668
- f.write(line)
669
-
670
- def _emit_event(self, run_id: int, event: str, **payload: Any) -> None:
671
- import json as _json
672
-
673
- event_data = {
674
- "ts": timestamp(),
675
- "event": event,
676
- "run_id": run_id,
677
- }
678
- if payload:
679
- event_data.update(payload)
680
- events_path = self._events_log_path(run_id)
681
- self._ensure_run_log_dir()
682
- try:
683
- with events_path.open("a", encoding="utf-8") as f:
684
- f.write(_json.dumps(event_data) + "\n")
685
- except (OSError, IOError) as exc:
686
- self._app_server_logger.warning(
687
- "Failed to write event to events log for run %s: %s", run_id, exc
688
- )
689
- event_type = {
690
- "run.started": FlowEventType.RUN_STARTED,
691
- "run.finished": FlowEventType.RUN_FINISHED,
692
- "run.state_changed": FlowEventType.RUN_STATE_CHANGED,
693
- "run.no_progress": FlowEventType.RUN_NO_PROGRESS,
694
- "token.updated": FlowEventType.TOKEN_USAGE,
695
- "plan.updated": FlowEventType.PLAN_UPDATED,
696
- "diff.updated": FlowEventType.DIFF_UPDATED,
697
- }.get(event)
698
- if event_type is not None:
699
- self._emit_canonical_event(run_id, event_type, payload)
700
-
701
- def _emit_canonical_event(
702
- self,
703
- run_id: int,
704
- event_type: FlowEventType,
705
- data: Optional[dict[str, Any]] = None,
706
- *,
707
- step_id: Optional[str] = None,
708
- timestamp_override: Optional[str] = None,
709
- ) -> None:
710
- event_payload: dict[str, Any] = {
711
- "id": uuid.uuid4().hex,
712
- "run_id": str(run_id),
713
- "event_type": event_type.value,
714
- "timestamp": timestamp_override or now_iso(),
715
- "data": data or {},
716
- }
717
- if step_id is not None:
718
- event_payload["step_id"] = step_id
719
- self._ensure_run_log_dir()
720
- with self._canonical_event_lock:
721
- seq = self._canonical_event_seq.get(run_id, 0) + 1
722
- self._canonical_event_seq[run_id] = seq
723
- event_payload["seq"] = seq
724
- events_path = self._canonical_events_log_path(run_id)
725
- try:
726
- with events_path.open("a", encoding="utf-8") as f:
727
- f.write(json.dumps(event_payload, ensure_ascii=True) + "\n")
728
- except (OSError, IOError) as exc:
729
- self._app_server_logger.warning(
730
- "Failed to write canonical event for run %s: %s", run_id, exc
731
- )
732
-
733
- async def _cancel_task_with_notice(
734
- self,
735
- run_id: int,
736
- task: asyncio.Task[Any],
737
- *,
738
- name: str,
739
- ) -> None:
740
- if task.done():
741
- return
742
- task.cancel()
743
- try:
744
- await task
745
- except asyncio.CancelledError:
746
- self._emit_canonical_event(
747
- run_id,
748
- FlowEventType.RUN_CANCELLED,
749
- {"task": name},
750
- )
751
-
752
- def _ensure_log_path(self) -> None:
753
- self.log_path.parent.mkdir(parents=True, exist_ok=True)
754
-
755
- def _run_log_path(self, run_id: int) -> Path:
756
- return self.log_path.parent / "runs" / f"run-{run_id}.log"
757
-
758
- def _events_log_path(self, run_id: int) -> Path:
759
- return self.log_path.parent / "runs" / f"run-{run_id}.events.jsonl"
760
-
761
- def _canonical_events_log_path(self, run_id: int) -> Path:
762
- return self.log_path.parent / "runs" / f"run-{run_id}.events.canonical.jsonl"
763
-
764
- def _ensure_run_log_dir(self) -> None:
765
- (self.log_path.parent / "runs").mkdir(parents=True, exist_ok=True)
766
-
767
- def _write_run_marker(
768
- self,
769
- run_id: int,
770
- marker: str,
771
- exit_code: Optional[int] = None,
772
- *,
773
- actor: Optional[dict[str, Any]] = None,
774
- mode: Optional[dict[str, Any]] = None,
775
- ) -> None:
776
- suffix = ""
777
- if marker == "end":
778
- suffix = f" (code {exit_code})"
779
- self._emit_event(run_id, "run.finished", exit_code=exit_code)
780
- elif marker == "start":
781
- payload: dict[str, Any] = {}
782
- if actor is not None:
783
- payload["actor"] = actor
784
- if mode is not None:
785
- payload["mode"] = mode
786
- self._emit_event(run_id, "run.started", **payload)
787
- text = f"=== run {run_id} {marker}{suffix} ==="
788
- offset = self._emit_global_line(text)
789
- if self._active_run_log is not None:
790
- try:
791
- self._active_run_log.write(f"{text}\n")
792
- self._active_run_log.flush()
793
- except (OSError, IOError) as exc:
794
- self._app_server_logger.warning(
795
- "Failed to write marker to active run log for run %s: %s",
796
- run_id,
797
- exc,
798
- )
799
- else:
800
- self._ensure_run_log_dir()
801
- run_log = self._run_log_path(run_id)
802
- with run_log.open("a", encoding="utf-8") as f:
803
- f.write(f"{text}\n")
804
- self._update_run_index(
805
- run_id, marker, offset, exit_code, actor=actor, mode=mode
806
- )
807
-
808
- def _emit_global_line(self, text: str) -> Optional[tuple[int, int]]:
809
- if self._active_global_handler is None:
810
- self._ensure_log_path()
811
- try:
812
- with self.log_path.open("a", encoding="utf-8") as f:
813
- start = f.tell()
814
- f.write(f"{text}\n")
815
- f.flush()
816
- return (start, f.tell())
817
- except (OSError, IOError) as exc:
818
- self._app_server_logger.warning(
819
- "Failed to write global log line: %s", exc
820
- )
821
- return None
822
- handler = self._active_global_handler
823
- record = logging.LogRecord(
824
- name="codex_autorunner.engine",
825
- level=logging.INFO,
826
- pathname="",
827
- lineno=0,
828
- msg=text,
829
- args=(),
830
- exc_info=None,
831
- )
832
- handler.acquire()
833
- try:
834
- if handler.shouldRollover(record):
835
- handler.doRollover()
836
- if handler.stream is None:
837
- handler.stream = handler._open()
838
- start_offset = handler.stream.tell()
839
- logging.FileHandler.emit(handler, record)
840
- handler.flush()
841
- end_offset = handler.stream.tell()
842
- return (start_offset, end_offset)
843
- except (OSError, IOError, RuntimeError) as exc:
844
- self._app_server_logger.warning("Failed to emit log via handler: %s", exc)
845
- return None
846
- finally:
847
- handler.release()
848
-
849
- @contextlib.contextmanager
850
- def _run_log_context(self, run_id: int) -> Iterator[None]:
851
- self._ensure_log_path()
852
- self._ensure_run_log_dir()
853
- # Use getattr() for optional config attributes that may not exist in all config versions
854
- max_bytes = getattr(self.config.log, "max_bytes", None) or 0
855
- backup_count = getattr(self.config.log, "backup_count", 0) or 0
856
- handler = RotatingFileHandler(
857
- self.log_path,
858
- maxBytes=max_bytes,
859
- backupCount=backup_count,
860
- encoding="utf-8",
861
- )
862
- handler.setFormatter(logging.Formatter("%(message)s"))
863
- run_log = self._run_log_path(run_id)
864
- with run_log.open("a", encoding="utf-8") as run_handle:
865
- self._active_global_handler = handler
866
- self._active_run_log = run_handle
867
- try:
868
- yield
869
- finally:
870
- self._active_global_handler = None
871
- self._active_run_log = None
872
- try:
873
- handler.close()
874
- except (OSError, IOError) as exc:
875
- self._app_server_logger.debug(
876
- "Failed to close run log handler for run %s: %s", run_id, exc
877
- )
878
-
879
- def _start_run_telemetry(self, run_id: int) -> None:
880
- with self._run_telemetry_lock:
881
- self._run_telemetry = RunTelemetry(run_id=run_id)
882
- self._app_server_event_formatter.reset()
883
-
884
- def _update_run_telemetry(self, run_id: int, **updates: Any) -> None:
885
- with self._run_telemetry_lock:
886
- telemetry = self._run_telemetry
887
- if telemetry is None or telemetry.run_id != run_id:
888
- return
889
- for key, value in updates.items():
890
- if hasattr(telemetry, key):
891
- setattr(telemetry, key, value)
892
-
893
- def _snapshot_run_telemetry(self, run_id: int) -> Optional[RunTelemetry]:
894
- with self._run_telemetry_lock:
895
- telemetry = self._run_telemetry
896
- if telemetry is None or telemetry.run_id != run_id:
897
- return None
898
- return dataclasses.replace(telemetry)
899
-
900
- def _clear_run_telemetry(self, run_id: int) -> None:
901
- with self._run_telemetry_lock:
902
- telemetry = self._run_telemetry
903
- if telemetry is None or telemetry.run_id != run_id:
904
- return
905
- self._run_telemetry = None
906
-
907
- @staticmethod
908
- def _normalize_diff_payload(diff: Any) -> Optional[Any]:
909
- if diff is None:
910
- return None
911
- if isinstance(diff, str):
912
- return diff if diff.strip() else None
913
- if isinstance(diff, dict):
914
- # Prefer meaningful fields if present.
915
- for key in ("diff", "patch", "content", "value"):
916
- if key in diff:
917
- val = diff.get(key)
918
- if isinstance(val, str) and val.strip():
919
- return val
920
- if val not in (None, "", [], {}, ()):
921
- return diff
922
- for val in diff.values():
923
- if isinstance(val, str) and val.strip():
924
- return diff
925
- if val not in (None, "", [], {}, ()):
926
- return diff
927
- return None
928
- return diff
929
-
930
- @staticmethod
931
- def _hash_content(content: str) -> str:
932
- return hashlib.sha256((content or "").encode("utf-8")).hexdigest()
933
-
934
- def _serialize_plan_content(
935
- self,
936
- plan: Any,
937
- *,
938
- redact_enabled: bool,
939
- run_id: Optional[int] = None,
940
- ) -> str:
941
- try:
942
- content = (
943
- plan
944
- if isinstance(plan, str)
945
- else json.dumps(plan, ensure_ascii=True, indent=2, default=str)
946
- )
947
- except (TypeError, ValueError) as exc:
948
- if run_id is not None:
949
- self._app_server_logger.debug(
950
- "Failed to serialize plan to JSON for run %s: %s", run_id, exc
951
- )
952
- else:
953
- self._app_server_logger.debug(
954
- "Failed to serialize plan to JSON: %s", exc
955
- )
956
- content = json.dumps({"plan": str(plan)}, ensure_ascii=True, indent=2)
957
- if redact_enabled:
958
- content = redact_text(content)
959
- return content
960
-
961
- def _serialize_diff_content(
962
- self, diff: Any, *, redact_enabled: bool
963
- ) -> Optional[str]:
964
- normalized = self._normalize_diff_payload(diff)
965
- if normalized is None:
966
- return None
967
- content = (
968
- normalized
969
- if isinstance(normalized, str)
970
- else json.dumps(normalized, ensure_ascii=True, indent=2, default=str)
971
- )
972
- if redact_enabled:
973
- content = redact_text(content)
974
- return content
975
-
976
- def _maybe_update_run_index_telemetry(
977
- self, run_id: int, min_interval_seconds: float = 3.0
978
- ) -> None:
979
- import time as _time
980
-
981
- now = _time.time()
982
- if now - self._last_telemetry_update_time < min_interval_seconds:
983
- return
984
- telemetry = self._snapshot_run_telemetry(run_id)
985
- if telemetry is None:
986
- return
987
- if telemetry.thread_id and isinstance(telemetry.token_total, dict):
988
- with state_lock(self.state_path):
989
- state = load_state(self.state_path)
990
- selected_agent = (
991
- (state.autorunner_agent_override or "codex").strip().lower()
992
- )
993
- baseline = None
994
- if selected_agent != "opencode":
995
- baseline = self._find_thread_token_baseline(
996
- thread_id=telemetry.thread_id, run_id=run_id
997
- )
998
- delta = self._compute_token_delta(baseline, telemetry.token_total)
999
- self._merge_run_index_entry(
1000
- run_id,
1001
- {
1002
- "token_usage": {
1003
- "delta": delta,
1004
- "thread_total_before": baseline,
1005
- "thread_total_after": telemetry.token_total,
1006
- }
1007
- },
1008
- )
1009
- self._last_telemetry_update_time = now
1010
-
1011
- async def _handle_app_server_notification(self, message: dict[str, Any]) -> None:
1012
- if not isinstance(message, dict):
1013
- return
1014
- method = message.get("method")
1015
- params_raw = message.get("params")
1016
- params = params_raw if isinstance(params_raw, dict) else {}
1017
- thread_id = (
1018
- extract_thread_id_for_turn(params)
1019
- or extract_thread_id(params)
1020
- or extract_thread_id(message)
1021
- )
1022
- turn_id = extract_turn_id(params) or extract_turn_id(message)
1023
- run_id: Optional[int] = None
1024
- plan_update: Any = None
1025
- diff_update: Any = None
1026
- with self._run_telemetry_lock:
1027
- telemetry = self._run_telemetry
1028
- if telemetry is None:
1029
- return
1030
- if telemetry.thread_id and thread_id and telemetry.thread_id != thread_id:
1031
- return
1032
- if telemetry.turn_id and turn_id and telemetry.turn_id != turn_id:
1033
- return
1034
- if telemetry.thread_id is None and thread_id:
1035
- telemetry.thread_id = thread_id
1036
- if telemetry.turn_id is None and turn_id:
1037
- telemetry.turn_id = turn_id
1038
- run_id = telemetry.run_id
1039
- if method == "thread/tokenUsage/updated":
1040
- token_usage = (
1041
- params.get("token_usage") or params.get("tokenUsage") or {}
1042
- )
1043
- if isinstance(token_usage, dict):
1044
- total = token_usage.get("total") or token_usage.get("totals")
1045
- if isinstance(total, dict):
1046
- telemetry.token_total = total
1047
- self._maybe_update_run_index_telemetry(run_id)
1048
- self._emit_event(run_id, "token.updated", token_total=total)
1049
- if method == "turn/plan/updated":
1050
- plan_update = params.get("plan") if "plan" in params else params
1051
- telemetry.plan = plan_update
1052
- if method == "turn/diff/updated":
1053
- diff: Any = None
1054
- for key in ("diff", "patch", "content", "value"):
1055
- if key in params:
1056
- diff = params.get(key)
1057
- break
1058
- diff_update = diff if diff is not None else params or None
1059
- telemetry.diff = diff_update
1060
- if run_id is None:
1061
- return
1062
- redact_enabled = self.config.security.get("redact_run_logs", True)
1063
- notification_path = self._append_run_notification(
1064
- run_id, message, redact_enabled
1065
- )
1066
- if notification_path is not None:
1067
- self._merge_run_index_entry(
1068
- run_id,
1069
- {
1070
- "artifacts": {
1071
- "app_server_notifications_path": str(notification_path)
1072
- }
1073
- },
1074
- )
1075
- if plan_update is not None:
1076
- plan_content = self._serialize_plan_content(
1077
- plan_update, redact_enabled=redact_enabled, run_id=run_id
1078
- )
1079
- plan_path = self._write_run_artifact(run_id, "plan.json", plan_content)
1080
- self._merge_run_index_entry(
1081
- run_id, {"artifacts": {"plan_path": str(plan_path)}}
1082
- )
1083
- self._emit_event(
1084
- run_id,
1085
- "plan.updated",
1086
- plan_hash=self._hash_content(plan_content),
1087
- plan_path=str(plan_path),
1088
- )
1089
- if diff_update is not None:
1090
- diff_content = self._serialize_diff_content(
1091
- diff_update, redact_enabled=redact_enabled
1092
- )
1093
- if diff_content is not None:
1094
- diff_path = self._write_run_artifact(run_id, "diff.patch", diff_content)
1095
- self._merge_run_index_entry(
1096
- run_id, {"artifacts": {"diff_path": str(diff_path)}}
1097
- )
1098
- self._emit_event(
1099
- run_id,
1100
- "diff.updated",
1101
- diff_hash=self._hash_content(diff_content),
1102
- diff_path=str(diff_path),
1103
- )
1104
- for line in self._app_server_event_formatter.format_event(message):
1105
- self.log_line(run_id, f"stdout: {line}" if line else "stdout: ")
1106
-
1107
- def _load_run_index(self) -> dict[str, dict]:
1108
- return self._run_index_store.load_all()
1109
-
1110
- def reconcile_run_index(self) -> None:
1111
- """Best-effort: mark stale runs that still look 'running' in the run index.
1112
-
1113
- The Runs UI considers a run "running" when both `finished_at` and `exit_code`
1114
- are missing. If the runner process was killed or crashed, the `end` marker is
1115
- never written, so the entry stays "running" forever. This method uses the
1116
- runner state + lock pid as the authoritative signal for whether a run can
1117
- still be active, then forces stale entries to a finished/error state.
1118
- """
1119
- try:
1120
- state = load_state(self.state_path)
1121
- except Exception as exc:
1122
- self._app_server_logger.warning(
1123
- "Failed to load state during run index reconciliation: %s", exc
1124
- )
1125
- return
1126
-
1127
- active_pid: Optional[int] = None
1128
- pid = state.runner_pid
1129
- if pid and process_alive(pid):
1130
- active_pid = pid
1131
- else:
1132
- info = read_lock_info(self.lock_path)
1133
- if info.pid and process_alive(info.pid):
1134
- active_pid = info.pid
1135
-
1136
- active_run_id: Optional[int] = None
1137
- if (
1138
- active_pid is not None
1139
- and state.status == "running"
1140
- and state.last_run_id is not None
1141
- ):
1142
- active_run_id = int(state.last_run_id)
1143
-
1144
- now = now_iso()
1145
- try:
1146
- index = self._run_index_store.load_all()
1147
- except Exception as exc:
1148
- self._app_server_logger.warning(
1149
- "Failed to load run index during reconciliation: %s", exc
1150
- )
1151
- return
1152
-
1153
- for key, entry in index.items():
1154
- try:
1155
- run_id = int(key)
1156
- except (TypeError, ValueError):
1157
- continue
1158
- if not isinstance(entry, dict):
1159
- continue
1160
- if run_id <= 0:
1161
- continue
1162
-
1163
- if active_run_id is not None and run_id == active_run_id:
1164
- continue
1165
-
1166
- if entry.get("reconciled_at") is not None:
1167
- continue
1168
-
1169
- finished_at = entry.get("finished_at")
1170
- exit_code = entry.get("exit_code")
1171
-
1172
- if isinstance(finished_at, str) and finished_at:
1173
- continue
1174
- if exit_code is not None:
1175
- continue
1176
-
1177
- inferred_exit: int
1178
- if state.last_run_id == run_id and state.last_exit_code is not None:
1179
- inferred_exit = int(state.last_exit_code)
1180
- else:
1181
- inferred_exit = 1
1182
-
1183
- try:
1184
- self._run_index_store.merge_entry(
1185
- run_id,
1186
- {
1187
- "finished_at": now,
1188
- "exit_code": inferred_exit,
1189
- "reconciled_at": now,
1190
- "reconciled_reason": (
1191
- "runner_active"
1192
- if active_pid is not None
1193
- else "runner_inactive"
1194
- ),
1195
- },
1196
- )
1197
- except Exception as exc:
1198
- self._app_server_logger.warning(
1199
- "Failed to reconcile run index entry for run %d: %s", run_id, exc
1200
- )
1201
- continue
1202
-
1203
- def _merge_run_index_entry(self, run_id: int, updates: dict[str, Any]) -> None:
1204
- self._run_index_store.merge_entry(run_id, updates)
1205
-
1206
- def _update_run_index(
1207
- self,
1208
- run_id: int,
1209
- marker: str,
1210
- offset: Optional[tuple[int, int]],
1211
- exit_code: Optional[int],
1212
- *,
1213
- actor: Optional[dict[str, Any]] = None,
1214
- mode: Optional[dict[str, Any]] = None,
1215
- ) -> None:
1216
- self._run_index_store.update_marker(
1217
- run_id,
1218
- marker,
1219
- offset,
1220
- exit_code,
1221
- log_path=str(self.log_path),
1222
- run_log_path=str(self._run_log_path(run_id)),
1223
- actor=actor,
1224
- mode=mode,
1225
- )
1226
-
1227
- def _list_from_counts(self, source: list[str], counts: Counter[str]) -> list[str]:
1228
- if not source or not counts:
1229
- return []
1230
- remaining = Counter(counts)
1231
- items: list[str] = []
1232
- for entry in source:
1233
- if remaining[entry] > 0:
1234
- items.append(entry)
1235
- remaining[entry] -= 1
1236
- return items
1237
-
1238
- def _compute_todo_attribution(
1239
- self, before_text: str, after_text: str
1240
- ) -> dict[str, Any]:
1241
- before_out, before_done = parse_todos(before_text or "")
1242
- after_out, after_done = parse_todos(after_text or "")
1243
- before_out_counter = Counter(before_out)
1244
- before_done_counter = Counter(before_done)
1245
- after_out_counter = Counter(after_out)
1246
- after_done_counter = Counter(after_done)
1247
-
1248
- completed_counts: Counter[str] = Counter()
1249
- for item, count in after_done_counter.items():
1250
- if before_out_counter[item] > 0:
1251
- completed_counts[item] = min(before_out_counter[item], count)
1252
-
1253
- reopened_counts: Counter[str] = Counter()
1254
- for item, count in after_out_counter.items():
1255
- if before_done_counter[item] > 0:
1256
- reopened_counts[item] = min(before_done_counter[item], count)
1257
-
1258
- new_outstanding_counts = after_out_counter - before_out_counter
1259
- added_counts = new_outstanding_counts - reopened_counts
1260
-
1261
- completed = self._list_from_counts(after_done, completed_counts)
1262
- reopened = self._list_from_counts(after_out, reopened_counts)
1263
- added = self._list_from_counts(after_out, added_counts)
1264
-
1265
- return {
1266
- "completed": completed,
1267
- "added": added,
1268
- "reopened": reopened,
1269
- "counts": {
1270
- "completed": len(completed),
1271
- "added": len(added),
1272
- "reopened": len(reopened),
1273
- },
1274
- }
1275
-
1276
- def _build_todo_snapshot(self, before_text: str, after_text: str) -> dict[str, Any]:
1277
- before_out, before_done = parse_todos(before_text or "")
1278
- after_out, after_done = parse_todos(after_text or "")
1279
- return {
1280
- "before": {
1281
- "outstanding": before_out,
1282
- "done": before_done,
1283
- "counts": {
1284
- "outstanding": len(before_out),
1285
- "done": len(before_done),
1286
- },
1287
- },
1288
- "after": {
1289
- "outstanding": after_out,
1290
- "done": after_done,
1291
- "counts": {
1292
- "outstanding": len(after_out),
1293
- "done": len(after_done),
1294
- },
1295
- },
1296
- }
1297
-
1298
- def _find_thread_token_baseline(
1299
- self, *, thread_id: str, run_id: int
1300
- ) -> Optional[dict[str, Any]]:
1301
- index = self._load_run_index()
1302
- best_run = -1
1303
- baseline: Optional[dict[str, Any]] = None
1304
- for key, entry in index.items():
1305
- try:
1306
- entry_id = int(key)
1307
- except (TypeError, ValueError) as exc:
1308
- self._app_server_logger.debug(
1309
- "Failed to parse run index key '%s' while resolving run %s: %s",
1310
- key,
1311
- run_id,
1312
- exc,
1313
- )
1314
- continue
1315
- if entry_id >= run_id:
1316
- continue
1317
- app_server = entry.get("app_server")
1318
- if not isinstance(app_server, dict):
1319
- continue
1320
- if app_server.get("thread_id") != thread_id:
1321
- continue
1322
- token_usage = entry.get("token_usage")
1323
- if not isinstance(token_usage, dict):
1324
- continue
1325
- total = token_usage.get("thread_total_after")
1326
- if isinstance(total, dict) and entry_id > best_run:
1327
- best_run = entry_id
1328
- baseline = total
1329
- return baseline
1330
-
1331
- def _compute_token_delta(
1332
- self,
1333
- baseline: Optional[dict[str, Any]],
1334
- final_total: Optional[dict[str, Any]],
1335
- ) -> Optional[dict[str, Any]]:
1336
- if not isinstance(final_total, dict):
1337
- return None
1338
- base = baseline if isinstance(baseline, dict) else {}
1339
- delta: dict[str, Any] = {}
1340
- for key, value in final_total.items():
1341
- if not isinstance(value, (int, float)):
1342
- continue
1343
- prior = base.get(key, 0)
1344
- if isinstance(prior, (int, float)):
1345
- delta[key] = value - prior
1346
- else:
1347
- delta[key] = value
1348
- return delta
1349
-
1350
- def _build_app_server_meta(
1351
- self,
1352
- *,
1353
- thread_id: str,
1354
- turn_id: str,
1355
- thread_info: Optional[dict[str, Any]],
1356
- model: Optional[str],
1357
- reasoning_effort: Optional[str],
1358
- ) -> dict[str, Any]:
1359
- meta: dict[str, Any] = {"thread_id": thread_id, "turn_id": turn_id}
1360
- if model:
1361
- meta["model"] = model
1362
- if reasoning_effort:
1363
- meta["reasoning_effort"] = reasoning_effort
1364
- if not isinstance(thread_info, dict):
1365
- return meta
1366
-
1367
- def _first_string(keys: tuple[str, ...]) -> Optional[str]:
1368
- for key in keys:
1369
- value = thread_info.get(key)
1370
- if isinstance(value, str) and value:
1371
- return value
1372
- return None
1373
-
1374
- if "model" not in meta:
1375
- thread_model = _first_string(("model", "model_id", "modelId", "model_name"))
1376
- if thread_model:
1377
- meta["model"] = thread_model
1378
- provider = _first_string(
1379
- ("model_provider", "modelProvider", "provider", "model_provider_name")
1380
- )
1381
- if provider:
1382
- meta["model_provider"] = provider
1383
- if "reasoning_effort" not in meta:
1384
- thread_effort = _first_string(
1385
- ("reasoning_effort", "reasoningEffort", "effort")
1386
- )
1387
- if thread_effort:
1388
- meta["reasoning_effort"] = thread_effort
1389
- return meta
1390
-
1391
- def _write_run_artifact(self, run_id: int, name: str, content: str) -> Path:
1392
- self._ensure_run_log_dir()
1393
- path = self.log_path.parent / "runs" / f"run-{run_id}.{name}"
1394
- atomic_write(path, content)
1395
- return path
1396
-
1397
- def _write_run_usage_artifact(
1398
- self, run_id: int, payload: dict[str, Any]
1399
- ) -> Optional[Path]:
1400
- self._ensure_run_log_dir()
1401
- run_dir = self.log_path.parent / "runs" / str(run_id)
1402
- try:
1403
- run_dir.mkdir(parents=True, exist_ok=True)
1404
- path = run_dir / "usage.json"
1405
- atomic_write(
1406
- path,
1407
- json.dumps(payload, ensure_ascii=True, indent=2, default=str),
1408
- )
1409
- return path
1410
- except OSError as exc:
1411
- self._app_server_logger.warning(
1412
- "Failed to write usage artifact for run %s: %s", run_id, exc
1413
- )
1414
- return None
1415
-
1416
- def _app_server_notifications_path(self, run_id: int) -> Path:
1417
- return (
1418
- self.log_path.parent
1419
- / "runs"
1420
- / f"run-{run_id}.app_server.notifications.jsonl"
1421
- )
1422
-
1423
- def _append_run_notification(
1424
- self, run_id: int, message: dict[str, Any], redact_enabled: bool
1425
- ) -> Optional[Path]:
1426
- self._ensure_run_log_dir()
1427
- path = self._app_server_notifications_path(run_id)
1428
- payload = {"ts": timestamp(), "message": message}
1429
- try:
1430
- line = json.dumps(payload, ensure_ascii=True, default=str)
1431
- if redact_enabled:
1432
- line = redact_text(line)
1433
- with path.open("a", encoding="utf-8") as f:
1434
- f.write(line + "\n")
1435
- except (OSError, IOError, TypeError, ValueError) as exc:
1436
- self._app_server_logger.warning(
1437
- "Failed to write app-server notification for run %s: %s", run_id, exc
1438
- )
1439
- return None
1440
- return path
1441
-
1442
- def _read_log_range(self, run_id: int, entry: dict) -> Optional[str]:
1443
- start = entry.get("start_offset")
1444
- end = entry.get("end_offset")
1445
- if start is None or end is None:
1446
- return None
1447
- try:
1448
- start_offset = int(start)
1449
- end_offset = int(end)
1450
- except (TypeError, ValueError) as exc:
1451
- self._app_server_logger.debug(
1452
- "Failed to parse log range offsets for run %s: %s", run_id, exc
1453
- )
1454
- return None
1455
- if end_offset < start_offset:
1456
- return None
1457
- log_path = Path(entry.get("log_path", self.log_path))
1458
- if not log_path.exists():
1459
- return None
1460
- try:
1461
- size = log_path.stat().st_size
1462
- if size < end_offset:
1463
- return None
1464
- with log_path.open("rb") as f:
1465
- f.seek(start_offset)
1466
- data = f.read(end_offset - start_offset)
1467
- return data.decode("utf-8", errors="replace")
1468
- except (FileNotFoundError, OSError) as exc:
1469
- self._app_server_logger.debug(
1470
- "Failed to read log range for run %s: %s", run_id, exc
1471
- )
1472
- return None
1473
-
1474
- def _build_app_server_prompt(self, prev_output: Optional[str]) -> str:
1475
- return build_autorunner_prompt(
1476
- self.config,
1477
- message=AUTORUNNER_APP_SERVER_MESSAGE,
1478
- prev_run_summary=prev_output,
1479
- )
1480
-
1481
- def run_codex_app_server(
1482
- self,
1483
- prompt: str,
1484
- run_id: int,
1485
- *,
1486
- external_stop_flag: Optional[threading.Event] = None,
1487
- ) -> int:
1488
- try:
1489
- return asyncio.run(
1490
- self._run_codex_app_server_async(
1491
- prompt,
1492
- run_id,
1493
- external_stop_flag=external_stop_flag,
1494
- )
1495
- )
1496
- except RuntimeError as exc:
1497
- if "asyncio.run" in str(exc):
1498
- self.log_line(
1499
- run_id,
1500
- "error: app-server backend cannot run inside an active event loop",
1501
- )
1502
- return 1
1503
- raise
1504
-
1505
- async def _run_agent_async(
1506
- self,
1507
- *,
1508
- agent_id: str,
1509
- prompt: str,
1510
- run_id: int,
1511
- state: RunnerState,
1512
- external_stop_flag: Optional[threading.Event],
1513
- ) -> int:
1514
- """
1515
- Run an agent turn using the specified backend.
1516
-
1517
- This method is protocol-agnostic - it determines the appropriate
1518
- model/reasoning parameters based on the agent_id and delegates to
1519
- either the BackendOrchestrator or _run_agent_backend_async().
1520
- """
1521
- # Determine model and reasoning parameters based on agent
1522
- if agent_id == "codex":
1523
- model = state.autorunner_model_override or self.config.codex_model
1524
- reasoning = state.autorunner_effort_override or self.config.codex_reasoning
1525
- elif agent_id == "opencode":
1526
- model = state.autorunner_model_override
1527
- reasoning = state.autorunner_effort_override
1528
- else:
1529
- # Fallback to codex defaults for unknown agents
1530
- model = state.autorunner_model_override or self.config.codex_model
1531
- reasoning = state.autorunner_effort_override or self.config.codex_reasoning
1532
-
1533
- # Use BackendOrchestrator if available, otherwise fall back to old method
1534
- if agent_id == "codex":
1535
- session_key = "autorunner"
1536
- elif agent_id == "opencode":
1537
- session_key = "autorunner.opencode"
1538
- else:
1539
- session_key = "autorunner"
1540
-
1541
- if self._backend_orchestrator is not None:
1542
- return await self._run_agent_via_orchestrator(
1543
- agent_id=agent_id,
1544
- prompt=prompt,
1545
- run_id=run_id,
1546
- state=state,
1547
- model=model,
1548
- reasoning=reasoning,
1549
- session_key=session_key,
1550
- external_stop_flag=external_stop_flag,
1551
- )
1552
-
1553
- # Fallback to old method for backward compatibility (testing)
1554
- return await self._run_agent_backend_async(
1555
- agent_id=agent_id,
1556
- prompt=prompt,
1557
- run_id=run_id,
1558
- state=state,
1559
- session_key=session_key,
1560
- model=model,
1561
- reasoning=reasoning,
1562
- external_stop_flag=external_stop_flag,
1563
- )
1564
-
1565
- async def _run_agent_via_orchestrator(
1566
- self,
1567
- *,
1568
- agent_id: str,
1569
- prompt: str,
1570
- run_id: int,
1571
- state: RunnerState,
1572
- model: Optional[str],
1573
- reasoning: Optional[str],
1574
- session_key: str,
1575
- external_stop_flag: Optional[threading.Event],
1576
- ) -> int:
1577
- """
1578
- Run an agent turn using the BackendOrchestrator.
1579
-
1580
- This method uses the orchestrator's protocol-agnostic interface to run
1581
- a turn on the backend, handling all events and emitting canonical events.
1582
- """
1583
- orchestrator = self._backend_orchestrator
1584
- assert (
1585
- orchestrator is not None
1586
- ), "orchestrator should be set when calling this method"
1587
-
1588
- events: asyncio.Queue[Optional[RunEvent]] = asyncio.Queue()
1589
-
1590
- async def _produce_events() -> None:
1591
- try:
1592
- async for event in orchestrator.run_turn(
1593
- agent_id=agent_id,
1594
- state=state,
1595
- prompt=prompt,
1596
- model=model,
1597
- reasoning=reasoning,
1598
- session_key=session_key,
1599
- ):
1600
- await events.put(event)
1601
- except Exception as exc:
1602
- await events.put(Failed(timestamp=now_iso(), error_message=str(exc)))
1603
- finally:
1604
- await events.put(None)
1605
-
1606
- producer_task = asyncio.create_task(_produce_events())
1607
- stop_task = asyncio.create_task(self._wait_for_stop(external_stop_flag))
1608
- timeout_seconds = self.config.app_server.turn_timeout_seconds
1609
- timeout_task: Optional[asyncio.Task] = (
1610
- asyncio.create_task(asyncio.sleep(timeout_seconds))
1611
- if timeout_seconds
1612
- else None
1613
- )
1614
-
1615
- assistant_messages: list[str] = []
1616
- final_message: Optional[str] = None
1617
- failed_error: Optional[str] = None
1618
-
1619
- try:
1620
- while True:
1621
- get_task = asyncio.create_task(events.get())
1622
- tasks = {get_task, stop_task}
1623
- if timeout_task is not None:
1624
- tasks.add(timeout_task)
1625
- done, pending = await asyncio.wait(
1626
- tasks, return_when=asyncio.FIRST_COMPLETED
1627
- )
1628
-
1629
- if get_task in done:
1630
- event = get_task.result()
1631
- if event is None:
1632
- break
1633
- if isinstance(event, Started) and event.session_id:
1634
- self._update_run_telemetry(run_id, thread_id=event.session_id)
1635
- elif isinstance(event, OutputDelta):
1636
- self._emit_canonical_event(
1637
- run_id,
1638
- FlowEventType.AGENT_STREAM_DELTA,
1639
- {
1640
- "delta": event.content,
1641
- "delta_type": event.delta_type,
1642
- },
1643
- timestamp_override=event.timestamp,
1644
- )
1645
- if event.delta_type in {
1646
- "assistant_message",
1647
- "assistant_stream",
1648
- }:
1649
- assistant_messages.append(event.content)
1650
- elif event.delta_type == "log_line":
1651
- self.log_line(
1652
- run_id,
1653
- (
1654
- f"stdout: {event.content}"
1655
- if event.content
1656
- else "stdout: "
1657
- ),
1658
- )
1659
- elif isinstance(event, ToolCall):
1660
- self._emit_canonical_event(
1661
- run_id,
1662
- FlowEventType.TOOL_CALL,
1663
- {
1664
- "tool_name": event.tool_name,
1665
- "tool_input": event.tool_input,
1666
- },
1667
- timestamp_override=event.timestamp,
1668
- )
1669
- elif isinstance(event, ApprovalRequested):
1670
- self._emit_canonical_event(
1671
- run_id,
1672
- FlowEventType.APPROVAL_REQUESTED,
1673
- {
1674
- "request_id": event.request_id,
1675
- "description": event.description,
1676
- "context": event.context,
1677
- },
1678
- timestamp_override=event.timestamp,
1679
- )
1680
- elif isinstance(event, TokenUsage):
1681
- self._emit_canonical_event(
1682
- run_id,
1683
- FlowEventType.TOKEN_USAGE,
1684
- {"usage": event.usage},
1685
- timestamp_override=event.timestamp,
1686
- )
1687
- elif isinstance(event, RunNotice):
1688
- notice_type = FlowEventType.RUN_STATE_CHANGED
1689
- if event.kind.endswith("timeout"):
1690
- notice_type = FlowEventType.RUN_TIMEOUT
1691
- elif "cancel" in event.kind:
1692
- notice_type = FlowEventType.RUN_CANCELLED
1693
- data: dict[str, Any] = {
1694
- "kind": event.kind,
1695
- "message": event.message,
1696
- }
1697
- if event.data:
1698
- data["data"] = event.data
1699
- self._emit_canonical_event(
1700
- run_id,
1701
- notice_type,
1702
- data,
1703
- timestamp_override=event.timestamp,
1704
- )
1705
- elif isinstance(event, Completed):
1706
- if event.final_message:
1707
- self._emit_canonical_event(
1708
- run_id,
1709
- FlowEventType.AGENT_MESSAGE_COMPLETE,
1710
- {"final_message": event.final_message},
1711
- timestamp_override=event.timestamp,
1712
- )
1713
- if event.final_message:
1714
- final_message = event.final_message
1715
- elif isinstance(event, Failed):
1716
- self.log_line(
1717
- run_id,
1718
- f"error: backend run failed: {event.error_message}",
1719
- )
1720
- failed_error = event.error_message
1721
-
1722
- if stop_task in done:
1723
- self._last_run_interrupted = True
1724
- self.log_line(run_id, "info: stop requested; interrupting backend")
1725
- if not producer_task.done():
1726
- producer_task.cancel()
1727
- try:
1728
- await producer_task
1729
- except asyncio.CancelledError:
1730
- pass
1731
- if timeout_task and not timeout_task.done():
1732
- timeout_task.cancel()
1733
- try:
1734
- await orchestrator.interrupt(agent_id, state)
1735
- except Exception as exc:
1736
- self.log_line(run_id, f"interrupt failed: {exc}")
1737
- if not get_task.done():
1738
- get_task.cancel()
1739
- for task in pending:
1740
- task.cancel()
1741
- return 0
1742
-
1743
- if timeout_task and timeout_task in done:
1744
- if not producer_task.done():
1745
- producer_task.cancel()
1746
- try:
1747
- await producer_task
1748
- except asyncio.CancelledError:
1749
- pass
1750
- try:
1751
- await orchestrator.interrupt(agent_id, state)
1752
- except Exception as exc:
1753
- self.log_line(run_id, f"interrupt failed: {exc}")
1754
- if not get_task.done():
1755
- get_task.cancel()
1756
- for task in pending:
1757
- task.cancel()
1758
- return 1
1759
- finally:
1760
- if not producer_task.done():
1761
- producer_task.cancel()
1762
- try:
1763
- await producer_task
1764
- except asyncio.CancelledError:
1765
- pass
1766
- if timeout_task and not timeout_task.done():
1767
- timeout_task.cancel()
1768
- if stop_task and not stop_task.done():
1769
- stop_task.cancel()
1770
-
1771
- if failed_error:
1772
- return 1
1773
-
1774
- output_messages: list[str] = []
1775
- if final_message:
1776
- self.log_line(run_id, final_message)
1777
- output_messages = [final_message]
1778
- elif assistant_messages:
1779
- output_messages = assistant_messages
1780
-
1781
- if output_messages:
1782
- handle_agent_output(
1783
- self._log_app_server_output,
1784
- self._write_run_artifact,
1785
- self._merge_run_index_entry,
1786
- run_id,
1787
- output_messages,
1788
- )
1789
-
1790
- context = orchestrator.get_context()
1791
- if context:
1792
- turn_id = context.turn_id or orchestrator.get_last_turn_id()
1793
- thread_info = context.thread_info or orchestrator.get_last_thread_info()
1794
- token_total = orchestrator.get_last_token_total()
1795
- self._update_run_telemetry(
1796
- run_id,
1797
- turn_id=turn_id,
1798
- token_total=token_total,
1799
- )
1800
- if thread_info:
1801
- self._update_run_telemetry(run_id, thread_info=thread_info)
1802
-
1803
- return 0
1804
-
1805
- async def _run_codex_app_server_async(
1806
- self,
1807
- prompt: str,
1808
- run_id: int,
1809
- *,
1810
- external_stop_flag: Optional[threading.Event] = None,
1811
- ) -> int:
1812
- config = self.config
1813
- if not config.app_server.command:
1814
- self.log_line(
1815
- run_id,
1816
- "error: app-server backend requires app_server.command to be configured",
1817
- )
1818
- return 1
1819
- with state_lock(self.state_path):
1820
- state = load_state(self.state_path)
1821
- effective_model = state.autorunner_model_override or config.codex_model
1822
- effective_effort = state.autorunner_effort_override or config.codex_reasoning
1823
- return await self._run_agent_backend_async(
1824
- agent_id="codex",
1825
- prompt=prompt,
1826
- run_id=run_id,
1827
- state=state,
1828
- session_key="autorunner",
1829
- model=effective_model,
1830
- reasoning=effective_effort,
1831
- external_stop_flag=external_stop_flag,
1832
- )
1833
-
1834
- async def _run_agent_backend_async(
1835
- self,
1836
- *,
1837
- agent_id: str,
1838
- prompt: str,
1839
- run_id: int,
1840
- state: RunnerState,
1841
- session_key: str,
1842
- model: Optional[str],
1843
- reasoning: Optional[str],
1844
- external_stop_flag: Optional[threading.Event],
1845
- ) -> int:
1846
- if self._backend_factory is None:
1847
- self.log_line(
1848
- run_id,
1849
- f"error: {agent_id} backend factory is not configured for this engine",
1850
- )
1851
- return 1
1852
-
1853
- try:
1854
- backend = self._backend_factory(
1855
- agent_id, state, self._handle_app_server_notification
1856
- )
1857
- except Exception as exc:
1858
- self.log_line(
1859
- run_id, f"error: failed to initialize {agent_id} backend: {exc}"
1860
- )
1861
- return 1
1862
-
1863
- reuse_session = bool(getattr(self.config, "autorunner_reuse_session", False))
1864
- session_id: Optional[str] = None
1865
- if reuse_session and self._backend_orchestrator is not None:
1866
- session_id = self._backend_orchestrator.get_thread_id(session_key)
1867
- elif reuse_session:
1868
- with self._app_server_threads_lock:
1869
- session_id = self._app_server_threads.get_thread_id(session_key)
1870
-
1871
- try:
1872
- session_id = await backend.start_session(
1873
- target={"workspace": str(self.repo_root)},
1874
- context={"workspace": str(self.repo_root), "session_id": session_id},
1875
- )
1876
- except Exception as exc:
1877
- self.log_line(
1878
- run_id, f"error: {agent_id} backend failed to start session: {exc}"
1879
- )
1880
- return 1
1881
-
1882
- if not session_id:
1883
- self.log_line(
1884
- run_id, f"error: {agent_id} backend did not return a session id"
1885
- )
1886
- return 1
1887
-
1888
- if reuse_session and self._backend_orchestrator is not None:
1889
- self._backend_orchestrator.set_thread_id(session_key, session_id)
1890
- elif reuse_session:
1891
- with self._app_server_threads_lock:
1892
- self._app_server_threads.set_thread_id(session_key, session_id)
1893
-
1894
- self._update_run_telemetry(run_id, thread_id=session_id)
1895
-
1896
- events: asyncio.Queue[Optional[RunEvent]] = asyncio.Queue()
1897
-
1898
- async def _produce_events() -> None:
1899
- try:
1900
- async for event in backend.run_turn_events(session_id, prompt):
1901
- await events.put(event)
1902
- except Exception as exc:
1903
- await events.put(Failed(timestamp=now_iso(), error_message=str(exc)))
1904
- finally:
1905
- await events.put(None)
1906
-
1907
- producer_task = asyncio.create_task(_produce_events())
1908
- stop_task = asyncio.create_task(self._wait_for_stop(external_stop_flag))
1909
- timeout_seconds = self.config.app_server.turn_timeout_seconds
1910
- timeout_task: Optional[asyncio.Task] = (
1911
- asyncio.create_task(asyncio.sleep(timeout_seconds))
1912
- if timeout_seconds
1913
- else None
1914
- )
1915
-
1916
- assistant_messages: list[str] = []
1917
- final_message: Optional[str] = None
1918
- failed_error: Optional[str] = None
1919
-
1920
- try:
1921
- while True:
1922
- get_task = asyncio.create_task(events.get())
1923
- tasks = {get_task, stop_task}
1924
- if timeout_task is not None:
1925
- tasks.add(timeout_task)
1926
- done, pending = await asyncio.wait(
1927
- tasks, return_when=asyncio.FIRST_COMPLETED
1928
- )
1929
-
1930
- if get_task in done:
1931
- event = get_task.result()
1932
- if event is None:
1933
- break
1934
- if isinstance(event, Started) and event.session_id:
1935
- self._update_run_telemetry(
1936
- run_id, thread_id=event.session_id, turn_id=event.turn_id
1937
- )
1938
- elif isinstance(event, OutputDelta):
1939
- self._emit_canonical_event(
1940
- run_id,
1941
- FlowEventType.AGENT_STREAM_DELTA,
1942
- {
1943
- "delta": event.content,
1944
- "delta_type": event.delta_type,
1945
- },
1946
- timestamp_override=event.timestamp,
1947
- )
1948
- if event.delta_type in {
1949
- "assistant_message",
1950
- "assistant_stream",
1951
- }:
1952
- assistant_messages.append(event.content)
1953
- elif event.delta_type == "log_line":
1954
- self.log_line(
1955
- run_id,
1956
- (
1957
- f"stdout: {event.content}"
1958
- if event.content
1959
- else "stdout: "
1960
- ),
1961
- )
1962
- elif isinstance(event, ToolCall):
1963
- self._emit_canonical_event(
1964
- run_id,
1965
- FlowEventType.TOOL_CALL,
1966
- {
1967
- "tool_name": event.tool_name,
1968
- "tool_input": event.tool_input,
1969
- },
1970
- timestamp_override=event.timestamp,
1971
- )
1972
- elif isinstance(event, ApprovalRequested):
1973
- self._emit_canonical_event(
1974
- run_id,
1975
- FlowEventType.APPROVAL_REQUESTED,
1976
- {
1977
- "request_id": event.request_id,
1978
- "description": event.description,
1979
- "context": event.context,
1980
- },
1981
- timestamp_override=event.timestamp,
1982
- )
1983
- elif isinstance(event, TokenUsage):
1984
- self._emit_canonical_event(
1985
- run_id,
1986
- FlowEventType.TOKEN_USAGE,
1987
- {"usage": event.usage},
1988
- timestamp_override=event.timestamp,
1989
- )
1990
- elif isinstance(event, RunNotice):
1991
- notice_type = FlowEventType.RUN_STATE_CHANGED
1992
- if event.kind.endswith("timeout"):
1993
- notice_type = FlowEventType.RUN_TIMEOUT
1994
- elif "cancel" in event.kind:
1995
- notice_type = FlowEventType.RUN_CANCELLED
1996
- data: dict[str, Any] = {
1997
- "kind": event.kind,
1998
- "message": event.message,
1999
- }
2000
- if event.data:
2001
- data["data"] = event.data
2002
- self._emit_canonical_event(
2003
- run_id,
2004
- notice_type,
2005
- data,
2006
- timestamp_override=event.timestamp,
2007
- )
2008
- elif isinstance(event, Completed):
2009
- if event.final_message:
2010
- self._emit_canonical_event(
2011
- run_id,
2012
- FlowEventType.AGENT_MESSAGE_COMPLETE,
2013
- {"final_message": event.final_message},
2014
- timestamp_override=event.timestamp,
2015
- )
2016
- if event.final_message:
2017
- final_message = event.final_message
2018
- elif isinstance(event, Failed):
2019
- self._emit_canonical_event(
2020
- run_id,
2021
- FlowEventType.AGENT_FAILED,
2022
- {"error_message": event.error_message},
2023
- timestamp_override=event.timestamp,
2024
- )
2025
- failed_error = event.error_message
2026
- continue
2027
-
2028
- timed_out = timeout_task is not None and timeout_task in done
2029
- stopped = stop_task in done
2030
- if timed_out:
2031
- self.log_line(
2032
- run_id,
2033
- "error: app-server turn timed out; interrupting app-server",
2034
- )
2035
- self._emit_canonical_event(
2036
- run_id,
2037
- FlowEventType.RUN_TIMEOUT,
2038
- {
2039
- "context": "app_server_turn",
2040
- "timeout_seconds": timeout_seconds,
2041
- },
2042
- )
2043
- if stopped:
2044
- self._last_run_interrupted = True
2045
- self.log_line(
2046
- run_id, "info: stop requested; interrupting app-server"
2047
- )
2048
- try:
2049
- await backend.interrupt(session_id)
2050
- except Exception as exc:
2051
- self.log_line(run_id, f"error: app-server interrupt failed: {exc}")
2052
-
2053
- done_after_interrupt, _pending = await asyncio.wait(
2054
- {producer_task}, timeout=AUTORUNNER_INTERRUPT_GRACE_SECONDS
2055
- )
2056
- if not done_after_interrupt:
2057
- await self._cancel_task_with_notice(
2058
- run_id, producer_task, name="producer_task"
2059
- )
2060
- if stopped:
2061
- return 0
2062
- return 1
2063
- if stopped:
2064
- return 0
2065
- return 1
2066
-
2067
- await producer_task
2068
- finally:
2069
- await self._cancel_task_with_notice(run_id, stop_task, name="stop_task")
2070
- if timeout_task is not None:
2071
- await self._cancel_task_with_notice(
2072
- run_id, timeout_task, name="timeout_task"
2073
- )
2074
-
2075
- if failed_error:
2076
- self.log_line(run_id, f"error: {failed_error}")
2077
- return 1
2078
-
2079
- output_messages = []
2080
- if final_message:
2081
- output_messages = [final_message]
2082
- elif assistant_messages:
2083
- output_messages = assistant_messages
2084
-
2085
- if output_messages:
2086
- handle_agent_output(
2087
- self._log_app_server_output,
2088
- self._write_run_artifact,
2089
- self._merge_run_index_entry,
2090
- run_id,
2091
- output_messages,
2092
- )
2093
-
2094
- token_total = getattr(backend, "last_token_total", None)
2095
- if isinstance(token_total, dict):
2096
- self._update_run_telemetry(run_id, token_total=token_total)
2097
-
2098
- telemetry = self._snapshot_run_telemetry(run_id)
2099
- turn_id = None
2100
- if telemetry is not None:
2101
- turn_id = telemetry.turn_id
2102
- if not turn_id:
2103
- turn_id = getattr(backend, "last_turn_id", None)
2104
- thread_info = getattr(backend, "last_thread_info", None)
2105
-
2106
- if session_id and turn_id:
2107
- app_server_meta = self._build_app_server_meta(
2108
- thread_id=session_id,
2109
- turn_id=turn_id,
2110
- thread_info=thread_info if isinstance(thread_info, dict) else None,
2111
- model=model,
2112
- reasoning_effort=reasoning,
2113
- )
2114
- if agent_id != "codex":
2115
- app_server_meta["agent"] = agent_id
2116
- self._merge_run_index_entry(run_id, {"app_server": app_server_meta})
2117
-
2118
- return 0
2119
-
2120
- def _log_app_server_output(self, run_id: int, messages: list[str]) -> None:
2121
- if not messages:
2122
- return
2123
- for message in messages:
2124
- text = str(message)
2125
- lines = text.splitlines() or [""]
2126
- for line in lines:
2127
- self.log_line(run_id, f"stdout: {line}" if line else "stdout: ")
2128
-
2129
- def maybe_git_commit(self, run_id: int) -> None:
2130
- msg = self.config.git_commit_message_template.replace(
2131
- "{run_id}", str(run_id)
2132
- ).replace("#{run_id}", str(run_id))
2133
- paths = []
2134
- for key in ("active_context", "decisions", "spec"):
2135
- try:
2136
- paths.append(self.config.doc_path(key))
2137
- except KeyError:
2138
- pass
2139
- add_paths = [str(p.relative_to(self.repo_root)) for p in paths if p.exists()]
2140
- if not add_paths:
2141
- return
2142
- try:
2143
- add_proc = run_git(["add", *add_paths], self.repo_root, check=False)
2144
- if add_proc.returncode != 0:
2145
- detail = (
2146
- add_proc.stderr or add_proc.stdout or ""
2147
- ).strip() or f"exit {add_proc.returncode}"
2148
- self.log_line(run_id, f"git add failed: {detail}")
2149
- return
2150
- except GitError as exc:
2151
- self.log_line(run_id, f"git add failed: {exc}")
2152
- return
2153
- try:
2154
- commit_proc = run_git(
2155
- ["commit", "-m", msg],
2156
- self.repo_root,
2157
- check=False,
2158
- timeout_seconds=120,
2159
- )
2160
- if commit_proc.returncode != 0:
2161
- detail = (
2162
- commit_proc.stderr or commit_proc.stdout or ""
2163
- ).strip() or f"exit {commit_proc.returncode}"
2164
- self.log_line(run_id, f"git commit failed: {detail}")
2165
- except GitError as exc:
2166
- self.log_line(run_id, f"git commit failed: {exc}")
2167
-
2168
- def _ensure_app_server_supervisor(self, event_prefix: str) -> Optional[Any]:
2169
- """
2170
- Ensure app server supervisor exists by delegating to BackendOrchestrator.
2171
-
2172
- This method is kept for backward compatibility but now delegates to
2173
- BackendOrchestrator to keep Engine protocol-agnostic.
2174
- """
2175
- if self._app_server_supervisor is None:
2176
- if (
2177
- self._backend_orchestrator is None
2178
- and self._app_server_supervisor_factory is not None
2179
- ):
2180
- self._app_server_supervisor = self._app_server_supervisor_factory(
2181
- event_prefix, self._handle_app_server_notification
2182
- )
2183
- elif self._backend_orchestrator is not None:
2184
- try:
2185
- self._app_server_supervisor = (
2186
- self._backend_orchestrator.build_app_server_supervisor(
2187
- event_prefix=event_prefix,
2188
- notification_handler=self._handle_app_server_notification,
2189
- )
2190
- )
2191
- except Exception:
2192
- if self._app_server_supervisor_factory is not None:
2193
- self._app_server_supervisor = (
2194
- self._app_server_supervisor_factory(
2195
- event_prefix, self._handle_app_server_notification
2196
- )
2197
- )
2198
- return self._app_server_supervisor
2199
-
2200
- async def _close_app_server_supervisor(self) -> None:
2201
- if self._app_server_supervisor is None:
2202
- return
2203
- supervisor = self._app_server_supervisor
2204
- self._app_server_supervisor = None
2205
- try:
2206
- close_all = getattr(supervisor, "close_all", None)
2207
- if close_all is None:
2208
- return
2209
- result = close_all()
2210
- if inspect.isawaitable(result):
2211
- await result
2212
- except Exception as exc:
2213
- self._app_server_logger.warning(
2214
- "app-server supervisor close failed: %s", exc
2215
- )
2216
-
2217
- async def _close_agent_backends(self) -> None:
2218
- if self._backend_factory is None:
2219
- return
2220
- close_all = getattr(self._backend_factory, "close_all", None)
2221
- if close_all is None:
2222
- return
2223
- try:
2224
- result = close_all()
2225
- if inspect.isawaitable(result):
2226
- await result
2227
- except Exception as exc:
2228
- self._app_server_logger.warning("agent backend close failed: %s", exc)
2229
-
2230
- def _build_opencode_supervisor(self) -> Optional[Any]:
2231
- """
2232
- Build OpenCode supervisor by delegating to BackendOrchestrator.
2233
-
2234
- This method is kept for backward compatibility but now delegates to
2235
- BackendOrchestrator to keep Engine protocol-agnostic.
2236
- """
2237
- if self._backend_orchestrator is None:
2238
- return None
2239
-
2240
- return self._backend_orchestrator.ensure_opencode_supervisor()
2241
-
2242
- def _ensure_opencode_supervisor(self) -> Optional[Any]:
2243
- """
2244
- Ensure OpenCode supervisor exists by delegating to BackendOrchestrator.
2245
-
2246
- This method is kept for backward compatibility but now delegates to
2247
- BackendOrchestrator to keep Engine protocol-agnostic.
2248
- """
2249
- if self._opencode_supervisor is None:
2250
- self._opencode_supervisor = self._build_opencode_supervisor()
2251
- return self._opencode_supervisor
2252
-
2253
- async def _close_opencode_supervisor(self) -> None:
2254
- if self._opencode_supervisor is None:
2255
- return
2256
- supervisor = self._opencode_supervisor
2257
- self._opencode_supervisor = None
2258
- try:
2259
- await supervisor.close_all()
2260
- except Exception as exc:
2261
- self._app_server_logger.warning("opencode supervisor close failed: %s", exc)
2262
-
2263
- async def _wait_for_stop(
2264
- self,
2265
- external_stop_flag: Optional[threading.Event],
2266
- stop_event: Optional[asyncio.Event] = None,
2267
- ) -> None:
2268
- while not self._should_stop(external_stop_flag):
2269
- await asyncio.sleep(AUTORUNNER_STOP_POLL_SECONDS)
2270
- if stop_event is not None:
2271
- stop_event.set()
2272
-
2273
- async def _wait_for_turn_with_stop(
2274
- self,
2275
- client: Any,
2276
- handle: Any,
2277
- run_id: int,
2278
- *,
2279
- timeout: Optional[float],
2280
- external_stop_flag: Optional[threading.Event],
2281
- supervisor: Optional[Any] = None,
2282
- ) -> tuple[Any, bool]:
2283
- stop_task = asyncio.create_task(self._wait_for_stop(external_stop_flag))
2284
- turn_task = asyncio.create_task(handle.wait(timeout=None))
2285
- timeout_task: Optional[asyncio.Task] = (
2286
- asyncio.create_task(asyncio.sleep(timeout)) if timeout else None
2287
- )
2288
- interrupted = False
2289
- try:
2290
- tasks = {stop_task, turn_task}
2291
- if timeout_task is not None:
2292
- tasks.add(timeout_task)
2293
- done, _pending = await asyncio.wait(
2294
- tasks, return_when=asyncio.FIRST_COMPLETED
2295
- )
2296
- if turn_task in done:
2297
- result = await turn_task
2298
- return result, interrupted
2299
- timed_out = timeout_task is not None and timeout_task in done
2300
- stopped = stop_task in done
2301
- if timed_out:
2302
- self.log_line(
2303
- run_id, "error: app-server turn timed out; interrupting app-server"
2304
- )
2305
- self._emit_canonical_event(
2306
- run_id,
2307
- FlowEventType.RUN_TIMEOUT,
2308
- {"context": "app_server_turn", "timeout_seconds": timeout},
2309
- )
2310
- if stopped and not turn_task.done():
2311
- interrupted = True
2312
- self.log_line(run_id, "info: stop requested; interrupting app-server")
2313
- if not turn_task.done():
2314
- try:
2315
- await client.turn_interrupt(
2316
- handle.turn_id, thread_id=handle.thread_id
2317
- )
2318
- except Exception as exc:
2319
- self.log_line(run_id, f"error: app-server interrupt failed: {exc}")
2320
- if interrupted:
2321
- self.kill_running_process()
2322
- raise
2323
- done, _pending = await asyncio.wait(
2324
- {turn_task}, timeout=AUTORUNNER_INTERRUPT_GRACE_SECONDS
2325
- )
2326
- if not done:
2327
- self.log_line(
2328
- run_id,
2329
- "error: app-server interrupt timed out; cleaning up",
2330
- )
2331
- if interrupted:
2332
- self.kill_running_process()
2333
- raise RuntimeError("App-server interrupt timed out")
2334
- if supervisor is not None:
2335
- await supervisor.close_all()
2336
- raise asyncio.TimeoutError()
2337
- result = await turn_task
2338
- if timed_out:
2339
- raise asyncio.TimeoutError()
2340
- return result, interrupted
2341
- finally:
2342
- await self._cancel_task_with_notice(run_id, stop_task, name="stop_task")
2343
- if timeout_task is not None:
2344
- await self._cancel_task_with_notice(
2345
- run_id, timeout_task, name="timeout_task"
2346
- )
2347
-
2348
- async def _run_loop_async(
2349
- self,
2350
- stop_after_runs: Optional[int] = None,
2351
- external_stop_flag: Optional[threading.Event] = None,
2352
- ) -> None:
2353
- state = load_state(self.state_path)
2354
- run_id = (state.last_run_id or 0) + 1
2355
- last_exit_code: Optional[int] = state.last_exit_code
2356
- start_wallclock = time.time()
2357
- target_runs = (
2358
- stop_after_runs
2359
- if stop_after_runs is not None
2360
- else (
2361
- state.runner_stop_after_runs
2362
- if state.runner_stop_after_runs is not None
2363
- else self.config.runner_stop_after_runs
2364
- )
2365
- )
2366
- no_progress_count = 0
2367
- ticket_dir = self.repo_root / ".codex-autorunner" / "tickets"
2368
- initial_tickets = list_ticket_paths(ticket_dir)
2369
- last_done_count = sum(1 for path in initial_tickets if ticket_is_done(path))
2370
- last_outstanding_count = len(initial_tickets) - last_done_count
2371
- exit_reason: Optional[str] = None
2372
-
2373
- try:
2374
- while True:
2375
- if self._should_stop(external_stop_flag):
2376
- self.clear_stop_request()
2377
- self._update_state(
2378
- "idle", run_id - 1, last_exit_code, finished=True
2379
- )
2380
- exit_reason = "stop_requested"
2381
- break
2382
- if self.config.runner_max_wallclock_seconds is not None:
2383
- if (
2384
- time.time() - start_wallclock
2385
- > self.config.runner_max_wallclock_seconds
2386
- ):
2387
- self._update_state(
2388
- "idle", run_id - 1, state.last_exit_code, finished=True
2389
- )
2390
- exit_reason = "max_wallclock_seconds"
2391
- break
2392
-
2393
- if self.todos_done():
2394
- if not self.summary_finalized():
2395
- exit_code = await self._run_final_summary_job(
2396
- run_id, external_stop_flag=external_stop_flag
2397
- )
2398
- last_exit_code = exit_code
2399
- exit_reason = (
2400
- "error_exit" if exit_code != 0 else "todos_complete"
2401
- )
2402
- else:
2403
- current = load_state(self.state_path)
2404
- last_exit_code = current.last_exit_code
2405
- self._update_state(
2406
- "idle", run_id - 1, last_exit_code, finished=True
2407
- )
2408
- exit_reason = "todos_complete"
2409
- break
2410
-
2411
- prev_output = self.extract_prev_output(run_id - 1)
2412
- prompt = self._build_app_server_prompt(prev_output)
2413
-
2414
- exit_code = await self._execute_run_step(
2415
- prompt, run_id, external_stop_flag=external_stop_flag
2416
- )
2417
- last_exit_code = exit_code
2418
-
2419
- if exit_code != 0:
2420
- exit_reason = "error_exit"
2421
- break
2422
-
2423
- # Check for no progress across runs
2424
- current_tickets = list_ticket_paths(ticket_dir)
2425
- current_done_count = sum(
2426
- 1 for path in current_tickets if ticket_is_done(path)
2427
- )
2428
- current_outstanding_count = len(current_tickets) - current_done_count
2429
-
2430
- # Check if there was any meaningful progress
2431
- has_progress = (
2432
- current_outstanding_count != last_outstanding_count
2433
- or current_done_count != last_done_count
2434
- )
2435
-
2436
- # Check if there was any meaningful output (diff, plan, etc.)
2437
- has_output = False
2438
- run_entry = self._run_index_store.get_entry(run_id)
2439
- if run_entry:
2440
- artifacts = run_entry.get("artifacts", {})
2441
- if isinstance(artifacts, dict):
2442
- diff_path = artifacts.get("diff_path")
2443
- if diff_path:
2444
- try:
2445
- diff_content = (
2446
- Path(diff_path).read_text(encoding="utf-8").strip()
2447
- )
2448
- has_output = len(diff_content) > 0
2449
- except (OSError, IOError):
2450
- pass
2451
- if not has_output:
2452
- plan_path = artifacts.get("plan_path")
2453
- if plan_path:
2454
- try:
2455
- plan_content = (
2456
- Path(plan_path)
2457
- .read_text(encoding="utf-8")
2458
- .strip()
2459
- )
2460
- has_output = len(plan_content) > 0
2461
- except (OSError, IOError):
2462
- pass
2463
-
2464
- if not has_progress and not has_output:
2465
- no_progress_count += 1
2466
-
2467
- evidence = {
2468
- "outstanding_count": current_outstanding_count,
2469
- "done_count": current_done_count,
2470
- "has_diff": bool(
2471
- run_entry
2472
- and isinstance(run_entry.get("artifacts"), dict)
2473
- and run_entry["artifacts"].get("diff_path")
2474
- ),
2475
- "has_plan": bool(
2476
- run_entry
2477
- and isinstance(run_entry.get("artifacts"), dict)
2478
- and run_entry["artifacts"].get("plan_path")
2479
- ),
2480
- "run_id": run_id,
2481
- }
2482
- self._emit_event(
2483
- run_id, "run.no_progress", count=no_progress_count, **evidence
2484
- )
2485
- self.log_line(
2486
- run_id,
2487
- f"info: no progress detected ({no_progress_count}/{self.config.runner_no_progress_threshold} runs without progress)",
2488
- )
2489
- if no_progress_count >= self.config.runner_no_progress_threshold:
2490
- self.log_line(
2491
- run_id,
2492
- f"info: stopping after {no_progress_count} consecutive runs with no progress (threshold: {self.config.runner_no_progress_threshold})",
2493
- )
2494
- self._update_state(
2495
- "idle",
2496
- run_id,
2497
- exit_code,
2498
- finished=True,
2499
- )
2500
- exit_reason = "no_progress_threshold"
2501
- break
2502
- else:
2503
- no_progress_count = 0
2504
-
2505
- last_outstanding_count = current_outstanding_count
2506
- last_done_count = current_done_count
2507
-
2508
- # If TODO is now complete, run the final report job once and stop.
2509
- if self.todos_done() and not self.summary_finalized():
2510
- exit_code = await self._run_final_summary_job(
2511
- run_id + 1, external_stop_flag=external_stop_flag
2512
- )
2513
- last_exit_code = exit_code
2514
- exit_reason = "error_exit" if exit_code != 0 else "todos_complete"
2515
- break
2516
-
2517
- if target_runs is not None and run_id >= target_runs:
2518
- exit_reason = "stop_after_runs"
2519
- break
2520
-
2521
- run_id += 1
2522
- if self._should_stop(external_stop_flag):
2523
- self.clear_stop_request()
2524
- self._update_state("idle", run_id - 1, exit_code, finished=True)
2525
- exit_reason = "stop_requested"
2526
- break
2527
- await asyncio.sleep(self.config.runner_sleep_seconds)
2528
- except Exception as exc:
2529
- # Never silently die: persist's reason to agent log and surface in state.
2530
- exit_reason = exit_reason or "error_exit"
2531
- try:
2532
- self.log_line(run_id, f"FATAL: run_loop crashed: {exc!r}")
2533
- tb = traceback.format_exc()
2534
- for line in tb.splitlines():
2535
- self.log_line(run_id, f"traceback: {line}")
2536
- except (OSError, IOError) as exc:
2537
- self._app_server_logger.error(
2538
- "Failed to log run_loop crash for run %s: %s", run_id, exc
2539
- )
2540
- try:
2541
- self._update_state("error", run_id, 1, finished=True)
2542
- except (OSError, IOError) as exc:
2543
- self._app_server_logger.error(
2544
- "Failed to update state after run_loop crash for run %s: %s",
2545
- run_id,
2546
- exc,
2547
- )
2548
- finally:
2549
- try:
2550
- await self._maybe_run_end_review(
2551
- exit_reason=exit_reason or "unknown",
2552
- last_exit_code=last_exit_code,
2553
- )
2554
- except Exception as exc:
2555
- self._app_server_logger.warning(
2556
- "End-of-run review failed for run %s: %s", run_id, exc
2557
- )
2558
- await self._close_app_server_supervisor()
2559
- await self._close_opencode_supervisor()
2560
- await self._close_agent_backends()
2561
- # IMPORTANT: lock ownership is managed by the caller (CLI/Hub/Server runner).
2562
- # Engine.run_loop must never unconditionally mutate the lock file.
2563
-
2564
- async def _maybe_run_end_review(
2565
- self, *, exit_reason: str, last_exit_code: Optional[int]
2566
- ) -> None:
2567
- runner_cfg = self.config.raw.get("runner") or {}
2568
- review_cfg = runner_cfg.get("review")
2569
- if not isinstance(review_cfg, dict) or not review_cfg.get("enabled"):
2570
- return
2571
-
2572
- trigger_cfg = review_cfg.get("trigger") or {}
2573
- reason_key_map = {
2574
- "todos_complete": "on_todos_complete",
2575
- "no_progress_threshold": "on_no_progress_stop",
2576
- "stop_after_runs": "on_max_runs_stop",
2577
- # Share the max-runs trigger for wallclock cutoffs to avoid extra config flags.
2578
- "max_wallclock_seconds": "on_max_runs_stop",
2579
- "stop_requested": "on_stop_requested",
2580
- "error_exit": "on_error_exit",
2581
- }
2582
- trigger_key = reason_key_map.get(exit_reason)
2583
- if not trigger_key or not trigger_cfg.get(trigger_key, False):
2584
- return
2585
-
2586
- state = load_state(self.state_path)
2587
- last_run_id = state.last_run_id
2588
- if last_run_id is None:
2589
- return
2590
-
2591
- top_review_cfg = self.config.raw.get("review") or {}
2592
- agent = review_cfg.get("agent") or top_review_cfg.get("agent") or "opencode"
2593
- model = review_cfg.get("model") or top_review_cfg.get("model")
2594
- reasoning = review_cfg.get("reasoning") or top_review_cfg.get("reasoning")
2595
- max_wallclock_seconds = review_cfg.get("max_wallclock_seconds")
2596
- if max_wallclock_seconds is None:
2597
- max_wallclock_seconds = top_review_cfg.get("max_wallclock_seconds")
2598
-
2599
- context_cfg = review_cfg.get("context") or {}
2600
- primary_docs = context_cfg.get("primary_docs") or ["spec", "progress"]
2601
- include_docs = context_cfg.get("include_docs") or []
2602
- include_last_run_artifacts = bool(
2603
- context_cfg.get("include_last_run_artifacts", True)
2604
- )
2605
- max_doc_chars = context_cfg.get("max_doc_chars", 20000)
2606
- try:
2607
- max_doc_chars = int(max_doc_chars)
2608
- except (TypeError, ValueError):
2609
- max_doc_chars = 20000
2610
-
2611
- context_md = build_spec_progress_review_context(
2612
- self,
2613
- exit_reason=exit_reason,
2614
- last_run_id=last_run_id,
2615
- last_exit_code=last_exit_code,
2616
- max_doc_chars=max_doc_chars,
2617
- primary_docs=primary_docs,
2618
- include_docs=include_docs,
2619
- include_last_run_artifacts=include_last_run_artifacts,
2620
- )
2621
-
2622
- payload: dict[str, Any] = {
2623
- "agent": agent,
2624
- "model": model,
2625
- "reasoning": reasoning,
2626
- "max_wallclock_seconds": max_wallclock_seconds,
2627
- }
2628
- payload = {k: v for k, v in payload.items() if v is not None}
2629
-
2630
- opencode_supervisor: Optional[Any] = None
2631
- app_server_supervisor: Optional[Any] = None
2632
-
2633
- if agent == "codex":
2634
- if not self.config.app_server.command:
2635
- self._app_server_logger.info(
2636
- "Skipping end-of-run review: codex backend not configured"
2637
- )
2638
- return
2639
- app_server_supervisor = self._ensure_app_server_supervisor("review")
2640
- if app_server_supervisor is None:
2641
- self._app_server_logger.info(
2642
- "Skipping end-of-run review: codex supervisor factory unavailable"
2643
- )
2644
- return
2645
- else:
2646
- opencode_supervisor = self._ensure_opencode_supervisor()
2647
- if opencode_supervisor is None:
2648
- self._app_server_logger.info(
2649
- "Skipping end-of-run review: opencode backend not configured"
2650
- )
2651
- return
2652
-
2653
- from ..flows.review import ReviewService
2654
-
2655
- review_service = ReviewService(
2656
- self,
2657
- opencode_supervisor=opencode_supervisor,
2658
- app_server_supervisor=app_server_supervisor,
2659
- logger=self._app_server_logger,
2660
- )
2661
- result_state = await review_service.run_blocking_async(
2662
- payload=payload,
2663
- prompt_kind="spec_progress",
2664
- seed_context_files={"AUTORUNNER_CONTEXT.md": context_md},
2665
- ignore_repo_busy=True,
2666
- )
2667
-
2668
- review_id = result_state.get("id")
2669
- artifacts_cfg = review_cfg.get("artifacts") or {}
2670
- attach = bool(artifacts_cfg.get("attach_to_last_run_index", True))
2671
- if attach:
2672
- artifacts_update: dict[str, str] = {}
2673
- final_report = result_state.get("final_output_path")
2674
- scratch_bundle = result_state.get("scratchpad_bundle_path")
2675
- if isinstance(final_report, str) and final_report:
2676
- artifacts_update["final_review_report_path"] = final_report
2677
- if isinstance(scratch_bundle, str) and scratch_bundle:
2678
- artifacts_update["final_review_scratchpad_bundle_path"] = scratch_bundle
2679
- if artifacts_update:
2680
- self._merge_run_index_entry(
2681
- last_run_id,
2682
- {"artifacts": artifacts_update},
2683
- )
2684
- if review_id:
2685
- self.log_line(
2686
- last_run_id,
2687
- f"info: end-of-run review completed (review_id={review_id})",
2688
- )
2689
-
2690
- def run_loop(
2691
- self,
2692
- stop_after_runs: Optional[int] = None,
2693
- external_stop_flag: Optional[threading.Event] = None,
2694
- ) -> None:
2695
- try:
2696
- asyncio.run(self._run_loop_async(stop_after_runs, external_stop_flag))
2697
- except RuntimeError as exc:
2698
- if "asyncio.run" in str(exc):
2699
- raise
2700
- raise
2701
-
2702
    def run_once(self) -> None:
        """Execute exactly one run by invoking the loop with a single-run cap."""
        self.run_loop(stop_after_runs=1)
2704
-
2705
    def _update_state(
        self,
        status: str,
        run_id: int,
        exit_code: Optional[int],
        *,
        started: bool = False,
        finished: bool = False,
    ) -> None:
        """Persist a new runner state under the state-file lock and emit a
        `run.state_changed` event when the status actually changed.

        Args:
            status: new status string to record.
            run_id: id of the run the state transition belongs to.
            exit_code: last exit code, if known.
            started: when True, stamp `last_run_started_at` with the current
                time, clear the finished timestamp, and record this PID.
            finished: when True, stamp `last_run_finished_at` and clear the
                recorded runner PID.
        """
        # Hoisted so the values survive past the lock scope for event payload
        # construction below.
        prev_status: Optional[str] = None
        last_run_started_at: Optional[str] = None
        last_run_finished_at: Optional[str] = None
        with state_lock(self.state_path):
            # Read-modify-write under the lock to avoid racing other writers.
            current = load_state(self.state_path)
            prev_status = current.status
            last_run_started_at = current.last_run_started_at
            last_run_finished_at = current.last_run_finished_at
            runner_pid = current.runner_pid
            if started:
                last_run_started_at = now_iso()
                last_run_finished_at = None
                runner_pid = os.getpid()
            if finished:
                last_run_finished_at = now_iso()
                runner_pid = None
            # All override/session fields are carried over unchanged; only the
            # run/status/timestamp/pid fields are rewritten here.
            new_state = RunnerState(
                last_run_id=run_id,
                status=status,
                last_exit_code=exit_code,
                last_run_started_at=last_run_started_at,
                last_run_finished_at=last_run_finished_at,
                autorunner_agent_override=current.autorunner_agent_override,
                autorunner_model_override=current.autorunner_model_override,
                autorunner_effort_override=current.autorunner_effort_override,
                autorunner_approval_policy=current.autorunner_approval_policy,
                autorunner_sandbox_mode=current.autorunner_sandbox_mode,
                autorunner_workspace_write_network=current.autorunner_workspace_write_network,
                runner_pid=runner_pid,
                sessions=current.sessions,
                repo_to_session=current.repo_to_session,
            )
            save_state(self.state_path, new_state)
        # Emit outside the lock; only for real runs (run_id > 0) and only when
        # the status actually transitioned.
        if run_id > 0 and prev_status != status:
            payload: dict[str, Any] = {
                "from_status": prev_status,
                "to_status": status,
            }
            if exit_code is not None:
                payload["exit_code"] = exit_code
            if started and last_run_started_at:
                payload["started_at"] = last_run_started_at
            if finished and last_run_finished_at:
                payload["finished_at"] = last_run_finished_at
            self._emit_event(run_id, "run.state_changed", **payload)
2759
-
2760
-
2761
def clear_stale_lock(lock_path: Path) -> bool:
    """Delete *lock_path* if its lock assessment says it is safe to free.

    Returns:
        True when the lock file was removed, False when it was left in place.
    """
    verdict = assess_lock(
        lock_path,
        expected_cmd_substrings=DEFAULT_RUNNER_CMD_HINTS,
    )
    if not verdict.freeable:
        return False
    # missing_ok guards against a concurrent unlink between assess and delete.
    lock_path.unlink(missing_ok=True)
    return True
2770
-
2771
-
2772
- def _strip_log_prefixes(text: str) -> str:
2773
- """Strip log prefixes and clip to content after token-usage marker if present."""
2774
- lines = text.splitlines()
2775
- cleaned_lines = []
2776
- token_marker_idx = None
2777
- for idx, line in enumerate(lines):
2778
- if "stdout: tokens used" in line:
2779
- token_marker_idx = idx
2780
- break
2781
- if token_marker_idx is not None:
2782
- lines = lines[token_marker_idx + 1 :]
2783
-
2784
- for line in lines:
2785
- if "] run=" in line and "stdout:" in line:
2786
- try:
2787
- _, remainder = line.split("stdout:", 1)
2788
- cleaned_lines.append(remainder.strip())
2789
- continue
2790
- except ValueError:
2791
- pass
2792
- cleaned_lines.append(line)
2793
- return "\n".join(cleaned_lines).strip()
2794
-
2795
-
2796
- def _read_tail_text(path: Path, *, max_bytes: int) -> str:
2797
- """
2798
- Read at most last `max_bytes` bytes from a UTF-8-ish text file.
2799
- Returns decoded text with errors replaced.
2800
- """
2801
- logger = logging.getLogger("codex_autorunner.engine")
2802
- try:
2803
- size = path.stat().st_size
2804
- except OSError as exc:
2805
- logger.debug("Failed to stat log file for tail read: %s", exc)
2806
- return ""
2807
- if size <= 0:
2808
- return ""
2809
- try:
2810
- with path.open("rb") as f:
2811
- if size > max_bytes:
2812
- f.seek(-max_bytes, os.SEEK_END)
2813
- data = f.read()
2814
- return data.decode("utf-8", errors="replace")
2815
- except (FileNotFoundError, OSError, IOError) as exc:
2816
- logger.debug("Failed to read tail of log file: %s", exc)
2817
- return ""
2818
- if size <= 0:
2819
- return ""
2820
- try:
2821
- with path.open("rb") as f:
2822
- if size > max_bytes:
2823
- f.seek(-max_bytes, os.SEEK_END)
2824
- data = f.read()
2825
- return data.decode("utf-8", errors="replace")
2826
- except Exception:
2827
- return ""
2828
-
2829
-
2830
@dataclasses.dataclass(frozen=True)
class DoctorCheck:
    """One doctor diagnostic: an id, a status, a message, and an optional fix hint."""

    check_id: str
    status: str
    message: str
    fix: Optional[str] = None

    def to_dict(self) -> dict:
        """Serialize this check to a plain dict, omitting ``fix`` when falsy."""
        base = dict(id=self.check_id, status=self.status, message=self.message)
        return {**base, "fix": self.fix} if self.fix else base
2846
-
2847
-
2848
@dataclasses.dataclass(frozen=True)
class DoctorReport:
    """Aggregate of doctor checks with summary serialization."""

    checks: list[DoctorCheck]

    def has_errors(self) -> bool:
        """Return True when at least one check carries status "error"."""
        for check in self.checks:
            if check.status == "error":
                return True
        return False

    def to_dict(self) -> dict:
        """Summarize per-status counts plus each check's dict form.

        Statuses other than ok/warning/error are not counted, matching the
        per-status sums this replaces.
        """
        tally = {"ok": 0, "warning": 0, "error": 0}
        for check in self.checks:
            if check.status in tally:
                tally[check.status] += 1
        return {
            "ok": tally["ok"],
            "warnings": tally["warning"],
            "errors": tally["error"],
            "checks": [check.to_dict() for check in self.checks],
        }
2862
-
2863
-
2864
def _append_check(
    checks: list[DoctorCheck],
    check_id: str,
    status: str,
    message: str,
    fix: Optional[str] = None,
) -> None:
    """Build a DoctorCheck from the given fields and append it to *checks*."""
    entry = DoctorCheck(
        check_id=check_id,
        status=status,
        message=message,
        fix=fix,
    )
    checks.append(entry)
2874
-
2875
-
2876
def _parse_manifest_version(manifest_path: Path) -> Optional[int]:
    """Return the integer ``version`` field of a YAML manifest, or None.

    None is returned when the file is unreadable, the YAML is invalid, the
    top level is not a mapping, or ``version`` is not an int.
    """
    logger = logging.getLogger("codex_autorunner.engine")
    try:
        parsed = yaml.safe_load(manifest_path.read_text(encoding="utf-8")) or {}
    except (FileNotFoundError, OSError, yaml.YAMLError) as exc:
        logger.debug("Failed to parse manifest version: %s", exc)
        return None
    if not isinstance(parsed, dict):
        return None
    value = parsed.get("version")
    if isinstance(value, int):
        return int(value)
    return None
2887
-
2888
-
2889
def _manifest_has_worktrees(manifest_path: Path) -> bool:
    """Return True if the hub manifest lists at least one repo of kind "worktree".

    Any parse/read failure, non-mapping top level, or non-list ``repos``
    entry yields False.
    """
    logger = logging.getLogger("codex_autorunner.engine")
    try:
        parsed = yaml.safe_load(manifest_path.read_text(encoding="utf-8")) or {}
    except (FileNotFoundError, OSError, yaml.YAMLError) as exc:
        logger.debug("Failed to parse manifest for worktrees: %s", exc)
        return False
    if not isinstance(parsed, dict):
        return False
    entries = parsed.get("repos")
    if not isinstance(entries, list):
        return False
    return any(
        isinstance(item, dict) and item.get("kind") == "worktree"
        for item in entries
    )
2905
-
2906
-
2907
def _append_repo_check(
    checks: list[DoctorCheck],
    prefix: str,
    check_id: str,
    status: str,
    message: str,
    fix: Optional[str] = None,
) -> None:
    """Append a check whose id is namespaced under *prefix* when non-empty."""
    qualified_id = f"{prefix}.{check_id}" if prefix else check_id
    _append_check(checks, qualified_id, status, message, fix)
2917
-
2918
-
2919
def _load_isolated_repo_config(repo_root: Path) -> RepoConfig:
    """Build a RepoConfig for a repo outside any hub.

    Merges any on-disk config file over the built-in defaults, forces repo
    mode and a config version, validates, and constructs the RepoConfig.
    """
    config_path = repo_root / CONFIG_FILENAME
    if config_path.exists():
        overrides = _load_yaml_dict(config_path)
    else:
        overrides = {}
    merged = _merge_defaults(DEFAULT_REPO_CONFIG, overrides or {})
    merged["mode"] = "repo"
    merged["version"] = merged.get("version") or CONFIG_VERSION
    _validate_repo_config(merged, root=repo_root)
    return _build_repo_config(config_path, merged)
2927
-
2928
-
2929
def _repo_checks(
    repo_config: RepoConfig,
    global_state_root: Path,
    prefix: str = "",
) -> list[DoctorCheck]:
    """Produce the doctor checks for a single repo.

    Args:
        repo_config: resolved configuration of the repo under inspection.
        global_state_root: global state directory, reported alongside the
            repo-local state root.
        prefix: optional check-id namespace (used when iterating hub repos).

    Returns:
        The list of DoctorCheck entries for this repo.
    """
    checks: list[DoctorCheck] = []
    # Informational: always report where repo-local and global state live.
    repo_state_root = resolve_repo_state_root(repo_config.root)
    _append_repo_check(
        checks,
        prefix,
        "state.roots",
        "ok",
        f"Repo state root: {repo_state_root}; Global state root: {global_state_root}",
    )

    # Configured doc files: missing ones are only a warning, not an error.
    missing = []
    configured_docs = repo_config.docs or {}
    for key in configured_docs:
        path = repo_config.doc_path(key)
        if not path.exists():
            missing.append(path)
    if missing:
        names = ", ".join(str(p) for p in missing)
        _append_repo_check(
            checks,
            prefix,
            "docs.required",
            "warning",
            f"Configured doc files are missing: {names}",
            "Create the missing files (workspace docs are optional but recommended).",
        )
    else:
        _append_repo_check(
            checks,
            prefix,
            "docs.required",
            "ok",
            "Configured doc files are present.",
        )

    # The codex binary is required; failure to resolve it is an error.
    if ensure_executable(repo_config.codex_binary):
        _append_repo_check(
            checks,
            prefix,
            "codex.binary",
            "ok",
            f"Codex binary resolved: {repo_config.codex_binary}",
        )
    else:
        _append_repo_check(
            checks,
            prefix,
            "codex.binary",
            "error",
            f"Codex binary not found in PATH: {repo_config.codex_binary}",
            "Install Codex or set codex.binary to a full path.",
        )

    # Voice deps are checked only when voice is enabled (default: enabled).
    # When voice is disabled, no voice check is emitted at all.
    voice_enabled = bool(repo_config.voice.get("enabled", True))
    if voice_enabled:
        missing_voice = missing_optional_dependencies(
            (
                ("httpx", "httpx"),
                # Either import name maps to the python-multipart distribution.
                (("multipart", "python_multipart"), "python-multipart"),
            )
        )
        if missing_voice:
            deps_list = ", ".join(missing_voice)
            _append_repo_check(
                checks,
                prefix,
                "voice.dependencies",
                "error",
                f"Voice is enabled but missing optional deps: {deps_list}",
                "Install with `pip install codex-autorunner[voice]`.",
            )
        else:
            _append_repo_check(
                checks,
                prefix,
                "voice.dependencies",
                "ok",
                "Voice dependencies are installed.",
            )

    # .env files are optional; absence is a warning with a suggested fix.
    env_candidates = [
        repo_config.root / ".env",
        repo_config.root / ".codex-autorunner" / ".env",
    ]
    env_found = [str(path) for path in env_candidates if path.exists()]
    if env_found:
        _append_repo_check(
            checks,
            prefix,
            "dotenv.locations",
            "ok",
            f"Found .env files: {', '.join(env_found)}",
        )
    else:
        _append_repo_check(
            checks,
            prefix,
            "dotenv.locations",
            "warning",
            "No .env files found in repo root or .codex-autorunner/.env.",
            "Create one of these files if you rely on env vars.",
        )

    # Server auth: binding a non-loopback host without an auth-token env var
    # configured is an error; configured-but-unset is only a warning.
    # Loopback hosts skip the auth check entirely.
    host = str(repo_config.server_host or "")
    if not _is_loopback_host(host):
        if not repo_config.server_auth_token_env:
            _append_repo_check(
                checks,
                prefix,
                "server.auth",
                "error",
                f"Non-loopback host {host} requires server.auth_token_env.",
                "Set server.auth_token_env or bind to 127.0.0.1.",
            )
        else:
            token_val = os.environ.get(repo_config.server_auth_token_env)
            if not token_val:
                _append_repo_check(
                    checks,
                    prefix,
                    "server.auth",
                    "warning",
                    f"Auth token env var {repo_config.server_auth_token_env} is not set.",
                    "Export the env var or add it to .env.",
                )
            else:
                _append_repo_check(
                    checks,
                    prefix,
                    "server.auth",
                    "ok",
                    "Server auth token env var is set for non-loopback host.",
                )

    return checks
3069
-
3070
-
3071
- def _iter_hub_repos(hub_config) -> list[tuple[str, Path]]:
3072
- repos: list[tuple[str, Path]] = []
3073
- if hub_config.manifest_path.exists():
3074
- try:
3075
- raw = yaml.safe_load(hub_config.manifest_path.read_text(encoding="utf-8"))
3076
- except (OSError, yaml.YAMLError):
3077
- raw = None
3078
- if isinstance(raw, dict):
3079
- entries = raw.get("repos")
3080
- if isinstance(entries, list):
3081
- for entry in entries:
3082
- if not isinstance(entry, dict):
3083
- continue
3084
- if not entry.get("enabled", True):
3085
- continue
3086
- path_val = entry.get("path")
3087
- if not isinstance(path_val, str):
3088
- continue
3089
- repo_id = str(entry.get("id") or path_val)
3090
- repos.append((repo_id, (hub_config.root / path_val).resolve()))
3091
- if not repos and hub_config.repos_root.exists():
3092
- for child in hub_config.repos_root.iterdir():
3093
- if child.is_dir():
3094
- repos.append((child.name, child.resolve()))
3095
- return repos
3096
-
3097
-
3098
def doctor(start_path: Path) -> DoctorReport:
    """Run environment and configuration diagnostics for a hub or a repo.

    Resolves hub and/or repo configuration starting from *start_path*, then
    accumulates DoctorCheck entries covering state roots, manifest health,
    repos_root, git availability for worktrees, dotenv files, server auth,
    and per-repo checks.

    Raises:
        ConfigError: when neither a hub nor a repo configuration is found.
    """
    checks: list[DoctorCheck] = []
    # Hub config is optional; a ConfigError simply means "not a hub".
    hub_config = None
    try:
        hub_config = load_hub_config(start_path)
    except ConfigError:
        hub_config = None

    # Repo root is likewise optional at this point.
    repo_root: Optional[Path] = None
    try:
        repo_root = find_repo_root(start_path)
    except RepoNotFoundError:
        repo_root = None

    # Derive a repo config: prefer hub-derived, then standalone repo config,
    # finally an isolated defaults-based config as a last resort.
    repo_config: Optional[RepoConfig] = None
    if hub_config is not None and repo_root is not None:
        try:
            repo_config = derive_repo_config(hub_config, repo_root)
        except ConfigError:
            repo_config = None
    elif hub_config is None and repo_root is not None:
        try:
            repo_config = load_repo_config(start_path)
        except ConfigError:
            repo_config = _load_isolated_repo_config(repo_root)

    # Report the resolved state roots; hub context takes precedence. With
    # neither config available there is nothing meaningful to check.
    if hub_config is not None:
        global_state_root = resolve_global_state_root(config=hub_config)
        _append_check(
            checks,
            "state.roots",
            "ok",
            f"Hub root: {hub_config.root}; Global state root: {global_state_root}",
        )
    elif repo_config is not None:
        global_state_root = resolve_global_state_root(config=repo_config)
        _append_check(
            checks,
            "state.roots",
            "ok",
            f"Repo state root: {resolve_repo_state_root(repo_config.root)}; Global state root: {global_state_root}",
        )
    else:
        raise ConfigError("No hub or repo configuration found for doctor check.")

    if hub_config is not None:
        # --- Hub manifest: existence and version compatibility ---
        if hub_config.manifest_path.exists():
            version = _parse_manifest_version(hub_config.manifest_path)
            if version is None:
                _append_check(
                    checks,
                    "hub.manifest.version",
                    "error",
                    f"Failed to read manifest version from {hub_config.manifest_path}.",
                    "Fix the manifest YAML or regenerate it with `car hub scan`.",
                )
            elif version != MANIFEST_VERSION:
                _append_check(
                    checks,
                    "hub.manifest.version",
                    "error",
                    f"Hub manifest version {version} unsupported (expected {MANIFEST_VERSION}).",
                    "Regenerate the manifest (delete it and run `car hub scan`).",
                )
            else:
                _append_check(
                    checks,
                    "hub.manifest.version",
                    "ok",
                    f"Hub manifest version {version} is supported.",
                )
        else:
            _append_check(
                checks,
                "hub.manifest.exists",
                "warning",
                f"Hub manifest missing at {hub_config.manifest_path}.",
                "Run `car hub scan` or `car hub create` to generate it.",
            )

        # --- repos_root must exist and be a directory ---
        if not hub_config.repos_root.exists():
            _append_check(
                checks,
                "hub.repos_root",
                "error",
                f"Hub repos_root does not exist: {hub_config.repos_root}",
                "Create the directory or update hub.repos_root in config.",
            )
        elif not hub_config.repos_root.is_dir():
            _append_check(
                checks,
                "hub.repos_root",
                "error",
                f"Hub repos_root is not a directory: {hub_config.repos_root}",
                "Point hub.repos_root at a directory.",
            )
        else:
            _append_check(
                checks,
                "hub.repos_root",
                "ok",
                f"Hub repos_root exists: {hub_config.repos_root}",
            )

        # --- git availability, checked only when worktrees appear in use ---
        manifest_has_worktrees = (
            hub_config.manifest_path.exists()
            and _manifest_has_worktrees(hub_config.manifest_path)
        )
        worktrees_enabled = hub_config.worktrees_root.exists() or manifest_has_worktrees
        if worktrees_enabled:
            if ensure_executable("git"):
                _append_check(
                    checks,
                    "hub.git",
                    "ok",
                    "git is available for hub worktrees.",
                )
            else:
                _append_check(
                    checks,
                    "hub.git",
                    "error",
                    "git is not available but hub worktrees are enabled.",
                    "Install git or disable worktrees.",
                )

        # --- .env files are optional; absence is only a warning ---
        env_candidates = [
            hub_config.root / ".env",
            hub_config.root / ".codex-autorunner" / ".env",
        ]
        env_found = [str(path) for path in env_candidates if path.exists()]
        if env_found:
            _append_check(
                checks,
                "dotenv.locations",
                "ok",
                f"Found .env files: {', '.join(env_found)}",
            )
        else:
            _append_check(
                checks,
                "dotenv.locations",
                "warning",
                "No .env files found in repo root or .codex-autorunner/.env.",
                "Create one of these files if you rely on env vars.",
            )

        # --- server auth: non-loopback binding requires a token env var ---
        host = str(hub_config.server_host or "")
        if not _is_loopback_host(host):
            if not hub_config.server_auth_token_env:
                _append_check(
                    checks,
                    "server.auth",
                    "error",
                    f"Non-loopback host {host} requires server.auth_token_env.",
                    "Set server.auth_token_env or bind to 127.0.0.1.",
                )
            else:
                token_val = os.environ.get(hub_config.server_auth_token_env)
                if not token_val:
                    _append_check(
                        checks,
                        "server.auth",
                        "warning",
                        f"Auth token env var {hub_config.server_auth_token_env} is not set.",
                        "Export the env var or add it to .env.",
                    )
                else:
                    _append_check(
                        checks,
                        "server.auth",
                        "ok",
                        "Server auth token env var is set for non-loopback host.",
                    )

        # --- per-repo checks for every repo the hub knows about ---
        for repo_id, repo_path in _iter_hub_repos(hub_config):
            prefix = f"repo[{repo_id}]"
            if not repo_path.exists():
                _append_repo_check(
                    checks,
                    prefix,
                    "state.roots",
                    "error",
                    f"Repo path not found: {repo_path}",
                    "Clone or initialize the repo, or update the hub manifest.",
                )
                continue
            try:
                repo_cfg = derive_repo_config(hub_config, repo_path)
            except ConfigError as exc:
                _append_repo_check(
                    checks,
                    prefix,
                    "config",
                    "error",
                    f"Failed to derive repo config: {exc}",
                )
                continue
            checks.extend(_repo_checks(repo_cfg, global_state_root, prefix=prefix))

    else:
        # Single-repo mode: the earlier branch guarantees repo_config here.
        assert repo_config is not None
        checks.extend(_repo_checks(repo_config, global_state_root))

    return DoctorReport(checks=checks)