agent-control-plane 0.1.8 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -45,7 +45,7 @@ function renderResult(row) {
45
45
 
46
46
  function renderControllerState(row) {
47
47
  const state = row.state || "n/a";
48
- const stale = state !== "stopped" && row.controller_live === false;
48
+ const stale = row.controller_stale === true || (state !== "stopped" && row.controller_live === false);
49
49
  const label = stale ? `${state} (stale)` : state;
50
50
  return `<span class="status-pill ${statusClass(stale ? "stale" : state)}">${label}</span>`;
51
51
  }
@@ -123,7 +123,9 @@ function renderProfile(profile) {
123
123
  ["Reported", profile.counts.reported_runs],
124
124
  ["Blocked", profile.counts.blocked_runs],
125
125
  ["Live controllers", profile.counts.live_resident_controllers],
126
+ ["Stale controllers", profile.counts.stale_resident_controllers],
126
127
  ["Provider cooldowns", profile.counts.provider_cooldowns],
128
+ ["Issue retries", profile.counts.active_retries],
127
129
  ["Queued issues", profile.counts.queued_issues],
128
130
  ["Scheduled", profile.counts.scheduled_issues],
129
131
  ]
@@ -165,6 +167,18 @@ function renderProfile(profile) {
165
167
  "No resident controllers recorded for this profile.",
166
168
  );
167
169
 
170
+ const retryTable = renderTable(
171
+ [
172
+ { label: "Issue", key: "issue_id" },
173
+ { label: "Status", render: (row) => `<span class="status-pill ${row.ready ? "" : "waiting-provider"}">${row.ready ? "ready" : "retrying"}</span>` },
174
+ { label: "Reason", render: (row) => row.last_reason || "n/a" },
175
+ { label: "Attempts", key: "attempts" },
176
+ { label: "Next attempt", render: (row) => row.next_attempt_at ? `${relativeTime(row.next_attempt_at)}<div class="muted">${row.next_attempt_at}</div>` : "n/a" },
177
+ ],
178
+ profile.issue_retries || [],
179
+ "No issue retries recorded.",
180
+ );
181
+
168
182
  const workerTable = renderTable(
169
183
  [
170
184
  { label: "Key", render: (row) => `<div class="mono">${row.key}</div>` },
@@ -233,9 +247,13 @@ function renderProfile(profile) {
233
247
  </section>
234
248
  <section class="panel">
235
249
  <h3>Resident Controllers</h3>
236
- <p class="panel-subtitle">Includes provider wait and failover telemetry.</p>
250
+ <p class="panel-subtitle">Includes provider wait and failover telemetry. Stale controllers show a warning.</p>
237
251
  ${controllerTable}
238
252
  </section>
253
+ <section class="panel half">
254
+ <h3>Issue Retries</h3>
255
+ ${retryTable}
256
+ </section>
239
257
  <section class="panel">
240
258
  <h3>Resident Worker Metadata</h3>
241
259
  ${workerTable}
@@ -225,6 +225,20 @@ def collect_runs(runs_root: Path) -> list[dict[str, Any]]:
225
225
  return runs
226
226
 
227
227
 
228
def controller_is_stale(
    env: dict[str, str],
    controller_path: Path,
    *,
    max_age_seconds: float = 600,
) -> bool:
    """Return whether a controller that claims to be active looks abandoned.

    A controller whose ``CONTROLLER_STATE`` is ``"stopped"`` or missing/empty
    is never stale. Otherwise it is stale when ``pid_alive`` rejects its
    ``CONTROLLER_PID``, or when the modification time of ``controller_path``
    is more than ``max_age_seconds`` in the past.

    Args:
        env: Key/value pairs read from the controller's state file.
        controller_path: File whose mtime is used as the last-activity time.
        max_age_seconds: Age threshold in seconds; defaults to 600
            (10 minutes).

    Returns:
        ``True`` if the controller is considered stale, ``False`` otherwise,
        including when ``controller_path`` cannot be stat'ed.
    """
    if env.get("CONTROLLER_STATE", "") in {"stopped", ""}:
        return False
    if not pid_alive(env.get("CONTROLLER_PID", "")):
        return True
    try:
        last_modified = controller_path.stat().st_mtime
    except OSError:
        # Best effort: an unreadable or vanished state file gives no age to
        # judge by, so do not flag the controller on that basis alone.
        return False
    file_age = datetime.now(timezone.utc).timestamp() - last_modified
    return file_age > max_age_seconds
240
+
241
+
228
242
  def collect_resident_controllers(state_root: Path) -> list[dict[str, Any]]:
229
243
  controllers_root = state_root / "resident-workers" / "issues"
230
244
  if not controllers_root.is_dir():
@@ -240,6 +254,7 @@ def collect_resident_controllers(state_root: Path) -> list[dict[str, Any]]:
240
254
  "session": env.get("SESSION", ""),
241
255
  "controller_pid": controller_pid,
242
256
  "controller_live": pid_alive(controller_pid),
257
+ "controller_stale": controller_is_stale(env, path),
243
258
  "mode": env.get("CONTROLLER_MODE", ""),
244
259
  "loop_count": safe_int(env.get("CONTROLLER_LOOP_COUNT")),
245
260
  "state": env.get("CONTROLLER_STATE", ""),
@@ -346,6 +361,32 @@ def collect_scheduled_issues(state_root: Path) -> list[dict[str, Any]]:
346
361
  return items
347
362
 
348
363
 
364
def collect_issue_retries(state_root: Path) -> list[dict[str, Any]]:
    """Collect retry/backoff state for issues tracked by agent-project-retry-state.

    Reads every ``*.env`` file under ``<state_root>/retries/issues`` and
    returns one record per file, most recently modified first.

    Args:
        state_root: Root directory of the profile's state tree.

    Returns:
        A list of dicts with the keys ``issue_id`` (file stem), ``attempts``,
        ``next_attempt_epoch``, ``next_attempt_at``, ``last_reason``,
        ``updated_at``, ``ready`` and ``state_file``. The list is empty when
        the retries directory does not exist.
    """
    retries_root = state_root / "retries" / "issues"
    if not retries_root.is_dir():
        return []

    # Stat each file once, up front. A file removed between glob() and stat()
    # is skipped rather than raising from inside the sort key and aborting
    # the whole snapshot.
    dated_paths: list[tuple[float, Path]] = []
    for path in retries_root.glob("*.env"):
        try:
            dated_paths.append((path.stat().st_mtime, path))
        except OSError:
            continue
    dated_paths.sort(key=lambda pair: pair[0], reverse=True)

    now_epoch = int(datetime.now(timezone.utc).timestamp())
    items: list[dict[str, Any]] = []
    for _, path in dated_paths:
        env = read_env_file(path)
        next_attempt_epoch = safe_int(env.get("NEXT_ATTEMPT_EPOCH"))
        # An issue is still backing off only if it has a next-attempt time
        # that lies in the future; a missing or zero epoch counts as ready.
        waiting = bool(next_attempt_epoch and next_attempt_epoch > now_epoch)
        items.append(
            {
                "issue_id": path.stem,
                "attempts": safe_int(env.get("ATTEMPTS")) or 0,
                "next_attempt_epoch": next_attempt_epoch,
                "next_attempt_at": env.get("NEXT_ATTEMPT_AT", ""),
                "last_reason": env.get("LAST_REASON", ""),
                "updated_at": env.get("UPDATED_AT", "") or file_mtime_iso(path),
                "ready": not waiting,
                "state_file": str(path),
            }
        )
    return items
388
+
389
+
349
390
  def collect_issue_queue(state_root: Path) -> dict[str, list[dict[str, Any]]]:
350
391
  queue_root = state_root / "resident-workers" / "issue-queue"
351
392
  pending_root = queue_root / "pending"
@@ -387,6 +428,7 @@ def build_profile_snapshot(profile_id: str, registry_root: Path) -> dict[str, An
387
428
  resident_workers = collect_resident_workers(state_root)
388
429
  cooldowns = collect_provider_cooldowns(state_root)
389
430
  scheduled = collect_scheduled_issues(state_root)
431
+ retries = collect_issue_retries(state_root)
390
432
  queue = collect_issue_queue(state_root)
391
433
 
392
434
  return {
@@ -422,10 +464,12 @@ def build_profile_snapshot(profile_id: str, registry_root: Path) -> dict[str, An
422
464
  ),
423
465
  "resident_controllers": len(controllers),
424
466
  "live_resident_controllers": sum(1 for item in controllers if item["state"] != "stopped" and item["controller_live"]),
467
+ "stale_resident_controllers": sum(1 for item in controllers if item.get("controller_stale", False)),
425
468
  "resident_workers": len(resident_workers),
426
469
  "queued_issues": len(queue["pending"]),
427
470
  "claimed_issues": len(queue["claims"]),
428
471
  "provider_cooldowns": sum(1 for item in cooldowns if item["active"]),
472
+ "active_retries": sum(1 for item in retries if not item.get("ready", True)),
429
473
  "scheduled_issues": len(scheduled),
430
474
  },
431
475
  "runs": runs,
@@ -433,6 +477,7 @@ def build_profile_snapshot(profile_id: str, registry_root: Path) -> dict[str, An
433
477
  "resident_workers": resident_workers,
434
478
  "provider_cooldowns": cooldowns,
435
479
  "scheduled_issues": scheduled,
480
+ "issue_retries": retries,
436
481
  "issue_queue": queue,
437
482
  }
438
483