npm - @kbediako/codex-orchestrator - Versions diffs - 0.1.16 → 0.1.18 - Mend

@kbediako/codex-orchestrator 0.1.16 → 0.1.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19)

package/README.md +1 -1
package/dist/bin/codex-orchestrator.js +119 -2
package/dist/orchestrator/src/cli/exec/experience.js +20 -2
package/dist/orchestrator/src/cli/exec/tfgrpo.js +13 -1
package/dist/orchestrator/src/cli/orchestrator.js +120 -2
package/dist/orchestrator/src/cli/rlm/symbolic.js +494 -38
package/dist/orchestrator/src/cli/rlmRunner.js +248 -15
package/dist/orchestrator/src/cli/run/manifest.js +200 -4
package/dist/orchestrator/src/cli/services/pipelineExperience.js +122 -0
package/dist/orchestrator/src/cloud/CodexCloudTaskExecutor.js +34 -2
package/docs/README.md +4 -1
package/package.json +2 -1
package/skills/collab-deliberation/SKILL.md +72 -8
package/skills/delegate-early/SKILL.md +13 -41
package/skills/delegation-usage/DELEGATION_GUIDE.md +18 -4
package/skills/delegation-usage/SKILL.md +21 -6
package/skills/docs-first/SKILL.md +1 -0
package/skills/standalone-review/SKILL.md +13 -1
package/templates/codex/AGENTS.md +30 -1

package/dist/orchestrator/src/cli/services/pipelineExperience.js ADDED Viewed

@@ -0,0 +1,122 @@
+import { relativeToRepo } from '../run/runPaths.js';
+import { ExperienceStore } from '../../persistence/ExperienceStore.js';
+import { loadInstructionSet } from '../../../../packages/orchestrator/src/instructions/loader.js';
+import { logger } from '../../logger.js';
+const SUCCESS_REWARD = 1; // Written as reward.gtScore on every persisted pipeline experience record.
+const COST_PER_TOKEN_USD = 0.000002; // Multiplied by the summary word count to fill toolStats[].costUsd.
+export async function persistPipelineExperience(params) { // Records one experience entry for a succeeded pipeline run; errors raised inside the try block are logged via logger.warn instead of propagating.
+    const { env, pipeline, manifest, paths } = params;
+    if (manifest.status !== 'succeeded') { // Only succeeded runs are recorded.
+        return;
+    }
+    try {
+        const instructions = await loadInstructionSet(env.repoRoot);
+        const promptPacks = instructions.promptPacks.filter((pack) => pack.experienceSlots > 0); // Keep only packs with a positive experienceSlots value.
+        if (promptPacks.length === 0) {
+            return;
+        }
+        const domain = resolveExperienceDomain(pipeline, promptPacks);
+        if (!domain) {
+            return;
+        }
+        const selectedPack = promptPacks.find((pack) => pack.domain === domain); // Exact (untrimmed) domain match; its stamp is stored on the record below.
+        if (!selectedPack) {
+            return;
+        }
+        const summary = summarizePipelineOutcome(manifest);
+        if (!summary) {
+            return;
+        }
+        const tokenCount = Math.max(1, countWords(summary)); // Word count of the summary, floored at 1, stands in for a token count.
+        const durationMs = resolveDurationMs(manifest);
+        const record = {
+            runId: manifest.run_id,
+            taskId: manifest.task_id,
+            epoch: null,
+            groupId: null,
+            summary,
+            reward: { gtScore: SUCCESS_REWARD, relativeRank: 0 }, // relativeRank is always written as 0 here.
+            toolStats: [
+                {
+                    tool: `pipeline:${pipeline.id}`,
+                    tokens: tokenCount,
+                    latencyMs: durationMs,
+                    costUsd: roundCurrency(tokenCount * COST_PER_TOKEN_USD)
+                }
+            ],
+            stampSignature: selectedPack.stamp,
+            domain
+        };
+        const store = new ExperienceStore({
+            outDir: env.outRoot,
+            runsDir: env.runsRoot,
+            maxSummaryWords: instructions.experienceMaxWords
+        });
+        await store.recordBatch([record], relativeToRepo(env, paths.manifestPath));
+    }
+    catch (error) { // Best-effort: log a warning and return rather than rethrow.
+        logger.warn(`Failed to persist pipeline experience for run ${manifest.run_id}: ${error?.message ?? String(error)}`);
+    }
+}
+export function resolveExperienceDomain(pipeline, promptPacks) { // Picks a domain: first one found in the pipeline's id/title/tags text, else 'implementation' if present, else the first domain; null when there are none.
+    const domains = uniqueDomains(promptPacks);
+    if (domains.length === 0) {
+        return null;
+    }
+    const haystack = normalizeSummary(`${pipeline.id} ${pipeline.title} ${(pipeline.tags ?? []).join(' ')}`).toLowerCase(); // NOTE(review): a missing title is interpolated as the text "undefined" — confirm title is always set.
+    const directMatch = domains.find((domain) => haystack.includes(domain.toLowerCase())); // Case-insensitive substring match, tried in uniqueDomains order.
+    if (directMatch) {
+        return directMatch;
+    }
+    if (domains.includes('implementation')) { // Exact, case-sensitive check for the fallback domain.
+        return 'implementation';
+    }
+    return domains[0] ?? null; // domains is non-empty at this point, so the ?? null is purely defensive.
+}
+export function summarizePipelineOutcome(manifest) { // Builds a ' | '-joined summary from manifest.summary plus up to two succeeded-command highlights; null when nothing usable exists.
+    const chunks = [];
+    if (typeof manifest.summary === 'string' && manifest.summary.trim().length > 0) {
+        chunks.push(normalizeSummary(manifest.summary));
+    }
+    const stageHighlights = manifest.commands // NOTE(review): assumes manifest.commands is an array — confirm against the manifest schema.
+        .filter((command) => command.status === 'succeeded')
+        .map((command) => normalizeSummary(command.summary ?? command.title)) // NOTE(review): throws if a succeeded command has neither summary nor title, because normalizeSummary calls .replace on its argument.
+        .filter((value) => value.length > 0)
+        .slice(0, 2); // Keep at most the first two non-empty highlights.
+    chunks.push(...stageHighlights);
+    if (chunks.length === 0) {
+        return null;
+    }
+    return chunks.join(' | ');
+}
+function uniqueDomains(promptPacks) { // Returns the trimmed, non-empty pack domains, de-duplicated in first-seen order.
+    const seen = new Set();
+    const domains = [];
+    for (const pack of promptPacks) {
+        const domain = pack.domain.trim(); // NOTE(review): assumes pack.domain is a string — confirm against the prompt-pack loader.
+        if (!domain || seen.has(domain)) { // De-duplication is case-sensitive (exact Set membership).
+            continue;
+        }
+        seen.add(domain);
+        domains.push(domain);
+    }
+    return domains;
+}
+function resolveDurationMs(manifest) { // Milliseconds from started_at to completed_at, clamped to >= 0; 0 when either timestamp fails Date.parse.
+    const startedAt = Date.parse(manifest.started_at);
+    const completedAt = Date.parse(manifest.completed_at ?? manifest.started_at); // A null/undefined completed_at falls back to started_at, yielding a duration of 0.
+    if (!Number.isFinite(startedAt) || !Number.isFinite(completedAt)) {
+        return 0;
+    }
+    return Math.max(0, completedAt - startedAt);
+}
+function normalizeSummary(value) { // Collapses each whitespace run to a single space and trims both ends; value must be a string.
+    return value.replace(/\s+/gu, ' ').trim();
+}
+function countWords(value) { // Counts the whitespace-separated, non-empty tokens in value.
+    const tokens = value.trim().split(/\s+/u).filter(Boolean); // filter(Boolean) drops the single '' produced when value is empty or all whitespace.
+    return tokens.length;
+}
+function roundCurrency(value) { // Rounds value to six decimal places.
+    return Math.round(value * 1_000_000) / 1_000_000;
+}

package/dist/orchestrator/src/cloud/CodexCloudTaskExecutor.js CHANGED Viewed

@@ -5,7 +5,7 @@ import { setTimeout as sleep } from 'node:timers/promises';
 import { isoTimestamp } from '../cli/utils/time.js';
 const TASK_ID_PATTERN = /\btask_[a-z]_[a-f0-9]+\b/i;
 const MAX_LOG_CHARS = 32 * 1024;
-const STATUS_RETRY_LIMIT = 3;
+const STATUS_RETRY_LIMIT = 12;
 const STATUS_RETRY_BACKOFF_MS = 1500;
 const DEFAULT_LIST_LIMIT = 20;
 export function extractCloudTaskId(text) {
@@ -104,6 +104,12 @@ export class CodexCloudTaskExecutor {
             if (input.branch && input.branch.trim()) {
                 execArgs.push('--branch', input.branch.trim());
             }
+            for (const feature of normalizeFeatureList(input.enableFeatures)) {
+                execArgs.push('--enable', feature);
+            }
+            for (const feature of normalizeFeatureList(input.disableFeatures)) {
+                execArgs.push('--disable', feature);
+            }
             execArgs.push(input.prompt);
             const execResult = await runCloudCommand(execArgs);
             if (execResult.exitCode !== 0) {
@@ -123,6 +129,8 @@ export class CodexCloudTaskExecutor {
             }
             const timeoutAt = Date.now() + cloudExecution.timeout_seconds * 1000;
             let statusRetries = 0;
+            let lastKnownStatus = cloudExecution.status;
+            let loggedNonZeroStatus = false;
             while (Date.now() < timeoutAt) {
                 const statusResult = await runCloudCommand(['cloud', 'status', taskId]);
                 cloudExecution.last_polled_at = this.now();
@@ -139,9 +147,14 @@ export class CodexCloudTaskExecutor {
                     await this.sleepFn(STATUS_RETRY_BACKOFF_MS * statusRetries);
                     continue;
                 }
+                if (statusResult.exitCode !== 0 && mapped !== 'unknown' && !loggedNonZeroStatus) {
+                    notes.push(`Cloud status returned exit ${statusResult.exitCode} with remote status ${mapped}; continuing to poll.`);
+                    loggedNonZeroStatus = true;
+                }
                 statusRetries = 0;
                 if (mapped !== 'unknown') {
                     cloudExecution.status = mapped;
+                    lastKnownStatus = mapped;
                 }
                 if (mapped === 'ready') {
                     notes.push(`Cloud task completed: ${taskId}`);
@@ -155,7 +168,7 @@ export class CodexCloudTaskExecutor {
             }
             if (cloudExecution.status === 'running' || cloudExecution.status === 'queued') {
                 cloudExecution.status = 'failed';
-                cloudExecution.error = `Timed out waiting for cloud task completion after ${cloudExecution.timeout_seconds}s.`;
+                cloudExecution.error = `Timed out waiting for cloud task completion after ${cloudExecution.timeout_seconds}s (last remote status: ${lastKnownStatus}, polls: ${cloudExecution.poll_count}).`;
             }
             if (cloudExecution.status === 'ready') {
                 const diffResult = await runCloudCommand(['cloud', 'diff', taskId]);
@@ -213,6 +226,25 @@ export class CodexCloudTaskExecutor {
         }
     }
 }
+function normalizeFeatureList(features) { // Returns the string entries of features, trimmed, with empties and duplicates dropped, in first-seen order.
+    if (!Array.isArray(features) || features.length === 0) { // Non-array input (including undefined) and empty arrays both yield [].
+        return [];
+    }
+    const seen = new Set();
+    const normalized = [];
+    for (const raw of features) {
+        if (typeof raw !== 'string') { // Non-string entries are skipped silently.
+            continue;
+        }
+        const feature = raw.trim();
+        if (!feature || seen.has(feature)) { // Duplicates are detected after trimming, case-sensitively.
+            continue;
+        }
+        seen.add(feature);
+        normalized.push(feature);
+    }
+    return normalized;
+}
 export async function defaultCloudCommandRunner(request) {
     return await new Promise((resolve, reject) => {
         const child = spawn(request.command, request.args, {

package/docs/README.md CHANGED Viewed

@@ -53,6 +53,9 @@ Group execution (when `FEATURE_TFGRPO_GROUP=on`): repeat the Builder → Tester
 - Manifests record the tag, commit SHA, tarball digest/path, queue payload path, and validation status (`validated`, `snapshot_failed`, `stalled_snapshot`, `needs_manual_scenario`) under `learning.*` so reviewers can audit outcomes without external storage.
 - Scenario synthesis replays the most recent successful command from the run (or prompt/diff fallback), writes `learning/scenario.json`, and automatically executes the commands; validation logs live at `learning/scenario-validation.log` and are stored in `learning.validation.log_path`.
 - Override snapshot storage with `LEARNING_SNAPSHOT_DIR=/custom/dir` when needed; the default lives under `.runs/learning-snapshots/` (or `$CODEX_ORCHESTRATOR_RUNS_DIR/learning-snapshots/` when configured).
+- Successful pipeline runs also persist lightweight experience records in `out/<task-id>/experiences.jsonl` using prompt-pack domains, so future runs can inject higher-signal context without requiring learning snapshots.
+- Prompt-pack injections apply a minimum reward threshold (`TFGRPO_EXPERIENCE_MIN_REWARD`, default `0.1`) to avoid re-injecting low-signal records.
+- In cloud execution mode, the orchestrator now injects a bounded subset of relevant prompt-pack experience snippets directly into the cloud task prompt, so persisted experience data can influence execution outcomes immediately.
 ### How to run the learning pipeline locally
 - Seed a normal run and keep manifests grouped by task:
@@ -199,7 +202,7 @@ Note: the commands below assume a source checkout; `scripts/` helpers are not in
 | `npm run eval:test` | Optional evaluation harness (enable when `evaluation/fixtures/**` is populated). |
 | `npm run docs:check` | Deterministically validates scripts/pipelines/paths referenced in agent-facing docs. |
 | `npm run docs:freshness` | Validates docs registry coverage + review recency; writes `out/<task-id>/docs-freshness.json`. |
-| `npm run ci:cloud-canary` | Runs the cloud canary harness (`scripts/cloud-canary-ci.mjs`) to verify cloud lifecycle manifest + run-summary evidence; credential-gated by `CODEX_CLOUD_ENV_ID` and optional auth secrets (`CODEX_CLOUD_BRANCH` defaults to `main`). |
+| `npm run ci:cloud-canary` | Runs the cloud canary harness (`scripts/cloud-canary-ci.mjs`) to verify cloud lifecycle manifest + run-summary evidence; credential-gated by `CODEX_CLOUD_ENV_ID` and optional auth secrets (`CODEX_CLOUD_BRANCH` defaults to `main`). Feature flags can be passed through with `CODEX_CLOUD_ENABLE_FEATURES` / `CODEX_CLOUD_DISABLE_FEATURES` (comma- or space-delimited, e.g. `sqlite,memory_tool`). |
 | `node scripts/delegation-guard.mjs` | Enforces subagent delegation evidence before review (repo-only). |
 | `node scripts/spec-guard.mjs --dry-run` | Validates spec freshness; required before review (repo-only). |
 | `node scripts/diff-budget.mjs` | Guards against oversized diffs before review (repo-only; defaults: 25 files / 800 lines; supports explicit overrides). |

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@kbediako/codex-orchestrator",
-  "version": "0.1.16",
+  "version": "0.1.18",
   "license": "MIT",
   "repository": {
     "type": "git",
@@ -62,6 +62,7 @@
     "eval:test": "vitest run evaluation/tests",
     "test:watch": "vitest",
     "review": "node --loader ts-node/esm scripts/run-review.ts",
+    "pr:watch-merge": "node scripts/pr-monitor-merge.mjs",
     "status-ui": "node scripts/status-ui-serve.mjs",
     "design:purge-expired": "node --loader ts-node/esm scripts/design/purgeExpired.ts",
     "generate:manifest-types": "node scripts/generate-manifest-types.mjs",

package/skills/collab-deliberation/SKILL.md CHANGED Viewed

@@ -5,17 +5,81 @@ description: Structure multi-agent brainstorming and deliberation (options, trad
 # Collab Deliberation
-Use this skill when the user asks for brainstorming, multiple approaches, pros/cons, or decision support. This skill is for **ideas**, not implementation.
+Use this skill when the user asks for brainstorming, tradeoffs, option comparison, or decision support before implementation. This skill is for ideas and decisions, not coding.
-## Workflow
+## Deliberation Default v1 (required)
+- Keep MCP as the lead control plane. Use collab/delegated subagents to generate and challenge options.
+- Run full deliberation when any hard-stop trigger is true:
+  - Irreversible/destructive change with unclear rollback.
+  - Auth/secrets/PII boundary touched.
+  - Direct production customer/financial/legal impact.
+  - Conflicting intent on high-impact work.
+- Otherwise compute a risk score (`0..2` each): reversibility, external impact, security/privacy boundary, blast radius, requirement clarity, verification strength, time pressure.
+- Run full deliberation when score `>=7` or two or more criteria score `2`.
+- Use these time budgets for auto-deliberation:
-1) Clarify the decision: summarize the goal, constraints, and success criteria.
-2) Generate options: 3–5 distinct approaches with short descriptions.
-3) Compare tradeoffs: cost, risk, speed, maintenance, and alignment with guardrails.
-4) Recommend: choose a recommended approach and explain why.
-5) Open questions: list 1–3 questions that would change the recommendation.
+| Class | Horizon | Soft cap | Hard cap |
+| --- | --- | --- | --- |
+| `T0` | `<=15m` | `5s` | `12s` |
+| `T1` | `15m..2h` | `20s` | `45s` |
+| `T2` | `2h..8h` | `60s` | `120s` |
+| `T3` | `>8h` | `120s` | `300s` |
+- On soft cap: stop branching and execute the best current plan.
+- On hard cap: disable auto-deliberation for that stage and continue execution.
+## Auto-trigger cadence (required)
+- Run deliberation at task bootstrap for non-trivial work.
+- Re-run deliberation after each meaningful chunk (default: behavior change or about 2+ files touched).
+- Re-run deliberation when external feedback lands (PR review, bot findings, CI failures).
+- Re-run deliberation when ambiguity/risk increases mid-flight (new constraints, conflicting evidence, high-signal `P1` or any `P0` finding).
+- Re-run deliberation at least every 45 minutes during active implementation.
+- If orchestration uses symbolic RLM, keep runtime auto-deliberation enabled:
+  - `RLM_SYMBOLIC_DELIBERATION=1` (default)
+  - `RLM_SYMBOLIC_DELIBERATION_INTERVAL` (default `2`)
+  - `RLM_SYMBOLIC_DELIBERATION_MAX_RUNS` (default `12`)
+  - `RLM_SYMBOLIC_DELIBERATION_MAX_SUMMARY_BYTES` (default `2048`)
+  - `RLM_SYMBOLIC_DELIBERATION_INCLUDE_IN_PLANNER=1` (default)
+## Workflow (required)
+1) Frame the decision.
+- Write a one-sentence decision statement.
+- Capture goals, constraints, non-goals, and success criteria.
+- List assumptions and label each `confirmed` or `unconfirmed`.
+2) Close critical context gaps.
+- Ask up to 3 targeted questions only if answers could change the recommendation.
+- If delegation is available, prefer a subagent for context gathering before asking the user.
+3) Generate distinct options.
+- Produce 3-5 materially different options.
+- For each option include approach, prerequisites, blast radius, and time/risk profile.
+4) Evaluate and stress test.
+- Use a tailored rubric (3-5 dimensions relevant to the decision).
+- For each option include one likely failure mode and one mitigation.
+5) Recommend or defer explicitly.
+- Recommend one option when confidence is sufficient.
+- If uncertainty is high, defer with explicit decision gates.
+6) Close with decision-driving questions.
+- List 1-3 prioritized open questions that could change the recommendation.
+- End with one concrete next step that improves decision quality without implementation.
+## Output contract
+- `Decision`: one sentence.
+- `Context`: goals, constraints, non-goals, assumptions.
+- `Options`: 3-5 concise options.
+- `Tradeoffs`: rubric and comparative rationale.
+- `Recommendation`: chosen option or explicit defer with decision gates.
+- `Open questions`: prioritized items only.
+- `Next step`: single highest-leverage action.
+- `Confidence`: `high` | `medium` | `low`.
 ## Guardrails
-- Separate ideas from decisions.
+- Separate facts from assumptions.
 - Do not implement or modify code unless explicitly asked.
+- Do not present uncertainty as certainty.
 - Keep outputs concise and action-oriented.
+- If collab subagents are used, close lifecycle loops per id (`spawn_agent` -> `wait` -> `close_agent`) before finishing.

package/skills/delegate-early/SKILL.md CHANGED Viewed

@@ -1,49 +1,21 @@
 ---
 name: delegate-early
-description: Spawn subagents early and often to conserve context and parallelize research, review, and planning.
+description: Compatibility alias for delegation-first workflows. Prefer `delegation-usage` as the canonical skill.
 ---
-# Delegate Early
+# Delegate Early (Compatibility Alias)
-Use this skill when a task can be split into parallel streams or when the main context risks ballooning. The top-level Codex remains the lead; subagents are assistants.
+`delegate-early` is kept for backward compatibility. The canonical delegation workflow now lives in `delegation-usage`.
-## Goals
-- Conserve primary context by offloading research/review/planning.
-- Improve throughput with parallel subagent streams.
-- Capture delegation evidence for auditability.
+## Required behavior
+- Immediately follow `delegation-usage` for setup, spawn semantics, question queue handling, confirmation flow, and manifest usage.
+- Keep delegation MCP enabled by default; enable other MCP servers only when relevant to the task.
+- Preserve delegation evidence (task-id stream naming + manifest path capture) exactly as documented in `delegation-usage`.
-## When to spawn
-- Before deep reading/analysis to avoid bloating context.
-- When new ambiguity appears or scope changes.
-- For independent streams (research, review, planning, edge cases).
+## Quick routing
+1. Use `delegation-usage` as the source of truth.
+2. Apply early fan-out only when streams are clearly independent and acceptance criteria are explicit.
+3. Keep summaries short and artifact-first; avoid long chat dumps.
-## Task slicing heuristic
-- Identify 2–4 independent streams with minimal shared context.
-- Prefer streams like: `research`, `review`, `spec-check`, `edge-cases`.
-## Required conventions
-- Use `MCP_RUNNER_TASK_ID=<task-id>-<stream>` for subagents.
-- Record manifest paths and summarize findings in the main run.
-- Before review handoff, run the delegation guard stage via the packaged runner:
-  `node "$CODEX_ORCHESTRATOR_PACKAGE_ROOT/dist/orchestrator/src/cli/utils/delegationGuardRunner.js"`.
-  For ad-hoc runs without task IDs, set `CODEX_ORCHESTRATOR_GUARD_PROFILE=warn`.
-## Minimal delegation workflow
-1) Name streams and write 1–2 sentence goals for each.
-2) Spawn subagents with clear, bounded prompts.
-3) Wait for subagent completion; retrieve manifest evidence and summarize findings into the main plan.
-4) Proceed with implementation.
-## Prompt patterns
-- Research: “Find X, cite Y, return 3 bullets + risks.”
-- Review: “Inspect files A/B for regressions; list issues by severity.”
-- Planning: “Draft a 3–5-step plan, call out unknowns.”
-## Escalation rules
-- If delegation is impossible, set `DELEGATION_GUARD_OVERRIDE_REASON` and document it in the task checklist.
-## Subagent summary format
-- **Findings**: Key results and conclusions from the subagent run
-- **Risks**: Issues, blockers, or concerns
-- **Open questions**: Unresolved items requiring follow-up
-- **Evidence**: Manifest path (e.g., `.runs/<task-id>-<stream>/cli/<timestamp>/manifest.json`)
+## Note
+If guidance in this file conflicts with `delegation-usage`, follow `delegation-usage`.

package/skills/delegation-usage/DELEGATION_GUIDE.md CHANGED Viewed

@@ -1,6 +1,7 @@
 # Delegation Guide (Detailed)
 Use this guide for deeper context on delegation behavior, tool surfaces, and troubleshooting.
+`delegation-usage` is the canonical delegation workflow; `delegate-early` should be treated as a compatibility alias.
 ## Mental model
@@ -80,6 +81,16 @@ delegate.spawn({
 })
 ```
+## Collab lifecycle hygiene (required)
+When using collab tools (`spawn_agent` / `wait` / `close_agent`):
+- Treat each spawned `agent_id` as a resource that must be closed.
+- For every successful spawn, run `wait` then `close_agent` for the same id.
+- Keep a local list of spawned ids and run a final cleanup pass before returning.
+- On timeout/error paths, still close known ids before reporting failure.
+- If you see `agent thread limit reached`, stop spawning immediately, close known ids, and retry only after cleanup.
 ## RLM budget overrides (recommended defaults)
 If you want deeper recursion or longer wall-clock time for delegated runs, set RLM budgets on the delegation server:
@@ -106,12 +117,12 @@ If you need delegation to respect a repo’s `.codex/orchestrator.toml` (e.g., s
 ## Version guard (JSONL handshake)
-Delegation MCP expects JSONL. Use `codex-orchestrator` 0.1.12 or newer.
+Delegation MCP expects JSONL. Keep `codex-orchestrator` aligned with the current release line.
 - Check: `codex-orchestrator --version`
-- Update global: `npm i -g @kbediako/codex-orchestrator@0.1.12`
-- Or pin via npx: `npx -y @kbediako/codex-orchestrator@0.1.12 delegate-server`
-- If your installed CLI is behind 0.1.12, prefer upgrading or pinning to the docs’ minimum.
+- Update global: `npm i -g @kbediako/codex-orchestrator@latest`
+- Or pin via npx: `npx -y @kbediako/codex-orchestrator@<version> delegate-server`
+- If using a custom Codex fork, fast-forward from `upstream/main` regularly and rebuild to avoid protocol drift.
 ## Common failures
@@ -120,3 +131,6 @@ Delegation MCP expects JSONL. Use `codex-orchestrator` 0.1.12 or newer.
 - **Tool profile ignored**: The repo’s `delegate.allowed_tool_servers` may be empty, or names are invalid.
 - **Missing control files**: delegate tools rely on `control_endpoint.json` in the run directory.
 - **Run identifiers**: status/pause/cancel require `manifest_path`; question queue requires `parent_manifest_path`.
+- **Collab payload mismatch**: `spawn_agent` calls fail if they include both `message` and `items`.
+- **Collab depth limits**: recursive collab fan-out can fail near max depth; prefer shallow parent fan-out.
+- **Collab lifecycle leaks**: missing `close_agent` calls can exhaust thread slots and block future spawns (`agent thread limit reached`).

package/skills/delegation-usage/SKILL.md CHANGED Viewed

@@ -9,8 +9,21 @@ description: Use when operating the Codex delegation MCP server and tools (deleg
 Use this skill to operate delegation MCP tools with delegation enabled by default (the only MCP on by default). Disable it only when required by safety constraints, and keep other MCPs off unless they are relevant to the task.
+`delegation-usage` is the canonical delegation workflow skill. If `delegate-early` is present, treat it as a compatibility alias that should redirect to this skill.
 Collab multi-agent mode is separate from delegation. For symbolic RLM subcalls that use collab tools, set `RLM_SYMBOLIC_COLLAB=1` and ensure a collab-capable Codex CLI; collab tool calls are recorded in `manifest.collab_tool_calls`. If collab tools are unavailable in your CLI build, skip collab steps; delegation still works independently.
+## Collab realities in delegated runs (current behavior)
+- `spawn_agent` accepts one input style per call: either `message` (plain text) or `items` (structured input).
+- Do not send both `message` and `items` in the same `spawn_agent` call.
+- Spawn returns an `agent_id` (thread id). Current TUI collab rendering is id-based; do not depend on custom visible agent names.
+- Subagents spawned through collab run with approval effectively set to `never`; design child tasks to avoid approval/escalation requirements.
+- Collab spawn depth is bounded. Near/at max depth, recursive delegation can fail or collab can be disabled in children; prefer shallow parent fan-out.
+- **Lifecycle is mandatory:** for every successful `spawn_agent`, run `wait` and then `close_agent` for that same id before task completion.
+- Keep a local list of spawned ids and run a final cleanup pass so no agent id is left unclosed on timeout/error paths.
+- If spawn fails with `agent thread limit reached`, stop spawning, close any known ids first, then surface a concise recovery note.
 ## Quick-start workflow (canned)
 Use this when delegation tools are missing in the current run (MCP disabled) and you want a background Codex run to handle delegation:
@@ -64,12 +77,11 @@ For runner + delegation coordination (short `--task` flow), see `docs/delegation
 ### 0a) Version guard (JSONL handshake)
-- Delegation MCP uses JSONL; ensure the server binary meets the docs’ minimum version (0.1.12):
-  - `codex-orchestrator --version` should be `>= 0.1.12`.
-- If not, update global install: `npm i -g @kbediako/codex-orchestrator@0.1.12`
-- Alternative: pin the MCP server to `npx -y @kbediako/codex-orchestrator@0.1.12` for deterministic behavior.
-- Note: if your installed CLI is older than 0.1.12, prefer upgrading or pinning to the docs’ minimum.
-- Keep the version pins in this section in sync with the docs’ minimum (currently 0.1.12).
+- Delegation MCP uses JSONL; keep `codex-orchestrator` aligned with the current release line.
+  - Check installed version: `codex-orchestrator --version`
+  - Preferred update path: `npm i -g @kbediako/codex-orchestrator@latest`
+  - Deterministic pin path (for reproducible environments): `npx -y @kbediako/codex-orchestrator@<version> delegate-server`
+- If using a custom Codex fork, fast-forward it regularly from `upstream/main` and rebuild the managed CLI to avoid delegation/collab protocol drift.
 ### 0b) Background terminal bootstrap (required when MCP is disabled)
@@ -163,3 +175,6 @@ repeat:
 - **Confirmation misuse:** never pass `confirm_nonce` from model/tool input; it is runner‑injected only.
 - **Secrets exposure:** never include secrets/tokens/PII in delegate prompts or files.
 - **Missing control files:** delegate tools rely on `control_endpoint.json` in the run directory; older runs may not have it.
+- **Collab payload mismatch:** `spawn_agent` rejects calls that include both `message` and `items`.
+- **Collab UI assumptions:** agent rows/records are id-based today; use explicit stream role text in prompts/artifacts for operator clarity.
+- **Collab lifecycle leaks:** missing `close_agent` calls accumulate open threads and can trigger `agent thread limit reached`; always finish `spawn -> wait -> close_agent` per id.

package/skills/docs-first/SKILL.md CHANGED Viewed

@@ -16,6 +16,7 @@ Use this skill when a task needs a spec-driven workflow. The objective is to cre
 - TECH_SPEC: capture technical requirements (use `.agent/task/templates/tech-spec-template.md`; stored under `tasks/specs/<id>-<slug>.md`).
 - ACTION_PLAN: capture sequencing/milestones (use `.agent/task/templates/action-plan-template.md`).
 - Depth scales with scope, but all three docs are required.
+- For low-risk tiny edits, follow the bounded shortcut in `docs/micro-task-path.md` instead of long-form rewrites (still requires task/spec evidence).
 2) Register the TECH_SPEC and task
 - Add the TECH_SPEC to `tasks/index.json` (including `last_review`).

package/skills/standalone-review/SKILL.md CHANGED Viewed

@@ -1,6 +1,6 @@
 ---
 name: standalone-review
-description: Use for ad-hoc/standalone reviews outside pipelines (fast checks during implementation or before handoff) using `codex review`.
+description: Use for required periodic cross-check reviews during implementation and before handoff using `codex review`.
 ---
 # Standalone Review
@@ -10,6 +10,17 @@ description: Use for ad-hoc/standalone reviews outside pipelines (fast checks du
 Use this skill when you need a fast, ad-hoc review without running a pipeline or collecting a manifest. It is ideal during implementation or for quick pre-flight checks.
 Before implementation, use it to review the task/spec against the user’s intent and record the approval in the PRD/TECH_SPEC or task notes.
+## Auto-trigger policy (required)
+Run this skill automatically whenever any condition is true:
+- You made code/config/script/test edits since the last standalone review.
+- You finished a meaningful chunk of work (default: behavior change or about 2+ files touched).
+- You are about to report completion, propose merge, or answer "what's next?" with recommendations.
+- You addressed external feedback (PR reviews, bot comments, or CI-fix patches).
+- 45 minutes of active implementation elapsed without a standalone review.
+If review execution is blocked, record why in task notes, then do manual diff review plus targeted tests before proceeding.
 ## Quick start
 Uncommitted diff:
@@ -39,6 +50,7 @@ codex review "Focus on correctness, regressions, edge cases; list missing tests.
 - Keep prompts short, specific, and test-oriented.
 2) Run the review often
+- Follow the auto-trigger policy above (not optional).
 - Run after each meaningful chunk of work.
 - Prefer targeted focus prompts for WIP reviews.

package/templates/codex/AGENTS.md CHANGED Viewed

@@ -1,9 +1,11 @@
-<!-- codex:instruction-stamp 7477a786938b5c7f883b7d21a0954b48a1ddbbc3dcabd052b347b303cb3075a4 -->
+<!-- codex:instruction-stamp 2408396e5cc9b25d5522b7064010a36a43007508072f3e0f051ab042370928a1 -->
 # Agent Instructions (Template)
 ## Orchestrator-first workflow
 - Use `codex-orchestrator` pipelines for planning, implementation, validation, and review.
 - Default to `docs-review` before implementation and `implementation-gate` after code changes.
+- Prefer cloud mode when runs are long-running/parallel and cloud prerequisites are ready.
+- Before cloud mode, verify branch availability, non-interactive setup commands, and required secrets/variables; if missing, run in local `mcp` mode and record why.
 - Before implementation, run a standalone review of the task/spec against the user’s intent and record the approval in the spec + checklist notes.
 - Delegation is mandatory for top-level tasks once a task id exists: spawn at least one subagent run using `MCP_RUNNER_TASK_ID=<task-id>-<stream>`, capture manifest evidence, and summarize in the main run. Use `DELEGATION_GUARD_OVERRIDE_REASON` only when delegation is impossible (technical/blocking limitation or explicit operational block) and record the justification.
 - Once a task id exists, prefer delegation for research, review, and planning work. Use `codex exec` only for pre-task triage (no task id yet) or when delegation is genuinely unavailable (technical/blocking limitation or explicit operational block), and set `DELEGATION_GUARD_OVERRIDE_REASON` with a clear justification.
@@ -18,11 +20,38 @@
 - Use `codex review` for quick checks during implementation.
 - Capture standalone review approval in the spec/task notes before implementation begins.
 - When you need manifest-backed review evidence, run `npm run review` with the manifest path.
+- Before merge for non-trivial changes, run one explicit elegance/minimality review pass and simplify avoidable complexity.
 ## Delegation (recommended)
 - For non-trivial work, spawn at least one subagent run using `MCP_RUNNER_TASK_ID=<task-id>-<stream>`.
 - If delegation is not possible, record the reason in the task checklist.
+## Deliberation Default (agent-first)
+- Keep MCP as the lead control plane. Use collab/delegated subagents for deliberation when ambiguity or impact is high.
+- Run full deliberation on any hard-stop trigger:
+  - Irreversible/destructive changes with unclear rollback.
+  - Auth/secrets/PII boundary changes.
+  - Direct production customer/financial/legal impact.
+  - Conflicting intent on high-impact changes.
+- Otherwise, use a simple risk score (`0..2` each): reversibility, external impact, security/privacy boundary, blast radius, requirement clarity, verification strength, time pressure.
+- Require full deliberation when score `>=7` or two or more criteria score `2`.
+- Time budgets for auto-deliberation:
+  - `T0` quick: `5s / 12s` (soft/hard)
+  - `T1` standard: `20s / 45s`
+  - `T2` complex: `60s / 120s`
+  - `T3` long-horizon: `120s / 300s`
+- On soft cap: stop branching and execute the best current plan.
+- On hard cap: disable auto-deliberation for that stage and continue execution.
+- Review-signal policy:
+  - `P0` critical findings are hard-stop.
+  - `P1` high findings are hard-stop only when high-signal (clear evidence or corroboration).
+  - `P2/P3` findings are tracked follow-ups.
+## Completion discipline (patience-first)
+- Wait/poll for terminal state on long-running operations (CI checks, reviews, cloud jobs, orchestrator runs) before reporting completion.
+- Reset waiting windows when checks restart or new feedback appears.
+- Do not hand off mid-flight work unless the user explicitly asks to stop.
 ## Instruction stamp
 - If you edit this file, refresh the instruction stamp.
 - One-liner: