@kbediako/codex-orchestrator 0.1.16 → 0.1.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,122 @@
1
+ import { relativeToRepo } from '../run/runPaths.js';
2
+ import { ExperienceStore } from '../../persistence/ExperienceStore.js';
3
+ import { loadInstructionSet } from '../../../../packages/orchestrator/src/instructions/loader.js';
4
+ import { logger } from '../../logger.js';
5
/** Ground-truth reward score attached to every experience recorded for a succeeded run. */
const SUCCESS_REWARD = 1;

/** Nominal USD cost per token (0.000002) used to estimate `costUsd` in the recorded tool stats. */
const COST_PER_TOKEN_USD = 2e-6;
7
/**
 * Persists one experience record for a pipeline run that ended with status `succeeded`.
 *
 * The function is best-effort: any error raised while loading instructions, building the
 * record, or writing it is logged as a warning and never propagated to the caller.
 * Nothing is recorded when the run did not succeed, when no prompt pack has
 * `experienceSlots > 0`, when no domain/pack can be resolved, or when the manifest yields
 * no summary text.
 *
 * @param {object} params
 * @param {object} params.env - Reads `repoRoot`, `outRoot`, and `runsRoot`.
 * @param {object} params.pipeline - Reads `id`; also passed to `resolveExperienceDomain`.
 * @param {object} params.manifest - Reads `status`, `run_id`, `task_id`; also summarized and timed.
 * @param {object} params.paths - Reads `manifestPath`.
 * @returns {Promise<void>}
 */
export async function persistPipelineExperience(params) {
  const { env, pipeline, manifest, paths } = params;
  if (manifest.status !== 'succeeded') {
    return;
  }
  try {
    const instructions = await loadInstructionSet(env.repoRoot);

    // Only packs that reserve experience slots are candidates.
    const eligiblePacks = instructions.promptPacks.filter((pack) => pack.experienceSlots > 0);
    if (eligiblePacks.length === 0) {
      return;
    }

    const domain = resolveExperienceDomain(pipeline, eligiblePacks);
    const selectedPack = domain
      ? eligiblePacks.find((pack) => pack.domain === domain)
      : undefined;
    if (!selectedPack) {
      return;
    }

    const summary = summarizePipelineOutcome(manifest);
    if (!summary) {
      return;
    }

    // Token usage is approximated by the summary's word count (at least 1).
    const tokenCount = Math.max(1, countWords(summary));
    const pipelineStats = {
      tool: `pipeline:${pipeline.id}`,
      tokens: tokenCount,
      latencyMs: resolveDurationMs(manifest),
      costUsd: roundCurrency(tokenCount * COST_PER_TOKEN_USD)
    };
    const record = {
      runId: manifest.run_id,
      taskId: manifest.task_id,
      epoch: null,
      groupId: null,
      summary,
      reward: { gtScore: SUCCESS_REWARD, relativeRank: 0 },
      toolStats: [pipelineStats],
      stampSignature: selectedPack.stamp,
      domain
    };

    const store = new ExperienceStore({
      outDir: env.outRoot,
      runsDir: env.runsRoot,
      maxSummaryWords: instructions.experienceMaxWords
    });
    const manifestRef = relativeToRepo(env, paths.manifestPath);
    await store.recordBatch([record], manifestRef);
  }
  catch (error) {
    const reason = error?.message ?? String(error);
    logger.warn(`Failed to persist pipeline experience for run ${manifest.run_id}: ${reason}`);
  }
}
61
/**
 * Chooses the experience domain for a pipeline from the domains offered by the prompt packs.
 *
 * Resolution order:
 *   1. the first pack domain that occurs (case-insensitively, as a substring) in the
 *      pipeline's id, title, or tags;
 *   2. `'implementation'` when that domain is offered;
 *   3. the first offered domain.
 *
 * @param {object} pipeline - Reads `id`, `title`, and optional `tags` (array of strings).
 * @param {Array<{domain: string}>} promptPacks - Candidate prompt packs.
 * @returns {string|null} The resolved domain, or `null` when the packs offer no usable domain.
 */
export function resolveExperienceDomain(pipeline, promptPacks) {
  const domains = uniqueDomains(promptPacks);
  if (domains.length === 0) {
    return null;
  }

  // A missing id/title must contribute nothing to the haystack. Interpolating it directly
  // would inject the literal text "undefined"/"null" and let domains such as "defined"
  // match by accident.
  const id = pipeline.id ?? '';
  const title = pipeline.title ?? '';
  const tags = (pipeline.tags ?? []).join(' ');
  const haystack = normalizeSummary(`${id} ${title} ${tags}`).toLowerCase();

  const directMatch = domains.find((domain) => haystack.includes(domain.toLowerCase()));
  if (directMatch) {
    return directMatch;
  }
  if (domains.includes('implementation')) {
    return 'implementation';
  }
  // `domains` is non-empty here, so the first entry always exists.
  return domains[0];
}
76
/**
 * Builds a short, single-line summary of a run from its manifest.
 *
 * The result is the manifest's own summary (when it is a non-blank string) followed by the
 * text of at most two succeeded commands, each whitespace-normalized and joined with ` | `.
 * For a command, `summary` is preferred and `title` is used when `summary` is null/undefined.
 *
 * @param {object} manifest - Reads `summary` and `commands` (array of
 *   `{status, summary?, title?}`); a missing or non-array `commands` is treated as empty.
 * @returns {string|null} The combined summary, or `null` when there is nothing to report.
 */
export function summarizePipelineOutcome(manifest) {
  const chunks = [];
  if (typeof manifest.summary === 'string' && manifest.summary.trim().length > 0) {
    chunks.push(normalizeSummary(manifest.summary));
  }

  // Tolerate manifests without a commands array instead of throwing.
  const commands = Array.isArray(manifest.commands) ? manifest.commands : [];
  const stageHighlights = commands
    .filter((command) => command?.status === 'succeeded')
    .map((command) => {
      // A succeeded command may carry neither summary nor title; skip it rather than
      // crash on `undefined.replace(...)`.
      const text = command.summary ?? command.title;
      return typeof text === 'string' ? normalizeSummary(text) : '';
    })
    .filter((value) => value.length > 0)
    .slice(0, 2);
  chunks.push(...stageHighlights);

  if (chunks.length === 0) {
    return null;
  }
  return chunks.join(' | ');
}
92
/**
 * Collects the distinct, non-empty domains of the given prompt packs.
 *
 * Domains are trimmed before comparison; first-seen order is preserved.
 *
 * @param {Iterable<{domain: string}>} promptPacks
 * @returns {string[]}
 */
function uniqueDomains(promptPacks) {
  const trimmedDomains = Array.from(promptPacks, (pack) => pack.domain.trim());
  const nonEmpty = trimmedDomains.filter((domain) => domain.length > 0);
  // A Set keeps insertion order, so spreading it yields first-seen ordering.
  return [...new Set(nonEmpty)];
}
105
/**
 * Computes the run duration in milliseconds from the manifest timestamps.
 *
 * `completed_at` falls back to `started_at` when it is null/undefined. Unparseable
 * timestamps and negative spans both yield 0.
 *
 * @param {{started_at: string, completed_at?: string|null}} manifest
 * @returns {number} Non-negative duration in milliseconds.
 */
function resolveDurationMs(manifest) {
  const beganMs = Date.parse(manifest.started_at);
  const endedMs = Date.parse(manifest.completed_at ?? manifest.started_at);
  const bothParsed = Number.isFinite(beganMs) && Number.isFinite(endedMs);
  if (!bothParsed) {
    return 0;
  }
  const elapsedMs = endedMs - beganMs;
  return elapsedMs > 0 ? elapsedMs : 0;
}
113
/**
 * Collapses every run of whitespace into a single space and strips both ends.
 *
 * @param {string} value
 * @returns {string}
 */
function normalizeSummary(value) {
  const pieces = value.split(/\s+/u);
  return pieces.join(' ').trim();
}
116
/**
 * Counts the whitespace-separated words in a string.
 *
 * @param {string} value
 * @returns {number} Number of non-whitespace runs; 0 for empty or blank input.
 */
function countWords(value) {
  const words = value.match(/\S+/gu);
  return words === null ? 0 : words.length;
}
120
/**
 * Rounds a currency amount to six decimal places.
 *
 * @param {number} value
 * @returns {number}
 */
function roundCurrency(value) {
  const MICRO_UNITS = 1e6;
  const micros = Math.round(value * MICRO_UNITS);
  return micros / MICRO_UNITS;
}
@@ -5,7 +5,7 @@ import { setTimeout as sleep } from 'node:timers/promises';
5
5
  import { isoTimestamp } from '../cli/utils/time.js';
6
6
  const TASK_ID_PATTERN = /\btask_[a-z]_[a-f0-9]+\b/i;
7
7
  const MAX_LOG_CHARS = 32 * 1024;
8
- const STATUS_RETRY_LIMIT = 3;
8
+ const STATUS_RETRY_LIMIT = 12;
9
9
  const STATUS_RETRY_BACKOFF_MS = 1500;
10
10
  const DEFAULT_LIST_LIMIT = 20;
11
11
  export function extractCloudTaskId(text) {
@@ -104,6 +104,12 @@ export class CodexCloudTaskExecutor {
104
104
  if (input.branch && input.branch.trim()) {
105
105
  execArgs.push('--branch', input.branch.trim());
106
106
  }
107
+ for (const feature of normalizeFeatureList(input.enableFeatures)) {
108
+ execArgs.push('--enable', feature);
109
+ }
110
+ for (const feature of normalizeFeatureList(input.disableFeatures)) {
111
+ execArgs.push('--disable', feature);
112
+ }
107
113
  execArgs.push(input.prompt);
108
114
  const execResult = await runCloudCommand(execArgs);
109
115
  if (execResult.exitCode !== 0) {
@@ -123,6 +129,8 @@ export class CodexCloudTaskExecutor {
123
129
  }
124
130
  const timeoutAt = Date.now() + cloudExecution.timeout_seconds * 1000;
125
131
  let statusRetries = 0;
132
+ let lastKnownStatus = cloudExecution.status;
133
+ let loggedNonZeroStatus = false;
126
134
  while (Date.now() < timeoutAt) {
127
135
  const statusResult = await runCloudCommand(['cloud', 'status', taskId]);
128
136
  cloudExecution.last_polled_at = this.now();
@@ -139,9 +147,14 @@ export class CodexCloudTaskExecutor {
139
147
  await this.sleepFn(STATUS_RETRY_BACKOFF_MS * statusRetries);
140
148
  continue;
141
149
  }
150
+ if (statusResult.exitCode !== 0 && mapped !== 'unknown' && !loggedNonZeroStatus) {
151
+ notes.push(`Cloud status returned exit ${statusResult.exitCode} with remote status ${mapped}; continuing to poll.`);
152
+ loggedNonZeroStatus = true;
153
+ }
142
154
  statusRetries = 0;
143
155
  if (mapped !== 'unknown') {
144
156
  cloudExecution.status = mapped;
157
+ lastKnownStatus = mapped;
145
158
  }
146
159
  if (mapped === 'ready') {
147
160
  notes.push(`Cloud task completed: ${taskId}`);
@@ -155,7 +168,7 @@ export class CodexCloudTaskExecutor {
155
168
  }
156
169
  if (cloudExecution.status === 'running' || cloudExecution.status === 'queued') {
157
170
  cloudExecution.status = 'failed';
158
- cloudExecution.error = `Timed out waiting for cloud task completion after ${cloudExecution.timeout_seconds}s.`;
171
+ cloudExecution.error = `Timed out waiting for cloud task completion after ${cloudExecution.timeout_seconds}s (last remote status: ${lastKnownStatus}, polls: ${cloudExecution.poll_count}).`;
159
172
  }
160
173
  if (cloudExecution.status === 'ready') {
161
174
  const diffResult = await runCloudCommand(['cloud', 'diff', taskId]);
@@ -213,6 +226,25 @@ export class CodexCloudTaskExecutor {
213
226
  }
214
227
  }
215
228
  }
229
/**
 * Cleans a list of feature names: non-string entries are dropped, names are trimmed,
 * blanks are removed, and duplicates are collapsed while keeping first-seen order.
 *
 * @param {unknown} features - Expected to be an array of strings; anything else yields `[]`.
 * @returns {string[]}
 */
function normalizeFeatureList(features) {
  if (!Array.isArray(features)) {
    return [];
  }
  const names = features
    .filter((raw) => typeof raw === 'string')
    .map((raw) => raw.trim())
    .filter((name) => name.length > 0);
  // Set iteration follows insertion order, so the first occurrence of each name wins.
  return [...new Set(names)];
}
216
248
  export async function defaultCloudCommandRunner(request) {
217
249
  return await new Promise((resolve, reject) => {
218
250
  const child = spawn(request.command, request.args, {
package/docs/README.md CHANGED
@@ -53,6 +53,9 @@ Group execution (when `FEATURE_TFGRPO_GROUP=on`): repeat the Builder → Tester
53
53
  - Manifests record the tag, commit SHA, tarball digest/path, queue payload path, and validation status (`validated`, `snapshot_failed`, `stalled_snapshot`, `needs_manual_scenario`) under `learning.*` so reviewers can audit outcomes without external storage.
54
54
  - Scenario synthesis replays the most recent successful command from the run (or prompt/diff fallback), writes `learning/scenario.json`, and automatically executes the commands; validation logs live at `learning/scenario-validation.log` and are stored in `learning.validation.log_path`.
55
55
  - Override snapshot storage with `LEARNING_SNAPSHOT_DIR=/custom/dir` when needed; the default lives under `.runs/learning-snapshots/` (or `$CODEX_ORCHESTRATOR_RUNS_DIR/learning-snapshots/` when configured).
56
+ - Successful pipeline runs also persist lightweight experience records in `out/<task-id>/experiences.jsonl` using prompt-pack domains, so future runs can inject higher-signal context without requiring learning snapshots.
57
+ - Prompt-pack injections apply a minimum reward threshold (`TFGRPO_EXPERIENCE_MIN_REWARD`, default `0.1`) to avoid re-injecting low-signal records.
58
+ - In cloud execution mode, the orchestrator now injects a bounded subset of relevant prompt-pack experience snippets directly into the cloud task prompt, so persisted experience data can influence execution outcomes immediately.
56
59
 
57
60
  ### How to run the learning pipeline locally
58
61
  - Seed a normal run and keep manifests grouped by task:
@@ -199,7 +202,7 @@ Note: the commands below assume a source checkout; `scripts/` helpers are not in
199
202
  | `npm run eval:test` | Optional evaluation harness (enable when `evaluation/fixtures/**` is populated). |
200
203
  | `npm run docs:check` | Deterministically validates scripts/pipelines/paths referenced in agent-facing docs. |
201
204
  | `npm run docs:freshness` | Validates docs registry coverage + review recency; writes `out/<task-id>/docs-freshness.json`. |
202
- | `npm run ci:cloud-canary` | Runs the cloud canary harness (`scripts/cloud-canary-ci.mjs`) to verify cloud lifecycle manifest + run-summary evidence; credential-gated by `CODEX_CLOUD_ENV_ID` and optional auth secrets (`CODEX_CLOUD_BRANCH` defaults to `main`). |
205
+ | `npm run ci:cloud-canary` | Runs the cloud canary harness (`scripts/cloud-canary-ci.mjs`) to verify cloud lifecycle manifest + run-summary evidence; credential-gated by `CODEX_CLOUD_ENV_ID` and optional auth secrets (`CODEX_CLOUD_BRANCH` defaults to `main`). Feature flags can be passed through with `CODEX_CLOUD_ENABLE_FEATURES` / `CODEX_CLOUD_DISABLE_FEATURES` (comma- or space-delimited, e.g. `sqlite,memory_tool`). |
203
206
  | `node scripts/delegation-guard.mjs` | Enforces subagent delegation evidence before review (repo-only). |
204
207
  | `node scripts/spec-guard.mjs --dry-run` | Validates spec freshness; required before review (repo-only). |
205
208
  | `node scripts/diff-budget.mjs` | Guards against oversized diffs before review (repo-only; defaults: 25 files / 800 lines; supports explicit overrides). |
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@kbediako/codex-orchestrator",
3
- "version": "0.1.16",
3
+ "version": "0.1.18",
4
4
  "license": "MIT",
5
5
  "repository": {
6
6
  "type": "git",
@@ -62,6 +62,7 @@
62
62
  "eval:test": "vitest run evaluation/tests",
63
63
  "test:watch": "vitest",
64
64
  "review": "node --loader ts-node/esm scripts/run-review.ts",
65
+ "pr:watch-merge": "node scripts/pr-monitor-merge.mjs",
65
66
  "status-ui": "node scripts/status-ui-serve.mjs",
66
67
  "design:purge-expired": "node --loader ts-node/esm scripts/design/purgeExpired.ts",
67
68
  "generate:manifest-types": "node scripts/generate-manifest-types.mjs",
@@ -5,17 +5,81 @@ description: Structure multi-agent brainstorming and deliberation (options, trad
5
5
 
6
6
  # Collab Deliberation
7
7
 
8
- Use this skill when the user asks for brainstorming, multiple approaches, pros/cons, or decision support. This skill is for **ideas**, not implementation.
8
+ Use this skill when the user asks for brainstorming, tradeoffs, option comparison, or decision support before implementation. This skill is for ideas and decisions, not coding.
9
9
 
10
- ## Workflow
10
+ ## Deliberation Default v1 (required)
11
+ - Keep MCP as the lead control plane. Use collab/delegated subagents to generate and challenge options.
12
+ - Run full deliberation when any hard-stop trigger is true:
13
+ - Irreversible/destructive change with unclear rollback.
14
+ - Auth/secrets/PII boundary touched.
15
+ - Direct production customer/financial/legal impact.
16
+ - Conflicting intent on high-impact work.
17
+ - Otherwise compute a risk score (`0..2` each): reversibility, external impact, security/privacy boundary, blast radius, requirement clarity, verification strength, time pressure.
18
+ - Run full deliberation when score `>=7` or two or more criteria score `2`.
19
+ - Use these time budgets for auto-deliberation:
11
20
 
12
- 1) Clarify the decision: summarize the goal, constraints, and success criteria.
13
- 2) Generate options: 3–5 distinct approaches with short descriptions.
14
- 3) Compare tradeoffs: cost, risk, speed, maintenance, and alignment with guardrails.
15
- 4) Recommend: choose a recommended approach and explain why.
16
- 5) Open questions: list 1–3 questions that would change the recommendation.
21
+ | Class | Horizon | Soft cap | Hard cap |
22
+ | --- | --- | --- | --- |
23
+ | `T0` | `<=15m` | `5s` | `12s` |
24
+ | `T1` | `15m..2h` | `20s` | `45s` |
25
+ | `T2` | `2h..8h` | `60s` | `120s` |
26
+ | `T3` | `>8h` | `120s` | `300s` |
27
+
28
+ - On soft cap: stop branching and execute the best current plan.
29
+ - On hard cap: disable auto-deliberation for that stage and continue execution.
30
+
31
+ ## Auto-trigger cadence (required)
32
+ - Run deliberation at task bootstrap for non-trivial work.
33
+ - Re-run deliberation after each meaningful chunk (default: behavior change or about 2+ files touched).
34
+ - Re-run deliberation when external feedback lands (PR review, bot findings, CI failures).
35
+ - Re-run deliberation when ambiguity/risk increases mid-flight (new constraints, conflicting evidence, high-signal `P1` or any `P0` finding).
36
+ - Re-run deliberation at least every 45 minutes during active implementation.
37
+ - If orchestration uses symbolic RLM, keep runtime auto-deliberation enabled:
38
+ - `RLM_SYMBOLIC_DELIBERATION=1` (default)
39
+ - `RLM_SYMBOLIC_DELIBERATION_INTERVAL` (default `2`)
40
+ - `RLM_SYMBOLIC_DELIBERATION_MAX_RUNS` (default `12`)
41
+ - `RLM_SYMBOLIC_DELIBERATION_MAX_SUMMARY_BYTES` (default `2048`)
42
+ - `RLM_SYMBOLIC_DELIBERATION_INCLUDE_IN_PLANNER=1` (default)
43
+
44
+ ## Workflow (required)
45
+ 1) Frame the decision.
46
+ - Write a one-sentence decision statement.
47
+ - Capture goals, constraints, non-goals, and success criteria.
48
+ - List assumptions and label each `confirmed` or `unconfirmed`.
49
+
50
+ 2) Close critical context gaps.
51
+ - Ask up to 3 targeted questions only if answers could change the recommendation.
52
+ - If delegation is available, prefer a subagent for context gathering before asking the user.
53
+
54
+ 3) Generate distinct options.
55
+ - Produce 3-5 materially different options.
56
+ - For each option include approach, prerequisites, blast radius, and time/risk profile.
57
+
58
+ 4) Evaluate and stress test.
59
+ - Use a tailored rubric (3-5 dimensions relevant to the decision).
60
+ - For each option include one likely failure mode and one mitigation.
61
+
62
+ 5) Recommend or defer explicitly.
63
+ - Recommend one option when confidence is sufficient.
64
+ - If uncertainty is high, defer with explicit decision gates.
65
+
66
+ 6) Close with decision-driving questions.
67
+ - List 1-3 prioritized open questions that could change the recommendation.
68
+ - End with one concrete next step that improves decision quality without implementation.
69
+
70
+ ## Output contract
71
+ - `Decision`: one sentence.
72
+ - `Context`: goals, constraints, non-goals, assumptions.
73
+ - `Options`: 3-5 concise options.
74
+ - `Tradeoffs`: rubric and comparative rationale.
75
+ - `Recommendation`: chosen option or explicit defer with decision gates.
76
+ - `Open questions`: prioritized items only.
77
+ - `Next step`: single highest-leverage action.
78
+ - `Confidence`: `high` | `medium` | `low`.
17
79
 
18
80
  ## Guardrails
19
- - Separate ideas from decisions.
81
+ - Separate facts from assumptions.
20
82
  - Do not implement or modify code unless explicitly asked.
83
+ - Do not present uncertainty as certainty.
21
84
  - Keep outputs concise and action-oriented.
85
+ - If collab subagents are used, close lifecycle loops per id (`spawn_agent` -> `wait` -> `close_agent`) before finishing.
@@ -1,49 +1,21 @@
1
1
  ---
2
2
  name: delegate-early
3
- description: Spawn subagents early and often to conserve context and parallelize research, review, and planning.
3
+ description: Compatibility alias for delegation-first workflows. Prefer `delegation-usage` as the canonical skill.
4
4
  ---
5
5
 
6
- # Delegate Early
6
+ # Delegate Early (Compatibility Alias)
7
7
 
8
- Use this skill when a task can be split into parallel streams or when the main context risks ballooning. The top-level Codex remains the lead; subagents are assistants.
8
+ `delegate-early` is kept for backward compatibility. The canonical delegation workflow now lives in `delegation-usage`.
9
9
 
10
- ## Goals
11
- - Conserve primary context by offloading research/review/planning.
12
- - Improve throughput with parallel subagent streams.
13
- - Capture delegation evidence for auditability.
10
+ ## Required behavior
11
+ - Immediately follow `delegation-usage` for setup, spawn semantics, question queue handling, confirmation flow, and manifest usage.
12
+ - Keep delegation MCP enabled by default; enable other MCP servers only when relevant to the task.
13
+ - Preserve delegation evidence (task-id stream naming + manifest path capture) exactly as documented in `delegation-usage`.
14
14
 
15
- ## When to spawn
16
- - Before deep reading/analysis to avoid bloating context.
17
- - When new ambiguity appears or scope changes.
18
- - For independent streams (research, review, planning, edge cases).
15
+ ## Quick routing
16
+ 1. Use `delegation-usage` as the source of truth.
17
+ 2. Apply early fan-out only when streams are clearly independent and acceptance criteria are explicit.
18
+ 3. Keep summaries short and artifact-first; avoid long chat dumps.
19
19
 
20
- ## Task slicing heuristic
21
- - Identify 2–4 independent streams with minimal shared context.
22
- - Prefer streams like: `research`, `review`, `spec-check`, `edge-cases`.
23
-
24
- ## Required conventions
25
- - Use `MCP_RUNNER_TASK_ID=<task-id>-<stream>` for subagents.
26
- - Record manifest paths and summarize findings in the main run.
27
- - Before review handoff, run the delegation guard stage via the packaged runner:
28
- `node "$CODEX_ORCHESTRATOR_PACKAGE_ROOT/dist/orchestrator/src/cli/utils/delegationGuardRunner.js"`.
29
- For ad-hoc runs without task IDs, set `CODEX_ORCHESTRATOR_GUARD_PROFILE=warn`.
30
-
31
- ## Minimal delegation workflow
32
- 1) Name streams and write 1–2 sentence goals for each.
33
- 2) Spawn subagents with clear, bounded prompts.
34
- 3) Wait for subagent completion; retrieve manifest evidence and summarize findings into the main plan.
35
- 4) Proceed with implementation.
36
-
37
- ## Prompt patterns
38
- - Research: “Find X, cite Y, return 3 bullets + risks.”
39
- - Review: “Inspect files A/B for regressions; list issues by severity.”
40
- - Planning: “Draft a 3–5-step plan, call out unknowns.”
41
-
42
- ## Escalation rules
43
- - If delegation is impossible, set `DELEGATION_GUARD_OVERRIDE_REASON` and document it in the task checklist.
44
-
45
- ## Subagent summary format
46
- - **Findings**: Key results and conclusions from the subagent run
47
- - **Risks**: Issues, blockers, or concerns
48
- - **Open questions**: Unresolved items requiring follow-up
49
- - **Evidence**: Manifest path (e.g., `.runs/<task-id>-<stream>/cli/<timestamp>/manifest.json`)
20
+ ## Note
21
+ If guidance in this file conflicts with `delegation-usage`, follow `delegation-usage`.
@@ -1,6 +1,7 @@
1
1
  # Delegation Guide (Detailed)
2
2
 
3
3
  Use this guide for deeper context on delegation behavior, tool surfaces, and troubleshooting.
4
+ `delegation-usage` is the canonical delegation workflow; `delegate-early` should be treated as a compatibility alias.
4
5
 
5
6
  ## Mental model
6
7
 
@@ -80,6 +81,16 @@ delegate.spawn({
80
81
  })
81
82
  ```
82
83
 
84
+ ## Collab lifecycle hygiene (required)
85
+
86
+ When using collab tools (`spawn_agent` / `wait` / `close_agent`):
87
+
88
+ - Treat each spawned `agent_id` as a resource that must be closed.
89
+ - For every successful spawn, run `wait` then `close_agent` for the same id.
90
+ - Keep a local list of spawned ids and run a final cleanup pass before returning.
91
+ - On timeout/error paths, still close known ids before reporting failure.
92
+ - If you see `agent thread limit reached`, stop spawning immediately, close known ids, and retry only after cleanup.
93
+
83
94
  ## RLM budget overrides (recommended defaults)
84
95
 
85
96
  If you want deeper recursion or longer wall-clock time for delegated runs, set RLM budgets on the delegation server:
@@ -106,12 +117,12 @@ If you need delegation to respect a repo’s `.codex/orchestrator.toml` (e.g., s
106
117
 
107
118
  ## Version guard (JSONL handshake)
108
119
 
109
- Delegation MCP expects JSONL. Use `codex-orchestrator` 0.1.12 or newer.
120
+ Delegation MCP expects JSONL. Keep `codex-orchestrator` aligned with the current release line.
110
121
 
111
122
  - Check: `codex-orchestrator --version`
112
- - Update global: `npm i -g @kbediako/codex-orchestrator@0.1.12`
113
- - Or pin via npx: `npx -y @kbediako/codex-orchestrator@0.1.12 delegate-server`
114
- - If your installed CLI is behind 0.1.12, prefer upgrading or pinning to the docs’ minimum.
123
+ - Update global: `npm i -g @kbediako/codex-orchestrator@latest`
124
+ - Or pin via npx: `npx -y @kbediako/codex-orchestrator@<version> delegate-server`
125
+ - If using a custom Codex fork, fast-forward from `upstream/main` regularly and rebuild to avoid protocol drift.
115
126
 
116
127
  ## Common failures
117
128
 
@@ -120,3 +131,6 @@ Delegation MCP expects JSONL. Use `codex-orchestrator` 0.1.12 or newer.
120
131
  - **Tool profile ignored**: The repo’s `delegate.allowed_tool_servers` may be empty, or names are invalid.
121
132
  - **Missing control files**: delegate tools rely on `control_endpoint.json` in the run directory.
122
133
  - **Run identifiers**: status/pause/cancel require `manifest_path`; question queue requires `parent_manifest_path`.
134
+ - **Collab payload mismatch**: `spawn_agent` calls fail if they include both `message` and `items`.
135
+ - **Collab depth limits**: recursive collab fan-out can fail near max depth; prefer shallow parent fan-out.
136
+ - **Collab lifecycle leaks**: missing `close_agent` calls can exhaust thread slots and block future spawns (`agent thread limit reached`).
@@ -9,8 +9,21 @@ description: Use when operating the Codex delegation MCP server and tools (deleg
9
9
 
10
10
  Use this skill to operate delegation MCP tools with delegation enabled by default (the only MCP on by default). Disable it only when required by safety constraints, and keep other MCPs off unless they are relevant to the task.
11
11
 
12
+ `delegation-usage` is the canonical delegation workflow skill. If `delegate-early` is present, treat it as a compatibility alias that should redirect to this skill.
13
+
12
14
  Collab multi-agent mode is separate from delegation. For symbolic RLM subcalls that use collab tools, set `RLM_SYMBOLIC_COLLAB=1` and ensure a collab-capable Codex CLI; collab tool calls are recorded in `manifest.collab_tool_calls`. If collab tools are unavailable in your CLI build, skip collab steps; delegation still works independently.
13
15
 
16
+ ## Collab realities in delegated runs (current behavior)
17
+
18
+ - `spawn_agent` accepts one input style per call: either `message` (plain text) or `items` (structured input).
19
+ - Do not send both `message` and `items` in the same `spawn_agent` call.
20
+ - Spawn returns an `agent_id` (thread id). Current TUI collab rendering is id-based; do not depend on custom visible agent names.
21
+ - Subagents spawned through collab run with approval effectively set to `never`; design child tasks to avoid approval/escalation requirements.
22
+ - Collab spawn depth is bounded. Near/at max depth, recursive delegation can fail or collab can be disabled in children; prefer shallow parent fan-out.
23
+ - **Lifecycle is mandatory:** for every successful `spawn_agent`, run `wait` and then `close_agent` for that same id before task completion.
24
+ - Keep a local list of spawned ids and run a final cleanup pass so no agent id is left unclosed on timeout/error paths.
25
+ - If spawn fails with `agent thread limit reached`, stop spawning, close any known ids first, then surface a concise recovery note.
26
+
14
27
  ## Quick-start workflow (canned)
15
28
 
16
29
  Use this when delegation tools are missing in the current run (MCP disabled) and you want a background Codex run to handle delegation:
@@ -64,12 +77,11 @@ For runner + delegation coordination (short `--task` flow), see `docs/delegation
64
77
 
65
78
  ### 0a) Version guard (JSONL handshake)
66
79
 
67
- - Delegation MCP uses JSONL; ensure the server binary meets the docs’ minimum version (0.1.12):
68
- - `codex-orchestrator --version` should be `>= 0.1.12`.
69
- - If not, update global install: `npm i -g @kbediako/codex-orchestrator@0.1.12`
70
- - Alternative: pin the MCP server to `npx -y @kbediako/codex-orchestrator@0.1.12` for deterministic behavior.
71
- - Note: if your installed CLI is older than 0.1.12, prefer upgrading or pinning to the docs’ minimum.
72
- - Keep the version pins in this section in sync with the docs’ minimum (currently 0.1.12).
80
+ - Delegation MCP uses JSONL; keep `codex-orchestrator` aligned with the current release line.
81
+ - Check installed version: `codex-orchestrator --version`
82
+ - Preferred update path: `npm i -g @kbediako/codex-orchestrator@latest`
83
+ - Deterministic pin path (for reproducible environments): `npx -y @kbediako/codex-orchestrator@<version> delegate-server`
84
+ - If using a custom Codex fork, fast-forward it regularly from `upstream/main` and rebuild the managed CLI to avoid delegation/collab protocol drift.
73
85
 
74
86
  ### 0b) Background terminal bootstrap (required when MCP is disabled)
75
87
 
@@ -163,3 +175,6 @@ repeat:
163
175
  - **Confirmation misuse:** never pass `confirm_nonce` from model/tool input; it is runner‑injected only.
164
176
  - **Secrets exposure:** never include secrets/tokens/PII in delegate prompts or files.
165
177
  - **Missing control files:** delegate tools rely on `control_endpoint.json` in the run directory; older runs may not have it.
178
+ - **Collab payload mismatch:** `spawn_agent` rejects calls that include both `message` and `items`.
179
+ - **Collab UI assumptions:** agent rows/records are id-based today; use explicit stream role text in prompts/artifacts for operator clarity.
180
+ - **Collab lifecycle leaks:** missing `close_agent` calls accumulate open threads and can trigger `agent thread limit reached`; always finish `spawn -> wait -> close_agent` per id.
@@ -16,6 +16,7 @@ Use this skill when a task needs a spec-driven workflow. The objective is to cre
16
16
  - TECH_SPEC: capture technical requirements (use `.agent/task/templates/tech-spec-template.md`; stored under `tasks/specs/<id>-<slug>.md`).
17
17
  - ACTION_PLAN: capture sequencing/milestones (use `.agent/task/templates/action-plan-template.md`).
18
18
  - Depth scales with scope, but all three docs are required.
19
+ - For low-risk tiny edits, follow the bounded shortcut in `docs/micro-task-path.md` instead of long-form rewrites (still requires task/spec evidence).
19
20
 
20
21
  2) Register the TECH_SPEC and task
21
22
  - Add the TECH_SPEC to `tasks/index.json` (including `last_review`).
@@ -1,6 +1,6 @@
1
1
  ---
2
2
  name: standalone-review
3
- description: Use for ad-hoc/standalone reviews outside pipelines (fast checks during implementation or before handoff) using `codex review`.
3
+ description: Use for required periodic cross-check reviews during implementation and before handoff using `codex review`.
4
4
  ---
5
5
 
6
6
  # Standalone Review
@@ -10,6 +10,17 @@ description: Use for ad-hoc/standalone reviews outside pipelines (fast checks du
10
10
  Use this skill when you need a fast, ad-hoc review without running a pipeline or collecting a manifest. It is ideal during implementation or for quick pre-flight checks.
11
11
  Before implementation, use it to review the task/spec against the user’s intent and record the approval in the PRD/TECH_SPEC or task notes.
12
12
 
13
+ ## Auto-trigger policy (required)
14
+
15
+ Run this skill automatically whenever any condition is true:
16
+ - You made code/config/script/test edits since the last standalone review.
17
+ - You finished a meaningful chunk of work (default: behavior change or about 2+ files touched).
18
+ - You are about to report completion, propose merge, or answer "what's next?" with recommendations.
19
+ - You addressed external feedback (PR reviews, bot comments, or CI-fix patches).
20
+ - 45 minutes of active implementation elapsed without a standalone review.
21
+
22
+ If review execution is blocked, record why in task notes, then do manual diff review plus targeted tests before proceeding.
23
+
13
24
  ## Quick start
14
25
 
15
26
  Uncommitted diff:
@@ -39,6 +50,7 @@ codex review "Focus on correctness, regressions, edge cases; list missing tests.
39
50
  - Keep prompts short, specific, and test-oriented.
40
51
 
41
52
  2) Run the review often
53
+ - Follow the auto-trigger policy above (not optional).
42
54
  - Run after each meaningful chunk of work.
43
55
  - Prefer targeted focus prompts for WIP reviews.
44
56
 
@@ -1,9 +1,11 @@
1
- <!-- codex:instruction-stamp 7477a786938b5c7f883b7d21a0954b48a1ddbbc3dcabd052b347b303cb3075a4 -->
1
+ <!-- codex:instruction-stamp 2408396e5cc9b25d5522b7064010a36a43007508072f3e0f051ab042370928a1 -->
2
2
  # Agent Instructions (Template)
3
3
 
4
4
  ## Orchestrator-first workflow
5
5
  - Use `codex-orchestrator` pipelines for planning, implementation, validation, and review.
6
6
  - Default to `docs-review` before implementation and `implementation-gate` after code changes.
7
+ - Prefer cloud mode when runs are long-running/parallel and cloud prerequisites are ready.
8
+ - Before cloud mode, verify branch availability, non-interactive setup commands, and required secrets/variables; if missing, run in local `mcp` mode and record why.
7
9
  - Before implementation, run a standalone review of the task/spec against the user’s intent and record the approval in the spec + checklist notes.
8
10
  - Delegation is mandatory for top-level tasks once a task id exists: spawn at least one subagent run using `MCP_RUNNER_TASK_ID=<task-id>-<stream>`, capture manifest evidence, and summarize in the main run. Use `DELEGATION_GUARD_OVERRIDE_REASON` only when delegation is impossible (technical/blocking limitation or explicit operational block) and record the justification.
9
11
  - Once a task id exists, prefer delegation for research, review, and planning work. Use `codex exec` only for pre-task triage (no task id yet) or when delegation is genuinely unavailable (technical/blocking limitation or explicit operational block), and set `DELEGATION_GUARD_OVERRIDE_REASON` with a clear justification.
@@ -18,11 +20,38 @@
18
20
  - Use `codex review` for quick checks during implementation.
19
21
  - Capture standalone review approval in the spec/task notes before implementation begins.
20
22
  - When you need manifest-backed review evidence, run `npm run review` with the manifest path.
23
+ - Before merge for non-trivial changes, run one explicit elegance/minimality review pass and simplify avoidable complexity.
21
24
 
22
25
  ## Delegation (recommended)
23
26
  - For non-trivial work, spawn at least one subagent run using `MCP_RUNNER_TASK_ID=<task-id>-<stream>`.
24
27
  - If delegation is not possible, record the reason in the task checklist.
25
28
 
29
+ ## Deliberation Default (agent-first)
30
+ - Keep MCP as the lead control plane. Use collab/delegated subagents for deliberation when ambiguity or impact is high.
31
+ - Run full deliberation on any hard-stop trigger:
32
+ - Irreversible/destructive changes with unclear rollback.
33
+ - Auth/secrets/PII boundary changes.
34
+ - Direct production customer/financial/legal impact.
35
+ - Conflicting intent on high-impact changes.
36
+ - Otherwise, use a simple risk score (`0..2` each): reversibility, external impact, security/privacy boundary, blast radius, requirement clarity, verification strength, time pressure.
37
+ - Require full deliberation when score `>=7` or two or more criteria score `2`.
38
+ - Time budgets for auto-deliberation:
39
+ - `T0` quick: `5s / 12s` (soft/hard)
40
+ - `T1` standard: `20s / 45s`
41
+ - `T2` complex: `60s / 120s`
42
+ - `T3` long-horizon: `120s / 300s`
43
+ - On soft cap: stop branching and execute the best current plan.
44
+ - On hard cap: disable auto-deliberation for that stage and continue execution.
45
+ - Review-signal policy:
46
+ - `P0` critical findings are hard-stop.
47
+ - `P1` high findings are hard-stop only when high-signal (clear evidence or corroboration).
48
+ - `P2/P3` findings are tracked follow-ups.
49
+
50
+ ## Completion discipline (patience-first)
51
+ - Wait/poll for terminal state on long-running operations (CI checks, reviews, cloud jobs, orchestrator runs) before reporting completion.
52
+ - Reset waiting windows when checks restart or new feedback appears.
53
+ - Do not hand off mid-flight work unless the user explicitly asks to stop.
54
+
26
55
  ## Instruction stamp
27
56
  - If you edit this file, refresh the instruction stamp.
28
57
  - One-liner: