@kbediako/codex-orchestrator 0.1.16 → 0.1.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/bin/codex-orchestrator.js +119 -2
- package/dist/orchestrator/src/cli/exec/experience.js +20 -2
- package/dist/orchestrator/src/cli/exec/tfgrpo.js +13 -1
- package/dist/orchestrator/src/cli/orchestrator.js +120 -2
- package/dist/orchestrator/src/cli/rlm/symbolic.js +494 -38
- package/dist/orchestrator/src/cli/rlmRunner.js +248 -15
- package/dist/orchestrator/src/cli/run/manifest.js +200 -4
- package/dist/orchestrator/src/cli/services/pipelineExperience.js +122 -0
- package/dist/orchestrator/src/cloud/CodexCloudTaskExecutor.js +34 -2
- package/docs/README.md +4 -1
- package/package.json +2 -1
- package/skills/collab-deliberation/SKILL.md +72 -8
- package/skills/delegate-early/SKILL.md +13 -41
- package/skills/delegation-usage/DELEGATION_GUIDE.md +18 -4
- package/skills/delegation-usage/SKILL.md +21 -6
- package/skills/docs-first/SKILL.md +1 -0
- package/skills/standalone-review/SKILL.md +13 -1
- package/templates/codex/AGENTS.md +30 -1
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
import { relativeToRepo } from '../run/runPaths.js';
|
|
2
|
+
import { ExperienceStore } from '../../persistence/ExperienceStore.js';
|
|
3
|
+
import { loadInstructionSet } from '../../../../packages/orchestrator/src/instructions/loader.js';
|
|
4
|
+
import { logger } from '../../logger.js';
|
|
5
|
+
// Value written to `reward.gtScore` on every record this module persists.
const SUCCESS_REWARD = 1;
|
|
6
|
+
// USD rate multiplied by the summary word count to fill `toolStats[].costUsd`.
const COST_PER_TOKEN_USD = 0.000002;
|
|
7
|
+
/**
 * Record one experience entry for a pipeline run that finished successfully.
 *
 * This is best-effort: any error raised while loading instructions, building
 * the record, or writing it is caught and logged as a warning, so the promise
 * never rejects because of experience bookkeeping.
 *
 * Nothing is written when the run did not succeed, when no prompt pack has
 * `experienceSlots > 0`, when no domain/pack can be resolved, or when the
 * manifest yields no summary text.
 *
 * @param {object} params
 * @param {object} params.env - Supplies `repoRoot`, `outRoot` and `runsRoot`.
 * @param {object} params.pipeline - Pipeline descriptor (`id`, `title`, optional `tags`).
 * @param {object} params.manifest - Run manifest (`status`, `run_id`, `task_id`, `summary`, `commands`, timestamps).
 * @param {object} params.paths - Run paths; `manifestPath` is handed to the store via `relativeToRepo`.
 * @returns {Promise<void>}
 */
export async function persistPipelineExperience(params) {
    const { env, pipeline, manifest, paths } = params;
    if (manifest.status !== 'succeeded') {
        return;
    }
    try {
        const instructions = await loadInstructionSet(env.repoRoot);
        const promptPacks = instructions.promptPacks.filter((pack) => pack.experienceSlots > 0);
        if (promptPacks.length === 0) {
            return;
        }
        const domain = resolveExperienceDomain(pipeline, promptPacks);
        if (!domain) {
            return;
        }
        // Domain names come back trimmed from resolveExperienceDomain, so the
        // lookup must trim too; otherwise a pack declared with surrounding
        // whitespace would never be selected and the record silently dropped.
        const selectedPack = promptPacks.find((pack) => pack.domain.trim() === domain);
        if (!selectedPack) {
            return;
        }
        const summary = summarizePipelineOutcome(manifest);
        if (!summary) {
            return;
        }
        // The "token" figure is the summary's word count, floored at 1.
        const tokenCount = Math.max(1, countWords(summary));
        const durationMs = resolveDurationMs(manifest);
        const record = {
            runId: manifest.run_id,
            taskId: manifest.task_id,
            epoch: null,
            groupId: null,
            summary,
            reward: { gtScore: SUCCESS_REWARD, relativeRank: 0 },
            toolStats: [
                {
                    tool: `pipeline:${pipeline.id}`,
                    tokens: tokenCount,
                    latencyMs: durationMs,
                    costUsd: roundCurrency(tokenCount * COST_PER_TOKEN_USD)
                }
            ],
            stampSignature: selectedPack.stamp,
            domain
        };
        const store = new ExperienceStore({
            outDir: env.outRoot,
            runsDir: env.runsRoot,
            maxSummaryWords: instructions.experienceMaxWords
        });
        await store.recordBatch([record], relativeToRepo(env, paths.manifestPath));
    }
    catch (error) {
        // Deliberately non-fatal: report and move on.
        logger.warn(`Failed to persist pipeline experience for run ${manifest.run_id}: ${error?.message ?? String(error)}`);
    }
}
|
|
61
|
+
/**
 * Choose the prompt-pack domain that an experience record should be filed under.
 *
 * Resolution order:
 *   1. the first domain whose lower-cased name occurs as a substring of the
 *      pipeline's id, title and tags;
 *   2. `'implementation'`, when a pack with that domain exists;
 *   3. the first domain in pack order.
 *
 * @param {object} pipeline - Pipeline descriptor; reads `id`, `title` and optional `tags`.
 * @param {Array<{domain: string}>} promptPacks - Candidate prompt packs.
 * @returns {string|null} The chosen (trimmed) domain, or `null` when the packs expose no usable domain.
 */
export function resolveExperienceDomain(pipeline, promptPacks) {
    const domains = uniqueDomains(promptPacks);
    if (domains.length === 0) {
        return null;
    }
    // Array#join renders null/undefined entries as empty strings, so a pipeline
    // without a title no longer contributes the literal text "undefined" to the
    // haystack (which could spuriously match a domain name).
    const tags = Array.isArray(pipeline.tags) ? pipeline.tags : [];
    const haystack = normalizeSummary([pipeline.id, pipeline.title, ...tags].join(' ')).toLowerCase();
    const directMatch = domains.find((domain) => haystack.includes(domain.toLowerCase()));
    if (directMatch) {
        return directMatch;
    }
    if (domains.includes('implementation')) {
        return 'implementation';
    }
    // `domains` is non-empty here, so the first entry always exists.
    return domains[0];
}
|
|
76
|
+
/**
 * Build a one-line summary of a run from its manifest.
 *
 * The result is the manifest's own summary (when it is a non-blank string)
 * followed by the summary — or, failing that, the title — of up to the first
 * two succeeded commands, all whitespace-normalized and joined with `' | '`.
 *
 * @param {object} manifest - Run manifest; reads `summary` and `commands`.
 * @returns {string|null} The joined summary, or `null` when there is nothing to report.
 */
export function summarizePipelineOutcome(manifest) {
    const chunks = [];
    if (typeof manifest.summary === 'string' && manifest.summary.trim().length > 0) {
        chunks.push(normalizeSummary(manifest.summary));
    }
    // Tolerate manifests without a command list instead of throwing.
    const commands = Array.isArray(manifest.commands) ? manifest.commands : [];
    const stageHighlights = commands
        .filter((command) => command?.status === 'succeeded')
        .map((command) => command.summary ?? command.title)
        // A command with neither summary nor title has nothing to contribute;
        // skipping it avoids calling string methods on undefined.
        .filter((text) => typeof text === 'string')
        .map((text) => normalizeSummary(text))
        .filter((text) => text.length > 0)
        .slice(0, 2);
    chunks.push(...stageHighlights);
    if (chunks.length === 0) {
        return null;
    }
    return chunks.join(' | ');
}
|
|
92
|
+
/**
 * List the distinct domain names declared by the given prompt packs.
 *
 * Names are trimmed, blank names are dropped, and the first occurrence of each
 * name determines its position in the result.
 *
 * @param {Array<{domain: string}>} promptPacks
 * @returns {string[]}
 */
function uniqueDomains(promptPacks) {
    const trimmedNames = promptPacks
        .map((entry) => entry.domain.trim())
        .filter((name) => name.length > 0);
    // A Set iterates in insertion order, which keeps first-seen ordering.
    return [...new Set(trimmedNames)];
}
|
|
105
|
+
/**
 * Compute how long a run took, in milliseconds, from its manifest timestamps.
 *
 * `completed_at` falls back to `started_at` when it is null/undefined. The
 * result is 0 when either timestamp fails to parse or when completion
 * precedes the start.
 *
 * @param {{started_at: string, completed_at?: string|null}} manifest
 * @returns {number} Non-negative elapsed milliseconds.
 */
function resolveDurationMs(manifest) {
    const beginMs = Date.parse(manifest.started_at);
    const finishMs = Date.parse(manifest.completed_at ?? manifest.started_at);
    const bothParsed = Number.isFinite(beginMs) && Number.isFinite(finishMs);
    if (!bothParsed) {
        return 0;
    }
    const elapsed = finishMs - beginMs;
    return elapsed > 0 ? elapsed : 0;
}
|
|
113
|
+
/**
 * Collapse every run of whitespace in `value` to a single space and strip
 * leading/trailing whitespace.
 *
 * @param {string} value
 * @returns {string}
 */
function normalizeSummary(value) {
    const singleSpaced = value.replace(/\s+/gu, ' ');
    return singleSpaced.trim();
}
|
|
116
|
+
/**
 * Count the whitespace-separated words in `value`.
 *
 * @param {string} value
 * @returns {number} Number of non-empty tokens; 0 for an empty or all-whitespace string.
 */
function countWords(value) {
    // Each maximal run of non-whitespace characters is one word.
    const words = value.trim().match(/\S+/gu);
    return words === null ? 0 : words.length;
}
|
|
120
|
+
/**
 * Round a currency amount to six decimal places.
 *
 * @param {number} value
 * @returns {number}
 */
function roundCurrency(value) {
    const scale = 1_000_000;
    const scaled = Math.round(value * scale);
    return scaled / scale;
}
|
|
@@ -5,7 +5,7 @@ import { setTimeout as sleep } from 'node:timers/promises';
|
|
|
5
5
|
import { isoTimestamp } from '../cli/utils/time.js';
|
|
6
6
|
const TASK_ID_PATTERN = /\btask_[a-z]_[a-f0-9]+\b/i;
|
|
7
7
|
const MAX_LOG_CHARS = 32 * 1024;
|
|
8
|
-
const STATUS_RETRY_LIMIT =
|
|
8
|
+
const STATUS_RETRY_LIMIT = 12;
|
|
9
9
|
const STATUS_RETRY_BACKOFF_MS = 1500;
|
|
10
10
|
const DEFAULT_LIST_LIMIT = 20;
|
|
11
11
|
export function extractCloudTaskId(text) {
|
|
@@ -104,6 +104,12 @@ export class CodexCloudTaskExecutor {
|
|
|
104
104
|
if (input.branch && input.branch.trim()) {
|
|
105
105
|
execArgs.push('--branch', input.branch.trim());
|
|
106
106
|
}
|
|
107
|
+
for (const feature of normalizeFeatureList(input.enableFeatures)) {
|
|
108
|
+
execArgs.push('--enable', feature);
|
|
109
|
+
}
|
|
110
|
+
for (const feature of normalizeFeatureList(input.disableFeatures)) {
|
|
111
|
+
execArgs.push('--disable', feature);
|
|
112
|
+
}
|
|
107
113
|
execArgs.push(input.prompt);
|
|
108
114
|
const execResult = await runCloudCommand(execArgs);
|
|
109
115
|
if (execResult.exitCode !== 0) {
|
|
@@ -123,6 +129,8 @@ export class CodexCloudTaskExecutor {
|
|
|
123
129
|
}
|
|
124
130
|
const timeoutAt = Date.now() + cloudExecution.timeout_seconds * 1000;
|
|
125
131
|
let statusRetries = 0;
|
|
132
|
+
let lastKnownStatus = cloudExecution.status;
|
|
133
|
+
let loggedNonZeroStatus = false;
|
|
126
134
|
while (Date.now() < timeoutAt) {
|
|
127
135
|
const statusResult = await runCloudCommand(['cloud', 'status', taskId]);
|
|
128
136
|
cloudExecution.last_polled_at = this.now();
|
|
@@ -139,9 +147,14 @@ export class CodexCloudTaskExecutor {
|
|
|
139
147
|
await this.sleepFn(STATUS_RETRY_BACKOFF_MS * statusRetries);
|
|
140
148
|
continue;
|
|
141
149
|
}
|
|
150
|
+
if (statusResult.exitCode !== 0 && mapped !== 'unknown' && !loggedNonZeroStatus) {
|
|
151
|
+
notes.push(`Cloud status returned exit ${statusResult.exitCode} with remote status ${mapped}; continuing to poll.`);
|
|
152
|
+
loggedNonZeroStatus = true;
|
|
153
|
+
}
|
|
142
154
|
statusRetries = 0;
|
|
143
155
|
if (mapped !== 'unknown') {
|
|
144
156
|
cloudExecution.status = mapped;
|
|
157
|
+
lastKnownStatus = mapped;
|
|
145
158
|
}
|
|
146
159
|
if (mapped === 'ready') {
|
|
147
160
|
notes.push(`Cloud task completed: ${taskId}`);
|
|
@@ -155,7 +168,7 @@ export class CodexCloudTaskExecutor {
|
|
|
155
168
|
}
|
|
156
169
|
if (cloudExecution.status === 'running' || cloudExecution.status === 'queued') {
|
|
157
170
|
cloudExecution.status = 'failed';
|
|
158
|
-
cloudExecution.error = `Timed out waiting for cloud task completion after ${cloudExecution.timeout_seconds}s.`;
|
|
171
|
+
cloudExecution.error = `Timed out waiting for cloud task completion after ${cloudExecution.timeout_seconds}s (last remote status: ${lastKnownStatus}, polls: ${cloudExecution.poll_count}).`;
|
|
159
172
|
}
|
|
160
173
|
if (cloudExecution.status === 'ready') {
|
|
161
174
|
const diffResult = await runCloudCommand(['cloud', 'diff', taskId]);
|
|
@@ -213,6 +226,25 @@ export class CodexCloudTaskExecutor {
|
|
|
213
226
|
}
|
|
214
227
|
}
|
|
215
228
|
}
|
|
229
|
+
/**
 * Clean a caller-supplied list of feature names.
 *
 * Anything that is not an array yields an empty list. Within an array,
 * non-string entries are ignored, names are trimmed, blank names are dropped,
 * and duplicates are removed while keeping first-seen order.
 *
 * @param {unknown} features
 * @returns {string[]}
 */
function normalizeFeatureList(features) {
    if (!Array.isArray(features)) {
        return [];
    }
    const cleaned = features
        .filter((entry) => typeof entry === 'string')
        .map((entry) => entry.trim())
        .filter((entry) => entry.length > 0);
    // A Set iterates in insertion order, which keeps first-seen ordering.
    return [...new Set(cleaned)];
}
|
|
216
248
|
export async function defaultCloudCommandRunner(request) {
|
|
217
249
|
return await new Promise((resolve, reject) => {
|
|
218
250
|
const child = spawn(request.command, request.args, {
|
package/docs/README.md
CHANGED
|
@@ -53,6 +53,9 @@ Group execution (when `FEATURE_TFGRPO_GROUP=on`): repeat the Builder → Tester
|
|
|
53
53
|
- Manifests record the tag, commit SHA, tarball digest/path, queue payload path, and validation status (`validated`, `snapshot_failed`, `stalled_snapshot`, `needs_manual_scenario`) under `learning.*` so reviewers can audit outcomes without external storage.
|
|
54
54
|
- Scenario synthesis replays the most recent successful command from the run (or prompt/diff fallback), writes `learning/scenario.json`, and automatically executes the commands; validation logs live at `learning/scenario-validation.log` and are stored in `learning.validation.log_path`.
|
|
55
55
|
- Override snapshot storage with `LEARNING_SNAPSHOT_DIR=/custom/dir` when needed; the default lives under `.runs/learning-snapshots/` (or `$CODEX_ORCHESTRATOR_RUNS_DIR/learning-snapshots/` when configured).
|
|
56
|
+
- Successful pipeline runs also persist lightweight experience records in `out/<task-id>/experiences.jsonl` using prompt-pack domains, so future runs can inject higher-signal context without requiring learning snapshots.
|
|
57
|
+
- Prompt-pack injections apply a minimum reward threshold (`TFGRPO_EXPERIENCE_MIN_REWARD`, default `0.1`) to avoid re-injecting low-signal records.
|
|
58
|
+
- In cloud execution mode, the orchestrator now injects a bounded subset of relevant prompt-pack experience snippets directly into the cloud task prompt, so persisted experience data can influence execution outcomes immediately.
|
|
56
59
|
|
|
57
60
|
### How to run the learning pipeline locally
|
|
58
61
|
- Seed a normal run and keep manifests grouped by task:
|
|
@@ -199,7 +202,7 @@ Note: the commands below assume a source checkout; `scripts/` helpers are not in
|
|
|
199
202
|
| `npm run eval:test` | Optional evaluation harness (enable when `evaluation/fixtures/**` is populated). |
|
|
200
203
|
| `npm run docs:check` | Deterministically validates scripts/pipelines/paths referenced in agent-facing docs. |
|
|
201
204
|
| `npm run docs:freshness` | Validates docs registry coverage + review recency; writes `out/<task-id>/docs-freshness.json`. |
|
|
202
|
-
| `npm run ci:cloud-canary` | Runs the cloud canary harness (`scripts/cloud-canary-ci.mjs`) to verify cloud lifecycle manifest + run-summary evidence; credential-gated by `CODEX_CLOUD_ENV_ID` and optional auth secrets (`CODEX_CLOUD_BRANCH` defaults to `main`). |
|
|
205
|
+
| `npm run ci:cloud-canary` | Runs the cloud canary harness (`scripts/cloud-canary-ci.mjs`) to verify cloud lifecycle manifest + run-summary evidence; credential-gated by `CODEX_CLOUD_ENV_ID` and optional auth secrets (`CODEX_CLOUD_BRANCH` defaults to `main`). Feature flags can be passed through with `CODEX_CLOUD_ENABLE_FEATURES` / `CODEX_CLOUD_DISABLE_FEATURES` (comma- or space-delimited, e.g. `sqlite,memory_tool`). |
|
|
203
206
|
| `node scripts/delegation-guard.mjs` | Enforces subagent delegation evidence before review (repo-only). |
|
|
204
207
|
| `node scripts/spec-guard.mjs --dry-run` | Validates spec freshness; required before review (repo-only). |
|
|
205
208
|
| `node scripts/diff-budget.mjs` | Guards against oversized diffs before review (repo-only; defaults: 25 files / 800 lines; supports explicit overrides). |
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@kbediako/codex-orchestrator",
|
|
3
|
-
"version": "0.1.16",
|
|
3
|
+
"version": "0.1.18",
|
|
4
4
|
"license": "MIT",
|
|
5
5
|
"repository": {
|
|
6
6
|
"type": "git",
|
|
@@ -62,6 +62,7 @@
|
|
|
62
62
|
"eval:test": "vitest run evaluation/tests",
|
|
63
63
|
"test:watch": "vitest",
|
|
64
64
|
"review": "node --loader ts-node/esm scripts/run-review.ts",
|
|
65
|
+
"pr:watch-merge": "node scripts/pr-monitor-merge.mjs",
|
|
65
66
|
"status-ui": "node scripts/status-ui-serve.mjs",
|
|
66
67
|
"design:purge-expired": "node --loader ts-node/esm scripts/design/purgeExpired.ts",
|
|
67
68
|
"generate:manifest-types": "node scripts/generate-manifest-types.mjs",
|
|
@@ -5,17 +5,81 @@ description: Structure multi-agent brainstorming and deliberation (options, trad
|
|
|
5
5
|
|
|
6
6
|
# Collab Deliberation
|
|
7
7
|
|
|
8
|
-
Use this skill when the user asks for brainstorming,
|
|
8
|
+
Use this skill when the user asks for brainstorming, tradeoffs, option comparison, or decision support before implementation. This skill is for ideas and decisions, not coding.
|
|
9
9
|
|
|
10
|
-
##
|
|
10
|
+
## Deliberation Default v1 (required)
|
|
11
|
+
- Keep MCP as the lead control plane. Use collab/delegated subagents to generate and challenge options.
|
|
12
|
+
- Run full deliberation when any hard-stop trigger is true:
|
|
13
|
+
- Irreversible/destructive change with unclear rollback.
|
|
14
|
+
- Auth/secrets/PII boundary touched.
|
|
15
|
+
- Direct production customer/financial/legal impact.
|
|
16
|
+
- Conflicting intent on high-impact work.
|
|
17
|
+
- Otherwise compute a risk score (`0..2` each): reversibility, external impact, security/privacy boundary, blast radius, requirement clarity, verification strength, time pressure.
|
|
18
|
+
- Run full deliberation when score `>=7` or two or more criteria score `2`.
|
|
19
|
+
- Use these time budgets for auto-deliberation:
|
|
11
20
|
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
21
|
+
| Class | Horizon | Soft cap | Hard cap |
|
|
22
|
+
| --- | --- | --- | --- |
|
|
23
|
+
| `T0` | `<=15m` | `5s` | `12s` |
|
|
24
|
+
| `T1` | `15m..2h` | `20s` | `45s` |
|
|
25
|
+
| `T2` | `2h..8h` | `60s` | `120s` |
|
|
26
|
+
| `T3` | `>8h` | `120s` | `300s` |
|
|
27
|
+
|
|
28
|
+
- On soft cap: stop branching and execute the best current plan.
|
|
29
|
+
- On hard cap: disable auto-deliberation for that stage and continue execution.
|
|
30
|
+
|
|
31
|
+
## Auto-trigger cadence (required)
|
|
32
|
+
- Run deliberation at task bootstrap for non-trivial work.
|
|
33
|
+
- Re-run deliberation after each meaningful chunk (default: behavior change or about 2+ files touched).
|
|
34
|
+
- Re-run deliberation when external feedback lands (PR review, bot findings, CI failures).
|
|
35
|
+
- Re-run deliberation when ambiguity/risk increases mid-flight (new constraints, conflicting evidence, high-signal `P1` or any `P0` finding).
|
|
36
|
+
- Re-run deliberation at least every 45 minutes during active implementation.
|
|
37
|
+
- If orchestration uses symbolic RLM, keep runtime auto-deliberation enabled:
|
|
38
|
+
- `RLM_SYMBOLIC_DELIBERATION=1` (default)
|
|
39
|
+
- `RLM_SYMBOLIC_DELIBERATION_INTERVAL` (default `2`)
|
|
40
|
+
- `RLM_SYMBOLIC_DELIBERATION_MAX_RUNS` (default `12`)
|
|
41
|
+
- `RLM_SYMBOLIC_DELIBERATION_MAX_SUMMARY_BYTES` (default `2048`)
|
|
42
|
+
- `RLM_SYMBOLIC_DELIBERATION_INCLUDE_IN_PLANNER=1` (default)
|
|
43
|
+
|
|
44
|
+
## Workflow (required)
|
|
45
|
+
1) Frame the decision.
|
|
46
|
+
- Write a one-sentence decision statement.
|
|
47
|
+
- Capture goals, constraints, non-goals, and success criteria.
|
|
48
|
+
- List assumptions and label each `confirmed` or `unconfirmed`.
|
|
49
|
+
|
|
50
|
+
2) Close critical context gaps.
|
|
51
|
+
- Ask up to 3 targeted questions only if answers could change the recommendation.
|
|
52
|
+
- If delegation is available, prefer a subagent for context gathering before asking the user.
|
|
53
|
+
|
|
54
|
+
3) Generate distinct options.
|
|
55
|
+
- Produce 3-5 materially different options.
|
|
56
|
+
- For each option include approach, prerequisites, blast radius, and time/risk profile.
|
|
57
|
+
|
|
58
|
+
4) Evaluate and stress test.
|
|
59
|
+
- Use a tailored rubric (3-5 dimensions relevant to the decision).
|
|
60
|
+
- For each option include one likely failure mode and one mitigation.
|
|
61
|
+
|
|
62
|
+
5) Recommend or defer explicitly.
|
|
63
|
+
- Recommend one option when confidence is sufficient.
|
|
64
|
+
- If uncertainty is high, defer with explicit decision gates.
|
|
65
|
+
|
|
66
|
+
6) Close with decision-driving questions.
|
|
67
|
+
- List 1-3 prioritized open questions that could change the recommendation.
|
|
68
|
+
- End with one concrete next step that improves decision quality without implementation.
|
|
69
|
+
|
|
70
|
+
## Output contract
|
|
71
|
+
- `Decision`: one sentence.
|
|
72
|
+
- `Context`: goals, constraints, non-goals, assumptions.
|
|
73
|
+
- `Options`: 3-5 concise options.
|
|
74
|
+
- `Tradeoffs`: rubric and comparative rationale.
|
|
75
|
+
- `Recommendation`: chosen option or explicit defer with decision gates.
|
|
76
|
+
- `Open questions`: prioritized items only.
|
|
77
|
+
- `Next step`: single highest-leverage action.
|
|
78
|
+
- `Confidence`: `high` | `medium` | `low`.
|
|
17
79
|
|
|
18
80
|
## Guardrails
|
|
19
|
-
- Separate
|
|
81
|
+
- Separate facts from assumptions.
|
|
20
82
|
- Do not implement or modify code unless explicitly asked.
|
|
83
|
+
- Do not present uncertainty as certainty.
|
|
21
84
|
- Keep outputs concise and action-oriented.
|
|
85
|
+
- If collab subagents are used, close lifecycle loops per id (`spawn_agent` -> `wait` -> `close_agent`) before finishing.
|
|
@@ -1,49 +1,21 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: delegate-early
|
|
3
|
-
description:
|
|
3
|
+
description: Compatibility alias for delegation-first workflows. Prefer `delegation-usage` as the canonical skill.
|
|
4
4
|
---
|
|
5
5
|
|
|
6
|
-
# Delegate Early
|
|
6
|
+
# Delegate Early (Compatibility Alias)
|
|
7
7
|
|
|
8
|
-
|
|
8
|
+
`delegate-early` is kept for backward compatibility. The canonical delegation workflow now lives in `delegation-usage`.
|
|
9
9
|
|
|
10
|
-
##
|
|
11
|
-
-
|
|
12
|
-
-
|
|
13
|
-
-
|
|
10
|
+
## Required behavior
|
|
11
|
+
- Immediately follow `delegation-usage` for setup, spawn semantics, question queue handling, confirmation flow, and manifest usage.
|
|
12
|
+
- Keep delegation MCP enabled by default; enable other MCP servers only when relevant to the task.
|
|
13
|
+
- Preserve delegation evidence (task-id stream naming + manifest path capture) exactly as documented in `delegation-usage`.
|
|
14
14
|
|
|
15
|
-
##
|
|
16
|
-
|
|
17
|
-
-
|
|
18
|
-
|
|
15
|
+
## Quick routing
|
|
16
|
+
1. Use `delegation-usage` as the source of truth.
|
|
17
|
+
2. Apply early fan-out only when streams are clearly independent and acceptance criteria are explicit.
|
|
18
|
+
3. Keep summaries short and artifact-first; avoid long chat dumps.
|
|
19
19
|
|
|
20
|
-
##
|
|
21
|
-
|
|
22
|
-
- Prefer streams like: `research`, `review`, `spec-check`, `edge-cases`.
|
|
23
|
-
|
|
24
|
-
## Required conventions
|
|
25
|
-
- Use `MCP_RUNNER_TASK_ID=<task-id>-<stream>` for subagents.
|
|
26
|
-
- Record manifest paths and summarize findings in the main run.
|
|
27
|
-
- Before review handoff, run the delegation guard stage via the packaged runner:
|
|
28
|
-
`node "$CODEX_ORCHESTRATOR_PACKAGE_ROOT/dist/orchestrator/src/cli/utils/delegationGuardRunner.js"`.
|
|
29
|
-
For ad-hoc runs without task IDs, set `CODEX_ORCHESTRATOR_GUARD_PROFILE=warn`.
|
|
30
|
-
|
|
31
|
-
## Minimal delegation workflow
|
|
32
|
-
1) Name streams and write 1–2 sentence goals for each.
|
|
33
|
-
2) Spawn subagents with clear, bounded prompts.
|
|
34
|
-
3) Wait for subagent completion; retrieve manifest evidence and summarize findings into the main plan.
|
|
35
|
-
4) Proceed with implementation.
|
|
36
|
-
|
|
37
|
-
## Prompt patterns
|
|
38
|
-
- Research: “Find X, cite Y, return 3 bullets + risks.”
|
|
39
|
-
- Review: “Inspect files A/B for regressions; list issues by severity.”
|
|
40
|
-
- Planning: “Draft a 3–5-step plan, call out unknowns.”
|
|
41
|
-
|
|
42
|
-
## Escalation rules
|
|
43
|
-
- If delegation is impossible, set `DELEGATION_GUARD_OVERRIDE_REASON` and document it in the task checklist.
|
|
44
|
-
|
|
45
|
-
## Subagent summary format
|
|
46
|
-
- **Findings**: Key results and conclusions from the subagent run
|
|
47
|
-
- **Risks**: Issues, blockers, or concerns
|
|
48
|
-
- **Open questions**: Unresolved items requiring follow-up
|
|
49
|
-
- **Evidence**: Manifest path (e.g., `.runs/<task-id>-<stream>/cli/<timestamp>/manifest.json`)
|
|
20
|
+
## Note
|
|
21
|
+
If guidance in this file conflicts with `delegation-usage`, follow `delegation-usage`.
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
# Delegation Guide (Detailed)
|
|
2
2
|
|
|
3
3
|
Use this guide for deeper context on delegation behavior, tool surfaces, and troubleshooting.
|
|
4
|
+
`delegation-usage` is the canonical delegation workflow; `delegate-early` should be treated as a compatibility alias.
|
|
4
5
|
|
|
5
6
|
## Mental model
|
|
6
7
|
|
|
@@ -80,6 +81,16 @@ delegate.spawn({
|
|
|
80
81
|
})
|
|
81
82
|
```
|
|
82
83
|
|
|
84
|
+
## Collab lifecycle hygiene (required)
|
|
85
|
+
|
|
86
|
+
When using collab tools (`spawn_agent` / `wait` / `close_agent`):
|
|
87
|
+
|
|
88
|
+
- Treat each spawned `agent_id` as a resource that must be closed.
|
|
89
|
+
- For every successful spawn, run `wait` then `close_agent` for the same id.
|
|
90
|
+
- Keep a local list of spawned ids and run a final cleanup pass before returning.
|
|
91
|
+
- On timeout/error paths, still close known ids before reporting failure.
|
|
92
|
+
- If you see `agent thread limit reached`, stop spawning immediately, close known ids, and retry only after cleanup.
|
|
93
|
+
|
|
83
94
|
## RLM budget overrides (recommended defaults)
|
|
84
95
|
|
|
85
96
|
If you want deeper recursion or longer wall-clock time for delegated runs, set RLM budgets on the delegation server:
|
|
@@ -106,12 +117,12 @@ If you need delegation to respect a repo’s `.codex/orchestrator.toml` (e.g., s
|
|
|
106
117
|
|
|
107
118
|
## Version guard (JSONL handshake)
|
|
108
119
|
|
|
109
|
-
Delegation MCP expects JSONL. Use `codex-orchestrator` 0.1.12 or newer.
|
|
120
|
+
Delegation MCP expects JSONL. Keep `codex-orchestrator` aligned with the current release line.
|
|
110
121
|
|
|
111
122
|
- Check: `codex-orchestrator --version`
|
|
112
|
-
- Update global: `npm i -g @kbediako/codex-orchestrator@
|
|
113
|
-
- Or pin via npx: `npx -y @kbediako/codex-orchestrator
|
|
114
|
-
- If
|
|
123
|
+
- Update global: `npm i -g @kbediako/codex-orchestrator@latest`
|
|
124
|
+
- Or pin via npx: `npx -y @kbediako/codex-orchestrator@<version> delegate-server`
|
|
125
|
+
- If using a custom Codex fork, fast-forward from `upstream/main` regularly and rebuild to avoid protocol drift.
|
|
115
126
|
|
|
116
127
|
## Common failures
|
|
117
128
|
|
|
@@ -120,3 +131,6 @@ Delegation MCP expects JSONL. Use `codex-orchestrator` 0.1.12 or newer.
|
|
|
120
131
|
- **Tool profile ignored**: The repo’s `delegate.allowed_tool_servers` may be empty, or names are invalid.
|
|
121
132
|
- **Missing control files**: delegate tools rely on `control_endpoint.json` in the run directory.
|
|
122
133
|
- **Run identifiers**: status/pause/cancel require `manifest_path`; question queue requires `parent_manifest_path`.
|
|
134
|
+
- **Collab payload mismatch**: `spawn_agent` calls fail if they include both `message` and `items`.
|
|
135
|
+
- **Collab depth limits**: recursive collab fan-out can fail near max depth; prefer shallow parent fan-out.
|
|
136
|
+
- **Collab lifecycle leaks**: missing `close_agent` calls can exhaust thread slots and block future spawns (`agent thread limit reached`).
|
|
@@ -9,8 +9,21 @@ description: Use when operating the Codex delegation MCP server and tools (deleg
|
|
|
9
9
|
|
|
10
10
|
Use this skill to operate delegation MCP tools with delegation enabled by default (the only MCP on by default). Disable it only when required by safety constraints, and keep other MCPs off unless they are relevant to the task.
|
|
11
11
|
|
|
12
|
+
`delegation-usage` is the canonical delegation workflow skill. If `delegate-early` is present, treat it as a compatibility alias that should redirect to this skill.
|
|
13
|
+
|
|
12
14
|
Collab multi-agent mode is separate from delegation. For symbolic RLM subcalls that use collab tools, set `RLM_SYMBOLIC_COLLAB=1` and ensure a collab-capable Codex CLI; collab tool calls are recorded in `manifest.collab_tool_calls`. If collab tools are unavailable in your CLI build, skip collab steps; delegation still works independently.
|
|
13
15
|
|
|
16
|
+
## Collab realities in delegated runs (current behavior)
|
|
17
|
+
|
|
18
|
+
- `spawn_agent` accepts one input style per call: either `message` (plain text) or `items` (structured input).
|
|
19
|
+
- Do not send both `message` and `items` in the same `spawn_agent` call.
|
|
20
|
+
- Spawn returns an `agent_id` (thread id). Current TUI collab rendering is id-based; do not depend on custom visible agent names.
|
|
21
|
+
- Subagents spawned through collab run with approval effectively set to `never`; design child tasks to avoid approval/escalation requirements.
|
|
22
|
+
- Collab spawn depth is bounded. Near/at max depth, recursive delegation can fail or collab can be disabled in children; prefer shallow parent fan-out.
|
|
23
|
+
- **Lifecycle is mandatory:** for every successful `spawn_agent`, run `wait` and then `close_agent` for that same id before task completion.
|
|
24
|
+
- Keep a local list of spawned ids and run a final cleanup pass so no agent id is left unclosed on timeout/error paths.
|
|
25
|
+
- If spawn fails with `agent thread limit reached`, stop spawning, close any known ids first, then surface a concise recovery note.
|
|
26
|
+
|
|
14
27
|
## Quick-start workflow (canned)
|
|
15
28
|
|
|
16
29
|
Use this when delegation tools are missing in the current run (MCP disabled) and you want a background Codex run to handle delegation:
|
|
@@ -64,12 +77,11 @@ For runner + delegation coordination (short `--task` flow), see `docs/delegation
|
|
|
64
77
|
|
|
65
78
|
### 0a) Version guard (JSONL handshake)
|
|
66
79
|
|
|
67
|
-
- Delegation MCP uses JSONL;
|
|
68
|
-
- `codex-orchestrator --version`
|
|
69
|
-
-
|
|
70
|
-
-
|
|
71
|
-
-
|
|
72
|
-
- Keep the version pins in this section in sync with the docs’ minimum (currently 0.1.12).
|
|
80
|
+
- Delegation MCP uses JSONL; keep `codex-orchestrator` aligned with the current release line.
|
|
81
|
+
- Check installed version: `codex-orchestrator --version`
|
|
82
|
+
- Preferred update path: `npm i -g @kbediako/codex-orchestrator@latest`
|
|
83
|
+
- Deterministic pin path (for reproducible environments): `npx -y @kbediako/codex-orchestrator@<version> delegate-server`
|
|
84
|
+
- If using a custom Codex fork, fast-forward it regularly from `upstream/main` and rebuild the managed CLI to avoid delegation/collab protocol drift.
|
|
73
85
|
|
|
74
86
|
### 0b) Background terminal bootstrap (required when MCP is disabled)
|
|
75
87
|
|
|
@@ -163,3 +175,6 @@ repeat:
|
|
|
163
175
|
- **Confirmation misuse:** never pass `confirm_nonce` from model/tool input; it is runner‑injected only.
|
|
164
176
|
- **Secrets exposure:** never include secrets/tokens/PII in delegate prompts or files.
|
|
165
177
|
- **Missing control files:** delegate tools rely on `control_endpoint.json` in the run directory; older runs may not have it.
|
|
178
|
+
- **Collab payload mismatch:** `spawn_agent` rejects calls that include both `message` and `items`.
|
|
179
|
+
- **Collab UI assumptions:** agent rows/records are id-based today; use explicit stream role text in prompts/artifacts for operator clarity.
|
|
180
|
+
- **Collab lifecycle leaks:** missing `close_agent` calls accumulate open threads and can trigger `agent thread limit reached`; always finish `spawn -> wait -> close_agent` per id.
|
|
@@ -16,6 +16,7 @@ Use this skill when a task needs a spec-driven workflow. The objective is to cre
|
|
|
16
16
|
- TECH_SPEC: capture technical requirements (use `.agent/task/templates/tech-spec-template.md`; stored under `tasks/specs/<id>-<slug>.md`).
|
|
17
17
|
- ACTION_PLAN: capture sequencing/milestones (use `.agent/task/templates/action-plan-template.md`).
|
|
18
18
|
- Depth scales with scope, but all three docs are required.
|
|
19
|
+
- For low-risk tiny edits, follow the bounded shortcut in `docs/micro-task-path.md` instead of long-form rewrites (still requires task/spec evidence).
|
|
19
20
|
|
|
20
21
|
2) Register the TECH_SPEC and task
|
|
21
22
|
- Add the TECH_SPEC to `tasks/index.json` (including `last_review`).
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: standalone-review
|
|
3
|
-
description: Use for
|
|
3
|
+
description: Use for required periodic cross-check reviews during implementation and before handoff using `codex review`.
|
|
4
4
|
---
|
|
5
5
|
|
|
6
6
|
# Standalone Review
|
|
@@ -10,6 +10,17 @@ description: Use for ad-hoc/standalone reviews outside pipelines (fast checks du
|
|
|
10
10
|
Use this skill when you need a fast, ad-hoc review without running a pipeline or collecting a manifest. It is ideal during implementation or for quick pre-flight checks.
|
|
11
11
|
Before implementation, use it to review the task/spec against the user’s intent and record the approval in the PRD/TECH_SPEC or task notes.
|
|
12
12
|
|
|
13
|
+
## Auto-trigger policy (required)
|
|
14
|
+
|
|
15
|
+
Run this skill automatically whenever any condition is true:
|
|
16
|
+
- You made code/config/script/test edits since the last standalone review.
|
|
17
|
+
- You finished a meaningful chunk of work (default: behavior change or about 2+ files touched).
|
|
18
|
+
- You are about to report completion, propose merge, or answer "what's next?" with recommendations.
|
|
19
|
+
- You addressed external feedback (PR reviews, bot comments, or CI-fix patches).
|
|
20
|
+
- 45 minutes of active implementation elapsed without a standalone review.
|
|
21
|
+
|
|
22
|
+
If review execution is blocked, record why in task notes, then do manual diff review plus targeted tests before proceeding.
|
|
23
|
+
|
|
13
24
|
## Quick start
|
|
14
25
|
|
|
15
26
|
Uncommitted diff:
|
|
@@ -39,6 +50,7 @@ codex review "Focus on correctness, regressions, edge cases; list missing tests.
|
|
|
39
50
|
- Keep prompts short, specific, and test-oriented.
|
|
40
51
|
|
|
41
52
|
2) Run the review often
|
|
53
|
+
- Follow the auto-trigger policy above (not optional).
|
|
42
54
|
- Run after each meaningful chunk of work.
|
|
43
55
|
- Prefer targeted focus prompts for WIP reviews.
|
|
44
56
|
|
|
@@ -1,9 +1,11 @@
|
|
|
1
|
-
<!-- codex:instruction-stamp
|
|
1
|
+
<!-- codex:instruction-stamp 2408396e5cc9b25d5522b7064010a36a43007508072f3e0f051ab042370928a1 -->
|
|
2
2
|
# Agent Instructions (Template)
|
|
3
3
|
|
|
4
4
|
## Orchestrator-first workflow
|
|
5
5
|
- Use `codex-orchestrator` pipelines for planning, implementation, validation, and review.
|
|
6
6
|
- Default to `docs-review` before implementation and `implementation-gate` after code changes.
|
|
7
|
+
- Prefer cloud mode when runs are long-running/parallel and cloud prerequisites are ready.
|
|
8
|
+
- Before using cloud mode, verify branch availability, non-interactive setup commands, and required secrets/variables; if any are missing, run in local `mcp` mode and record why.
|
|
7
9
|
- Before implementation, run a standalone review of the task/spec against the user’s intent and record the approval in the spec + checklist notes.
|
|
8
10
|
- Delegation is mandatory for top-level tasks once a task id exists: spawn at least one subagent run using `MCP_RUNNER_TASK_ID=<task-id>-<stream>`, capture manifest evidence, and summarize in the main run. Use `DELEGATION_GUARD_OVERRIDE_REASON` only when delegation is impossible (technical/blocking limitation or explicit operational block) and record the justification.
|
|
9
11
|
- Once a task id exists, prefer delegation for research, review, and planning work. Use `codex exec` only for pre-task triage (no task id yet) or when delegation is genuinely unavailable (technical/blocking limitation or explicit operational block), and set `DELEGATION_GUARD_OVERRIDE_REASON` with a clear justification.
|
|
@@ -18,11 +20,38 @@
|
|
|
18
20
|
- Use `codex review` for quick checks during implementation.
|
|
19
21
|
- Capture standalone review approval in the spec/task notes before implementation begins.
|
|
20
22
|
- When you need manifest-backed review evidence, run `npm run review` with the manifest path.
|
|
23
|
+
- Before merging non-trivial changes, run one explicit elegance/minimality review pass and simplify avoidable complexity.
|
|
21
24
|
|
|
22
25
|
## Delegation (recommended)
|
|
23
26
|
- For non-trivial work, spawn at least one subagent run using `MCP_RUNNER_TASK_ID=<task-id>-<stream>`.
|
|
24
27
|
- If delegation is not possible, record the reason in the task checklist.
|
|
25
28
|
|
|
29
|
+
## Deliberation Default (agent-first)
|
|
30
|
+
- Keep MCP as the lead control plane. Use collab/delegated subagents for deliberation when ambiguity or impact is high.
|
|
31
|
+
- Run full deliberation on any hard-stop trigger:
|
|
32
|
+
- Irreversible/destructive changes with unclear rollback.
|
|
33
|
+
- Auth/secrets/PII boundary changes.
|
|
34
|
+
- Direct production customer/financial/legal impact.
|
|
35
|
+
- Conflicting intent on high-impact changes.
|
|
36
|
+
- Otherwise, use a simple risk score (`0..2` each): reversibility, external impact, security/privacy boundary, blast radius, requirement clarity, verification strength, time pressure.
|
|
37
|
+
- Require full deliberation when the total score is `>=7` or when two or more criteria score `2`.
|
|
38
|
+
- Time budgets for auto-deliberation:
|
|
39
|
+
- `T0` quick: `5s / 12s` (soft/hard)
|
|
40
|
+
- `T1` standard: `20s / 45s`
|
|
41
|
+
- `T2` complex: `60s / 120s`
|
|
42
|
+
- `T3` long-horizon: `120s / 300s`
|
|
43
|
+
- On soft cap: stop branching and execute the best current plan.
|
|
44
|
+
- On hard cap: disable auto-deliberation for that stage and continue execution.
|
|
45
|
+
- Review-signal policy:
|
|
46
|
+
- `P0` critical findings are hard-stop.
|
|
47
|
+
- `P1` high findings are hard-stop only when high-signal (clear evidence or corroboration).
|
|
48
|
+
- `P2/P3` findings are tracked follow-ups.
|
|
49
|
+
|
|
50
|
+
## Completion discipline (patience-first)
|
|
51
|
+
- Wait/poll for terminal state on long-running operations (CI checks, reviews, cloud jobs, orchestrator runs) before reporting completion.
|
|
52
|
+
- Reset waiting windows when checks restart or new feedback appears.
|
|
53
|
+
- Do not hand off mid-flight work unless the user explicitly asks to stop.
|
|
54
|
+
|
|
26
55
|
## Instruction stamp
|
|
27
56
|
- If you edit this file, refresh the instruction stamp.
|
|
28
57
|
- One-liner:
|