@kbediako/codex-orchestrator 0.1.12 → 0.1.14-alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. package/LICENSE +19 -5
  2. package/README.md +47 -2
  3. package/dist/bin/codex-orchestrator.js +93 -0
  4. package/dist/orchestrator/src/cli/adapters/CommandBuilder.js +27 -3
  5. package/dist/orchestrator/src/cli/adapters/CommandPlanner.js +17 -1
  6. package/dist/orchestrator/src/cli/adapters/CommandReviewer.js +36 -1
  7. package/dist/orchestrator/src/cli/adapters/CommandTester.js +28 -0
  8. package/dist/orchestrator/src/cli/adapters/cloudFailureDiagnostics.js +45 -0
  9. package/dist/orchestrator/src/cli/codexCliSetup.js +294 -0
  10. package/dist/orchestrator/src/cli/init.js +3 -0
  11. package/dist/orchestrator/src/cli/mcp.js +4 -2
  12. package/dist/orchestrator/src/cli/orchestrator.js +298 -28
  13. package/dist/orchestrator/src/cli/rlm/context.js +31 -3
  14. package/dist/orchestrator/src/cli/rlm/symbolic.js +152 -15
  15. package/dist/orchestrator/src/cli/rlmRunner.js +59 -5
  16. package/dist/orchestrator/src/cli/run/manifest.js +3 -0
  17. package/dist/orchestrator/src/cli/services/commandRunner.js +87 -0
  18. package/dist/orchestrator/src/cli/services/runSummaryWriter.js +24 -0
  19. package/dist/orchestrator/src/cli/skills.js +1 -1
  20. package/dist/orchestrator/src/cli/utils/codexCli.js +94 -0
  21. package/dist/orchestrator/src/cli/utils/codexPaths.js +13 -0
  22. package/dist/orchestrator/src/cli/utils/devtools.js +9 -12
  23. package/dist/orchestrator/src/cloud/CodexCloudTaskExecutor.js +255 -0
  24. package/dist/orchestrator/src/learning/crystalizer.js +2 -1
  25. package/dist/orchestrator/src/manager.js +1 -0
  26. package/dist/orchestrator/src/sync/CloudSyncWorker.js +37 -7
  27. package/dist/scripts/design/pipeline/context.js +3 -2
  28. package/dist/scripts/lib/run-manifests.js +14 -0
  29. package/docs/README.md +22 -2
  30. package/package.json +6 -2
  31. package/schemas/manifest.json +83 -0
  32. package/skills/collab-deliberation/SKILL.md +21 -0
  33. package/skills/collab-evals/SKILL.md +32 -0
  34. package/skills/delegate-early/SKILL.md +47 -0
  35. package/skills/delegation-usage/DELEGATION_GUIDE.md +5 -4
  36. package/skills/delegation-usage/SKILL.md +11 -5
  37. package/skills/docs-first/SKILL.md +2 -1
  38. package/templates/README.md +4 -0
@@ -0,0 +1,255 @@
1
+ import { spawn } from 'node:child_process';
2
+ import { appendFile, mkdir, writeFile } from 'node:fs/promises';
3
+ import { join, relative } from 'node:path';
4
+ import { setTimeout as sleep } from 'node:timers/promises';
5
+ import { isoTimestamp } from '../cli/utils/time.js';
6
/** Case-insensitive pattern used to find a cloud task identifier (`task_<letter>_<hex>`) in CLI output. */
const TASK_ID_PATTERN = /\btask_[a-z]_[a-f0-9]+\b/i;
/** Maximum number of characters of captured output kept per logged stream or error message. */
const MAX_LOG_CHARS = 1024 * 32;
/** Consecutive unrecognized, non-zero `codex cloud status` results tolerated before giving up. */
const STATUS_RETRY_LIMIT = 3;
/** Base delay in milliseconds between status retries; multiplied by the retry count. */
const STATUS_RETRY_BACKOFF_MS = 1500;
/** Value passed to `codex cloud list --limit` when looking up task metadata. */
const DEFAULT_LIST_LIMIT = 20;
11
/**
 * Finds the first cloud task identifier in a piece of CLI output.
 *
 * @param {string} text - Output to scan.
 * @returns {string|null} The first substring matching `TASK_ID_PATTERN`, or `null` when none is found.
 */
export function extractCloudTaskId(text) {
    return TASK_ID_PATTERN.exec(text)?.[0] || null;
}
18
/**
 * Extracts the first bracketed status token (for example `[READY]`) that
 * starts a line of CLI output, ignoring leading whitespace.
 *
 * Only tokens made of upper-case letters and underscores are recognized.
 *
 * @param {string} text - Combined CLI output to scan.
 * @returns {string|null} The token without brackets, or `null` when no line starts with one.
 */
export function parseCloudStatusToken(text) {
    const [, token] = /^\s*\[([A-Z_]+)\]/m.exec(text) ?? [];
    return token ? token.toUpperCase() : null;
}
25
/** Lookup from CLI status token to the normalized cloud execution status. */
const CLOUD_STATUS_BY_TOKEN = new Map([
    ['READY', 'ready'],
    ['COMPLETED', 'ready'],
    ['SUCCEEDED', 'ready'],
    ['RUNNING', 'running'],
    ['IN_PROGRESS', 'running'],
    ['QUEUED', 'queued'],
    ['PENDING', 'queued'],
    ['ERROR', 'error'],
    ['FAILED', 'failed'],
    ['CANCELLED', 'cancelled'],
    ['CANCELED', 'cancelled']
]);
/**
 * Translates a CLI status token into a normalized status string.
 *
 * Matching is exact (case-sensitive); both spellings of "cancelled" are accepted.
 *
 * @param {string|null|undefined} token - Token as returned by `parseCloudStatusToken`.
 * @returns {string} One of `ready`, `running`, `queued`, `error`, `failed`,
 *   `cancelled`, or `unknown` for a missing or unrecognized token.
 */
export function mapCloudStatusToken(token) {
    if (!token) {
        return 'unknown';
    }
    return CLOUD_STATUS_BY_TOKEN.get(token) ?? 'unknown';
}
51
/**
 * Submits a prompt as a cloud task through the `codex cloud` CLI, polls the
 * task until it reaches a terminal status or the timeout elapses, and stores
 * the resulting diff under `<runDir>/cloud`.
 *
 * Every CLI invocation is appended as one JSON line to
 * `<runDir>/cloud/commands.ndjson`.
 */
export class CodexCloudTaskExecutor {
    /** Runs one CLI invocation: receives `{ command, args, cwd, env }`, resolves `{ exitCode, stdout, stderr }`. */
    commandRunner;
    /** Produces the timestamps recorded in results and log entries. */
    now;
    /** Waits for the given number of milliseconds. */
    sleepFn;
    /**
     * @param {object} [options]
     * @param {Function} [options.commandRunner] - Defaults to `defaultCloudCommandRunner`.
     * @param {Function} [options.now] - Defaults to `isoTimestamp`.
     * @param {Function} [options.sleepFn] - Defaults to `setTimeout` from `node:timers/promises`.
     */
    constructor(options = {}) {
        this.commandRunner = options.commandRunner ?? defaultCloudCommandRunner;
        this.now = options.now ?? isoTimestamp;
        this.sleepFn = options.sleepFn ?? sleep;
    }
    /**
     * Runs the full cloud task lifecycle: submit, poll, and capture the diff.
     *
     * Failures after the cloud directory has been created are not thrown;
     * they are reported through `success: false` and `cloudExecution.error`.
     * A failure to create the cloud directory rejects the returned promise.
     *
     * @param {object} input
     * @param {string} input.runDir - Run directory; artifacts go to its `cloud` subdirectory.
     * @param {string} input.repoRoot - Working directory for the CLI and base for recorded relative paths.
     * @param {string} input.codexBin - CLI executable to invoke.
     * @param {string} input.environmentId - Value passed to `--env`.
     * @param {string} input.prompt - Prompt passed as the final `cloud exec` argument.
     * @param {string} [input.branch] - Passed to `--branch` when non-blank.
     * @param {object} [input.env] - Extra environment variables layered over `process.env`.
     * @param {number} input.pollIntervalSeconds - Delay between status polls (minimum 1).
     * @param {number} input.timeoutSeconds - Overall polling budget (minimum 1).
     * @param {number} input.attempts - Value passed to `--attempts` (minimum 1).
     * @returns {Promise<{success: boolean, summary: string, notes: string[], cloudExecution: object}>}
     */
    async execute(input) {
        // `Math.max(1, undefined)` is NaN, which would turn the polling deadline into NaN
        // (the loop would never run) and serialize as `null` in the manifest.
        // Fall back to the minimum of 1 for anything that is not a number.
        const atLeastOne = (value) => {
            const numeric = Number(value);
            return Number.isNaN(numeric) ? 1 : Math.max(1, numeric);
        };
        const cloudDir = join(input.runDir, 'cloud');
        await mkdir(cloudDir, { recursive: true });
        const commandLogPath = join(cloudDir, 'commands.ndjson');
        const env = { ...process.env, ...(input.env ?? {}) };
        const notes = [];
        const cloudExecution = {
            task_id: null,
            environment_id: input.environmentId,
            status: 'queued',
            status_url: null,
            submitted_at: null,
            completed_at: null,
            last_polled_at: null,
            poll_count: 0,
            poll_interval_seconds: atLeastOne(input.pollIntervalSeconds),
            timeout_seconds: atLeastOne(input.timeoutSeconds),
            attempts: atLeastOne(input.attempts),
            diff_path: null,
            diff_url: null,
            diff_status: 'pending',
            apply_status: 'not_requested',
            log_path: relative(input.repoRoot, commandLogPath),
            error: null
        };
        // Runs one CLI command and records it (with truncated output) in the command log.
        const runCloudCommand = async (args) => {
            const result = await this.commandRunner({
                command: input.codexBin,
                args,
                cwd: input.repoRoot,
                env
            });
            await appendFile(commandLogPath, `${JSON.stringify({
                timestamp: this.now(),
                command: input.codexBin,
                args,
                exit_code: result.exitCode,
                stdout: truncate(result.stdout),
                stderr: truncate(result.stderr)
            })}\n`, 'utf8');
            return result;
        };
        try {
            // Submit the task.
            const execArgs = ['cloud', 'exec', '--env', input.environmentId, '--attempts', String(cloudExecution.attempts)];
            if (input.branch && input.branch.trim()) {
                execArgs.push('--branch', input.branch.trim());
            }
            execArgs.push(input.prompt);
            const execResult = await runCloudCommand(execArgs);
            if (execResult.exitCode !== 0) {
                throw new Error(`codex cloud exec failed with exit ${execResult.exitCode}: ${compactError(execResult.stderr, execResult.stdout)}`);
            }
            const taskId = extractCloudTaskId(`${execResult.stdout}\n${execResult.stderr}`);
            if (!taskId) {
                throw new Error('Unable to parse cloud task id from codex cloud exec output.');
            }
            cloudExecution.task_id = taskId;
            cloudExecution.status = 'running';
            cloudExecution.submitted_at = this.now();
            notes.push(`Cloud task submitted: ${taskId}`);
            const metadata = await this.lookupTaskMetadata(taskId, runCloudCommand);
            if (metadata?.url) {
                cloudExecution.status_url = metadata.url;
            }
            // Poll until a terminal status is seen or the deadline passes.
            const timeoutAt = Date.now() + cloudExecution.timeout_seconds * 1000;
            let statusRetries = 0;
            while (Date.now() < timeoutAt) {
                const statusResult = await runCloudCommand(['cloud', 'status', taskId]);
                cloudExecution.last_polled_at = this.now();
                cloudExecution.poll_count += 1;
                const token = parseCloudStatusToken(`${statusResult.stdout}\n${statusResult.stderr}`);
                const mapped = mapCloudStatusToken(token);
                // `codex cloud status` may return a non-zero exit while the task is still pending.
                // Treat non-zero as a retry only when no recognizable status token is present.
                if (statusResult.exitCode !== 0 && mapped === 'unknown') {
                    statusRetries += 1;
                    if (statusRetries > STATUS_RETRY_LIMIT) {
                        throw new Error(`codex cloud status failed ${statusRetries} times: ${compactError(statusResult.stderr, statusResult.stdout)}`);
                    }
                    // Linear backoff: wait longer after each consecutive failure.
                    await this.sleepFn(STATUS_RETRY_BACKOFF_MS * statusRetries);
                    continue;
                }
                statusRetries = 0;
                if (mapped !== 'unknown') {
                    cloudExecution.status = mapped;
                }
                if (mapped === 'ready') {
                    notes.push(`Cloud task completed: ${taskId}`);
                    break;
                }
                if (mapped === 'error' || mapped === 'failed' || mapped === 'cancelled') {
                    cloudExecution.error = `Cloud task ended with status ${mapped}.`;
                    break;
                }
                await this.sleepFn(cloudExecution.poll_interval_seconds * 1000);
            }
            // Still non-terminal after the loop means the deadline was reached.
            if (cloudExecution.status === 'running' || cloudExecution.status === 'queued') {
                cloudExecution.status = 'failed';
                cloudExecution.error = `Timed out waiting for cloud task completion after ${cloudExecution.timeout_seconds}s.`;
            }
            if (cloudExecution.status === 'ready') {
                // Capture the diff next to the command log.
                const diffResult = await runCloudCommand(['cloud', 'diff', taskId]);
                if (diffResult.exitCode === 0 && diffResult.stdout.trim().length > 0) {
                    const diffPath = join(cloudDir, `${taskId}.diff.patch`);
                    await writeFile(diffPath, diffResult.stdout, 'utf8');
                    cloudExecution.diff_path = relative(input.repoRoot, diffPath);
                    cloudExecution.diff_status = 'available';
                    cloudExecution.diff_url = cloudExecution.status_url;
                    notes.push(`Cloud diff captured: ${cloudExecution.diff_path}`);
                }
                else {
                    cloudExecution.diff_status = 'unavailable';
                    if (diffResult.exitCode !== 0) {
                        notes.push(`Cloud diff unavailable (exit ${diffResult.exitCode}).`);
                    }
                    else {
                        notes.push('Cloud diff unavailable (empty payload).');
                    }
                }
            }
            else {
                cloudExecution.diff_status = 'unavailable';
            }
            cloudExecution.completed_at = this.now();
            const success = cloudExecution.status === 'ready';
            const summary = success
                ? `Cloud task ${cloudExecution.task_id} completed successfully.`
                : `Cloud task ${cloudExecution.task_id ?? '<unknown>'} failed (${cloudExecution.status}).`;
            return { success, summary, notes, cloudExecution };
        }
        catch (error) {
            // Preserve non-queued status to reflect last known remote state at failure time.
            cloudExecution.status = cloudExecution.status === 'queued' ? 'failed' : cloudExecution.status;
            cloudExecution.diff_status = 'unavailable';
            cloudExecution.error = error?.message ?? String(error);
            cloudExecution.completed_at = this.now();
            const summary = `Cloud execution failed: ${cloudExecution.error}`;
            notes.push(summary);
            return { success: false, summary, notes, cloudExecution };
        }
    }
    /**
     * Looks the task up in `codex cloud list --json` to obtain its URL.
     *
     * @param {string} taskId - Identifier to match against each listed task's `id`.
     * @param {Function} runCloudCommand - Runs a CLI command given its argument list.
     * @returns {Promise<{url: string|null}|null>} `null` when the list command
     *   exits non-zero or its output cannot be parsed; otherwise the task URL,
     *   or a `null` URL when the task is not among the listed entries.
     */
    async lookupTaskMetadata(taskId, runCloudCommand) {
        const listResult = await runCloudCommand(['cloud', 'list', '--json', '--limit', String(DEFAULT_LIST_LIMIT)]);
        if (listResult.exitCode !== 0) {
            return null;
        }
        try {
            const payload = JSON.parse(listResult.stdout);
            const match = payload.tasks?.find((task) => task.id === taskId) ?? null;
            return { url: match?.url ?? null };
        }
        catch {
            // The URL is optional metadata, so malformed output is not an error.
            return null;
        }
    }
}
216
/**
 * Runs a command as a child process and collects its output.
 *
 * stdin is ignored; stdout and stderr are captured in full as UTF-8 text.
 *
 * @param {object} request
 * @param {string} request.command - Executable to spawn.
 * @param {string[]} request.args - Arguments passed to the executable.
 * @param {string} request.cwd - Working directory for the child process.
 * @param {object} request.env - Environment for the child process.
 * @returns {Promise<{exitCode: number, stdout: string, stderr: string}>}
 *   Resolves when the process closes. `exitCode` is 1 when the process ended
 *   without a numeric exit code.
 * @throws {Error} Rejects when the child process emits an `error` event.
 */
export async function defaultCloudCommandRunner(request) {
    return await new Promise((resolve, reject) => {
        const child = spawn(request.command, request.args, {
            cwd: request.cwd,
            env: request.env,
            stdio: ['ignore', 'pipe', 'pipe']
        });
        // Decode on the stream so a multi-byte character split across two
        // chunks is reassembled instead of becoming replacement characters.
        child.stdout?.setEncoding('utf8');
        child.stderr?.setEncoding('utf8');
        let stdout = '';
        let stderr = '';
        child.stdout?.on('data', (chunk) => {
            stdout += chunk;
        });
        child.stderr?.on('data', (chunk) => {
            stderr += chunk;
        });
        child.once('error', (error) => {
            reject(error instanceof Error ? error : new Error(String(error)));
        });
        child.once('close', (code) => {
            resolve({
                exitCode: typeof code === 'number' ? code : 1,
                stdout,
                stderr
            });
        });
    });
}
243
/**
 * Caps a string at `MAX_LOG_CHARS` characters.
 *
 * @param {string} value - Text to cap.
 * @returns {string} `value` unchanged when it fits; otherwise its first
 *   `MAX_LOG_CHARS` characters followed by an ellipsis.
 */
function truncate(value) {
    return value.length > MAX_LOG_CHARS
        ? `${value.slice(0, MAX_LOG_CHARS)}…`
        : value;
}
249
/**
 * Builds a single-line error description from several output streams.
 *
 * @param {...string} values - Output strings, for example stderr then stdout.
 * @returns {string} The non-blank values, trimmed and joined with ` | `, then
 *   truncated; a fixed placeholder when every value is blank.
 */
function compactError(...values) {
    const parts = [];
    for (const value of values) {
        const trimmed = value.trim();
        if (trimmed.length > 0) {
            parts.push(trimmed);
        }
    }
    if (parts.length === 0) {
        return 'no stderr/stdout captured';
    }
    return truncate(parts.join(' | '));
}
@@ -4,6 +4,7 @@ import { mkdtemp, readFile, writeFile, mkdir, rm } from 'node:fs/promises';
4
4
  import { tmpdir } from 'node:os';
5
5
  import { join, relative } from 'node:path';
6
6
  import { isoTimestamp } from '../cli/utils/time.js';
7
+ import { resolveCodexCliBin } from '../cli/utils/codexCli.js';
7
8
  import { slugify } from '../cli/utils/strings.js';
8
9
  import { appendLearningAlert, ensureLearningSection } from './manifest.js';
9
10
  import { computePromptPackStamp, loadPromptPacks } from '../../../packages/orchestrator/src/instructions/promptPacks.js';
@@ -86,7 +87,7 @@ function composePrompt(promptBody, packStamp, problem, patch, scenarioSummary) {
86
87
  ];
87
88
  return segments.filter(Boolean).join('\n\n');
88
89
  }
89
- export async function createCodexCliCrystalizerClient(binary = process.env.CODEX_CLI_BIN ?? 'codex') {
90
+ export async function createCodexCliCrystalizerClient(binary = resolveCodexCliBin(process.env)) {
90
91
  const execFileAsync = promisify(execFile);
91
92
  return {
92
93
  async generate(prompt, options) {
@@ -151,6 +151,7 @@ export class TaskManager {
151
151
  build,
152
152
  test,
153
153
  review,
154
+ cloudExecution: build.cloudExecution ?? null,
154
155
  timestamp
155
156
  };
156
157
  }
@@ -4,6 +4,7 @@ import { createHash } from 'node:crypto';
4
4
  import { CloudRunsHttpError } from './CloudRunsHttpClient.js';
5
5
  import { sanitizeTaskId } from '../persistence/sanitizeTaskId.js';
6
6
  import { sanitizeRunId } from '../persistence/sanitizeRunId.js';
7
+ import { resolveRunDir } from '../../../scripts/lib/run-manifests.js';
7
8
  export class CloudSyncWorker {
8
9
  bus;
9
10
  client;
@@ -99,11 +100,25 @@ export class CloudSyncWorker {
99
100
  }
100
101
  }
101
102
  }
102
- buildManifestPath(summary) {
103
+ buildManifestPaths(summary) {
103
104
  const safeTaskId = sanitizeTaskId(summary.taskId);
104
105
  const safeRunId = sanitizeRunId(summary.runId);
105
- const runDir = join(this.runsDir, safeTaskId, safeRunId);
106
- return join(runDir, 'manifest.json');
106
+ const primaryRunDir = resolveRunDir({
107
+ runsRoot: this.runsDir,
108
+ taskId: safeTaskId,
109
+ runId: safeRunId,
110
+ layout: 'cli'
111
+ });
112
+ const fallbackRunDir = resolveRunDir({
113
+ runsRoot: this.runsDir,
114
+ taskId: safeTaskId,
115
+ runId: safeRunId,
116
+ layout: 'legacy'
117
+ });
118
+ return {
119
+ primary: join(primaryRunDir, 'manifest.json'),
120
+ fallback: join(fallbackRunDir, 'manifest.json')
121
+ };
107
122
  }
108
123
  async appendAuditLog(entry) {
109
124
  const safeTaskId = sanitizeTaskId(entry.summary.taskId);
@@ -146,7 +161,7 @@ export class CloudSyncWorker {
146
161
  return true;
147
162
  }
148
163
  async readManifestWithRetry(summary) {
149
- const manifestPath = this.buildManifestPath(summary);
164
+ const { primary, fallback } = this.buildManifestPaths(summary);
150
165
  let attempt = 0;
151
166
  let delay = this.manifestInitialDelayMs;
152
167
  let lastError;
@@ -154,13 +169,24 @@ export class CloudSyncWorker {
154
169
  while (attempt < this.manifestReadRetries) {
155
170
  attempt += 1;
156
171
  try {
157
- const contents = await readFile(manifestPath, 'utf-8');
172
+ const contents = await readFile(primary, 'utf-8');
158
173
  lastContents = contents;
159
174
  return JSON.parse(contents);
160
175
  }
161
176
  catch (error) {
162
- lastError = error;
163
- if (shouldRetryManifestRead(error) && attempt < this.manifestReadRetries) {
177
+ let candidateError = error;
178
+ if (isMissingPathError(error)) {
179
+ try {
180
+ const contents = await readFile(fallback, 'utf-8');
181
+ lastContents = contents;
182
+ return JSON.parse(contents);
183
+ }
184
+ catch (fallbackError) {
185
+ candidateError = fallbackError;
186
+ }
187
+ }
188
+ lastError = candidateError;
189
+ if (shouldRetryManifestRead(candidateError) && attempt < this.manifestReadRetries) {
164
190
  await new Promise((resolve) => setTimeout(resolve, delay));
165
191
  delay *= 2;
166
192
  continue;
@@ -196,6 +222,10 @@ function shouldRetryManifestRead(error) {
196
222
  const code = error?.code;
197
223
  return code === 'ENOENT' || code === 'EBUSY' || code === 'EMFILE';
198
224
  }
225
/**
 * Reports whether an error's `code` says the path does not exist.
 *
 * @param {unknown} error - Error to inspect; may be null or undefined.
 * @returns {boolean} `true` when `error.code` is `ENOENT` or `ENOTDIR`.
 */
function isMissingPathError(error) {
    return ['ENOENT', 'ENOTDIR'].includes(error?.code);
}
199
229
  function attemptJsonRecovery(contents) {
200
230
  const lastBrace = contents.lastIndexOf('}');
201
231
  if (lastBrace === -1) {
@@ -3,13 +3,14 @@ import { mkdir } from 'node:fs/promises';
3
3
  import { loadDesignConfig, designPipelineId } from '../../../packages/shared/config/index.js';
4
4
  import { sanitizeTaskId } from '../../../orchestrator/src/persistence/sanitizeTaskId.js';
5
5
  import { sanitizeRunId } from '../../../orchestrator/src/persistence/sanitizeRunId.js';
6
- import { resolveEnvironmentPaths } from '../../lib/run-manifests.js';
6
+ import { resolveEnvironmentPaths, resolveRunDir } from '../../lib/run-manifests.js';
7
7
  export async function loadDesignContext() {
8
8
  const { repoRoot, runsRoot, outRoot } = resolveEnvironmentPaths();
9
9
  const taskId = sanitizeTaskId(process.env.CODEX_ORCHESTRATOR_TASK_ID ?? process.env.MCP_RUNNER_TASK_ID ?? 'unknown-task');
10
10
  const rawRunId = process.env.CODEX_ORCHESTRATOR_RUN_ID ?? 'run-local';
11
11
  const runId = sanitizeRunId(rawRunId);
12
- const runDir = process.env.CODEX_ORCHESTRATOR_RUN_DIR ?? join(runsRoot, taskId, runId);
12
+ const runDir = process.env.CODEX_ORCHESTRATOR_RUN_DIR ??
13
+ resolveRunDir({ runsRoot, taskId, runId, layout: 'cli' });
13
14
  const manifestPath = process.env.CODEX_ORCHESTRATOR_MANIFEST_PATH ?? join(runDir, 'manifest.json');
14
15
  const designConfigPath = process.env.DESIGN_CONFIG_PATH ?? join(repoRoot, 'design.config.yaml');
15
16
  const config = await loadDesignConfig({ rootDir: repoRoot, filePath: designConfigPath });
@@ -2,6 +2,7 @@ import { access, readdir } from 'node:fs/promises';
2
2
  import { isAbsolute, join, resolve } from 'node:path';
3
3
  import process from 'node:process';
4
4
  const DEFAULT_TASK_ID = '0101';
5
+ const DEFAULT_RUN_LAYOUT = 'cli';
5
6
  function resolveRepoRoot() {
6
7
  const configured = process.env.CODEX_ORCHESTRATOR_ROOT;
7
8
  if (!configured) {
@@ -33,6 +34,19 @@ export function resolveEnvironmentPaths() {
33
34
  const taskId = process.env.MCP_RUNNER_TASK_ID ?? DEFAULT_TASK_ID;
34
35
  return { repoRoot, runsRoot, outRoot, taskId };
35
36
  }
37
/**
 * Builds the directory path for a run under the given runs root.
 *
 * @param {object} options
 * @param {string} options.runsRoot - Root directory that holds all runs.
 * @param {string} options.taskId - Task identifier path segment.
 * @param {string} options.runId - Run identifier path segment.
 * @param {'cli'|'legacy'} [options.layout] - `cli` yields
 *   `<runsRoot>/<taskId>/cli/<runId>`; `legacy` yields
 *   `<runsRoot>/<taskId>/<runId>`. Defaults to `DEFAULT_RUN_LAYOUT`.
 * @returns {string} The joined run directory path.
 * @throws {Error} When a required option is missing or the layout is unsupported.
 */
export function resolveRunDir(options) {
    const settings = options ?? {};
    const { runsRoot, taskId, runId } = settings;
    const layout = settings.layout === undefined ? DEFAULT_RUN_LAYOUT : settings.layout;
    if (!(runsRoot && taskId && runId)) {
        throw new Error('resolveRunDir requires runsRoot, taskId, and runId');
    }
    switch (layout) {
        case 'legacy':
            return join(runsRoot, taskId, runId);
        case 'cli':
            return join(runsRoot, taskId, 'cli', runId);
        default:
            throw new Error(`resolveRunDir received unsupported layout: ${layout}`);
    }
}
36
50
  export async function listDirectories(dirPath) {
37
51
  try {
38
52
  const entries = await readdir(dirPath, { withFileTypes: true });
package/docs/README.md CHANGED
@@ -1,11 +1,28 @@
1
1
  # Codex Orchestrator (Repository Guide)
2
2
 
3
- This document covers repository internals, contributor workflows, and deeper architecture. For end‑user install and usage instructions, see the main `README.md`.
3
+ > **Internal/Contributor guide:** This document covers repository internals and workflow details. End‑user installation and usage live in `README.md`.
4
4
 
5
5
  Codex Orchestrator is the coordination layer that glues together Codex-driven agents, run pipelines, approval policies, and evidence capture for multi-stage automation projects. It wraps a reusable orchestration core with a CLI that produces auditable manifests, integrates with control-plane validators, and syncs run results to downstream systems.
6
6
 
7
7
  > **At a glance:** Every run starts from a task description, writes the active CLI manifest to `.runs/<task-id>/cli/<run-id>/manifest.json`, emits a persisted run summary at `.runs/<task-id>/<run-id>/manifest.json`, mirrors human-readable data to `out/<task-id>/`, and can optionally sync to a remote control plane. Pipelines define the concrete commands (build, lint, test, etc.) that execute for a given task.
8
8
 
9
+ ## Evaluation & Metrics
10
+ - Evaluation playbook: `docs/guides/evaluation-playbook.md`.
11
+ - Metrics reference: `docs/reference/metrics-collab-context-rot.md`.
12
+
13
+ ## Collab vs MCP
14
+ - Decision guide: `docs/guides/collab-vs-mcp.md`.
15
+
16
+ ## Downstream init
17
+ - See `README.md` for the recommended quick-start flow.
18
+
19
+ ## Upstream Sync
20
+ - Codex CLI sync strategy: `docs/guides/upstream-codex-cli-sync.md`.
21
+
22
+ ## Release Notes
23
+ - Shipped skills note: `docs/release-notes-template-addendum.md`.
24
+ - Optional overview override: add and commit a release overview file at `.github/release-overview.md` before tagging; the release workflow uses it when present.
25
+
9
26
  ## How It Works
10
27
  - **Planner → Builder → Tester → Reviewer:** The core `TaskManager` (see `orchestrator/src/manager.ts`) wires together agent interfaces that decide *what* to run (planner), execute the selected pipeline stage (builder), verify results (tester), and give a final decision (reviewer).
11
28
  - **Execution modes:** Each plan item can flag `requires_cloud` and task metadata can set `execution.parallel`; the mode policy picks `mcp` (local MCP runtime) or `cloud` execution accordingly.
@@ -130,6 +147,7 @@ Notes:
130
147
  - `/prompts:diagnostics` takes `TASK=<task-id> MANIFEST=<path> [NOTES=<free text>]`, exports `MCP_RUNNER_TASK_ID=$TASK`, runs `npx @kbediako/codex-orchestrator start diagnostics --format json`, tails `.runs/$TASK/cli/<run-id>/manifest.json` (or `npx @kbediako/codex-orchestrator status --run <run-id> --watch --interval 10`), and records evidence to `/tasks`, `docs/TASKS.md`, `.agent/task/...`, `.runs/$TASK/metrics.json`, and `out/$TASK/state.json` using `$MANIFEST`.
131
148
  - `/prompts:review-handoff` takes `TASK=<task-id> MANIFEST=<path> NOTES=<goal + summary + risks + optional questions>`, re-exports `MCP_RUNNER_TASK_ID`, and (repo-only) runs `node scripts/delegation-guard.mjs`, `node scripts/spec-guard.mjs --dry-run`, `npm run lint`, `npm run test`, optional `npm run eval:test`, plus `npm run review` (wraps `codex review` against the current diff and includes the latest run manifest path as evidence). It also reminds you to log approvals in `$MANIFEST` and mirror the evidence to the same docs/metrics/state targets.
132
149
  - In CI / `--no-interactive` pipelines (or when stdin is not a TTY, or `CODEX_REVIEW_NON_INTERACTIVE=1` / `CODEX_NON_INTERACTIVE=1` / `CODEX_NO_INTERACTIVE=1`), `npm run review` prints the review handoff prompt (including evidence paths) and exits successfully instead of invoking `codex review`. Set `FORCE_CODEX_REVIEW=1` to run `codex review` in those environments.
150
+ - When forcing non-interactive review execution, `npm run review` enforces a timeout (`CODEX_REVIEW_TIMEOUT_SECONDS`, default `900`). Set `CODEX_REVIEW_TIMEOUT_SECONDS=0` to disable the timeout.
133
151
  - Always trigger diagnostics and review workflows through these prompts whenever you run the orchestrator so contributors consistently execute the required command sequences and capture auditable manifests.
134
152
 
135
153
  ### Identifier Guardrails
@@ -159,6 +177,7 @@ Notes:
159
177
  ## Persistence & Observability
160
178
  - `TaskStateStore` writes per-task snapshots with bounded lock retries; failures degrade gracefully while still writing the main manifest.
161
179
  - `RunManifestWriter` generates the canonical manifest JSON for each run (mirrored under `.runs/`), while metrics appenders and summary writers keep `out/` up to date.
180
+ - `collab_tool_calls` in the manifest captures collab tool call JSONL lines extracted from command stdout (bounded by `CODEX_ORCHESTRATOR_COLLAB_MAX_EVENTS`, default 200; set 0 to disable capture).
162
181
  - Heartbeat files and timestamps guard against stalled runs. `orchestrator/src/cli/metrics/metricsRecorder.ts` aggregates command durations, exit codes, and guardrail stats for later review.
163
182
  - Optional caps: `CODEX_ORCHESTRATOR_EXEC_EVENT_MAX_CHUNKS` limits captured exec chunk events per command (defaults to 500; set 0 for no cap), `CODEX_ORCHESTRATOR_TELEMETRY_MAX_EVENTS` caps in-memory telemetry events queued before flush (defaults to 1000; set 0 for no cap), and `CODEX_METRICS_PRIVACY_EVENTS_MAX` limits privacy decision events stored in `metrics.json` (-1 = no cap; `privacy_event_count` still reflects total).
164
183
 
@@ -178,6 +197,7 @@ Note: the commands below assume a source checkout; `scripts/` helpers are not in
178
197
  | `npm run eval:test` | Optional evaluation harness (enable when `evaluation/fixtures/**` is populated). |
179
198
  | `npm run docs:check` | Deterministically validates scripts/pipelines/paths referenced in agent-facing docs. |
180
199
  | `npm run docs:freshness` | Validates docs registry coverage + review recency; writes `out/<task-id>/docs-freshness.json`. |
200
+ | `npm run ci:cloud-canary` | Runs the cloud canary harness (`scripts/cloud-canary-ci.mjs`) to verify cloud lifecycle manifest + run-summary evidence; credential-gated by `CODEX_CLOUD_ENV_ID` and optional auth secrets (`CODEX_CLOUD_BRANCH` defaults to `main`). |
181
201
  | `node scripts/delegation-guard.mjs` | Enforces subagent delegation evidence before review (repo-only). |
182
202
  | `node scripts/spec-guard.mjs --dry-run` | Validates spec freshness; required before review (repo-only). |
183
203
  | `node scripts/diff-budget.mjs` | Guards against oversized diffs before review (repo-only; defaults: 25 files / 800 lines; supports explicit overrides). |
@@ -241,7 +261,7 @@ Check readiness with `codex-orchestrator doctor --format json` (reports DevTools
241
261
  Use the hi-fi pipeline to snapshot complex marketing sites (motion, interactions, tokens) while keeping the repo cloneable:
242
262
 
243
263
  1. **Configure the source:** Update `design.config.yaml` → `pipelines.hi_fi_design_toolkit.sources` with the target URL, slug, title, and breakpoints (the repo defaults to an empty `sources` list until you add one).
244
- 2. **Permit the domain:** Add (or update) the matching record in `compliance/permit.json` so Playwright, video capture, and live assets are explicitly approved for that origin.
264
+ 2. **Permit the domain:** Copy `compliance/permit.example.json` to `compliance/permit.json`, then add (or update) the matching record so Playwright, video capture, and live assets are explicitly approved for that origin.
245
265
  3. **Prep tooling:**
246
266
  - `npm install && npm run build`
247
267
  - `npm run setup:design-tools` (installs design-system deps) and ensure FFmpeg is available (`brew install ffmpeg` on macOS).
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@kbediako/codex-orchestrator",
3
- "version": "0.1.12",
4
- "license": "SEE LICENSE IN LICENSE",
3
+ "version": "0.1.14-alpha.1",
4
+ "license": "MIT",
5
5
  "type": "module",
6
6
  "bin": {
7
7
  "codex-orchestrator": "dist/bin/codex-orchestrator.js",
@@ -40,6 +40,7 @@
40
40
  "docs:archive-tasks": "node scripts/tasks-archive.mjs",
41
41
  "docs:freshness": "node scripts/docs-freshness.mjs --check",
42
42
  "docs:sync": "node --loader ts-node/esm scripts/docs-hygiene.ts --sync",
43
+ "ci:cloud-canary": "node scripts/cloud-canary-ci.mjs",
43
44
  "prelint": "node scripts/build-patterns-if-needed.mjs",
44
45
  "lint": "eslint orchestrator/src orchestrator/tests packages/orchestrator/src packages/orchestrator/tests packages/shared adapters evaluation/harness evaluation/tests --ext .ts,.tsx",
45
46
  "pack:audit": "node scripts/pack-audit.mjs",
@@ -75,6 +76,9 @@
75
76
  "eslint-plugin-patterns": "file:eslint-plugin-patterns",
76
77
  "jscodeshift": "^0.15.2",
77
78
  "json-schema-to-typescript": "^14.0.0",
79
+ "pixelmatch": "^7.1.0",
80
+ "playwright": "^1.57.0",
81
+ "pngjs": "^7.0.0",
78
82
  "ts-node": "^10.9.2",
79
83
  "typescript": "^5.4.0",
80
84
  "vitest": "^1.3.1"
@@ -154,6 +154,10 @@
154
154
  }
155
155
  }
156
156
  },
157
+ "collab_tool_calls": {
158
+ "type": ["array", "null"],
159
+ "items": { "$ref": "#/definitions/collabToolCall" }
160
+ },
157
161
  "child_runs": {
158
162
  "type": "array",
159
163
  "items": {
@@ -313,6 +317,51 @@
313
317
  }
314
318
  }
315
319
  },
320
+ "cloud_execution": {
321
+ "type": ["object", "null"],
322
+ "additionalProperties": false,
323
+ "required": [
324
+ "task_id",
325
+ "environment_id",
326
+ "status",
327
+ "status_url",
328
+ "submitted_at",
329
+ "completed_at",
330
+ "last_polled_at",
331
+ "poll_count",
332
+ "poll_interval_seconds",
333
+ "timeout_seconds",
334
+ "attempts",
335
+ "diff_path",
336
+ "diff_url",
337
+ "diff_status",
338
+ "apply_status",
339
+ "log_path",
340
+ "error"
341
+ ],
342
+ "properties": {
343
+ "task_id": { "type": ["string", "null"] },
344
+ "environment_id": { "type": ["string", "null"] },
345
+ "status": {
346
+ "type": "string",
347
+ "enum": ["queued", "running", "ready", "error", "failed", "cancelled", "unknown"]
348
+ },
349
+ "status_url": { "type": ["string", "null"] },
350
+ "submitted_at": { "type": ["string", "null"] },
351
+ "completed_at": { "type": ["string", "null"] },
352
+ "last_polled_at": { "type": ["string", "null"] },
353
+ "poll_count": { "type": "integer", "minimum": 0 },
354
+ "poll_interval_seconds": { "type": "integer", "minimum": 1 },
355
+ "timeout_seconds": { "type": "integer", "minimum": 1 },
356
+ "attempts": { "type": "integer", "minimum": 1 },
357
+ "diff_path": { "type": ["string", "null"] },
358
+ "diff_url": { "type": ["string", "null"] },
359
+ "diff_status": { "type": "string", "enum": ["pending", "available", "unavailable"] },
360
+ "apply_status": { "type": "string", "enum": ["not_requested", "succeeded", "failed"] },
361
+ "log_path": { "type": ["string", "null"] },
362
+ "error": { "type": ["string", "null"] }
363
+ }
364
+ },
316
365
  "privacy": {
317
366
  "type": ["object", "null"],
318
367
  "additionalProperties": false,
@@ -756,6 +805,40 @@
756
805
  },
757
806
  "additionalProperties": true
758
807
  },
808
+ "collabToolCall": {
809
+ "type": "object",
810
+ "required": [
811
+ "observed_at",
812
+ "stage_id",
813
+ "command_index",
814
+ "event_type",
815
+ "item_id",
816
+ "tool",
817
+ "status",
818
+ "sender_thread_id",
819
+ "receiver_thread_ids"
820
+ ],
821
+ "additionalProperties": false,
822
+ "properties": {
823
+ "observed_at": { "type": "string", "minLength": 1 },
824
+ "stage_id": { "type": "string", "minLength": 1 },
825
+ "command_index": { "type": "integer", "minimum": 1 },
826
+ "event_type": { "type": "string", "enum": ["item.started", "item.completed", "item.updated"] },
827
+ "item_id": { "type": "string", "minLength": 1 },
828
+ "tool": { "type": "string", "minLength": 1 },
829
+ "status": { "type": "string", "enum": ["in_progress", "completed", "failed"] },
830
+ "sender_thread_id": { "type": "string", "minLength": 1 },
831
+ "receiver_thread_ids": {
832
+ "type": "array",
833
+ "items": { "type": "string", "minLength": 1 }
834
+ },
835
+ "prompt": { "type": ["string", "null"] },
836
+ "agents_states": {
837
+ "type": ["object", "null"],
838
+ "additionalProperties": true
839
+ }
840
+ }
841
+ },
759
842
  "designArtifact": {
760
843
  "type": "object",
761
844
  "required": ["stage", "status", "relative_path"],
@@ -0,0 +1,21 @@
1
+ ---
2
+ name: collab-deliberation
3
+ description: Structure multi-agent brainstorming and deliberation (options, tradeoffs, decision framing) without drifting into implementation.
4
+ ---
5
+
6
+ # Collab Deliberation
7
+
8
+ Use this skill when the user asks for brainstorming, multiple approaches, pros/cons, or decision support. This skill is for **ideas**, not implementation.
9
+
10
+ ## Workflow
11
+
12
+ 1) Clarify the decision: summarize the goal, constraints, and success criteria.
13
+ 2) Generate options: 3–5 distinct approaches with short descriptions.
14
+ 3) Compare tradeoffs: cost, risk, speed, maintenance, and alignment with guardrails.
15
+ 4) Recommend: choose a recommended approach and explain why.
16
+ 5) Open questions: list 1–3 questions that would change the recommendation.
17
+
18
+ ## Guardrails
19
+ - Separate ideas from decisions.
20
+ - Do not implement or modify code unless explicitly asked.
21
+ - Keep outputs concise and action-oriented.
@@ -0,0 +1,32 @@
1
+ ---
2
+ name: collab-evals
3
+ description: Run collab/multi-agent eval scenarios (symbolic RLM, large-context, pause/resume, multi-hour checkpoints) and capture manifest-backed evidence.
4
+ ---
5
+
6
+ # Collab Evals
7
+
8
+ Use this skill to run repeatable collab evaluation scenarios and record evidence. Keep scope to evals; do not implement unrelated fixes.
9
+
10
+ ## Quick start
11
+
12
+ 1) Pick the scenario(s):
13
+ - Large-context symbolic RLM with collab subcalls.
14
+ - Multi-hour refactor with checkpoints.
15
+ - 24h pause/resume context-rot regression.
16
+ - Multi-day initiative (48–72h) with multiple resumes.
17
+
18
+ 2) Ensure task context:
19
+ - `export MCP_RUNNER_TASK_ID=<task-id>`
20
+
21
+ 3) Run the scenario using `codex-orchestrator start <pipeline> --format json` and record the manifest path.
22
+
23
+ ## Evidence checklist
24
+ - Manifest path under `.runs/<task-id>/cli/<run-id>/manifest.json`.
25
+ - Log path under `.runs/<task-id>/cli/<run-id>/runner.ndjson`.
26
+ - Findings recorded in `docs/findings/<date>-<topic>.md`.
27
+ - Task mirror update in `docs/TASKS.md` and task spec.
28
+
29
+ ## Guardrails
30
+ - Collab is additive; keep MCP as the control plane for approvals and audit trails.
31
+ - Cap collab event capture with `CODEX_ORCHESTRATOR_COLLAB_MAX_EVENTS` when needed.
32
+ - If pause/resume is required, use control endpoints or `codex-orchestrator resume` with manifest evidence.