@jhlee0619/codexloop 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/marketplace.json +34 -0
- package/.claude-plugin/plugin.json +8 -0
- package/.codex-plugin/plugin.json +38 -0
- package/LICENSE +21 -0
- package/README.md +425 -0
- package/assets/banner.png +0 -0
- package/bin/cloop +45 -0
- package/commands/iterate.md +25 -0
- package/commands/model.md +33 -0
- package/commands/result.md +17 -0
- package/commands/start.md +188 -0
- package/commands/status.md +10 -0
- package/commands/stop.md +12 -0
- package/package.json +60 -0
- package/prompts/evaluate.md +91 -0
- package/prompts/rank.md +97 -0
- package/prompts/suggest.md +69 -0
- package/schemas/evaluation.schema.json +65 -0
- package/schemas/loop-state.schema.json +103 -0
- package/schemas/proposal.schema.json +74 -0
- package/schemas/ranking.schema.json +77 -0
- package/scripts/lib/apply.mjs +254 -0
- package/scripts/lib/args.mjs +202 -0
- package/scripts/lib/codex-exec.mjs +318 -0
- package/scripts/lib/convergence.mjs +153 -0
- package/scripts/lib/iteration.mjs +484 -0
- package/scripts/lib/process.mjs +164 -0
- package/scripts/lib/prompts.mjs +53 -0
- package/scripts/lib/rank.mjs +149 -0
- package/scripts/lib/render.mjs +240 -0
- package/scripts/lib/state.mjs +378 -0
- package/scripts/lib/validate.mjs +71 -0
- package/scripts/lib/workspace.mjs +49 -0
- package/scripts/loop-companion.mjs +849 -0
- package/skills/cloop/SKILL.md +177 -0
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
// Lightweight CLI argument parser used by scripts/loop-companion.mjs.
|
|
2
|
+
//
|
|
3
|
+
// Design:
|
|
4
|
+
// - Supports `--key=value`, `--key value`, `--flag`, short `-k` aliases, and `--` passthrough.
|
|
5
|
+
// - Unknown flags (long or short) fall through to positionals so callers can forward them.
|
|
6
|
+
// - `splitRawArgumentString` tokenizes a raw $ARGUMENTS string with quote + backslash handling.
|
|
7
|
+
|
|
8
|
+
// Parses an argv-style array into named options and positionals.
//
// `config` may provide:
//   valueOptions      – names that consume a value (`--k=v` or `--k v`, `-k v`)
//   booleanOptions    – names that are flags (`--flag`, `--flag=false`, `-f`)
//   repeatableOptions – names whose values accumulate into an array
//   aliasMap          – written name → canonical name (used for short aliases)
//
// Returns `{ options, positionals }`. Everything after a bare `--`, a bare
// `-`, and any flag that is not declared in `config` ends up in
// `positionals` untouched. Throws an Error when a value-taking option is
// the last token and carries no inline value.
export function parseArgs(argv, config = {}) {
  const flags = new Set(config.booleanOptions ?? []);
  const valued = new Set(config.valueOptions ?? []);
  const repeatable = new Set(config.repeatableOptions ?? []);
  const aliases = config.aliasMap ?? {};

  const options = {};
  const positionals = [];

  // Repeatable options collect into an array; everything else is last-wins.
  const store = (name, value) => {
    if (!repeatable.has(name)) {
      options[name] = value;
      return;
    }
    if (!Array.isArray(options[name])) {
      options[name] = [];
    }
    options[name].push(value);
  };

  let cursor = 0;
  let literalMode = false;

  while (cursor < argv.length) {
    const token = argv[cursor];
    cursor += 1; // from here on, argv[cursor] is the token after this one

    if (literalMode) {
      positionals.push(token);
      continue;
    }
    if (token === "--") {
      literalMode = true;
      continue;
    }
    if (token === "-" || !token.startsWith("-")) {
      positionals.push(token);
      continue;
    }

    // Only long options understand the inline `--key=value` form.
    const isLong = token.startsWith("--");
    let written = token.slice(isLong ? 2 : 1);
    let inline;
    if (isLong) {
      const equalsAt = written.indexOf("=");
      if (equalsAt !== -1) {
        inline = written.slice(equalsAt + 1);
        written = written.slice(0, equalsAt);
      }
    }
    const name = aliases[written] ?? written;

    if (flags.has(name)) {
      // Any inline value other than the literal "false" switches the flag on.
      store(name, inline === undefined ? true : inline !== "false");
      continue;
    }

    if (valued.has(name) || repeatable.has(name)) {
      const value = inline ?? argv[cursor];
      if (value === undefined) {
        const label = isLong ? `--${written}` : `-${written}`;
        throw new Error(`Missing value for ${label}`);
      }
      store(name, value);
      if (inline === undefined) {
        cursor += 1; // the value was taken from the following token
      }
      continue;
    }

    // Undeclared flag: hand it back verbatim.
    positionals.push(token);
  }

  return { options, positionals };
}
|
|
96
|
+
|
|
97
|
+
// Splits `key=value` at the first `=`. Returns `[key, value]`, where value
// is `undefined` when the input contains no `=` at all.
function splitKeyValue(raw) {
  const at = raw.indexOf("=");
  return at === -1 ? [raw, undefined] : [raw.slice(0, at), raw.slice(at + 1)];
}
|
|
104
|
+
|
|
105
|
+
// Tokenizes a raw argument string the way a simple shell would.
//
// - Whitespace outside quotes separates tokens.
// - Single and double quotes group text; the quote characters are dropped.
//   An opened quote starts a token even if nothing follows, so `""` yields
//   an empty-string token.
// - A backslash makes the next character literal, inside or outside quotes.
//   A trailing lone backslash is kept as a literal backslash.
// - `null` / `undefined` input yields an empty array.
export function splitRawArgumentString(raw) {
  if (raw == null) {
    return [];
  }

  const tokens = [];
  let pending = null; // null means "no token in progress"
  let openQuote = null;
  let afterBackslash = false;

  const append = (text) => {
    pending = (pending ?? "") + text;
  };
  const flush = () => {
    if (pending !== null) {
      tokens.push(pending);
      pending = null;
    }
  };

  for (const ch of String(raw)) {
    if (afterBackslash) {
      append(ch);
      afterBackslash = false;
    } else if (ch === "\\") {
      afterBackslash = true;
    } else if (openQuote) {
      if (ch === openQuote) {
        openQuote = null;
      } else {
        append(ch);
      }
    } else if (ch === "'" || ch === "\"") {
      openQuote = ch;
      append(""); // marks the token as started without adding text
    } else if (/\s/.test(ch)) {
      flush();
    } else {
      append(ch);
    }
  }

  if (afterBackslash) {
    append("\\");
  }
  flush();
  return tokens;
}
|
|
164
|
+
|
|
165
|
+
// When argv is exactly one element, treats that element as a raw argument
// string: blank input becomes [], anything else is tokenized with
// splitRawArgumentString. An argv of any other length is returned as-is.
export function normalizeArgv(argv) {
  if (argv.length !== 1) {
    return argv;
  }
  const [raw] = argv;
  const isBlank = !raw || !raw.trim();
  return isBlank ? [] : splitRawArgumentString(raw);
}
|
|
175
|
+
|
|
176
|
+
// Converts a duration such as "250", "1.5s", "2m", "1h" or "3d" into whole
// milliseconds (rounded down). A missing unit means milliseconds and units
// are case-insensitive. `null`/`undefined` yields null, and a number is
// returned unchanged. Throws an Error for any other unparseable input.
export function parseDuration(value) {
  if (value == null) return null;
  if (typeof value === "number") return value;

  const parts = /^(\d+(?:\.\d+)?)(ms|s|m|h|d)?$/i.exec(String(value).trim());
  if (parts === null) {
    throw new Error(`Invalid duration: ${value}`);
  }

  const [, digits, suffix = "ms"] = parts;
  const millisPerUnit = { ms: 1, s: 1000, m: 60_000, h: 3_600_000, d: 86_400_000 };
  return Math.floor(Number(digits) * millisPerUnit[suffix.toLowerCase()]);
}
|
|
188
|
+
|
|
189
|
+
// Parses `value` as a base-10 integer and optionally range-checks it.
//
//   value – string or number; `null`/`undefined` yields null.
//   name  – label used in error messages (default "value").
//   min   – inclusive lower bound, skipped when null/undefined.
//   max   – inclusive upper bound, skipped when null/undefined.
//
// The whole (trimmed) input must be an optionally signed run of digits.
// Number.parseInt alone would quietly accept a numeric prefix, turning
// "12abc" into 12, "1.5" into 1 and "1e3" into 1, so the text is validated
// before it is converted.
//
// Throws an Error when the input is not an integer or is out of range.
export function parseInteger(value, { name = "value", min, max } = {}) {
  if (value == null) return null;

  const text = String(value).trim();
  const parsed = /^[+-]?\d+$/.test(text) ? Number.parseInt(text, 10) : Number.NaN;
  if (!Number.isFinite(parsed)) {
    throw new Error(`Invalid integer for ${name}: ${value}`);
  }
  if (min != null && parsed < min) {
    throw new Error(`${name} must be >= ${min} (got ${parsed})`);
  }
  if (max != null && parsed > max) {
    throw new Error(`${name} must be <= ${max} (got ${parsed})`);
  }
  return parsed;
}
|
|
@@ -0,0 +1,318 @@
|
|
|
1
|
+
// Thin wrapper around the `codex` CLI used for every step of the iteration
|
|
2
|
+
// loop (evaluate / suggest / rank). Always runs in `exec --json --ephemeral`
|
|
3
|
+
// mode with a JSON Schema attached via `--output-schema`, and extracts the
|
|
4
|
+
// structured response from `--output-last-message` for reliability.
|
|
5
|
+
//
|
|
6
|
+
// The binary is resolved via `resolveCodexBin()` which honors
|
|
7
|
+
// `CODEXLOOP_CODEX_BIN` — this is how tests swap in the mock fixture without
|
|
8
|
+
// changing runtime code.
|
|
9
|
+
|
|
10
|
+
import fs from "node:fs";
|
|
11
|
+
import os from "node:os";
|
|
12
|
+
import path from "node:path";
|
|
13
|
+
import process from "node:process";
|
|
14
|
+
|
|
15
|
+
import { runCommand, sleep } from "./process.mjs";
|
|
16
|
+
import { getSchemaPath, renderPrompt } from "./prompts.mjs";
|
|
17
|
+
|
|
18
|
+
// Executable name used by resolveCodexBin() when CODEXLOOP_CODEX_BIN is unset.
const DEFAULT_CODEX_BIN = "codex";

// Iteration step → schema name. codexCall() looks the step up here and
// passes the result to runCodex() as `schemaName`; a step with no entry is
// rejected as a usage error.
const STEP_SCHEMAS = {
  evaluate: "evaluation",
  suggest: "proposal",
  rank: "ranking"
};
|
|
25
|
+
|
|
26
|
+
// Returns the codex executable to run: the CODEXLOOP_CODEX_BIN environment
// variable when it is set to a non-empty value, otherwise the default name.
export function resolveCodexBin() {
  const override = process.env.CODEXLOOP_CODEX_BIN;
  if (override) {
    return override;
  }
  return DEFAULT_CODEX_BIN;
}
|
|
29
|
+
|
|
30
|
+
// Error type thrown for every failure in this module.
//
//   kind     – short failure category string (default "unknown")
//   stderr   – captured stderr text, if any (default "")
//   status   – process exit status, if known (default null)
//   attempts – number of attempts made when the error was raised (default 1)
export class CodexError extends Error {
  constructor(message, details = {}) {
    const { kind = "unknown", stderr = "", status = null, attempts = 1 } = details;
    super(message);
    Object.assign(this, { name: "CodexError", kind, stderr, status, attempts });
  }
}
|
|
40
|
+
|
|
41
|
+
// Maps a failed codex run to a failure kind by scanning its stderr
// (case-insensitively) for known phrases. Checks run in this order and the
// first hit wins: "rate-limit", "auth", "network", "schema". When nothing
// matches, the result is "parse" for exit status 0 and "unknown" otherwise.
export function classifyCodexFailure(stderr, status) {
  const text = String(stderr || "").toLowerCase();
  const mentions = (...needles) => needles.some((needle) => text.includes(needle));

  if (mentions("rate limit", "too many requests", "429")) {
    return "rate-limit";
  }

  // "auth" only counts together with "fail"; the other phrases count alone.
  const authFailed = mentions("auth") && mentions("fail");
  if (mentions("unauthorized", "not logged in", "not authenticated") || authFailed) {
    return "auth";
  }

  if (mentions("network", "timed out", "timeout", "econn", "etimedout", "dns")) {
    return "network";
  }

  if (mentions("schema", "jsonschema")) {
    return "schema";
  }

  return status === 0 ? "parse" : "unknown";
}
|
|
69
|
+
|
|
70
|
+
// True for the failure kinds that runCodex retries after a non-zero exit.
function isTransient(kind) {
  return ["rate-limit", "network"].includes(kind);
}
|
|
73
|
+
|
|
74
|
+
// Delay before the next retry: 500 ms after attempt 1, doubling with each
// further attempt, capped at 5000 ms.
function backoffMs(attempt) {
  const doublings = Math.max(0, attempt - 1);
  const uncapped = 500 * 2 ** doublings;
  return Math.min(5000, uncapped);
}
|
|
77
|
+
|
|
78
|
+
// Probes the codex binary by running `<bin> --version` in `cwd` with a 5 s
// timeout.
//
// Returns `{ available: true, version }` on exit status 0; otherwise
// `{ available: false, kind, detail }`, where kind is "missing" when the
// spawn failed with ENOENT and "error" for any other failure.
export function checkCodexAvailable({ cwd = process.cwd() } = {}) {
  const bin = resolveCodexBin();
  const probe = runCommand(bin, ["--version"], { cwd, timeoutMs: 5000 });
  const unavailable = (kind, detail) => ({ available: false, kind, detail });

  if (probe.error) {
    if (probe.error.code === "ENOENT") {
      return unavailable(
        "missing",
        `${bin} not found on PATH. Install it with 'npm install -g @openai/codex' or run /codex:setup from the codex plugin.`
      );
    }
    return unavailable("error", probe.error.message);
  }

  if (probe.status !== 0) {
    // Prefer stderr, then stdout, then the bare exit status.
    return unavailable("error", (probe.stderr || probe.stdout || `exit ${probe.status}`).trim());
  }

  return { available: true, version: (probe.stdout || probe.stderr || "").trim() };
}
|
|
97
|
+
|
|
98
|
+
// Raw wrapper. Spawns `codex exec --json --output-schema … < prompt` and
// returns the parsed structured output plus token usage and duration.
//
// Options:
//   prompt          – non-empty string sent to codex on stdin (required).
//   schemaName      – optional; resolved with getSchemaPath() and passed as
//                     `--output-schema`.
//   cwd             – working directory, also forwarded as `--cd` (required).
//   sandbox         – value for `--sandbox` (default "read-only").
//   model           – forwarded via `--model` when set.
//   reasoningEffort – forwarded via `-c model_reasoning_effort=<value>` when
//                     set. If model/reasoningEffort are unset, codex falls
//                     back to the defaults in ~/.codex/config.toml.
//   timeoutMs, maxBuffer – handed to runCommand unchanged.
//   retries         – extra attempts after the first (default 2). Coerced to
//                     a number so a CLI-supplied string such as "2" means two
//                     retries rather than being concatenated into "21".
//
// Retry policy: a spawn error is never retried; a non-zero exit is retried
// only when classifyCodexFailure() reports a transient kind; an empty or
// unparseable agent message is retried until the attempts run out.
//
// Resolves to { data, usage, events, durationMs, attempts, rawOutput }.
// Throws CodexError (kind "usage" for bad arguments, otherwise the error
// recorded by the last attempt).
export async function runCodex({
  prompt,
  schemaName,
  cwd,
  sandbox = "read-only",
  model = null,
  reasoningEffort = null,
  timeoutMs = 180_000,
  retries = 2,
  maxBuffer = 64 * 1024 * 1024
} = {}) {
  if (typeof prompt !== "string" || !prompt.trim()) {
    throw new CodexError("runCodex requires a non-empty prompt string", { kind: "usage" });
  }
  if (!cwd) {
    throw new CodexError("runCodex requires a cwd", { kind: "usage" });
  }

  const bin = resolveCodexBin();
  const schemaPath = schemaName ? getSchemaPath(schemaName) : null;
  const lastMessageFile = path.join(
    os.tmpdir(),
    `codexloop-last-${process.pid}-${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 8)}.txt`
  );

  const args = [
    "exec",
    "--json",
    "--sandbox", sandbox,
    "--ephemeral",
    "--skip-git-repo-check",
    "--cd", cwd,
    "--color", "never",
    "--output-last-message", lastMessageFile
  ];
  if (schemaPath) {
    args.push("--output-schema", schemaPath);
  }
  if (model) {
    args.push("--model", model);
  }
  if (reasoningEffort) {
    args.push("-c", `model_reasoning_effort=${reasoningEffort}`);
  }
  args.push("-"); // prompt on stdin

  // Best-effort removal: the file may legitimately not exist.
  const discardLastMessage = () => {
    try {
      fs.unlinkSync(lastMessageFile);
    } catch {
      // Nothing to clean up, or the file is already gone.
    }
  };

  // Always make at least one attempt, even when `retries` is not a number.
  const extraAttempts = Math.trunc(Number(retries));
  const maxAttempts = Number.isNaN(extraAttempts) ? 1 : Math.max(1, extraAttempts + 1);
  let lastError = null;

  try {
    for (let attempt = 1; attempt <= maxAttempts; attempt += 1) {
      // A failed earlier attempt may have left a message file behind; drop
      // it so it cannot be mistaken for this attempt's output.
      discardLastMessage();

      const startedAt = Date.now();
      const result = runCommand(bin, args, {
        cwd,
        input: prompt,
        stdio: ["pipe", "pipe", "pipe"],
        timeoutMs,
        maxBuffer
      });
      const durationMs = Date.now() - startedAt;

      if (result.error) {
        const kind = result.error.code === "ENOENT" ? "missing" : "spawn";
        lastError = new CodexError(
          `${bin} failed to spawn: ${result.error.message}`,
          { kind, stderr: result.stderr, attempts: attempt }
        );
        break; // spawn errors are not retry-worthy
      }

      const events = parseJsonlEvents(result.stdout);
      const usage = extractUsage(events);

      if (result.status !== 0) {
        const kind = classifyCodexFailure(result.stderr, result.status);
        lastError = new CodexError(
          `${bin} exec exited ${result.status}: ${(result.stderr || "").trim().slice(0, 500) || "(no stderr)"}`,
          { kind, stderr: result.stderr, status: result.status, attempts: attempt }
        );
        if (!isTransient(kind) || attempt >= maxAttempts) {
          break;
        }
        await sleep(backoffMs(attempt));
        continue;
      }

      // Prefer the --output-last-message file; fall back to the JSONL event
      // stream when the file is missing, unreadable or blank.
      let rawOutput = "";
      try {
        if (fs.existsSync(lastMessageFile)) {
          rawOutput = fs.readFileSync(lastMessageFile, "utf8");
        }
      } catch {
        // Unreadable file: handled by the event-stream fallback below.
      }
      if (!rawOutput.trim()) {
        rawOutput = extractLastAgentMessage(events) ?? "";
      }

      if (!rawOutput.trim()) {
        lastError = new CodexError("Codex returned no agent message", {
          kind: "empty-output",
          stderr: result.stderr,
          status: 0,
          attempts: attempt
        });
        if (attempt >= maxAttempts) break;
        await sleep(backoffMs(attempt));
        continue;
      }

      let data;
      try {
        data = parseJsonOutput(rawOutput);
      } catch (err) {
        lastError = new CodexError(`Failed to parse Codex JSON output: ${err.message}`, {
          kind: "parse",
          stderr: result.stderr,
          status: 0,
          attempts: attempt
        });
        if (attempt >= maxAttempts) break;
        await sleep(backoffMs(attempt));
        continue;
      }

      return {
        data,
        usage,
        events,
        durationMs,
        attempts: attempt,
        rawOutput
      };
    }
  } finally {
    // Runs on success, on failure and on unexpected throws alike.
    discardLastMessage();
  }

  throw lastError ?? new CodexError("Unknown codex exec failure", { kind: "unknown" });
}
|
|
240
|
+
|
|
241
|
+
// Step-level entry point. Looks up the schema name for `step`, renders the
// step's prompt template with `vars`, and forwards both to runCodex together
// with `opts`. Entries in `opts` are spread last, so they take precedence
// over the computed `prompt` and `schemaName`.
//
// Rejects with a CodexError of kind "usage" when `step` has no schema entry.
export async function codexCall(step, vars, opts = {}) {
  const schemaName = STEP_SCHEMAS[step];
  if (!schemaName) {
    throw new CodexError(`Unknown codex step: ${step}`, { kind: "usage" });
  }

  const request = {
    prompt: renderPrompt(step, vars),
    schemaName,
    ...opts
  };
  return runCodex(request);
}
|
|
256
|
+
|
|
257
|
+
// Parses JSONL text into an array of values, one per non-blank line.
// Lines that are not valid JSON are skipped. Empty or missing input yields
// an empty array.
function parseJsonlEvents(stdout) {
  if (!stdout) return [];

  return stdout
    .split(/\r?\n/)
    .map((line) => line.trim())
    .filter((line) => line !== "")
    .flatMap((line) => {
      try {
        // Wrapped in an array so a line that is itself a JSON array stays one event.
        return [JSON.parse(line)];
      } catch {
        // Malformed progress line from Codex — ignore.
        return [];
      }
    });
}
|
|
271
|
+
|
|
272
|
+
// Returns the text of the most recent `item.completed` event whose item is
// an `agent_message` with a string `text`, or null when there is none.
function extractLastAgentMessage(events) {
  const isAgentMessage = (evt) =>
    evt?.type === "item.completed" &&
    evt.item?.type === "agent_message" &&
    typeof evt.item.text === "string";

  // Search a reversed copy so the latest match wins and the input is untouched.
  const latest = [...events].reverse().find(isAgentMessage);
  return latest ? latest.item.text : null;
}
|
|
281
|
+
|
|
282
|
+
// Reads token usage from the most recent `turn.completed` event that carries
// a `usage` object. Counters that are absent default to 0, and all three are
// 0 when no such event exists.
function extractUsage(events) {
  const latest = [...events]
    .reverse()
    .find((evt) => evt?.type === "turn.completed" && evt.usage);
  const usage = latest ? latest.usage : {};

  return {
    inputTokens: usage.input_tokens ?? 0,
    cachedInputTokens: usage.cached_input_tokens ?? 0,
    outputTokens: usage.output_tokens ?? 0
  };
}
|
|
295
|
+
|
|
296
|
+
// Extracts a JSON value from the agent's final message. Strategies are tried
// in order and the first one that parses wins:
//   1. the whole trimmed text;
//   2. the body of a ``` / ```json fenced block;
//   3. the span from the first `{` to the last `}`.
// Throws Error("empty output") for blank input, Error("output is not valid
// JSON") when no brace span exists, and the JSON.parse error when the brace
// span itself is invalid.
function parseJsonOutput(raw) {
  const text = String(raw).trim();
  if (text === "") throw new Error("empty output");

  const candidates = [text];
  // Some Codex versions wrap JSON in a ```json fence; strip that.
  const fenced = /^```(?:json)?\s*([\s\S]*?)\s*```$/m.exec(text);
  if (fenced) {
    candidates.push(fenced[1]);
  }
  for (const candidate of candidates) {
    try {
      return JSON.parse(candidate);
    } catch {
      // Not parseable as-is; move on to the next strategy.
    }
  }

  // Last resort: find the outermost {...} block.
  const open = text.indexOf("{");
  const close = text.lastIndexOf("}");
  if (open === -1 || close <= open) {
    throw new Error("output is not valid JSON");
  }
  return JSON.parse(text.slice(open, close + 1));
}
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
// Per-iteration quality score + loop stopping criteria.
|
|
2
|
+
//
|
|
3
|
+
// Quality score q_i in [0, 1] weighted sum of six terms:
|
|
4
|
+
// testPassRate, issueReduction, winnerConfidence,
|
|
5
|
+
// typeClean, lintClean, distanceFromGoal
|
|
6
|
+
//
|
|
7
|
+
// Stop triggers (first match wins):
|
|
8
|
+
// 1. goal-met
|
|
9
|
+
// 2. regression (single drop >0.10 with validate.regression true)
|
|
10
|
+
// → divergence if previous iteration also regressed
|
|
11
|
+
// 3. negligible-improvement (|Δq| < ε for stableWindow iters AND q >= 0.75)
|
|
12
|
+
// 4. plateau (same Δq rule, q < 0.75)
|
|
13
|
+
// 5. budget-iterations
|
|
14
|
+
// 6. budget-time
|
|
15
|
+
// 7. budget-calls
|
|
16
|
+
|
|
17
|
+
// Weight of each term in the quality score computed by computeQualityScore().
// The six weights add up to 1.00.
export const QUALITY_WEIGHTS = Object.freeze({
  testPassRate: 0.30,
  issueReduction: 0.20,
  winnerConfidence: 0.15,
  typeClean: 0.15,
  lintClean: 0.10,
  distanceFromGoal: 0.10
});

// checkStopping() treats a qualityDelta below -REGRESSION_DROP as a regression drop.
export const REGRESSION_DROP = 0.10;
// When scores have flattened out, checkStopping() reports
// "negligible-improvement" at or above this score and "plateau" below it.
export const PLATEAU_QUALITY_THRESHOLD = 0.75;
|
|
28
|
+
|
|
29
|
+
// Clamps a number into [0, 1]. Anything that is not a finite number
// (NaN, ±Infinity, non-number values) becomes 0.
function clamp01(x) {
  if (!Number.isFinite(x) || x < 0) {
    return 0;
  }
  return x > 1 ? 1 : x;
}
|
|
35
|
+
|
|
36
|
+
// Computes the quality score for one iteration as the QUALITY_WEIGHTS-weighted
// sum of six terms, each clamped to [0, 1].
//
// Reads `iteration.validate`, `iteration.evaluate`, `iteration.ranking` and
// `state.openIssuesInitial`. Returns `{ score, terms }` with `score` rounded
// to six decimal places.
//
// Term sources:
//   testPassRate     – validate test counts, else validate.passed (1 or 0),
//                      else evaluate test counts, else 0.5. Zero total
//                      tests counts as a pass rate of 1.
//   issueReduction   – 1 - (open issues now / initial open issues), with the
//                      initial count floored at 1.
//   winnerConfidence – ranking.winner.confidence, or 0.5 when not finite.
//   typeClean        – 1 at zero type errors, falling linearly to 0 at 10.
//   lintClean        – 1 at zero lint errors, falling linearly to 0 at 20.
//   distanceFromGoal – 1 - evaluate.distanceFromGoal (0.5 when missing).
export function computeQualityScore(iteration, state) {
  const { validate, evaluate, ranking } = iteration;

  const hasTestCounts = (source) =>
    Boolean(source) &&
    Number.isFinite(source.passingTests) &&
    Number.isFinite(source.failingTests);
  const passRateOf = (source) => {
    const ran = source.passingTests + source.failingTests;
    return ran > 0 ? source.passingTests / ran : 1;
  };
  // 1 when there are no errors, sliding to 0 once `count` reaches `ceiling`.
  const cleanliness = (count, ceiling) =>
    count === 0 ? 1 : Math.max(0, 1 - Math.min(1, count / ceiling));

  let testPassRate;
  if (hasTestCounts(validate)) {
    testPassRate = passRateOf(validate);
  } else if (validate?.passed === true) {
    testPassRate = 1;
  } else if (validate?.passed === false) {
    testPassRate = 0;
  } else if (hasTestCounts(evaluate)) {
    testPassRate = passRateOf(evaluate);
  } else {
    testPassRate = 0.5;
  }

  const openNow = Array.isArray(evaluate?.openIssues) ? evaluate.openIssues.length : 0;
  const openAtStart = Math.max(1, state?.openIssuesInitial ?? openNow);
  const issueReduction = 1 - Math.min(1, openNow / openAtStart);

  const confidence = ranking?.winner?.confidence;
  const winnerConfidence = Number.isFinite(confidence) ? confidence : 0.5;

  const typeClean = cleanliness(validate?.typeErrors ?? evaluate?.typeErrors ?? 0, 10);
  const lintClean = cleanliness(validate?.lintErrors ?? evaluate?.lintErrors ?? 0, 20);

  // The evaluator reports distance (lower is better); the term is its complement.
  const distanceFromGoal = 1 - clamp01(evaluate?.distanceFromGoal ?? 0.5);

  const terms = {
    testPassRate: clamp01(testPassRate),
    issueReduction: clamp01(issueReduction),
    winnerConfidence: clamp01(winnerConfidence),
    typeClean: clamp01(typeClean),
    lintClean: clamp01(lintClean),
    distanceFromGoal: clamp01(distanceFromGoal)
  };

  const weighted = Object.keys(QUALITY_WEIGHTS).reduce(
    (sum, key) => sum + QUALITY_WEIGHTS[key] * terms[key],
    0
  );
  return { score: Number(weighted.toFixed(6)), terms };
}
|
|
84
|
+
|
|
85
|
+
// Decides whether the loop should stop, given the persisted loop state.
//
// Returns `{ shouldStop: false }` or `{ shouldStop: true, reason }`. Reasons
// are checked in this order and the first match wins:
//   1. "goal-met"               – last iteration's verdict or stopReason is
//                                 "goal-met" and its validation did not fail.
//   2. "divergence"             – last iteration dropped by more than
//                                 REGRESSION_DROP with validate.regression
//                                 true AND the previous iteration's
//                                 validate.regression was also true. A single
//                                 regression does not stop the loop here.
//   3. "negligible-improvement" – every |Δq| across the last `stableWindow`
//                                 steps is below `epsilon` and the last score
//                                 is at least PLATEAU_QUALITY_THRESHOLD.
//   4. "plateau"                – same Δq rule with a lower last score.
//   5. "budget-iterations", 6. "budget-time", 7. "budget-calls".
//
// Outcome-based reasons are evaluated before budgets so that a goal reached
// on the final allowed iteration is reported as "goal-met" rather than being
// masked by "budget-iterations". Budgets are still checked when no iteration
// has run yet.
export function checkStopping(state) {
  const iterations = state.iterations ?? [];
  const last = iterations[iterations.length - 1];

  const outcomeReason = last ? stopReasonFromIterations(state, iterations, last) : null;
  const reason = outcomeReason ?? stopReasonFromBudget(state);
  return reason ? { shouldStop: true, reason } : { shouldStop: false };
}

// Returns the goal-met / divergence / plateau stop reason, or null if none applies.
function stopReasonFromIterations(state, iterations, last) {
  // goal-met: trust only if validate didn't fail
  const goalMetByVerdict =
    last.evaluate?.verdict === "goal-met" || last.stopReason === "goal-met";
  if (goalMetByVerdict && last.validate?.passed !== false) {
    return "goal-met";
  }

  // divergence: two regressions in a row, the latest with a large score drop
  const regressedHard =
    last.qualityDelta != null &&
    last.qualityDelta < -REGRESSION_DROP &&
    last.validate?.regression === true;
  if (regressedHard && iterations[iterations.length - 2]?.validate?.regression === true) {
    return "divergence";
  }

  // negligible-improvement / plateau: needs stableWindow + 1 scores to form
  // stableWindow consecutive deltas.
  const epsilon = state.convergence?.epsilon ?? 0.02;
  const stableWindow = state.convergence?.stableWindow ?? 2;
  if (iterations.length >= stableWindow + 1) {
    const recent = iterations.slice(-stableWindow - 1);
    const flat = recent
      .slice(1)
      .every((entry, i) =>
        Math.abs((entry.qualityScore ?? 0) - (recent[i].qualityScore ?? 0)) < epsilon
      );
    if (flat) {
      return (last.qualityScore ?? 0) >= PLATEAU_QUALITY_THRESHOLD
        ? "negligible-improvement"
        : "plateau";
    }
  }

  return null;
}

// Returns the first exhausted budget as a stop reason, or null if none is
// exhausted. A budget limit that is missing counts as unlimited.
function stopReasonFromBudget(state) {
  const budget = state.budget;
  const consumed = budget?.consumed;

  if ((consumed?.iterations ?? 0) >= (budget?.maxIterations ?? Infinity)) {
    return "budget-iterations";
  }

  const startedAtMs = consumed?.startedAtMs;
  if (startedAtMs != null && Date.now() - startedAtMs >= (budget?.maxElapsedMs ?? Infinity)) {
    return "budget-time";
  }

  if ((consumed?.codexCalls ?? 0) >= (budget?.maxCodexCalls ?? Infinity)) {
    return "budget-calls";
  }

  return null;
}
|