@jhlee0619/codexloop 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,149 @@
1
+ // Deterministic ranking math. The "judge" Codex call returns dimensional
2
+ // scores + a declared winner; this module re-computes the weighted sum from
3
+ // the dimensional scores and picks the winner itself, so the runtime's
4
+ // decision is reproducible and does not depend on the judge's own arithmetic.
5
+ //
6
+ // Tiebreaker rule (when top-2 weighted scores are within 0.03):
7
+ // 1. higher correctness
8
+ // 2. higher riskInverse (lower risk)
9
+ // 3. smaller patch size (fewer diff lines)
10
+
11
// Weight of each score dimension in the ranking sum. The six weights add up
// to 1.0. Insertion order is significant: DIMENSIONS is derived from it and
// fixes the order in which the weighted sum is accumulated.
export const WEIGHTS = Object.freeze(
  Object.fromEntries([
    ["correctness", 0.3],
    ["requirementSatisfaction", 0.25],
    ["maintainability", 0.15],
    ["testability", 0.1],
    ["simplicity", 0.1],
    ["riskInverse", 0.1]
  ])
);

// Names of the scored dimensions, in WEIGHTS insertion order.
export const DIMENSIONS = Object.freeze(Object.entries(WEIGHTS).map(([name]) => name));

// Top-2 weighted scores closer than this are treated as a near-tie.
export const TIE_EPSILON = 0.03;
23
+
24
// Clamp a score into the closed interval [0, 1].
// Anything that is not a finite number (NaN, +/-Infinity, strings,
// undefined, null, ...) is mapped to 0.
export function clamp01(x) {
  if (!Number.isFinite(x)) {
    return 0;
  }
  return x < 0 ? 0 : x > 1 ? 1 : x;
}
30
+
31
// Weighted sum of a score entry's dimensions, rounded to 6 decimal places.
// Each dimension is clamped to [0, 1] first; a missing entry or a missing
// dimension contributes 0.
export function computeWeightedScore(scoreEntry) {
  const total = DIMENSIONS.reduce(
    (acc, dim) => acc + WEIGHTS[dim] * clamp01(scoreEntry?.[dim] ?? 0),
    0
  );
  return Number(total.toFixed(6));
}
38
+
39
// Size of a patch in lines, used as the last ranking tiebreaker.
//
// Returns 0 for anything that is not a non-empty string. A single trailing
// newline is treated as the terminator of the last line rather than as the
// start of an extra empty one, so "a\nb\n" and "a\nb" both measure 2 lines;
// otherwise a patch would be penalised merely for ending with a newline.
export function measurePatchSize(patch) {
  if (typeof patch !== "string" || patch.length === 0) return 0;
  const body = patch.endsWith("\n") ? patch.slice(0, -1) : patch;
  return body.split("\n").length;
}
43
+
44
// Reward-hacking floor, applied at ranking time so a blocked proposal is
// never picked as the winner. (Per the original note this mirrors a check in
// apply.mjs; that file is not visible here.)
//
// A proposal trips the floor when `modifiesTests` is exactly `true` and its
// trimmed `testChangeJustification` is shorter than 20 characters. In that
// case a copy of the score entry is returned with `correctness` set to 0 and
// `_rewardHackingFloor: true`; otherwise the entry is returned untouched.
// The input entry is never mutated.
export function applyRewardHackingFloor(scoreEntry, proposal) {
  if (!proposal) return scoreEntry;

  const touchesTests = proposal.modifiesTests === true;
  const rationale = String(proposal.testChangeJustification ?? "").trim();
  if (!touchesTests || rationale.length >= 20) {
    return scoreEntry;
  }

  return { ...scoreEntry, correctness: 0, _rewardHackingFloor: true };
}
58
+
59
// The core runtime authority: given the judge's ranking output and the
// original proposals, return a canonical ranking the loop will trust.
//
// Parameters:
//   ranking   - judge output. `ranking.scores` must be a non-empty array of
//               per-proposal entries ({ proposalId, ...dimension scores });
//               `ranking.winner` ({ id, justification, confidence }) and
//               `ranking.rejections` ({ [proposalId]: reason }) are optional.
//   proposals - optional iterable of proposals, matched to score entries by
//               `id`; supplies `patch`, `modifiesTests` and
//               `testChangeJustification`.
//
// Returns { scores, winner, rejections, tiebreaker, disagreement,
// judgeWinnerId, runtimeWinnerId }. `scores` keeps the judge's order and adds
// the recomputed `weighted` value to every entry.
//
// Throws Error when `ranking.scores` is missing or empty.
//
// Selection rules:
//   * Proposals that trip the reward-hacking floor get weighted = 0 and are
//     ordered after every non-floored proposal, even one that itself scores 0.
//   * Otherwise the higher weighted score wins, except that when the top two
//     are within TIE_EPSILON of each other the tiebreaker rule decides:
//     higher correctness, then higher riskInverse, then smaller patch.
//   * Tiebreak dimensions are clamped to [0, 1] like the weighted sum, so a
//     missing or non-numeric dimension counts as 0 instead of breaking the
//     comparison.
export function recomputeWinner(ranking, proposals) {
  if (!ranking || !Array.isArray(ranking.scores) || ranking.scores.length === 0) {
    throw new Error("recomputeWinner: ranking.scores is empty");
  }

  const proposalById = new Map();
  for (const proposal of proposals ?? []) {
    if (proposal?.id) proposalById.set(proposal.id, proposal);
  }

  const canonical = ranking.scores.map((entry) => {
    const proposal = proposalById.get(entry.proposalId);
    const floored = applyRewardHackingFloor(entry, proposal);
    // Floored proposals are disqualified: their weighted score is forced to 0.
    const weighted = floored._rewardHackingFloor ? 0 : computeWeightedScore(floored);
    return {
      ...floored,
      weighted,
      _patchSize: measurePatchSize(proposal?.patch)
    };
  });

  const sorted = [...canonical].sort(compareCanonicalEntries);

  let tiebreaker = null;
  let wonOnTiebreak = false;
  if (sorted.length >= 2) {
    const [leader, runnerUp] = sorted;
    // A disqualified proposal never gets to tiebreak against a clean one.
    const sameStanding =
      Boolean(leader._rewardHackingFloor) === Boolean(runnerUp._rewardHackingFloor);
    const gap = Math.abs(leader.weighted - runnerUp.weighted);
    if (sameStanding && gap < TIE_EPSILON) {
      if (compareTiebreakDimensions(leader, runnerUp) > 0) {
        // Near-tie on weighted score and the runner-up is better on the
        // tiebreak dimensions: it takes the win.
        sorted[0] = runnerUp;
        sorted[1] = leader;
        wonOnTiebreak = true;
      }
      tiebreaker = buildTiebreakerReason(sorted[0], sorted[1]);
    }
  }

  const winnerEntry = sorted[0];
  const runtimeWinnerId = winnerEntry.proposalId;
  const judgeWinnerId = ranking.winner?.id ?? null;
  const disagreement = judgeWinnerId !== null && judgeWinnerId !== runtimeWinnerId;

  const rejections = { ...(ranking.rejections ?? {}) };
  // If the runtime overrode the judge, the judge may have listed the actual
  // winner as rejected; the winner must not appear among the rejections.
  delete rejections[runtimeWinnerId];
  for (const entry of canonical) {
    if (entry.proposalId === runtimeWinnerId) continue;
    if (!rejections[entry.proposalId]) {
      rejections[entry.proposalId] = describeRejection(entry, winnerEntry);
    }
    if (entry._rewardHackingFloor) {
      rejections[entry.proposalId] =
        `reward-hacking floor: correctness forced to 0 (modifiesTests without justification). ` +
        (rejections[entry.proposalId] ?? "");
    }
  }

  const shownWeight = winnerEntry.weighted.toFixed(3);
  const fallbackJustification = wonOnTiebreak
    ? `won tiebreaker on ${tiebreaker} with weighted score (${shownWeight}) in deterministic re-computation`
    : `highest weighted score (${shownWeight}) in deterministic re-computation`;
  // The judge's justification describes the judge's pick; reuse it only when
  // the runtime did not override that pick.
  const judgeJustification = disagreement ? null : ranking.winner?.justification;

  return {
    scores: canonical.map(({ _patchSize, _rewardHackingFloor, ...rest }) => rest),
    winner: {
      id: runtimeWinnerId,
      justification: judgeJustification ?? fallbackJustification,
      // Judge-reported confidence, passed through unchanged; consult
      // `disagreement` before relying on it.
      confidence:
        typeof ranking.winner?.confidence === "number" ? ranking.winner.confidence : null
    },
    rejections,
    tiebreaker,
    disagreement,
    judgeWinnerId,
    runtimeWinnerId
  };
}

// Sort comparator for canonical entries: non-floored before floored, then
// higher weighted score, then the tiebreak dimensions.
function compareCanonicalEntries(a, b) {
  const standing =
    Number(Boolean(a._rewardHackingFloor)) - Number(Boolean(b._rewardHackingFloor));
  if (standing !== 0) return standing;
  if (a.weighted !== b.weighted) return b.weighted - a.weighted;
  return compareTiebreakDimensions(a, b);
}

// Tiebreak comparator: negative when `a` ranks ahead of `b`. Order of checks:
// higher correctness, higher riskInverse, smaller patch size.
function compareTiebreakDimensions(a, b) {
  const byCorrectness = clamp01(b.correctness) - clamp01(a.correctness);
  if (byCorrectness !== 0) return byCorrectness;
  const byRisk = clamp01(b.riskInverse) - clamp01(a.riskInverse);
  if (byRisk !== 0) return byRisk;
  return a._patchSize - b._patchSize;
}

// Default rejection reason for a non-winning entry that the judge gave no
// reason for.
function describeRejection(entry, winnerEntry) {
  const own = entry.weighted.toFixed(3);
  const top = winnerEntry.weighted.toFixed(3);
  if (entry.weighted < winnerEntry.weighted) {
    return `weighted ${own} below winner ${top}`;
  }
  if (entry._rewardHackingFloor && !winnerEntry._rewardHackingFloor) {
    return `weighted ${own} disqualified against winner ${top}`;
  }
  return `weighted ${own} lost tiebreaker to winner ${top}`;
}

// Human-readable name of the first tiebreak dimension on which `a` (the
// winner) and `b` (the runner-up) differ. Values are clamped the same way the
// comparator clamps them, so missing dimensions are shown as 0.000.
function buildTiebreakerReason(a, b) {
  const corrA = clamp01(a.correctness);
  const corrB = clamp01(b.correctness);
  if (corrA !== corrB) {
    return `correctness (${corrA.toFixed(3)} vs ${corrB.toFixed(3)})`;
  }
  const riskA = clamp01(a.riskInverse);
  const riskB = clamp01(b.riskInverse);
  if (riskA !== riskB) {
    return `riskInverse (${riskA.toFixed(3)} vs ${riskB.toFixed(3)})`;
  }
  if (a._patchSize !== b._patchSize) {
    return `patch size (${a._patchSize} vs ${b._patchSize} lines)`;
  }
  return "exact tie — deterministic order";
}
@@ -0,0 +1,240 @@
1
+ // Terminal-friendly rendering for /cloop:status, /cloop:result, and the
2
+ // per-iteration transcript printed by /cloop:iterate. All helpers take a
3
+ // `state` (optionally an `iteration`) and return a string — no I/O.
4
+
5
// Collapse every run of whitespace to a single space, trim, and cap the
// result at `max` characters (default 80). Over-long text is cut and given a
// "..." suffix. null/undefined become the empty string.
function truncate(text, max = 80) {
  if (text == null) {
    return "";
  }
  const flat = String(text).replace(/\s+/g, " ").trim();
  if (flat.length > max) {
    const keep = Math.max(0, max - 3);
    return `${flat.slice(0, keep)}...`;
  }
  return flat;
}
11
+
12
// Fixed-point rendering of a number with `digits` decimals (default 3).
// Anything that is not a finite number renders as "?".
function fmtNum(n, digits = 3) {
  return Number.isFinite(n) ? n.toFixed(digits) : "?";
}
16
+
17
// Signed rendering of a delta with 3 decimals: values >= 0 get an explicit
// "+" prefix, negatives keep their "-". Non-finite input renders as "?".
function fmtDelta(n) {
  if (!Number.isFinite(n)) {
    return "?";
  }
  const sign = n >= 0 ? "+" : "";
  return `${sign}${n.toFixed(3)}`;
}
22
+
23
// Compact duration rendering: "250ms", "42s", "3m7s", "2h15m".
// Non-finite or non-positive input renders as "?".
// Sub-second values are floored to whole milliseconds so a fractional
// duration (e.g. from a high-resolution timer) prints as "12ms" rather than
// "12.345678ms".
function fmtMs(ms) {
  if (!Number.isFinite(ms) || ms <= 0) return "?";
  const wholeMs = Math.floor(ms);
  if (wholeMs < 1000) return `${wholeMs}ms`;
  const s = Math.floor(wholeMs / 1000);
  if (s < 60) return `${s}s`;
  const m = Math.floor(s / 60);
  const rs = s % 60;
  if (m < 60) return `${m}m${rs}s`;
  const h = Math.floor(m / 60);
  const rm = m % 60;
  return `${h}h${rm}m`;
}
35
+
36
// Escape `|` so free-form text cannot end a Markdown table cell early.
function escapeTableCell(text) {
  return String(text).replace(/\|/g, "\\|");
}

// Render the loop state as a Markdown status table, followed by an
// "Iteration history" table when `state.iterations` is a non-empty array.
// Pure: returns the report as a string and performs no I/O.
//
// Free-form values (goal text, stop reason, error text, verdict, winner id)
// have `|` escaped so they cannot break the table layout. The seed commit is
// shown abbreviated to 12 characters.
export function renderStatusReport(state) {
  const lines = [];
  lines.push("# CodexLoop status");
  lines.push("");
  lines.push(`| field | value |`);
  lines.push(`|----------|-------|`);
  lines.push(`| loopId | \`${state.loopId ?? "(none)"}\` |`);
  lines.push(`| status | **${state.status}** |`);
  lines.push(`| mode | ${state.mode ?? "interactive"} |`);
  lines.push(`| goal | ${escapeTableCell(truncate(state.goal?.text ?? "(unspecified)", 120))} |`);
  if (Array.isArray(state.goal?.acceptanceCriteria) && state.goal.acceptanceCriteria.length > 0) {
    lines.push(`| criteria | ${state.goal.acceptanceCriteria.length} items |`);
  }
  if (state.goal?.seedCommit) {
    // String() guards against a non-string commit id.
    lines.push(`| seed | \`${String(state.goal.seedCommit).slice(0, 12)}\` |`);
  }

  const consumed = state.budget?.consumed ?? {};
  const maxIt = state.budget?.maxIterations ?? "?";
  const maxCalls = state.budget?.maxCodexCalls ?? "?";
  const elapsed = consumed.elapsedMs ?? 0;
  const maxTime = state.budget?.maxElapsedMs ?? null;
  lines.push(
    `| budget | ${consumed.iterations ?? 0}/${maxIt} iter, ${consumed.codexCalls ?? 0}/${maxCalls} calls, ${fmtMs(elapsed)}${maxTime ? `/${fmtMs(maxTime)}` : ""} |`
  );

  const iterations = Array.isArray(state.iterations) ? state.iterations : [];
  if (iterations.length > 0) {
    const last = iterations[iterations.length - 1];
    lines.push(
      `| quality | last=${fmtNum(last.qualityScore)} (Δ ${fmtDelta(last.qualityDelta)}) |`
    );
  }
  if (state.stopReason) {
    lines.push(`| stop | ${escapeTableCell(state.stopReason)} |`);
  }
  if (state.error) {
    // Accept either a plain string or an error-like object.
    const errText = typeof state.error === "string" ? state.error : state.error.message ?? JSON.stringify(state.error);
    lines.push(`| error | ${escapeTableCell(truncate(errText, 200))} |`);
  }

  if (iterations.length > 0) {
    lines.push("");
    lines.push("## Iteration history");
    lines.push("");
    lines.push("| # | verdict | q | Δq | winner | apply | validate |");
    lines.push("|---|---------|---|----|--------|-------|----------|");
    for (const iter of iterations) {
      // First matching apply outcome wins: applied > empty > skipped > error.
      const applyTag = iter.apply?.applied
        ? "ok"
        : iter.apply?.empty
          ? "empty"
          : iter.apply?.skipped
            ? "skip"
            : iter.apply?.error
              ? "fail"
              : "-";
      const validateTag = iter.validate?.skipped
        ? "skip"
        : iter.validate?.passed === true
          ? "pass"
          : iter.validate?.passed === false
            ? "fail"
            : "-";
      const verdict = escapeTableCell(iter.evaluate?.verdict ?? "?");
      const winner = escapeTableCell(iter.acceptedProposalId ?? "-");
      lines.push(
        `| ${iter.index} | ${verdict} | ${fmtNum(iter.qualityScore)} | ${fmtDelta(iter.qualityDelta)} | ${winner} | ${applyTag} | ${validateTag} |`
      );
    }
  }

  return lines.join("\n");
}
107
+
108
// Turn an error value into display text: strings pass through, error-like
// objects contribute their `message`, anything else is JSON-encoded. This is
// the same rule renderStatusReport applies to `state.error`.
function iterationErrorText(err) {
  if (typeof err === "string") return err;
  return err?.message ?? JSON.stringify(err);
}

// Render one iteration as a Markdown transcript section. Each phase
// (evaluate, suggest, rank, apply, validate, quality, stop, error) is emitted
// only when the corresponding field is present on `iteration`.
// Returns "(no iteration)" for a missing iteration. Pure: no I/O.
export function renderIterationReport(iteration) {
  if (!iteration) return "(no iteration)";
  const lines = [];
  lines.push(`## Iteration ${iteration.index}${iteration.dryRun ? " [DRY-RUN]" : ""}`);
  lines.push("");

  if (iteration.evaluate) {
    lines.push(
      `**evaluate**: verdict=${iteration.evaluate.verdict}, distance=${fmtNum(iteration.evaluate.distanceFromGoal, 2)}, openIssues=${iteration.evaluate.openIssues?.length ?? 0}`
    );
    if (iteration.evaluate.rationale) {
      lines.push(`> ${truncate(iteration.evaluate.rationale, 400)}`);
    }
  }

  if (Array.isArray(iteration.proposals) && iteration.proposals.length > 0) {
    lines.push("");
    lines.push(`**suggest**: ${iteration.proposals.length} proposals`);
    for (const p of iteration.proposals) {
      lines.push(
        `- \`${p.id}\` — ${truncate(p.approach, 120)} (risk=${p.estimatedRisk}, impact=${p.estimatedImpact})`
      );
    }
  }

  if (iteration.ranking) {
    lines.push("");
    lines.push(`**rank**: winner=\`${iteration.ranking.winner?.id ?? "?"}\``);
    for (const s of iteration.ranking.scores ?? []) {
      lines.push(
        ` - \`${s.proposalId}\`: weighted=${fmtNum(s.weighted)} (corr=${fmtNum(s.correctness, 2)}, req=${fmtNum(s.requirementSatisfaction, 2)}, risk⁻=${fmtNum(s.riskInverse, 2)})`
      );
    }
    if (iteration.ranking.tiebreaker) {
      lines.push(` tiebreaker: ${iteration.ranking.tiebreaker}`);
    }
    if (iteration.ranking.disagreement) {
      lines.push(
        ` ⚠ judge picked \`${iteration.ranking.judgeWinnerId}\` but runtime overrode to \`${iteration.ranking.winner?.id}\``
      );
    }
    for (const [id, reason] of Object.entries(iteration.ranking.rejections ?? {})) {
      lines.push(` ✗ \`${id}\`: ${truncate(reason, 200)}`);
    }
  }

  if (iteration.apply) {
    lines.push("");
    // First matching outcome wins: applied > empty > skipped > error.
    if (iteration.apply.applied) {
      lines.push(
        `**apply**: applied ${iteration.apply.filesTouched?.length ?? 0} file(s), HEAD=\`${iteration.apply.postSha?.slice(0, 12) ?? "?"}\``
      );
    } else if (iteration.apply.empty) {
      lines.push("**apply**: empty patch (no code change this iteration)");
    } else if (iteration.apply.skipped) {
      lines.push(`**apply**: skipped (${iteration.apply.skipped})`);
    } else if (iteration.apply.error) {
      lines.push(`**apply**: FAILED — ${truncate(iterationErrorText(iteration.apply.error), 300)}`);
    }
    if (iteration.apply.hackingFindings?.length) {
      for (const f of iteration.apply.hackingFindings) {
        lines.push(
          ` ⚠ reward-hacking: ${f.kind}${f.file ? ` (${f.file})` : ""}${f.summary ? ` — ${truncate(f.summary, 200)}` : ""}`
        );
      }
    }
  }

  if (iteration.validate) {
    lines.push("");
    if (iteration.validate.skipped) {
      lines.push(`**validate**: skipped (${iteration.validate.skipped})`);
    } else {
      const passed = iteration.validate.passed;
      lines.push(
        `**validate**: ${passed === true ? "pass" : passed === false ? "FAIL" : "?"}${iteration.validate.regression ? " (regression)" : ""}`
      );
      for (const c of iteration.validate.commands ?? []) {
        lines.push(
          ` - ${c.kind}: exit=${c.status ?? "?"} ${fmtMs(c.durationMs)} — \`${truncate(c.cmd, 120)}\``
        );
      }
    }
  }

  if (iteration.qualityScore != null) {
    lines.push("");
    lines.push(`**quality**: ${fmtNum(iteration.qualityScore)} (Δ ${fmtDelta(iteration.qualityDelta)})`);
  }

  if (iteration.stopReason) {
    lines.push("");
    lines.push(`**STOP**: ${iteration.stopReason}`);
  }

  if (iteration.error) {
    lines.push("");
    lines.push(`**ERROR**: ${truncate(iterationErrorText(iteration.error), 400)}`);
  }

  return lines.join("\n");
}
210
+
211
// Assemble the full result report: the status report, a "---" rule, then
// either the single iteration whose `index` equals `iterationIndex` (or a
// "not found" line) or, when no index is given, every iteration in
// `fullIterations`. With `withDiff` set and a seed commit recorded on the
// goal, a closing section names the `git diff` command to run.
// Pure: returns the report as a string and performs no I/O.
export function renderResultReport(state, fullIterations, { iterationIndex = null, withDiff = false } = {}) {
  const out = [renderStatusReport(state), "", "---", ""];
  const iterations = fullIterations ?? [];

  if (iterationIndex == null) {
    for (const iter of iterations) {
      out.push(renderIterationReport(iter), "");
    }
  } else {
    const hit = iterations.find((candidate) => candidate.index === iterationIndex);
    out.push(hit ? renderIterationReport(hit) : `No iteration ${iterationIndex} found.`);
  }

  if (withDiff && state.goal?.seedCommit) {
    out.push(
      "",
      "## Cumulative diff since seed commit",
      "",
      `(run \`git diff ${state.goal.seedCommit} HEAD\` in the target repo)`
    );
  }

  return out.join("\n");
}