ultimate-pi 0.22.2 → 0.23.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.pi/extensions/harness-run-context.ts +62 -6
- package/.pi/lib/harness-run-context.ts +38 -0
- package/.pi/lib/plan-human-gates.ts +82 -0
- package/.pi/prompts/harness-plan.md +2 -0
- package/.pi/prompts/harness-review.md +2 -0
- package/.pi/prompts/harness-run.md +4 -3
- package/CHANGELOG.md +7 -0
- package/package.json +1 -1
|
@@ -74,6 +74,7 @@ import {
|
|
|
74
74
|
readReviewOutcomeFromRun,
|
|
75
75
|
reconcileReviewRouting,
|
|
76
76
|
reconcileStaleExecuteCompletion,
|
|
77
|
+
refreshRunContextProgress,
|
|
77
78
|
relPathUnderActiveRun,
|
|
78
79
|
resetRunContextForHarnessAuto,
|
|
79
80
|
resolveArgsForCommand,
|
|
@@ -814,6 +815,13 @@ function registerHarnessRunStatusCommand(
|
|
|
814
815
|
if (ctx.hasUI) ctx.ui.notify(msg, "warning");
|
|
815
816
|
return;
|
|
816
817
|
}
|
|
818
|
+
ctxState = await refreshRunContextProgress(
|
|
819
|
+
projectRoot,
|
|
820
|
+
ctxState,
|
|
821
|
+
entries,
|
|
822
|
+
);
|
|
823
|
+
active.set(ctxState);
|
|
824
|
+
persistContext(pi, ctxState);
|
|
817
825
|
let summary: PlanPacketSummary | null = null;
|
|
818
826
|
for (let i = entries.length - 1; i >= 0; i--) {
|
|
819
827
|
const entry = entries[i] as SessionEntryLike;
|
|
@@ -1253,12 +1261,29 @@ function registerPlanApprovalCapture(
|
|
|
1253
1261
|
if (event.toolName === "ask_user") {
|
|
1254
1262
|
const details = event.details as { cancelled?: boolean; input?: unknown };
|
|
1255
1263
|
if (details?.cancelled) {
|
|
1256
|
-
|
|
1264
|
+
// Ignore cancels from later planning forks (e.g. debate profile choice):
|
|
1265
|
+
// only treat cancel as Phase-0 clarification failure when clarification
|
|
1266
|
+
// is not already locked ready.
|
|
1267
|
+
const runRoot = join(
|
|
1257
1268
|
process.cwd(),
|
|
1258
|
-
|
|
1269
|
+
".pi",
|
|
1270
|
+
"harness",
|
|
1271
|
+
"runs",
|
|
1272
|
+
runCtx.run_id ?? "",
|
|
1259
1273
|
);
|
|
1260
|
-
|
|
1261
|
-
|
|
1274
|
+
const clarDoc = runCtx.run_id
|
|
1275
|
+
? await readTaskClarificationDoc(runRoot)
|
|
1276
|
+
: null;
|
|
1277
|
+
const clarReady =
|
|
1278
|
+
String(clarDoc?.status ?? "").toLowerCase() === "ready";
|
|
1279
|
+
if (!clarReady) {
|
|
1280
|
+
const synced = await syncPlanLastOutcomeFromTaskClarification(
|
|
1281
|
+
process.cwd(),
|
|
1282
|
+
runCtx,
|
|
1283
|
+
);
|
|
1284
|
+
Object.assign(runCtx, synced);
|
|
1285
|
+
persistContext(pi, runCtx);
|
|
1286
|
+
}
|
|
1262
1287
|
} else if (
|
|
1263
1288
|
!isPlanApprovalAskUser(
|
|
1264
1289
|
(details?.input ?? {}) as {
|
|
@@ -1295,6 +1320,36 @@ function registerPlanApprovalCapture(
|
|
|
1295
1320
|
});
|
|
1296
1321
|
}
|
|
1297
1322
|
|
|
1323
|
+
function registerExecutorHandoffReconcile(
|
|
1324
|
+
pi: ExtensionAPI,
|
|
1325
|
+
active: ActiveContextAccess,
|
|
1326
|
+
): void {
|
|
1327
|
+
pi.on("tool_result", async (event, ctx) => {
|
|
1328
|
+
if (event.isError || event.toolName !== "submit_executor_handoff") return;
|
|
1329
|
+
const entries = getEntries(ctx);
|
|
1330
|
+
const runCtx = getLatestRunContext(entries) ?? active.get();
|
|
1331
|
+
if (!runCtx?.run_id) return;
|
|
1332
|
+
const projectRoot = process.cwd();
|
|
1333
|
+
const refreshed = await refreshRunContextProgress(
|
|
1334
|
+
projectRoot,
|
|
1335
|
+
runCtx,
|
|
1336
|
+
entries,
|
|
1337
|
+
);
|
|
1338
|
+
Object.assign(runCtx, refreshed);
|
|
1339
|
+
active.set(runCtx);
|
|
1340
|
+
persistContext(pi, runCtx);
|
|
1341
|
+
if (refreshed.last_completed_step === "execute") {
|
|
1342
|
+
const notify = `Execute finished (${refreshed.last_outcome ?? "done"}). Next: ${refreshed.next_recommended_command ?? "/harness-review"}`;
|
|
1343
|
+
pi.appendEntry("harness-step-handoff", {
|
|
1344
|
+
next_command: refreshed.next_recommended_command,
|
|
1345
|
+
execution_status: refreshed.last_outcome,
|
|
1346
|
+
phase: refreshed.phase,
|
|
1347
|
+
});
|
|
1348
|
+
if (ctx.hasUI) ctx.ui.notify(notify, "info");
|
|
1349
|
+
}
|
|
1350
|
+
});
|
|
1351
|
+
}
|
|
1352
|
+
|
|
1298
1353
|
async function guardToolCall(input: {
|
|
1299
1354
|
event: { toolName: string; input: unknown };
|
|
1300
1355
|
ctx: { sessionManager: { getEntries(): unknown[] } };
|
|
@@ -1828,7 +1883,7 @@ async function handleAgentEnd(input: {
|
|
|
1828
1883
|
activeCtx.run_id,
|
|
1829
1884
|
projectRoot,
|
|
1830
1885
|
);
|
|
1831
|
-
if (parsed?.command === "harness-run") {
|
|
1886
|
+
if (parsed?.command === "harness-run" || parsed?.command === "harness-auto") {
|
|
1832
1887
|
let execStatus = statuses.executionStatus;
|
|
1833
1888
|
if (!execStatus) {
|
|
1834
1889
|
const handoff = await readExecutorHandoffFromRun(
|
|
@@ -1895,7 +1950,7 @@ async function handleAgentEnd(input: {
|
|
|
1895
1950
|
activeCtx.next_recommended_command = next;
|
|
1896
1951
|
activeCtx.updated_at = new Date().toISOString();
|
|
1897
1952
|
if (
|
|
1898
|
-
parsed?.command === "harness-run" &&
|
|
1953
|
+
(parsed?.command === "harness-run" || parsed?.command === "harness-auto") &&
|
|
1899
1954
|
activeCtx.last_outcome === "completed"
|
|
1900
1955
|
) {
|
|
1901
1956
|
syncPolicyFromRunContext(input.pi, entries, activeCtx);
|
|
@@ -2579,6 +2634,7 @@ export default function harnessRunContext(pi: ExtensionAPI) {
|
|
|
2579
2634
|
});
|
|
2580
2635
|
|
|
2581
2636
|
registerPlanApprovalCapture(pi, activeAccess);
|
|
2637
|
+
registerExecutorHandoffReconcile(pi, activeAccess);
|
|
2582
2638
|
registerHarnessToolCallGuards(pi, activeAccess);
|
|
2583
2639
|
registerHarnessRunStatusCommand(pi, activeAccess);
|
|
2584
2640
|
|
|
@@ -2407,6 +2407,44 @@ export async function reconcileStaleExecuteCompletion(
|
|
|
2407
2407
|
return synced;
|
|
2408
2408
|
}
|
|
2409
2409
|
|
|
2410
|
+
/** Reconcile disk artifacts and recompute next_recommended_command for status UI. */
|
|
2411
|
+
export async function refreshRunContextProgress(
|
|
2412
|
+
projectRoot: string,
|
|
2413
|
+
ctx: HarnessRunContext,
|
|
2414
|
+
entries: unknown[] = [],
|
|
2415
|
+
): Promise<HarnessRunContext> {
|
|
2416
|
+
let synced = await reconcileStaleExecuteCompletion(projectRoot, ctx, entries);
|
|
2417
|
+
synced = await reconcileReviewRouting(projectRoot, synced);
|
|
2418
|
+
const statuses = await resolveCompletionStatuses(
|
|
2419
|
+
entries,
|
|
2420
|
+
synced.run_id,
|
|
2421
|
+
projectRoot,
|
|
2422
|
+
);
|
|
2423
|
+
const reviewComplete =
|
|
2424
|
+
synced.last_completed_step === "review" ||
|
|
2425
|
+
synced.last_completed_step === "adversary";
|
|
2426
|
+
const remediationClass = await resolveRemediationClassForRun(
|
|
2427
|
+
synced.run_id,
|
|
2428
|
+
projectRoot,
|
|
2429
|
+
);
|
|
2430
|
+
synced.next_recommended_command = nextStepAfterOutcome({
|
|
2431
|
+
phase: synced.phase,
|
|
2432
|
+
planStatus: synced.plan_ready ? "ready" : statuses.planStatus,
|
|
2433
|
+
lastCompletedStep: synced.last_completed_step,
|
|
2434
|
+
lastOutcome: synced.last_outcome,
|
|
2435
|
+
executionStatus: statuses.executionStatus,
|
|
2436
|
+
evalStatus: statuses.evalStatus,
|
|
2437
|
+
adversaryComplete: statuses.adversaryComplete,
|
|
2438
|
+
aborted: synced.status === "aborted",
|
|
2439
|
+
remediationClass,
|
|
2440
|
+
steerAttempt: synced.steer_attempt ?? 0,
|
|
2441
|
+
steerMaxAttempts: synced.steer_max_attempts ?? steerMaxAttemptsFromEnv(),
|
|
2442
|
+
reviewComplete,
|
|
2443
|
+
});
|
|
2444
|
+
synced.updated_at = nowIso();
|
|
2445
|
+
return synced;
|
|
2446
|
+
}
|
|
2447
|
+
|
|
2410
2448
|
export async function blockingHarnessAutoCommandReason(
|
|
2411
2449
|
command: string,
|
|
2412
2450
|
activeCtx: HarnessRunContext | null,
|
|
@@ -28,6 +28,33 @@ import {
|
|
|
28
28
|
const EXPLICIT_ACCEPTANCE_RE =
|
|
29
29
|
/\b(acceptance|success criteria|definition of done|done when|must (pass|satisfy)|out of scope|in scope)\b/i;
|
|
30
30
|
|
|
31
|
+
function logPlanHumanGate(payload: {
|
|
32
|
+
runId: string;
|
|
33
|
+
hypothesisId: string;
|
|
34
|
+
location: string;
|
|
35
|
+
message: string;
|
|
36
|
+
data: Record<string, unknown>;
|
|
37
|
+
}): void {
|
|
38
|
+
// #region agent log
|
|
39
|
+
fetch("http://127.0.0.1:7928/ingest/a5d40896-34cb-4f12-97db-df7ada0b22f0", {
|
|
40
|
+
method: "POST",
|
|
41
|
+
headers: {
|
|
42
|
+
"Content-Type": "application/json",
|
|
43
|
+
"X-Debug-Session-Id": "f7763e",
|
|
44
|
+
},
|
|
45
|
+
body: JSON.stringify({
|
|
46
|
+
sessionId: "f7763e",
|
|
47
|
+
runId: payload.runId,
|
|
48
|
+
hypothesisId: payload.hypothesisId,
|
|
49
|
+
location: payload.location,
|
|
50
|
+
message: payload.message,
|
|
51
|
+
data: payload.data,
|
|
52
|
+
timestamp: Date.now(),
|
|
53
|
+
}),
|
|
54
|
+
}).catch(() => {});
|
|
55
|
+
// #endregion
|
|
56
|
+
}
|
|
57
|
+
|
|
31
58
|
type SessionEntryLike = {
|
|
32
59
|
type?: string;
|
|
33
60
|
customType?: string;
|
|
@@ -190,11 +217,51 @@ export async function resolvePlanHumanGateStatus(
|
|
|
190
217
|
const runDir = join(projectRoot, ".pi", "harness", "runs", runId);
|
|
191
218
|
const clar = await isTaskClarificationReady(runDir);
|
|
192
219
|
const clarDoc = clar.ok ? await readTaskClarificationDoc(runDir) : null;
|
|
220
|
+
logPlanHumanGate({
|
|
221
|
+
runId,
|
|
222
|
+
hypothesisId: "H3",
|
|
223
|
+
location: "plan-human-gates.ts:resolvePlanHumanGateStatus:clar",
|
|
224
|
+
message: "Task clarification readiness evaluated",
|
|
225
|
+
data: {
|
|
226
|
+
runDir,
|
|
227
|
+
clarOk: clar.ok,
|
|
228
|
+
clarErrors: clar.errors,
|
|
229
|
+
docStatus: String(clarDoc?.status ?? ""),
|
|
230
|
+
docEngagementSource:
|
|
231
|
+
typeof clarDoc?.user_engagement === "object" &&
|
|
232
|
+
clarDoc?.user_engagement !== null
|
|
233
|
+
? String(
|
|
234
|
+
(
|
|
235
|
+
clarDoc.user_engagement as {
|
|
236
|
+
source?: string;
|
|
237
|
+
}
|
|
238
|
+
).source ?? "",
|
|
239
|
+
)
|
|
240
|
+
: "",
|
|
241
|
+
},
|
|
242
|
+
});
|
|
193
243
|
const humanGate = validateTaskClarificationHumanGate(entries, clarDoc, {
|
|
194
244
|
quick: opts?.quick,
|
|
195
245
|
taskSummary: opts?.taskSummary,
|
|
196
246
|
allowFollowUpMessage: opts?.lastOutcome === "needs_clarification",
|
|
197
247
|
});
|
|
248
|
+
logPlanHumanGate({
|
|
249
|
+
runId,
|
|
250
|
+
hypothesisId: "H1-H2",
|
|
251
|
+
location: "plan-human-gates.ts:resolvePlanHumanGateStatus:humanGate",
|
|
252
|
+
message: "Human gate evaluated for phase0 ask_user requirement",
|
|
253
|
+
data: {
|
|
254
|
+
humanGateOk: humanGate.ok,
|
|
255
|
+
humanGateErrors: humanGate.errors,
|
|
256
|
+
allowFollowUpMessage: opts?.lastOutcome === "needs_clarification",
|
|
257
|
+
hasTaskClarificationAskUserSincePlanCommand:
|
|
258
|
+
hasTaskClarificationAskUserSincePlanCommand(entries),
|
|
259
|
+
hasClarificationFollowUpUserMessage:
|
|
260
|
+
hasClarificationFollowUpUserMessage(entries),
|
|
261
|
+
indexOfLastPlanCommand: indexOfLastPlanCommand(entries),
|
|
262
|
+
entriesLen: entries.length,
|
|
263
|
+
},
|
|
264
|
+
});
|
|
198
265
|
const phase0Ready = clar.ok && humanGate.ok;
|
|
199
266
|
const phase0NeedsAskUser = clar.ok && !humanGate.ok;
|
|
200
267
|
const approvalRecorded = hasPlanUserApproval(entries, {
|
|
@@ -244,6 +311,21 @@ export async function resolvePlanHumanGateStatus(
|
|
|
244
311
|
} else if (approvalRequired && !approvalRecorded) {
|
|
245
312
|
nextRequiredAction = "approve_plan then create_plan (Phase 6)";
|
|
246
313
|
}
|
|
314
|
+
logPlanHumanGate({
|
|
315
|
+
runId,
|
|
316
|
+
hypothesisId: "H4",
|
|
317
|
+
location: "plan-human-gates.ts:resolvePlanHumanGateStatus:result",
|
|
318
|
+
message: "Resolved plan human gate status",
|
|
319
|
+
data: {
|
|
320
|
+
phase0Ready,
|
|
321
|
+
phase0NeedsAskUser,
|
|
322
|
+
debateComplete,
|
|
323
|
+
debateRequired,
|
|
324
|
+
approvalRequired,
|
|
325
|
+
approvalRecorded,
|
|
326
|
+
nextRequiredAction,
|
|
327
|
+
},
|
|
328
|
+
});
|
|
247
329
|
|
|
248
330
|
return {
|
|
249
331
|
phase0Ready,
|
|
@@ -188,6 +188,8 @@ subagent({ agentScope: "both", agent: "harness/planning/execution-plan-author",
|
|
|
188
188
|
|
|
189
189
|
Merge `execution_plan` into draft `plan-packet.yaml` (`write_harness_yaml`). Save `artifacts/execution-plan-draft.yaml` the same way.
|
|
190
190
|
|
|
191
|
+
The `execution_plan` must make testing expectations explicit: decide whether unit, integration, and e2e/end-to-end tests are applicable for each changed surface based on risk and implementation scope; add work items/done criteria to create or update applicable tests; list relevant verification commands; and record a short rationale when a test level is not applicable. Do not hard-require all three test levels for every change — make the applicability decision visible.
|
|
192
|
+
|
|
191
193
|
## Phase 4c — Deterministic quality gate (hard stop)
|
|
192
194
|
|
|
193
195
|
**Practice:** Harness engineering — never trust the model for graph validity.
|
|
@@ -75,6 +75,8 @@ Ensure `artifacts/ls-lint-signal.yaml` exists (from `/harness-run` or write from
|
|
|
75
75
|
|
|
76
76
|
Run project tests if the approved `PlanPacket` or spawn context lists a test command. Capture stdout paths only — do not paste full logs into the next spawn.
|
|
77
77
|
|
|
78
|
+
Verify the testing obligation itself: the approved `PlanPacket` or spawn context must show planned applicability decisions for unit, integration, and e2e/end-to-end tests, and executor evidence must show applicable tests were implemented or updated and run. If a test level was not applicable, require a clear rationale tied to risk and changed surface; missing planned or executed applicable testing is a benchmark failure.
|
|
79
|
+
|
|
78
80
|
Write `artifacts/benchmark-log.yaml` via `write_harness_yaml` when any shell step ran:
|
|
79
81
|
|
|
80
82
|
```yaml
|
|
@@ -52,14 +52,15 @@ Note `violation_count` in run notes (do not block execute on pre-existing violat
|
|
|
52
52
|
1. Confirm `[HarnessActivePlan]` / extension reports plan ready.
|
|
53
53
|
2. Build `HarnessSpawnContext` with `mode: execute`, `plan_packet_path`, `run_dir`, `acceptance_checks` from plan file.
|
|
54
54
|
3. Include **`critical_path_work_item_ids`** from `execution_plan.schedule_metadata` in spawn task when present — executor should tackle limiting-step items first (Grove).
|
|
55
|
-
4.
|
|
55
|
+
4. Include the plan's testing expectations in the spawn task: the executor must implement or update applicable unit, integration, and e2e/end-to-end tests, run the relevant verification commands, and report command evidence or a rationale for any non-applicable test level in `validation_summary`.
|
|
56
|
+
5. Spawn (max **1** agent per call):
|
|
56
57
|
|
|
57
58
|
```
|
|
58
59
|
subagent({ agentScope: "both", agent: "harness/running/executor", task: "<HarnessSpawnContext + handoff + critical path hint>" })
|
|
59
60
|
```
|
|
60
61
|
|
|
61
|
-
|
|
62
|
-
|
|
62
|
+
6. Parse subprocess output JSON (`execution_status`, validations, rollback refs) from tool result text.
|
|
63
|
+
7. Parent persists trace/handoff artifacts under run dir if needed; do not self-review.
|
|
63
64
|
|
|
64
65
|
## Post-work — Structural observation (parent)
|
|
65
66
|
|
package/CHANGELOG.md
CHANGED
package/package.json
CHANGED