ultimate-pi 0.22.2 → 0.23.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -74,6 +74,7 @@ import {
74
74
  readReviewOutcomeFromRun,
75
75
  reconcileReviewRouting,
76
76
  reconcileStaleExecuteCompletion,
77
+ refreshRunContextProgress,
77
78
  relPathUnderActiveRun,
78
79
  resetRunContextForHarnessAuto,
79
80
  resolveArgsForCommand,
@@ -814,6 +815,13 @@ function registerHarnessRunStatusCommand(
814
815
  if (ctx.hasUI) ctx.ui.notify(msg, "warning");
815
816
  return;
816
817
  }
818
+ ctxState = await refreshRunContextProgress(
819
+ projectRoot,
820
+ ctxState,
821
+ entries,
822
+ );
823
+ active.set(ctxState);
824
+ persistContext(pi, ctxState);
817
825
  let summary: PlanPacketSummary | null = null;
818
826
  for (let i = entries.length - 1; i >= 0; i--) {
819
827
  const entry = entries[i] as SessionEntryLike;
@@ -1253,12 +1261,29 @@ function registerPlanApprovalCapture(
1253
1261
  if (event.toolName === "ask_user") {
1254
1262
  const details = event.details as { cancelled?: boolean; input?: unknown };
1255
1263
  if (details?.cancelled) {
1256
- const synced = await syncPlanLastOutcomeFromTaskClarification(
1264
+ // Ignore cancels from later planning forks (e.g. debate profile choice):
1265
+ // only treat cancel as Phase-0 clarification failure when clarification
1266
+ // is not already locked ready.
1267
+ const runRoot = join(
1257
1268
  process.cwd(),
1258
- runCtx,
1269
+ ".pi",
1270
+ "harness",
1271
+ "runs",
1272
+ runCtx.run_id ?? "",
1259
1273
  );
1260
- Object.assign(runCtx, synced);
1261
- persistContext(pi, runCtx);
1274
+ const clarDoc = runCtx.run_id
1275
+ ? await readTaskClarificationDoc(runRoot)
1276
+ : null;
1277
+ const clarReady =
1278
+ String(clarDoc?.status ?? "").toLowerCase() === "ready";
1279
+ if (!clarReady) {
1280
+ const synced = await syncPlanLastOutcomeFromTaskClarification(
1281
+ process.cwd(),
1282
+ runCtx,
1283
+ );
1284
+ Object.assign(runCtx, synced);
1285
+ persistContext(pi, runCtx);
1286
+ }
1262
1287
  } else if (
1263
1288
  !isPlanApprovalAskUser(
1264
1289
  (details?.input ?? {}) as {
@@ -1295,6 +1320,36 @@ function registerPlanApprovalCapture(
1295
1320
  });
1296
1321
  }
1297
1322
 
/**
 * Registers a `tool_result` listener that reconciles the run context after a
 * successful `submit_executor_handoff` tool call.
 *
 * On each matching event the listener refreshes the current run context via
 * `refreshRunContextProgress`, merges the result into that context object,
 * publishes it through `active.set`, and persists it. When the refreshed
 * context reports `execute` as the last completed step, it also appends a
 * `harness-step-handoff` entry and, if a UI is attached, shows an info
 * notification naming the next command.
 *
 * @param pi - Extension API used to subscribe to events, append entries, and persist context.
 * @param active - Accessor for the in-memory active run context.
 */
function registerExecutorHandoffReconcile(
  pi: ExtensionAPI,
  active: ActiveContextAccess,
): void {
  pi.on("tool_result", async (event, ctx) => {
    // Only successful executor handoff submissions are of interest.
    const isSuccessfulHandoff =
      !event.isError && event.toolName === "submit_executor_handoff";
    if (!isSuccessfulHandoff) return;

    const sessionEntries = getEntries(ctx);
    // Prefer the context recorded in the session entries; fall back to the active one.
    const current = getLatestRunContext(sessionEntries) ?? active.get();
    if (!current?.run_id) return;

    const latest = await refreshRunContextProgress(
      process.cwd(),
      current,
      sessionEntries,
    );
    // Merge in place so the same context object is published and persisted.
    Object.assign(current, latest);
    active.set(current);
    persistContext(pi, current);

    if (latest.last_completed_step !== "execute") return;

    const outcomeLabel = latest.last_outcome ?? "done";
    const nextLabel = latest.next_recommended_command ?? "/harness-review";
    const notify = `Execute finished (${outcomeLabel}). Next: ${nextLabel}`;
    pi.appendEntry("harness-step-handoff", {
      next_command: latest.next_recommended_command,
      execution_status: latest.last_outcome,
      phase: latest.phase,
    });
    if (ctx.hasUI) {
      ctx.ui.notify(notify, "info");
    }
  });
}
1352
+
1298
1353
  async function guardToolCall(input: {
1299
1354
  event: { toolName: string; input: unknown };
1300
1355
  ctx: { sessionManager: { getEntries(): unknown[] } };
@@ -1828,7 +1883,7 @@ async function handleAgentEnd(input: {
1828
1883
  activeCtx.run_id,
1829
1884
  projectRoot,
1830
1885
  );
1831
- if (parsed?.command === "harness-run") {
1886
+ if (parsed?.command === "harness-run" || parsed?.command === "harness-auto") {
1832
1887
  let execStatus = statuses.executionStatus;
1833
1888
  if (!execStatus) {
1834
1889
  const handoff = await readExecutorHandoffFromRun(
@@ -1895,7 +1950,7 @@ async function handleAgentEnd(input: {
1895
1950
  activeCtx.next_recommended_command = next;
1896
1951
  activeCtx.updated_at = new Date().toISOString();
1897
1952
  if (
1898
- parsed?.command === "harness-run" &&
1953
+ (parsed?.command === "harness-run" || parsed?.command === "harness-auto") &&
1899
1954
  activeCtx.last_outcome === "completed"
1900
1955
  ) {
1901
1956
  syncPolicyFromRunContext(input.pi, entries, activeCtx);
@@ -2579,6 +2634,7 @@ export default function harnessRunContext(pi: ExtensionAPI) {
2579
2634
  });
2580
2635
 
2581
2636
  registerPlanApprovalCapture(pi, activeAccess);
2637
+ registerExecutorHandoffReconcile(pi, activeAccess);
2582
2638
  registerHarnessToolCallGuards(pi, activeAccess);
2583
2639
  registerHarnessRunStatusCommand(pi, activeAccess);
2584
2640
 
@@ -2407,6 +2407,44 @@ export async function reconcileStaleExecuteCompletion(
2407
2407
  return synced;
2408
2408
  }
2409
2409
 
/**
 * Reconcile disk artifacts and recompute next_recommended_command for status UI.
 *
 * Runs `reconcileStaleExecuteCompletion` and then `reconcileReviewRouting` on
 * the given context, resolves completion statuses and the remediation class
 * for the run, and feeds all of that into `nextStepAfterOutcome`. The result
 * is written to `next_recommended_command` on the reconciled context, whose
 * `updated_at` is then set to the current time.
 *
 * @param projectRoot - Project root passed through to the reconcile/resolve helpers.
 * @param ctx - Run context to reconcile.
 * @param entries - Session entries used for reconciliation; defaults to an empty list.
 * @returns The reconciled context with a recomputed next command and fresh timestamp.
 */
export async function refreshRunContextProgress(
  projectRoot: string,
  ctx: HarnessRunContext,
  entries: unknown[] = [],
): Promise<HarnessRunContext> {
  const afterExecute = await reconcileStaleExecuteCompletion(
    projectRoot,
    ctx,
    entries,
  );
  const current = await reconcileReviewRouting(projectRoot, afterExecute);

  const statuses = await resolveCompletionStatuses(
    entries,
    current.run_id,
    projectRoot,
  );

  // Both a finished review and a finished adversary pass count as "review complete".
  const lastStep = current.last_completed_step;
  const reviewComplete = lastStep === "review" || lastStep === "adversary";

  const remediationClass = await resolveRemediationClassForRun(
    current.run_id,
    projectRoot,
  );

  // A context already flagged plan_ready overrides whatever plan status was resolved.
  const planStatus = current.plan_ready ? "ready" : statuses.planStatus;
  const steerAttempt = current.steer_attempt ?? 0;
  const steerMaxAttempts =
    current.steer_max_attempts ?? steerMaxAttemptsFromEnv();

  const nextCommand = nextStepAfterOutcome({
    phase: current.phase,
    planStatus,
    lastCompletedStep: current.last_completed_step,
    lastOutcome: current.last_outcome,
    executionStatus: statuses.executionStatus,
    evalStatus: statuses.evalStatus,
    adversaryComplete: statuses.adversaryComplete,
    aborted: current.status === "aborted",
    remediationClass,
    steerAttempt,
    steerMaxAttempts,
    reviewComplete,
  });

  current.next_recommended_command = nextCommand;
  current.updated_at = nowIso();
  return current;
}
2447
+
2410
2448
  export async function blockingHarnessAutoCommandReason(
2411
2449
  command: string,
2412
2450
  activeCtx: HarnessRunContext | null,
@@ -28,6 +28,33 @@ import {
28
28
  const EXPLICIT_ACCEPTANCE_RE =
29
29
  /\b(acceptance|success criteria|definition of done|done when|must (pass|satisfy)|out of scope|in scope)\b/i;
30
30
 
/**
 * Best-effort debug logger for plan human-gate evaluation.
 *
 * Sends `payload`, together with a fixed session id and the current timestamp,
 * as a JSON POST to a hard-coded ingest endpoint on 127.0.0.1. The request is
 * fire-and-forget: nothing is awaited and every failure — missing `fetch`,
 * unserializable `data`, a synchronous throw, or a rejected request — is
 * discarded so that logging can never disturb the caller.
 *
 * NOTE(review): the hard-coded localhost endpoint and session id look like
 * temporary debug instrumentation — confirm whether this should ship in a
 * published release.
 *
 * @param payload - Run id, hypothesis tag, code location, message, and
 *   free-form data to record.
 */
function logPlanHumanGate(payload: {
  runId: string;
  hypothesisId: string;
  location: string;
  message: string;
  data: Record<string, unknown>;
}): void {
  // #region agent log
  // A global `fetch` is not guaranteed on every runtime; skip rather than throw.
  if (typeof fetch !== "function") return;
  try {
    // Serialize up front: JSON.stringify throws on circular references and
    // bigint values, which a trailing `.catch` on the request would not cover.
    const body = JSON.stringify({
      sessionId: "f7763e",
      runId: payload.runId,
      hypothesisId: payload.hypothesisId,
      location: payload.location,
      message: payload.message,
      data: payload.data,
      timestamp: Date.now(),
    });
    void fetch(
      "http://127.0.0.1:7928/ingest/a5d40896-34cb-4f12-97db-df7ada0b22f0",
      {
        method: "POST",
        headers: {
          "Content-Type": "application/json",
          "X-Debug-Session-Id": "f7763e",
        },
        body,
      },
    ).catch(() => {});
  } catch {
    // Synchronous failure while serializing or dispatching — intentionally
    // ignored, this logger is best-effort only.
  }
  // #endregion
}
57
+
31
58
  type SessionEntryLike = {
32
59
  type?: string;
33
60
  customType?: string;
@@ -190,11 +217,51 @@ export async function resolvePlanHumanGateStatus(
190
217
  const runDir = join(projectRoot, ".pi", "harness", "runs", runId);
191
218
  const clar = await isTaskClarificationReady(runDir);
192
219
  const clarDoc = clar.ok ? await readTaskClarificationDoc(runDir) : null;
220
+ logPlanHumanGate({
221
+ runId,
222
+ hypothesisId: "H3",
223
+ location: "plan-human-gates.ts:resolvePlanHumanGateStatus:clar",
224
+ message: "Task clarification readiness evaluated",
225
+ data: {
226
+ runDir,
227
+ clarOk: clar.ok,
228
+ clarErrors: clar.errors,
229
+ docStatus: String(clarDoc?.status ?? ""),
230
+ docEngagementSource:
231
+ typeof clarDoc?.user_engagement === "object" &&
232
+ clarDoc?.user_engagement !== null
233
+ ? String(
234
+ (
235
+ clarDoc.user_engagement as {
236
+ source?: string;
237
+ }
238
+ ).source ?? "",
239
+ )
240
+ : "",
241
+ },
242
+ });
193
243
  const humanGate = validateTaskClarificationHumanGate(entries, clarDoc, {
194
244
  quick: opts?.quick,
195
245
  taskSummary: opts?.taskSummary,
196
246
  allowFollowUpMessage: opts?.lastOutcome === "needs_clarification",
197
247
  });
248
+ logPlanHumanGate({
249
+ runId,
250
+ hypothesisId: "H1-H2",
251
+ location: "plan-human-gates.ts:resolvePlanHumanGateStatus:humanGate",
252
+ message: "Human gate evaluated for phase0 ask_user requirement",
253
+ data: {
254
+ humanGateOk: humanGate.ok,
255
+ humanGateErrors: humanGate.errors,
256
+ allowFollowUpMessage: opts?.lastOutcome === "needs_clarification",
257
+ hasTaskClarificationAskUserSincePlanCommand:
258
+ hasTaskClarificationAskUserSincePlanCommand(entries),
259
+ hasClarificationFollowUpUserMessage:
260
+ hasClarificationFollowUpUserMessage(entries),
261
+ indexOfLastPlanCommand: indexOfLastPlanCommand(entries),
262
+ entriesLen: entries.length,
263
+ },
264
+ });
198
265
  const phase0Ready = clar.ok && humanGate.ok;
199
266
  const phase0NeedsAskUser = clar.ok && !humanGate.ok;
200
267
  const approvalRecorded = hasPlanUserApproval(entries, {
@@ -244,6 +311,21 @@ export async function resolvePlanHumanGateStatus(
244
311
  } else if (approvalRequired && !approvalRecorded) {
245
312
  nextRequiredAction = "approve_plan then create_plan (Phase 6)";
246
313
  }
314
+ logPlanHumanGate({
315
+ runId,
316
+ hypothesisId: "H4",
317
+ location: "plan-human-gates.ts:resolvePlanHumanGateStatus:result",
318
+ message: "Resolved plan human gate status",
319
+ data: {
320
+ phase0Ready,
321
+ phase0NeedsAskUser,
322
+ debateComplete,
323
+ debateRequired,
324
+ approvalRequired,
325
+ approvalRecorded,
326
+ nextRequiredAction,
327
+ },
328
+ });
247
329
 
248
330
  return {
249
331
  phase0Ready,
@@ -188,6 +188,8 @@ subagent({ agentScope: "both", agent: "harness/planning/execution-plan-author",
188
188
 
189
189
  Merge `execution_plan` into draft `plan-packet.yaml` (`write_harness_yaml`). Save `artifacts/execution-plan-draft.yaml` the same way.
190
190
 
191
+ The `execution_plan` must make testing expectations explicit: decide whether unit, integration, and e2e/end-to-end tests are applicable for each changed surface based on risk and implementation scope; add work items/done criteria to create or update applicable tests; list relevant verification commands; and record a short rationale when a test level is not applicable. Do not hard-require all three test levels for every change — make the applicability decision visible.
192
+
191
193
  ## Phase 4c — Deterministic quality gate (hard stop)
192
194
 
193
195
  **Practice:** Harness engineering — never trust the model for graph validity.
@@ -75,6 +75,8 @@ Ensure `artifacts/ls-lint-signal.yaml` exists (from `/harness-run` or write from
75
75
 
76
76
  Run project tests if the approved `PlanPacket` or spawn context lists a test command. Capture stdout paths only — do not paste full logs into the next spawn.
77
77
 
78
+ Verify the testing obligation itself: the approved `PlanPacket` or spawn context must show planned applicability decisions for unit, integration, and e2e/end-to-end tests, and executor evidence must show applicable tests were implemented or updated and run. If a test level was not applicable, require a clear rationale tied to risk and changed surface; missing planned or executed applicable testing is a benchmark failure.
79
+
78
80
  Write `artifacts/benchmark-log.yaml` via `write_harness_yaml` when any shell step ran:
79
81
 
80
82
  ```yaml
@@ -52,14 +52,15 @@ Note `violation_count` in run notes (do not block execute on pre-existing violat
52
52
  1. Confirm `[HarnessActivePlan]` / extension reports plan ready.
53
53
  2. Build `HarnessSpawnContext` with `mode: execute`, `plan_packet_path`, `run_dir`, `acceptance_checks` from plan file.
54
54
  3. Include **`critical_path_work_item_ids`** from `execution_plan.schedule_metadata` in spawn task when present — executor should tackle limiting-step items first (Grove).
55
- 4. Spawn (max **1** agent per call):
55
+ 4. Include the plan's testing expectations in the spawn task: the executor must implement or update applicable unit, integration, and e2e/end-to-end tests, run the relevant verification commands, and report command evidence or a rationale for any non-applicable test level in `validation_summary`.
56
+ 5. Spawn (max **1** agent per call):
56
57
 
57
58
  ```
58
59
  subagent({ agentScope: "both", agent: "harness/running/executor", task: "<HarnessSpawnContext + handoff + critical path hint>" })
59
60
  ```
60
61
 
61
- 5. Parse subprocess output JSON (`execution_status`, validations, rollback refs) from tool result text.
62
- 6. Parent persists trace/handoff artifacts under run dir if needed; do not self-review.
62
+ 6. Parse subprocess output JSON (`execution_status`, validations, rollback refs) from tool result text.
63
+ 7. Parent persists trace/handoff artifacts under run dir if needed; do not self-review.
63
64
 
64
65
  ## Post-work — Structural observation (parent)
65
66
 
package/CHANGELOG.md CHANGED
@@ -2,6 +2,13 @@
2
2
 
3
3
  All notable changes to this project are documented in this file.
4
4
 
5
+
6
+ ## [v0.23.0] — 2026-05-28
7
+
8
+ ### ✨ Features
9
+
10
+ - strengthen run context human gates
11
+
5
12
  ## [v0.22.2] — 2026-05-28
6
13
 
7
14
  ### 🐛 Fixes
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "ultimate-pi",
3
- "version": "0.22.2",
3
+ "version": "0.23.0",
4
4
  "description": "Governed AI coding harness for pi.dev — bootstrap, plan, execute, review, and steer with deterministic policy gates",
5
5
  "keywords": [
6
6
  "pi-package",