npm - ultimate-pi - Versions diffs - 0.22.2 → 0.23.0 - Mend

ultimate-pi 0.22.2 → 0.23.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

package/.pi/extensions/harness-run-context.ts +62 -6
package/.pi/lib/harness-run-context.ts +38 -0
package/.pi/lib/plan-human-gates.ts +82 -0
package/.pi/prompts/harness-plan.md +2 -0
package/.pi/prompts/harness-review.md +2 -0
package/.pi/prompts/harness-run.md +4 -3
package/CHANGELOG.md +7 -0
package/package.json +1 -1

package/.pi/extensions/harness-run-context.ts CHANGED Viewed

@@ -74,6 +74,7 @@ import {
 	readReviewOutcomeFromRun,
 	reconcileReviewRouting,
 	reconcileStaleExecuteCompletion,
+	refreshRunContextProgress,
 	relPathUnderActiveRun,
 	resetRunContextForHarnessAuto,
 	resolveArgsForCommand,
@@ -814,6 +815,13 @@ function registerHarnessRunStatusCommand(
 				if (ctx.hasUI) ctx.ui.notify(msg, "warning");
 				return;
 			}
+			ctxState = await refreshRunContextProgress(
+				projectRoot,
+				ctxState,
+				entries,
+			);
+			active.set(ctxState);
+			persistContext(pi, ctxState);
 			let summary: PlanPacketSummary | null = null;
 			for (let i = entries.length - 1; i >= 0; i--) {
 				const entry = entries[i] as SessionEntryLike;
@@ -1253,12 +1261,29 @@ function registerPlanApprovalCapture(
 		if (event.toolName === "ask_user") {
 			const details = event.details as { cancelled?: boolean; input?: unknown };
 			if (details?.cancelled) {
-				const synced = await syncPlanLastOutcomeFromTaskClarification(
+				// Ignore cancels from later planning forks (e.g. debate profile choice):
+				// only treat cancel as Phase-0 clarification failure when clarification
+				// is not already locked ready.
+				const runRoot = join(
 					process.cwd(),
-					runCtx,
+					".pi",
+					"harness",
+					"runs",
+					runCtx.run_id ?? "",
 				);
-				Object.assign(runCtx, synced);
-				persistContext(pi, runCtx);
+				const clarDoc = runCtx.run_id
+					? await readTaskClarificationDoc(runRoot)
+					: null;
+				const clarReady =
+					String(clarDoc?.status ?? "").toLowerCase() === "ready";
+				if (!clarReady) {
+					const synced = await syncPlanLastOutcomeFromTaskClarification(
+						process.cwd(),
+						runCtx,
+					);
+					Object.assign(runCtx, synced);
+					persistContext(pi, runCtx);
+				}
 			} else if (
 				!isPlanApprovalAskUser(
 					(details?.input ?? {}) as {
@@ -1295,6 +1320,36 @@ function registerPlanApprovalCapture(
 	});
 }
/**
+ * Subscribes to "tool_result" events and, after a non-error
+ * `submit_executor_handoff` result, refreshes the run context through
+ * `refreshRunContextProgress`, stores it via `active.set`, and persists it
+ * with `persistContext`.
+ *
+ * When the refreshed context reports `last_completed_step === "execute"`, it
+ * also appends a "harness-step-handoff" entry and, if a UI is attached, shows
+ * an info notification naming the next recommended command.
+ *
+ * @param pi - Extension API used for event registration, entry appends, and persistence.
+ * @param active - Accessor for the active run context (fallback source and write target).
+ */
+function registerExecutorHandoffReconcile(
+	pi: ExtensionAPI,
+	active: ActiveContextAccess,
+): void {
+	pi.on("tool_result", async (event, ctx) => {
+		if (event.isError || event.toolName !== "submit_executor_handoff") return; // only non-error handoff submissions
+		const entries = getEntries(ctx);
+		const runCtx = getLatestRunContext(entries) ?? active.get(); // session entries first, active accessor as fallback
+		if (!runCtx?.run_id) return; // nothing to reconcile without a run id
+		const projectRoot = process.cwd();
+		const refreshed = await refreshRunContextProgress(
+			projectRoot,
+			runCtx,
+			entries,
+		);
+		Object.assign(runCtx, refreshed); // merges refreshed fields into the existing object in place
+		active.set(runCtx);
+		persistContext(pi, runCtx);
+		if (refreshed.last_completed_step === "execute") {
+			const notify = `Execute finished (${refreshed.last_outcome ?? "done"}). Next: ${refreshed.next_recommended_command ?? "/harness-review"}`; // fallbacks cover a missing outcome / next command
+			pi.appendEntry("harness-step-handoff", {
+				next_command: refreshed.next_recommended_command,
+				execution_status: refreshed.last_outcome,
+				phase: refreshed.phase,
+			});
+			if (ctx.hasUI) ctx.ui.notify(notify, "info");
+		}
+	});
+}
 async function guardToolCall(input: {
 	event: { toolName: string; input: unknown };
 	ctx: { sessionManager: { getEntries(): unknown[] } };
@@ -1828,7 +1883,7 @@ async function handleAgentEnd(input: {
 		activeCtx.run_id,
 		projectRoot,
 	);
-	if (parsed?.command === "harness-run") {
+	if (parsed?.command === "harness-run" || parsed?.command === "harness-auto") {
 		let execStatus = statuses.executionStatus;
 		if (!execStatus) {
 			const handoff = await readExecutorHandoffFromRun(
@@ -1895,7 +1950,7 @@ async function handleAgentEnd(input: {
 	activeCtx.next_recommended_command = next;
 	activeCtx.updated_at = new Date().toISOString();
 	if (
-		parsed?.command === "harness-run" &&
+		(parsed?.command === "harness-run" || parsed?.command === "harness-auto") &&
 		activeCtx.last_outcome === "completed"
 	) {
 		syncPolicyFromRunContext(input.pi, entries, activeCtx);
@@ -2579,6 +2634,7 @@ export default function harnessRunContext(pi: ExtensionAPI) {
 	});
 	registerPlanApprovalCapture(pi, activeAccess);
+	registerExecutorHandoffReconcile(pi, activeAccess);
 	registerHarnessToolCallGuards(pi, activeAccess);
 	registerHarnessRunStatusCommand(pi, activeAccess);

package/.pi/lib/harness-run-context.ts CHANGED Viewed

@@ -2407,6 +2407,44 @@ export async function reconcileStaleExecuteCompletion(
 	return synced;
 }
+/**
+ * Reconcile disk artifacts and recompute next_recommended_command for status UI.
+ *
+ * Runs `reconcileStaleExecuteCompletion` and then `reconcileReviewRouting`,
+ * resolves the completion statuses and the remediation class for the run, and
+ * passes them to `nextStepAfterOutcome`. The result is written to
+ * `next_recommended_command` on the reconciled context, and `updated_at` is
+ * stamped with `nowIso()`.
+ *
+ * @param projectRoot - Project root passed through to the reconcile/resolve helpers.
+ * @param ctx - Run context to reconcile.
+ * @param entries - Session entries handed to the helpers; defaults to an empty array.
+ * @returns The reconciled context with `next_recommended_command` and `updated_at` updated.
+ *   NOTE(review): whether this is a copy or the same object as `ctx` depends on the
+ *   reconcile helpers, which are not visible here; confirm before relying on either.
+ */
+export async function refreshRunContextProgress(
+	projectRoot: string,
+	ctx: HarnessRunContext,
+	entries: unknown[] = [],
+): Promise<HarnessRunContext> {
+	let synced = await reconcileStaleExecuteCompletion(projectRoot, ctx, entries);
+	synced = await reconcileReviewRouting(projectRoot, synced);
+	const statuses = await resolveCompletionStatuses(
+		entries,
+		synced.run_id,
+		projectRoot,
+	);
+	const reviewComplete =
+		synced.last_completed_step === "review" ||
+		synced.last_completed_step === "adversary"; // either step counts as review complete
+	const remediationClass = await resolveRemediationClassForRun(
+		synced.run_id,
+		projectRoot,
+	);
+	synced.next_recommended_command = nextStepAfterOutcome({
+		phase: synced.phase,
+		planStatus: synced.plan_ready ? "ready" : statuses.planStatus, // plan_ready flag takes precedence over the resolved status
+		lastCompletedStep: synced.last_completed_step,
+		lastOutcome: synced.last_outcome,
+		executionStatus: statuses.executionStatus,
+		evalStatus: statuses.evalStatus,
+		adversaryComplete: statuses.adversaryComplete,
+		aborted: synced.status === "aborted",
+		remediationClass,
+		steerAttempt: synced.steer_attempt ?? 0,
+		steerMaxAttempts: synced.steer_max_attempts ?? steerMaxAttemptsFromEnv(),
+		reviewComplete,
+	});
+	synced.updated_at = nowIso();
+	return synced;
+}
 export async function blockingHarnessAutoCommandReason(
 	command: string,
 	activeCtx: HarnessRunContext | null,

package/.pi/lib/plan-human-gates.ts CHANGED Viewed

@@ -28,6 +28,33 @@ import {
 const EXPLICIT_ACCEPTANCE_RE =
 	/\b(acceptance|success criteria|definition of done|done when|must (pass|satisfy)|out of scope|in scope)\b/i;
/**
+ * Fire-and-forget debug logger. POSTs the payload fields as JSON, together
+ * with a fixed `sessionId` and a `Date.now()` timestamp, to a hard-coded ingest
+ * URL on 127.0.0.1:7928. The request is not awaited and promise rejections
+ * are swallowed.
+ *
+ * NOTE(review): this looks like temporary agent-debug instrumentation (see the
+ * `#region agent log` markers and the fixed session id); confirm it is meant
+ * to ship in a published release.
+ * NOTE(review): only promise rejections are caught. If `fetch` is not defined
+ * in the host runtime the call would throw synchronously; confirm the minimum
+ * supported Node version.
+ *
+ * @param payload - Run id, hypothesis id, call-site location, message, and
+ *   arbitrary data to include in the log record.
+ */
+function logPlanHumanGate(payload: {
+	runId: string;
+	hypothesisId: string;
+	location: string;
+	message: string;
+	data: Record<string, unknown>;
+}): void {
+	// #region agent log
+	fetch("http://127.0.0.1:7928/ingest/a5d40896-34cb-4f12-97db-df7ada0b22f0", {
+		method: "POST",
+		headers: {
+			"Content-Type": "application/json",
+			"X-Debug-Session-Id": "f7763e",
+		},
+		body: JSON.stringify({
+			sessionId: "f7763e",
+			runId: payload.runId,
+			hypothesisId: payload.hypothesisId,
+			location: payload.location,
+			message: payload.message,
+			data: payload.data,
+			timestamp: Date.now(),
+		}),
+	}).catch(() => {}); // best-effort: delivery failures are deliberately ignored
+	// #endregion
+}
 type SessionEntryLike = {
 	type?: string;
 	customType?: string;
@@ -190,11 +217,51 @@ export async function resolvePlanHumanGateStatus(
 	const runDir = join(projectRoot, ".pi", "harness", "runs", runId);
 	const clar = await isTaskClarificationReady(runDir);
 	const clarDoc = clar.ok ? await readTaskClarificationDoc(runDir) : null;
+	logPlanHumanGate({
+		runId,
+		hypothesisId: "H3",
+		location: "plan-human-gates.ts:resolvePlanHumanGateStatus:clar",
+		message: "Task clarification readiness evaluated",
+		data: {
+			runDir,
+			clarOk: clar.ok,
+			clarErrors: clar.errors,
+			docStatus: String(clarDoc?.status ?? ""),
+			docEngagementSource:
+				typeof clarDoc?.user_engagement === "object" &&
+				clarDoc?.user_engagement !== null
+					? String(
+							(
+								clarDoc.user_engagement as {
+									source?: string;
+								}
+							).source ?? "",
+						)
+					: "",
+		},
+	});
 	const humanGate = validateTaskClarificationHumanGate(entries, clarDoc, {
 		quick: opts?.quick,
 		taskSummary: opts?.taskSummary,
 		allowFollowUpMessage: opts?.lastOutcome === "needs_clarification",
 	});
+	logPlanHumanGate({
+		runId,
+		hypothesisId: "H1-H2",
+		location: "plan-human-gates.ts:resolvePlanHumanGateStatus:humanGate",
+		message: "Human gate evaluated for phase0 ask_user requirement",
+		data: {
+			humanGateOk: humanGate.ok,
+			humanGateErrors: humanGate.errors,
+			allowFollowUpMessage: opts?.lastOutcome === "needs_clarification",
+			hasTaskClarificationAskUserSincePlanCommand:
+				hasTaskClarificationAskUserSincePlanCommand(entries),
+			hasClarificationFollowUpUserMessage:
+				hasClarificationFollowUpUserMessage(entries),
+			indexOfLastPlanCommand: indexOfLastPlanCommand(entries),
+			entriesLen: entries.length,
+		},
+	});
 	const phase0Ready = clar.ok && humanGate.ok;
 	const phase0NeedsAskUser = clar.ok && !humanGate.ok;
 	const approvalRecorded = hasPlanUserApproval(entries, {
@@ -244,6 +311,21 @@ export async function resolvePlanHumanGateStatus(
 	} else if (approvalRequired && !approvalRecorded) {
 		nextRequiredAction = "approve_plan then create_plan (Phase 6)";
 	}
+	logPlanHumanGate({
+		runId,
+		hypothesisId: "H4",
+		location: "plan-human-gates.ts:resolvePlanHumanGateStatus:result",
+		message: "Resolved plan human gate status",
+		data: {
+			phase0Ready,
+			phase0NeedsAskUser,
+			debateComplete,
+			debateRequired,
+			approvalRequired,
+			approvalRecorded,
+			nextRequiredAction,
+		},
+	});
 	return {
 		phase0Ready,

package/.pi/prompts/harness-plan.md CHANGED Viewed

@@ -188,6 +188,8 @@ subagent({ agentScope: "both", agent: "harness/planning/execution-plan-author",
 Merge `execution_plan` into draft `plan-packet.yaml` (`write_harness_yaml`). Save `artifacts/execution-plan-draft.yaml` the same way.
+The `execution_plan` must make testing expectations explicit: decide whether unit, integration, and e2e/end-to-end tests are applicable for each changed surface based on risk and implementation scope; add work items/done criteria to create or update applicable tests; list relevant verification commands; and record a short rationale when a test level is not applicable. Do not hard-require all three test levels for every change — make the applicability decision visible.
 ## Phase 4c — Deterministic quality gate (hard stop)
 **Practice:** Harness engineering — never trust the model for graph validity.

package/.pi/prompts/harness-review.md CHANGED Viewed

@@ -75,6 +75,8 @@ Ensure `artifacts/ls-lint-signal.yaml` exists (from `/harness-run` or write from
 Run project tests if the approved `PlanPacket` or spawn context lists a test command. Capture stdout paths only — do not paste full logs into the next spawn.
+Verify the testing obligation itself: the approved `PlanPacket` or spawn context must show planned applicability decisions for unit, integration, and e2e/end-to-end tests, and executor evidence must show applicable tests were implemented or updated and run. If a test level was not applicable, require a clear rationale tied to risk and changed surface; missing planned or executed applicable testing is a benchmark failure.
 Write `artifacts/benchmark-log.yaml` via `write_harness_yaml` when any shell step ran:
 ```yaml

package/.pi/prompts/harness-run.md CHANGED Viewed

@@ -52,14 +52,15 @@ Note `violation_count` in run notes (do not block execute on pre-existing violat
 1. Confirm `[HarnessActivePlan]` / extension reports plan ready.
 2. Build `HarnessSpawnContext` with `mode: execute`, `plan_packet_path`, `run_dir`, `acceptance_checks` from plan file.
 3. Include **`critical_path_work_item_ids`** from `execution_plan.schedule_metadata` in spawn task when present — executor should tackle limiting-step items first (Grove).
-4. Spawn (max **1** agent per call):
+4. Include the plan's testing expectations in the spawn task: the executor must implement or update applicable unit, integration, and e2e/end-to-end tests, run the relevant verification commands, and report command evidence or a rationale for any non-applicable test level in `validation_summary`.
+5. Spawn (max **1** agent per call):
 ```
 subagent({ agentScope: "both", agent: "harness/running/executor", task: "<HarnessSpawnContext + handoff + critical path hint>" })
 ```
-5. Parse subprocess output JSON (`execution_status`, validations, rollback refs) from tool result text.
-6. Parent persists trace/handoff artifacts under run dir if needed; do not self-review.
+6. Parse subprocess output JSON (`execution_status`, validations, rollback refs) from tool result text.
+7. Parent persists trace/handoff artifacts under run dir if needed; do not self-review.
 ## Post-work — Structural observation (parent)

package/CHANGELOG.md CHANGED Viewed

@@ -2,6 +2,13 @@
 All notable changes to this project are documented in this file.
+## [v0.23.0] — 2026-05-28
+### ✨ Features
+- strengthen run context human gates
 ## [v0.22.2] — 2026-05-28
 ### 🐛 Fixes

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
 	"name": "ultimate-pi",
-	"version": "0.22.2",
+	"version": "0.23.0",
 	"description": "Governed AI coding harness for pi.dev — bootstrap, plan, execute, review, and steer with deterministic policy gates",
 	"keywords": [
 		"pi-package",