npm - @opengsd/gsd-pi - Versions diffs - 1.1.1-dev.a5a2de8 → 1.1.1-dev.b2556262 - Mend

@opengsd/gsd-pi 1.1.1-dev.a5a2de8 → 1.1.1-dev.b2556262

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (325) hide show

package/src/resources/extensions/gsd/auto-model-selection.ts CHANGED Viewed

@@ -4,11 +4,11 @@
  * and fallback chains.
  */
-import type { Api, Model } from "@gsd/pi-ai";
-import { getProviderCapabilities } from "@gsd/pi-ai";
+import type { Api, Model, ModelThinkingLevel } from "@gsd/pi-ai";
+import { getProviderCapabilities, clampThinkingLevel } from "@gsd/pi-ai";
 import type { ExtensionAPI, ExtensionContext } from "@gsd/pi-coding-agent";
 import type { GSDPreferences } from "./preferences.js";
-import { resolveModelWithFallbacksForUnit, resolveDynamicRoutingConfig } from "./preferences.js";
+import { resolveModelWithFallbacksForUnit, resolveThinkingLevelForUnit, resolveDynamicRoutingConfig } from "./preferences.js";
 import type { ComplexityTier } from "./complexity-classifier.js";
 import { classifyUnitComplexity, extractTaskMetadata, tierLabel } from "./complexity-classifier.js";
 import { resolveModelForComplexity, escalateTier, getEligibleModels, loadCapabilityOverrides, adjustToolSet, filterToolsForProvider } from "./model-router.js";
@@ -57,6 +57,12 @@ export interface ModelSelectionResult {
   routing: { tier: string; modelDowngraded: boolean } | null;
   /** Concrete model applied before dispatch so it can be restored after a fresh session. */
   appliedModel: Model<Api> | null;
+  /**
+   * Reasoning effort applied for this dispatch after per-phase resolution,
+   * floor, and capability clamping (ADR-026). Null when no level was applied
+   * (e.g. no start level captured). Surfaced for metrics/telemetry.
+   */
+  appliedThinkingLevel?: ReturnType<ExtensionAPI["getThinkingLevel"]> | null;
 }
 export interface PreferredModelConfig {
@@ -90,6 +96,32 @@ export function clearToolBaseline(pi: ExtensionAPI | object): void {
   TOOL_BASELINE.delete(pi as unknown as object);
 }
+/**
+ * Return the union of the pre-dispatch baseline tool set and the current live
+ * active tools, or just the live tools when no baseline has been recorded yet.
+ *
+ * Use this instead of `pi.getActiveTools()` anywhere you need the full tool
+ * surface for a preflight/routing check that runs BEFORE `selectAndApplyModel`
+ * restores the baseline — e.g. in `runDispatch` and `decideNextUnit`.
+ *
+ * The union is intentional:
+ *   - Baseline covers tools that a prior unit's per-provider narrowing (hook
+ *     overrides, Groq 128-tool cap, etc.) has removed from the live set.
+ *     Those tools will be restored by `selectAndApplyModel` before dispatch, so
+ *     dropping them from the preflight check would be a false negative.
+ *   - Live set covers tools connected after the baseline was first captured
+ *     (e.g. MCP servers attached mid-session or after a paused resume).
+ *     Without the live merge, a stale baseline permanently hides newly
+ *     connected MCP tools and prevents transport-preflight from clearing on
+ *     resume (#477 follow-up).
+ *
+ * @param pi - Extension API handle; it is also the key used to look up the
+ *   recorded baseline in `TOOL_BASELINE`.
+ * @returns When no baseline is recorded: the live list exactly as obtained
+ *   from `pi.getActiveTools()` (or `[]` when `pi` has no such function).
+ *   Otherwise: a new de-duplicated array with the baseline entries first,
+ *   followed by any live entries not already in the baseline.
+ */
+export function getToolBaselineSnapshot(pi: ExtensionAPI): string[] {
+  const live = typeof pi.getActiveTools === "function" ? pi.getActiveTools() : []; // tolerate a `pi` without getActiveTools
+  const baseline = TOOL_BASELINE.get(pi as unknown as object);
+  if (baseline === undefined) return live; // no baseline yet: the live list is handed back as-is, not copied
+  return [...new Set([...baseline, ...live])]; // Set removes duplicates while keeping first-seen order
+}
 /**
  * Models eligible for the pre-dispatch policy gate. Prefer registry-available
  * models; when that list is empty (common after worktree resume before registry
@@ -252,12 +284,103 @@ function restoreToolBaseline(pi: ExtensionAPI): void {
   }
 }
-function reapplyThinkingLevel(
+/**
+ * Apply the desired reasoning effort for the just-selected model, clamping to
+ * what the model actually supports (ADR-026). An unsupported level is never
+ * sent to the provider — it is clamped via `clampThinkingLevel` and the
+ * mismatch is surfaced once per (model, requested-level). Returns the level
+ * actually applied so callers can record it.
+ *
+ * @param pi - Host API; `setThinkingLevel` is called on it with the level
+ *   chosen here.
+ * @param desired - Requested level. A falsy value (null/undefined) is returned
+ *   unchanged and nothing is applied.
+ * @param model - The just-selected model; passed to `clampThinkingLevel` and
+ *   used to build the warning text and the warn-once key.
+ * @param ctx - Supplies `ui.notify` for the one-time clamp warning.
+ * @returns The clamped level when the clamp path runs; otherwise `desired`
+ *   exactly as passed in.
+ */
+export function applyThinkingLevelForModel(
   pi: ExtensionAPI,
+  desired: ReturnType<ExtensionAPI["getThinkingLevel"]> | null | undefined,
+  model: Model<Api>,
+  ctx: ExtensionContext,
+): ReturnType<ExtensionAPI["getThinkingLevel"]> | null | undefined {
+  if (!desired) return desired; // nothing requested: setThinkingLevel is not called at all
+  // Capability-clamp only when we have a bare string level AND the model
+  // advertises reasoning capability (`reasoning` is always present on real
+  // registry models). Richer host snapshot shapes (e.g. `{ effort: "high" }`)
+  // and partial model objects are applied verbatim — we never coerce an unknown
+  // shape into a string or guess capability we can't see.
+  if (typeof desired === "string" && model != null && typeof model === "object" && "reasoning" in model) {
+    const clamped = clampThinkingLevel(model, desired as ModelThinkingLevel) as ReturnType<ExtensionAPI["getThinkingLevel"]>;
+    pi.setThinkingLevel(clamped);
+    if (clamped !== desired) { // the clamp changed the level, so tell the user (once per key)
+      const key = `${model.provider}/${model.id}:${desired}`;
+      if (!_warnedThinkingClamp.has(key)) {
+        _warnedThinkingClamp.add(key);
+        ctx.ui.notify(
+          `Thinking level '${desired}' not supported by ${model.provider}/${model.id}; using '${clamped}'.`,
+          "warning",
+        );
+      }
+    }
+    return clamped;
+  }
+  pi.setThinkingLevel(desired); // non-string level, or model without a `reasoning` field: applied verbatim
+  return desired;
+}
+/** Warn-once guard for capability clamps, keyed by `provider/id:requested`. */
+const _warnedThinkingClamp = new Set<string>(); // module-level: persists across calls
+/** Warn-once guard for the execute-task floor punch-through advisory. */
+let _warnedExecuteTaskFloorBypass = false; // module-level: persists across calls
+type EffectiveThinkingLevel = ReturnType<ExtensionAPI["getThinkingLevel"]>; // whatever type the host's getThinkingLevel() returns
+/**
+ * Ascending severity order for reasoning levels (matches @gsd/pi-agent-core
+ * `ThinkingLevel`). Used only for floor comparisons below.
+ *
+ * A level's position in this array is the rank that `thinkingLevelRank`
+ * returns for it.
+ *
+ * NOTE(review): the trailing `as EffectiveThinkingLevel[]` is a type
+ * assertion, so these literals are presumably not checked against the host's
+ * level type — confirm the list stays in sync with it.
+ */
+const THINKING_LEVEL_ORDER: readonly EffectiveThinkingLevel[] = [
+  "off",
+  "minimal",
+  "low",
+  "medium",
+  "high",
+  "xhigh",
+] as EffectiveThinkingLevel[];
+/**
+ * Minimum reasoning level for code-writing units.
+ *
+ * `execute-task` is the only unit that edits source. With a low/minimal
+ * thinking level a model does not plan its edits and compensates by re-reading
+ * the same files dozens of times per task (measured: index.html read ~49× in a
+ * single task on a minimal-thinking model) and shelling out to `nl`/`sed` to
+ * re-locate code after every edit invalidates its line numbers. Flooring the
+ * level for this unit type removes that read/bash thrash. Planning, research,
+ * and lifecycle units are unaffected.
+ */
+const EXECUTE_TASK_MIN_THINKING_LEVEL: EffectiveThinkingLevel = "medium";
+function thinkingLevelRank(level: EffectiveThinkingLevel): number { // rank = index of `level` in THINKING_LEVEL_ORDER
+  const idx = THINKING_LEVEL_ORDER.indexOf(level);
+  return idx === -1 ? 0 : idx; // a value not in the list ranks 0, i.e. indistinguishable from "off"
+}
+/**
+ * Raise (never lower) the thinking level for code-writing units to a sane
+ * floor. Returns the input unchanged for non-`execute-task` units, when no
+ * level was captured, or when the captured level already meets the floor.
+ *
+ * @param unitType - Unit being dispatched; only the exact string
+ *   `"execute-task"` is subject to the floor.
+ * @param level - Captured thinking level, or null/undefined when none was
+ *   captured.
+ * @returns `EXECUTE_TASK_MIN_THINKING_LEVEL` when `level` is a recognized
+ *   level ranked below the floor for an `execute-task` unit; otherwise `level`
+ *   exactly as passed in.
+ */
+export function floorThinkingLevelForUnit(
+  unitType: string,
   level: ReturnType<ExtensionAPI["getThinkingLevel"]> | null | undefined,
-): void {
-  if (!level) return;
-  pi.setThinkingLevel(level);
+): ReturnType<ExtensionAPI["getThinkingLevel"]> | null | undefined {
+  if (unitType !== "execute-task") return level; // the floor applies to this one unit type only
+  if (!level) return level; // nothing captured: nothing to raise
+  // Only act on the recognized string levels. Any other shape (e.g. a richer
+  // host snapshot object) is passed through untouched so we never coerce an
+  // unknown representation into a bare string the host can't apply.
+  if (!THINKING_LEVEL_ORDER.includes(level as EffectiveThinkingLevel)) {
+    return level;
+  }
+  if (thinkingLevelRank(level as EffectiveThinkingLevel) >= thinkingLevelRank(EXECUTE_TASK_MIN_THINKING_LEVEL)) {
+    return level; // already at or above the floor
+  }
+  return EXECUTE_TASK_MIN_THINKING_LEVEL;
 }
 export function resolvePreferredModelConfig(
@@ -328,6 +451,42 @@ export async function selectAndApplyModel(
   autoModeStartThinkingLevel?: ReturnType<ExtensionAPI["getThinkingLevel"]> | null,
 ): Promise<ModelSelectionResult> {
   const uokFlags = resolveUokFlags(prefs);
+  // Resolve reasoning effort for this dispatch (ADR-026). An explicit per-phase
+  // thinking config (inline `models.<phase>.thinking` or the separate `thinking`
+  // block) expresses hard user intent: it bypasses the execute-task floor and is
+  // honored verbatim, then capability-clamped per model at apply time below.
+  // With no explicit level, fall back to the auto-start session level and raise
+  // the code-writing floor — preserving prior behavior exactly. Recomputed per
+  // dispatch so neither the floor nor a phase override leaks to other units.
+  const explicitThinkingLevel =
+    resolveThinkingLevelForUnit(unitType) as ReturnType<ExtensionAPI["getThinkingLevel"]> | undefined;
+  const desiredThinkingLevel = explicitThinkingLevel
+    ?? floorThinkingLevelForUnit(unitType, autoModeStartThinkingLevel);
+  if (explicitThinkingLevel) {
+    if (
+      unitType === "execute-task" &&
+      thinkingLevelRank(explicitThinkingLevel) < thinkingLevelRank(EXECUTE_TASK_MIN_THINKING_LEVEL) &&
+      !_warnedExecuteTaskFloorBypass
+    ) {
+      _warnedExecuteTaskFloorBypass = true;
+      ctx.ui.notify(
+        `Explicit execution thinking '${explicitThinkingLevel}' is below the measured execute-task floor ` +
+        `(${EXECUTE_TASK_MIN_THINKING_LEVEL}); honoring it as configured. Low reasoning on code edits can ` +
+        `cause repeated file re-reads.`,
+        "warning",
+      );
+    }
+  } else if (
+    verbose &&
+    desiredThinkingLevel &&
+    desiredThinkingLevel !== autoModeStartThinkingLevel
+  ) {
+    ctx.ui.notify(
+      `Thinking level raised to ${desiredThinkingLevel} for ${unitType} (was ${autoModeStartThinkingLevel ?? "unset"})`,
+      "info",
+    );
+  }
+  let appliedThinkingLevel: ReturnType<ExtensionAPI["getThinkingLevel"]> | null | undefined = null;
   const effectiveSessionModelOverride = sessionModelOverride === undefined
     ? getSessionModelOverride(ctx.sessionManager.getSessionId())
     : (sessionModelOverride ?? undefined);
@@ -673,7 +832,7 @@ export async function selectAndApplyModel(
       const ok = await pi.setModel(model, { persist: false });
       if (ok) {
         appliedModel = model;
-        reapplyThinkingLevel(pi, autoModeStartThinkingLevel);
+        appliedThinkingLevel = applyThinkingLevelForModel(pi, desiredThinkingLevel, model, ctx);
         // ADR-005: Adjust active tool set for the selected model's provider capabilities.
         // Hard-filter incompatible tools, then let extensions override via adjust_tool_set hook.
@@ -733,7 +892,7 @@ export async function selectAndApplyModel(
         const ok = await pi.setModel(model, { persist: false });
         if (!ok) continue;
         appliedModel = model;
-        reapplyThinkingLevel(pi, autoModeStartThinkingLevel);
+        appliedThinkingLevel = applyThinkingLevelForModel(pi, desiredThinkingLevel, model, ctx);
         attemptedPolicyEligible = true;
         if (verbose) {
           ctx.ui.notify(
@@ -779,18 +938,37 @@ export async function selectAndApplyModel(
             const fallbackOk = await pi.setModel(byId, { persist: false });
             if (fallbackOk) {
               appliedModel = byId;
-              reapplyThinkingLevel(pi, autoModeStartThinkingLevel);
+              appliedThinkingLevel = applyThinkingLevelForModel(pi, desiredThinkingLevel, byId, ctx);
             }
           }
         } else {
           appliedModel = startModel;
-          reapplyThinkingLevel(pi, autoModeStartThinkingLevel);
+          appliedThinkingLevel = applyThinkingLevelForModel(pi, desiredThinkingLevel, startModel, ctx);
         }
       }
     }
   }
-  return { routing, appliedModel };
+  // If no model branch applied a thinking level (e.g. interactive guided-flow
+  // with a `thinking:` block but no per-phase model and no start model), still
+  // honor an explicitly configured phase thinking level against the current
+  // session model. Only the explicit path runs here — the floored session
+  // default is intentionally left untouched so no-config interactive runs keep
+  // the user's /model thinking level. (ADR-026)
+  if (appliedThinkingLevel == null && explicitThinkingLevel && ctx.model) {
+    // Prefer the full registry model (carries reasoning capability so the level
+    // can be clamped); fall back to ctx.model. Always route through
+    // applyThinkingLevelForModel so the clamp runs whenever capability metadata
+    // exists — never a raw verbatim setThinkingLevel that bypasses it (ADR-026).
+    const current = resolveModelId(
+      `${ctx.model.provider}/${ctx.model.id}`,
+      ctx.modelRegistry?.getAvailable?.() ?? [],
+      ctx.model.provider,
+    ) ?? (ctx.model as Model<Api>);
+    appliedThinkingLevel = applyThinkingLevelForModel(pi, explicitThinkingLevel, current, ctx);
+  }
+  return { routing, appliedModel, appliedThinkingLevel };
 }
 /**

package/src/resources/extensions/gsd/auto-post-unit.ts CHANGED Viewed

@@ -1521,6 +1521,8 @@ export async function postUnitPreVerification(pctx: PostUnitContext, opts?: PreV
       }
     }
+    let blockingContentViolation: string | null = null;
     // ── Safety harness: post-unit validation ──
     try {
       const { loadEffectiveGSDPreferences } = await import("./preferences.js");
@@ -1668,8 +1670,14 @@ export async function postUnitPreVerification(pctx: PostUnitContext, opts?: PreV
             const artifactPath = resolveArtifactForContent(s.currentUnit.type, s.currentUnit.id, s.basePath);
             const contentViolations = validateContent(s.currentUnit.type, artifactPath);
             for (const v of contentViolations) {
-              logWarning("safety", `content: ${v.reason}`);
-              ctx.ui.notify(`Content validation: ${v.reason}`, "warning");
+              if (v.severity === "error") {
+                blockingContentViolation ??= v.reason;
+                logError("safety", `content: ${v.reason}`);
+                ctx.ui.notify(`Content validation: ${v.reason}`, "error");
+              } else {
+                logWarning("safety", `content: ${v.reason}`);
+                ctx.ui.notify(`Content validation: ${v.reason}`, "warning");
+              }
             }
           } catch (e) {
             debugLog("postUnit", { phase: "safety-content-validation", error: String(e) });
@@ -1868,6 +1876,16 @@ export async function postUnitPreVerification(pctx: PostUnitContext, opts?: PreV
         }
       }
+      if (blockingContentViolation && triggerArtifactVerified) {
+        triggerArtifactVerified = false;
+        debugLog("postUnit", {
+          phase: "content-validation-blocked-artifact",
+          unitType: s.currentUnit.type,
+          unitId: s.currentUnit.id,
+          reason: blockingContentViolation,
+        });
+      }
       // When artifact verification fails for a unit type that has a known expected
       // artifact, ask the caller to retry so it re-dispatches with failure context
       // instead of blindly re-dispatching the same unit (#1571).

package/src/resources/extensions/gsd/auto-prompts.ts CHANGED Viewed

@@ -10,8 +10,8 @@
  */
 import { loadFile, parseContinue, parseSummary, loadActiveOverrides, formatOverridesSection, parseTaskPlanFile } from "./files.js";
-import type { Override, UatType } from "./files.js";
-import { hasVerdict, getUatType, extractVerdict } from "./verdict-parser.js";
+import type { Override } from "./files.js";
+import { hasVerdict, extractVerdict } from "./verdict-parser.js";
 import { loadPrompt, inlineTemplate } from "./prompt-loader.js";
 import {
   resolveMilestoneFile, resolveSliceFile, resolveSlicePath,
@@ -42,11 +42,11 @@ import { logWarning } from "./workflow-logger.js";
 import { inlineGraphSubgraph } from "./graph-context.js";
 import { buildExtractionStepsBlock } from "./commands-extract-learnings.js";
 import { classifyProject, type ProjectClassification } from "./detection.js";
-import { hasBrowserRequiredText } from "./browser-evidence.js";
 import { debugLog } from "./debug-logger.js";
 import { buildSkillActivationBlock, buildSkillDiscoveryVars } from "./skill-activation.js";
 import { findMilestoneIds } from "./milestone-ids.js";
-import { buildRunUatResultPresentation, RUN_UAT_TOOL_PRESENTATION_PLAN_ID } from "./tool-presentation-plan.js";
+import { buildRunUatPresentationForType, RUN_UAT_TOOL_PRESENTATION_PLAN_ID } from "./tool-presentation-plan.js";
+import { resolveEffectiveUatType, shouldDispatchUatForContent, type UatType } from "./uat-policy.js";
 export { buildSkillActivationBlock, buildSkillDiscoveryVars };
@@ -286,19 +286,6 @@ function prependContextModeToBlock(
   return `${contextMode}\n\n${block}`;
 }
-function resolveEffectiveUatType(content: string): UatType {
-  const uatType = getUatType(content);
-  if (uatType === "artifact-driven" && hasBrowserRequiredText(content)) {
-    return "browser-executable";
-  }
-  return uatType;
-}
-function shouldDispatchUatForContent(content: string, prefs: GSDPreferences | undefined): boolean {
-  const uatType = resolveEffectiveUatType(content);
-  return !!prefs?.uat_dispatch || uatType !== "artifact-driven" || hasBrowserRequiredText(content);
-}
 // ─── Executor Constraints ─────────────────────────────────────────────────────
 /**
@@ -3385,7 +3372,7 @@ export async function buildRunUatPrompt(
   const uatResultPath = join(base, relSliceFile(base, mid, sliceId, "ASSESSMENT"));
   const uatType = resolveEffectiveUatType(uatContent);
-  const canonicalPresentation = JSON.stringify(buildRunUatResultPresentation(), null, 2);
+  const canonicalPresentation = JSON.stringify(buildRunUatPresentationForType(uatType), null, 2);
   return loadPrompt("run-uat", {
     workingDirectory: base,
@@ -3543,11 +3530,25 @@ export async function buildReassessRoadmapPrompt(
 // ─── Reactive Execute Prompt ──────────────────────────────────────────────
+/**
+ * Build the `with model: "…" and thinking: "…"` suffix injected into a prompt
+ * that instructs the coordinator how to dispatch a `subagent` call. Either or
+ * both may be absent (ADR-026 / #508).
+ *
+ * @param model - Model identifier to quote in the suffix; omitted when falsy
+ *   (undefined or an empty string).
+ * @param thinking - Thinking level to quote in the suffix; omitted when falsy.
+ * @returns `""` when both are absent; otherwise a string starting with
+ *   ` with ` (leading space included) followed by the present parts joined by
+ *   ` and `, e.g. ` with model: "m" and thinking: "t"`.
+ *
+ * NOTE(review): both values are interpolated between double quotes without
+ * escaping, so a value containing `"` would break the quoted form — confirm
+ * callers only pass plain identifiers.
+ */
+function subagentCallSuffix(model?: string, thinking?: string): string {
+  const parts: string[] = [];
+  if (model) parts.push(`model: "${model}"`);
+  if (thinking) parts.push(`thinking: "${thinking}"`);
+  return parts.length > 0 ? ` with ${parts.join(" and ")}` : "";
+}
 export async function buildReactiveExecutePrompt(
   mid: string, midTitle: string, sid: string, sTitle: string,
   readyTaskIds: string[], base: string,
   subagentModel?: string,
-  opts?: { sessionContextWindow?: number; modelRegistry?: MinimalModelRegistry; sessionProvider?: string },
+  // Reasoning effort travels inside opts here (not as a positional param) so
+  // existing positional `opts` callers don't shift (#508).
+  opts?: { sessionContextWindow?: number; modelRegistry?: MinimalModelRegistry; sessionProvider?: string; subagentThinking?: string },
 ): Promise<string> {
   const { loadSliceTaskIO, deriveTaskGraph, graphMetrics } = await import("./reactive-graph.js");
@@ -3640,7 +3641,7 @@ export async function buildReactiveExecutePrompt(
       `When done, say: "Task ${tid} complete."`,
     ].join("\n");
-    const modelSuffix = subagentModel ? ` with model: "${subagentModel}"` : "";
+    const modelSuffix = subagentCallSuffix(subagentModel, opts?.subagentThinking);
     subagentSections.push([
       `### ${tid}: ${tTitle}`,
       "",
@@ -3724,10 +3725,11 @@ export async function buildParallelResearchSlicesPrompt(
   slices: Array<{ id: string; title: string }>,
   basePath: string,
   subagentModel?: string,
+  subagentThinking?: string,
 ): Promise<string> {
   // Build individual research-slice prompts for each slice
   const subagentSections: string[] = [];
-  const modelSuffix = subagentModel ? ` with model: "${subagentModel}"` : "";
+  const modelSuffix = subagentCallSuffix(subagentModel, subagentThinking);
   for (const slice of slices) {
     const slicePrompt = await buildResearchSlicePrompt(mid, midTitle, slice.id, slice.title, basePath, { contextModeRenderMode: "nested" });
     subagentSections.push([
@@ -3755,6 +3757,7 @@ export async function buildGateEvaluatePrompt(
   mid: string, midTitle: string, sid: string, sTitle: string,
   base: string,
   subagentModel?: string,
+  subagentThinking?: string,
 ): Promise<string> {
   // Pull only the gates this turn actually owns (Q3/Q4). Filter via the
   // registry so that scope:"slice" gates owned by other turns (Q8) can't
@@ -3811,7 +3814,7 @@ export async function buildGateEvaluatePrompt(
       "- `findings`: detailed markdown findings (or empty if omitted)",
     ].join("\n");
-    const modelSuffix = subagentModel ? ` with model: "${subagentModel}"` : "";
+    const modelSuffix = subagentCallSuffix(subagentModel, subagentThinking);
     subagentSections.push([
       `### ${def.id}: ${def.question}`,
       "",

package/src/resources/extensions/gsd/auto-recovery.ts CHANGED Viewed

@@ -15,7 +15,25 @@ import { appendEvent } from "./workflow-events.js";
 import { atomicWriteSync } from "./atomic-write.js";
 import { clearParseCache } from "./files.js";
 import { parseRoadmap as parseLegacyRoadmap, parsePlan as parseLegacyPlan } from "./parsers-legacy.js";
-import { isDbAvailable, getTask, getSlice, getSliceTasks, getPendingGates, updateTaskStatus, updateSliceStatus, insertSlice, getMilestone, getMilestoneSlices, getLatestAssessmentByScope, updateMilestoneStatus, refreshOpenDatabaseFromDisk, getCompletedMilestoneTaskFileHints, getMilestoneCommitAttributionShas, recordMilestoneCommitAttribution, transaction } from "./gsd-db.js";
+import {
+  isDbAvailable,
+  getTask,
+  getSlice,
+  getSliceTasks,
+  getPendingGatesForTurn,
+  updateTaskStatus,
+  updateSliceStatus,
+  insertSlice,
+  getMilestone,
+  getMilestoneSlices,
+  getLatestAssessmentByScope,
+  updateMilestoneStatus,
+  refreshOpenDatabaseFromDisk,
+  getCompletedMilestoneTaskFileHints,
+  getMilestoneCommitAttributionShas,
+  recordMilestoneCommitAttribution,
+  transaction,
+} from "./gsd-db.js";
 import { isValidationTerminal } from "./state.js";
 import { getErrorMessage } from "./error-utils.js";
 import { logWarning, logError } from "./workflow-logger.js";
@@ -390,8 +408,9 @@ export function verifyExpectedArtifact(
     if (gateIds.length === 0) return true;
     try {
-      const pending = getPendingGates(mid, sid, "slice");
-      const pendingIds = new Set(pending.map((g: any) => g.gate_id));
+      if (!isDbAvailable()) return false;
+      const pending = getPendingGatesForTurn(mid, sid, "gate-evaluate");
+      const pendingIds = new Set<string>(pending.map((g) => g.gate_id));
       // All dispatched gates must no longer be pending
       for (const gid of gateIds) {
         if (pendingIds.has(gid)) return false;

package/src/resources/extensions/gsd/auto-runtime-state.ts CHANGED Viewed

@@ -1,6 +1,7 @@
 // GSD auto-mode runtime state
 import { AutoSession } from "./auto/session.js";
 import type { CurrentUnit } from "./auto/session.js";
+import type { SourceObservationStore } from "./source-observations.js";
 import {
   isDeterministicPolicyError,
   isQueuedUserMessageSkip,
@@ -65,3 +66,7 @@ export function clearToolInvocationError(): void {
   if (!autoSession.active) return;
   autoSession.lastToolInvocationError = null;
 }
+export function getSourceObservationStore(): SourceObservationStore { // accessor for the module's autoSession.sourceObservations
+  return autoSession.sourceObservations; // returned without an `autoSession.active` check, unlike clearToolInvocationError above
+}

package/src/resources/extensions/gsd/auto-start.ts CHANGED Viewed

@@ -1557,7 +1557,7 @@ export async function bootstrapAutoSession(
     s.autoStartTime = Date.now();
     s.resourceVersionOnStart = readResourceVersion();
     s.pendingQuickTasks = [];
-    s.currentUnit = null;
+    s.clearCurrentUnit();
     s.currentMilestoneId ??=
       strandedRecoveryAction?.milestoneId ??
       (deepProjectStagePending ? null : state.activeMilestone?.id ?? null);

package/src/resources/extensions/gsd/auto-timers.ts CHANGED Viewed

@@ -147,6 +147,15 @@ export function startUnitSupervision(sctx: SupervisionContext): void {
   const softTimeoutMs = supervisionTimeouts.softTimeoutMs;
   const idleTimeoutMs = supervisionTimeouts.idleTimeoutMs;
   const hardTimeoutMs = supervisionTimeouts.hardTimeoutMs;
+  // A single hung tool gets its own short budget, NOT the general idle window:
+  // a long-but-progressing session is not idle, but a tool stuck for minutes
+  // is. Falls back to the idle window only if misconfigured to zero. The
+  // hung-tool budget is intentionally not scaled by task estimate — a stuck
+  // tool call is stuck regardless of how long the overall task should take.
+  const stalledToolTimeoutMs =
+    (supervisor.stalled_tool_timeout_minutes ?? 0) > 0
+      ? supervisor.stalled_tool_timeout_minutes! * 60 * 1000
+      : idleTimeoutMs;
   // ── 1. Soft timeout warning ──
   s.wrapupWarningHandle = setTimeout(() => {
@@ -189,10 +198,13 @@ export function startUnitSupervision(sctx: SupervisionContext): void {
       };
       const runtime = readUnitRuntimeRecord(s.basePath, unitType, unitId);
       if (!runtime) return;
-      if (Date.now() - runtime.lastProgressAt < idleTimeoutMs) return;
-      // Agent has tool calls currently executing — not idle, just waiting.
-      // But only suppress recovery if the tool started recently.
+      // In-flight tool handling runs on its own dedicated hung-tool budget,
+      // independent of the general idle gate below, so a genuinely stuck tool
+      // is caught in minutes instead of waiting out the (typically much longer)
+      // idle window (#2527, follow-up). A tool actively executing within budget
+      // is real progress, so refreshing lastProgressAt here also keeps the idle
+      // gate from firing during legitimate long-running tool calls.
       let stalledToolDetected = false;
       if (getInFlightToolCount() > 0) {
         // User-interactive tools (ask_user_questions, secure_env_collect) block
@@ -206,25 +218,29 @@ export function startUnitSupervision(sctx: SupervisionContext): void {
         }
         const oldestStart = getOldestInFlightToolStart()!;
         const toolAgeMs = Date.now() - oldestStart;
-        if (toolAgeMs < idleTimeoutMs) {
+        if (toolAgeMs < stalledToolTimeoutMs) {
           writeUnitRuntimeRecord(s.basePath, unitType, unitId, s.currentUnit.startedAt, {
             lastProgressAt: Date.now(),
             lastProgressKind: "tool-in-flight",
           });
           return;
         }
-        // Tool has been in-flight longer than idle timeout — treat as hung.
-        // Clear the stale entries so subsequent ticks don't re-detect them,
-        // and set the flag so the filesystem-activity check below does not
-        // override the stall verdict (#2527).
+        // Tool has been in-flight longer than the hung-tool budget — treat as
+        // hung. Clear the stale entries so subsequent ticks don't re-detect
+        // them, and set the flag so the idle gate and filesystem-activity check
+        // below do not override the stall verdict (#2527).
         stalledToolDetected = true;
         clearInFlightTools();
         ctx.ui.notify(
-          `Stalled tool detected: a tool has been in-flight for ${Math.round(toolAgeMs / 60000)}min. Treating as hung — attempting idle recovery.`,
+          `Stalled tool detected: a tool has been in-flight for ${Math.round(toolAgeMs / 60000)}min (budget ${Math.round(stalledToolTimeoutMs / 60000)}min). Treating as hung — attempting idle recovery.`,
           "warning",
         );
       }
+      // No hung tool — apply the general idle gate. A unit that has made
+      // meaningful progress within the idle window is not idle yet.
+      if (!stalledToolDetected && Date.now() - runtime.lastProgressAt < idleTimeoutMs) return;
       // Check if the agent is producing work on disk.
       // Skip this when a stalled tool was just detected — filesystem changes
       // from earlier in the task should not override the stall verdict (#2527).