pi-taskflow 0.0.24 → 0.0.25

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -17,7 +17,7 @@ import { execFileSync } from "node:child_process";
17
17
  import * as crypto from "node:crypto";
18
18
  import * as fs from "node:fs";
19
19
  import * as path from "node:path";
20
- import { cacheDir, withLock, writeFileAtomic } from "./store.ts";
20
+ import { cacheDir, withLock, writeFileAtomic, type PhaseState } from "./store.ts";
21
21
 
22
22
  // ---------------------------------------------------------------------------
23
23
  // Fingerprint resolution
@@ -144,6 +144,11 @@ export interface CacheEntry {
144
144
  output?: string;
145
145
  json?: unknown;
146
146
  model?: string;
147
+ /** Full PhaseState payload preserved so cross-run reuse is semantically
148
+ * equivalent to within-run resume. Storing only output/json would drop
149
+ * `gate`, `approval`, `reads`, `loop`, `tournament`, `warnings`, etc.,
150
+ * breaking recompute soundness and gate-block detection. */
151
+ state?: PhaseState;
147
152
  /** Provenance for audit / cleanup. */
148
153
  flowName?: string;
149
154
  phaseId?: string;
@@ -0,0 +1,97 @@
1
+ /**
2
+ * Content-addressed hashing for flow definitions.
3
+ *
4
+ * The canonical-JSON + SHA-256-truncation algorithm here is **vendored from
5
+ * overstory `packages/core/src/ir/hash.ts`** (pinned commit) so that
6
+ * pi-taskflow and overstory share one byte-identical hashing contract. This is
7
+ * the `M1` slice of the overstory-convergence roadmap: we are *not* compiling
8
+ * to overstory FlowIR yet (the IR compiler expects an explicit inject/emits
9
+ * model pi-taskflow doesn't have), but we share the **hash algorithm** now —
10
+ * the cheapest, lowest-risk piece of the contract — and put it to immediate
11
+ * work folding the flow *definition* into the cross-run cache key (M2).
12
+ *
13
+ * Why this matters: previously the cache key folded only the flow **name**
14
+ * (`flow:${flowName}`), so two structurally-different flows that happened to
15
+ * share a name + phase id + task could collide in the cross-run cache, and a
16
+ * flow that changed structure (but not name) could serve a stale hit. Folding
17
+ * `flowDefHash` (a content fingerprint of the desugared definition) closes
18
+ * that hole and is the foundation of "identical re-run is free ($0.00)".
19
+ *
20
+ * Pure module: no IO. Uses Web Crypto (`globalThis.crypto.subtle`) — therefore
21
+ * async — exactly like overstory's `hashIR`, so the contract is identical.
22
+ *
23
+ * @see docs/internal/overstory-convergence-roadmap.md §3 (M1, "cut B")
24
+ * @see docs/internal/rfc-flowir-compilation.md
25
+ */
26
+
27
+ import type { Taskflow } from "../schema.ts";
28
+
29
+ // ---------------------------------------------------------------------------
30
+ // Canonical JSON (vendored from overstory ir/hash.ts — byte-identical)
31
+ // ---------------------------------------------------------------------------
32
+
33
/**
 * Serialize `value` as deterministic JSON.
 *
 * Object keys are emitted in sorted order (default `Array.prototype.sort`,
 * i.e. UTF-16 code-unit order), no whitespace is produced, and object members
 * whose value is `undefined` are omitted. Arrays keep their element order; an
 * `undefined` element is written as `null`. Anything that is not a number,
 * boolean, string, array, or object (undefined / function / symbol / bigint)
 * is written as `null`.
 *
 * Vendored contract: the output must stay byte-identical to overstory's
 * `canonicalJson` — do not diverge without bumping the contract and updating
 * the parity test.
 *
 * @param value Any value to serialize.
 * @returns The canonical JSON text.
 */
export function canonicalJson(value: unknown): string {
  switch (typeof value) {
    case "number":
    case "boolean":
    case "string":
      // Primitives defer to JSON.stringify (which also maps NaN/Infinity to "null").
      return JSON.stringify(value);
    case "object":
      break;
    default:
      // undefined / function / symbol / bigint — not representable.
      return "null";
  }
  if (value === null) {
    return "null";
  }
  if (Array.isArray(value)) {
    const items = value.map((item) => canonicalJson(item === undefined ? null : item));
    return "[" + items.join(",") + "]";
  }
  const record = value as Record<string, unknown>;
  const members: string[] = [];
  for (const key of Object.keys(record).sort()) {
    const member = record[key];
    // Dropping undefined members makes optional-field presence irrelevant.
    if (member === undefined) continue;
    members.push(JSON.stringify(key) + ":" + canonicalJson(member));
  }
  return "{" + members.join(",") + "}";
}
61
+
62
+ // ---------------------------------------------------------------------------
63
+ // Hashing (vendored from overstory ir/hash.ts — byte-identical)
64
+ // ---------------------------------------------------------------------------
65
+
66
/**
 * Hash a canonical serialization.
 *
 * Computes SHA-256 over the UTF-8 bytes of `canonical` via Web Crypto and
 * returns the first 16 bytes as 32 lowercase hex characters. Same shape as
 * overstory's `hashCanonical` / RFC-001 content hashes.
 *
 * @param canonical The canonical text to hash.
 * @returns A promise resolving to the truncated lowercase hex digest.
 */
export async function hashCanonical(canonical: string): Promise<string> {
  const digest = await globalThis.crypto.subtle.digest("SHA-256", new TextEncoder().encode(canonical));
  // View over the leading 16 bytes only — the hash is deliberately truncated.
  const head = new Uint8Array(digest, 0, 16);
  return Array.from(head, (byte) => byte.toString(16).padStart(2, "0")).join("");
}
78
+
79
+ // ---------------------------------------------------------------------------
80
+ // Flow-definition fingerprint
81
+ // ---------------------------------------------------------------------------
82
+
83
/**
 * Content fingerprint of a desugared `Taskflow` definition.
 *
 * The input is the flow **definition** (structure + task text + declared
 * deps), not the runtime `args` values — args vary per invocation and are
 * already folded into each phase's `inputHash` via the interpolated task.
 * This hash answers "did the flow *itself* change?": because it is computed
 * over the canonical JSON form, key order, whitespace, and optional-field
 * presence do not affect it.
 *
 * Deterministic and async (Web Crypto), matching overstory's `hashIR` shape.
 *
 * @param def The desugared flow definition.
 * @returns A promise resolving to the truncated hex fingerprint.
 */
export async function flowDefHash(def: Taskflow): Promise<string> {
  const canonical = canonicalJson(def);
  return hashCanonical(canonical);
}
@@ -28,7 +28,8 @@ import { type AgentScope, discoverAgents, readSubagentSettings, shouldSyncBuilti
28
28
  import { renderRunResult, summarizeRun } from "./render.ts";
29
29
  import { RunHistoryComponent, type RunHistoryResult } from "./runs-view.ts";
30
30
  import { ApprovalViewComponent, type ApprovalChoice } from "./approval-view.ts";
31
- import { executeTaskflow, type ApprovalDecision, type ApprovalRequest, type RuntimeResult } from "./runtime.ts";
31
+ import { executeTaskflow, recomputeTaskflow, type ApprovalDecision, type ApprovalRequest, type RecomputeReport, type RuntimeDeps, type RuntimeResult } from "./runtime.ts";
32
+ import { type UsageStats } from "./usage.ts";
32
33
  import { finalPhase, resolveArgs, type Taskflow, validateTaskflow, desugar, isShorthand } from "./schema.ts";
33
34
  import {
34
35
  getFlow,
@@ -44,6 +45,7 @@ import {
44
45
  } from "./store.ts";
45
46
  import { CacheStore } from "./cache.ts";
46
47
  import { safeParse } from "./interpolate.ts";
48
+ import { formatWhyStale, readMapOf } from "./stale.ts";
47
49
  import {
48
50
  isValidKey,
49
51
  queueSpawn,
@@ -60,6 +62,7 @@ interface TaskflowDetails {
60
62
  finalOutput?: string;
61
63
  action: string;
62
64
  message?: string;
65
+ cacheReport?: string;
63
66
  }
64
67
 
65
68
  /** pi reads `isError` at runtime to mark tool failures; it is not in the public type. */
@@ -83,8 +86,8 @@ const ShorthandStep = Type.Object(
83
86
  );
84
87
 
85
88
  const TaskflowParams = Type.Object({
86
- action: StringEnum(["run", "save", "resume", "list", "agents", "init", "verify", "compile", "cache-clear"] as const, {
87
- description: "What to do: run a flow, save a definition, resume a paused run, list saved flows, list available agents, init model role configuration, verify the DAG, compile the DAG to a Mermaid diagram + verification report, or clear the cross-run memoization cache",
89
+ action: StringEnum(["run", "save", "resume", "list", "agents", "init", "verify", "compile", "provenance", "why-stale", "recompute", "cache-clear"] as const, {
90
+ description: "What to do: run a flow, save a definition, resume a paused run, list saved flows, list available agents, init model role configuration, verify the DAG, compile the DAG to a Mermaid diagram + verification report, show observed readSet provenance, explain why a run is stale, minimally recompute a stale run, or clear the cross-run memoization cache",
88
91
  default: "run",
89
92
  }),
90
93
  name: Type.Optional(Type.String({ description: "Name of a saved flow (for run/save without inline define)" })),
@@ -123,6 +126,8 @@ const TaskflowParams = Type.Object({
123
126
  ),
124
127
  args: Type.Optional(Type.Record(Type.String(), Type.Unknown(), { description: "Invocation arguments for the flow" })),
125
128
  runId: Type.Optional(Type.String({ description: "Run id to resume (for action=resume)" })),
129
+ phaseId: Type.Optional(Type.String({ description: "Phase id — the assumed-changed seed for action=why-stale, or the phase to re-run for action=recompute" })),
130
+ dryRun: Type.Optional(Type.Boolean({ description: "For action=recompute: compute the stale frontier without re-executing anything (no tokens spent). Defaults to true (safe); set false to actually re-run the seed + stale frontier and persist the updated run" })),
126
131
  scope: Type.Optional(
127
132
  StringEnum(["user", "project"] as const, { description: "Where to save (action=save)", default: "project" }),
128
133
  ),
@@ -146,6 +151,45 @@ const TaskflowParams = Type.Object({
146
151
  ),
147
152
  });
148
153
 
154
/**
 * Render the observed readSets of a run as plain text.
 *
 * The first line identifies the run (id, flow name, status). If no phase has
 * any recorded reads, a single explanatory note follows. Otherwise every
 * phase gets a title line (marked "★ final" when the definition flags it as
 * final) followed by either its observed reads (`stepId@version`, with "?"
 * for a missing version) or a "source" marker when it read nothing.
 *
 * @param run The persisted run to describe.
 * @returns A newline-joined report.
 */
function formatProvenance(run: RunState): string {
  const header = `Provenance — run ${run.runId} · flow "${run.flowName}" · ${run.status}`;

  const finalIds = new Set<string>();
  for (const phase of run.def.phases) {
    if (phase.final) finalIds.add(phase.id);
  }

  const phaseStates = Object.values(run.phases);
  const hasReads = phaseStates.some((ps) => (ps.reads?.length ?? 0) > 0);
  if (!hasReads) {
    return [
      header,
      "",
      "(No observed readSets recorded. Reads are captured for agent/gate/reduce phases that interpolate {steps.*} — the overstory \"observed readSet@version\" moat.)",
    ].join("\n");
  }

  const body = phaseStates.flatMap((ps) => {
    const star = finalIds.has(ps.id) ? " ★ final" : "";
    const title = `■ ${ps.id} [${ps.status}]${star}`;
    const reads = ps.reads ?? [];
    if (reads.length === 0) {
      return [title, " (source — no upstream reads)"];
    }
    return [title, " observed reads:", ...reads.map((r) => ` ← ${r.stepId}@${r.version ?? "?"}`)];
  });

  return [header, "", ...body].join("\n");
}
179
+
180
/**
 * Render a recompute report as plain text.
 *
 * Always lists the seed phases, the phases to re-run, and the phases reused
 * from outside the frontier. The early-cutoff section is only shown for a
 * real (non-dry-run) recompute. Empty id lists are shown as "—".
 *
 * @param r The report produced by a recompute.
 * @returns A newline-joined summary.
 */
function formatRecompute(r: RecomputeReport): string {
  const dryNote = r.dryRun ? " (DRY RUN — worst-case, no execution)" : "";
  const out: string[] = [
    `Recompute — seed: ${r.seeds.join(", ")}${dryNote}`,
    "",
    `▲ re-run (${r.rerun.length}): ${r.rerun.join(", ") || "—"}`,
  ];
  if (!r.dryRun) {
    out.push(`✂ early-cutoff (cached — inputHash unchanged): ${r.cutoff.join(", ") || "—"}`);
    if (r.cutoff.length > 0) {
      out.push(` → saved ${r.cutoff.length} re-execution(s).`);
    }
  }
  out.push(`✓ reused (outside frontier): ${r.reused.join(", ") || "—"}`);
  return out.join("\n");
}
192
+
149
193
  function makeRunState(def: Taskflow, args: Record<string, unknown>, cwd: string): RunState {
150
194
  return {
151
195
  runId: newRunId(def.name),
@@ -292,7 +336,18 @@ async function runFlow(
292
336
  persist: persistThrottled,
293
337
  requestApproval,
294
338
  loadFlow: (name: string) => getFlow(ctx.cwd, name)?.def,
339
+ // Cross-run cache is opt-in per phase (cache:{scope:"cross-run"}).
340
+ // Defaulting every real run to cross-run was reviewed out: it silently
341
+ // persists phase outputs and can serve stale results for phases whose
342
+ // agents read files at runtime (those files are not in the cache key).
343
+ cacheScopeDefault: "run-only",
295
344
  });
345
+ // Auto-report cache savings at the end of a real run so the user sees the
346
+ // M1-M5 effect without running a separate /tf command.
347
+ if (result.ok) {
348
+ const report = formatCacheReport(result.state, result.totalUsage);
349
+ if (report) ctx.ui.notify(report, "info");
350
+ }
296
351
  return result;
297
352
  } finally {
298
353
  if (heartbeat) clearInterval(heartbeat);
@@ -629,6 +684,60 @@ export default function (pi: ExtensionAPI) {
629
684
  return finalResult(action, result);
630
685
  }
631
686
 
687
+ if (action === "provenance") {
688
+ if (!params.runId)
689
+ return errorResult(action, "action=provenance requires 'runId'");
690
+ const run = loadRun(ctx.cwd, params.runId);
691
+ if (!run) return errorResult(action, `Run not found: ${params.runId}`);
692
+ return {
693
+ content: [{ type: "text", text: formatProvenance(run) }],
694
+ details: { action } satisfies TaskflowDetails,
695
+ };
696
+ }
697
+
698
+ if (action === "why-stale") {
699
+ if (!params.runId)
700
+ return errorResult(action, "action=why-stale requires 'runId'");
701
+ const run = loadRun(ctx.cwd, params.runId);
702
+ if (!run) return errorResult(action, `Run not found: ${params.runId}`);
703
+ const reads = readMapOf(run.phases);
704
+ const seeds = params.phaseId ? [String(params.phaseId)] : [];
705
+ return {
706
+ content: [{ type: "text", text: formatWhyStale(run.runId, run.flowName, reads, seeds) }],
707
+ details: { action } satisfies TaskflowDetails,
708
+ };
709
+ }
710
+
711
+ if (action === "recompute") {
712
+ if (!params.runId)
713
+ return errorResult(action, "action=recompute requires 'runId'");
714
+ if (!params.phaseId)
715
+ return errorResult(action, "action=recompute requires 'phaseId' (the seed phase to re-run)");
716
+ const prev = loadRun(ctx.cwd, params.runId);
717
+ if (!prev) return errorResult(action, `Run not found: ${params.runId}`);
718
+ // H1: the LLM-callable tool defaults to a SAFE dry-run (no tokens, no
719
+ // mutation). A real recompute — which spends money and overwrites the
720
+ // run — requires an explicit dryRun:false.
721
+ const dryRun = params.dryRun !== false;
722
+ const settings = readSubagentSettings();
723
+ const { agents } = discoverAgents(ctx.cwd, prev.def.agentScope ?? "user", settings.modelRoles, settings.taskflow);
724
+ const deps: RuntimeDeps = {
725
+ cwd: ctx.cwd,
726
+ agents,
727
+ globalThinking: settings.globalThinking,
728
+ signal,
729
+ loadFlow: (name: string) => getFlow(ctx.cwd, name)?.def,
730
+ };
731
+ const { report, state } = await recomputeTaskflow(prev, deps, [String(params.phaseId)], { dryRun });
732
+ // H2: never persist a partial/aborted recompute over the original run.
733
+ if (!dryRun && !report.aborted) saveRun(state, { maxKeep: settings.taskflow.maxKeptRuns, maxAgeDays: settings.taskflow.maxRunAgeDays });
734
+ const prefix = report.aborted ? "⚠ ABORTED mid-recompute — original run left unchanged.\n\n" : "";
735
+ return {
736
+ content: [{ type: "text", text: prefix + formatRecompute(report) }],
737
+ details: { action } satisfies TaskflowDetails,
738
+ };
739
+ }
740
+
632
741
  // resolve the definition: inline `define` / shorthand (single|parallel|chain), else saved `name`.
633
742
  let def: Taskflow | undefined;
634
743
 
@@ -822,7 +931,7 @@ export default function (pi: ExtensionAPI) {
822
931
  pi.registerCommand("tf", {
823
932
  description: "Taskflow: list | run <name> | show <name> | compile <name> | runs | init",
824
933
  getArgumentCompletions: (prefix) => {
825
- const subs = ["list", "run", "show", "runs", "resume", "init", "save", "verify", "compile"];
934
+ const subs = ["list", "run", "show", "runs", "resume", "init", "save", "verify", "compile", "provenance", "why-stale", "recompute"];
826
935
  const items = subs.map((s) => ({ value: s, label: s }));
827
936
  const filtered = items.filter((i) => i.value.startsWith(prefix));
828
937
  return filtered.length > 0 ? filtered : null;
@@ -878,6 +987,69 @@ export default function (pi: ExtensionAPI) {
878
987
  return;
879
988
  }
880
989
 
990
+ if (sub === "provenance") {
991
+ if (!arg) {
992
+ ctx.ui.notify("Usage: /tf provenance <runId>", "warning");
993
+ return;
994
+ }
995
+ const run = loadRun(ctx.cwd, arg);
996
+ if (!run) {
997
+ ctx.ui.notify(`Run not found: ${arg}`, "error");
998
+ return;
999
+ }
1000
+ ctx.ui.notify(formatProvenance(run), "info");
1001
+ return;
1002
+ }
1003
+
1004
+ if (sub === "why-stale") {
1005
+ if (!arg) {
1006
+ ctx.ui.notify("Usage: /tf why-stale <runId> [phaseId]", "warning");
1007
+ return;
1008
+ }
1009
+ const [rid, ...rest] = arg.trim().split(/\s+/);
1010
+ const run = loadRun(ctx.cwd, rid);
1011
+ if (!run) {
1012
+ ctx.ui.notify(`Run not found: ${rid}`, "error");
1013
+ return;
1014
+ }
1015
+ const reads = readMapOf(run.phases);
1016
+ ctx.ui.notify(formatWhyStale(run.runId, run.flowName, reads, rest), "info");
1017
+ return;
1018
+ }
1019
+
1020
+ if (sub === "recompute") {
1021
+ const tokens = (arg ?? "").trim().split(/\s+/).filter(Boolean);
1022
+ const rid = tokens[0];
1023
+ const seed = tokens.find((t) => t !== rid && !t.startsWith("--"));
1024
+ const apply = tokens.includes("--apply");
1025
+ if (!rid || !seed) {
1026
+ ctx.ui.notify("Usage: /tf recompute <runId> <phaseId> [--apply]\n(default is a safe dry-run; --apply spends tokens)", "warning");
1027
+ return;
1028
+ }
1029
+ const prev = loadRun(ctx.cwd, rid);
1030
+ if (!prev) {
1031
+ ctx.ui.notify(`Run not found: ${rid}`, "error");
1032
+ return;
1033
+ }
1034
+ const settings = readSubagentSettings();
1035
+ const { agents } = discoverAgents(ctx.cwd, prev.def.agentScope ?? "user", settings.modelRoles, settings.taskflow);
1036
+ const deps: RuntimeDeps = {
1037
+ cwd: ctx.cwd,
1038
+ agents,
1039
+ globalThinking: settings.globalThinking,
1040
+ loadFlow: (name: string) => getFlow(ctx.cwd, name)?.def,
1041
+ };
1042
+ if (apply) {
1043
+ const { report, state } = await recomputeTaskflow(prev, deps, [seed], { dryRun: false });
1044
+ if (!report.aborted) saveRun(state, { maxKeep: settings.taskflow.maxKeptRuns, maxAgeDays: settings.taskflow.maxRunAgeDays });
1045
+ ctx.ui.notify(formatRecompute(report), report.aborted ? "warning" : "info");
1046
+ } else {
1047
+ const { report } = await recomputeTaskflow(prev, deps, [seed], { dryRun: true });
1048
+ ctx.ui.notify(formatRecompute(report), "info");
1049
+ }
1050
+ return;
1051
+ }
1052
+
881
1053
  if (sub === "runs") {
882
1054
  const runs = listRuns(ctx.cwd, 50);
883
1055
  if (runs.length === 0) {
@@ -1123,6 +1295,17 @@ function errorResult(action: string, message: string): ToolResult {
1123
1295
  };
1124
1296
  }
1125
1297
 
1298
/**
 * Summarize how many phases of a run were served from the cross-run cache.
 *
 * Counts the phases whose `cacheHit` is "cross-run" and adds up the
 * input + output token counts recorded on those phases. No dollar or token
 * "savings" figure is estimated: that would require guessing what a
 * re-execution would have cost, and the mix of cheap vs expensive phases
 * makes such a guess misleading.
 *
 * @param state The finished run.
 * @param totalUsage Run-level usage totals; not consulted by this function.
 * @returns A one-line summary, or "" when no phase was a cross-run cache hit.
 */
function formatCacheReport(state: RunState, totalUsage: UsageStats): string {
  let hits = 0;
  let tokens = 0;
  for (const phase of Object.values(state.phases)) {
    if (phase.cacheHit !== "cross-run") continue;
    hits += 1;
    tokens += (phase.usage?.input ?? 0) + (phase.usage?.output ?? 0);
  }
  if (hits === 0) {
    return "";
  }
  return `💾 ${hits} phase(s) reused from cross-run cache (${tokens.toLocaleString()} tokens spent on them this run)`;
}
1308
+
1126
1309
  function finalResult(action: string, result: RuntimeResult): ToolResult {
1127
1310
  const fp = finalPhase(result.state.def.phases);
1128
1311
  const header = result.ok
@@ -1130,7 +1313,7 @@ function finalResult(action: string, result: RuntimeResult): ToolResult {
1130
1313
  : `Taskflow '${result.state.flowName}' ${result.state.status} (${summarizeRun(result.state)}). Run id: ${result.state.runId} — resume with action=resume.`;
1131
1314
  return {
1132
1315
  content: [{ type: "text", text: `${header}\n\n--- ${fp.id} ---\n${result.finalOutput}` }],
1133
- details: { action, state: result.state, finalOutput: result.finalOutput },
1316
+ details: { action, state: result.state, finalOutput: result.finalOutput, cacheReport: formatCacheReport(result.state, result.totalUsage) },
1134
1317
  isError: !result.ok,
1135
1318
  };
1136
1319
  }
@@ -21,6 +21,12 @@ export interface InterpolationContext {
21
21
  previousOutput?: string;
22
22
  /** loop variable bindings, e.g. { item: {...} } */
23
23
  locals?: Record<string, unknown>;
24
+ /** Observed-read hook (M3): invoked once per successfully-resolved
25
+ * placeholder path, so the runtime can capture which upstream phases a
26
+ * phase actually consumed (its observed readSet). Unresolved refs do NOT
27
+ * fire it (they become `missing` warnings instead). Default undefined →
28
+ * zero overhead, fully backward-compatible. */
29
+ onRead?: (ref: string) => void;
24
30
  }
25
31
 
26
32
  const PLACEHOLDER = /\{([a-zA-Z0-9_-]+(?:\.[a-zA-Z0-9_-]+)*)\}/g;
@@ -48,7 +54,18 @@ export function interpolate(
48
54
  return { text, missing };
49
55
  }
50
56
 
57
/**
 * Resolve a placeholder path and report it as an observed read.
 *
 * Delegates the lookup to `_resolvePath`. The optional `ctx.onRead` hook is
 * invoked with the path only when the lookup produced a value other than
 * `undefined`, so an unresolved ref is never recorded as a dependency (it
 * stays a `missing` warning).
 *
 * @param path Dotted placeholder path.
 * @param ctx Interpolation context, optionally carrying an `onRead` collector.
 * @returns The resolved value, or `undefined` when the path does not resolve.
 */
function resolvePath(path: string, ctx: InterpolationContext): unknown {
  const resolved = _resolvePath(path, ctx);
  if (resolved === undefined) {
    return resolved;
  }
  ctx.onRead?.(path);
  return resolved;
}
67
+
68
+ function _resolvePath(path: string, ctx: InterpolationContext): unknown {
52
69
  const parts = path.split(".");
53
70
  const head = parts[0];
54
71
 
@@ -20,6 +20,8 @@ import { type Budget, type CacheScope, dependenciesOf, finalPhase, LOOP_DEFAULT_
20
20
  import { verifyTaskflow } from "./verify.ts";
21
21
  import { hashInput, newRunId, type PhaseState, type RunState, runsDir } from "./store.ts";
22
22
  import { CacheStore, resolveFingerprint } from "./cache.ts";
23
+ import { flowDefHash } from "./flowir/hash.ts";
24
+ import { computeStaleFrontier, readMapOf } from "./stale.ts";
23
25
  import { ctxDirFor, drainPendingSpawns, initCtxDir, registerNode, setNodeStatus, type SpawnAssignment } from "./context-store.ts";
24
26
  import { allocateWorkspace, isWorkspaceKeyword, type Workspace } from "./workspace.ts";
25
27
 
@@ -55,6 +57,8 @@ export interface RuntimeDeps {
55
57
  loadFlow?: (name: string) => Taskflow | undefined;
56
58
  /** Cross-run memoization store. Omit to construct a default one for `deps.cwd`. */
57
59
  cacheStore?: CacheStore;
60
+ /** Default cache scope for phases that don't specify one. */
61
+ cacheScopeDefault?: CacheScope;
58
62
  /** Internal: sub-flow call stack, for recursion detection. */
59
63
  _stack?: string[];
60
64
  /** Internal: pre-resolved Shared Context Tree dir for this run (sub-flows inherit the parent's). */
@@ -74,6 +78,7 @@ function buildInterpolationContext(
74
78
  state: RunState,
75
79
  previousOutput: string | undefined,
76
80
  locals?: Record<string, unknown>,
81
+ onRead?: (ref: string) => void,
77
82
  ): InterpolationContext {
78
83
  const steps: Record<string, { output: string; json?: unknown }> = {};
79
84
  for (const [id, ps] of Object.entries(state.phases)) {
@@ -90,7 +95,7 @@ function buildInterpolationContext(
90
95
  }
91
96
  }
92
97
  }
93
- return { args: state.args, steps, previousOutput, locals };
98
+ return { args: state.args, steps, previousOutput, locals, onRead };
94
99
  }
95
100
 
96
101
  function resultToPhaseState(id: string, r: RunResult, inputHash: string, parseJson: boolean): PhaseState { const failed = isFailed(r);
@@ -115,6 +120,27 @@ function resultToPhaseState(id: string, r: RunResult, inputHash: string, parseJs
115
120
  };
116
121
  }
117
122
 
123
/**
 * Convert observed read refs (e.g. "steps.scout.output") into a structured
 * readSet keyed by upstream phase id, tagging each entry with the version
 * (= `inputHash`) currently recorded for that phase in `state`.
 *
 * Only refs that start with `steps.<id>` count as upstream phase
 * dependencies; anything else (args / loop item / previous output) is
 * ignored. Each phase id appears at most once, in first-seen order.
 *
 * The id is captured as the whole run of id characters after `steps.`. A
 * trailing word-boundary assertion is deliberately NOT used: `-` is a legal
 * id character but not a word character, so `\b` would make the match
 * backtrack and truncate ids ending in `-` (attributing the read to the
 * wrong phase) or drop them entirely.
 *
 * @param refs Placeholder paths that resolved successfully for this phase.
 * @param state Current run state, used to look up each upstream `inputHash`.
 * @returns One `{ stepId, version }` per distinct upstream phase; `version`
 *   is `undefined` when the phase has no recorded state or no `inputHash`.
 */
function readRefsToReads(
  refs: string[],
  state: RunState,
): Array<{ stepId: string; version?: string }> {
  const stepRef = /^steps\.([A-Za-z0-9_-]+)/;
  const out: Array<{ stepId: string; version?: string }> = [];
  const seen = new Set<string>();
  for (const ref of refs) {
    const stepId = stepRef.exec(ref)?.[1];
    if (!stepId || seen.has(stepId)) continue;
    seen.add(stepId);
    out.push({ stepId, version: state.phases[stepId]?.inputHash });
  }
  return out;
}
143
+
118
144
  /**
119
145
  * Surface unresolved interpolation placeholders (the `missing[]` from
120
146
  * `interpolate()`). Without this they are silently left intact in the task —
@@ -551,6 +577,15 @@ async function runSpawnedChildren(
551
577
  * and tears it down afterwards. All allocation is fail-open: a failed allocation
552
578
  * degrades to the base cwd so a phase never fails to run because of isolation.
553
579
  */
580
/**
 * Per-invocation execution flags for a single phase run.
 * Every field is optional; omitting the object leaves default behavior.
 */
interface PhaseExecOpts {
  /**
   * When true, bypass the cache entirely (within-run prior AND cross-run
   * store) and re-execute the phase. Used by `/tf recompute` on the seeded
   * phase so that its new output — and only the downstream whose inputHash
   * actually moves — refreshes.
   */
  forceRerun?: boolean;
}
588
+
554
589
  async function executePhase(
555
590
  phase: Phase,
556
591
  state: RunState,
@@ -558,10 +593,11 @@ async function executePhase(
558
593
  prior: PhaseState | undefined,
559
594
  emitProgress: () => void,
560
595
  _retryDepth = 0,
596
+ opts?: PhaseExecOpts,
561
597
  ): Promise<PhaseState> {
562
598
  // Non-keyword cwd (or none): no workspace lifecycle — run directly.
563
599
  if (!isWorkspaceKeyword(phase.cwd)) {
564
- return executePhaseInner(phase, state, deps, prior, emitProgress, _retryDepth);
600
+ return executePhaseInner(phase, state, deps, prior, emitProgress, _retryDepth, opts);
565
601
  }
566
602
  let ws: Workspace | undefined;
567
603
  try {
@@ -576,7 +612,7 @@ async function executePhase(
576
612
  }
577
613
  const innerDeps: RuntimeDeps = ws ? { ...deps, _cwdOverride: ws.dir } : deps;
578
614
  try {
579
- const ps = await executePhaseInner(phase, state, innerDeps, prior, emitProgress, _retryDepth);
615
+ const ps = await executePhaseInner(phase, state, innerDeps, prior, emitProgress, _retryDepth, opts);
580
616
  if (ws && (ws.kind !== "inherited" || ws.note)) {
581
617
  const tag = ws.kind === "inherited" ? "workspace" : `workspace:${ws.kind}`;
582
618
  const msg = ws.note ? `${tag} — ${ws.note}` : `${tag} at ${ws.dir}`;
@@ -599,6 +635,7 @@ async function executePhaseInner(
599
635
  prior: PhaseState | undefined,
600
636
  emitProgress: () => void,
601
637
  _retryDepth = 0,
638
+ opts?: PhaseExecOpts,
602
639
  ): Promise<PhaseState> {
603
640
  const type = phase.type ?? "agent";
604
641
  const concurrency = phase.concurrency ?? state.def.concurrency ?? 8;
@@ -631,13 +668,49 @@ async function executePhaseInner(
631
668
  // Resolve context pre-read files once, before any type branching.
632
669
  // The content is prepended to every task so the subagent never spends
633
670
  // turns on file exploration for files the flow author already knows.
634
- const ctx = buildInterpolationContext(state, previousOutput);
671
+ // M3 observed-readSet: collect every upstream ref this phase resolves, so we
672
+ // can record what its result ACTUALLY depended on (not just its declared
673
+ // dependsOn). Shared by every interpolation in this phase (task / when / …).
674
+ const readRefs: string[] = [];
675
+ const onRead = (ref: string): void => {
676
+ readRefs.push(ref);
677
+ };
678
+ const ctx = buildInterpolationContext(state, previousOutput, undefined, onRead);
679
+
680
+ // M3 observed-readSet: when conditions are part of the phase's real
681
+ // dependencies. Evaluate them inside executePhaseInner so every upstream
682
+ // interpolation is captured by the shared onRead hook, not silently dropped
683
+ // by a separate out-of-band context.
684
+ if (phase.when !== undefined) {
685
+ if (!evaluateCondition(phase.when, ctx)) {
686
+ return {
687
+ id: phase.id,
688
+ status: "skipped",
689
+ error: `Condition not met: ${phase.when}`,
690
+ endedAt: Date.now(),
691
+ usage: emptyUsage(),
692
+ reads: readRefsToReads(readRefs, state),
693
+ };
694
+ }
695
+ }
696
+
635
697
  const preRead = await resolvePhaseContext(phase, ctx);
636
698
 
637
699
  // Resolve this phase's cache policy once. Default scope is "run-only" (the
638
700
  // historical within-run resume behavior). Only "cross-run" phases resolve a
639
701
  // fingerprint and consult the persistent store.
640
- const cacheScope: CacheScope = (phase.cache?.scope ?? "run-only") as CacheScope;
702
+ let cacheScope: CacheScope = (phase.cache?.scope ?? deps.cacheScopeDefault ?? "run-only") as CacheScope;
703
+ // Defense in depth: gate/approval/loop/tournament must produce a fresh result
704
+ // each run (schema already rejects explicit cross-run, but the default-scope
705
+ // path must also be blocked). If flowDefHash failed, cross-run is unsafe
706
+ // because the key degrades to flowName-only and reopens cross-flow collisions.
707
+ const CROSS_RUN_BLOCKED_TYPES = new Set(["gate", "approval", "loop", "tournament"]);
708
+ if (cacheScope === "cross-run" && CROSS_RUN_BLOCKED_TYPES.has(type)) {
709
+ cacheScope = "run-only";
710
+ }
711
+ if (state.flowDefHash === "failed" && cacheScope === "cross-run") {
712
+ cacheScope = "run-only";
713
+ }
641
714
  const cc: PhaseCacheCtx = {
642
715
  scope: cacheScope,
643
716
  ttlMs: phase.cache?.ttl ? (parseTtlMs(phase.cache.ttl) ?? undefined) : undefined,
@@ -647,6 +720,8 @@ async function executePhaseInner(
647
720
  phaseId: phase.id,
648
721
  flowName: state.flowName,
649
722
  runId: state.runId,
723
+ flowDefHash: state.flowDefHash === "failed" ? undefined : state.flowDefHash,
724
+ forceRerun: opts?.forceRerun,
650
725
  thinking: phase.thinking,
651
726
  tools: phase.tools,
652
727
  preRead,
@@ -823,7 +898,7 @@ async function executePhaseInner(
823
898
  if (type === "agent" || type === "gate" || type === "reduce") {
824
899
  // Eval gate: zero-token machine checks before the LLM gate.
825
900
  if (type === "gate" && Array.isArray(phase.eval) && phase.eval.length > 0) {
826
- const evalCtx = buildInterpolationContext(state, previousOutput);
901
+ const evalCtx = buildInterpolationContext(state, previousOutput, undefined, onRead);
827
902
  let allPassed = true;
828
903
  for (const check of phase.eval) {
829
904
  let expr = check;
@@ -858,6 +933,7 @@ async function executePhaseInner(
858
933
  inputHash,
859
934
  endedAt: Date.now(),
860
935
  };
936
+ if (readRefs.length) ps.reads = readRefsToReads(readRefs, state);
861
937
  recordCache(cc, ps);
862
938
  return ps;
863
939
  }
@@ -873,6 +949,7 @@ async function executePhaseInner(
873
949
 
874
950
  const r = await runOne(agentName, fullTask, liveSink(state, phase.id, emitProgress), nodeIdFor());
875
951
  const ps = resultToPhaseState(phase.id, r, inputHash, parseJson);
952
+ if (readRefs.length) ps.reads = readRefsToReads(readRefs, state);
876
953
  if (refWarning) ps.warnings = [...(ps.warnings ?? []), refWarning];
877
954
  if (type === "gate" && ps.status === "done") ps.gate = parseGateVerdict(r.output);
878
955
 
@@ -919,7 +996,7 @@ async function executePhaseInner(
919
996
  for (const depId of phase.dependsOn ?? []) {
920
997
  const d = state.def.phases.find((p) => p.id === depId);
921
998
  if (!d) continue;
922
- const dPs = await executePhase(d, state, depsForUpstream, prior, emitProgress, _retryDepth + 1);
999
+ const dPs = await executePhase(d, state, depsForUpstream, prior, emitProgress, _retryDepth + 1, undefined);
923
1000
  state.phases[depId] = dPs;
924
1001
  }
925
1002
  }
@@ -954,6 +1031,7 @@ async function executePhaseInner(
954
1031
 
955
1032
  const results = await runFanout(branches);
956
1033
  const ps = mergePhaseState(phase.id, results, inputHash, parseJson);
1034
+ if (readRefs.length) ps.reads = readRefsToReads(readRefs, state);
957
1035
  recordCache(cc, ps);
958
1036
  return ps;
959
1037
  }
@@ -982,7 +1060,7 @@ async function executePhaseInner(
982
1060
  }
983
1061
  const loopVar = phase.as ?? "item";
984
1062
  const tasks = arr.map((item) => {
985
- const localCtx = buildInterpolationContext(state, previousOutput, { [loopVar]: item });
1063
+ const localCtx = buildInterpolationContext(state, previousOutput, { [loopVar]: item }, onRead);
986
1064
  return {
987
1065
  agent: resolveAgent(phase.agent, deps, state),
988
1066
  task: preRead + interpolate(phase.task ?? "", localCtx).text,
@@ -994,6 +1072,7 @@ async function executePhaseInner(
994
1072
 
995
1073
  const results = await runFanout(tasks);
996
1074
  const ps = mergePhaseState(phase.id, results, inputHash, parseJson);
1075
+ if (readRefs.length) ps.reads = readRefsToReads(readRefs, state);
997
1076
  if (mapTruncated) {
998
1077
  ps.warnings = [...(ps.warnings ?? []), `map fan-out truncated to MAX_DYNAMIC_MAP_ITEMS (${MAX_DYNAMIC_MAP_ITEMS}) inside a dynamic sub-flow`];
999
1078
  // NB: do NOT set ps.budgetTruncated — that field drives the run-level
@@ -1005,9 +1084,10 @@ async function executePhaseInner(
1005
1084
  }
1006
1085
 
1007
1086
  if (type === "approval") {
1008
- const ctx = buildInterpolationContext(state, previousOutput);
1087
+ const readRefs: string[] = [];
1088
+ const ctx = buildInterpolationContext(state, previousOutput, undefined, (ref) => readRefs.push(ref));
1009
1089
  const message = interpolate(phase.task ?? "Approve to continue?", ctx).text;
1010
- const inputHash = hashInput(phase.id, phase.model ?? "", "approval", message);
1090
+ const inputHash = cacheKey(cc, [phase.id, phase.model ?? "", "approval", message]);
1011
1091
  const cached = cachedPhase(cc, inputHash);
1012
1092
  if (cached) return cached;
1013
1093
 
@@ -1023,6 +1103,7 @@ async function executePhaseInner(
1023
1103
  gate: { verdict: "block", reason: "(auto-rejected: no interactive approver available)" },
1024
1104
  usage: emptyUsage(),
1025
1105
  inputHash,
1106
+ reads: readRefsToReads(readRefs, state),
1026
1107
  endedAt: Date.now(),
1027
1108
  };
1028
1109
  }
@@ -1035,6 +1116,7 @@ async function executePhaseInner(
1035
1116
  approval: { decision: decision.decision, note },
1036
1117
  usage: emptyUsage(),
1037
1118
  inputHash,
1119
+ reads: readRefsToReads(readRefs, state),
1038
1120
  endedAt: Date.now(),
1039
1121
  };
1040
1122
  // A rejection halts the flow via the same mechanism as a blocking gate.
@@ -1045,7 +1127,8 @@ async function executePhaseInner(
1045
1127
  }
1046
1128
 
1047
1129
  if (type === "flow") {
1048
- const ctx = buildInterpolationContext(state, previousOutput);
1130
+ const readRefs: string[] = [];
1131
+ const ctx = buildInterpolationContext(state, previousOutput, undefined, (ref) => readRefs.push(ref));
1049
1132
  const hasDef = (phase as { def?: unknown }).def !== undefined;
1050
1133
  const stack = deps._stack ?? [];
1051
1134
 
@@ -1066,6 +1149,7 @@ async function executePhaseInner(
1066
1149
  json: parseJson ? safeParse("") : undefined,
1067
1150
  usage: emptyUsage(),
1068
1151
  inputHash: hashInput(phase.id, `flow-def-error:${diag}`),
1152
+ reads: readRefsToReads(readRefs, state),
1069
1153
  endedAt: Date.now(),
1070
1154
  defError: diag,
1071
1155
  });
@@ -1101,6 +1185,7 @@ async function executePhaseInner(
1101
1185
  json: parseJson ? safeParse("") : undefined,
1102
1186
  usage: emptyUsage(),
1103
1187
  inputHash: hashInput(phase.id, "flow-def-empty"),
1188
+ reads: readRefsToReads(readRefs, state),
1104
1189
  endedAt: Date.now(),
1105
1190
  };
1106
1191
  }
@@ -1222,6 +1307,7 @@ async function executePhaseInner(
1222
1307
  },
1223
1308
  error: subResult.ok ? undefined : `sub-flow '${name}' ${subResult.state.status}`,
1224
1309
  inputHash,
1310
+ reads: readRefsToReads(readRefs, state),
1225
1311
  endedAt: Date.now(),
1226
1312
  };
1227
1313
  recordCache(cc, flowPs);
@@ -1231,11 +1317,21 @@ async function executePhaseInner(
1231
1317
  // loop-until-done: run the body repeatedly until `until` is truthy, the output
1232
1318
  // converges to a fixed point, or maxIterations is hit (always terminates).
1233
1319
  if (type === "loop") {
1320
+ const readRefs: string[] = [];
1234
1321
  const agentName = resolveAgent(phase.agent, deps, state);
1235
1322
  const rawMax = phase.maxIterations ?? LOOP_DEFAULT_MAX_ITERATIONS;
1236
1323
  const maxIters = Math.max(1, Math.min(LOOP_HARD_MAX_ITERATIONS, Math.floor(rawMax)));
1237
1324
  const convergence = phase.convergence ?? true;
1238
1325
 
1326
+ // Canonical first-iteration body for the cache key. It must fold in the
1327
+ // interpolated task/upstream refs so that a changed upstream changes the
1328
+ // key and recompute no longer silently reuses a stale loop (critic finding).
1329
+ const firstBodyCtx = buildInterpolationContext(state, previousOutput, {
1330
+ loop: { iteration: 1, lastOutput: "", maxIterations: maxIters },
1331
+ }, (ref) => readRefs.push(ref));
1332
+ const firstBody = preRead + interpolate(phase.task ?? "", firstBodyCtx).text;
1333
+ const inputHash = hashInput(phase.id, "loop", phase.until ?? "", firstBody, String(maxIters));
1334
+
1239
1335
  const usages: UsageStats[] = [];
1240
1336
  const loopWarnings: string[] = [];
1241
1337
  let lastOutput = "";
@@ -1253,7 +1349,7 @@ async function executePhaseInner(
1253
1349
  // The body sees its iteration number and the prior iteration's output.
1254
1350
  const bodyCtx = buildInterpolationContext(state, previousOutput, {
1255
1351
  loop: { iteration: i, lastOutput, maxIterations: maxIters },
1256
- });
1352
+ }, (ref) => readRefs.push(ref));
1257
1353
  const body = preRead + interpolate(phase.task ?? "", bodyCtx).text;
1258
1354
  const r = await runOne(agentName, body, liveSink(state, phase.id, emitProgress));
1259
1355
  usages.push(r.usage);
@@ -1270,7 +1366,7 @@ async function executePhaseInner(
1270
1366
  // Loop locals ({loop.iteration} etc.) are available to the condition too.
1271
1367
  const untilCtx = buildInterpolationContext(state, previousOutput, {
1272
1368
  loop: { iteration: i, lastOutput, maxIterations: maxIters },
1273
- });
1369
+ }, (ref) => readRefs.push(ref));
1274
1370
  untilCtx.steps[phase.id] = { output: lastOutput, json: safeParse(lastOutput) };
1275
1371
  const { value: done, error: condErr } = tryEvaluateCondition(phase.until ?? "", untilCtx);
1276
1372
  // A malformed condition must not spin forever: stop and surface a warning
@@ -1301,7 +1397,8 @@ async function executePhaseInner(
1301
1397
  error: failedResult?.errorMessage || failedResult?.stderr || (stop === "aborted" ? "Aborted" : `loop '${phase.id}' iteration ${iterations} failed`),
1302
1398
  loop: { iterations, stop },
1303
1399
  warnings: loopWarnings.length ? loopWarnings : undefined,
1304
- inputHash: hashInput(phase.id, "loop", phase.until ?? ""),
1400
+ inputHash,
1401
+ reads: readRefsToReads(readRefs, state),
1305
1402
  endedAt: Date.now(),
1306
1403
  };
1307
1404
  }
@@ -1313,7 +1410,8 @@ async function executePhaseInner(
1313
1410
  usage: aggUsage,
1314
1411
  loop: { iterations, stop },
1315
1412
  warnings: loopWarnings.length ? loopWarnings : undefined,
1316
- inputHash: hashInput(phase.id, "loop", phase.until ?? "", String(iterations)),
1413
+ inputHash,
1414
+ reads: readRefsToReads(readRefs, state),
1317
1415
  endedAt: Date.now(),
1318
1416
  };
1319
1417
  }
@@ -1336,6 +1434,20 @@ async function executePhaseInner(
1336
1434
  competitors = Array.from({ length: n }, () => ({ agent: resolveAgent(phase.agent, deps, state), task: body }));
1337
1435
  }
1338
1436
 
1437
+ // The inputHash must fold in the resolved competitors (which embed the
1438
+ // interpolated task/upstream refs) and the judge rubric, otherwise a changed
1439
+ // upstream produces the same key and recompute silently reuses a stale
1440
+ // tournament (critic finding: unsound for cross-run/recompute).
1441
+ const rubric = interpolate(phase.judge ?? "", ctx).text.trim();
1442
+ const inputHash = hashInput(
1443
+ phase.id,
1444
+ "tournament",
1445
+ mode,
1446
+ String(competitors.length),
1447
+ JSON.stringify(competitors.map((c) => ({ agent: c.agent, task: c.task }))),
1448
+ rubric,
1449
+ );
1450
+
1339
1451
  const results = await runFanout(competitors);
1340
1452
  const ran = results.filter((r) => r.stopReason !== "budget-skipped");
1341
1453
  const ok = ran.filter((r) => !isFailed(r));
@@ -1355,7 +1467,8 @@ async function executePhaseInner(
1355
1467
  error: `tournament '${phase.id}': all ${competitors.length} variants failed`,
1356
1468
  budgetTruncated: budgetSkipCount > 0 || undefined,
1357
1469
  tournament: { variants: competitors.length, winner: 0, mode },
1358
- inputHash: hashInput(phase.id, "tournament", String(competitors.length)),
1470
+ inputHash,
1471
+ reads: readRefsToReads(readRefs, state),
1359
1472
  endedAt: Date.now(),
1360
1473
  };
1361
1474
  }
@@ -1370,7 +1483,8 @@ async function executePhaseInner(
1370
1483
  model: ok[0].model,
1371
1484
  budgetTruncated: budgetSkipCount > 0 || undefined,
1372
1485
  tournament: { variants: competitors.length, winner: ranIdx(ok[0]), mode, reason: "only surviving variant" },
1373
- inputHash: hashInput(phase.id, "tournament", String(competitors.length)),
1486
+ inputHash,
1487
+ reads: readRefsToReads(readRefs, state),
1374
1488
  endedAt: Date.now(),
1375
1489
  };
1376
1490
  }
@@ -1387,7 +1501,8 @@ async function executePhaseInner(
1387
1501
  budgetTruncated: budgetSkipCount > 0 || undefined,
1388
1502
  warnings: ["judge skipped: run aborted or budget exceeded"],
1389
1503
  tournament: { variants: competitors.length, winner: ranIdx(ok[0]), mode, reason: "judge skipped" },
1390
- inputHash: hashInput(phase.id, "tournament", String(competitors.length)),
1504
+ inputHash,
1505
+ reads: readRefsToReads(readRefs, state),
1391
1506
  endedAt: Date.now(),
1392
1507
  };
1393
1508
  }
@@ -1396,14 +1511,14 @@ async function executePhaseInner(
1396
1511
  const labelled = ran
1397
1512
  .map((r, i) => `### Variant ${i + 1}${isFailed(r) ? " (failed — ineligible)" : ""}\n\n${r.output}`)
1398
1513
  .join("\n\n---\n\n");
1399
- const rubric =
1400
- interpolate(phase.judge ?? "", ctx).text.trim() ||
1514
+ const finalRubric =
1515
+ rubric ||
1401
1516
  "You are judging competing answers to the same task. Pick the single best variant on correctness, completeness, and clarity.";
1402
1517
  const directive =
1403
1518
  mode === "best"
1404
1519
  ? `End your reply with a line exactly: WINNER: <number> (1–${ran.length}), choosing the strongest eligible variant.`
1405
1520
  : `Synthesize the strongest possible answer by combining the best parts of the eligible variants. Then end with a line: WINNER: <number> indicating which variant contributed most.`;
1406
- const judgeTask = `${rubric}\n\nThe candidate variants:\n\n${labelled}\n\n${directive}`;
1521
+ const judgeTask = `${finalRubric}\n\nThe candidate variants:\n\n${labelled}\n\n${directive}`;
1407
1522
  const judgeAgent = resolveAgent(phase.judgeAgent ?? phase.agent, deps, state);
1408
1523
  const judgeRes = await runOne(judgeAgent, judgeTask, liveSink(state, phase.id, emitProgress));
1409
1524
  const judgeUsage = aggregateUsage([variantUsage, judgeRes.usage]);
@@ -1421,7 +1536,8 @@ async function executePhaseInner(
1421
1536
  budgetTruncated: budgetSkipCount > 0 || undefined,
1422
1537
  warnings: [`judge failed (${judgeRes.errorMessage ?? "error"}); used variant ${ranIdx(ok[0])}`],
1423
1538
  tournament: { variants: competitors.length, winner: ranIdx(ok[0]), mode, reason: "judge failed" },
1424
- inputHash: hashInput(phase.id, "tournament", String(competitors.length)),
1539
+ inputHash,
1540
+ reads: readRefsToReads(readRefs, state),
1425
1541
  endedAt: Date.now(),
1426
1542
  };
1427
1543
  }
@@ -1444,7 +1560,8 @@ async function executePhaseInner(
1444
1560
  budgetTruncated: budgetSkipCount > 0 || undefined,
1445
1561
  warnings: winnerIneligible ? [`judge picked an ineligible variant; used variant ${winnerIdx}`] : undefined,
1446
1562
  tournament: { variants: competitors.length, winner: winnerIdx, mode, reason },
1447
- inputHash: hashInput(phase.id, "tournament", String(competitors.length), mode),
1563
+ inputHash,
1564
+ reads: readRefsToReads(readRefs, state),
1448
1565
  endedAt: Date.now(),
1449
1566
  };
1450
1567
  }
@@ -1509,6 +1626,15 @@ interface PhaseCacheCtx {
1509
1626
  * whether a given branch happens to fold preRead into its task string
1510
1627
  * (previously this was only incidentally true via `fullTask`). */
1511
1628
  preRead?: string;
1629
+ /** Content fingerprint of the desugared flow definition — folded into the
1630
+ * key so two structurally-different flows that share a name can never
1631
+ * collide, and a changed flow never serves a stale cross-run hit. */
1632
+ flowDefHash?: string | "failed";
1633
+ /** Force this phase to re-execute, ignoring the within-run prior AND the
1634
+ * cross-run store (M5 recompute seed). Downstream phases are NOT forced —
1635
+ * they re-evaluate naturally: if the seed's new output changed their
1636
+ * inputHash they miss and re-run, otherwise they hit (early cutoff). */
1637
+ forceRerun?: boolean;
1512
1638
  }
1513
1639
 
1514
1640
  /** Fold the phase fingerprint into the base hash parts to form the final cache key. */
@@ -1519,6 +1645,7 @@ function cacheKey(cc: PhaseCacheCtx, baseParts: string[]): string {
1519
1645
  // resolved context pre-read content, and the world-state fingerprint.
1520
1646
  const parts = [
1521
1647
  `flow:${cc.flowName}`,
1648
+ `flowdef:${cc.flowDefHash ?? ""}`,
1522
1649
  ...baseParts,
1523
1650
  `think:${cc.thinking ?? ""}`,
1524
1651
  `tools:${JSON.stringify(cc.tools ?? [])}`,
@@ -1536,6 +1663,7 @@ function cacheKey(cc: PhaseCacheCtx, baseParts: string[]): string {
1536
1663
  */
1537
1664
  function cachedPhase(cc: PhaseCacheCtx, inputHash: string): PhaseState | null {
1538
1665
  if (cc.scope === "off") return null;
1666
+ if (cc.forceRerun) return null;
1539
1667
 
1540
1668
  // 1. within-run resume (fastest; always allowed unless scope is off)
1541
1669
  if (cc.prior && cc.prior.status === "done" && cc.prior.inputHash === inputHash) {
@@ -1546,6 +1674,13 @@ function cachedPhase(cc: PhaseCacheCtx, inputHash: string): PhaseState | null {
1546
1674
  if (cc.scope === "cross-run") {
1547
1675
  const e = cc.store.get(inputHash, cc.ttlMs);
1548
1676
  if (e) {
1677
+ // If we stored the full PhaseState, restore it (preserving gate,
1678
+ // approval, reads, loop/tournament metadata, warnings) and just mark
1679
+ // the cache hit + zero usage. Fallback to the legacy trimmed surface
1680
+ // for entries written before this change.
1681
+ if (e.state) {
1682
+ return { ...e.state, inputHash, usage: emptyUsage(), cacheHit: "cross-run", endedAt: Date.now() };
1683
+ }
1549
1684
  return {
1550
1685
  id: cc.phaseId,
1551
1686
  status: "done",
@@ -1573,6 +1708,7 @@ function recordCache(cc: PhaseCacheCtx, ps: PhaseState): void {
1573
1708
  output: ps.output,
1574
1709
  json: ps.json,
1575
1710
  model: ps.model,
1711
+ state: ps,
1576
1712
  flowName: cc.flowName,
1577
1713
  phaseId: cc.phaseId,
1578
1714
  runId: cc.runId,
@@ -1701,6 +1837,155 @@ function safeProgress(deps: RuntimeDeps, state: RunState): void {
1701
1837
  /**
1702
1838
  * Execute a full taskflow. Mutates and persists `state` as it progresses.
1703
1839
  */
1840
+ /** Result of a recompute: what was (or would be) re-executed vs reused.
1841
+ * `cutoff` is the prize — phases in the stale frontier whose inputHash did
1842
+ * NOT move, so they hit their cached result instead of re-running (early
1843
+ * cutoff). That is what makes recompute cheaper than a full re-run.
+ *
+ * Fields without their own comment: `dryRun` records whether this report came
+ * from the dry-run path (nothing executed); `aborted` is set when the abort
+ * signal was observed part-way through a real recompute, in which case the
+ * returned state is partial and should not be persisted; `seeds` echoes the
+ * phase ids the caller asked to force-rerun. */
1844
+ export interface RecomputeReport {
1845
+ readonly dryRun: boolean;
1846
+ readonly aborted: boolean;
1847
+ readonly seeds: readonly string[];
1848
+ /** Phases that were (dry-run: would be) re-executed, or whose result moved. */
1849
+ readonly rerun: readonly string[];
1850
+ /** Phases outside the frontier — untouched, reused verbatim. */
1851
+ readonly reused: readonly string[];
1852
+ /** Phases in the frontier whose inputHash did NOT move → cached result
1853
+ * reused, no re-execution (early cutoff). Empty in dry-run (unknowable). */
1854
+ readonly cutoff: readonly string[];
1855
+ }
1856
+
1857
/**
 * Scan a flow for dependencies that cannot be observed through the readSet.
 *
 * The readSet only records `{steps.X...}` interpolation refs. Anything else a
 * phase can depend on is invisible to it: Shared Context Tree usage, sub-flows,
 * `context:` file pre-reads, and placeholders that do not resolve through
 * `steps.*` (`{previous.output}`, `{args.*}`, and the map loop variable).
 * Recomputing a flow with such dependencies with `dryRun: false` risks
 * silently reusing stale upstream state.
 *
 * The map loop variable is `item` by default but can be renamed per phase via
 * `as`; a renamed variable is scanned for as well, within that phase only.
 *
 * @param state Run whose flow definition (`state.def`) is inspected.
 * @returns `true` as soon as any phase shows an unobservable dependency.
 */
function hasUnobservedDependencies(state: RunState): boolean {
  // Placeholders that never resolve through `steps.*`.
  const builtinLocals = /\{(previous\.output|args\.|item\b|item\.)/;
  const escapeForRegExp = (raw: string): string => raw.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");

  for (const p of state.def.phases) {
    if (p.shareContext === true) return true;
    if (state.def.contextSharing === true) return true;
    if (p.type === "flow") return true;
    if (p.context && p.context.length > 0) return true;

    // A phase may rename its loop variable (`as: "file"` → `{file.path}`).
    // The default name is already covered by `builtinLocals`.
    const loopVar = typeof p.as === "string" && p.as !== "" && p.as !== "item" ? p.as : undefined;
    // The lookahead keeps `{files...}` from matching a loop variable named `file`.
    const customLocal = loopVar ? new RegExp(`\\{${escapeForRegExp(loopVar)}(?![\\w$])`) : undefined;
    const scan = (text: string): boolean => builtinLocals.test(text) || (customLocal?.test(text) ?? false);

    if (scan(p.task ?? "")) return true;
    if (p.when && scan(p.when)) return true;
    if (Array.isArray(p.eval) && p.eval.some((expr) => scan(expr))) return true;
  }
  return false;
}
1875
+
1876
/**
 * Recompute a completed run minimally: force-rerun the `seeds`, then walk
 * their stale frontier in topological order. The cache provides early cutoff
 * for free — a downstream whose inputHash didn't move (because the seed's new
 * output happened to equal the old) hits its prior and is reused rather than
 * re-executed.
 *
 * @param state Completed run to replay. It is deep-cloned; the caller's object
 *   is never mutated.
 * @param deps Runtime dependencies handed to `executePhase`. `deps.signal` is
 *   checked before each frontier phase so an abort stops the walk.
 * @param seeds Phase ids assumed to have changed; these are force-rerun.
 * @param opts `dryRun` is treated as `true` unless it is explicitly `false`,
 *   so an options object that merely omits it can never spend tokens. A dry
 *   run reports the worst-case frontier without executing anything.
 * @returns The cloned (and, for a real recompute, updated) state plus a report.
 *   When `report.aborted` is set the state is partial and must not be
 *   persisted over the original run.
 * @throws Error when a real recompute is requested for a flow with
 *   dependencies the observed readSet cannot track; callers should surface
 *   that as a user-facing error.
 */
export async function recomputeTaskflow(
  state: RunState,
  deps: RuntimeDeps,
  seeds: readonly string[],
  // Fail-safe default: a real recompute overwrites the run and spends tokens.
  // The tool/command wrappers can explicitly opt into dryRun:false.
  opts: { dryRun?: boolean } = { dryRun: true },
): Promise<{ report: RecomputeReport; state: RunState }> {
  // Only an explicit `dryRun: false` opts into execution. Passing `{}` must
  // not bypass the fail-safe just because the parameter default was replaced.
  const dryRun = opts.dryRun ?? true;

  // Never mutate the caller's RunState in-place. Recompute is a speculative
  // replay; only the caller decides whether to persist the new state.
  const newState = structuredClone(state) as RunState;
  const reads = readMapOf(newState.phases);
  const frontier = computeStaleFrontier(reads, seeds);
  const allIds = Object.keys(newState.phases);

  if (dryRun) {
    return {
      report: {
        dryRun: true,
        aborted: false,
        seeds,
        rerun: [...frontier],
        reused: allIds.filter((id) => !frontier.has(id)),
        cutoff: [],
      },
      state: newState,
    };
  }

  // Guard: observed readSet only tracks `{steps.X.*}` interpolation refs. It is
  // blind to Shared Context Tree (ctx_read/ctx_write), sub-flow internals,
  // context: file pre-reads, {previous.output}, and loop locals ({args.*},
  // {item.*}). Recomputing such a run with dryRun:false could silently skip
  // phases whose deps changed outside the observed frontier and then persist a
  // corrupted run over the original.
  if (hasUnobservedDependencies(newState)) {
    throw new Error(
      "recompute dryRun:false is unsafe for this run: it contains dependencies " +
        "(shareContext, flow/ctx_spawn, context: files, {previous.output}, {args.*}, or {item.*}) " +
        "that are not tracked by the observed readSet. Use dryRun:true to inspect " +
        "the frontier, or change the upstream phase and re-run the whole flow.",
    );
  }

  // Real recompute: topological order over the frontier so a downstream always
  // sees its (already-refreshed) upstreams when it re-evaluates its cache key.
  // The order must respect both declared dependsOn AND observed reads, because
  // pi-taskflow allows interpolation refs without an explicit dependsOn edge.
  const seedSet = new Set(seeds);
  function observedDeps(phaseId: string): string[] {
    // A phase reading its own prior output (e.g. a loop `until` checking
    // `{steps.thisId.output}`) must not create a self-edge in the scheduling
    // graph — otherwise topoLayers would deadlock on the self-loop.
    return (newState.phases[phaseId]?.reads ?? [])
      .map((r) => r.stepId)
      .filter((id) => id !== phaseId);
  }
  const augmentedPhases = newState.def.phases.map((p) => ({
    ...p,
    dependsOn: [...new Set([...(p.dependsOn ?? []), ...observedDeps(p.id)])],
  }));
  const order = topoLayers(augmentedPhases)
    .flat()
    .map((p) => p.id)
    .filter((id) => frontier.has(id));

  const rerun: string[] = [];
  const cutoff: string[] = [];
  const noop = () => {};
  let aborted = false;
  for (const id of order) {
    // A partial recompute must NOT be persisted over the original run — the
    // caller discards `state` when `aborted` is set.
    if (deps.signal?.aborted) {
      aborted = true;
      break;
    }
    const phase = newState.def.phases.find((p) => p.id === id);
    if (!phase) continue;
    const before = newState.phases[id]?.inputHash;
    // Only seeds are forced; downstream phases re-evaluate their cache key.
    const execOpts = seedSet.has(id) ? { forceRerun: true } : undefined;
    try {
      const ps = await executePhase(phase, newState, deps, newState.phases[id], noop, 0, execOpts);
      newState.phases[id] = ps;
      // A phase counts as "rerun" if it was a forced seed OR its result moved;
      // otherwise it hit its cache (inputHash unchanged) → early cutoff.
      if (seedSet.has(id) || ps.inputHash !== before) rerun.push(id);
      else cutoff.push(id);
    } catch {
      // A failing recompute phase is recorded as rerun (it was attempted).
      // Its previous PhaseState is left in place and the walk continues.
      rerun.push(id);
    }
  }
  return {
    report: {
      dryRun: false,
      aborted,
      seeds,
      rerun,
      reused: allIds.filter((id) => !frontier.has(id)),
      cutoff,
    },
    state: newState,
  };
}
1988
+
1704
1989
  export async function executeTaskflow(state: RunState, deps: RuntimeDeps): Promise<RuntimeResult> {
1705
1990
  const def: Taskflow = state.def;
1706
1991
  try {
@@ -1726,6 +2011,24 @@ export async function executeTaskflow(state: RunState, deps: RuntimeDeps): Promi
1726
2011
  async function runTaskflowLayers(state: RunState, deps: RuntimeDeps): Promise<RuntimeResult> {
1727
2012
  const def: Taskflow = state.def;
1728
2013
  const layers = topoLayers(def.phases);
2014
+ // Content-fingerprint the desugared definition ONCE per run and fold it into
2015
+ // every phase's cache key (overstory hash algorithm; see ./flowir/hash.ts).
2016
+ // Reused by every phase, persisted on the RunState for audit/resume.
2017
+ // Never throws into the run — a hash failure marks the field as "failed", which
2018
+ // downgrades cross-run caching to run-only for this run instead of using a flowName-only key.
2019
+ if (state.flowDefHash === undefined) {
2020
+ try {
2021
+ state.flowDefHash = await flowDefHash(def);
2022
+ } catch (e) {
2023
+ // Fail-safe: warn loudly rather than silently degrading to the legacy
2024
+ // flowName-only key, which would reopen the cross-flow collision hole.
2025
+ console.warn(
2026
+ `[taskflow] flowDefHash failed for '${def.name}': ${e instanceof Error ? e.message : String(e)}. ` +
2027
+ "Cross-run cache is disabled for this run to prevent stale cross-flow hits.",
2028
+ );
2029
+ state.flowDefHash = "failed";
2030
+ }
2031
+ }
1729
2032
 
1730
2033
  state.status = "running";
1731
2034
  safeEmit(deps, state);
@@ -1770,10 +2073,6 @@ async function runTaskflowLayers(state: RunState, deps: RuntimeDeps): Promise<Ru
1770
2073
  else if (budgetBlocked) skipReason = `Budget exceeded${budgetReason ? `: ${budgetReason}` : ""}`;
1771
2074
  else if (!depsSatisfied)
1772
2075
  skipReason = join === "any" ? "All dependencies failed or were skipped" : "Upstream dependency not satisfied";
1773
- else if (phase.when !== undefined) {
1774
- const condCtx = buildInterpolationContext(state, lastCompletedOutput(state, phase));
1775
- if (!evaluateCondition(phase.when, condCtx)) skipReason = `Condition not met: ${phase.when}`;
1776
- }
1777
2076
 
1778
2077
  if (skipReason) {
1779
2078
  if (skipReason.startsWith("Budget exceeded")) budgetBlocked = true;
@@ -0,0 +1,137 @@
1
+ /**
2
+ * Stale-marking (M4) — conservative transitive invalidation over the observed
3
+ * readSet captured in M3.
4
+ *
5
+ * This is the "mark stale, don't rerun" half of overstory's cost-asymmetric
6
+ * reactivity (VISION §2.3): the cheap effects (figuring out what WOULD be
7
+ * invalidated) run for free; the expensive effects (actually re-running an LLM
8
+ * phase) are gated for M5. Given a run's observed readSets and a set of phases
9
+ * assumed to have changed, `computeStaleFrontier` returns the transitive
10
+ * closure of phases whose recorded dependencies are no longer trustworthy.
11
+ *
12
+ * Pure module: no IO, no Date, no randomness. Deterministic.
13
+ *
14
+ * Scope (honest): this is TOPOLOGICAL propagation only — a changed seed
15
+ * invalidates everything that (transitively) read it. The overstory
16
+ * "early cutoff" refinement (a re-run whose output HASH is unchanged does NOT
17
+ * invalidate, even if the version advanced) needs before/after content hashes,
18
+ * which only exist when a phase is actually re-run — that is the M5
19
+ * recomputation concern, deliberately out of scope here. Marking is the safe,
20
+ * conservative prerequisite that lets M5 rerun with confidence.
21
+ *
22
+ * @see docs/internal/overstory-convergence-roadmap.md §3 (M4)
23
+ */
24
+
25
+ import type { PhaseState } from "./store.ts";
26
+
27
+ // ---------------------------------------------------------------------------
28
+ // Read graph
29
+ // ---------------------------------------------------------------------------
30
+
31
+ /** phaseId → the upstream stepIds it was observed reading (M3 PhaseState.reads). */
32
+ export type ReadMap = Map<string, readonly string[]>;
33
+
34
/**
 * Build the read graph of a run from its PhaseStates.
 *
 * Each phase contributes the `stepId`s listed in its `reads`; phases that
 * recorded no reads are omitted from the result.
 */
export function readMapOf(phases: Record<string, PhaseState>): ReadMap {
  const observed = Object.entries(phases)
    .map(([phaseId, phaseState]): [string, readonly string[]] => [
      phaseId,
      (phaseState.reads ?? []).map((read) => read.stepId),
    ])
    .filter(([, upstream]) => upstream.length > 0);
  return new Map<string, readonly string[]>(observed);
}
43
+
44
/**
 * List the immediate dependents of `phaseId`: every reader whose recorded
 * reads include it, in the read map's iteration order.
 */
export function dependentsOf(reads: ReadMap, phaseId: string): string[] {
  return [...reads.entries()]
    .filter(([, upstream]) => upstream.includes(phaseId))
    .map(([reader]) => reader);
}
52
+
53
+ // ---------------------------------------------------------------------------
54
+ // Stale frontier (transitive closure, union semantics)
55
+ // ---------------------------------------------------------------------------
56
+
57
/**
 * The set of phases that are stale if `seeds` change, transitively. A reader
 * is stale if ANY phase it was observed reading is stale (union/I5: when in
 * doubt, assume dependency). Includes the seeds themselves, even ones that do
 * not appear in `reads`.
 *
 * Deterministic: the returned Set iterates in breadth-first discovery order —
 * seeds first (in the order given), then readers in the read map's iteration
 * order.
 *
 * O(phases + read-edges): a reverse index (stepId → readers) is built once, so
 * each read edge is visited a constant number of times. Cycles in the read
 * graph (which a correct DAG can't produce, but a pathological one could)
 * terminate because a phase is marked stale when enqueued and is therefore
 * enqueued at most once.
 *
 * @param reads Observed read graph: reader phase id → upstream step ids.
 * @param seeds Phase ids assumed to have changed.
 * @returns Seeds plus every phase that transitively read one of them.
 */
export function computeStaleFrontier(reads: ReadMap, seeds: Iterable<string>): Set<string> {
  // Reverse index so the traversal never has to rescan the whole read map.
  // `new Set(upstream)` keeps a reader listed once per upstream even if the
  // same stepId was recorded several times.
  const readersOf = new Map<string, string[]>();
  for (const [reader, upstream] of reads) {
    for (const stepId of new Set(upstream)) {
      const readers = readersOf.get(stepId);
      if (readers) readers.push(reader);
      else readersOf.set(stepId, [reader]);
    }
  }

  const stale = new Set<string>();
  const queue: string[] = [];
  const enqueue = (id: string): void => {
    if (stale.has(id)) return;
    stale.add(id);
    queue.push(id);
  };

  for (const seed of seeds) enqueue(seed);
  // Index-based walk: the queue grows while we iterate and is never shifted.
  for (let head = 0; head < queue.length; head++) {
    for (const reader of readersOf.get(queue[head] as string) ?? []) enqueue(reader);
  }
  return stale;
}
79
+
80
+ // ---------------------------------------------------------------------------
81
+ // Rendering
82
+ // ---------------------------------------------------------------------------
83
+
84
/**
 * Render either the full observed dependency graph (no seeds) or the stale
 * frontier given assumed-changed seeds. Each stale phase lists the stale
 * upstreams that caused it (its "why").
 *
 * @param runId Run identifier shown in the header line.
 * @param flowName Flow name shown in the header line.
 * @param reads Observed read graph: reader phase id → upstream step ids.
 * @param seeds Phase ids assumed changed. Empty → print the whole graph.
 *   Duplicates are collapsed, so each seed is listed once.
 * @returns Multi-line, human-readable report (lines joined with "\n").
 */
export function formatWhyStale(
  runId: string,
  flowName: string,
  reads: ReadMap,
  seeds: readonly string[],
): string {
  const lines: string[] = [];
  lines.push(`why-stale — run ${runId} · flow "${flowName}"`);
  lines.push("");

  if (seeds.length === 0) {
    // No seeds → show the full observed dependency graph (who reads what).
    if (reads.size === 0) {
      lines.push("(No observed readSets in this run — provenance is empty.)");
      return lines.join("\n");
    }
    lines.push("Observed dependency graph (who reads what):");
    lines.push("");
    for (const [reader, deps] of reads) {
      lines.push(`■ ${reader} reads: ${deps.join(", ")}`);
    }
    lines.push("");
    lines.push("Pass a phase id to compute its stale frontier: /tf why-stale <runId> <phaseId>");
    return lines.join("\n");
  }

  const frontier = computeStaleFrontier(reads, seeds);
  const seedSet = new Set(seeds);
  lines.push(`Assuming changed: ${[...seedSet].join(", ")}`);
  lines.push("");
  // The frontier always contains every seed, so equal sizes mean "seeds only".
  if (frontier.size <= seedSet.size) {
    lines.push(`Stale frontier: only the seed(s) themselves — nothing else observed-reading them.`);
    return lines.join("\n");
  }
  lines.push(`Stale frontier (transitive, ${frontier.size} phases):`);
  // Order: seeds first, then the rest, for readability. Iterating the
  // de-duplicated seed set keeps a repeated seed from being listed twice.
  const ordered = [
    ...[...seedSet].filter((s) => frontier.has(s)),
    ...[...frontier].filter((s) => !seedSet.has(s)),
  ];
  for (const id of ordered) {
    if (seedSet.has(id)) {
      lines.push(` ■ ${id} (changed — seed)`);
    } else {
      // Why is it stale? The stale upstreams it read.
      const deps = reads.get(id) ?? [];
      const causes = deps.filter((d) => frontier.has(d));
      lines.push(` ■ ${id} ← reads ${causes.length ? causes.join(", ") : "(nothing stale?)"}`);
    }
  }
  return lines.join("\n");
}
@@ -70,6 +70,14 @@ export interface PhaseState {
70
70
  /** Non-fatal diagnostic warnings accumulated during this phase (e.g.
71
71
  * unresolved interpolation placeholders, suspicious templates). */
72
72
  warnings?: string[];
73
+ /** Observed readSet (M3): the upstream phase outputs this phase actually
74
+ * consumed at interpolation time — not what it *declared* to depend on
75
+ * (dependsOn), but what it truly *read* (`{steps.X...}`). Each entry
76
+ * carries the version (= the read phase's inputHash) it consumed, so a
77
+ * later staleness check (M4/M5) can tell whether the upstream has moved.
78
+ * This is the overstory "observed readSet@version" moat: no other
79
+ * orchestrator records what a result actually depended on. */
80
+ reads?: Array<{ stepId: string; version?: string }>;
73
81
  /** Truncated previews of interpolated strings used to execute this phase,
74
82
  * useful when diagnosing why a model saw a literal placeholder. */
75
83
  interpolation?: Array<{ source: string; text: string; missing?: string[] }>;
@@ -89,6 +97,12 @@ export interface RunState {
89
97
  pid?: number;
90
98
  /** True for runs spawned via `detach: true` (background execution). */
91
99
  detached?: boolean;
100
+ /** Content fingerprint of the desugared flow definition (overstory hash
101
+ * algorithm). Folded into every phase's cache key so a structural change
102
+ * to the flow always invalidates cross-run cache hits — and an identical
103
+ * re-run always reuses them. Filled once at run start; persisted for
104
+ * audit/resume consistency. */
105
+ flowDefHash?: string | "failed";
92
106
  }
93
107
 
94
108
  // ---------------------------------------------------------------------------
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "pi-taskflow",
3
- "version": "0.0.24",
3
+ "version": "0.0.25",
4
4
  "description": "A declarative, verifiable graph of task nodes for the Pi coding agent — not a workflow you script, but a DAG you declare: statically verified before it runs, with dynamic fan-out, gates, isolated subagent context, resumable runs, and saveable commands.",
5
5
  "keywords": [
6
6
  "pi-package",