npm - pi-taskflow - Versions diffs - 0.0.8 → 0.0.10 - Mend

pi-taskflow 0.0.8 → 0.0.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

package/README.md +2 -2
package/examples/guarded-refactor.json +1 -1
package/extensions/index.ts +18 -3
package/extensions/runner.ts +14 -0
package/extensions/runtime.ts +54 -47
package/extensions/schema.ts +19 -6
package/extensions/store.ts +544 -55
package/package.json +1 -1
package/skills/taskflow/SKILL.md +1 -1

package/README.md CHANGED Viewed

@@ -156,10 +156,10 @@ declaratively, no scripting:
     { "id": "triage", "type": "agent", "agent": "analyst", "output": "json",
       "task": "Classify the bug. Output ONLY {\"severity\":\"high\"} or {\"severity\":\"low\"}." },
     { "id": "deep",  "when": "{steps.triage.json.severity} == high", "dependsOn": ["triage"],
-      "agent": "executor_code", "task": "Root-cause and patch it.",
+      "agent": "executor-code", "task": "Root-cause and patch it.",
       "retry": { "max": 2, "backoffMs": 500 } },
     { "id": "quick", "when": "{steps.triage.json.severity} == low",  "dependsOn": ["triage"],
-      "agent": "executor_fast", "task": "Apply the quick fix." },
+      "agent": "executor-fast", "task": "Apply the quick fix." },
     { "id": "approve", "type": "approval", "join": "any", "dependsOn": ["deep", "quick"],
       "task": "Review the fix before it ships." },
     { "id": "ship", "type": "agent", "dependsOn": ["approve"],

package/examples/guarded-refactor.json CHANGED Viewed

@@ -25,7 +25,7 @@
     {
       "id": "implement",
       "type": "agent",
-      "agent": "executor_code",
+      "agent": "executor-code",
       "dependsOn": ["approve", "plan"],
       "task": "Implement the approved plan for {args.target}.\nPlan:\n{steps.plan.output}\nExtra human guidance (if any):\n{steps.approve.output}",
       "retry": { "max": 1, "backoffMs": 1000 }

package/extensions/index.ts CHANGED Viewed

@@ -50,8 +50,8 @@ const ShorthandStep = Type.Object(
 );
 const TaskflowParams = Type.Object({
-	action: StringEnum(["run", "save", "resume", "list"] as const, {
-		description: "What to do: run a flow, save a definition, resume a paused run, or list saved flows",
+	action: StringEnum(["run", "save", "resume", "list", "agents"] as const, {
+		description: "What to do: run a flow, save a definition, resume a paused run, list saved flows, or list available agents you can use in phases",
 		default: "run",
 	}),
 	name: Type.Optional(Type.String({ description: "Name of a saved flow (for run/save without inline define)" })),
@@ -219,7 +219,7 @@ export default function (pi: ExtensionAPI) {
 			"Phases (agent, parallel, map, gate, reduce, approval, flow) form a DAG; intermediate outputs stay out of your context — only the final phase output is returned.",
 			"Use action=run with an inline `define` (you write the DSL) or a saved `name`.",
 			"For simple non-DAG delegations (like the subagent tool) skip the DSL: pass `task` (+optional `agent`) for one task, `tasks:[{task,agent?}]` to run in parallel, or `chain:[{task,agent?}]` to run sequentially (reference the prior step with {previous.output}).",
-			"Use action=save to persist a definition as a reusable /tf:<name> command. action=resume continues a paused run. action=list shows saved flows.",
+			"Use action=save to persist a definition as a reusable /tf:<name> command. action=resume continues a paused run. action=list shows saved flows. Use action=agents to list available agents — do NOT invent agent names; either use an agent from that list or omit the 'agent' field to auto-select the default agent.",
 			"DSL: {name, args?, concurrency?, budget?:{maxUSD,maxTokens}, phases:[{id, type, agent, task, dependsOn?, join?:'all'|'any', when?, retry?:{max,backoffMs,factor}, over?(map), as?(map), branches?(parallel), from?(reduce), use?(flow), with?(flow), output?:'json', final?}]}.",
 			"Phase types: agent (one subagent), parallel (static branches), map (dynamic fan-out over an array), gate (VERDICT: PASS/BLOCK quality gate), reduce (aggregate from N phases), approval (human-in-the-loop pause), flow (run a saved sub-flow). join:'any' is an OR-join; when is a conditional guard; retry adds backoff; budget caps run cost.",
 			"Interpolation: {args.X}, {steps.ID.output}, {steps.ID.json}, {item} (map), {previous.output}.",
@@ -235,6 +235,21 @@ export default function (pi: ExtensionAPI) {
 		async execute(_id, params, signal, onUpdate, ctx) {
 			const action = params.action ?? "run";
+			// agents — list available agents the LLM can use in phase definitions
+			if (action === "agents") {
+				const scope = params.scope ?? "both";
+				const { agents } = discoverAgents(ctx.cwd, scope as AgentScope, undefined);
+				const text = agents.length
+					? agents
+							.map(
+								(a) =>
+									`- ${a.name} (${a.source}): ${a.description}${a.model ? ` [model: ${a.model}]` : ""}${a.tools?.length ? ` [tools: ${a.tools.join(", ")}]` : ""}`,
+							)
+							.join("\n")
+					: "No agents found. Use the default agent by omitting the 'agent' field in phases.";
+				return { content: [{ type: "text", text }], details: { action } satisfies TaskflowDetails };
+			}
 			// list
 			if (action === "list") {
 				const flows = listFlows(ctx.cwd);

package/extensions/runner.ts CHANGED Viewed

@@ -48,6 +48,20 @@ export function isFailed(r: RunResult): boolean {
 	return r.exitCode !== 0 || r.stopReason === "error" || r.stopReason === "aborted";
 }
+/**
+ * Wording that marks a failure as transient: rate limiting, overload,
+ * HTTP 429/502/503/504, "unavailable" responses, timeouts and dropped
+ * connections. Matched case-insensitively.
+ */
+const TRANSIENT_ERROR_RE =
+	/rate[_\s-]?limit|too\s+many\s+requests|overloaded|\b429\b|\b503\b|\b502\b|\b504\b|service\s+unavailable|temporarily\s+unavailable|timeout|timed?\s+out|econnreset|etimedout|socket\s+hang\s*up/i;
+/**
+ * Heuristic: does this result look like a transient, retryable provider
+ * error? Such failures are meant to be retried with backoff inside the
+ * taskflow run instead of bubbling up to the calling agent, which tends to
+ * re-invoke the whole tool and produce duplicate progress blocks.
+ *
+ * @param r  Result of one subagent run.
+ * @returns  `false` for aborted runs; otherwise whether the combined
+ *           errorMessage / stderr / output text matches TRANSIENT_ERROR_RE.
+ */
+export function isTransientError(r: RunResult): boolean {
+	// An abort is deliberate and is never retried.
+	if (r.stopReason === "aborted") {
+		return false;
+	}
+	// Search every text channel the result carries; missing ones count as empty.
+	const searchable = [r.errorMessage ?? "", r.stderr ?? "", r.output ?? ""].join(" ");
+	return TRANSIENT_ERROR_RE.test(searchable);
+}
 /** Placeholder written to a failed phase's `output` so downstream interpolation
  *  can detect "upstream failed" without being polluted by raw HTML/JSON. */
 export const TRANSPORT_ERROR_PLACEHOLDER = "(upstream error: subagent failed; see error)";

package/extensions/runtime.ts CHANGED Viewed

@@ -14,7 +14,7 @@ import * as path from "node:path";
 import * as fs from "node:fs";
 import type { AgentConfig } from "./agents.ts";
 import { coerceArray, evaluateCondition, interpolate, type InterpolationContext, safeParse } from "./interpolate.ts";
-import { isFailed, type LiveUpdate, mapWithConcurrencyLimit, runAgentTask, type RunResult } from "./runner.ts";
+import { isFailed, isTransientError, type LiveUpdate, mapWithConcurrencyLimit, runAgentTask, type RunResult } from "./runner.ts";
 import { aggregateUsage, emptyUsage, type UsageStats } from "./usage.ts";
 import { type Budget, dependenciesOf, finalPhase, type Phase, resolveArgs, type Taskflow, topoLayers } from "./schema.ts";
 import { hashInput, newRunId, type PhaseState, type RunState } from "./store.ts";
@@ -314,9 +314,20 @@ async function executePhase(
 	// Wrap each subagent call in the phase's retry policy. Usage is summed across
 	// attempts; the attempt count rides along on the result for the TUI.
+	//
+	// Even without an explicit `phase.retry`, transient provider errors (rate
+	// limits, overload, 5xx, timeouts) are retried with backoff so a momentary
+	// 429 is absorbed inside this run instead of bubbling up and provoking the
+	// calling agent to re-invoke the whole tool (which stacks duplicate progress
+	// blocks in the transcript).
 	const retry = phase.retry;
+	const DEFAULT_TRANSIENT_RETRIES = 3;
+	const DEFAULT_TRANSIENT_BACKOFF_MS = 2000;
+	const DEFAULT_TRANSIENT_FACTOR = 2;
 	const runOne = async (agentName: string, task: string, onLive?: (l: LiveUpdate) => void): Promise<RunResult> => {
-		const maxAttempts = Math.max(1, 1 + Math.max(0, Math.floor(retry?.max ?? 0)));
+		const explicitMax = Math.max(1, 1 + Math.max(0, Math.floor(retry?.max ?? 0)));
+		// Allow enough attempts to cover whichever policy applies on a given attempt.
+		const maxAttempts = Math.max(explicitMax, 1 + DEFAULT_TRANSIENT_RETRIES);
 		const usages: UsageStats[] = [];
 		let last: RunResult | undefined;
 		for (let attempt = 0; attempt < maxAttempts; attempt++) {
@@ -330,10 +341,21 @@ async function executePhase(
 			if (!isFailed(last)) break;
 			// Stop retrying on abort or once the run is over budget.
 			if (deps.signal?.aborted || overBudget(state).over) break;
-			if (attempt < maxAttempts - 1) {
-				const wait = Math.min(60000, Math.round((retry?.backoffMs ?? 0) * (retry?.factor ?? 1) ** attempt));
-				await delay(wait, deps.signal);
-			}
+			// Decide whether THIS failure warrants another attempt. Explicit retry
+			// policy covers all failures up to its cap; the transient fallback covers
+			// only retryable provider errors. A non-transient failure with no explicit
+			// policy stops immediately (no point burning attempts on a hard error).
+			const withinExplicit = attempt < explicitMax - 1;
+			const transient = isTransientError(last);
+			const withinTransient = transient && attempt < DEFAULT_TRANSIENT_RETRIES;
+			if (!withinExplicit && !withinTransient) break;
+			// Backoff: prefer the explicit policy's curve when the phase defines one
+			// (covers transient retries too, and keeps tests fast with backoffMs:0),
+			// otherwise use the transient defaults.
+			const baseMs = retry ? (retry.backoffMs ?? 0) : DEFAULT_TRANSIENT_BACKOFF_MS;
+			const factor = retry ? (retry.factor ?? 1) : DEFAULT_TRANSIENT_FACTOR;
+			const wait = Math.min(60000, Math.round(baseMs * factor ** attempt));
+			if (wait > 0) await delay(wait, deps.signal);
 		}
 		// Aborted before any attempt ran → return a clean aborted result (no crash).
 		if (!last) {
@@ -414,11 +436,12 @@ async function executePhase(
 	if (type === "agent" || type === "gate" || type === "reduce") {
 		const { text } = interpolate(phase.task ?? "", ctx);
 		const fullTask = preRead + text;
-		const inputHash = hashInput(phase.id, phase.agent ?? "", fullTask);
+		const agentName = resolveAgent(phase.agent, deps, state);
+		const inputHash = hashInput(phase.id, agentName, fullTask);
 		const cached = cachedPhase(prior, inputHash);
 		if (cached) return cached;
-		const r = await runOne(phase.agent ?? defaultAgent(deps), fullTask, liveSink(state, phase.id, emitProgress));
+		const r = await runOne(agentName, fullTask, liveSink(state, phase.id, emitProgress));
 		const ps = resultToPhaseState(phase.id, r, inputHash, parseJson);
 		if (type === "gate" && ps.status === "done") ps.gate = parseGateVerdict(r.output);
 		return ps;
@@ -428,7 +451,7 @@ async function executePhase(
 		const branches = (phase.branches ?? []).map((b) => {
 			const r = interpolate(b.task, ctx);
 			return {
-				agent: b.agent ?? phase.agent ?? defaultAgent(deps),
+				agent: resolveAgent(b.agent ?? phase.agent, deps, state),
 				task: preRead + r.text,
 			};
 		});
@@ -458,7 +481,7 @@ async function executePhase(
 		const tasks = arr.map((item) => {
 			const localCtx = buildInterpolationContext(state, previousOutput, { [loopVar]: item });
 			return {
-				agent: phase.agent ?? defaultAgent(deps),
+				agent: resolveAgent(phase.agent, deps, state),
 				task: preRead + interpolate(phase.task ?? "", localCtx).text,
 			};
 		});
@@ -641,6 +664,27 @@ function cachedPhase(prior: PhaseState | undefined, inputHash: string): PhaseSta
 	return null;
 }
+/**
+ * Unknown agent names already warned about, tracked per run-state object.
+ * Kept in a WeakMap rather than on the RunState itself: a Set stored on the
+ * state would be JSON-serialised as `{}` when the run is persisted, and a
+ * resumed run would then call `.has()` on a plain object and throw.
+ */
+const unknownAgentWarnings = new WeakMap<object, Set<string>>();
+/**
+ * Resolve a requested agent name against the available agents.
+ *
+ * @param name   Agent requested by the phase/branch; `undefined` selects the default agent.
+ * @param deps   Runtime dependencies; `deps.agents` is the list of known agents.
+ * @param state  Current run state, used only as the key for once-per-run warning dedup.
+ * @returns      `name` when it matches a known agent, otherwise the default agent.
+ *               An unknown name is reported once per run via `console.warn`.
+ */
+function resolveAgent(name: string | undefined, deps: RuntimeDeps, state: RunState): string {
+	const fallback = defaultAgent(deps);
+	// No agent requested: use the default. An empty string is passed through
+	// unchanged, exactly as before.
+	if (!name) return name ?? fallback;
+	if (deps.agents.some((a) => a.name === name)) return name;
+	// Unknown agent: warn once per (run, name) to avoid noise, then fall back.
+	let warned = unknownAgentWarnings.get(state);
+	if (!warned) {
+		warned = new Set<string>();
+		unknownAgentWarnings.set(state, warned);
+	}
+	if (!warned.has(name)) {
+		warned.add(name);
+		console.warn(`[taskflow] Unknown agent "${name}", falling back to "${fallback}". Use action=agents to list available agents.`);
+	}
+	return fallback;
+}
 function defaultAgent(deps: RuntimeDeps): string {
 	return deps.agents[0]?.name ?? "default";
 }
@@ -719,45 +763,8 @@ function safeProgress(deps: RuntimeDeps, state: RunState): void {
 /**
  * Execute a full taskflow. Mutates and persists `state` as it progresses.
  */
-function ensureImplicitGate(def: Taskflow): void {
-	// Respect explicit opt-out
-	if ((def as any).implicitGate === false) return;
-	const hasGate = def.phases.some(
-		(p) => p.type === "gate" || p.type === "approval" || p.id === "_implicit-gate",
-	);
-	if (hasGate || def.phases.length === 0) return;
-	// The last existing phase is the effective "final" phase — pin it so the
-	// injected gate doesn't become the finalOutput.
-	const lastPhase = def.phases[def.phases.length - 1];
-	if (!lastPhase.final && !def.phases.some((p) => p.final)) {
-		lastPhase.final = true;
-	}
-	const allIds = def.phases.map((p) => p.id);
-	def.phases.push({
-		id: "_implicit-gate",
-		type: "gate",
-		dependsOn: allIds,
-		agent: "reviewer",
-		task: `Review all phase outputs from this taskflow for accuracy and consistency.
-For each upstream phase, scan its output for:
-1. **Factual accuracy**: Any file paths, line numbers, or code snippets that are wrong?
-2. **Internal contradictions**: Do any phases contradict each other?
-3. **Completeness**: Is any output truncated, empty, or anomalously short?
-4. **Hallucination markers**: Wrong file names, impossible line ranges, circular logic, information not in the given context.
-Output:
-- If ALL outputs look consistent and plausible: output **VERDICT: PASS** with a one-line summary.
-- If ANY issues found: output **VERDICT: BLOCK** listing each issue with the phase ID and specific concern.`,
-	});
-}
 export async function executeTaskflow(state: RunState, deps: RuntimeDeps): Promise<RuntimeResult> {
 	const def: Taskflow = state.def;
-	ensureImplicitGate(def);
 	try {
 		return await runTaskflowLayers(state, deps);
 	} catch (e) {

package/extensions/schema.ts CHANGED Viewed

@@ -147,12 +147,6 @@ export const TaskflowSchema = Type.Object(
 			}),
 		),
 		phases: Type.Array(PhaseSchema, { minItems: 1, description: "Ordered phase definitions (DAG via dependsOn)" }),
-		implicitGate: Type.Optional(
-			Type.Boolean({
-				description: "When true (default), a reviewer gate is auto-injected after all phases if no explicit gate or approval exists",
-				default: true,
-			}),
-		),
 	},
 	{ additionalProperties: false },
 );
@@ -342,6 +336,16 @@ export function validateTaskflow(def: unknown, opts: ValidationOptions = {}): Va
 		if (p.join && !JOIN_MODES.includes(p.join as JoinMode)) {
 			errors.push(`Phase '${p.id}': unknown join mode '${p.join}'`);
 		}
+		// Agent name convention: hyphens only (per AGENTS.md naming convention)
+		if (p.agent && typeof p.agent === "string" && p.agent.includes("_")) {
+			errors.push(`Phase '${p.id}': agent name '${p.agent}' uses underscores — use hyphens (e.g. 'executor-code' not 'executor_code')`);
+		}
+		// Phase id convention: hyphens only (consistent with agent naming)
+		if (p.id && p.id.includes("_")) {
+			errors.push(`Phase '${p.id}': id uses underscores — use hyphens for consistency with agent naming convention`);
+		}
 	}
 	// dependsOn / from references must exist
@@ -355,6 +359,15 @@ export function validateTaskflow(def: unknown, opts: ValidationOptions = {}): Va
 		}
 	}
+	// Agent name format validation (AGENTS.md naming convention: hyphens only, no underscores)
+	const VALID_AGENT_RE = /^[a-z][a-z0-9-]*$/;
+	for (const p of flow.phases) {
+		if (!p?.id) continue;
+		if (p.agent && !VALID_AGENT_RE.test(p.agent)) {
+			errors.push(`Phase '${p.id}': agent '${p.agent}' has invalid name format (expected lowercase alphanumeric with hyphens)`);
+		}
+	}
 	// Cycle detection (Kahn)
 	if (errors.length === 0) {
 		const cycle = detectCycle(flow.phases as Phase[]);

package/extensions/store.ts CHANGED Viewed

@@ -3,7 +3,15 @@
  *
  *   Definitions:  .pi/taskflows/<name>.json          (project)
  *                 ~/.pi/agent/taskflows/<name>.json   (user)
- *   Run state:    .pi/taskflows/runs/<runId>.json     (resume support)
+ *   Run state:    .pi/taskflows/runs/<sanitizedFlowName>/<runId>.json
+ *   Index:        .pi/taskflows/runs/index.json       (lookup accelerator)
+ *
+ *   Legacy layout (v0.0.8 and earlier):
+ *     .pi/taskflows/runs/<runId>.json                 (flat, still readable)
+ *
+ *   v0.0.9 refactor: per-flow subdirectory layout + lightweight index + file
+ *   lock + TTL/cap cleanup. Full backward compatibility with the flat layout
+ *   is maintained: loadRun and listRuns still discover legacy flat files.
  */
 import * as crypto from "node:crypto";
@@ -66,6 +74,403 @@ export interface RunState {
 	cwd: string;
 }
+// ---------------------------------------------------------------------------
+// Index entry — lightweight lookup record persisted in runs/index.json.
+// Enables listRuns to find files without a full directory scan.  Every
+// non-terminal run and every terminal run within the retention window has an
+// index entry; missing/stale entries are tolerated via degradation (rebuild).
+// ---------------------------------------------------------------------------
+export interface RunIndexEntry {
+	/** Run identifier; the key used when upserting or deduplicating index entries. */
+	runId: string;
+	/** Name of the flow this run belongs to. */
+	flowName: string;
+	/** Run status; cleanup treats "completed" and "failed" as terminal. */
+	status: RunState["status"];
+	/** Copied from the run state's createdAt. */
+	createdAt: number;
+	/** Copied from the run state's updatedAt; cleanup sorts and ages terminal runs by it. */
+	updatedAt: number;
+	/** Path relative to runsRoot, e.g. "test-flow/test-roundtrip-001.json". */
+	relPath: string;
+}
+// ---------------------------------------------------------------------------
+// File-lock constants
+// ---------------------------------------------------------------------------
+/** Lock file considered stale after 30 s (orphaned from crash / kill -9); compared against the lock file's mtime. */
+const LOCK_STALE_MS = 30_000;
+/** Lock acquisition busy-wait interval, in milliseconds. */
+const LOCK_POLL_MS = 50;
+/** Default acquisition timeout before throwing, in milliseconds. */
+const LOCK_TIMEOUT_MS = 10_000;
+// ---------------------------------------------------------------------------
+// Cleanup throttle
+// ---------------------------------------------------------------------------
+/** Minimum ms between opportunistic cleanup runs (called inside saveRun). */
+const CLEANUP_INTERVAL_MS = 60_000;
+/** Retain at most this many terminal runs by default. */
+const DEFAULT_MAX_KEPT_TERMINAL = 100;
+/** Remove terminal runs older than this (days). */
+const DEFAULT_MAX_AGE_DAYS = 30;
+/**
+ * Last cleanup timestamp — module-level so it persists across calls.
+ * It is a single value, not keyed by runs root, so a cleanup for one root
+ * also throttles cleanup for any other root handled by the same process.
+ */
+let lastCleanupAt = 0;
+// ---------------------------------------------------------------------------
+// Internal helpers — path construction & sanitisation
+// ---------------------------------------------------------------------------
+/**
+ * Turn a flow name into a single, safe directory-name component.
+ *
+ * Two passes:
+ *   1. every run of characters outside `[A-Za-z0-9_.-]` becomes one "_";
+ *   2. any leading dots collapse to one "_", so ".", ".." and hidden names
+ *      such as ".git" can never be produced.
+ *
+ * The second pass exists because "." is in the allow-list: without it a flow
+ * literally named ".." would pass through unchanged and let `flowRunDir`
+ * resolve outside the runs root (write-side path traversal; risk-reviewer
+ * v0.0.9 audit, H1).
+ *
+ * @param flowName  Raw flow name.
+ * @returns         The sanitised name, or "_" if nothing is left.
+ */
+function safeFlowDirName(flowName: string): string {
+	const substituted = flowName.replace(/[^\w.-]+/g, "_");
+	const withoutLeadingDots = substituted.replace(/^\.+/, "_");
+	return withoutLeadingDots === "" ? "_" : withoutLeadingDots;
+}
+/**
+ * Directory that holds every run file of one flow:
+ * `<runsRoot>/<sanitisedFlowName>`.
+ */
+function flowRunDir(runsRoot: string, flowName: string): string {
+	const dirName = safeFlowDirName(flowName);
+	return path.join(runsRoot, dirName);
+}
+/**
+ * Full path of a run file in the per-flow subdirectory layout:
+ * `<runsRoot>/<sanitisedFlowName>/<runId>.json`.
+ */
+function runFilePath(runsRoot: string, flowName: string, runId: string): string {
+	const fileName = `${runId}.json`;
+	return path.join(flowRunDir(runsRoot, flowName), fileName);
+}
+/** Location of the run index file, `<runsRoot>/index.json`. */
+function indexPath(runsRoot: string): string {
+	const indexFileName = "index.json";
+	return path.join(runsRoot, indexFileName);
+}
+/**
+ * Location of the lock file that serialises every read-modify-write cycle
+ * on index.json: `<runsRoot>/index.json.lock`.
+ */
+function indexLockPath(runsRoot: string): string {
+	const lockFileName = "index.json.lock";
+	return path.join(runsRoot, lockFileName);
+}
+/**
+ * Lock file for a single run, placed next to its run file:
+ * `<runsRoot>/<sanitisedFlowName>/<runId>.json.lock`.
+ */
+function lockPathForRun(runsRoot: string, flowName: string, runId: string): string {
+	const lockFileName = `${runId}.json.lock`;
+	return path.join(flowRunDir(runsRoot, flowName), lockFileName);
+}
+/**
+ * Cheap sanity check on a runId before it is used to build a filesystem path.
+ *
+ * This is a deny-list, not a charset check: it rejects non-strings, the empty
+ * string, and anything containing a path separator ("/" or "\") or a NUL
+ * byte. Everything else passes, including names such as "..".
+ *
+ * @param runId  Candidate run identifier.
+ * @returns      `true` when the id passes the checks above.
+ */
+function validateRunId(runId: string): boolean {
+	if (typeof runId !== "string" || runId.length === 0) {
+		return false;
+	}
+	const forbidden = ["/", "\\", "\0"];
+	return !forbidden.some((ch) => runId.includes(ch));
+}
+// ---------------------------------------------------------------------------
+// File-lock primitives — zero-dependency, using O_CREAT|O_EXCL (atomic)
+// ---------------------------------------------------------------------------
+/**
+ * Acquire a file lock by atomically creating `lockPath`.
+ *
+ * Creation uses the `wx` flag, so the open fails with EEXIST while another
+ * holder's lock file is present. An existing lock whose mtime is older than
+ * LOCK_STALE_MS is treated as orphaned and stolen: it is renamed to a unique
+ * "graveyard" name, the graveyard copy is unlinked, and the create is retried.
+ * Of several stealers racing for the same file only one rename can succeed;
+ * the others fail their rename and simply retry the acquire loop (L1).
+ *
+ * NOTE(review): the rename acts on the *path*, not on the inode that was
+ * stat'ed. If another process steals and re-creates the lock between this
+ * process's statSync and renameSync, the fresh lock is what gets renamed
+ * away. The window is small but not closed.
+ * NOTE(review): nothing in this function refreshes the lock file's mtime, so
+ * a holder that keeps the lock longer than LOCK_STALE_MS can have it stolen.
+ *
+ * @param lockPath   Path of the lock file; its parent directory is created if missing.
+ * @param timeoutMs  How long to wait for a live (non-stale) lock before giving up.
+ * @throws Error     On timeout. Filesystem errors other than EEXIST are rethrown.
+ */
+function acquireLock(lockPath: string, timeoutMs: number = LOCK_TIMEOUT_MS): void {
+	const start = Date.now();
+	// Ensure parent directory exists (lock file lives inside the flow subdir).
+	fs.mkdirSync(path.dirname(lockPath), { recursive: true });
+	// Scratch cell for Atomics.wait; allocated once instead of on every poll.
+	const sleepCell = new Int32Array(new SharedArrayBuffer(4));
+	while (true) {
+		try {
+			const fd = fs.openSync(lockPath, "wx");
+			try {
+				fs.writeFileSync(fd, JSON.stringify({ pid: process.pid, ts: Date.now() }));
+			} catch (writeErr: unknown) {
+				// We created the lock file but could not stamp it. Close the
+				// descriptor and remove the file so neither the fd nor an
+				// orphaned lock outlives this failure.
+				try { fs.closeSync(fd); } catch { /* ignore */ }
+				try { fs.unlinkSync(lockPath); } catch { /* ignore */ }
+				throw writeErr;
+			}
+			fs.closeSync(fd);
+			return; // lock acquired
+		} catch (e: unknown) {
+			if ((e as NodeJS.ErrnoException).code !== "EEXIST") throw e;
+			// Lock file exists — check if stale.
+			try {
+				const stat = fs.statSync(lockPath);
+				if (Date.now() - stat.mtimeMs > LOCK_STALE_MS) {
+					// Stale lock — steal it via rename so only one racing
+					// stealer can win (L1). The "graveyard" name is unique per
+					// process+attempt; the winner unlinks it, losers fail
+					// their own rename and simply retry the acquire loop.
+					const grave = `${lockPath}.stale.${process.pid}.${crypto.randomBytes(4).toString("hex")}`;
+					try {
+						fs.renameSync(lockPath, grave);
+						// We won the steal — discard the graveyard copy and retry
+						// the loop, where openSync('wx') will create a fresh lock.
+						try { fs.unlinkSync(grave); } catch { /* ignore */ }
+					} catch { /* lost the steal race (ENOENT) — just retry */ }
+					continue;
+				}
+			} catch {
+				// statSync failed, typically ENOENT because the holder released
+				// the lock between openSync and statSync — retry immediately.
+				continue;
+			}
+			// Lock is held and not stale — wait and retry.
+			if (Date.now() - start > timeoutMs) {
+				throw new Error(`Lock timeout after ${timeoutMs}ms waiting for ${path.basename(lockPath)}`);
+			}
+			// Block this thread for LOCK_POLL_MS without spinning the CPU.
+			Atomics.wait(sleepCell, 0, 0, LOCK_POLL_MS);
+		}
+	}
+}
+/**
+ * Release a file lock by deleting its lock file. Best-effort: every unlink
+ * error is ignored, including ENOENT when the lock was already released or
+ * was stolen as stale.
+ */
+function releaseLock(lockPath: string): void {
+	try {
+		fs.unlinkSync(lockPath);
+	} catch {
+		/* ENOENT or other — ignore */
+	}
+}
+/**
+ * Run `fn` while holding the lock at `lockPath` and hand back its result.
+ * The lock is released whether `fn` returns or throws; an exception from
+ * `fn` propagates to the caller after the release.
+ */
+function withLock<T>(lockPath: string, fn: () => T): T {
+	acquireLock(lockPath);
+	let outcome: T;
+	try {
+		outcome = fn();
+	} finally {
+		releaseLock(lockPath);
+	}
+	return outcome;
+}
+// ---------------------------------------------------------------------------
+// Index CRUD
+// ---------------------------------------------------------------------------
+/**
+ * Build the lightweight index record for a run.
+ *
+ * @param state    Run state to summarise; only its identifying fields and timestamps are copied.
+ * @param relPath  Location of the run file relative to the runs root.
+ */
+function extractIndexEntry(state: RunState, relPath: string): RunIndexEntry {
+	const { runId, flowName, status, createdAt, updatedAt } = state;
+	return { runId, flowName, status, createdAt, updatedAt, relPath };
+}
+/**
+ * Load the run index from disk.
+ *
+ * Returns an empty list when the file is missing, unreadable, not valid
+ * JSON, or not an array. Entries lacking a string `runId` or string
+ * `relPath` are dropped; no other field is validated.
+ */
+function readIndex(runsRoot: string): RunIndexEntry[] {
+	try {
+		const parsed: unknown = JSON.parse(fs.readFileSync(indexPath(runsRoot), "utf-8"));
+		if (!Array.isArray(parsed)) {
+			return [];
+		}
+		// Keep only entries that carry the two fields every consumer relies on.
+		const candidates = parsed as RunIndexEntry[];
+		return candidates.filter(
+			(candidate) => candidate && typeof candidate.runId === "string" && typeof candidate.relPath === "string",
+		);
+	} catch {
+		return [];
+	}
+}
+/**
+ * Replace the whole index with `entries`, pretty-printed as JSON (2-space
+ * indent) and written through `writeFileAtomic`.
+ */
+function writeIndex(runsRoot: string, entries: RunIndexEntry[]): void {
+	const serialised = JSON.stringify(entries, null, 2);
+	writeFileAtomic(indexPath(runsRoot), serialised);
+}
+/**
+ * Insert or replace one index entry, keyed by runId (read → mutate → write).
+ *
+ * The whole cycle runs under the dedicated index lock. saveRun calls for
+ * *different* runIds each hold only their own per-run lock, so without this
+ * lock their read-modify-write cycles on the shared index could interleave
+ * and drop each other's entries (risk-reviewer v0.0.9 audit, M1). The per-run
+ * lock protects the run file; the index lock protects the shared index.
+ */
+function updateIndexEntry(runsRoot: string, entry: RunIndexEntry): void {
+	withLock(indexLockPath(runsRoot), () => {
+		const current = readIndex(runsRoot);
+		const position = current.findIndex((existing) => existing.runId === entry.runId);
+		if (position === -1) {
+			current.push(entry);
+		} else {
+			current[position] = entry;
+		}
+		writeIndex(runsRoot, current);
+	});
+}
+// Note: single-entry removal is not implemented in this section; entries are
+// dropped only when cleanupTerminalRuns rewrites the full index.
+/**
+ * Scan all subdirectories + legacy flat files and rebuild the full index.
+ * Called when the index is missing or corrupt (self-healing).
+ *
+ * Deduplicates by runId: subdirectory entry wins over flat. This relies on
+ * the subdirectory scan running before the flat-file scan.
+ *
+ * @param runsRoot  Root directory that contains the per-flow subdirectories.
+ * @returns         The rebuilt entries, which are also persisted to index.json.
+ */
+function rebuildIndex(runsRoot: string): RunIndexEntry[] {
+	// Keyed by runId so a later duplicate never produces a second entry.
+	const entries = new Map<string, RunIndexEntry>();
+	let dirs: string[];
+	try {
+		dirs = fs.readdirSync(runsRoot, { withFileTypes: true })
+			.filter((d) => d.isDirectory())
+			.map((d) => d.name);
+	} catch {
+		// Runs root missing or unreadable: treat as "no subdirectories".
+		dirs = [];
+	}
+	// Scan per-flow subdirectories.
+	for (const dirName of dirs) {
+		const dirPath = path.join(runsRoot, dirName);
+		let files: string[];
+		try {
+			// Only *.json run files; any name containing ".lock" is skipped.
+			files = fs.readdirSync(dirPath).filter((f) => f.endsWith(".json") && !f.includes(".lock"));
+		} catch { continue; }
+		for (const file of files) {
+			try {
+				const raw = fs.readFileSync(path.join(dirPath, file), "utf-8");
+				const state = JSON.parse(raw) as RunState;
+				if (state && typeof state.runId === "string") {
+					// relPath always uses "/" regardless of platform.
+					entries.set(state.runId, extractIndexEntry(state, `${dirName}/${file}`));
+				}
+			} catch { /* skip corrupt */ }
+		}
+	}
+	// Scan legacy flat files (runs/*.json, skip index.json).
+	let flatFiles: string[];
+	try {
+		flatFiles = fs.readdirSync(runsRoot).filter(
+			(f) => f.endsWith(".json") && f !== "index.json" && !f.includes(".lock"),
+		);
+	} catch {
+		flatFiles = [];
+	}
+	for (const file of flatFiles) {
+		// Cheap pre-check by file name avoids reading a flat file whose run
+		// was already found in a subdirectory.
+		if (entries.has(file.replace(/\.json$/, ""))) continue; // prefer subdir entry
+		try {
+			const raw = fs.readFileSync(path.join(runsRoot, file), "utf-8");
+			const state = JSON.parse(raw) as RunState;
+			// Re-check by the runId stored in the file, which may differ from the file name.
+			if (state && typeof state.runId === "string" && !entries.has(state.runId)) {
+				entries.set(state.runId, extractIndexEntry(state, file));
+			}
+		} catch { /* skip corrupt */ }
+	}
+	const result = Array.from(entries.values());
+	// Persist the rebuilt index under the index lock so it does not race a
+	// concurrent updateIndexEntry / cleanup write (M1).
+	withLock(indexLockPath(runsRoot), () => writeIndex(runsRoot, result));
+	return result;
+}
+// ---------------------------------------------------------------------------
+// TTL / cap cleanup
+// ---------------------------------------------------------------------------
+/**
+ * Remove excess and expired terminal (completed/failed) runs.
+ *
+ * Called opportunistically at the end of saveRun. Throttled to at most once
+ * per CLEANUP_INTERVAL_MS. Runs with any other status are never touched.
+ *
+ * The index read-modify-write is performed under the index lock so it cannot
+ * race a concurrent updateIndexEntry and clobber a freshly-added entry (M1).
+ * The index is re-read *inside* the lock so the rewrite reflects the latest
+ * committed state. File and directory removal happens after the lock is
+ * released to keep the critical section short.
+ *
+ * @param runsRoot    Root directory of the run store.
+ * @param maxKeep     Maximum number of terminal runs to retain (newest first).
+ * @param maxAgeDays  Terminal runs not updated for longer than this are removed.
+ */
+function cleanupTerminalRuns(
+	runsRoot: string,
+	maxKeep: number = DEFAULT_MAX_KEPT_TERMINAL,
+	maxAgeDays: number = DEFAULT_MAX_AGE_DAYS,
+): void {
+	const now = Date.now();
+	if (now - lastCleanupAt < CLEANUP_INTERVAL_MS) return;
+	lastCleanupAt = now;
+	const maxAgeMs = maxAgeDays * 86_400_000;
+	const toRemove: RunIndexEntry[] = [];
+	withLock(indexLockPath(runsRoot), () => {
+		const entries = readIndex(runsRoot);
+		const terminal: RunIndexEntry[] = [];
+		const active: RunIndexEntry[] = [];
+		for (const e of entries) {
+			if (e.status === "completed" || e.status === "failed") {
+				terminal.push(e);
+			} else {
+				active.push(e);
+			}
+		}
+		// Sort terminal by updatedAt desc (newest first) so the position in the
+		// list doubles as the "how many newer runs exist" count.
+		terminal.sort((a, b) => b.updatedAt - a.updatedAt);
+		// A Set gives O(1) membership when the survivors are filtered below.
+		const doomed = new Set<RunIndexEntry>();
+		terminal.forEach((e, i) => {
+			const expiredByAge = now - e.updatedAt > maxAgeMs;
+			const excessByCount = i >= maxKeep;
+			if (expiredByAge || excessByCount) doomed.add(e);
+		});
+		if (doomed.size === 0) return;
+		toRemove.push(...doomed);
+		// Commit the pruned index while holding the lock so a concurrent
+		// updateIndexEntry cannot interleave and lose entries.
+		const remaining = terminal.filter((e) => !doomed.has(e));
+		writeIndex(runsRoot, [...active, ...remaining]);
+	});
+	if (toRemove.length === 0) return;
+	const root = path.resolve(runsRoot);
+	const flowDirs = new Set<string>();
+	// Delete run files + lock files (outside the index lock).
+	for (const e of toRemove) {
+		const filePath = path.resolve(root, e.relPath);
+		// relPath comes from index.json on disk; never delete outside the runs root.
+		const rel = path.relative(root, filePath);
+		if (rel === "" || rel === ".." || rel.startsWith(`..${path.sep}`) || path.isAbsolute(rel)) continue;
+		try { fs.unlinkSync(filePath); } catch { /* already gone */ }
+		// Also remove any orphaned lock file.
+		try { fs.unlinkSync(filePath + ".lock"); } catch { /* ignore */ }
+		// Legacy flat files live directly in the runs root, which must stay.
+		const dirPath = path.dirname(filePath);
+		if (dirPath !== root) flowDirs.add(dirPath);
+	}
+	// Remove flow subdirectories that are now empty, once per directory.
+	for (const dirPath of flowDirs) {
+		try { fs.rmdirSync(dirPath); } catch { /* ENOTEMPTY or ENOENT — ignore */ }
+	}
+}
+// ---------------------------------------------------------------------------
+// Original helpers (unchanged)
+// ---------------------------------------------------------------------------
 function userFlowsDir(): string {
 	return path.join(getAgentDir(), "taskflows");
 }
@@ -160,90 +565,172 @@ export function newRunId(flowName: string): string {
 	return `${safe}-${Date.now().toString(36)}-${crypto.randomBytes(3).toString("hex")}`;
 }
+/**
+ * Persist a run state to disk.
+ *
+ * v0.0.9: writes to `runs/<sanitisedFlowName>/<runId>.json` (per-flow
+ * subdirectory) and updates the lightweight index.  Uses a per-run file lock
+ * to prevent concurrent writes to the same runId.  After the write, runs a
+ * throttled cleanup of terminal runs that are too old or over the keep limit.
+ *
+ * F-009: shallow-clones state before stamping updatedAt to avoid mutating the
+ * caller's reference.
+ */
 export function saveRun(state: RunState): void {
-	const dir = runsDir(state.cwd);
-	fs.mkdirSync(dir, { recursive: true });
+	const root = runsDir(state.cwd);
+	const flowDir = flowRunDir(root, state.flowName);
+	fs.mkdirSync(flowDir, { recursive: true });
 	// Clone before stamping updatedAt so the caller's RunState reference is not
 	// mutated as a hidden side effect (v0.0.6 audit, F-009). Shallow clone is
 	// sufficient: saveRun only serializes; it does not mutate nested objects.
 	const toSave = { ...state, updatedAt: Date.now() };
-	writeFileAtomic(path.join(dir, `${state.runId}.json`), JSON.stringify(toSave, null, 2));
+	const filePath = runFilePath(root, state.flowName, state.runId);
+	const lockPath = lockPathForRun(root, state.flowName, state.runId);
+	withLock(lockPath, () => {
+		writeFileAtomic(filePath, JSON.stringify(toSave, null, 2));
+		updateIndexEntry(root, extractIndexEntry(toSave, path.basename(flowDir) + "/" + path.basename(filePath)));
+	});
+	// Opportunistic cleanup — throttled to once per CLEANUP_INTERVAL_MS.
+	cleanupTerminalRuns(root);
 }
+/**
+ * Load a single run by runId.
+ *
+ * Lookup chain (fast → slow):
+ *   1. INDEX — read index.json, find entry with matching runId, read via relPath.
+ *   2. SUBDIR SCAN — for each subdirectory in runsDir, check <subdir>/<runId>.json.
+ *   3. FLAT FALLBACK — check runsDir/<runId>.json directly (legacy layout).
+ *
+ * Every candidate path is read via tryReadRunFile, which applies the
+ * path-traversal and symlink/realpath containment guards.
+ */
 export function loadRun(cwd: string, runId: string): RunState | null {
-	const dir = runsDir(cwd);
-	// Reject runIds that could be used for path traversal or filesystem abuse.
-	// Legitimate runIds are produced by newRunId() and contain only
-	// [A-Za-z0-9._-]; anything else (empty string, path separators, NUL bytes,
-	// backslashes on POSIX, forward slashes on Windows) is suspicious.
-	if (
-		typeof runId !== "string" ||
-		runId.length === 0 ||
-		runId.includes("/") ||
-		runId.includes("\\") ||
-		runId.includes("\0")
-	) {
-		return null;
+	if (!validateRunId(runId)) return null;
+	const root = runsDir(cwd);
+	// ---- Try index first ----
+	const indexEntries = readIndex(root);
+	const entry = indexEntries.find((e) => e.runId === runId);
+	if (entry) {
+		const filePath = path.join(root, entry.relPath);
+		const state = tryReadRunFile(root, filePath);
+		if (state) return state;
+		// Index entry exists but file is gone or corrupt — fall through.
 	}
-	const filePath = path.resolve(dir, `${runId}.json`);
-	// Reject runIds that would escape the runs directory (e.g. "../etc/passwd").
-	// Compare with a path-separator suffix so legitimate filenames like "..foo"
-	// (a name that just happens to start with two dots) are not false-positives.
-	const rel = path.relative(dir, filePath);
+	// ---- Try subdirectory scan ----
+	let dirs: string[];
+	try {
+		dirs = fs.readdirSync(root, { withFileTypes: true })
+			.filter((d) => d.isDirectory())
+			.map((d) => d.name);
+	} catch { dirs = []; }
+	for (const dirName of dirs) {
+		const filePath = path.join(root, dirName, `${runId}.json`);
+		const state = tryReadRunFile(root, filePath);
+		if (state) return state;
+	}
+	// ---- Try legacy flat fallback ----
+	const flatPath = path.join(root, `${runId}.json`);
+	const state = tryReadRunFile(root, flatPath);
+	if (state) return state;
+	return null;
+}
+/**
+ * Safely read a run file, performing all path-traversal / symlink guards.
+ * Returns null on any violation or read error.
+ */
+function tryReadRunFile(runsRoot: string, filePath: string): RunState | null {
+	// Lexical traversal guard.
+	const rel = path.relative(runsRoot, filePath);
 	if (rel === ".." || rel.startsWith(`..${path.sep}`) || path.isAbsolute(rel)) return null;
-	// Resolve symlinks on both the runs dir and the file, so the containment
-	// check below is on a consistent physical path. Without normalizing `dir`,
-	// a legitimate run on macOS (where /var → /private/var) would compare a
-	// symlinked dir prefix to a real path and falsely flag traversal. A
-	// malicious file already placed inside the runs dir could otherwise also
-	// point at an arbitrary path on disk and bypass the lexical check above.
+	// Resolve symlinks on both runsRoot and the file so the containment check
+	// uses consistent physical paths (macOS /var → /private/var etc.).
 	let realDir: string;
 	let realFilePath: string;
 	try {
-		realDir = fs.realpathSync(dir);
+		realDir = fs.realpathSync(runsRoot);
 		realFilePath = fs.realpathSync(filePath);
-	} catch {
-		return null;
-	}
+	} catch { return null; }
 	const realRel = path.relative(realDir, realFilePath);
 	if (realRel === ".." || realRel.startsWith(`..${path.sep}`) || path.isAbsolute(realRel)) return null;
 	try {
 		const raw = fs.readFileSync(realFilePath, "utf-8");
 		return JSON.parse(raw) as RunState;
-	} catch {
-		return null;
-	}
+	} catch { return null; }
 }
+/**
+ * List recent runs, sorted by updatedAt descending.
+ *
+ * v0.0.9: reads from index first, then merges any legacy flat files not yet in
+ * the index.  If the index yields no entries (e.g. it is missing or corrupt),
+ * calls rebuildIndex for self-healing.
+ *
+ * F-010: drops loaded records with non-numeric/NaN updatedAt from the result.
+ */
 export function listRuns(cwd: string, limit = 20): RunState[] {
-	const dir = runsDir(cwd);
-	if (!fs.existsSync(dir)) return [];
-	let files: string[];
+	const root = runsDir(cwd);
+	if (!fs.existsSync(root)) return [];
+	// Index-first path.
+	let entries = readIndex(root);
+	if (entries.length === 0) {
+		// Index missing or corrupt — rebuild from filesystem.
+		entries = rebuildIndex(root);
+	}
+	// Collect runIds from index for deduplication.
+	const indexRunIds = new Set(entries.map((e) => e.runId));
+	// Merge legacy flat files not yet in the index.
+	let flatFiles: string[];
 	try {
-		files = fs.readdirSync(dir).filter((f) => f.endsWith(".json"));
-	} catch {
-		return [];
+		flatFiles = fs.readdirSync(root).filter(
+			(f) => f.endsWith(".json") && f !== "index.json" && !f.includes(".lock"),
+		);
+	} catch { flatFiles = []; }
+	for (const file of flatFiles) {
+		const runIdFromName = file.replace(/\.json$/, "");
+		if (indexRunIds.has(runIdFromName)) continue;
+		try {
+			const raw = fs.readFileSync(path.join(root, file), "utf-8");
+			const state = JSON.parse(raw) as RunState;
+			if (state && typeof state.runId === "string" && !indexRunIds.has(state.runId)) {
+				entries.push(extractIndexEntry(state, file));
+				indexRunIds.add(state.runId);
+			}
+		} catch { /* skip corrupt */ }
 	}
+	// Sort by updatedAt desc, slice to limit.
+	entries.sort((a, b) => b.updatedAt - a.updatedAt);
+	const sliced = entries.slice(0, limit);
+	// Read full RunState for each entry.
 	const runs: RunState[] = [];
-	for (const f of files) {
+	for (const e of sliced) {
 		try {
-			runs.push(JSON.parse(fs.readFileSync(path.join(dir, f), "utf-8")));
-		} catch {
-			/* ignore */
-		}
+			const raw = fs.readFileSync(path.join(root, e.relPath), "utf-8");
+			runs.push(JSON.parse(raw) as RunState);
+		} catch { /* file may have been deleted since index was built — skip */ }
 	}
-	// Guard against records missing/with non-numeric `updatedAt` — a bare
-	// `JSON.parse` may yield an object without it, and `undefined - undefined`
-	// is NaN, which makes `Array.prototype.sort` produce implementation-defined
-	// order. Drop those before sorting. (v0.0.8 audit, F-010.)
-	return runs
-		.filter((r) => typeof r.updatedAt === "number" && !Number.isNaN(r.updatedAt))
-		.sort((a, b) => b.updatedAt - a.updatedAt)
-		.slice(0, limit);
+	// F-010: filter out records with non-numeric/NaN updatedAt.
+	return runs.filter((r) => typeof r.updatedAt === "number" && !Number.isNaN(r.updatedAt));
 }
 /** Stable hash of a phase's resolved task + inputs, for resume caching. */
@@ -257,6 +744,8 @@ export function hashInput(...parts: string[]): string {
  * a crash or concurrent write from leaving a half-written, corrupt JSON file.
  */
 function writeFileAtomic(filePath: string, data: string): void {
+	// Ensure parent directory exists.
+	fs.mkdirSync(path.dirname(filePath), { recursive: true });
 	const tmp = `${filePath}.${process.pid}.${crypto.randomBytes(4).toString("hex")}.tmp`;
 	try {
 		fs.writeFileSync(tmp, data, "utf-8");

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "pi-taskflow",
-  "version": "0.0.8",
+  "version": "0.0.10",
   "description": "Lightweight workflow orchestration for the Pi coding agent — declarative multi-phase taskflows with dynamic fan-out, isolated subagent context, resumable runs, and saveable commands.",
   "keywords": [
     "pi-package",

package/skills/taskflow/SKILL.md CHANGED Viewed

@@ -106,7 +106,7 @@ routing. Use `join: "any"` on the merge phase so it runs whichever branch fired:
 { "id": "triage", "type": "agent", "agent": "analyst", "output": "json",
   "task": "Classify the task. Output ONLY {\"route\":\"deep\"} or {\"route\":\"quick\"}." },
 { "id": "deep",  "when": "{steps.triage.json.route} == deep",  "dependsOn": ["triage"], "agent": "analyst", "task": "..." },
-{ "id": "quick", "when": "{steps.triage.json.route} == quick", "dependsOn": ["triage"], "agent": "executor_fast", "task": "..." },
+{ "id": "quick", "when": "{steps.triage.json.route} == quick", "dependsOn": ["triage"], "agent": "executor-fast", "task": "..." },
 { "id": "report", "type": "reduce", "from": ["deep","quick"], "join": "any",
   "dependsOn": ["deep","quick"], "agent": "writer", "task": "...", "final": true }
 ```