pi-taskflow 0.0.8 → 0.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/examples/guarded-refactor.json +1 -1
- package/extensions/index.ts +18 -3
- package/extensions/runner.ts +14 -0
- package/extensions/runtime.ts +54 -47
- package/extensions/schema.ts +19 -6
- package/extensions/store.ts +544 -55
- package/package.json +1 -1
- package/skills/taskflow/SKILL.md +1 -1
package/README.md
CHANGED
|
@@ -156,10 +156,10 @@ declaratively, no scripting:
|
|
|
156
156
|
{ "id": "triage", "type": "agent", "agent": "analyst", "output": "json",
|
|
157
157
|
"task": "Classify the bug. Output ONLY {\"severity\":\"high\"} or {\"severity\":\"low\"}." },
|
|
158
158
|
{ "id": "deep", "when": "{steps.triage.json.severity} == high", "dependsOn": ["triage"],
|
|
159
|
-
"agent": "
|
|
159
|
+
"agent": "executor-code", "task": "Root-cause and patch it.",
|
|
160
160
|
"retry": { "max": 2, "backoffMs": 500 } },
|
|
161
161
|
{ "id": "quick", "when": "{steps.triage.json.severity} == low", "dependsOn": ["triage"],
|
|
162
|
-
"agent": "
|
|
162
|
+
"agent": "executor-fast", "task": "Apply the quick fix." },
|
|
163
163
|
{ "id": "approve", "type": "approval", "join": "any", "dependsOn": ["deep", "quick"],
|
|
164
164
|
"task": "Review the fix before it ships." },
|
|
165
165
|
{ "id": "ship", "type": "agent", "dependsOn": ["approve"],
|
|
@@ -25,7 +25,7 @@
|
|
|
25
25
|
{
|
|
26
26
|
"id": "implement",
|
|
27
27
|
"type": "agent",
|
|
28
|
-
"agent": "
|
|
28
|
+
"agent": "executor-code",
|
|
29
29
|
"dependsOn": ["approve", "plan"],
|
|
30
30
|
"task": "Implement the approved plan for {args.target}.\nPlan:\n{steps.plan.output}\nExtra human guidance (if any):\n{steps.approve.output}",
|
|
31
31
|
"retry": { "max": 1, "backoffMs": 1000 }
|
package/extensions/index.ts
CHANGED
|
@@ -50,8 +50,8 @@ const ShorthandStep = Type.Object(
|
|
|
50
50
|
);
|
|
51
51
|
|
|
52
52
|
const TaskflowParams = Type.Object({
|
|
53
|
-
action: StringEnum(["run", "save", "resume", "list"] as const, {
|
|
54
|
-
description: "What to do: run a flow, save a definition, resume a paused run, or list
|
|
53
|
+
action: StringEnum(["run", "save", "resume", "list", "agents"] as const, {
|
|
54
|
+
description: "What to do: run a flow, save a definition, resume a paused run, list saved flows, or list available agents you can use in phases",
|
|
55
55
|
default: "run",
|
|
56
56
|
}),
|
|
57
57
|
name: Type.Optional(Type.String({ description: "Name of a saved flow (for run/save without inline define)" })),
|
|
@@ -219,7 +219,7 @@ export default function (pi: ExtensionAPI) {
|
|
|
219
219
|
"Phases (agent, parallel, map, gate, reduce, approval, flow) form a DAG; intermediate outputs stay out of your context — only the final phase output is returned.",
|
|
220
220
|
"Use action=run with an inline `define` (you write the DSL) or a saved `name`.",
|
|
221
221
|
"For simple non-DAG delegations (like the subagent tool) skip the DSL: pass `task` (+optional `agent`) for one task, `tasks:[{task,agent?}]` to run in parallel, or `chain:[{task,agent?}]` to run sequentially (reference the prior step with {previous.output}).",
|
|
222
|
-
"Use action=save to persist a definition as a reusable /tf:<name> command. action=resume continues a paused run. action=list shows saved flows.",
|
|
222
|
+
"Use action=save to persist a definition as a reusable /tf:<name> command. action=resume continues a paused run. action=list shows saved flows. Use action=agents to list available agents — do NOT invent agent names; either use an agent from that list or omit the 'agent' field to auto-select the default agent.",
|
|
223
223
|
"DSL: {name, args?, concurrency?, budget?:{maxUSD,maxTokens}, phases:[{id, type, agent, task, dependsOn?, join?:'all'|'any', when?, retry?:{max,backoffMs,factor}, over?(map), as?(map), branches?(parallel), from?(reduce), use?(flow), with?(flow), output?:'json', final?}]}.",
|
|
224
224
|
"Phase types: agent (one subagent), parallel (static branches), map (dynamic fan-out over an array), gate (VERDICT: PASS/BLOCK quality gate), reduce (aggregate from N phases), approval (human-in-the-loop pause), flow (run a saved sub-flow). join:'any' is an OR-join; when is a conditional guard; retry adds backoff; budget caps run cost.",
|
|
225
225
|
"Interpolation: {args.X}, {steps.ID.output}, {steps.ID.json}, {item} (map), {previous.output}.",
|
|
@@ -235,6 +235,21 @@ export default function (pi: ExtensionAPI) {
|
|
|
235
235
|
async execute(_id, params, signal, onUpdate, ctx) {
|
|
236
236
|
const action = params.action ?? "run";
|
|
237
237
|
|
|
238
|
+
// agents — list available agents the LLM can use in phase definitions
|
|
239
|
+
if (action === "agents") {
|
|
240
|
+
const scope = params.scope ?? "both";
|
|
241
|
+
const { agents } = discoverAgents(ctx.cwd, scope as AgentScope, undefined);
|
|
242
|
+
const text = agents.length
|
|
243
|
+
? agents
|
|
244
|
+
.map(
|
|
245
|
+
(a) =>
|
|
246
|
+
`- ${a.name} (${a.source}): ${a.description}${a.model ? ` [model: ${a.model}]` : ""}${a.tools?.length ? ` [tools: ${a.tools.join(", ")}]` : ""}`,
|
|
247
|
+
)
|
|
248
|
+
.join("\n")
|
|
249
|
+
: "No agents found. Use the default agent by omitting the 'agent' field in phases.";
|
|
250
|
+
return { content: [{ type: "text", text }], details: { action } satisfies TaskflowDetails };
|
|
251
|
+
}
|
|
252
|
+
|
|
238
253
|
// list
|
|
239
254
|
if (action === "list") {
|
|
240
255
|
const flows = listFlows(ctx.cwd);
|
package/extensions/runner.ts
CHANGED
|
@@ -48,6 +48,20 @@ export function isFailed(r: RunResult): boolean {
|
|
|
48
48
|
return r.exitCode !== 0 || r.stopReason === "error" || r.stopReason === "aborted";
|
|
49
49
|
}
|
|
50
50
|
|
|
51
|
+
/**
 * Pattern for failures that look like temporary provider trouble: rate
 * limiting, overload, 429/502/503/504 status codes, "unavailable" messages,
 * timeouts, and dropped connections. Case-insensitive.
 *
 * Retrying these inside the taskflow run (with backoff) keeps them from
 * bubbling up to the calling agent, which otherwise tends to re-invoke the
 * whole tool and produce duplicate progress blocks.
 */
const TRANSIENT_ERROR_RE =
  /rate[_\s-]?limit|too\s+many\s+requests|overloaded|\b429\b|\b503\b|\b502\b|\b504\b|service\s+unavailable|temporarily\s+unavailable|timeout|timed?\s+out|econnreset|etimedout|socket\s+hang\s*up/i;

/**
 * Heuristically decide whether a run result describes a transient error.
 *
 * @param r - Result of a subagent run.
 * @returns `false` for aborted runs; otherwise `true` when the error message,
 *   stderr, or output text matches {@link TRANSIENT_ERROR_RE}.
 */
export function isTransientError(r: RunResult): boolean {
  // An abort is a deliberate stop, never something to retry.
  if (r.stopReason === "aborted") return false;
  // Missing fields contribute an empty string; parts are space-separated.
  const haystack = [r.errorMessage, r.stderr, r.output].map((part) => part ?? "").join(" ");
  return TRANSIENT_ERROR_RE.test(haystack);
}
|
|
64
|
+
|
|
51
65
|
/** Placeholder written to a failed phase's `output` so downstream interpolation
|
|
52
66
|
* can detect "upstream failed" without being polluted by raw HTML/JSON. */
|
|
53
67
|
export const TRANSPORT_ERROR_PLACEHOLDER = "(upstream error: subagent failed; see error)";
|
package/extensions/runtime.ts
CHANGED
|
@@ -14,7 +14,7 @@ import * as path from "node:path";
|
|
|
14
14
|
import * as fs from "node:fs";
|
|
15
15
|
import type { AgentConfig } from "./agents.ts";
|
|
16
16
|
import { coerceArray, evaluateCondition, interpolate, type InterpolationContext, safeParse } from "./interpolate.ts";
|
|
17
|
-
import { isFailed, type LiveUpdate, mapWithConcurrencyLimit, runAgentTask, type RunResult } from "./runner.ts";
|
|
17
|
+
import { isFailed, isTransientError, type LiveUpdate, mapWithConcurrencyLimit, runAgentTask, type RunResult } from "./runner.ts";
|
|
18
18
|
import { aggregateUsage, emptyUsage, type UsageStats } from "./usage.ts";
|
|
19
19
|
import { type Budget, dependenciesOf, finalPhase, type Phase, resolveArgs, type Taskflow, topoLayers } from "./schema.ts";
|
|
20
20
|
import { hashInput, newRunId, type PhaseState, type RunState } from "./store.ts";
|
|
@@ -314,9 +314,20 @@ async function executePhase(
|
|
|
314
314
|
|
|
315
315
|
// Wrap each subagent call in the phase's retry policy. Usage is summed across
|
|
316
316
|
// attempts; the attempt count rides along on the result for the TUI.
|
|
317
|
+
//
|
|
318
|
+
// Even without an explicit `phase.retry`, transient provider errors (rate
|
|
319
|
+
// limits, overload, 5xx, timeouts) are retried with backoff so a momentary
|
|
320
|
+
// 429 is absorbed inside this run instead of bubbling up and provoking the
|
|
321
|
+
// calling agent to re-invoke the whole tool (which stacks duplicate progress
|
|
322
|
+
// blocks in the transcript).
|
|
317
323
|
const retry = phase.retry;
|
|
324
|
+
const DEFAULT_TRANSIENT_RETRIES = 3;
|
|
325
|
+
const DEFAULT_TRANSIENT_BACKOFF_MS = 2000;
|
|
326
|
+
const DEFAULT_TRANSIENT_FACTOR = 2;
|
|
318
327
|
const runOne = async (agentName: string, task: string, onLive?: (l: LiveUpdate) => void): Promise<RunResult> => {
|
|
319
|
-
const
|
|
328
|
+
const explicitMax = Math.max(1, 1 + Math.max(0, Math.floor(retry?.max ?? 0)));
|
|
329
|
+
// Allow enough attempts to cover whichever policy applies on a given attempt.
|
|
330
|
+
const maxAttempts = Math.max(explicitMax, 1 + DEFAULT_TRANSIENT_RETRIES);
|
|
320
331
|
const usages: UsageStats[] = [];
|
|
321
332
|
let last: RunResult | undefined;
|
|
322
333
|
for (let attempt = 0; attempt < maxAttempts; attempt++) {
|
|
@@ -330,10 +341,21 @@ async function executePhase(
|
|
|
330
341
|
if (!isFailed(last)) break;
|
|
331
342
|
// Stop retrying on abort or once the run is over budget.
|
|
332
343
|
if (deps.signal?.aborted || overBudget(state).over) break;
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
344
|
+
// Decide whether THIS failure warrants another attempt. Explicit retry
|
|
345
|
+
// policy covers all failures up to its cap; the transient fallback covers
|
|
346
|
+
// only retryable provider errors. A non-transient failure with no explicit
|
|
347
|
+
// policy stops immediately (no point burning attempts on a hard error).
|
|
348
|
+
const withinExplicit = attempt < explicitMax - 1;
|
|
349
|
+
const transient = isTransientError(last);
|
|
350
|
+
const withinTransient = transient && attempt < DEFAULT_TRANSIENT_RETRIES;
|
|
351
|
+
if (!withinExplicit && !withinTransient) break;
|
|
352
|
+
// Backoff: prefer the explicit policy's curve when the phase defines one
|
|
353
|
+
// (covers transient retries too, and keeps tests fast with backoffMs:0),
|
|
354
|
+
// otherwise use the transient defaults.
|
|
355
|
+
const baseMs = retry ? (retry.backoffMs ?? 0) : DEFAULT_TRANSIENT_BACKOFF_MS;
|
|
356
|
+
const factor = retry ? (retry.factor ?? 1) : DEFAULT_TRANSIENT_FACTOR;
|
|
357
|
+
const wait = Math.min(60000, Math.round(baseMs * factor ** attempt));
|
|
358
|
+
if (wait > 0) await delay(wait, deps.signal);
|
|
337
359
|
}
|
|
338
360
|
// Aborted before any attempt ran → return a clean aborted result (no crash).
|
|
339
361
|
if (!last) {
|
|
@@ -414,11 +436,12 @@ async function executePhase(
|
|
|
414
436
|
if (type === "agent" || type === "gate" || type === "reduce") {
|
|
415
437
|
const { text } = interpolate(phase.task ?? "", ctx);
|
|
416
438
|
const fullTask = preRead + text;
|
|
417
|
-
const
|
|
439
|
+
const agentName = resolveAgent(phase.agent, deps, state);
|
|
440
|
+
const inputHash = hashInput(phase.id, agentName, fullTask);
|
|
418
441
|
const cached = cachedPhase(prior, inputHash);
|
|
419
442
|
if (cached) return cached;
|
|
420
443
|
|
|
421
|
-
const r = await runOne(
|
|
444
|
+
const r = await runOne(agentName, fullTask, liveSink(state, phase.id, emitProgress));
|
|
422
445
|
const ps = resultToPhaseState(phase.id, r, inputHash, parseJson);
|
|
423
446
|
if (type === "gate" && ps.status === "done") ps.gate = parseGateVerdict(r.output);
|
|
424
447
|
return ps;
|
|
@@ -428,7 +451,7 @@ async function executePhase(
|
|
|
428
451
|
const branches = (phase.branches ?? []).map((b) => {
|
|
429
452
|
const r = interpolate(b.task, ctx);
|
|
430
453
|
return {
|
|
431
|
-
agent: b.agent ?? phase.agent
|
|
454
|
+
agent: resolveAgent(b.agent ?? phase.agent, deps, state),
|
|
432
455
|
task: preRead + r.text,
|
|
433
456
|
};
|
|
434
457
|
});
|
|
@@ -458,7 +481,7 @@ async function executePhase(
|
|
|
458
481
|
const tasks = arr.map((item) => {
|
|
459
482
|
const localCtx = buildInterpolationContext(state, previousOutput, { [loopVar]: item });
|
|
460
483
|
return {
|
|
461
|
-
agent: phase.agent
|
|
484
|
+
agent: resolveAgent(phase.agent, deps, state),
|
|
462
485
|
task: preRead + interpolate(phase.task ?? "", localCtx).text,
|
|
463
486
|
};
|
|
464
487
|
});
|
|
@@ -641,6 +664,27 @@ function cachedPhase(prior: PhaseState | undefined, inputHash: string): PhaseSta
|
|
|
641
664
|
return null;
|
|
642
665
|
}
|
|
643
666
|
|
|
667
|
+
/**
 * Unknown agent names that have already produced a warning, keyed by the run
 * state object they were seen in.
 *
 * This bookkeeping lives outside `RunState` on purpose: a `Set` attached to
 * the state object would be written out as `{}` by `JSON.stringify`, and a
 * state reloaded with that `{}` in place would be truthy but have no `.has`,
 * so the next lookup would throw.
 * NOTE(review): assumes run state is persisted as JSON — confirm against the store.
 */
const unknownAgentWarnings = new WeakMap<object, Set<string>>();

/**
 * Resolve a requested agent name against the agents available to this run.
 *
 * @param name - Agent requested by the phase; `undefined` or empty means "use the default".
 * @param deps - Runtime dependencies; `deps.agents` is the list of known agents.
 * @param state - Current run state, used only to scope the warn-once bookkeeping.
 * @returns `name` when it matches a known agent, otherwise the default agent.
 *   An unknown name logs one `console.warn` per distinct name per run state.
 */
function resolveAgent(name: string | undefined, deps: RuntimeDeps, state: RunState): string {
  // No agent requested (missing or empty string) → default agent.
  if (!name) return defaultAgent(deps);
  if (deps.agents.some((a) => a.name === name)) return name;

  const fallback = defaultAgent(deps);
  let warned = unknownAgentWarnings.get(state);
  if (!warned) {
    warned = new Set<string>();
    unknownAgentWarnings.set(state, warned);
  }
  // Warn only once per unknown name to avoid noise on fan-out phases.
  if (!warned.has(name)) {
    warned.add(name);
    console.warn(`[taskflow] Unknown agent "${name}", falling back to "${fallback}". Use action=agents to list available agents.`);
  }
  return fallback;
}
|
|
687
|
+
|
|
644
688
|
function defaultAgent(deps: RuntimeDeps): string {
|
|
645
689
|
return deps.agents[0]?.name ?? "default";
|
|
646
690
|
}
|
|
@@ -719,45 +763,8 @@ function safeProgress(deps: RuntimeDeps, state: RunState): void {
|
|
|
719
763
|
/**
|
|
720
764
|
* Execute a full taskflow. Mutates and persists `state` as it progresses.
|
|
721
765
|
*/
|
|
722
|
-
function ensureImplicitGate(def: Taskflow): void {
|
|
723
|
-
// Respect explicit opt-out
|
|
724
|
-
if ((def as any).implicitGate === false) return;
|
|
725
|
-
|
|
726
|
-
const hasGate = def.phases.some(
|
|
727
|
-
(p) => p.type === "gate" || p.type === "approval" || p.id === "_implicit-gate",
|
|
728
|
-
);
|
|
729
|
-
if (hasGate || def.phases.length === 0) return;
|
|
730
|
-
|
|
731
|
-
// The last existing phase is the effective "final" phase — pin it so the
|
|
732
|
-
// injected gate doesn't become the finalOutput.
|
|
733
|
-
const lastPhase = def.phases[def.phases.length - 1];
|
|
734
|
-
if (!lastPhase.final && !def.phases.some((p) => p.final)) {
|
|
735
|
-
lastPhase.final = true;
|
|
736
|
-
}
|
|
737
|
-
|
|
738
|
-
const allIds = def.phases.map((p) => p.id);
|
|
739
|
-
def.phases.push({
|
|
740
|
-
id: "_implicit-gate",
|
|
741
|
-
type: "gate",
|
|
742
|
-
dependsOn: allIds,
|
|
743
|
-
agent: "reviewer",
|
|
744
|
-
task: `Review all phase outputs from this taskflow for accuracy and consistency.
|
|
745
|
-
|
|
746
|
-
For each upstream phase, scan its output for:
|
|
747
|
-
1. **Factual accuracy**: Any file paths, line numbers, or code snippets that are wrong?
|
|
748
|
-
2. **Internal contradictions**: Do any phases contradict each other?
|
|
749
|
-
3. **Completeness**: Is any output truncated, empty, or anomalously short?
|
|
750
|
-
4. **Hallucination markers**: Wrong file names, impossible line ranges, circular logic, information not in the given context.
|
|
751
|
-
|
|
752
|
-
Output:
|
|
753
|
-
- If ALL outputs look consistent and plausible: output **VERDICT: PASS** with a one-line summary.
|
|
754
|
-
- If ANY issues found: output **VERDICT: BLOCK** listing each issue with the phase ID and specific concern.`,
|
|
755
|
-
});
|
|
756
|
-
}
|
|
757
|
-
|
|
758
766
|
export async function executeTaskflow(state: RunState, deps: RuntimeDeps): Promise<RuntimeResult> {
|
|
759
767
|
const def: Taskflow = state.def;
|
|
760
|
-
ensureImplicitGate(def);
|
|
761
768
|
try {
|
|
762
769
|
return await runTaskflowLayers(state, deps);
|
|
763
770
|
} catch (e) {
|
package/extensions/schema.ts
CHANGED
|
@@ -147,12 +147,6 @@ export const TaskflowSchema = Type.Object(
|
|
|
147
147
|
}),
|
|
148
148
|
),
|
|
149
149
|
phases: Type.Array(PhaseSchema, { minItems: 1, description: "Ordered phase definitions (DAG via dependsOn)" }),
|
|
150
|
-
implicitGate: Type.Optional(
|
|
151
|
-
Type.Boolean({
|
|
152
|
-
description: "When true (default), a reviewer gate is auto-injected after all phases if no explicit gate or approval exists",
|
|
153
|
-
default: true,
|
|
154
|
-
}),
|
|
155
|
-
),
|
|
156
150
|
},
|
|
157
151
|
{ additionalProperties: false },
|
|
158
152
|
);
|
|
@@ -342,6 +336,16 @@ export function validateTaskflow(def: unknown, opts: ValidationOptions = {}): Va
|
|
|
342
336
|
if (p.join && !JOIN_MODES.includes(p.join as JoinMode)) {
|
|
343
337
|
errors.push(`Phase '${p.id}': unknown join mode '${p.join}'`);
|
|
344
338
|
}
|
|
339
|
+
|
|
340
|
+
// Agent name convention: hyphens only (per AGENTS.md naming convention)
|
|
341
|
+
if (p.agent && typeof p.agent === "string" && p.agent.includes("_")) {
|
|
342
|
+
errors.push(`Phase '${p.id}': agent name '${p.agent}' uses underscores — use hyphens (e.g. 'executor-code' not 'executor_code')`);
|
|
343
|
+
}
|
|
344
|
+
|
|
345
|
+
// Phase id convention: hyphens only (consistent with agent naming)
|
|
346
|
+
if (p.id && p.id.includes("_")) {
|
|
347
|
+
errors.push(`Phase '${p.id}': id uses underscores — use hyphens for consistency with agent naming convention`);
|
|
348
|
+
}
|
|
345
349
|
}
|
|
346
350
|
|
|
347
351
|
// dependsOn / from references must exist
|
|
@@ -355,6 +359,15 @@ export function validateTaskflow(def: unknown, opts: ValidationOptions = {}): Va
|
|
|
355
359
|
}
|
|
356
360
|
}
|
|
357
361
|
|
|
362
|
+
// Agent name format validation (AGENTS.md naming convention: hyphens only, no underscores)
|
|
363
|
+
const VALID_AGENT_RE = /^[a-z][a-z0-9-]*$/;
|
|
364
|
+
for (const p of flow.phases) {
|
|
365
|
+
if (!p?.id) continue;
|
|
366
|
+
if (p.agent && !VALID_AGENT_RE.test(p.agent)) {
|
|
367
|
+
errors.push(`Phase '${p.id}': agent '${p.agent}' has invalid name format (expected lowercase alphanumeric with hyphens)`);
|
|
368
|
+
}
|
|
369
|
+
}
|
|
370
|
+
|
|
358
371
|
// Cycle detection (Kahn)
|
|
359
372
|
if (errors.length === 0) {
|
|
360
373
|
const cycle = detectCycle(flow.phases as Phase[]);
|
package/extensions/store.ts
CHANGED
|
@@ -3,7 +3,15 @@
|
|
|
3
3
|
*
|
|
4
4
|
* Definitions: .pi/taskflows/<name>.json (project)
|
|
5
5
|
* ~/.pi/agent/taskflows/<name>.json (user)
|
|
6
|
-
* Run state: .pi/taskflows/runs/<runId>.json
|
|
6
|
+
* Run state: .pi/taskflows/runs/<sanitizedFlowName>/<runId>.json
|
|
7
|
+
* Index: .pi/taskflows/runs/index.json (lookup accelerator)
|
|
8
|
+
*
|
|
9
|
+
* Legacy layout (v0.0.8 and earlier):
|
|
10
|
+
* .pi/taskflows/runs/<runId>.json (flat, still readable)
|
|
11
|
+
*
|
|
12
|
+
* v0.0.9 refactor: per-flow subdirectory layout + lightweight index + file
|
|
13
|
+
* lock + TTL/cap cleanup. Full backward compatibility with the flat layout
|
|
14
|
+
* is maintained: loadRun and listRuns still discover legacy flat files.
|
|
7
15
|
*/
|
|
8
16
|
|
|
9
17
|
import * as crypto from "node:crypto";
|
|
@@ -66,6 +74,403 @@ export interface RunState {
|
|
|
66
74
|
cwd: string;
|
|
67
75
|
}
|
|
68
76
|
|
|
77
|
+
// ---------------------------------------------------------------------------
// Run index — one small record per run, stored together in runs/index.json.
// The index lets listRuns locate run files without scanning every directory.
// Entries may be missing or stale; the index is rebuilt when that happens.
// ---------------------------------------------------------------------------

/** Lightweight lookup record for a single run, as persisted in the run index. */
export interface RunIndexEntry {
  /** Identifier of the run this entry describes. */
  runId: string;
  /** Name of the flow the run belongs to. */
  flowName: string;
  /** Run status, mirrored from the run state. */
  status: RunState["status"];
  /** Creation timestamp, mirrored from the run state. */
  createdAt: number;
  /** Last-update timestamp, mirrored from the run state. */
  updatedAt: number;
  /** Location of the run file relative to runsRoot, e.g. "test-flow/test-roundtrip-001.json". */
  relPath: string;
}
|
|
93
|
+
|
|
94
|
+
// ---------------------------------------------------------------------------
// File-lock tuning
// ---------------------------------------------------------------------------

/** Age (ms) after which a lock file is treated as orphaned, e.g. by a crash or kill -9. */
const LOCK_STALE_MS = 30 * 1000;
/** Pause (ms) between lock acquisition attempts. */
const LOCK_POLL_MS = 50;
/** Default time (ms) to keep trying before lock acquisition throws. */
const LOCK_TIMEOUT_MS = 10 * 1000;

// ---------------------------------------------------------------------------
// Cleanup throttling and retention
// ---------------------------------------------------------------------------

/** Minimum gap (ms) between opportunistic cleanup passes triggered from saveRun. */
const CLEANUP_INTERVAL_MS = 60 * 1000;
/** Default cap on the number of terminal runs that are kept. */
const DEFAULT_MAX_KEPT_TERMINAL = 100;
/** Default age limit, in days, for terminal runs. */
const DEFAULT_MAX_AGE_DAYS = 30;

/** Timestamp of the most recent cleanup; module-level so it survives across calls. */
let lastCleanupAt = 0;
|
|
118
|
+
|
|
119
|
+
// ---------------------------------------------------------------------------
|
|
120
|
+
// Internal helpers — path construction & sanitisation
|
|
121
|
+
// ---------------------------------------------------------------------------
|
|
122
|
+
|
|
123
|
+
/**
 * Turn a flow name into a directory name that is safe to join under the runs
 * root.
 *
 * Two passes are applied:
 *  1. every run of characters outside `[\w.-]` becomes a single `_`;
 *  2. any leading run of dots becomes a single `_`.
 *
 * The second pass matters because `.` survives the first one: without it a
 * flow named "." or ".." would resolve outside the runs root (write-side path
 * traversal), and names such as ".git" would create hidden directories.
 *
 * @param flowName - Raw flow name; any string is accepted.
 * @returns A non-empty directory name; `"_"` when nothing else is left.
 */
function safeFlowDirName(flowName: string): string {
  const substituted = flowName.replace(/[^\w.-]+/g, "_");
  const withoutLeadingDots = substituted.replace(/^\.+/, "_");
  return withoutLeadingDots === "" ? "_" : withoutLeadingDots;
}
|
|
139
|
+
|
|
140
|
+
/**
 * Directory that holds every run file of one flow: `<runsRoot>/<sanitised flow name>`.
 *
 * @param runsRoot - Root directory of all runs.
 * @param flowName - Raw flow name; sanitised before it is joined.
 */
function flowRunDir(runsRoot: string, flowName: string): string {
  const dirName = safeFlowDirName(flowName);
  return path.join(runsRoot, dirName);
}
|
|
144
|
+
|
|
145
|
+
/**
 * Full path of a run file in the per-flow subdirectory layout:
 * `<runsRoot>/<sanitised flow name>/<runId>.json`.
 */
function runFilePath(runsRoot: string, flowName: string, runId: string): string {
  const fileName = `${runId}.json`;
  return path.join(flowRunDir(runsRoot, flowName), fileName);
}
|
|
149
|
+
|
|
150
|
+
/** Location of the run index file directly under the runs root. */
function indexPath(runsRoot: string): string {
  const indexFile = "index.json";
  return path.join(runsRoot, indexFile);
}
|
|
154
|
+
|
|
155
|
+
/** Location of the lock file that serialises read-modify-write cycles on index.json. */
function indexLockPath(runsRoot: string): string {
  const lockFile = "index.json.lock";
  return path.join(runsRoot, lockFile);
}
|
|
159
|
+
|
|
160
|
+
/**
 * Location of the lock file for one run. It sits next to the run file:
 * `<runsRoot>/<sanitised flow name>/<runId>.json.lock`.
 */
function lockPathForRun(runsRoot: string, flowName: string, runId: string): string {
  const lockFile = `${runId}.json.lock`;
  return path.join(flowRunDir(runsRoot, flowName), lockFile);
}
|
|
164
|
+
|
|
165
|
+
/**
 * Cheap safety check on a runId before it is used to build a filesystem path.
 *
 * This is a deny-list: the id must be a non-empty string and must not contain
 * a forward slash, a backslash, or a NUL character. Ids generated by
 * newRunId() are expected to use only [A-Za-z0-9._-], but that narrower
 * alphabet is not enforced here.
 *
 * @returns `true` when the id passes the checks above.
 */
function validateRunId(runId: string): boolean {
  if (typeof runId !== "string" || runId.length === 0) return false;
  for (const forbidden of ["/", "\\", "\0"]) {
    if (runId.includes(forbidden)) return false;
  }
  return true;
}
|
|
178
|
+
|
|
179
|
+
// ---------------------------------------------------------------------------
|
|
180
|
+
// File-lock primitives — zero-dependency, using O_CREAT|O_EXCL (atomic)
|
|
181
|
+
// ---------------------------------------------------------------------------
|
|
182
|
+
|
|
183
|
+
/**
 * Try to create the lock file exclusively and stamp it with this process's
 * pid and the current time.
 *
 * @returns `true` when the lock was created, `false` when it already exists.
 * @throws Any filesystem error other than EEXIST. If stamping the file fails,
 *   the descriptor is closed and the half-created lock is removed first so it
 *   does not block other processes until it goes stale.
 */
function tryCreateLockFile(lockPath: string): boolean {
  let fd: number;
  try {
    fd = fs.openSync(lockPath, "wx");
  } catch (e: unknown) {
    if ((e as NodeJS.ErrnoException).code === "EEXIST") return false;
    throw e;
  }
  try {
    fs.writeFileSync(fd, JSON.stringify({ pid: process.pid, ts: Date.now() }));
  } catch (e: unknown) {
    try { fs.closeSync(fd); } catch { /* already failing — report the write error */ }
    try { fs.unlinkSync(lockPath); } catch { /* best effort */ }
    throw e;
  }
  fs.closeSync(fd);
  return true;
}

/**
 * Inspect an existing lock file and move it out of the way if it is stale
 * (older than LOCK_STALE_MS).
 *
 * The stale file is renamed to a unique "graveyard" name and then deleted.
 * When several processes race to steal the same file, one rename succeeds and
 * the others fail with ENOENT.
 * NOTE(review): rename acts on the path, not on the inode that was stat'ed, so
 * a lock re-created between the stat and the rename could still be moved —
 * a narrow residual window worth confirming is acceptable.
 *
 * @returns `true` when the lock path is (probably) free and the caller should
 *   retry at once: the file vanished, or it was stale and has been removed.
 *   `false` when the caller should wait: the lock is fresh, or the filesystem
 *   reported an error other than ENOENT.
 */
function evictStaleLock(lockPath: string): boolean {
  let mtimeMs: number;
  try {
    mtimeMs = fs.statSync(lockPath).mtimeMs;
  } catch (e: unknown) {
    // ENOENT: released between the failed create and this stat.
    return (e as NodeJS.ErrnoException).code === "ENOENT";
  }
  if (Date.now() - mtimeMs <= LOCK_STALE_MS) return false;

  const grave = `${lockPath}.stale.${process.pid}.${crypto.randomBytes(4).toString("hex")}`;
  try {
    fs.renameSync(lockPath, grave);
  } catch (e: unknown) {
    // ENOENT: another stealer won. Anything else (e.g. EPERM) is not progress.
    return (e as NodeJS.ErrnoException).code === "ENOENT";
  }
  try { fs.unlinkSync(grave); } catch { /* ignore */ }
  return true;
}

/**
 * Acquire a file lock by atomically creating a lock file.
 *
 * Creation uses the `wx` flag (O_CREAT|O_EXCL), so only one caller can create
 * a given lock file. A lock older than LOCK_STALE_MS is treated as orphaned
 * and stolen via rename. While the lock is held by someone else, the call
 * sleeps LOCK_POLL_MS between attempts.
 *
 * Unexpected filesystem errors while probing or stealing an existing lock are
 * treated like "lock is held": the call sleeps and stays subject to the
 * timeout, rather than retrying in a tight loop that never times out.
 *
 * @param lockPath - Path of the lock file; its parent directory is created if needed.
 * @param timeoutMs - How long to keep waiting for a held lock before giving up.
 * @throws Error "Lock timeout after …" when the lock stays held past `timeoutMs`;
 *   any non-EEXIST filesystem error raised while creating the lock file.
 */
function acquireLock(lockPath: string, timeoutMs: number = LOCK_TIMEOUT_MS): void {
  const start = Date.now();
  // The lock file lives inside the flow subdirectory, which may not exist yet.
  fs.mkdirSync(path.dirname(lockPath), { recursive: true });
  // Scratch cell for Atomics.wait, which blocks without burning CPU.
  const sleepCell = new Int32Array(new SharedArrayBuffer(4));

  while (true) {
    if (tryCreateLockFile(lockPath)) return; // lock acquired
    if (evictStaleLock(lockPath)) continue; // path freed — retry immediately

    // Lock is held (or could not be probed) — wait, bounded by the timeout.
    if (Date.now() - start > timeoutMs) {
      throw new Error(`Lock timeout after ${timeoutMs}ms waiting for ${path.basename(lockPath)}`);
    }
    Atomics.wait(sleepCell, 0, 0, LOCK_POLL_MS);
  }
}
|
|
240
|
+
|
|
241
|
+
/**
 * Release a file lock by deleting the lock file, but only if this process
 * still appears to own it.
 *
 * A holder that ran longer than the stale threshold may have had its lock
 * stolen and re-created by another process. Deleting the file unconditionally
 * would then remove that process's lock, so the owner pid recorded in the
 * file is checked first:
 *  - recorded pid differs from this process → leave the file alone;
 *  - pid matches, or the file is missing or unparseable → attempt the unlink.
 *
 * All filesystem errors are ignored; release is best-effort.
 * NOTE(review): the read-then-unlink pair is not atomic; a steal landing
 * between the two calls is still possible, just far less likely.
 *
 * @param lockPath - Path of the lock file to release.
 */
function releaseLock(lockPath: string): void {
  try {
    const owner: unknown = JSON.parse(fs.readFileSync(lockPath, "utf-8"));
    const ownerPid = typeof owner === "object" && owner !== null ? (owner as { pid?: unknown }).pid : undefined;
    // Someone else holds the lock now — it is not ours to delete.
    if (typeof ownerPid === "number" && ownerPid !== process.pid) return;
  } catch {
    // Missing or unreadable lock file: fall through to the best-effort unlink.
  }
  try { fs.unlinkSync(lockPath); } catch { /* ENOENT or other — ignore */ }
}
|
|
248
|
+
|
|
249
|
+
/**
 * Run `fn` while holding the lock at `lockPath`.
 *
 * The lock is acquired before `fn` is called and released afterwards whether
 * `fn` returns or throws.
 *
 * @param lockPath - Path of the lock file to hold for the duration of `fn`.
 * @param fn - Synchronous work to perform under the lock.
 * @returns Whatever `fn` returns.
 */
function withLock<T>(lockPath: string, fn: () => T): T {
  acquireLock(lockPath);
  let result: T;
  try {
    result = fn();
  } finally {
    releaseLock(lockPath);
  }
  return result;
}
|
|
260
|
+
|
|
261
|
+
// ---------------------------------------------------------------------------
|
|
262
|
+
// Index CRUD
|
|
263
|
+
// ---------------------------------------------------------------------------
|
|
264
|
+
|
|
265
|
+
/**
 * Build the index record for a run.
 *
 * @param state - Run state to summarise; only its id, flow name, status and
 *   timestamps are copied.
 * @param relPath - Path of the run file relative to the runs root.
 * @returns A new RunIndexEntry combining those fields with `relPath`.
 */
function extractIndexEntry(state: RunState, relPath: string): RunIndexEntry {
  const { runId, flowName, status, createdAt, updatedAt } = state;
  return { runId, flowName, status, createdAt, updatedAt, relPath };
}
|
|
278
|
+
|
|
279
|
+
/**
 * Load the run index from disk.
 *
 * Never throws: a missing file, unreadable file, invalid JSON, or a top-level
 * value that is not an array all yield an empty list. Array elements are kept
 * only when they are truthy and have string `runId` and `relPath` fields; no
 * other fields are checked.
 *
 * @param runsRoot - Root directory of all runs.
 * @returns The usable index entries, possibly empty.
 */
function readIndex(runsRoot: string): RunIndexEntry[] {
  try {
    const parsed: unknown = JSON.parse(fs.readFileSync(indexPath(runsRoot), "utf-8"));
    if (!Array.isArray(parsed)) return [];
    const candidates = parsed as RunIndexEntry[];
    // Minimal shape check only — enough to look an entry up and find its file.
    return candidates.filter((item) => item && typeof item.runId === "string" && typeof item.relPath === "string");
  } catch {
    return [];
  }
}
|
|
293
|
+
|
|
294
|
+
/**
 * Replace the whole run index with `entries`, serialised as 2-space-indented
 * JSON and written through writeFileAtomic.
 */
function writeIndex(runsRoot: string, entries: RunIndexEntry[]): void {
  const target = indexPath(runsRoot);
  const serialized = JSON.stringify(entries, null, 2);
  writeFileAtomic(target, serialized);
}
|
|
298
|
+
|
|
299
|
+
/**
 * Insert or replace the index entry for `entry.runId` (read → mutate → write).
 *
 * The whole cycle runs under the dedicated index lock. Concurrent saves of
 * *different* runs each hold only their own per-run lock, so without this
 * lock their read-modify-write cycles on the shared index could interleave
 * and drop each other's entries. The per-run lock protects the run file;
 * the index lock protects the index.
 *
 * @param runsRoot - Root directory of all runs.
 * @param entry - Entry to store; replaces an existing entry with the same runId.
 */
function updateIndexEntry(runsRoot: string, entry: RunIndexEntry): void {
  withLock(indexLockPath(runsRoot), () => {
    const current = readIndex(runsRoot);
    const position = current.findIndex((existing) => existing.runId === entry.runId);
    if (position === -1) {
      current.push(entry);
    } else {
      current[position] = entry;
    }
    writeIndex(runsRoot, current);
  });
}
|
|
321
|
+
|
|
322
|
+
// Note: removeIndexEntry is available but not currently called; cleanupTerminalRuns
|
|
323
|
+
// rewrites the full index instead. Kept as a comment for future use.
|
|
324
|
+
|
|
325
|
+
/**
 * Scan all per-flow subdirectories plus legacy flat files and rebuild the index.
 * Called when the index is missing or corrupt (self-healing).
 *
 * Deduplicates by runId: a subdirectory entry wins over a flat one.
 *
 * The filesystem scan runs without the index lock to keep the critical section
 * short. Because of that, a concurrent updateIndexEntry may commit an entry
 * after the scan has passed its file. To avoid overwriting such an entry with
 * the stale snapshot, the index is re-read *inside* the lock and merged:
 *  - a committed entry the scan did not see is kept if its file still exists;
 *  - a committed entry that is strictly newer than the scanned one replaces it.
 *
 * @param runsRoot Directory that holds the index file and the run files.
 * @returns The entries that were persisted as the new index.
 */
function rebuildIndex(runsRoot: string): RunIndexEntry[] {
  const entries = new Map<string, RunIndexEntry>();

  // Parse one run file and record it; unreadable or corrupt files are skipped.
  const record = (absPath: string, relPath: string, overwrite: boolean): void => {
    try {
      const state = JSON.parse(fs.readFileSync(absPath, "utf-8")) as RunState;
      if (!state || typeof state.runId !== "string") return;
      if (!overwrite && entries.has(state.runId)) return;
      entries.set(state.runId, extractIndexEntry(state, relPath));
    } catch { /* skip corrupt */ }
  };

  const isRunFileName = (name: string): boolean =>
    name.endsWith(".json") && !name.includes(".lock");

  // List runsRoot once; the listing feeds both the subdirectory and flat scans.
  let rootEntries: fs.Dirent[];
  try {
    rootEntries = fs.readdirSync(runsRoot, { withFileTypes: true });
  } catch {
    rootEntries = [];
  }

  // Scan per-flow subdirectories.
  for (const dirent of rootEntries) {
    if (!dirent.isDirectory()) continue;
    const dirName = dirent.name;
    const dirPath = path.join(runsRoot, dirName);

    let files: string[];
    try {
      files = fs.readdirSync(dirPath).filter(isRunFileName);
    } catch { continue; }

    for (const file of files) {
      record(path.join(dirPath, file), `${dirName}/${file}`, true);
    }
  }

  // Scan legacy flat files (runs/*.json, skip index.json).
  for (const dirent of rootEntries) {
    const file = dirent.name;
    if (!isRunFileName(file) || file === "index.json") continue;
    if (entries.has(file.replace(/\.json$/, ""))) continue; // prefer subdir entry
    record(path.join(runsRoot, file), file, false);
  }

  let result: RunIndexEntry[] = [];
  withLock(indexLockPath(runsRoot), () => {
    // Merge anything committed since the scan started so it is not lost (M1).
    for (const committed of readIndex(runsRoot)) {
      const scanned = entries.get(committed.runId);
      if (scanned) {
        if (committed.updatedAt > scanned.updatedAt) entries.set(committed.runId, committed);
      } else if (fs.existsSync(path.join(runsRoot, committed.relPath))) {
        entries.set(committed.runId, committed);
      }
    }
    result = Array.from(entries.values());
    writeIndex(runsRoot, result);
  });
  return result;
}
|
|
389
|
+
|
|
390
|
+
// ---------------------------------------------------------------------------
|
|
391
|
+
// TTL / cap cleanup
|
|
392
|
+
// ---------------------------------------------------------------------------
|
|
393
|
+
|
|
394
|
+
/**
 * Remove excess and expired terminal (completed/failed) runs.
 *
 * Called opportunistically at the end of saveRun and throttled to at most one
 * pass per CLEANUP_INTERVAL_MS. Entries whose status is anything other than
 * "completed" or "failed" are treated as active and are never touched.
 *
 * The index read-modify-write is performed under the index lock, and the index
 * is re-read *inside* the lock, so the rewrite reflects the latest committed
 * state and cannot clobber an entry added by a concurrent updateIndexEntry (M1).
 * File and directory removal happens after the lock is released to keep the
 * critical section short; deleting a file that is no longer indexed is harmless.
 *
 * Index contents come from disk and are not trusted: an entry whose relPath
 * resolves lexically outside runsRoot is pruned from the index, but nothing is
 * unlinked for it. runsRoot itself is never passed to rmdir.
 *
 * @param runsRoot Directory that holds the index file and the run files.
 * @param maxKeep Maximum number of terminal runs to keep (newest first).
 * @param maxAgeDays Terminal runs last updated longer ago than this are removed.
 */
function cleanupTerminalRuns(
  runsRoot: string,
  maxKeep: number = DEFAULT_MAX_KEPT_TERMINAL,
  maxAgeDays: number = DEFAULT_MAX_AGE_DAYS,
): void {
  const now = Date.now();
  if (now - lastCleanupAt < CLEANUP_INTERVAL_MS) return;
  lastCleanupAt = now;

  const maxAgeMs = maxAgeDays * 86_400_000;
  const toRemove: RunIndexEntry[] = [];

  withLock(indexLockPath(runsRoot), () => {
    const terminal: RunIndexEntry[] = [];
    const active: RunIndexEntry[] = [];

    for (const e of readIndex(runsRoot)) {
      if (e.status === "completed" || e.status === "failed") {
        terminal.push(e);
      } else {
        active.push(e);
      }
    }

    // Sort terminal by updatedAt desc (newest first) so the count cap keeps the newest.
    terminal.sort((a, b) => b.updatedAt - a.updatedAt);

    // Single pass: partition into kept / removed instead of a quadratic includes() filter.
    const kept: RunIndexEntry[] = [];
    terminal.forEach((e, i) => {
      const expiredByAge = now - e.updatedAt > maxAgeMs;
      const excessByCount = i >= maxKeep;
      if (expiredByAge || excessByCount) {
        toRemove.push(e);
      } else {
        kept.push(e);
      }
    });

    if (toRemove.length === 0) return;

    // Commit the pruned index while holding the lock so a concurrent
    // updateIndexEntry cannot interleave and lose entries.
    writeIndex(runsRoot, [...active, ...kept]);
  });

  if (toRemove.length === 0) return;

  // Delete run files + lock files (outside the index lock).
  const candidateDirs = new Set<string>();
  for (const e of toRemove) {
    const filePath = path.join(runsRoot, e.relPath);

    // Never unlink anything that does not sit strictly inside runsRoot.
    const rel = path.relative(runsRoot, filePath);
    if (rel === "" || rel === ".." || rel.startsWith(`..${path.sep}`) || path.isAbsolute(rel)) {
      continue;
    }

    try { fs.unlinkSync(filePath); } catch { /* already gone */ }
    // Also remove any orphaned lock file.
    try { fs.unlinkSync(filePath + ".lock"); } catch { /* ignore */ }

    // Legacy flat files live directly in runsRoot; that directory must stay.
    const dirPath = path.dirname(filePath);
    if (path.relative(runsRoot, dirPath) !== "") candidateDirs.add(dirPath);
  }

  // Remove flow subdirectories that are now empty (each attempted once).
  for (const dirPath of candidateDirs) {
    try { fs.rmdirSync(dirPath); } catch { /* ENOTEMPTY or ENOENT — ignore */ }
  }
}
|
|
469
|
+
|
|
470
|
+
// ---------------------------------------------------------------------------
|
|
471
|
+
// Original helpers (unchanged)
|
|
472
|
+
// ---------------------------------------------------------------------------
|
|
473
|
+
|
|
69
474
|
/** Path of the "taskflows" directory located inside the directory returned by getAgentDir(). */
function userFlowsDir(): string {
  const agentDir = getAgentDir();
  return path.join(agentDir, "taskflows");
}
|
|
@@ -160,90 +565,172 @@ export function newRunId(flowName: string): string {
|
|
|
160
565
|
return `${safe}-${Date.now().toString(36)}-${crypto.randomBytes(3).toString("hex")}`;
|
|
161
566
|
}
|
|
162
567
|
|
|
568
|
+
/**
 * Persist a run state to disk.
 *
 * v0.0.9: the state is written to `runs/<sanitisedFlowName>/<runId>.json`
 * (per-flow subdirectory) and the lightweight index is updated to match. Both
 * steps happen under a per-run file lock so concurrent writes to the same
 * runId cannot interleave. Once the lock is released, an opportunistic cleanup
 * of expired terminal runs is triggered.
 *
 * F-009: `updatedAt` is stamped on a shallow clone, so the caller's RunState
 * reference is not mutated as a hidden side effect. A shallow clone suffices
 * because saveRun only serializes; it does not mutate nested objects.
 *
 * @param state Run state to persist; `cwd`, `flowName` and `runId` determine the target path.
 */
export function saveRun(state: RunState): void {
  const root = runsDir(state.cwd);
  const flowDir = flowRunDir(root, state.flowName);
  fs.mkdirSync(flowDir, { recursive: true });

  const snapshot = { ...state, updatedAt: Date.now() };

  const filePath = runFilePath(root, state.flowName, state.runId);
  const lockPath = lockPathForRun(root, state.flowName, state.runId);
  // Index paths are stored relative to the runs root, always with a forward slash.
  const relPath = `${path.basename(flowDir)}/${path.basename(filePath)}`;

  withLock(lockPath, () => {
    const payload = JSON.stringify(snapshot, null, 2);
    writeFileAtomic(filePath, payload);
    updateIndexEntry(root, extractIndexEntry(snapshot, relPath));
  });

  // Opportunistic cleanup — throttled to once per CLEANUP_INTERVAL_MS.
  cleanupTerminalRuns(root);
}
|
|
172
599
|
|
|
600
|
+
/**
 * Load a single run by runId, or return null when it cannot be found.
 *
 * Lookup chain (fast → slow), stopping at the first hit:
 *   1. INDEX — find the runId in index.json and read the file at its relPath.
 *   2. SUBDIR SCAN — probe <subdir>/<runId>.json in every subdirectory of the runs dir.
 *   3. FLAT FALLBACK — probe <runsDir>/<runId>.json (legacy layout).
 *
 * Every candidate path is read through tryReadRunFile, which applies the
 * path-traversal and symlink/realpath guards. A runId rejected by
 * validateRunId yields null without touching the filesystem.
 *
 * @param cwd Working directory used to locate the runs directory.
 * @param runId Identifier of the run to load.
 */
export function loadRun(cwd: string, runId: string): RunState | null {
  if (!validateRunId(runId)) return null;

  const root = runsDir(cwd);
  const fileName = `${runId}.json`;

  // ---- 1. Index ----
  const indexed = readIndex(root).find((candidate) => candidate.runId === runId);
  if (indexed) {
    const viaIndex = tryReadRunFile(root, path.join(root, indexed.relPath));
    if (viaIndex) return viaIndex;
    // Stale index entry (file gone or corrupt) — keep looking.
  }

  // ---- 2. Subdirectory scan ----
  let subdirs: string[] = [];
  try {
    for (const dirent of fs.readdirSync(root, { withFileTypes: true })) {
      if (dirent.isDirectory()) subdirs.push(dirent.name);
    }
  } catch {
    subdirs = [];
  }

  for (const subdir of subdirs) {
    const viaSubdir = tryReadRunFile(root, path.join(root, subdir, fileName));
    if (viaSubdir) return viaSubdir;
  }

  // ---- 3. Legacy flat fallback ----
  return tryReadRunFile(root, path.join(root, fileName)) || null;
}
|
|
647
|
+
|
|
648
|
+
/**
 * Safely read a run file, performing all path-traversal / symlink guards.
 *
 * Checks, in order:
 *   1. Lexical: `filePath` must not resolve outside `runsRoot`.
 *   2. Physical: after resolving symlinks on both `runsRoot` and the file, the
 *      real file must still be inside the real runs directory. Resolving both
 *      sides keeps the comparison consistent on systems where the directory
 *      prefix is itself a symlink (macOS /var → /private/var etc.).
 *   3. Content: the file must parse as JSON and the top-level value must be a
 *      plain object; scalars, arrays and `null` are not run states.
 *
 * @param runsRoot Directory that run files must live under.
 * @param filePath Candidate run file path.
 * @returns The parsed run state, or null on any violation or read/parse error.
 */
function tryReadRunFile(runsRoot: string, filePath: string): RunState | null {
  // True when a path.relative() result points outside its base directory.
  const escapes = (rel: string): boolean =>
    rel === ".." || rel.startsWith(`..${path.sep}`) || path.isAbsolute(rel);

  // Lexical traversal guard.
  if (escapes(path.relative(runsRoot, filePath))) return null;

  // Physical containment guard; realpathSync throws when either path is missing.
  let realDir: string;
  let realFilePath: string;
  try {
    realDir = fs.realpathSync(runsRoot);
    realFilePath = fs.realpathSync(filePath);
  } catch { return null; }

  if (escapes(path.relative(realDir, realFilePath))) return null;

  try {
    const parsed: unknown = JSON.parse(fs.readFileSync(realFilePath, "utf-8"));
    // Valid JSON is not necessarily a run state: reject anything that is not an object.
    if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) return null;
    return parsed as RunState;
  } catch { return null; }
}
|
|
221
674
|
|
|
675
|
+
/**
 * List recent runs, sorted by updatedAt descending.
 *
 * v0.0.9: reads from the index first, then merges any legacy flat files not
 * yet in the index. If the index yields no entries (missing/corrupt/empty),
 * rebuildIndex is called for self-healing.
 *
 * Every run file is read through tryReadRunFile, so the same path-traversal
 * and symlink guards that loadRun applies also cover paths taken from the
 * on-disk index and from the flat directory listing.
 *
 * F-010: a non-numeric/NaN updatedAt can never reach the sort comparator.
 * Index entries with such a timestamp are ordered last, and loaded run states
 * with such a timestamp are dropped.
 *
 * Entries are consumed in sorted order until `limit` readable runs have been
 * collected, so a stale index entry (file deleted or corrupt) does not shrink
 * the result while older valid runs exist.
 *
 * @param cwd Working directory used to locate the runs directory.
 * @param limit Maximum number of runs to return (default 20).
 * @returns Up to `limit` run states, newest first.
 */
export function listRuns(cwd: string, limit = 20): RunState[] {
  const root = runsDir(cwd);
  if (!fs.existsSync(root)) return [];

  // Index-first path.
  let entries = readIndex(root);
  if (entries.length === 0) {
    // Index missing or corrupt — rebuild from filesystem.
    entries = rebuildIndex(root);
  }

  // Collect runIds from index for deduplication.
  const indexRunIds = new Set(entries.map((e) => e.runId));

  // Merge legacy flat files not yet in the index.
  let flatFiles: string[];
  try {
    flatFiles = fs.readdirSync(root).filter(
      (f) => f.endsWith(".json") && f !== "index.json" && !f.includes(".lock"),
    );
  } catch { flatFiles = []; }

  for (const file of flatFiles) {
    const runIdFromName = file.replace(/\.json$/, "");
    if (indexRunIds.has(runIdFromName)) continue;
    // tryReadRunFile returns null for unreadable, corrupt or escaping files.
    const state = tryReadRunFile(root, path.join(root, file));
    if (state && typeof state.runId === "string" && !indexRunIds.has(state.runId)) {
      entries.push(extractIndexEntry(state, file));
      indexRunIds.add(state.runId);
    }
  }

  const isValidTimestamp = (value: unknown): value is number =>
    typeof value === "number" && !Number.isNaN(value);

  // Entries without a usable timestamp sort last instead of poisoning the comparator.
  const sortKey = (e: RunIndexEntry): number =>
    isValidTimestamp(e.updatedAt) ? e.updatedAt : Number.NEGATIVE_INFINITY;

  const ordered = [...entries].sort((a, b) => {
    const ka = sortKey(a);
    const kb = sortKey(b);
    if (ka === kb) return 0;
    return kb > ka ? 1 : -1;
  });

  // Read full RunState for each entry until the limit is reached.
  const runs: RunState[] = [];
  for (const e of ordered) {
    if (runs.length >= limit) break;
    const state = tryReadRunFile(root, path.join(root, e.relPath));
    // File may have been deleted since the index was built — skip.
    if (!state) continue;
    // F-010: drop records with non-numeric/NaN updatedAt.
    if (!isValidTimestamp(state.updatedAt)) continue;
    runs.push(state);
  }

  return runs;
}
|
|
248
735
|
|
|
249
736
|
/** Stable hash of a phase's resolved task + inputs, for resume caching. */
|
|
@@ -257,6 +744,8 @@ export function hashInput(...parts: string[]): string {
|
|
|
257
744
|
* a crash or concurrent write from leaving a half-written, corrupt JSON file.
|
|
258
745
|
*/
|
|
259
746
|
function writeFileAtomic(filePath: string, data: string): void {
|
|
747
|
+
// Ensure parent directory exists.
|
|
748
|
+
fs.mkdirSync(path.dirname(filePath), { recursive: true });
|
|
260
749
|
const tmp = `${filePath}.${process.pid}.${crypto.randomBytes(4).toString("hex")}.tmp`;
|
|
261
750
|
try {
|
|
262
751
|
fs.writeFileSync(tmp, data, "utf-8");
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "pi-taskflow",
|
|
3
|
-
"version": "0.0.
|
|
3
|
+
"version": "0.0.10",
|
|
4
4
|
"description": "Lightweight workflow orchestration for the Pi coding agent — declarative multi-phase taskflows with dynamic fan-out, isolated subagent context, resumable runs, and saveable commands.",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"pi-package",
|
package/skills/taskflow/SKILL.md
CHANGED
|
@@ -106,7 +106,7 @@ routing. Use `join: "any"` on the merge phase so it runs whichever branch fired:
|
|
|
106
106
|
{ "id": "triage", "type": "agent", "agent": "analyst", "output": "json",
|
|
107
107
|
"task": "Classify the task. Output ONLY {\"route\":\"deep\"} or {\"route\":\"quick\"}." },
|
|
108
108
|
{ "id": "deep", "when": "{steps.triage.json.route} == deep", "dependsOn": ["triage"], "agent": "analyst", "task": "..." },
|
|
109
|
-
{ "id": "quick", "when": "{steps.triage.json.route} == quick", "dependsOn": ["triage"], "agent": "
|
|
109
|
+
{ "id": "quick", "when": "{steps.triage.json.route} == quick", "dependsOn": ["triage"], "agent": "executor-fast", "task": "..." },
|
|
110
110
|
{ "id": "report", "type": "reduce", "from": ["deep","quick"], "join": "any",
|
|
111
111
|
"dependsOn": ["deep","quick"], "agent": "writer", "task": "...", "final": true }
|
|
112
112
|
```
|