pi-crew 0.8.13 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. package/CHANGELOG.md +296 -0
  2. package/README.md +118 -2
  3. package/docs/FEATURE_INTAKE.md +1 -1
  4. package/docs/HARNESS.md +20 -19
  5. package/docs/PROJECT_REVIEW.md +132 -133
  6. package/docs/PROJECT_REVIEW_FIXES.md +130 -131
  7. package/docs/actions-reference.md +127 -121
  8. package/docs/architecture.md +1 -1
  9. package/docs/code-review-2026-05-11.md +134 -134
  10. package/docs/commands-reference.md +108 -106
  11. package/docs/comparison-pi-subagents-vs-pi-crew.md +105 -105
  12. package/docs/deep-review-report.md +1 -1
  13. package/docs/dynamic-workflows.md +90 -0
  14. package/docs/fixes/BATCH_A_H1_H2.md +17 -17
  15. package/docs/fixes/bug-007-async-notifier-stale-ctx.md +23 -23
  16. package/docs/followup-plan-2026-05-12.md +135 -135
  17. package/docs/followup-review-2026-05-12.md +86 -86
  18. package/docs/followup-review-round3-2026-05-12.md +123 -123
  19. package/docs/goals.md +59 -0
  20. package/docs/implementation-plan-top3.md +4 -4
  21. package/docs/issue-29-analysis.md +2 -2
  22. package/docs/oh-my-pi-research.md +154 -154
  23. package/docs/optimization-plan.md +2 -0
  24. package/docs/perf/baseline-2026-05.md +9 -9
  25. package/docs/perf/final-report-2026-05.md +2 -2
  26. package/docs/perf/sprint-1-report.md +2 -2
  27. package/docs/perf/sprint-2-report.md +1 -1
  28. package/docs/perf/upgrade-plan-2026-05.md +72 -72
  29. package/docs/pi-crew-bugs.md +230 -230
  30. package/docs/pi-crew-investigation-report.md +102 -102
  31. package/docs/pi-crew-test-round5.md +4 -4
  32. package/docs/runtime-analysis-child-vs-live.md +57 -57
  33. package/docs/runtime-migration-in-process-analysis.md +97 -97
  34. package/install.mjs +3 -2
  35. package/package.json +2 -4
  36. package/skills/orchestration/SKILL.md +11 -11
  37. package/src/agents/agent-config.ts +4 -0
  38. package/src/config/config.ts +39 -0
  39. package/src/config/types.ts +11 -0
  40. package/src/extension/action-suggestions.ts +2 -1
  41. package/src/extension/async-notifier.ts +10 -0
  42. package/src/extension/help.ts +14 -0
  43. package/src/extension/project-init.ts +7 -20
  44. package/src/extension/registration/commands.ts +27 -0
  45. package/src/extension/team-tool/destructive-gate.ts +1 -1
  46. package/src/extension/team-tool/goal-wrap.ts +288 -0
  47. package/src/extension/team-tool/goal.ts +405 -0
  48. package/src/extension/team-tool/run.ts +103 -4
  49. package/src/extension/team-tool/workflow-manage.ts +194 -0
  50. package/src/extension/team-tool.ts +20 -0
  51. package/src/hooks/types.ts +3 -1
  52. package/src/runtime/async-runner.ts +24 -2
  53. package/src/runtime/background-runner.ts +68 -19
  54. package/src/runtime/child-pi.ts +6 -1
  55. package/src/runtime/completion-guard.ts +1 -1
  56. package/src/runtime/dynamic-workflow-context.ts +450 -0
  57. package/src/runtime/dynamic-workflow-runner.ts +180 -0
  58. package/src/runtime/global-worker-cap.ts +96 -0
  59. package/src/runtime/goal-evaluator.ts +294 -0
  60. package/src/runtime/goal-loop-runner.ts +612 -0
  61. package/src/runtime/goal-state-store.ts +209 -0
  62. package/src/runtime/pi-args.ts +10 -2
  63. package/src/runtime/result-extractor.ts +32 -0
  64. package/src/runtime/team-runner.ts +11 -1
  65. package/src/runtime/verification-gates.ts +85 -5
  66. package/src/runtime/verification-integrity.ts +110 -0
  67. package/src/runtime/verification-worktree.ts +136 -0
  68. package/src/runtime/workspace-lock.ts +448 -0
  69. package/src/schema/config-schema.ts +26 -0
  70. package/src/schema/team-tool-schema.ts +39 -4
  71. package/src/state/atomic-write.ts +9 -0
  72. package/src/state/contracts.ts +14 -0
  73. package/src/state/crew-init.ts +18 -5
  74. package/src/state/event-log.ts +7 -1
  75. package/src/state/state-store.ts +2 -0
  76. package/src/state/types.ts +82 -0
  77. package/src/state/worker-atomic-writer.ts +176 -0
  78. package/src/utils/redaction.ts +104 -24
  79. package/src/workflows/discover-workflows.ts +25 -1
  80. package/src/workflows/workflow-config.ts +13 -0
  81. package/teams/parallel-research.team.md +1 -1
  82. package/workflows/examples/hello.dwf.ts +24 -0
@@ -0,0 +1,450 @@
1
+ /**
2
+ * dynamic-workflow-context.ts — WorkflowCtx facade for dynamic-workflow scripts (P2).
3
+ *
4
+ * Spec: research-findings/goal-workflow/00-SPEC.md §3.2
5
+ * Plan: 07-PLAN.md v3 P2 + §0b G4 + §0c C4/C5/C7.
6
+ *
7
+ * The `ctx` object passed to a `.dwf.ts` script's `export default async function(ctx)`.
8
+ * Capability-locked: exposes ONLY the documented methods (no raw manifest/process/require).
9
+ * The script host (dynamic-workflow-runner.ts) loads the script via jiti in plain module
10
+ * scope with a FROZEN WorkflowCtx. v1 has NO vm sandbox (review H-2): the script CAN
11
+ * reach `process`/`require`/`import` directly — the frozen ctx is a contract surface,
12
+ * not a security boundary. `.dwf.ts` = postinstall-equivalent trust. isolated-vm v1.5.
13
+ *
14
+ * `agent()` resolution (§0b G4): 4-tier precedence
15
+ * 1. opts.agent (explicit name) — bypasses team lookup
16
+ * 2. team.roles.find(r => r.name === role)?.agent → allAgents lookup
17
+ * 3. allAgents(discoverAgents(cwd)).find(a => a.name === role) (role name == agent name)
18
+ * 4. synthesize minimal AgentConfig (source:"dynamic", systemPrompt:"You are {role}.")
19
+ *
20
+ * Isolation (§0b G3 / report 05 §C.4): worker output → artifact file; `agent()` returns
21
+ * structured data + writes a side artifact. The script holds results in JS vars; only
22
+ * `setResult()` reaches the main context.
23
+ */
24
+
25
+ import { runChildPi } from "./child-pi.ts";
26
+ import { parsePiJsonOutput } from "./pi-json-output.ts";
27
+ import { extractStructuredResult } from "./result-extractor.ts";
28
+ import { mapConcurrent } from "./parallel-utils.ts";
29
+ import { Semaphore } from "./semaphore.ts";
30
+ import { executeWithRetry } from "./retry-executor.ts";
31
+ import { allAgents, discoverAgents } from "../agents/discover-agents.ts";
32
+ import { writeArtifact } from "../state/artifact-store.ts";
33
+ import { appendMailboxMessage, readMailbox } from "../state/mailbox.ts";
34
+ import { renderPlanTemplate } from "./plan-templates.ts";
35
+ import { logInternalError } from "../utils/internal-error.ts";
36
+ import { randomBytes } from "node:crypto";
37
+ import type { AgentConfig } from "../agents/agent-config.ts";
38
+ import type { TeamConfig } from "../teams/team-config.ts";
39
+ import type { TeamRunManifest } from "../state/types.ts";
40
+
41
/** Per-call options accepted by `ctx.agent()`. */
export interface AgentCallOpts {
  /** The task text; becomes the body of the prompt sent to the child agent. */
  prompt: string;
  /** Role name, resolved to an agent through the G4 4-tier chain (may equal an agent name). */
  role?: string;
  /** Explicit agent name; tried first (tier 1), before any team-role lookup. */
  agent?: string;
  /** Free-form description of the call. */
  description?: string;
  /** Model for this call; wins over the ctx-level override and the agent's own model. */
  model?: string;
  /** Skill names for the call, or `false`. */
  skill?: false | string[];
  /** Turn limit forwarded to the child process. */
  maxTurns?: number;
  /** Grace-turn allowance forwarded to the child process. */
  graceTurns?: number;
  /** Dependency artifact paths; appended to the prompt as an "Inputs" list. */
  inputs?: string[];
  /**
   * Turn off ALL tools for this call (Pi `--no-tools`, §0c C6). Intended for
   * pure-judgment / verdict steps — such as the JSON-verdict call made by
   * `ctx.review()` — where the agent should answer directly. If left unset, the
   * role's tools (read/grep/bash) stay available and the model may keep exploring
   * instead of answering.
   */
  disableTools?: boolean;
  /**
   * Replacement system prompt. When set, the resolved agent's `systemPrompt` is
   * replaced entirely. Useful when the call needs a different persona or output
   * format than the role's agent defines — e.g. `ctx.review()` wants a JSON-verdict
   * judge while the user's reviewer.md agent is a markdown code-reviewer.
   */
  systemPrompt?: string;
}
65
+
66
/** Outcome of one `ctx.agent()` call. */
export interface AgentResult {
  /** `false` when the child exited non-zero, reported an error, or the call threw. */
  ok: boolean;
  /** Final text extracted from the agent's output; empty string on failure. */
  text: string;
  /** Structured payload extracted from `text`, when one was found. */
  structured?: unknown;
  /** Usage counters as reported by the parsed child output. */
  usage?: { input?: number; output?: number; cost?: number; turns?: number };
  /** Run identifier, when the producer supplies one. */
  runId?: string;
  /** Task identifier, when the producer supplies one. */
  taskId?: string;
  /** Path of the side artifact holding `text` (written on success). */
  artifactPath?: string;
  /** Failure description; set when `ok` is `false`. */
  error?: string;
  /** Wall-clock duration of the call in milliseconds. */
  durationMs?: number;
}
77
+
78
/** The `ctx` facade handed to a dynamic-workflow script's default export. */
export interface WorkflowCtx {
  /** Working directory of the run. */
  cwd: string;
  /** Identifier of the run. */
  runId: string;
  /** Goal text of the run, if any. */
  goal?: string;
  /** Run a single agent and await its result; concurrency is bounded by `ctx.semaphore`. */
  agent(opts: AgentCallOpts): Promise<AgentResult>;
  /** Order-preserving bounded fan-out over `items` (delegates to mapConcurrent). */
  fanOut<T>(items: T[], limit: number, fn: (item: T, i: number) => Promise<AgentResult>): Promise<AgentResult[]>;
  /** Have a reviewer agent judge a task's work and return a parsed {outcome, feedback} verdict. §3.2. */
  review(taskId: string, reviewerRole?: string, opts?: { content?: string; artifactPath?: string; disableTools?: boolean }): Promise<{ outcome: "accept" | "reject" | "changes_requested"; feedback: string }>;
  /** Re-run a task, optionally passing reviewer feedback (delegates to executeWithRetry). */
  retry(taskId: string, opts?: { feedback?: string }): Promise<AgentResult>;
  /** Post a mailbox message to another agent/leader; returns the new message's id. */
  mail(to: string, body: string, opts?: { kind?: string; taskId?: string; replyTo?: string; replyDeadline?: number }): string;
  /** Wait for mailbox replies to the given message ids, or until the deadline passes (report 05 §G.4). */
  gatherReplies(messageIds: string[], deadlineMs: number): Promise<unknown[]>;
  /** Render a built-in plan template (full-implementation / standard-review). */
  renderTemplate(name: string, vars: Record<string, string>): unknown;
  /** Script-owned variable bag (revived intermediate-store). */
  vars: Record<string, unknown>;
  /** Record the final result. ONLY this artifact reaches the main context. */
  setResult(artifactPath: string, meta?: Record<string, unknown>): void;
  /** Semaphore that bounds concurrent `agent()` calls. */
  semaphore: Semaphore;
  /** Abort signal (cancel/stop). */
  signal: AbortSignal;
}
104
+
105
/** Construction options for `makeWorkflowCtx`. */
export interface MakeWorkflowCtxOptions {
  /** Maximum concurrent `agent()` calls; defaults to 4 and is clamped to at least 1. */
  concurrency?: number;
  /** Abort signal exposed on the ctx and forwarded to every child agent. */
  signal: AbortSignal;
  /** Team whose roles are consulted when resolving a role name to an agent. */
  team?: TeamConfig;
  /** Model used when a call names none; takes precedence over the agent's own model. */
  modelOverride?: string;
}
111
+
112
/**
 * Resolve a role/agent name to a full AgentConfig (§0b G4 4-tier precedence).
 * Module-local — NOT promoted to a shared module (keeps P2 isolated from the
 * load-bearing team-runner path).
 *
 * Tiers, first hit wins:
 *  1. `opts.explicitAgent` matches a discovered agent name.
 *  2. `opts.team` has a role named `roleName` whose `agent` matches a discovered agent.
 *  3. `roleName` itself matches a discovered agent name.
 *  4. A minimal config is synthesized for `explicitAgent ?? roleName ?? "executor"`.
 *
 * Note that an explicit agent name that is NOT found does not jump straight to
 * tier 4: tiers 2 and 3 are still tried with `roleName` first.
 *
 * @param roleName Role to resolve; may be undefined when only an explicit agent is given.
 * @param opts.explicitAgent Agent name to try before any role lookup.
 * @param opts.team Team whose roles map role names to agent names.
 * @param opts.cwd Directory passed to agent discovery.
 * @returns The discovered agent config, or a synthesized one when nothing matches.
 */
export function resolveAgentForRole(
  roleName: string | undefined,
  opts: { explicitAgent?: string; team?: TeamConfig; cwd: string },
): AgentConfig {
  // Agent discovery is run at most once per call, and only if some tier actually
  // needs it (the original re-ran it for every tier that was reached).
  let discovered: ReturnType<typeof allAgents> | undefined;
  const findAgentNamed = (agentName: string | undefined): AgentConfig | undefined => {
    if (discovered === undefined) discovered = allAgents(discoverAgents(opts.cwd));
    return discovered.find((a) => a.name === agentName);
  };

  // Tier 1: explicit agent name. A miss falls through to the role-based tiers.
  if (opts.explicitAgent) {
    const explicit = findAgentNamed(opts.explicitAgent);
    if (explicit) return explicit;
  }
  // Tier 2: team.roles[].agent lookup.
  if (opts.team) {
    const teamRole = opts.team.roles.find((r) => r.name === roleName);
    if (teamRole) {
      const viaTeam = findAgentNamed(teamRole.agent);
      if (viaTeam) return viaTeam;
    }
  }
  // Tier 3: the role name doubles as an agent name.
  if (roleName) {
    const viaRoleName = findAgentNamed(roleName);
    if (viaRoleName) return viaRoleName;
  }
  // Tier 4: nothing matched — synthesize a minimal AgentConfig.
  return synthesizeAgentConfig(opts.explicitAgent ?? roleName ?? "executor");
}
145
+
146
/**
 * Build a bare-bones AgentConfig for a name that matched no discovered agent
 * (§0c C7: the source is "dynamic", not "synthetic").
 *
 * @param name Agent name; also interpolated into the description and system prompt.
 * @param model Optional model; stored as-is (the key is present even when undefined).
 * @returns A config with no tools and no inherited project context or skills.
 */
export function synthesizeAgentConfig(name: string, model?: string): AgentConfig {
  const synthesized: AgentConfig = {
    name,
    description: "Synthesized agent for dynamic workflow (" + name + ").",
    source: "dynamic",
    filePath: "<dynamic-workflow>",
    systemPrompt: "You are " + name + ".",
    model,
    tools: [],
    inheritProjectContext: false,
    inheritSkills: false,
  };
  return synthesized;
}
160
+
161
/** Verdict shape produced by `ctx.review()`. */
type ReviewVerdict = { outcome: "accept" | "reject" | "changes_requested"; feedback: string };

/**
 * Interpret an agent's structured output as a review verdict.
 * Returns undefined unless both `outcome` and `feedback` are strings; an
 * unrecognised outcome string is coerced to "changes_requested".
 */
function toReviewVerdict(structured: unknown): ReviewVerdict | undefined {
  if (!structured || typeof structured !== "object") return undefined;
  const { outcome, feedback } = structured as { outcome?: unknown; feedback?: unknown };
  if (typeof outcome !== "string" || typeof feedback !== "string") return undefined;
  if (outcome === "accept" || outcome === "reject" || outcome === "changes_requested") {
    return { outcome, feedback };
  }
  return { outcome: "changes_requested", feedback };
}

/**
 * Build the WorkflowCtx facade. Capability-locked: only documented methods exposed.
 *
 * @param manifest Run manifest; supplies cwd, runId, goal, artifactsRoot and the mailbox location.
 * @param opts Concurrency limit (default 4, minimum 1), abort signal, optional team and model override.
 * @returns The ctx object. The value passed to `setResult()` is readable only through
 *          `getWorkflowFinalResult()` (a non-enumerable getter on the ctx).
 */
export function makeWorkflowCtx(manifest: TeamRunManifest, opts: MakeWorkflowCtxOptions): WorkflowCtx {
  const concurrency = Math.max(1, opts.concurrency ?? 4);
  const semaphore = new Semaphore(concurrency);
  let finalResult: { artifactPath: string; meta?: Record<string, unknown> } | undefined;

  const ctx: WorkflowCtx = {
    cwd: manifest.cwd,
    runId: manifest.runId,
    goal: manifest.goal,
    signal: opts.signal,
    semaphore,

    /** Run one child agent under the semaphore. Never throws: failures come back as `ok: false`. */
    async agent(call: AgentCallOpts): Promise<AgentResult> {
      await semaphore.acquire();
      const started = Date.now();
      try {
        const agentConfig = resolveAgentForRole(call.role, {
          explicitAgent: call.agent,
          team: opts.team,
          cwd: manifest.cwd,
        });
        // §0c C6: per-call disableTools override. When set, force Pi `--no-tools` so the
        // agent answers directly without exploring. Applied AFTER role resolution so it
        // wins over any role-defined tools.
        let effectiveAgent = call.disableTools === true ? { ...agentConfig, disableTools: true, tools: [] } : agentConfig;
        // Per-call systemPrompt override (replaces the resolved agent's persona/output-format).
        // Used by ctx.review() to force a JSON-verdict judge instead of the role's markdown reviewer.
        if (call.systemPrompt !== undefined) {
          effectiveAgent = { ...effectiveAgent, systemPrompt: call.systemPrompt };
        }
        const task = composeAgentTask(call);
        const childResult = await runChildPi({
          cwd: manifest.cwd,
          task,
          agent: effectiveAgent,
          // Precedence: per-call model, then ctx-level override, then the agent's own model.
          model: call.model ?? opts.modelOverride ?? agentConfig.model,
          skillPaths: undefined, // skills resolved via agent config + team-role plumbing
          maxTurns: call.maxTurns,
          graceTurns: call.graceTurns,
          signal: opts.signal,
          artifactsRoot: manifest.artifactsRoot,
          runId: manifest.runId,
          role: call.role ?? call.agent,
        });
        if (childResult.exitCode !== 0 || childResult.error) {
          return { ok: false, text: "", error: childResult.error ?? `exit ${childResult.exitCode}`, durationMs: Date.now() - started };
        }
        const parsed = parsePiJsonOutput(childResult.stdout);
        let text = parsed.finalText ?? "";
        // Round-11 test fix: parsePiJsonOutput only extracts text from pi event stream
        // ({type:"message_end", message:{role:"assistant", content:[...]}}). When the
        // agent emits plain JSON, plain text, or a different format, finalText is empty.
        // Fallback to a more permissive extraction that handles multiple output shapes.
        if (!text.trim()) {
          text = extractTextFallback(childResult.stdout);
        }
        const extracted = extractStructuredResult(text);
        // Write a side artifact for audit/isolation (§0b G3).
        const rel = `wf/${Date.now()}-${randomBytes(4).toString("hex")}.md`;
        const artifact = writeArtifact(manifest.artifactsRoot, {
          kind: "result",
          relativePath: rel,
          content: text,
          producer: "dynamic-workflow",
        });
        return {
          ok: true,
          text,
          structured: extracted.structured ? extracted.data : undefined,
          usage: parsed.usage,
          artifactPath: artifact.path,
          durationMs: Date.now() - started,
        };
      } catch (error) {
        logInternalError("dynamic-workflow-context.agent", error, `runId=${manifest.runId}`);
        return { ok: false, text: "", error: error instanceof Error ? error.message : String(error), durationMs: Date.now() - started };
      } finally {
        semaphore.release();
      }
    },

    /** Bounded, order-preserving fan-out; `limit` is clamped to at least 1. */
    async fanOut<T>(items: T[], limit: number, fn: (item: T, i: number) => Promise<AgentResult>): Promise<AgentResult[]> {
      return mapConcurrent(items, Math.max(1, limit), fn);
    },

    /**
     * Obtain a verdict for a task's work. Tries, in order: (1) the reviewer's own
     * structured JSON, (2) a one-turn judge call that converts the reviewer's prose
     * to JSON, (3) keyword classification of the reviewer's prose. When the reviewer
     * produced no text at all, the verdict is "changes_requested".
     */
    async review(taskId: string, reviewerRole = "reviewer", reviewOpts?: { content?: string; artifactPath?: string; disableTools?: boolean }): Promise<ReviewVerdict> {
      // review() is a VERDICT step: it must produce a parseable JSON {outcome, feedback}, not a
      // free-form markdown review. The resolved reviewer agent (e.g. ~/.pi/agent/agents/reviewer.md)
      // has tools (read/grep/bash) + a markdown-output system prompt. Without disableTools, the
      // reviewer explores the repo looking for the task's work, loops, and gets killed (exit 143)
      // before producing JSON — leaving text="" and the fallback verdict. Default: disableTools so
      // the reviewer judges the provided content (or taskId context) directly.
      const disableTools = reviewOpts?.disableTools !== false; // default true
      const workContext = reviewOpts?.content
        ? `\n\nWork to review:\n"""\n${reviewOpts.content}\n"""`
        : reviewOpts?.artifactPath
          ? `\n\nRead the work from artifact: ${reviewOpts.artifactPath}`
          : "";
      const res = await ctx.agent({
        role: reviewerRole,
        prompt: `You are reviewing the work for task '${taskId}'.${workContext}\n\nEvaluate the work and respond with ONLY a single JSON object, no prose, no markdown:\n{"outcome":"accept|reject|changes_requested","feedback":"<one-paragraph explanation>"}\n\n- "accept": work is complete and correct.\n- "reject": work is fundamentally wrong.\n- "changes_requested": work needs revision (explain what in feedback).`,
        maxTurns: 3,
        disableTools,
        systemPrompt: "You are a JSON verdict judge. You output ONLY a single JSON object with keys \"outcome\" (one of accept/reject/changes_requested) and \"feedback\" (a concise explanation). Never output prose, markdown, or code fences. Begin your response with { and end with }.",
      });
      const direct = toReviewVerdict(res.structured);
      if (direct) return direct;

      // The reviewer produced no text at all (genuine failure): nothing to judge or classify.
      if (!res.text.trim()) {
        return { outcome: "changes_requested", feedback: res.text || "(reviewer produced no parseable verdict)" };
      }

      // Fallback (round-11 runtime): many models (e.g. MiniMax-M3) ignore JSON-output
      // instructions and produce a prose review instead. Rather than report an
      // unparseable verdict, run a tiny judge call that converts the prose review into a
      // JSON verdict.
      const judge = await ctx.agent({
        role: reviewerRole,
        prompt: `Convert the following code review into a verdict JSON. Read the review and decide the outcome.\n\nREVIEW:\n"""\n${res.text.slice(0, 4000)}\n"""\n\nRespond with ONLY a JSON object:\n{"outcome":"accept|reject|changes_requested","feedback":"<concise summary>"}\n- accept: review found no real issues.\n- reject: review found critical/fundamental problems.\n- changes_requested: review found issues that need fixing.`,
        maxTurns: 1,
        disableTools: true,
        systemPrompt: "You output ONLY a single JSON object with keys outcome and feedback. Begin with { and end with }. Never output prose.",
      });
      const judged = toReviewVerdict(judge.structured);
      if (judged) return judged;

      // Tier-3 sentiment fallback (round-11): neither the reviewer nor the judge produced
      // JSON, so classify the outcome from the REVIEWER's prose (not the judge's terse
      // output) because the original review is the richest sentiment signal.
      return { outcome: classifyReviewOutcome(res.text), feedback: res.text };
    },

    /** Re-run a task as the "executor" role, with optional feedback, through executeWithRetry. */
    async retry(taskId: string, retryOpts?: { feedback?: string }): Promise<AgentResult> {
      return executeWithRetry(
        async () => ctx.agent({
          role: "executor",
          prompt: `Re-do task '${taskId}'.${retryOpts?.feedback ? ` Feedback: ${retryOpts.feedback}` : ""}`,
        }),
        { maxAttempts: 3, backoffMs: 0, jitterRatio: 0, exponentialFactor: 1 },
      );
    },

    /** Append an outbox message from "dynamic-workflow" and return its id. */
    mail(to: string, body: string, mailOpts?: { kind?: string; taskId?: string; replyTo?: string; replyDeadline?: number }): string {
      const msg = appendMailboxMessage(manifest, {
        direction: "outbox",
        from: "dynamic-workflow",
        to,
        body,
        kind: (mailOpts?.kind as never) ?? "message",
        taskId: mailOpts?.taskId,
        replyTo: mailOpts?.replyTo,
        replyDeadline: mailOpts?.replyDeadline,
      });
      return msg.id;
    },

    /**
     * Poll the inbox until every id in `messageIds` has at least one reply, the
     * deadline (`deadlineMs` from now) passes, or the run is aborted. Always returns
     * the replies found by the most recent inbox read (possibly a partial set).
     */
    async gatherReplies(messageIds: string[], deadlineMs: number): Promise<unknown[]> {
      const deadline = Date.now() + deadlineMs;
      const wanted = new Set(messageIds);
      const readReplies = () =>
        readMailbox(manifest, "inbox").filter((m) => m.replyTo && wanted.has(m.replyTo));
      for (;;) {
        const replies = readReplies();
        // Completion is judged per DISTINCT answered message, so several replies to one
        // message cannot stand in for a missing reply to another.
        const answered = new Set(replies.map((m) => m.replyTo));
        if (answered.size >= wanted.size) return replies;
        const remaining = deadline - Date.now();
        // Abort is checked before sleeping, and the result is always a fresh read.
        if (remaining <= 0 || opts.signal.aborted) return replies;
        // Never sleep past the deadline.
        await new Promise((r) => setTimeout(r, Math.min(500, remaining)));
      }
    },

    /** Render a built-in plan template. */
    renderTemplate(name: string, vars: Record<string, string>): unknown {
      return renderPlanTemplate(name, vars);
    },

    vars: {} as Record<string, unknown>,

    /** Record the final result; a later call overwrites an earlier one. */
    setResult(artifactPath: string, meta?: Record<string, unknown>): void {
      finalResult = { artifactPath, meta };
    },
  };

  // Attach the final-result slot via a non-enumerable getter so the runner can read it
  // without exposing a mutation surface on the ctx the script sees.
  Object.defineProperty(ctx, "__finalResult", {
    get: () => finalResult,
    enumerable: false,
  });
  return ctx;
}
356
+
357
/**
 * Fetch whatever the script passed to `ctx.setResult()`.
 * Runner-only helper — `__finalResult` is deliberately absent from the public ctx surface.
 *
 * @returns The recorded result, or undefined when the ctx carries none.
 */
export function getWorkflowFinalResult(ctx: WorkflowCtx): { artifactPath: string; meta?: Record<string, unknown> } | undefined {
  type FinalResult = { artifactPath: string; meta?: Record<string, unknown> };
  const carrier = ctx as unknown as { __finalResult?: FinalResult };
  return carrier.__finalResult;
}
361
+
362
/**
 * Build the task text for a call: the prompt alone, or the prompt followed by an
 * "Inputs" section listing each dependency artifact path as a bullet.
 */
function composeAgentTask(call: AgentCallOpts): string {
  const paths = call.inputs;
  if (paths == null || paths.length === 0) {
    return call.prompt;
  }
  const bullets = paths.map((path) => "- " + path);
  return [call.prompt, "", "## Inputs (artifact paths)", bullets.join("\n")].join("\n");
}
368
+
369
/** Whole-word verdict keywords. Regex literals, so `\b` is a word boundary here. */
const REJECT_WORD = /\breject\b/;
const ACCEPT_WORD = /\baccept\b/;

/** Strong negative phrases (lower-case): fundamental/critical problems. */
const REJECT_PHRASES: readonly string[] = [
  "fundamentally", "completely broken", "totally broken",
  "critical bug", "critical issue", "critical flaw", "security vulnerability",
  "does not work", "doesn't work", "will not work", "fails to",
  "unacceptable", "must not be merged", "do not merge", "wrong approach",
  "logically incorrect", "incorrectly implements", "returns the opposite",
  "subtraction instead of addition", "opposite of its intended",
];

/** Explicit approval phrases (lower-case): no real issues found. */
const ACCEPT_PHRASES: readonly string[] = [
  "looks good", "well done", "no issues", "no real issues",
  "no problems", "no concerns", "nothing to change", "ready to merge",
  "lgtm", "ship it", "correctly implements", "correctly returns",
  "works as expected", "works correctly", "no bugs", "no defects",
  "meets all requirements", "all requirements met", "passes all",
  "is correct", "are correct", "no changes needed", "no changes required",
  "no further changes", "nothing more to", "complete and correct", "sound implementation",
];

/**
 * Classify a review outcome from prose when no JSON was produced (round-11 tier-3/4 fallback).
 * The prose is lower-cased and scanned for sentiment signals.
 *
 * Decision order: reject (critical issues) → accept (explicit approval) → changes_requested (default).
 * reject is checked first because a review can mention both "correctly" (describing existing code)
 * AND "critical bug" (the verdict) — the verdict signal must win.
 *
 * The bare words "reject" and "accept" are matched as whole words only. Previously
 * they were written as the string literals "\breject\b" / "\baccept\b", where `\b`
 * is the backspace character rather than a regex word boundary, so they never matched.
 *
 * @param prose Free-form review text; matching is case-insensitive.
 * @returns "reject" if any reject signal is present, else "accept" if any accept
 *          signal is present, else "changes_requested".
 */
export function classifyReviewOutcome(prose: string): "accept" | "reject" | "changes_requested" {
  const text = prose.toLowerCase();
  const mentions = (word: RegExp, phrases: readonly string[]): boolean =>
    word.test(text) || phrases.some((phrase) => text.includes(phrase));

  if (mentions(REJECT_WORD, REJECT_PHRASES)) return "reject";
  if (mentions(ACCEPT_WORD, ACCEPT_PHRASES)) return "accept";
  return "changes_requested";
}
405
+
406
/**
 * Permissive text extraction for ctx.agent() (round-11 test fix), used when
 * parsePiJsonOutput yields no final text because the child's output is not the
 * canonical pi event stream.
 *
 * Strategy:
 *  1. Every line that starts with "{" and parses as JSON is walked recursively; each
 *     string found (trimmed, at least 2 chars, not starting with "{" or "[", not
 *     purely digits/dots) is a candidate.
 *  2. If that yields nothing, every plain line (trimmed, at least 3 chars, not
 *     starting with "{", "[" or "=") is a candidate.
 *  3. The longest candidate wins; among equally long ones, the earliest.
 *
 * @returns The chosen text, or "" when the input is blank or has no candidate.
 */
export function extractTextFallback(stdout: string): string {
  const body = stdout.trim();
  if (body === "") return "";
  const lines = body.split("\n").map((raw) => raw.trim());

  // Candidates are never empty, so "" doubles as the "nothing found yet" marker.
  // Strict `>` keeps the earliest of equally long candidates.
  let longest = "";
  const offer = (candidate: string): void => {
    if (candidate.length > longest.length) longest = candidate;
  };

  const walk = (node: unknown): void => {
    if (typeof node === "string") {
      const s = node.trim();
      const looksLikeJson = s.startsWith("{") || s.startsWith("[");
      const isNumeric = /^[\d.]+$/.test(s);
      // Skip very short strings and JSON-ish / numeric strings.
      if (s.length >= 2 && !looksLikeJson && !isNumeric) offer(s);
      return;
    }
    if (Array.isArray(node)) {
      for (const element of node) walk(element);
      return;
    }
    if (node && typeof node === "object") {
      for (const child of Object.values(node as Record<string, unknown>)) walk(child);
    }
  };

  // Pass 1: JSON lines.
  for (const line of lines) {
    if (!line.startsWith("{")) continue;
    try {
      walk(JSON.parse(line));
    } catch { /* skip */ }
  }

  // Pass 2: plain text, only when the JSON pass found nothing.
  if (longest === "") {
    for (const line of lines) {
      const skip = line.startsWith("{") || line.startsWith("[") || line.startsWith("=");
      if (line.length >= 3 && !skip) offer(line);
    }
  }

  return longest;
}
@@ -0,0 +1,180 @@
1
+ /**
2
+ * dynamic-workflow-runner.ts — Script-driven workflow runtime (P2).
3
+ *
4
+ * Spec: research-findings/goal-workflow/00-SPEC.md §3.3
5
+ * Plan: 07-PLAN.md v3 P2 + §0c C5 (resolveRealContainedPath).
6
+ *
7
+ * Loads a `.dwf.ts` script's default export, transpiles it via jiti (the existing
8
+ * TS loader used by async-runner.ts), and executes it with a FROZEN WorkflowCtx.
9
+ *
10
+ * HONEST v1 TRUST MODEL (review H-2): vm.runInNewContext is NOT used in v1 — the
11
+ * script runs in plain module scope with full access to require/import/process.
12
+ * The 'capability-locked WorkflowCtx' is the documented contract surface, NOT a
13
+ * sandbox. A script can reach process/require via constructor walking or direct
14
+ * import. `.dwf.ts` files MUST be commit-reviewed (postinstall-equivalent trust).
15
+ * The path-allowlist (resolveRealContainedPath) limits WHERE scripts load from,
16
+ * not WHAT they can do. isolated-vm (real V8 isolate) is planned for v1.5.
17
+ * See docs/dynamic-workflows.md for the full threat model.
18
+ */
19
+
20
+ import { readFileSync } from "node:fs";
21
+ import { join } from "node:path";
22
+ import { resolveRealContainedPath } from "../utils/safe-paths.ts";
23
+ import { appendEvent } from "../state/event-log.ts";
24
+ import { writeArtifact } from "../state/artifact-store.ts";
25
+ import { logInternalError } from "../utils/internal-error.ts";
26
+ import { makeWorkflowCtx, getWorkflowFinalResult } from "./dynamic-workflow-context.ts";
27
+ import { projectCrewRoot, userPiRoot, packageRoot } from "../utils/paths.ts";
28
+ import type { DynamicWorkflowConfig } from "../workflows/workflow-config.ts";
29
+ import type { TeamRunManifest, TeamTaskState } from "../state/types.ts";
30
+
31
/** Everything runDynamicWorkflow needs to execute one `.dwf.ts` script. */
export interface RunDynamicWorkflowInput {
  /** Manifest of the run; supplies runId, cwd, eventsPath, artifactsRoot and artifacts. */
  manifest: TeamRunManifest;
  /** The dynamic workflow to execute (its name, filePath and maxConcurrency are read). */
  workflow: DynamicWorkflowConfig;
  /** Optional team for role resolution (G4 tier 2). */
  team?: import("../teams/team-config.ts").TeamConfig;
  /** Abort signal forwarded to the workflow context. */
  signal: AbortSignal;
  /** Concurrency override; when omitted, workflow.maxConcurrency and then 4 are used. */
  concurrency?: number;
  /** Optional model override forwarded to the workflow context. */
  modelOverride?: string;
}
40
+
41
/** Outcome of runDynamicWorkflow. */
export interface RunDynamicWorkflowResult {
  /** Copy of the input manifest updated with status, summary, updatedAt and the summary artifact. */
  manifest: TeamRunManifest;
  /** Task states for the run (runDynamicWorkflow currently returns an empty list). */
  tasks: TeamTaskState[];
}
45
+
46
/**
 * Shape required of a `.dwf.ts` default export: a function that receives the workflow
 * context and either completes synchronously or returns a promise.
 */
export type DynamicWorkflowScript = (
  ctx: import("./dynamic-workflow-context.ts").WorkflowCtx,
) => void | Promise<void>;
48
+
49
/**
 * Resolve + validate the script path against the allowlist of workflow dirs (§0c C5).
 *
 * @param workflow Workflow whose `filePath` points at the `.dwf.ts` script.
 * @param cwd Project working directory used to locate the project crew root.
 * @returns The first truthy path returned by resolveRealContainedPath for an allowed base.
 * @throws Error when the script is not contained in any allowed workflows directory.
 */
function resolveScriptPath(workflow: DynamicWorkflowConfig, cwd: string): string {
  // Allowlist: the script must resolve inside one of the workflow discovery dirs.
  // (discover-workflows.ts only reads from packageRoot/workflows, userPiRoot/workflows,
  // and projectCrewRoot/workflows — so the script already came from an allowed dir,
  // but we still validate containment to defeat symlink traversal.)
  // Fix round-5 P1: the round-4 P2-5 fix over-corrected to a SINGLE base (crewRoot/workflows).
  // But discoverWorkflows() reads from THREE dirs (builtin, user, project). Use the same bases
  // so user/builtin dynamic workflows aren't rejected.
  const allowedBases = [
    join(projectCrewRoot(cwd), "workflows"),
    join(userPiRoot(), "workflows"),
    join(packageRoot(), "workflows"),
  ];
  for (const base of allowedBases) {
    try {
      const real = resolveRealContainedPath(base, workflow.filePath);
      if (real) return real;
    } catch {
      // not contained in this base — try next
    }
  }
  // Not contained in any allowed base — refuse (do NOT fall back to the raw path).
  throw new Error(`Dynamic workflow '${workflow.filePath}' is outside the allowed workflows directories (${allowedBases.join(", ")}). Refusing to load.`);
}
78
+
79
/**
 * Load a `.dwf.ts` script through jiti (TS→JS on the fly) and hand back its entry point.
 *
 * jiti is resolved lazily via createRequire(import.meta.url): this keeps the module
 * importable where jiti is absent (type-only consumers) and works under the strip-types
 * loader fallback (Node ≥ 22.6), where a bare `require` is not defined in ESM scope.
 *
 * @param scriptPath Path of the script to load.
 * @returns The module's default export, or the module value itself when it has none.
 * @throws Error when the resolved export is not a function.
 */
async function loadWorkflowModule(scriptPath: string): Promise<DynamicWorkflowScript> {
  const nodeModule = await import("node:module");
  const nodeRequire = nodeModule.createRequire(import.meta.url);

  // The factory may be exposed as `.default` or as the module itself.
  const jitiExports = nodeRequire("jiti");
  const createJiti = jitiExports.default ?? jitiExports;
  const load = createJiti(import.meta.url, { interopDefault: true });

  const loaded = await load(scriptPath);
  const entry = (loaded as { default?: unknown }).default ?? loaded;
  if (typeof entry === "function") {
    return entry as DynamicWorkflowScript;
  }
  throw new Error(`Dynamic workflow '${scriptPath}' must export a default async function(ctx).`);
}
100
+
101
/**
 * Parse the hard script timeout from its environment-variable text.
 *
 * Missing, non-numeric, zero and negative values fall back to the 10 minute default.
 * Values above the largest delay setTimeout supports are clamped to that maximum;
 * without the clamp Node would coerce them to a 1ms timer and fail the script at once.
 */
function resolveScriptTimeoutMs(raw: string | undefined): number {
  const defaultMs = 600_000; // 10 min default
  const maxTimerDelayMs = 2_147_483_647; // 2^31 - 1, upper bound for setTimeout delays
  const parsed = Number.parseInt(raw ?? "", 10);
  if (!Number.isFinite(parsed) || parsed <= 0) return defaultMs;
  return Math.min(parsed, maxTimerDelayMs);
}

/**
 * Run the dynamic workflow script. Loads it, builds the ctx, executes, and returns
 * {manifest, tasks} with the manifest updated to a terminal status + result artifact.
 *
 * Events appended to the run's event log: `dwf.started`, then either `dwf.failed`
 * (followed by a re-throw) or `dwf.completed`.
 *
 * @param input Manifest, workflow and execution options for this run.
 * @returns A copy of the manifest marked "completed" with the summary artifact appended,
 *          and an empty task list.
 * @throws Error when the script path is outside the allowed directories (thrown before
 *         any event is appended), or when loading/executing the script fails or exceeds
 *         the hard timeout (PI_CREW_DWF_SCRIPT_TIMEOUT_MS, default 10 minutes).
 */
export async function runDynamicWorkflow(input: RunDynamicWorkflowInput): Promise<RunDynamicWorkflowResult> {
  const { manifest, workflow, signal } = input;
  const eventsPath = manifest.eventsPath;
  const scriptPath = resolveScriptPath(workflow, manifest.cwd);

  appendEvent(eventsPath, { type: "dwf.started", runId: manifest.runId, data: { workflow: workflow.name, script: scriptPath } });

  const ctx = makeWorkflowCtx(manifest, {
    concurrency: input.concurrency ?? workflow.maxConcurrency ?? 4,
    signal,
    team: input.team,
    modelOverride: input.modelOverride,
  });

  // Freeze the ctx so the script cannot add/override capability methods (§0c C4).
  const frozenCtx = Object.freeze(ctx);

  try {
    const script = await loadWorkflowModule(scriptPath);
    // Round-11 test fix (runtime): hard timeout on script execution.
    // Without this, scripts that spawn long-running child processes (e.g., `spawn("pi", ...)`)
    // hang forever. The ctx.signal.timeout is cooperative only — it fires AbortSignal,
    // it does NOT kill the script. Promise.race with a hard timeout at least returns an
    // error so the runner doesn't hang. The spawned child process is leaked, but the
    // dynamic-workflow returns failure promptly. (v1.5: use Worker threads to actually kill.)
    const scriptTimeoutMs = resolveScriptTimeoutMs(process.env.PI_CREW_DWF_SCRIPT_TIMEOUT_MS);
    let timeoutHandle: NodeJS.Timeout | undefined;
    const timeoutPromise = new Promise<never>((_, reject) => {
      timeoutHandle = setTimeout(() => {
        reject(new Error(`Dynamic workflow script timed out after ${scriptTimeoutMs}ms. The script may have spawned a child process that did not exit. Check for spawn/exec calls without proper stdio handling.`));
      }, scriptTimeoutMs);
      // Do not let the watchdog timer alone keep the process alive.
      timeoutHandle.unref?.();
    });
    try {
      await Promise.race([script(frozenCtx), timeoutPromise]);
    } finally {
      // Always clear the watchdog, whether the script finished, threw, or timed out.
      if (timeoutHandle) clearTimeout(timeoutHandle);
    }
  } catch (error) {
    logInternalError("dynamic-workflow-runner.run", error, `runId=${manifest.runId}, workflow=${workflow.name}`);
    appendEvent(eventsPath, { type: "dwf.failed", runId: manifest.runId, data: { error: error instanceof Error ? error.message : String(error) } });
    // Re-throw so background-runner's error handling marks the run failed.
    throw error;
  }

  const final = getWorkflowFinalResult(ctx);
  const finalText = final ? readFinalArtifact(final.artifactPath) : `(dynamic workflow '${workflow.name}' completed without calling ctx.setResult())`;

  // Write a summary artifact mirroring the static-workflow summary.md contract (run.ts reads this).
  const summary = writeArtifact(manifest.artifactsRoot, {
    kind: "result",
    relativePath: "summary.md",
    content: finalText,
    producer: "dynamic-workflow",
  });

  appendEvent(eventsPath, { type: "dwf.completed", runId: manifest.runId, data: { workflow: workflow.name, summaryArtifact: summary.path } });

  const updatedManifest: TeamRunManifest = {
    ...manifest,
    status: "completed",
    // The manifest carries only the first 2000 characters; the full text is in summary.md.
    summary: finalText.slice(0, 2000),
    updatedAt: new Date().toISOString(),
    artifacts: [...manifest.artifacts, summary],
  };
  return { manifest: updatedManifest, tasks: [] };
}
172
+
173
/**
 * Read the final-result artifact as UTF-8 text.
 *
 * Best effort: a read failure is logged through logInternalError and a placeholder
 * string naming the path is returned instead of throwing.
 *
 * @param artifactPath Path of the artifact file to read.
 * @returns The file contents, or a "(failed to read final artifact …)" placeholder.
 */
function readFinalArtifact(artifactPath: string): string {
  let content: string;
  try {
    content = readFileSync(artifactPath, "utf-8");
  } catch (err) {
    logInternalError("dynamic-workflow-runner.readFinal", err, `artifactPath=${artifactPath}`);
    content = `(failed to read final artifact ${artifactPath})`;
  }
  return content;
}