pi-crew 0.8.3 → 0.8.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,99 @@
1
1
  # Changelog
2
2
 
3
+ ## [0.8.5] — Per-write validator (T5) + validateWorkflowForTeam race note (2026-06-16)
4
+
5
+ Third APPLIED technique from the pi-ecosystem distillation (pi-lens /
6
+ apmantza — the "inline channel"). Adds real-time feedback on file
7
+ writes/edits: a CHEAP synchronous validator runs on every `write`/`edit`
8
+ tool result and appends a `🔴` blocker to the tool result on failure, so
9
+ malformed files are caught the moment they're written — not at the next
10
+ load.
11
+
12
+ ### Latency-safe v1 design (deliberate scope)
13
+
14
+ pi-lens runs LSP servers + linters per write. That is expensive and would
15
+ cause latency storms if naively ported (seconds of spawn per edit, firing in
16
+ the main session AND every worker). This v1 ships ONLY zero-cost, zero-spawn,
17
+ synchronous validators:
18
+
19
+ - **`json` → `JSON.parse`** (nanoseconds, built-in, no process spawn).
20
+
21
+ The registry is extensible — process-spawning validators (`.js` → `node
22
+ --check`, `.sh` → `bash -n`, `.py` → `py_compile`) are a FUTURE opt-in
23
+ (never default-on), and will need to be async + debounced (pi-lens's
24
+ `inFlightPipelines` / debounce-window pattern) when added.
25
+
26
+ ### Contract guarantees
27
+ - Synchronous. No `await`, no `spawn`, no disk write.
28
+ - One disk READ per validated file (after a cheap extension check, so
29
+ non-validated files cost nothing).
30
+ - Dedup by content: a path whose content is unchanged since its last check
31
+ is not re-validated or re-reported (bounded LRU of 256 paths).
32
+ - Silent on success; appends exactly one TextContent block on failure.
33
+ - Best-effort: any internal error is swallowed (never breaks a write).
34
+ - Toggle: `runtime.reliability.perWriteValidation` (default `true` → opt-out).
35
+
36
+ ### Files
37
+ - NEW `src/runtime/per-write-validator.ts` — `validateJson`, the extensible
38
+ `PerWriteValidator` registry, dedup cache, `validateWrittenFile`, and
39
+ `buildValidationBlocker`. Test seams: `setPerWriteValidatorsForTest`,
40
+ `resetPerWriteValidatorCache`.
41
+ - `src/config/types.ts` — `reliability.perWriteValidation?: boolean`.
42
+ - `src/extension/register.ts` — `pi.on("tool_result", ...)` handler for
43
+ `write`/`edit` (pi-crew previously subscribed only to `tool_call`).
44
+ - NEW `test/unit/t5-per-write-validator.test.ts` (15 tests).
45
+ - NEW `.github/issues/2026-06-16-validateworkflowf-team-cold-start-race.md` —
46
+ honest note that the `validateWorkflowForTeam` cold-start error (same
47
+ class as v0.8.1's `existsSync`) was NOT actually fixed by v0.8.1's latch
48
+ (that covered only the peer-dep namespace). Documents the corrected
49
+ root cause (tsx makes every named import a runtime namespace access) and
50
+ 4 candidate fixes for the later pass.
51
+
52
+ typecheck clean; full suite 0 failures.
53
+
54
+ ## [0.8.4] — cold-verifier agent (T9) (2026-06-16)
55
+
56
+ Second APPLIED technique from the pi-ecosystem distillation (piolium /
57
+ Vigolium — cold-verifier pattern). Adds a new builtin agent whose value is
58
+ **independence**: it re-derives claims from ground truth WITHOUT trusting
59
+ prior reviewer/verifier analysis, breaking the confirmation-bias drift the
60
+ chained `reviewer` → `verifier` path can introduce.
61
+
62
+ ### Why
63
+ piolium splits security verification across ~10 narrow agents, including a
64
+ `cold-verifier` whose prompt enforces file-access isolation ("MUST NOT read
65
+ any file other than the single finding draft"). pi-crew's default `verifier`
66
+ instead *correlates* findings against reviewer output ("Trust dependency
67
+ context") — efficient, but it inherits the reviewer's blind spots. There was
68
+ **no** adversarial cross-check agent (confirmed: zero agents reference
69
+ cold/isolation/unbiased semantics).
70
+
71
+ ### What
72
+ NEW builtin `cold-verifier` agent (`agents/cold-verifier.md`):
73
+ - Read-only + `bash` (runs tests fresh, reads its OWN output — never a
74
+ cached prior-worker log).
75
+ - Prompt-enforced isolation discipline: don't trust prior findings, treat
76
+ each as an *unverified hypothesis*, actively look for contradicting evidence.
77
+ - Distinct `COLD_VERIFICATION` output block with a `CLAIMS_REFUTED` field
78
+ (the highest-value output — inherited claims your independent check
79
+ contradicts).
80
+ - `maxTurns: 12` (tighter than verifier's 15 — it's a focused cross-check).
81
+
82
+ Use `verifier` for fast finding-correlation; use `cold-verifier` when the
83
+ cost of a wrong "PASS" is high (security changes, release gates, data-loss
84
+ paths). Both can run in the same workflow.
85
+
86
+ ### Files
87
+ - NEW `agents/cold-verifier.md` — the agent (auto-discovered).
88
+ - `src/agents/discover-agents.ts` — add `cold-verifier` to the SEC-001
89
+ `PROTECTED_AGENT_NAMES` blocklist (can't be shadowed by a dynamic reg).
90
+ - `src/ui/settings-overlay.ts` — add to the settings-overlay agent list.
91
+ - `test/unit/agent-discovery-cache.test.ts` — mirror the protected-names list.
92
+ - NEW `test/unit/t9-cold-verifier.test.ts` (5 tests): discovery, parse,
93
+ isolation-discipline content, SEC-001 protection, frontmatter shape.
94
+
95
+ typecheck clean; full suite 1905 ok / 0 fail.
96
+
3
97
  ## [0.8.3] — Terminal tab title + Ghostty native progress bar (T4) (2026-06-16)
4
98
 
5
99
  First APPLIED technique from the pi-ecosystem distillation (pi-status /
@@ -0,0 +1,66 @@
1
+ ---
2
+ name: cold-verifier
3
+ description: Independently re-verify findings WITHOUT trusting prior analysis — an unbiased cold check to catch confirmation bias the chained reviewer/verifier path can introduce
4
+ model: false
5
+ systemPromptMode: replace
6
+ inheritProjectContext: true
7
+ inheritSkills: false
8
+ tools: read, grep, find, ls, bash
9
+ maxTurns: 12
10
+ ---
11
+
12
+ You are a **cold verifier**. Your value is independence: you re-check claims against ground truth WITHOUT trusting the analysis that came before you. The chained `reviewer` → `verifier` path can drift into confirmation bias (each worker rationalizes the prior worker's framing). You break that loop by starting cold.
13
+
14
+ ## Isolation Rules (THE CORE DISCIPLINE)
15
+
16
+ Distilled from piolium's cold-verifier pattern: prompt-enforced file-access isolation layered on top of context isolation.
17
+
18
+ You **MUST NOT**:
19
+ - Read other workers' notes, debate transcripts, or `.crew/artifacts/.../results/*.txt` reasoning files.
20
+ - Read the reviewer's or verifier's finding drafts as if they were ground truth.
21
+ - Be primed by the goal framing beyond the literal acceptance criteria. Re-derive what "done" means from the spec, not from someone's summary of it.
22
+ - Start from the conclusion that the work is correct (or incorrect). Start from evidence.
23
+
24
+ You **MUST**:
25
+ - Re-derive each claim from the codebase + test output directly.
26
+ - Treat every inherited finding as an *unverified hypothesis* until you confirm it yourself.
27
+ - Actively look for evidence that *contradicts* the prior verdict, not just evidence that supports it.
28
+
29
+ ## Strategy
30
+
31
+ ### Turn 1: Establish ground truth independently
32
+ Run the test suite / build / lint fresh and read the *actual output*:
33
+ ```bash
34
+ npm test 2>&1 | tail -40
35
+ ```
36
+ Do NOT read a cached log from a prior worker — re-run and read your own output. If a prior worker claims "tests pass", confirm the green output yourself.
37
+
38
+ ### Turn 2-N: Verify each claim from source
39
+ For each claim in the task/goal, open the *actual source files* and confirm:
40
+ - Does the code do what's claimed?
41
+ - Do tests actually cover the claimed behavior (not just pass for unrelated reasons)?
42
+ - Is there a claim that is true *in isolation* but false *in context* (e.g. a function works but is never called, a check passes but the input is never reachable)?
43
+
44
+ Look specifically for:
45
+ - **False confirmations**: a prior worker said "verified" but the evidence is weaker than implied (e.g. a test passes but asserts the wrong thing).
46
+ - **Missing cases**: the prior analysis didn't consider an edge case, error path, or interaction.
47
+ - **Regressions masquerading as done**: the stated goal is met but a regression was introduced elsewhere.
48
+
49
+ ## What makes you different from `verifier`
50
+
51
+ The default `verifier` *correlates* findings against reviewer output ("Trust dependency context"). That's efficient but inherits the reviewer's blind spots. You are the **adversarial cross-check**: assume the prior verdict *might be wrong* and try to find where. Use `verifier` for fast correlation; use `cold-verifier` when the cost of a wrong "PASS" is high (security changes, release gates, data-loss paths).
52
+
53
+ ## Output Format
54
+
55
+ End with exactly this block:
56
+
57
+ ```
58
+ COLD_VERIFICATION: PASS|FAIL|INCONCLUSIVE
59
+ INDEPENDENT_TEST_RESULTS: X passed, Y failed, Z skipped (from your OWN run, not a cached log)
60
+ CLAIMS_CONFIRMED_INDEPENDENTLY: N/M inherited claims reproduced from source
61
+ CLAIMS_REFUTED: any inherited claim your independent check contradicts (highest-value output)
62
+ MISSING_COVERAGE: cases the prior analysis overlooked
63
+ EVIDENCE: file:line references + your own test output
64
+ ```
65
+
66
+ If you cannot refute a claim after honest effort, that is itself evidence the claim is solid — say so explicitly rather than inventing doubt.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "pi-crew",
3
- "version": "0.8.3",
3
+ "version": "0.8.5",
4
4
  "description": "Pi extension for coordinated AI teams, workflows, worktrees, and async task orchestration",
5
5
  "author": "baphuongna",
6
6
  "license": "MIT",
@@ -48,6 +48,7 @@ const PROTECTED_AGENT_NAMES = new Set([
48
48
  "critic",
49
49
  "reviewer",
50
50
  "verifier",
51
+ "cold-verifier", // T9 (v0.8.4): adversarial cold cross-check agent
51
52
  "writer",
52
53
  "security-reviewer",
53
54
  ]);
@@ -180,6 +180,15 @@ export interface CrewReliabilityConfig {
180
180
  cleanupOrphanedTempDirs?: boolean;
181
181
  /** Inject a compact ambient crew-status note into the agent's context on every LLM call while crew runs are in-flight, so the agent stays continuously aware of active runs without calling the `team` tool. No-op when no runs are active. Default: true. */
182
182
  ambientStatusInjection?: boolean;
183
+ /**
184
+ * Per-write validation (T5). On every `write`/`edit` tool result, run a
185
+ * zero-cost synchronous validator for the file type and append a `🔴`
186
+ * blocker to the tool result on failure (e.g. malformed JSON). v1 ships
187
+ * JSON only (`JSON.parse` — instant, no process spawn); process-spawning
188
+ * validators (.js/.sh/.py) are a future opt-in. Default: true (opt-out).
189
+ * Set to `false` to disable.
190
+ */
191
+ perWriteValidation?: boolean;
183
192
  /**
184
193
  * Opt-in model scope enforcement (F7). When true, subagent model choices
185
194
  * that fall outside the user's pi `enabledModels` allowlist are flagged:
@@ -82,6 +82,7 @@ import {
82
82
  import { RenderScheduler } from "../ui/render-scheduler.ts";
83
83
  import { runEventBus } from "../ui/run-event-bus.ts";
84
84
  import { createTerminalStatusController, type TerminalStatusController } from "../ui/terminal-status.ts";
85
+ import { extractPathFromInput, validateWrittenFile, buildValidationBlocker } from "../runtime/per-write-validator.ts";
85
86
  import { createRunSnapshotCache } from "../ui/run-snapshot-cache.ts";
86
87
  import { closeWatcher } from "../utils/fs-watch.ts";
87
88
  import { RunWatcherRegistry } from "../utils/run-watcher-registry.ts";
@@ -1986,6 +1987,27 @@ export function registerPiTeams(pi: ExtensionAPI): void {
1986
1987
  };
1987
1988
  });
1988
1989
 
1990
+ // T5 (v0.8.5): per-write validation. On write/edit, run a zero-cost
1991
+ // SYNCHRONOUS validator (v1: JSON.parse) and append a 🔴 blocker to the
1992
+ // tool result on failure — catches malformed config the moment it's
1993
+ // written, not at the next load. Latency-safe by construction: no process
1994
+ // spawn, one disk read ONLY for validated extensions, dedup'd by content.
1995
+ // Toggle via runtime.reliability.perWriteValidation (default true).
1996
+ // Process-spawning validators (.js/.sh/.py) are a future opt-in.
1997
+ pi.on("tool_result", (event, ctx) => {
1998
+ try {
1999
+ if (event.toolName !== "write" && event.toolName !== "edit") return;
2000
+ if (loadConfig(ctx.cwd).config.reliability?.perWriteValidation === false) return;
2001
+ const filePath = extractPathFromInput(event.input);
2002
+ if (!filePath) return;
2003
+ const result = validateWrittenFile(filePath);
2004
+ if (!result || result.ok) return;
2005
+ return { content: [...event.content, buildValidationBlocker(filePath, result.error ?? "validation failed")] };
2006
+ } catch {
2007
+ // best-effort: never break a tool result
2008
+ }
2009
+ });
2010
+
1989
2011
  registerTeamTool(pi, {
1990
2012
  foregroundControllers,
1991
2013
  startForegroundRun,
@@ -0,0 +1,183 @@
1
+ /**
2
+ * Per-write validator — real-time feedback on file writes/edits (T5).
3
+ *
4
+ * Distilled from pi-lens (apmantza) — the "inline channel": on every
5
+ * `write`/`edit` tool result, run a CHEAP synchronous validator for the file
6
+ * type and, on failure, append a `🔴` blocker block to the tool result the
7
+ * agent sees next. This catches silent-breaking errors (malformed JSON
8
+ * config) at the moment they're introduced instead of at the next load.
9
+ *
10
+ * CRITICAL LATENCY-SAFETY DESIGN (the reason this is a careful slice, not the
11
+ * full pi-lens pipeline): pi-lens runs LSP servers + linters per write. That
12
+ * is expensive and would cause latency storms if naively ported (seconds of
13
+ * spawn per edit, firing in the main session AND every worker). This module's
14
+ * v1 deliberately ships ONLY zero-cost, zero-spawn, synchronous validators:
15
+ *
16
+ * - `json` → `JSON.parse` (nanoseconds, built-in, no process spawn).
17
+ *
18
+ * The registry is extensible — future validators (`.js` → `node --check`,
19
+ * `.sh` → `bash -n`, `.py` → `py_compile`) are process-spawning and MUST be
20
+ * added behind an explicit opt-in (never default-on) to preserve the
21
+ * latency guarantee. A process-spawning validator would also need to be async
22
+ * and debounced (pi-lens's `inFlightPipelines` / debounce-window pattern),
23
+ * which the current sync contract intentionally avoids.
24
+ *
25
+ * Contract guarantees for v1:
26
+ * - Synchronous. No `await`, no `spawn`, no disk write.
27
+ * - One disk READ per validated file (after a cheap extension check, so
28
+ * non-validated files cost nothing).
29
+ * - Dedup by content: a path whose content is unchanged since its last
30
+ * check is skipped (a repeated identical write doesn't re-report).
31
+ * - Silent on success; appends exactly one TextContent block on failure.
32
+ * - Best-effort: any internal error is swallowed (never breaks a write).
33
+ *
34
+ * @module per-write-validator
35
+ */
36
+
37
+ import { readFileSync } from "node:fs";
38
+ import { extname as pathExtname } from "node:path";
39
+
40
+ /** Outcome of validating a file's content. */
41
+ export interface ValidationResult {
42
+ ok: boolean;
43
+ /** Human-readable error message when `ok` is false. */
44
+ error?: string;
45
+ }
46
+
47
+ /** A synchronous validator: content + path → result. */
48
+ export type PerWriteValidator = (content: string, filePath: string) => ValidationResult;
49
+
50
+ // ─────────────────────────────────────────────────────────────────────────
51
+ // Validators (zero-cost, synchronous, dependency-free for v1)
52
+ // ─────────────────────────────────────────────────────────────────────────
53
+
54
+ /** JSON: strict parse with `JSON.parse` (comments, trailing commas, or a leading BOM are rejected). Catches malformed config/manifests instantly. */
55
+ export function validateJson(content: string, _filePath: string): ValidationResult {
56
+ if (content.trim() === "") return { ok: true }; // empty file is valid JSON absence, not a parse error
57
+ try {
58
+ JSON.parse(content);
59
+ return { ok: true };
60
+ } catch (error) {
61
+ const message = error instanceof Error ? error.message : String(error);
62
+ return { ok: false, error: `Invalid JSON: ${message}` };
63
+ }
64
+ }
65
+
66
+ /**
67
+ * Registry of default-on validators, keyed by extension (lowercase, no dot).
68
+ * ONLY zero-cost synchronous validators belong here. Process-spawning
69
+ * validators must be registered via a future opt-in path (see module doc).
70
+ */
71
+ const DEFAULT_VALIDATORS: ReadonlyMap<string, PerWriteValidator> = new Map([
72
+ ["json", validateJson],
73
+ ]);
74
+
75
+ // ─────────────────────────────────────────────────────────────────────────
76
+ // Dedup cache (path → last-validated content). Bounded; small.
77
+ // ─────────────────────────────────────────────────────────────────────────
78
+
79
+ const MAX_DEDUP_ENTRIES = 256;
80
+ const seenContent = new Map<string, string>();
81
+
82
+ function rememberSeen(path: string, content: string): void {
83
+ if (seenContent.has(path)) seenContent.delete(path); // refresh LRU position
84
+ seenContent.set(path, content);
85
+ while (seenContent.size > MAX_DEDUP_ENTRIES) {
86
+ const oldest = seenContent.keys().next().value;
87
+ if (oldest === undefined) break;
88
+ seenContent.delete(oldest);
89
+ }
90
+ }
91
+
92
+ /** Test seam: reset the dedup cache between tests. */
93
+ export function resetPerWriteValidatorCache(): void {
94
+ seenContent.clear();
95
+ }
96
+
97
+ /**
98
+ * Active validator registry. Production uses DEFAULT_VALIDATORS; tests swap
99
+ * in a custom map via `setPerWriteValidatorsForTest` (`undefined` restores it).
100
+ */
101
+ let validators: ReadonlyMap<string, PerWriteValidator> = DEFAULT_VALIDATORS;
102
+
103
+ export function setPerWriteValidatorsForTest(map: ReadonlyMap<string, PerWriteValidator> | undefined): void {
104
+ validators = map ?? DEFAULT_VALIDATORS;
105
+ }
106
+
107
+ /**
108
+ * Normalise an extension to the registry key form (lowercase, no leading dot).
109
+ * "" for files with no extension.
110
+ */
111
+ export function extensionKey(filePath: string): string {
112
+ return pathExtname(filePath).replace(/^\./, "").toLowerCase();
113
+ }
114
+
115
+ // ─────────────────────────────────────────────────────────────────────────
116
+ // Path extraction from a tool_result event input (defensive — pi-ai types
117
+ // aren't exported here, so accept a record and probe common field names).
118
+ // ─────────────────────────────────────────────────────────────────────────
119
+
120
+ const PATH_FIELDS = ["filePath", "path", "file"] as const;
121
+
122
+ /** Extract the written/edited path from a tool result input, if present. */
123
+ export function extractPathFromInput(input: unknown): string | undefined {
124
+ if (!input || typeof input !== "object") return undefined;
125
+ const record = input as Record<string, unknown>;
126
+ for (const field of PATH_FIELDS) {
127
+ const value = record[field];
128
+ if (typeof value === "string" && value.length > 0) return value;
129
+ }
130
+ return undefined;
131
+ }
132
+
133
+ // ─────────────────────────────────────────────────────────────────────────
134
+ // Core entry point
135
+ // ─────────────────────────────────────────────────────────────────────────
136
+
137
+ /**
138
+ * Validate a just-written/edited file. Returns `null` when there is nothing
139
+ * to report (no validator for the extension, dedup hit, file unreadable, or
140
+ * the content is valid). Returns a `ValidationResult` with `ok:false` when the
141
+ * content fails validation.
142
+ *
143
+ * Reads the file from disk (it's already written by `tool_result` time) so the
144
+ * logic is uniform across `write` (full content) and `edit` (patch). The disk
145
+ * read happens ONLY after a cheap extension check, so non-validated files cost
146
+ * nothing.
147
+ */
148
+ export function validateWrittenFile(filePath: string): ValidationResult | null {
149
+ const key = extensionKey(filePath);
150
+ const validator = validators.get(key);
151
+ if (!validator) return null; // cheap skip: no validator for this file type
152
+ let content: string;
153
+ try {
154
+ content = readFileSync(filePath, "utf-8");
155
+ } catch {
156
+ // Unreadable / missing / permission denied — can't validate; never block.
157
+ return null;
158
+ }
159
+ // Dedup: identical content already validated this process → don't re-report.
160
+ if (seenContent.get(filePath) === content) return null;
161
+ rememberSeen(filePath, content);
162
+ const result = validator(content, filePath);
163
+ return result.ok ? null : result;
164
+ }
165
+
166
+ /**
167
+ * Build the TextContent block to append to a tool_result on validation failure.
168
+ * Uses a strong `🔴` prefix so the agent treats it as a real signal and fixes
169
+ * the file before continuing.
170
+ */
171
+ export function buildValidationBlocker(filePath: string, error: string): { type: "text"; text: string } {
172
+ return {
173
+ type: "text",
174
+ text: [
175
+ "",
176
+ "🔴 pi-crew per-write check FAILED",
177
+ ` ${filePath}`,
178
+ ` ${error}`,
179
+ " The file you just wrote is malformed. Fix it now — a broken file here will",
180
+ " silently fail the next load/parse. Re-write the file with valid content before continuing.",
181
+ ].join("\n"),
182
+ };
183
+ }
@@ -377,7 +377,7 @@ class AgentOverridesSubmenu {
377
377
  this.onCancel = onCancel;
378
378
  const existing = (config.agents as Record<string, unknown>)?.overrides as Record<string, { model?: string; thinking?: string }> | undefined;
379
379
  this.overrides = existing ? structuredClone(existing) : {};
380
- this.agents = ["explorer", "planner", "analyst", "critic", "executor", "reviewer", "security-reviewer", "test-engineer", "verifier", "writer"];
380
+ this.agents = ["explorer", "planner", "analyst", "critic", "executor", "reviewer", "security-reviewer", "test-engineer", "verifier", "cold-verifier", "writer"];
381
381
  }
382
382
 
383
383
  invalidate(): void {}