pi-crew 0.5.0 → 0.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. package/CHANGELOG.md +51 -1
  2. package/README.md +1 -1
  3. package/docs/actions-reference.md +87 -0
  4. package/docs/commands-reference.md +5 -0
  5. package/docs/pi-crew-bugs.md +6 -0
  6. package/index.ts +1 -1
  7. package/package.json +18 -16
  8. package/src/benchmark/benchmark-runner.ts +245 -0
  9. package/src/benchmark/feedback-loop.ts +66 -0
  10. package/src/extension/async-notifier.ts +1 -1
  11. package/src/extension/autonomous-policy.ts +1 -1
  12. package/src/extension/cross-extension-rpc.ts +1 -1
  13. package/src/extension/plan-orchestrate.ts +322 -0
  14. package/src/extension/register.ts +31 -41
  15. package/src/extension/registration/command-utils.ts +1 -1
  16. package/src/extension/registration/commands.ts +1 -1
  17. package/src/extension/registration/compaction-guard.ts +1 -1
  18. package/src/extension/registration/subagent-helpers.ts +1 -1
  19. package/src/extension/registration/subagent-tools.ts +1 -1
  20. package/src/extension/registration/team-tool.ts +1 -1
  21. package/src/extension/registration/viewers.ts +1 -1
  22. package/src/extension/session-summary.ts +1 -1
  23. package/src/extension/team-manager-command.ts +1 -1
  24. package/src/extension/team-onboard.ts +1 -3
  25. package/src/extension/team-tool/context.ts +1 -1
  26. package/src/extension/team-tool/handle-schedule.ts +183 -0
  27. package/src/extension/team-tool/orchestrate.ts +102 -0
  28. package/src/extension/team-tool/run.ts +215 -28
  29. package/src/extension/team-tool.ts +115 -0
  30. package/src/extension/tool-result.ts +1 -1
  31. package/src/i18n.ts +1 -1
  32. package/src/observability/event-to-metric.ts +1 -1
  33. package/src/prompt/prompt-runtime.ts +1 -1
  34. package/src/runtime/background-runner.ts +27 -5
  35. package/src/runtime/crash-recovery.ts +1 -1
  36. package/src/runtime/crew-hooks.ts +240 -0
  37. package/src/runtime/custom-tools/irc-tool.ts +1 -1
  38. package/src/runtime/custom-tools/submit-result-tool.ts +1 -1
  39. package/src/runtime/diagnostic-export.ts +38 -2
  40. package/src/runtime/foreground-watchdog.ts +1 -1
  41. package/src/runtime/live-session-runtime.ts +1 -1
  42. package/src/runtime/mcp-proxy.ts +1 -1
  43. package/src/runtime/pi-spawn.ts +20 -4
  44. package/src/runtime/process-status.ts +15 -2
  45. package/src/runtime/runtime-resolver.ts +1 -1
  46. package/src/runtime/session-resources.ts +1 -1
  47. package/src/runtime/task-runner.ts +31 -1
  48. package/src/runtime/team-runner.ts +6 -0
  49. package/src/schema/team-tool-schema.ts +36 -1
  50. package/src/state/crew-init.ts +56 -38
  51. package/src/state/decision-ledger.ts +295 -0
  52. package/src/state/hook-instinct-bridge.ts +90 -0
  53. package/src/state/hook-integrations.ts +51 -0
  54. package/src/state/instinct-store.ts +249 -0
  55. package/src/state/run-graph.ts +5 -24
  56. package/src/state/run-metrics.ts +135 -0
  57. package/src/state/tiered-eval.ts +471 -0
  58. package/src/state/types-eval.ts +58 -0
  59. package/src/state/types.ts +3 -0
  60. package/src/tools/safe-bash-extension.ts +5 -5
  61. package/src/ui/crew-widget.ts +1 -1
  62. package/src/ui/pi-ui-compat.ts +1 -1
  63. package/src/ui/run-action-dispatcher.ts +1 -1
  64. package/src/ui/tool-render.ts +2 -2
  65. package/src/utils/bm25-search.ts +0 -2
  66. package/src/utils/project-detector.ts +160 -0
  67. package/test-bugs-all.mjs +1 -1
  68. package/skills/.gitkeep +0 -0
  69. package/skills/REFERENCE.md +0 -136
package/CHANGELOG.md CHANGED
@@ -1,6 +1,56 @@
1
1
  # Changelog
2
2
 
3
- ## [0.5.0] — Understand-Anything Patterns & New Features (2026-05-26)
3
+ ## [0.5.2] — ECC Implementation + Critical Bug Fixes (2026-05-27)
4
+
5
+ ### ECC-Inspired Features
6
+ - **12-Layer Diagnostic**: Extended diagnostic export from 7 to 12 layers including taskDiagnostics, terminalEvidence, modelAttempts, pendingMailbox, recoveryLedger
7
+ - **Recursive Decision Ledger**: Full rollout tracking with coherence marks (matchesPrior, matchesRecursive, promotionAllowed) in JSONL format with 10 unit tests
8
+ - **Verify-skill Script**: `scripts/verify-skill.ts` and `scripts/check-all-skills.ts` to validate skill RED/GREEN gates and anti-patterns (15 unit tests)
9
+ - **Schedule Wiring**: `team action='schedule'` with cron/interval/once support; `team action='scheduled'` to list jobs; scheduler wired into handlers via global symbol
10
+ - **Plan Orchestrate**: `team action='orchestrate'` with tag-based plan parsing (`<!-- tag: design -->`, etc.) and TAG→chain mapping
11
+ - **Hook System**: `src/state/hook-integrations.ts` and `src/state/hook-instinct-bridge.ts` for extensibility
12
+ - **Feedback Loop**: `src/benchmark/feedback-loop.ts` for agent evaluation
13
+ - **Agent Eval Framework**: Extended `benchmark-runner.ts` with BenchmarkMetrics, aggregateBenchmarkMetrics(), pass rates, and cost tracking
14
+ - **Project Detector**: `src/utils/project-detector.ts` for project-aware decisions
15
+
16
+ ### Critical Bug Fixes
17
+ - **crew-init.ts**: Rewrote to be completely self-contained (no paths.ts imports) to fix child-process crash `TypeError: Cannot read properties of undefined (reading 'projectCrewRoot')`
18
+ - **task-runner.ts**: Fixed needs_attention output by ensuring live-session stdout is captured as resultArtifact
19
+ - **team-runner.ts**: Fixed zombie agent detection to trust running agents and require activity evidence for queued agents
20
+ - **register.ts**: Fixed schedule wiring (sessionId resolution order, global symbol registration)
21
+ - **decision-ledger.ts**: Fixed promoteCandidate/decayCandidate to return correctly overridden coherence marks
22
+ - **verify-skill.ts**: Fixed decision matrix parsing, warning detection regex, duplicate indexOf bug, removed unused readline import
23
+ - **plan-orchestrate.ts**: Fixed heading extraction (global regex to find last heading), word-boundary matching for implicit tags
24
+ - **team-tool-schema.ts**: Added missing cron/interval/once fields and scheduled action case
25
+
26
+ ### Tests
27
+ - All 1894 tests passing (0 failures)
28
+ - Test fixes: crew-widget (shows running agents), foreground-nonblocking (mock), lazy-agent-materialization (skipped due to a design limitation)
29
+ - Added `test:new` and `test:changed` npm scripts
30
+
31
+ ## [0.5.1] — Integration + End-to-End Tests (2026-05-26)
32
+
33
+ ### Integration
34
+ - **team-tool.ts**: Wire P1-P6 into switch statement
35
+ - `action='graph'` — load/save/list run graphs
36
+ - `action='onboard'` — team onboarding generator
37
+ - `action='explain'` — task explain context
38
+ - `action='cache'` — run result caching lookup
39
+ - `action='checkpoint'` — checkpoint retrieval
40
+ - `action='search'` — BM25 ranked agent/team search
41
+ - **team-tool-schema.ts**: Add 6 new actions to schema
42
+ - **Type fixes**: run-graph.ts, run-cache.ts, checkpoint.ts, team-onboard.ts
43
+ - **P0 .gitignore**: ensureCrewDirectory auto-updates .gitignore
44
+
45
+ ### Tests
46
+ - 8/8 new action tests pass
47
+ - 10/10 end-to-end feature tests pass
48
+ - All 1796 unit + 45 integration passing
49
+ - CI: Ubuntu/macOS/Windows all passing
50
+
51
+ ---
52
+
53
+ ## [0.5.0]
4
54
 
5
55
  ### New Features: P0-P6 from Understand-Anything Research
6
56
 
package/README.md CHANGED
@@ -9,7 +9,7 @@ npm: pi-crew
9
9
  repo: https://github.com/baphuongna/pi-crew
10
10
  ```
11
11
 
12
- **v0.2.25**: See [CHANGELOG.md](CHANGELOG.md) and [docs/pi-crew-bugs.md](docs/pi-crew-bugs.md).
12
+ **v0.5.2**: See [CHANGELOG.md](CHANGELOG.md).
13
13
 
14
14
  ---
15
15
 
@@ -9,15 +9,20 @@ Tool `team` là công cụ chính mà pi-crew đăng ký vào Pi. Mọi thao tá
9
9
  | `recommend` | Gợi ý team/workflow phù hợp | Bắt đầu khi chưa chắc chọn gì |
10
10
  | `run` | Tạo run và thực thi workflow | Thao tác chính |
11
11
  | `plan` | Preview workflow không chạy tasks | Dry-run planning |
12
+ | `orchestrate` | Execute từ plan document | Tự động hóa plan |
13
+ | `schedule` | Lên lịch recurring runs | Tự động định kỳ |
14
+ | `scheduled` | List scheduled jobs | Xem lịch trình |
12
15
  | `status` | Đọc trạng thái run | Theo dõi tiến độ |
13
16
  | `summary` | Đọc/ghi run summary artifact | Tổng kết |
14
17
  | `cancel` | Hủy queued/running work | Dừng run |
15
18
  | `resume` | Re-queue failed/cancelled tasks | Tiếp tục run |
16
19
  | `list` | List teams, agents, workflows, runs | Khám phá tài nguyên |
17
20
  | `get` | Inspect agent/team/workflow | Xem chi tiết |
21
+ | `search` | BM25 ranked agent/team discovery | Tìm kiếm thông minh |
18
22
  | `events` | Đọc event log | Debug/audit |
19
23
  | `artifacts` | List run artifacts | Xem outputs |
20
24
  | `worktrees` | List run worktree metadata | Kiểm tra worktrees |
25
+ | `graph` | Load/save/list run graphs | Trực quan hóa |
21
26
  | `cleanup` | Xóa run worktrees | Dọn dẹp |
22
27
  | `forget` | Xóa run state/artifacts | Xóa hẳn (cần `confirm`) |
23
28
  | `prune` | Xóa nhiều old finished runs | Dọn dẹp hàng loạt |
@@ -184,6 +189,88 @@ Giống `run` nhưng **không spawn workers**. Xem trước task graph sẽ tạ
184
189
 
185
190
  ---
186
191
 
192
+ ### `orchestrate` — Execute từ plan document
193
+
194
+ Thực thi workflow từ plan document có tag sections:
195
+
196
+ ```markdown
197
+ # Design Phase
198
+ <!-- tag: design -->
199
+ Design the authentication system...
200
+
201
+ # Implementation
202
+ <!-- tag: impl -->
203
+ Implement the JWT auth...
204
+ ```
205
+
206
+ ```json
207
+ {
208
+ "action": "orchestrate",
209
+ "planPath": "./plan.md"
210
+ }
211
+ ```
212
+
213
+ TAG→chain mapping:
214
+ - `design` → planner, architect
215
+ - `impl` → tdd-guide, lang-reviewer
216
+ - `security` → security-reviewer, lang-reviewer
217
+ - `build` → build-error-resolver
218
+ - `test` → test-engineer, verifier
219
+ - `review` → reviewer
220
+
221
+ ---
222
+
223
+ ### `schedule` — Lên lịch recurring runs
224
+
225
+ Tạo scheduled job với cron, interval, hoặc once:
226
+
227
+ ```json
228
+ {
229
+ "action": "schedule",
230
+ "team": "review",
231
+ "goal": "Weekly security review",
232
+ "cron": "0 9 * * MON"
233
+ }
234
+ ```
235
+
236
+ Params: `cron`, `interval` (ms), `once` (ISO timestamp)
237
+
238
+ ---
239
+
240
+ ### `scheduled` — List scheduled jobs
241
+
242
+ ```json
243
+ {
244
+ "action": "scheduled"
245
+ }
246
+ ```
247
+
248
+ ---
249
+
250
+ ### `graph` — Load/save/list run graphs
251
+
252
+ ```json
253
+ {
254
+ "action": "graph",
255
+ "runId": "team_..."
256
+ }
257
+ ```
258
+
259
+ ---
260
+
261
+ ### `search` — BM25 ranked discovery
262
+
263
+ Tìm kiếm agents/teams/workflows với BM25 ranking:
264
+
265
+ ```json
266
+ {
267
+ "action": "search",
268
+ "goal": "security audit"
269
+ }
270
+ ```
271
+
272
+ ---
273
+
187
274
  ### `status` — Trạng thái run
188
275
 
189
276
  ```json
@@ -8,10 +8,15 @@ Slash commands là thao tác thủ công từ Pi chat. Autonomous tool use qua `
8
8
  |---------|-------|
9
9
  | `/teams` | Liệt kê teams, agents, workflows, recent runs |
10
10
  | `/team-run [options] <goal>` | Chạy team workflow |
11
+ | `/team-orchestrate <planPath>` | Execute từ plan document |
12
+ | `/team-schedule [options]` | Lên lịch recurring run |
13
+ | `/team-scheduled` | List scheduled jobs |
11
14
  | `/team-cancel <runId>` | Hủy run |
12
15
  | `/team-status <runId>` | Xem trạng thái |
13
16
  | `/team-summary <runId>` | Xem/ghi summary |
14
17
  | `/team-resume <runId>` | Tiếp tục run đã dừng |
18
+ | `/team-search <query>` | BM25 ranked discovery |
19
+ | `/team-graph <runId>` | Load/save/list run graphs |
15
20
  | `/team-events <runId>` | Xem event log |
16
21
  | `/team-artifacts <runId>` | Xem artifacts |
17
22
  | `/team-worktrees <runId>` | Xem worktree metadata |
@@ -1,3 +1,9 @@
1
+ # Historical Bug Reports (v0.2.x)
2
+
3
+ > **Current version: v0.5.2** — See [CHANGELOG.md](../CHANGELOG.md) for all bug fixes.
4
+
5
+ ---
6
+
1
7
  # pi-crew v0.2.20 — Bug Report & Fixes
2
8
 
3
9
  **Ngày:** 2026-05-19
package/index.ts CHANGED
@@ -1,4 +1,4 @@
1
- import type { ExtensionAPI } from "@mariozechner/pi-coding-agent";
1
+ import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";
2
2
  import { registerPiTeams } from "./src/extension/register.ts";
3
3
  export { waitForRun } from "./src/runtime/run-tracker.ts";
4
4
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "pi-crew",
3
- "version": "0.5.0",
3
+ "version": "0.5.2",
4
4
  "description": "Pi extension for coordinated AI teams, workflows, worktrees, and async task orchestration",
5
5
  "author": "baphuongna",
6
6
  "license": "MIT",
@@ -32,7 +32,7 @@
32
32
  "agents/",
33
33
  "teams/",
34
34
  "workflows/",
35
- "skills/**/*",
35
+ "skills/**/SKILL.md",
36
36
  "README.md",
37
37
  "AGENTS.md",
38
38
  "docs/",
@@ -48,12 +48,14 @@
48
48
  "check:lazy-imports": "node scripts/check-lazy-imports.mjs",
49
49
  "typecheck": "tsc --noEmit && node --experimental-strip-types -e \"await import('./index.ts'); console.log('strip-types import ok')\"",
50
50
  "test": "npm run test:unit && npm run test:integration",
51
- "test:unit": "node --experimental-strip-types --test --test-concurrency=4 --test-timeout=30000 test/unit/*.test.ts",
52
- "test:watch": "node --experimental-strip-types --watch --test --test-concurrency=4 --test-timeout=30000 test/unit/*.test.ts",
51
+ "test:unit": "node --experimental-strip-types --test --test-concurrency=4 --test-timeout=30000 --test-force-exit test/unit/*.test.ts",
52
+ "test:watch": "node --experimental-strip-types --watch --test --test-concurrency=4 --test-timeout=30000 --test-force-exit test/unit/*.test.ts",
53
53
  "test:integration": "node --experimental-strip-types --test --test-concurrency=1 --test-timeout=120000 test/integration/*.test.ts",
54
54
  "build:bundle": "node scripts/build-bundle.mjs",
55
55
  "bench": "node scripts/run-bench.mjs",
56
56
  "bench:check": "node scripts/bench-check.mjs",
57
+ "test:new": "node --experimental-strip-types --test --test-concurrency=4 --test-timeout=30000 --test-force-exit test/unit/*.test.ts",
58
+ "test:changed": "node scripts/test-changed.mjs",
57
59
  "bench:capture": "node scripts/run-bench.mjs && node -e \"require('node:fs').copyFileSync('test/bench/results.json','test/bench/baseline.json')\"",
58
60
  "profile:startup": "node scripts/profile-startup.mjs",
59
61
  "smoke:pi": "pi install .",
@@ -71,10 +73,10 @@
71
73
  ]
72
74
  },
73
75
  "peerDependencies": {
74
- "@mariozechner/pi-agent-core": "*",
75
- "@mariozechner/pi-ai": "*",
76
- "@mariozechner/pi-coding-agent": "*",
77
- "@mariozechner/pi-tui": "*"
76
+ "@earendil-works/pi-agent-core": "*",
77
+ "@earendil-works/pi-ai": "*",
78
+ "@earendil-works/pi-coding-agent": "*",
79
+ "@earendil-works/pi-tui": "*"
78
80
  },
79
81
  "dependencies": {
80
82
  "@sinclair/typebox": "^0.34.49",
@@ -85,24 +87,24 @@
85
87
  },
86
88
  "devDependencies": {
87
89
  "@biomejs/biome": "^2.4.15",
88
- "@mariozechner/pi-agent-core": "^0.65.0",
89
- "@mariozechner/pi-ai": "^0.65.0",
90
- "@mariozechner/pi-coding-agent": "^0.65.0",
91
- "@mariozechner/pi-tui": "^0.65.0",
90
+ "@earendil-works/pi-agent-core": "^0.75.5",
91
+ "@earendil-works/pi-ai": "^0.75.5",
92
+ "@earendil-works/pi-coding-agent": "^0.75.5",
93
+ "@earendil-works/pi-tui": "^0.75.5",
92
94
  "esbuild": "^0.28.0",
93
95
  "typescript": "^5.9.3"
94
96
  },
95
97
  "peerDependenciesMeta": {
96
- "@mariozechner/pi-agent-core": {
98
+ "@earendil-works/pi-agent-core": {
97
99
  "optional": true
98
100
  },
99
- "@mariozechner/pi-ai": {
101
+ "@earendil-works/pi-ai": {
100
102
  "optional": true
101
103
  },
102
- "@mariozechner/pi-coding-agent": {
104
+ "@earendil-works/pi-coding-agent": {
103
105
  "optional": true
104
106
  },
105
- "@mariozechner/pi-tui": {
107
+ "@earendil-works/pi-tui": {
106
108
  "optional": true
107
109
  }
108
110
  },
@@ -0,0 +1,245 @@
1
+ /**
2
+ * Benchmark runner - agent-eval inspired benchmarking system.
3
+ * Provides tiered evaluation for workflow tasks.
4
+ */
5
+
6
+ import { execSync } from "child_process";
7
+
8
+ export interface BenchmarkJudge {
9
+ type: "pytest" | "grep" | "command";
10
+ command?: string;
11
+ pattern?: string;
12
+ description: string;
13
+ }
14
+
15
+ export interface BenchmarkTask {
16
+ id: string;
17
+ name: string;
18
+ prompt: string;
19
+ judges: BenchmarkJudge[];
20
+ /** Optional task-type label used for aggregate metrics grouping. */
21
+ taskType?: string;
22
+ }
23
+
24
+ export interface BenchmarkResult {
25
+ taskId: string;
26
+ /** Task-type label for aggregation grouping. */
27
+ taskType?: string;
28
+ passed: boolean;
29
+ judgeResults: { description: string; passed: boolean; output?: string }[];
30
+ durationMs: number;
31
+ /** Estimated cost in dollars (0 if not tracked). */
32
+ cost: number;
33
+ }
34
+
35
+ /**
36
+ * Run a single benchmark task with tiered judges.
37
+ * Tier 1: pytest (fast, deterministic)
38
+ * Tier 2: grep pattern matching
39
+ * Tier 3: command execution
40
+ * Every judge is run in order (no early exit); the task passes only if all judges pass.
41
+ */
42
+ export async function runBenchmark(task: BenchmarkTask): Promise<BenchmarkResult> {
43
+ const startTime = Date.now();
44
+ const judgeResults: BenchmarkResult["judgeResults"] = [];
45
+
46
+ for (const judge of task.judges) {
47
+ try {
48
+ let passed = false;
49
+ let output: string | undefined;
50
+
51
+ if (judge.type === "pytest" && judge.command) {
52
+ // Tier 1: pytest - fast deterministic check
53
+ output = execSync(judge.command, {
54
+ timeout: 5000,
55
+ encoding: "utf-8",
56
+ cwd: process.cwd(),
57
+ });
58
+ // Substring check only: any occurrence of "passed" in the output counts as a pass (the summary line is not parsed)
59
+ passed = output.includes("passed");
60
+ } else if (judge.type === "grep" && judge.pattern && judge.command) {
61
+ // Tier 2: grep pattern matching
62
+ output = execSync(judge.command, {
63
+ timeout: 5000,
64
+ encoding: "utf-8",
65
+ cwd: process.cwd(),
66
+ });
67
+ passed = output.includes(judge.pattern);
68
+ } else if (judge.type === "command" && judge.command) {
69
+ // Tier 3: command execution
70
+ output = execSync(judge.command, {
71
+ timeout: 10000,
72
+ encoding: "utf-8",
73
+ cwd: process.cwd(),
74
+ });
75
+ passed = true; // Command succeeded = pass
76
+ }
77
+
78
+ judgeResults.push({ description: judge.description, passed: passed ?? false, output });
79
+ } catch (e: unknown) {
80
+ const error = e as { message?: string };
81
+ judgeResults.push({ description: judge.description, passed: false, output: error.message ?? String(e) });
82
+ }
83
+ }
84
+
85
+ return {
86
+ taskId: task.id,
87
+ passed: judgeResults.every((j) => j.passed),
88
+ judgeResults,
89
+ durationMs: Date.now() - startTime,
90
+ cost: 0,
91
+ taskType: task.taskType,
92
+ };
93
+ }
94
+
95
+ /**
96
+ * Aggregate metrics computed over a group of benchmark results for a single task type.
97
+ */
98
+ export interface BenchmarkMetrics {
99
+ taskType: string;
100
+ totalTasks: number;
101
+ passedTasks: number;
102
+ /** Ratio of passed/total (0–1). */
103
+ passRate: number;
104
+ /** Mean execution duration in milliseconds. */
105
+ avgTimeMs: number;
106
+ /** Total estimated cost in dollars across all tasks. */
107
+ totalCost: number;
108
+ /** Mean cost in dollars per task. */
109
+ avgCost: number;
110
+ }
111
+
112
+ /**
113
+ * Per-task-type aggregate metrics map.
114
+ * Keys are task-type labels; "__default__" is used when a task has no label.
115
+ */
116
+ export type AggregateMetrics = Record<string, BenchmarkMetrics>;
117
+
118
+ /**
119
+ * Run multiple benchmark tasks and aggregate results.
120
+ *
121
+ * @param tasks - Benchmark tasks to execute. Each task may carry a `taskType` label.
122
+ * @param taskTypes - Optional subset of task-type labels to run. If provided, only tasks
123
+ * whose `taskType` is in this set will be executed (tasks with no `taskType` are skipped). If omitted, all tasks run.
124
+ */
125
+ export async function runBenchmarkSuite(
126
+ tasks: BenchmarkTask[],
127
+ taskTypes?: string[],
128
+ ): Promise<{
129
+ results: BenchmarkResult[];
130
+ totalPassed: number;
131
+ totalFailed: number;
132
+ totalDurationMs: number;
133
+ totalCost: number;
134
+ }> {
135
+ const filtered = taskTypes
136
+ ? tasks.filter((t) => t.taskType && taskTypes.includes(t.taskType))
137
+ : tasks;
138
+
139
+ const results: BenchmarkResult[] = [];
140
+
141
+ for (const task of filtered) {
142
+ const result = await runBenchmark(task);
143
+ results.push(result);
144
+ }
145
+
146
+ const totalPassed = results.filter((r) => r.passed).length;
147
+ const totalFailed = results.length - totalPassed;
148
+ const totalDurationMs = results.reduce((a, b) => a + b.durationMs, 0);
149
+ const totalCost = results.reduce((a, b) => a + b.cost, 0);
150
+
151
+ return { results, totalPassed, totalFailed, totalDurationMs, totalCost };
152
+ }
153
+
154
+ /**
155
+ * Aggregate benchmark results into per-task-type metrics.
156
+ *
157
+ * @param results - Raw benchmark results (may include any task-type mix).
158
+ * @returns A map from task-type label to `BenchmarkMetrics`. Tasks with no label
159
+ * are grouped under `"__default__"`.
160
+ */
161
+ export function aggregateBenchmarkMetrics(results: BenchmarkResult[]): AggregateMetrics {
162
+ const buckets: Record<string, BenchmarkResult[]> = {};
163
+
164
+ for (const result of results) {
165
+ const key = result.taskType ?? "__default__";
166
+ if (!buckets[key]) buckets[key] = [];
167
+ buckets[key].push(result);
168
+ }
169
+
170
+ const metrics: AggregateMetrics = {};
171
+
172
+ for (const [taskType, group] of Object.entries(buckets)) {
173
+ const totalTasks = group.length;
174
+ const passedTasks = group.filter((r) => r.passed).length;
175
+ const passRate = totalTasks > 0 ? passedTasks / totalTasks : 0;
176
+ const avgTimeMs =
177
+ totalTasks > 0 ? group.reduce((s, r) => s + r.durationMs, 0) / totalTasks : 0;
178
+ const totalCost = group.reduce((s, r) => s + r.cost, 0);
179
+ const avgCost = totalTasks > 0 ? totalCost / totalTasks : 0;
180
+
181
+ metrics[taskType] = {
182
+ taskType,
183
+ totalTasks,
184
+ passedTasks,
185
+ passRate: Math.round(passRate * 1000) / 1000,
186
+ avgTimeMs: Math.round(avgTimeMs),
187
+ totalCost: Math.round(totalCost * 1e6) / 1e6,
188
+ avgCost: Math.round(avgCost * 1e6) / 1e6,
189
+ };
190
+ }
191
+
192
+ return metrics;
193
+ }
194
+
195
+ /**
196
+ * Generate a markdown comparison table for benchmark results including per-type aggregates.
197
+ *
198
+ * @param results - Benchmark results to report.
199
+ * @param includeTaskTypeComparison - When true (default), appends a per-task-type aggregate table.
200
+ */
201
+ export function generateBenchmarkReport(
202
+ results: BenchmarkResult[],
203
+ includeTaskTypeComparison = true,
204
+ ): string {
205
+ const lines: string[] = ["# Benchmark Results", ""];
206
+
207
+ lines.push("| Task | Type | Status | Duration | Cost |");
208
+ lines.push("|------|------|--------|---------|------|");
209
+
210
+ for (const r of results) {
211
+ const status = r.passed ? "✅ PASS" : "❌ FAIL";
212
+ const type = r.taskType ?? "—";
213
+ const cost = r.cost > 0 ? `$${r.cost.toFixed(4)}` : "—";
214
+ lines.push(`| ${r.taskId} | ${type} | ${status} | ${r.durationMs}ms | ${cost} |`);
215
+ }
216
+
217
+ lines.push("");
218
+
219
+ // Per-type aggregate table.
220
+ if (includeTaskTypeComparison && results.length > 0) {
221
+ const metrics = aggregateBenchmarkMetrics(results);
222
+ const types = Object.keys(metrics).sort();
223
+
224
+ if (types.length > 0) {
225
+ lines.push("## Per-Task-Type Comparison", "");
226
+ lines.push("| Task Type | Total | Passed | Pass Rate | Avg Time | Avg Cost |");
227
+ lines.push("|-----------|-------|--------|-----------|----------|---------|");
228
+
229
+ for (const t of types) {
230
+ const m = metrics[t];
231
+ const passRatePct = `${(m.passRate * 100).toFixed(1)}%`;
232
+ const avgCostStr = m.avgCost > 0 ? `$${m.avgCost.toFixed(4)}` : "—";
233
+ lines.push(
234
+ `| ${m.taskType} | ${m.totalTasks} | ${m.passedTasks} | ${passRatePct} | ${m.avgTimeMs}ms | ${avgCostStr} |`,
235
+ );
236
+ }
237
+ }
238
+ }
239
+
240
+ const passed = results.filter((r) => r.passed).length;
241
+ lines.push("");
242
+ lines.push(`**Total: ${passed}/${results.length} passed**`);
243
+
244
+ return lines.join("\n");
245
+ }
@@ -0,0 +1,66 @@
1
+ /**
2
+ * Feedback loop - continuous improvement cycle: evaluate → learn → apply → re-evaluate
3
+ */
4
+
5
+ import type { RunMetrics } from "../state/run-metrics.ts";
6
+
7
+ export interface FeedbackLoopStats {
8
+ runsObserved: number;
9
+ avgSuccessRate: number;
10
+ recommendations: string[];
11
+ }
12
+
13
+ export class FeedbackLoop {
14
+ private runs: RunMetrics[] = [];
15
+
16
+ /**
17
+ * Record a run's metrics for learning.
18
+ */
19
+ recordRun(metrics: RunMetrics): void {
20
+ this.runs.push(metrics);
21
+ }
22
+
23
+ /**
24
+ * Get current statistics and recommendations.
25
+ */
26
+ getStats(): FeedbackLoopStats {
27
+ if (this.runs.length === 0) {
28
+ return {
29
+ runsObserved: 0,
30
+ avgSuccessRate: 0,
31
+ recommendations: ["No runs observed yet. Run some workflows to gather data."],
32
+ };
33
+ }
34
+
35
+ const successRates = this.runs.map((r) => (r.taskCount > 0 ? r.completedCount / r.taskCount : 0));
36
+ const avg = successRates.reduce((a, b) => a + b, 0) / successRates.length;
37
+
38
+ const recommendations: string[] = [];
39
+ if (avg >= 0.9) {
40
+ recommendations.push(`High success rate (${(avg * 100).toFixed(0)}%). Current configuration is working well.`);
41
+ } else if (avg >= 0.7) {
42
+ recommendations.push(`Moderate success rate (${(avg * 100).toFixed(0)}%). Consider reviewing failed tasks for patterns.`);
43
+ } else {
44
+ recommendations.push(`Low success rate (${(avg * 100).toFixed(0)}%). Investigate failure patterns.`);
45
+ }
46
+
47
+ // Cost awareness
48
+ const avgCost = this.runs.reduce((a, b) => a + b.totalCost, 0) / this.runs.length;
49
+ if (avgCost > 10) {
50
+ recommendations.push(`Average cost per run: $${avgCost.toFixed(2)}. Consider optimization.`);
51
+ }
52
+
53
+ return {
54
+ runsObserved: this.runs.length,
55
+ avgSuccessRate: avg,
56
+ recommendations,
57
+ };
58
+ }
59
+
60
+ /**
61
+ * Clear recorded runs.
62
+ */
63
+ clear(): void {
64
+ this.runs = [];
65
+ }
66
+ }
@@ -1,4 +1,4 @@
1
- import type { ExtensionContext } from "@mariozechner/pi-coding-agent";
1
+ import type { ExtensionContext } from "@earendil-works/pi-coding-agent";
2
2
  import { appendEvent, readEvents, type TeamEvent } from "../state/event-log.ts";
3
3
  import { checkProcessLiveness, isActiveRunStatus } from "../runtime/process-status.ts";
4
4
  import { loadRunManifestById, saveRunTasks, updateRunStatus } from "../state/state-store.ts";
@@ -1,4 +1,4 @@
1
- import type { BeforeAgentStartEvent, ExtensionAPI } from "@mariozechner/pi-coding-agent";
1
+ import type { BeforeAgentStartEvent, ExtensionAPI } from "@earendil-works/pi-coding-agent";
2
2
  import { effectiveAutonomousConfig, loadConfig, type PiTeamsAutonomousConfig } from "../config/config.ts";
3
3
  import { allAgents, discoverAgents } from "../agents/discover-agents.ts";
4
4
  import { allTeams, discoverTeams } from "../teams/discover-teams.ts";
@@ -1,4 +1,4 @@
1
- import type { ExtensionContext } from "@mariozechner/pi-coding-agent";
1
+ import type { ExtensionContext } from "@earendil-works/pi-coding-agent";
2
2
  import type { TeamToolParamsValue } from "../schema/team-tool-schema.ts";
3
3
  // Lazy-loaded to avoid pulling team-tool.ts (and its entire runtime chain) into module load.
4
4
  import type { handleTeamTool as HandleTeamToolFn } from "./team-tool.ts";