pi-crew 0.5.1 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +28 -0
- package/README.md +1 -1
- package/docs/actions-reference.md +87 -0
- package/docs/commands-reference.md +5 -0
- package/docs/pi-crew-bugs.md +6 -0
- package/index.ts +1 -1
- package/package.json +18 -16
- package/src/benchmark/benchmark-runner.ts +245 -0
- package/src/benchmark/feedback-loop.ts +66 -0
- package/src/extension/async-notifier.ts +1 -1
- package/src/extension/autonomous-policy.ts +1 -1
- package/src/extension/cross-extension-rpc.ts +1 -1
- package/src/extension/plan-orchestrate.ts +322 -0
- package/src/extension/register.ts +31 -41
- package/src/extension/registration/command-utils.ts +1 -1
- package/src/extension/registration/commands.ts +1 -1
- package/src/extension/registration/compaction-guard.ts +1 -1
- package/src/extension/registration/subagent-helpers.ts +1 -1
- package/src/extension/registration/subagent-tools.ts +1 -1
- package/src/extension/registration/team-tool.ts +1 -1
- package/src/extension/registration/viewers.ts +1 -1
- package/src/extension/session-summary.ts +1 -1
- package/src/extension/team-manager-command.ts +1 -1
- package/src/extension/team-tool/context.ts +1 -1
- package/src/extension/team-tool/handle-schedule.ts +183 -0
- package/src/extension/team-tool/orchestrate.ts +102 -0
- package/src/extension/team-tool/run.ts +215 -28
- package/src/extension/team-tool.ts +10 -0
- package/src/extension/tool-result.ts +1 -1
- package/src/i18n.ts +1 -1
- package/src/observability/event-to-metric.ts +1 -1
- package/src/prompt/prompt-runtime.ts +1 -1
- package/src/runtime/background-runner.ts +27 -5
- package/src/runtime/crash-recovery.ts +1 -1
- package/src/runtime/crew-hooks.ts +240 -0
- package/src/runtime/custom-tools/irc-tool.ts +1 -1
- package/src/runtime/custom-tools/submit-result-tool.ts +1 -1
- package/src/runtime/diagnostic-export.ts +38 -2
- package/src/runtime/foreground-watchdog.ts +1 -1
- package/src/runtime/live-session-runtime.ts +1 -1
- package/src/runtime/mcp-proxy.ts +1 -1
- package/src/runtime/pi-spawn.ts +20 -4
- package/src/runtime/process-status.ts +15 -2
- package/src/runtime/runtime-resolver.ts +1 -1
- package/src/runtime/session-resources.ts +1 -1
- package/src/runtime/task-runner.ts +31 -1
- package/src/runtime/team-runner.ts +6 -0
- package/src/schema/team-tool-schema.ts +24 -1
- package/src/state/crew-init.ts +56 -38
- package/src/state/decision-ledger.ts +295 -0
- package/src/state/hook-instinct-bridge.ts +90 -0
- package/src/state/hook-integrations.ts +51 -0
- package/src/state/instinct-store.ts +249 -0
- package/src/state/run-metrics.ts +135 -0
- package/src/state/tiered-eval.ts +471 -0
- package/src/state/types-eval.ts +58 -0
- package/src/state/types.ts +3 -0
- package/src/tools/safe-bash-extension.ts +5 -5
- package/src/ui/crew-widget.ts +1 -1
- package/src/ui/pi-ui-compat.ts +1 -1
- package/src/ui/run-action-dispatcher.ts +1 -1
- package/src/ui/tool-render.ts +2 -2
- package/src/utils/project-detector.ts +160 -0
- package/test-bugs-all.mjs +1 -1
- package/skills/.gitkeep +0 -0
- package/skills/REFERENCE.md +0 -136
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,33 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [0.5.2] — ECC Implementation + Critical Bug Fixes (2026-05-27)
|
|
4
|
+
|
|
5
|
+
### ECC-Inspired Features
|
|
6
|
+
- **12-Layer Diagnostic**: Extended diagnostic export from 7 to 12 layers including taskDiagnostics, terminalEvidence, modelAttempts, pendingMailbox, recoveryLedger
|
|
7
|
+
- **Recursive Decision Ledger**: Full rollout tracking with coherence marks (matchesPrior, matchesRecursive, promotionAllowed) in JSONL format with 10 unit tests
|
|
8
|
+
- **Verify-skill Script**: `scripts/verify-skill.ts` and `scripts/check-all-skills.ts` to validate skill RED/GREEN gates and anti-patterns (15 unit tests)
|
|
9
|
+
- **Schedule Wiring**: `team action='schedule'` with cron/interval/once support; `team action='scheduled'` to list jobs; scheduler wired into handlers via global symbol
|
|
10
|
+
- **Plan Orchestrate**: `team action='orchestrate'` with tag-based plan parsing (`<!-- tag: design -->`, etc.) and TAG→chain mapping
|
|
11
|
+
- **Hook System**: `src/state/hook-integrations.ts` and `src/state/hook-instinct-bridge.ts` for extensibility
|
|
12
|
+
- **Feedback Loop**: `src/benchmark/feedback-loop.ts` for agent evaluation
|
|
13
|
+
- **Agent Eval Framework**: Extended `benchmark-runner.ts` with BenchmarkMetrics, aggregateBenchmarkMetrics(), pass rates, and cost tracking
|
|
14
|
+
- **Project Detector**: `src/utils/project-detector.ts` for project-aware decisions
|
|
15
|
+
|
|
16
|
+
### Critical Bug Fixes
|
|
17
|
+
- **crew-init.ts**: Rewrote to be completely self-contained (no paths.ts imports) to fix child-process crash `TypeError: Cannot read properties of undefined (reading 'projectCrewRoot')`
|
|
18
|
+
- **task-runner.ts**: Fixed needs_attention output by ensuring live-session stdout is captured as resultArtifact
|
|
19
|
+
- **team-runner.ts**: Fixed zombie agent detection to trust running agents and require activity evidence for queued agents
|
|
20
|
+
- **register.ts**: Fixed schedule wiring (sessionId resolution order, global symbol registration)
|
|
21
|
+
- **decision-ledger.ts**: Fixed promoteCandidate/decayCandidate to return correctly overridden coherence marks
|
|
22
|
+
- **verify-skill.ts**: Fixed decision matrix parsing, warning detection regex, duplicate indexOf bug, removed unused readline import
|
|
23
|
+
- **plan-orchestrate.ts**: Fixed heading extraction (global regex to find last heading), word-boundary matching for implicit tags
|
|
24
|
+
- **team-tool-schema.ts**: Added missing cron/interval/once fields and scheduled action case
|
|
25
|
+
|
|
26
|
+
### Tests
|
|
27
|
+
- All 1894 tests passing (0 failures)
|
|
28
|
+
- Test fixes: crew-widget (shows running agents), foreground-nonblocking (mock), lazy-agent-materialization (skipped design limitation)
|
|
29
|
+
- Test:new and test:changed scripts added
|
|
30
|
+
|
|
3
31
|
## [0.5.1] — Integration + End-to-End Tests (2026-05-26)
|
|
4
32
|
|
|
5
33
|
### Integration
|
package/README.md
CHANGED
|
@@ -9,15 +9,20 @@ Tool `team` là công cụ chính mà pi-crew đăng ký vào Pi. Mọi thao tá
|
|
|
9
9
|
| `recommend` | Gợi ý team/workflow phù hợp | Bắt đầu khi chưa chắc chọn gì |
|
|
10
10
|
| `run` | Tạo run và thực thi workflow | Thao tác chính |
|
|
11
11
|
| `plan` | Preview workflow không chạy tasks | Dry-run planning |
|
|
12
|
+
| `orchestrate` | Execute từ plan document | Tự động hóa plan |
|
|
13
|
+
| `schedule` | Lên lịch recurring runs | Tự động định kỳ |
|
|
14
|
+
| `scheduled` | List scheduled jobs | Xem lịch trình |
|
|
12
15
|
| `status` | Đọc trạng thái run | Theo dõi tiến độ |
|
|
13
16
|
| `summary` | Đọc/ghi run summary artifact | Tổng kết |
|
|
14
17
|
| `cancel` | Hủy queued/running work | Dừng run |
|
|
15
18
|
| `resume` | Re-queue failed/cancelled tasks | Tiếp tục run |
|
|
16
19
|
| `list` | List teams, agents, workflows, runs | Khám phá tài nguyên |
|
|
17
20
|
| `get` | Inspect agent/team/workflow | Xem chi tiết |
|
|
21
|
+
| `search` | BM25 ranked agent/team discovery | Tìm kiếm thông minh |
|
|
18
22
|
| `events` | Đọc event log | Debug/audit |
|
|
19
23
|
| `artifacts` | List run artifacts | Xem outputs |
|
|
20
24
|
| `worktrees` | List run worktree metadata | Kiểm tra worktrees |
|
|
25
|
+
| `graph` | Load/save/list run graphs | Trực quan hóa |
|
|
21
26
|
| `cleanup` | Xóa run worktrees | Dọn dẹp |
|
|
22
27
|
| `forget` | Xóa run state/artifacts | Xóa hẳn (cần `confirm`) |
|
|
23
28
|
| `prune` | Xóa nhiều old finished runs | Dọn dẹp hàng loạt |
|
|
@@ -184,6 +189,88 @@ Giống `run` nhưng **không spawn workers**. Xem trước task graph sẽ tạ
|
|
|
184
189
|
|
|
185
190
|
---
|
|
186
191
|
|
|
192
|
+
### `orchestrate` — Execute từ plan document
|
|
193
|
+
|
|
194
|
+
Thực thi workflow từ plan document có tag sections:
|
|
195
|
+
|
|
196
|
+
```markdown
|
|
197
|
+
# Design Phase
|
|
198
|
+
<!-- tag: design -->
|
|
199
|
+
Design the authentication system...
|
|
200
|
+
|
|
201
|
+
# Implementation
|
|
202
|
+
<!-- tag: impl -->
|
|
203
|
+
Implement the JWT auth...
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
```json
|
|
207
|
+
{
|
|
208
|
+
"action": "orchestrate",
|
|
209
|
+
"planPath": "./plan.md"
|
|
210
|
+
}
|
|
211
|
+
```
|
|
212
|
+
|
|
213
|
+
TAG→chain mapping:
|
|
214
|
+
- `design` → planner, architect
|
|
215
|
+
- `impl` → tdd-guide, lang-reviewer
|
|
216
|
+
- `security` → security-reviewer, lang-reviewer
|
|
217
|
+
- `build` → build-error-resolver
|
|
218
|
+
- `test` → test-engineer, verifier
|
|
219
|
+
- `review` → reviewer
|
|
220
|
+
|
|
221
|
+
---
|
|
222
|
+
|
|
223
|
+
### `schedule` — Lên lịch recurring runs
|
|
224
|
+
|
|
225
|
+
Tạo scheduled job với cron, interval, hoặc once:
|
|
226
|
+
|
|
227
|
+
```json
|
|
228
|
+
{
|
|
229
|
+
"action": "schedule",
|
|
230
|
+
"team": "review",
|
|
231
|
+
"goal": "Weekly security review",
|
|
232
|
+
"cron": "0 9 * * MON"
|
|
233
|
+
}
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
Params: `cron`, `interval` (ms), `once` (ISO timestamp)
|
|
237
|
+
|
|
238
|
+
---
|
|
239
|
+
|
|
240
|
+
### `scheduled` — List scheduled jobs
|
|
241
|
+
|
|
242
|
+
```json
|
|
243
|
+
{
|
|
244
|
+
"action": "scheduled"
|
|
245
|
+
}
|
|
246
|
+
```
|
|
247
|
+
|
|
248
|
+
---
|
|
249
|
+
|
|
250
|
+
### `graph` — Load/save/list run graphs
|
|
251
|
+
|
|
252
|
+
```json
|
|
253
|
+
{
|
|
254
|
+
"action": "graph",
|
|
255
|
+
"runId": "team_..."
|
|
256
|
+
}
|
|
257
|
+
```
|
|
258
|
+
|
|
259
|
+
---
|
|
260
|
+
|
|
261
|
+
### `search` — BM25 ranked discovery
|
|
262
|
+
|
|
263
|
+
Tìm kiếm agents/teams/workflows với BM25 ranking:
|
|
264
|
+
|
|
265
|
+
```json
|
|
266
|
+
{
|
|
267
|
+
"action": "search",
|
|
268
|
+
"goal": "security audit"
|
|
269
|
+
}
|
|
270
|
+
```
|
|
271
|
+
|
|
272
|
+
---
|
|
273
|
+
|
|
187
274
|
### `status` — Trạng thái run
|
|
188
275
|
|
|
189
276
|
```json
|
|
@@ -8,10 +8,15 @@ Slash commands là thao tác thủ công từ Pi chat. Autonomous tool use qua `
|
|
|
8
8
|
|---------|-------|
|
|
9
9
|
| `/teams` | Liệt kê teams, agents, workflows, recent runs |
|
|
10
10
|
| `/team-run [options] <goal>` | Chạy team workflow |
|
|
11
|
+
| `/team-orchestrate <planPath>` | Execute từ plan document |
|
|
12
|
+
| `/team-schedule [options]` | Lên lịch recurring run |
|
|
13
|
+
| `/team-scheduled` | List scheduled jobs |
|
|
11
14
|
| `/team-cancel <runId>` | Hủy run |
|
|
12
15
|
| `/team-status <runId>` | Xem trạng thái |
|
|
13
16
|
| `/team-summary <runId>` | Xem/ghi summary |
|
|
14
17
|
| `/team-resume <runId>` | Tiếp tục run đã dừng |
|
|
18
|
+
| `/team-search <query>` | BM25 ranked discovery |
|
|
19
|
+
| `/team-graph <runId>` | Load/save/list run graphs |
|
|
15
20
|
| `/team-events <runId>` | Xem event log |
|
|
16
21
|
| `/team-artifacts <runId>` | Xem artifacts |
|
|
17
22
|
| `/team-worktrees <runId>` | Xem worktree metadata |
|
package/docs/pi-crew-bugs.md
CHANGED
package/index.ts
CHANGED
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "pi-crew",
|
|
3
|
-
"version": "0.5.
|
|
3
|
+
"version": "0.5.2",
|
|
4
4
|
"description": "Pi extension for coordinated AI teams, workflows, worktrees, and async task orchestration",
|
|
5
5
|
"author": "baphuongna",
|
|
6
6
|
"license": "MIT",
|
|
@@ -32,7 +32,7 @@
|
|
|
32
32
|
"agents/",
|
|
33
33
|
"teams/",
|
|
34
34
|
"workflows/",
|
|
35
|
-
"skills
|
|
35
|
+
"skills/**/SKILL.md",
|
|
36
36
|
"README.md",
|
|
37
37
|
"AGENTS.md",
|
|
38
38
|
"docs/",
|
|
@@ -48,12 +48,14 @@
|
|
|
48
48
|
"check:lazy-imports": "node scripts/check-lazy-imports.mjs",
|
|
49
49
|
"typecheck": "tsc --noEmit && node --experimental-strip-types -e \"await import('./index.ts'); console.log('strip-types import ok')\"",
|
|
50
50
|
"test": "npm run test:unit && npm run test:integration",
|
|
51
|
-
"test:unit": "node --experimental-strip-types --test --test-concurrency=4 --test-timeout=30000 test/unit/*.test.ts",
|
|
52
|
-
"test:watch": "node --experimental-strip-types --watch --test --test-concurrency=4 --test-timeout=30000 test/unit/*.test.ts",
|
|
51
|
+
"test:unit": "node --experimental-strip-types --test --test-concurrency=4 --test-timeout=30000 --test-force-exit test/unit/*.test.ts",
|
|
52
|
+
"test:watch": "node --experimental-strip-types --watch --test --test-concurrency=4 --test-timeout=30000 --test-force-exit test/unit/*.test.ts",
|
|
53
53
|
"test:integration": "node --experimental-strip-types --test --test-concurrency=1 --test-timeout=120000 test/integration/*.test.ts",
|
|
54
54
|
"build:bundle": "node scripts/build-bundle.mjs",
|
|
55
55
|
"bench": "node scripts/run-bench.mjs",
|
|
56
56
|
"bench:check": "node scripts/bench-check.mjs",
|
|
57
|
+
"test:new": "node --experimental-strip-types --test --test-concurrency=4 --test-timeout=30000 --test-force-exit test/unit/*.test.ts",
|
|
58
|
+
"test:changed": "node scripts/test-changed.mjs",
|
|
57
59
|
"bench:capture": "node scripts/run-bench.mjs && node -e \"require('node:fs').copyFileSync('test/bench/results.json','test/bench/baseline.json')\"",
|
|
58
60
|
"profile:startup": "node scripts/profile-startup.mjs",
|
|
59
61
|
"smoke:pi": "pi install .",
|
|
@@ -71,10 +73,10 @@
|
|
|
71
73
|
]
|
|
72
74
|
},
|
|
73
75
|
"peerDependencies": {
|
|
74
|
-
"@
|
|
75
|
-
"@
|
|
76
|
-
"@
|
|
77
|
-
"@
|
|
76
|
+
"@earendil-works/pi-agent-core": "*",
|
|
77
|
+
"@earendil-works/pi-ai": "*",
|
|
78
|
+
"@earendil-works/pi-coding-agent": "*",
|
|
79
|
+
"@earendil-works/pi-tui": "*"
|
|
78
80
|
},
|
|
79
81
|
"dependencies": {
|
|
80
82
|
"@sinclair/typebox": "^0.34.49",
|
|
@@ -85,24 +87,24 @@
|
|
|
85
87
|
},
|
|
86
88
|
"devDependencies": {
|
|
87
89
|
"@biomejs/biome": "^2.4.15",
|
|
88
|
-
"@
|
|
89
|
-
"@
|
|
90
|
-
"@
|
|
91
|
-
"@
|
|
90
|
+
"@earendil-works/pi-agent-core": "^0.75.5",
|
|
91
|
+
"@earendil-works/pi-ai": "^0.75.5",
|
|
92
|
+
"@earendil-works/pi-coding-agent": "^0.75.5",
|
|
93
|
+
"@earendil-works/pi-tui": "^0.75.5",
|
|
92
94
|
"esbuild": "^0.28.0",
|
|
93
95
|
"typescript": "^5.9.3"
|
|
94
96
|
},
|
|
95
97
|
"peerDependenciesMeta": {
|
|
96
|
-
"@
|
|
98
|
+
"@earendil-works/pi-agent-core": {
|
|
97
99
|
"optional": true
|
|
98
100
|
},
|
|
99
|
-
"@
|
|
101
|
+
"@earendil-works/pi-ai": {
|
|
100
102
|
"optional": true
|
|
101
103
|
},
|
|
102
|
-
"@
|
|
104
|
+
"@earendil-works/pi-coding-agent": {
|
|
103
105
|
"optional": true
|
|
104
106
|
},
|
|
105
|
-
"@
|
|
107
|
+
"@earendil-works/pi-tui": {
|
|
106
108
|
"optional": true
|
|
107
109
|
}
|
|
108
110
|
},
|
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Benchmark runner - agent-eval inspired benchmarking system.
|
|
3
|
+
* Provides tiered evaluation for workflow tasks.
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import { execSync } from "child_process";
|
|
7
|
+
|
|
8
|
+
export interface BenchmarkJudge {
|
|
9
|
+
type: "pytest" | "grep" | "command";
|
|
10
|
+
command?: string;
|
|
11
|
+
pattern?: string;
|
|
12
|
+
description: string;
|
|
13
|
+
}
|
|
14
|
+
|
|
15
|
+
export interface BenchmarkTask {
|
|
16
|
+
id: string;
|
|
17
|
+
name: string;
|
|
18
|
+
prompt: string;
|
|
19
|
+
judges: BenchmarkJudge[];
|
|
20
|
+
/** Optional task-type label used for aggregate metrics grouping. */
|
|
21
|
+
taskType?: string;
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
export interface BenchmarkResult {
|
|
25
|
+
taskId: string;
|
|
26
|
+
/** Task-type label for aggregation grouping. */
|
|
27
|
+
taskType?: string;
|
|
28
|
+
passed: boolean;
|
|
29
|
+
judgeResults: { description: string; passed: boolean; output?: string }[];
|
|
30
|
+
durationMs: number;
|
|
31
|
+
/** Estimated cost in dollars (0 if not tracked). */
|
|
32
|
+
cost: number;
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
/**
|
|
36
|
+
* Run a single benchmark task with tiered judges.
|
|
37
|
+
* Tier 1: pytest (fast, deterministic)
|
|
38
|
+
* Tier 2: grep pattern matching
|
|
39
|
+
* Tier 3: command execution
|
|
40
|
+
* Fails fast on first tier failure.
|
|
41
|
+
*/
|
|
42
|
+
export async function runBenchmark(task: BenchmarkTask): Promise<BenchmarkResult> {
|
|
43
|
+
const startTime = Date.now();
|
|
44
|
+
const judgeResults: BenchmarkResult["judgeResults"] = [];
|
|
45
|
+
|
|
46
|
+
for (const judge of task.judges) {
|
|
47
|
+
try {
|
|
48
|
+
let passed = false;
|
|
49
|
+
let output: string | undefined;
|
|
50
|
+
|
|
51
|
+
if (judge.type === "pytest" && judge.command) {
|
|
52
|
+
// Tier 1: pytest - fast deterministic check
|
|
53
|
+
output = execSync(judge.command, {
|
|
54
|
+
timeout: 5000,
|
|
55
|
+
encoding: "utf-8",
|
|
56
|
+
cwd: process.cwd(),
|
|
57
|
+
});
|
|
58
|
+
// Look for pytest summary line with passed count
|
|
59
|
+
passed = output.includes("passed");
|
|
60
|
+
} else if (judge.type === "grep" && judge.pattern && judge.command) {
|
|
61
|
+
// Tier 2: grep pattern matching
|
|
62
|
+
output = execSync(judge.command, {
|
|
63
|
+
timeout: 5000,
|
|
64
|
+
encoding: "utf-8",
|
|
65
|
+
cwd: process.cwd(),
|
|
66
|
+
});
|
|
67
|
+
passed = output.includes(judge.pattern);
|
|
68
|
+
} else if (judge.type === "command" && judge.command) {
|
|
69
|
+
// Tier 3: command execution
|
|
70
|
+
output = execSync(judge.command, {
|
|
71
|
+
timeout: 10000,
|
|
72
|
+
encoding: "utf-8",
|
|
73
|
+
cwd: process.cwd(),
|
|
74
|
+
});
|
|
75
|
+
passed = true; // Command succeeded = pass
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
judgeResults.push({ description: judge.description, passed: passed ?? false, output });
|
|
79
|
+
} catch (e: unknown) {
|
|
80
|
+
const error = e as { message?: string };
|
|
81
|
+
judgeResults.push({ description: judge.description, passed: false, output: error.message ?? String(e) });
|
|
82
|
+
}
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
return {
|
|
86
|
+
taskId: task.id,
|
|
87
|
+
passed: judgeResults.every((j) => j.passed),
|
|
88
|
+
judgeResults,
|
|
89
|
+
durationMs: Date.now() - startTime,
|
|
90
|
+
cost: 0,
|
|
91
|
+
taskType: task.taskType,
|
|
92
|
+
};
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
/**
|
|
96
|
+
* Aggregate metrics computed over a group of benchmark results for a single task type.
|
|
97
|
+
*/
|
|
98
|
+
export interface BenchmarkMetrics {
|
|
99
|
+
taskType: string;
|
|
100
|
+
totalTasks: number;
|
|
101
|
+
passedTasks: number;
|
|
102
|
+
/** Ratio of passed/total (0–1). */
|
|
103
|
+
passRate: number;
|
|
104
|
+
/** Mean execution duration in milliseconds. */
|
|
105
|
+
avgTimeMs: number;
|
|
106
|
+
/** Total estimated cost in dollars across all tasks. */
|
|
107
|
+
totalCost: number;
|
|
108
|
+
/** Mean cost in dollars per task. */
|
|
109
|
+
avgCost: number;
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
/**
|
|
113
|
+
* Per-task-type aggregate metrics map.
|
|
114
|
+
* Keys are task-type labels; "__default__" is used when a task has no label.
|
|
115
|
+
*/
|
|
116
|
+
export type AggregateMetrics = Record<string, BenchmarkMetrics>;
|
|
117
|
+
|
|
118
|
+
/**
|
|
119
|
+
* Run multiple benchmark tasks and aggregate results.
|
|
120
|
+
*
|
|
121
|
+
* @param tasks - Benchmark tasks to execute. Each task may carry a `taskType` label.
|
|
122
|
+
* @param taskTypes - Optional subset of task-type labels to run. If provided, only tasks
|
|
123
|
+
* whose `taskType` is in this set will be executed. If omitted, all tasks run.
|
|
124
|
+
*/
|
|
125
|
+
export async function runBenchmarkSuite(
|
|
126
|
+
tasks: BenchmarkTask[],
|
|
127
|
+
taskTypes?: string[],
|
|
128
|
+
): Promise<{
|
|
129
|
+
results: BenchmarkResult[];
|
|
130
|
+
totalPassed: number;
|
|
131
|
+
totalFailed: number;
|
|
132
|
+
totalDurationMs: number;
|
|
133
|
+
totalCost: number;
|
|
134
|
+
}> {
|
|
135
|
+
const filtered = taskTypes
|
|
136
|
+
? tasks.filter((t) => t.taskType && taskTypes.includes(t.taskType))
|
|
137
|
+
: tasks;
|
|
138
|
+
|
|
139
|
+
const results: BenchmarkResult[] = [];
|
|
140
|
+
|
|
141
|
+
for (const task of filtered) {
|
|
142
|
+
const result = await runBenchmark(task);
|
|
143
|
+
results.push(result);
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
const totalPassed = results.filter((r) => r.passed).length;
|
|
147
|
+
const totalFailed = results.length - totalPassed;
|
|
148
|
+
const totalDurationMs = results.reduce((a, b) => a + b.durationMs, 0);
|
|
149
|
+
const totalCost = results.reduce((a, b) => a + b.cost, 0);
|
|
150
|
+
|
|
151
|
+
return { results, totalPassed, totalFailed, totalDurationMs, totalCost };
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
/**
|
|
155
|
+
* Aggregate benchmark results into per-task-type metrics.
|
|
156
|
+
*
|
|
157
|
+
* @param results - Raw benchmark results (may include any task-type mix).
|
|
158
|
+
* @returns A map from task-type label to `BenchmarkMetrics`. Tasks with no label
|
|
159
|
+
* are grouped under `"__default__"`.
|
|
160
|
+
*/
|
|
161
|
+
export function aggregateBenchmarkMetrics(results: BenchmarkResult[]): AggregateMetrics {
|
|
162
|
+
const buckets: Record<string, BenchmarkResult[]> = {};
|
|
163
|
+
|
|
164
|
+
for (const result of results) {
|
|
165
|
+
const key = result.taskType ?? "__default__";
|
|
166
|
+
if (!buckets[key]) buckets[key] = [];
|
|
167
|
+
buckets[key].push(result);
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
const metrics: AggregateMetrics = {};
|
|
171
|
+
|
|
172
|
+
for (const [taskType, group] of Object.entries(buckets)) {
|
|
173
|
+
const totalTasks = group.length;
|
|
174
|
+
const passedTasks = group.filter((r) => r.passed).length;
|
|
175
|
+
const passRate = totalTasks > 0 ? passedTasks / totalTasks : 0;
|
|
176
|
+
const avgTimeMs =
|
|
177
|
+
totalTasks > 0 ? group.reduce((s, r) => s + r.durationMs, 0) / totalTasks : 0;
|
|
178
|
+
const totalCost = group.reduce((s, r) => s + r.cost, 0);
|
|
179
|
+
const avgCost = totalTasks > 0 ? totalCost / totalTasks : 0;
|
|
180
|
+
|
|
181
|
+
metrics[taskType] = {
|
|
182
|
+
taskType,
|
|
183
|
+
totalTasks,
|
|
184
|
+
passedTasks,
|
|
185
|
+
passRate: Math.round(passRate * 1000) / 1000,
|
|
186
|
+
avgTimeMs: Math.round(avgTimeMs),
|
|
187
|
+
totalCost: Math.round(totalCost * 1e6) / 1e6,
|
|
188
|
+
avgCost: Math.round(avgCost * 1e6) / 1e6,
|
|
189
|
+
};
|
|
190
|
+
}
|
|
191
|
+
|
|
192
|
+
return metrics;
|
|
193
|
+
}
|
|
194
|
+
|
|
195
|
+
/**
|
|
196
|
+
* Generate a markdown comparison table for benchmark results including per-type aggregates.
|
|
197
|
+
*
|
|
198
|
+
* @param results - Benchmark results to report.
|
|
199
|
+
* @param includeTaskTypeComparison - When true (default), appends a per-task-type aggregate table.
|
|
200
|
+
*/
|
|
201
|
+
export function generateBenchmarkReport(
|
|
202
|
+
results: BenchmarkResult[],
|
|
203
|
+
includeTaskTypeComparison = true,
|
|
204
|
+
): string {
|
|
205
|
+
const lines: string[] = ["# Benchmark Results", ""];
|
|
206
|
+
|
|
207
|
+
lines.push("| Task | Type | Status | Duration | Cost |");
|
|
208
|
+
lines.push("|------|------|--------|---------|------|");
|
|
209
|
+
|
|
210
|
+
for (const r of results) {
|
|
211
|
+
const status = r.passed ? "✅ PASS" : "❌ FAIL";
|
|
212
|
+
const type = r.taskType ?? "—";
|
|
213
|
+
const cost = r.cost > 0 ? `$${r.cost.toFixed(4)}` : "—";
|
|
214
|
+
lines.push(`| ${r.taskId} | ${type} | ${status} | ${r.durationMs}ms | ${cost} |`);
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
lines.push("");
|
|
218
|
+
|
|
219
|
+
// Per-type aggregate table.
|
|
220
|
+
if (includeTaskTypeComparison && results.length > 0) {
|
|
221
|
+
const metrics = aggregateBenchmarkMetrics(results);
|
|
222
|
+
const types = Object.keys(metrics).sort();
|
|
223
|
+
|
|
224
|
+
if (types.length > 0) {
|
|
225
|
+
lines.push("## Per-Task-Type Comparison", "");
|
|
226
|
+
lines.push("| Task Type | Total | Passed | Pass Rate | Avg Time | Avg Cost |");
|
|
227
|
+
lines.push("|-----------|-------|--------|-----------|----------|---------|");
|
|
228
|
+
|
|
229
|
+
for (const t of types) {
|
|
230
|
+
const m = metrics[t];
|
|
231
|
+
const passRatePct = `${(m.passRate * 100).toFixed(1)}%`;
|
|
232
|
+
const avgCostStr = m.avgCost > 0 ? `$${m.avgCost.toFixed(4)}` : "—";
|
|
233
|
+
lines.push(
|
|
234
|
+
`| ${m.taskType} | ${m.totalTasks} | ${m.passedTasks} | ${passRatePct} | ${m.avgTimeMs}ms | ${avgCostStr} |`,
|
|
235
|
+
);
|
|
236
|
+
}
|
|
237
|
+
}
|
|
238
|
+
}
|
|
239
|
+
|
|
240
|
+
const passed = results.filter((r) => r.passed).length;
|
|
241
|
+
lines.push("");
|
|
242
|
+
lines.push(`**Total: ${passed}/${results.length} passed**`);
|
|
243
|
+
|
|
244
|
+
return lines.join("\n");
|
|
245
|
+
}
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Feedback loop - continuous improvement cycle: evaluate → learn → apply → re-evaluate
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import type { RunMetrics } from "../state/run-metrics.ts";
|
|
6
|
+
|
|
7
|
+
export interface FeedbackLoopStats {
|
|
8
|
+
runsObserved: number;
|
|
9
|
+
avgSuccessRate: number;
|
|
10
|
+
recommendations: string[];
|
|
11
|
+
}
|
|
12
|
+
|
|
13
|
+
export class FeedbackLoop {
|
|
14
|
+
private runs: RunMetrics[] = [];
|
|
15
|
+
|
|
16
|
+
/**
|
|
17
|
+
* Record a run's metrics for learning.
|
|
18
|
+
*/
|
|
19
|
+
recordRun(metrics: RunMetrics): void {
|
|
20
|
+
this.runs.push(metrics);
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
/**
|
|
24
|
+
* Get current statistics and recommendations.
|
|
25
|
+
*/
|
|
26
|
+
getStats(): FeedbackLoopStats {
|
|
27
|
+
if (this.runs.length === 0) {
|
|
28
|
+
return {
|
|
29
|
+
runsObserved: 0,
|
|
30
|
+
avgSuccessRate: 0,
|
|
31
|
+
recommendations: ["No runs observed yet. Run some workflows to gather data."],
|
|
32
|
+
};
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
const successRates = this.runs.map((r) => (r.taskCount > 0 ? r.completedCount / r.taskCount : 0));
|
|
36
|
+
const avg = successRates.reduce((a, b) => a + b, 0) / successRates.length;
|
|
37
|
+
|
|
38
|
+
const recommendations: string[] = [];
|
|
39
|
+
if (avg >= 0.9) {
|
|
40
|
+
recommendations.push(`High success rate (${(avg * 100).toFixed(0)}%). Current configuration is working well.`);
|
|
41
|
+
} else if (avg >= 0.7) {
|
|
42
|
+
recommendations.push(`Moderate success rate (${(avg * 100).toFixed(0)}%). Consider reviewing failed tasks for patterns.`);
|
|
43
|
+
} else {
|
|
44
|
+
recommendations.push(`Low success rate (${(avg * 100).toFixed(0)}%). Investigate failure patterns.`);
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
// Cost awareness
|
|
48
|
+
const avgCost = this.runs.reduce((a, b) => a + b.totalCost, 0) / this.runs.length;
|
|
49
|
+
if (avgCost > 10) {
|
|
50
|
+
recommendations.push(`Average cost per run: $${avgCost.toFixed(2)}. Consider optimization.`);
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
return {
|
|
54
|
+
runsObserved: this.runs.length,
|
|
55
|
+
avgSuccessRate: avg,
|
|
56
|
+
recommendations,
|
|
57
|
+
};
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
/**
|
|
61
|
+
* Clear recorded runs.
|
|
62
|
+
*/
|
|
63
|
+
clear(): void {
|
|
64
|
+
this.runs = [];
|
|
65
|
+
}
|
|
66
|
+
}
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import type { ExtensionContext } from "@
|
|
1
|
+
import type { ExtensionContext } from "@earendil-works/pi-coding-agent";
|
|
2
2
|
import { appendEvent, readEvents, type TeamEvent } from "../state/event-log.ts";
|
|
3
3
|
import { checkProcessLiveness, isActiveRunStatus } from "../runtime/process-status.ts";
|
|
4
4
|
import { loadRunManifestById, saveRunTasks, updateRunStatus } from "../state/state-store.ts";
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import type { BeforeAgentStartEvent, ExtensionAPI } from "@
|
|
1
|
+
import type { BeforeAgentStartEvent, ExtensionAPI } from "@earendil-works/pi-coding-agent";
|
|
2
2
|
import { effectiveAutonomousConfig, loadConfig, type PiTeamsAutonomousConfig } from "../config/config.ts";
|
|
3
3
|
import { allAgents, discoverAgents } from "../agents/discover-agents.ts";
|
|
4
4
|
import { allTeams, discoverTeams } from "../teams/discover-teams.ts";
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import type { ExtensionContext } from "@
|
|
1
|
+
import type { ExtensionContext } from "@earendil-works/pi-coding-agent";
|
|
2
2
|
import type { TeamToolParamsValue } from "../schema/team-tool-schema.ts";
|
|
3
3
|
// Lazy-loaded to avoid pulling team-tool.ts (and its entire runtime chain) into module load.
|
|
4
4
|
import type { handleTeamTool as HandleTeamToolFn } from "./team-tool.ts";
|