pi-crew 0.5.0 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +51 -1
- package/README.md +1 -1
- package/docs/actions-reference.md +87 -0
- package/docs/commands-reference.md +5 -0
- package/docs/pi-crew-bugs.md +6 -0
- package/index.ts +1 -1
- package/package.json +18 -16
- package/src/benchmark/benchmark-runner.ts +245 -0
- package/src/benchmark/feedback-loop.ts +66 -0
- package/src/extension/async-notifier.ts +1 -1
- package/src/extension/autonomous-policy.ts +1 -1
- package/src/extension/cross-extension-rpc.ts +1 -1
- package/src/extension/plan-orchestrate.ts +322 -0
- package/src/extension/register.ts +31 -41
- package/src/extension/registration/command-utils.ts +1 -1
- package/src/extension/registration/commands.ts +1 -1
- package/src/extension/registration/compaction-guard.ts +1 -1
- package/src/extension/registration/subagent-helpers.ts +1 -1
- package/src/extension/registration/subagent-tools.ts +1 -1
- package/src/extension/registration/team-tool.ts +1 -1
- package/src/extension/registration/viewers.ts +1 -1
- package/src/extension/session-summary.ts +1 -1
- package/src/extension/team-manager-command.ts +1 -1
- package/src/extension/team-onboard.ts +1 -3
- package/src/extension/team-tool/context.ts +1 -1
- package/src/extension/team-tool/handle-schedule.ts +183 -0
- package/src/extension/team-tool/orchestrate.ts +102 -0
- package/src/extension/team-tool/run.ts +215 -28
- package/src/extension/team-tool.ts +115 -0
- package/src/extension/tool-result.ts +1 -1
- package/src/i18n.ts +1 -1
- package/src/observability/event-to-metric.ts +1 -1
- package/src/prompt/prompt-runtime.ts +1 -1
- package/src/runtime/background-runner.ts +27 -5
- package/src/runtime/crash-recovery.ts +1 -1
- package/src/runtime/crew-hooks.ts +240 -0
- package/src/runtime/custom-tools/irc-tool.ts +1 -1
- package/src/runtime/custom-tools/submit-result-tool.ts +1 -1
- package/src/runtime/diagnostic-export.ts +38 -2
- package/src/runtime/foreground-watchdog.ts +1 -1
- package/src/runtime/live-session-runtime.ts +1 -1
- package/src/runtime/mcp-proxy.ts +1 -1
- package/src/runtime/pi-spawn.ts +20 -4
- package/src/runtime/process-status.ts +15 -2
- package/src/runtime/runtime-resolver.ts +1 -1
- package/src/runtime/session-resources.ts +1 -1
- package/src/runtime/task-runner.ts +31 -1
- package/src/runtime/team-runner.ts +6 -0
- package/src/schema/team-tool-schema.ts +36 -1
- package/src/state/crew-init.ts +56 -38
- package/src/state/decision-ledger.ts +295 -0
- package/src/state/hook-instinct-bridge.ts +90 -0
- package/src/state/hook-integrations.ts +51 -0
- package/src/state/instinct-store.ts +249 -0
- package/src/state/run-graph.ts +5 -24
- package/src/state/run-metrics.ts +135 -0
- package/src/state/tiered-eval.ts +471 -0
- package/src/state/types-eval.ts +58 -0
- package/src/state/types.ts +3 -0
- package/src/tools/safe-bash-extension.ts +5 -5
- package/src/ui/crew-widget.ts +1 -1
- package/src/ui/pi-ui-compat.ts +1 -1
- package/src/ui/run-action-dispatcher.ts +1 -1
- package/src/ui/tool-render.ts +2 -2
- package/src/utils/bm25-search.ts +0 -2
- package/src/utils/project-detector.ts +160 -0
- package/test-bugs-all.mjs +1 -1
- package/skills/.gitkeep +0 -0
- package/skills/REFERENCE.md +0 -136
|
@@ -0,0 +1,471 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tiered Evaluation System
|
|
3
|
+
*
|
|
4
|
+
* Inspired by agent-eval's judge tiers, this module provides a hierarchical
|
|
5
|
+
* evaluation system where checks are grouped by computational cost and reliability:
|
|
6
|
+
*
|
|
7
|
+
* - Tier 1 (deterministic): Fast checks (~1s timeout) - file exists, parse errors, etc.
|
|
8
|
+
* - Tier 2 (pattern): Medium checks (~5s timeout) - grep, regex, structural checks
|
|
9
|
+
* - Tier 3 (llm): Expensive checks (~60s timeout) - LLM-based evaluation
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
import type { EvalTier, TierConfig, EvalResult } from "./types-eval.ts";
|
|
13
|
+
|
|
14
|
+
/**
 * Built-in settings for each evaluation tier, keyed by tier number.
 *
 * The timeout budget grows with the tier: 1s for tier 1, 5s for tier 2 and
 * 60s for tier 3.
 */
export const TIER_CONFIGS: Record<EvalTier, TierConfig> = {
  // Tier 1: cheap, deterministic checks.
  1: { tier: 1, name: "deterministic", description: "File exists, parse errors, fast checks", timeoutMs: 1000 },
  // Tier 2: pattern and structural checks.
  2: { tier: 2, name: "pattern", description: "grep, regex, structural checks", timeoutMs: 5000 },
  // Tier 3: LLM-backed checks.
  3: { tier: 3, name: "llm", description: "LLM-based evaluation", timeoutMs: 60000 },
};
|
|
37
|
+
|
|
38
|
+
/**
 * Alias of `TIER_CONFIGS`, exported under a second name for convenience.
 * Both names refer to the same object.
 */
export const DEFAULT_TIER_CONFIGS = TIER_CONFIGS;
|
|
42
|
+
|
|
43
|
+
export type { EvalTier, TierConfig, EvalResult };
|
|
44
|
+
|
|
45
|
+
/**
 * A single evaluation check together with the tier it is assigned to.
 *
 * @typeParam T - Shape of the optional metadata attached to the check.
 */
export interface EvalCheck<T = unknown> {
  /** Tier this check belongs to. */
  tier: EvalTier;
  /** The check itself; returns (or resolves to) `true` when it passes. */
  check: () => Promise<boolean> | boolean;
  /** Optional caller-defined metadata describing the check. */
  metadata?: T;
}
|
|
56
|
+
|
|
57
|
+
/**
 * Options accepted by the `TieredEvalRunner` constructor.
 */
export interface TieredEvalRunnerConfig {
  /**
   * Per-tier overrides of the default tier configurations.
   *
   * Each override is merged field by field over the default for that tier, so
   * a partial entry such as `{ timeoutMs: 120000 }` is sufficient; complete
   * `TierConfig` objects are accepted as well.
   */
  tierConfigs?: Partial<Record<EvalTier, Partial<TierConfig>>>;
  /** Factor applied to every tier's `timeoutMs` (default: 1.0). */
  timeoutMultiplier?: number;
  /** Whether checks are sorted by ascending tier before execution (default: true). */
  sortByTier?: boolean;
  /** Text reported as the `message` of a check that timed out. */
  timeoutMessage?: string;
}
|
|
70
|
+
|
|
71
|
+
/**
 * Outcome of one check as produced by `TieredEvalRunner`: an `EvalResult`
 * extended with an error description and the position of the check.
 */
export interface CheckResult extends EvalResult {
  /** `true` only when the check function returned `true`. */
  passed: boolean;
  /** Reason for the failure when the check failed, threw, or timed out. */
  error?: string;
  /** Check index in the original array. */
  index: number;
}
|
|
82
|
+
|
|
83
|
+
/**
 * Aggregated outcome of an evaluation that ran several checks.
 */
export interface TieredEvalResult {
  /** `true` when every executed check passed. */
  passed: boolean;
  /** One entry per executed check. */
  results: CheckResult[];
  /** Sum of the durations of the executed checks, in milliseconds. */
  totalDurationMs: number;
  /** Tier at which the evaluation failed (if any). */
  failedAtTier?: EvalTier;
  /** Index of the first failing check (if any). */
  failedAtIndex?: number;
}
|
|
98
|
+
|
|
99
|
+
/**
 * Runs evaluation checks grouped into cost tiers, applying the timeout of a
 * check's tier to that check.
 *
 * Entry points:
 * - `runTieredEval`: runs every check and returns the per-check results
 * - `runTieredEvalFailFast`: stops after the first failing check
 * - `runEval`: same execution, but returns a structured `TieredEvalResult`
 * - `runTieredEvalParallel`: runs the checks of one tier concurrently
 *
 * A check counts as failed when it yields anything other than `true`, when it
 * throws (synchronously or asynchronously), or when it exceeds its tier
 * timeout. A timed-out check is not cancelled; its eventual outcome is ignored.
 *
 * Every result carries `index`, the position of the check in the array that
 * the caller passed in, regardless of the order in which checks were executed.
 *
 * @example
 * ```typescript
 * const runner = new TieredEvalRunner();
 *
 * // Run all checks
 * const allResults = await runner.runTieredEval('task-1', [
 *   { tier: 1, check: () => fs.existsSync('output.json') },
 *   { tier: 2, check: async () => (await run('grep', ['pattern', 'output.json'])).exitCode === 0 }
 * ]);
 *
 * // Fail-fast mode
 * const failFastResult = await runner.runTieredEvalFailFast('task-2', [
 *   { tier: 1, check: () => fs.existsSync('output.json') },
 *   { tier: 2, check: async () => (await run('grep', ['pattern', 'output.json'])).exitCode === 0 }
 * ]);
 * ```
 */
export class TieredEvalRunner {
  private readonly tierConfigs: Record<EvalTier, TierConfig>;
  private readonly timeoutMultiplier: number;
  private readonly sortByTier: boolean;
  private readonly timeoutMessage: string;

  /**
   * Creates a new TieredEvalRunner instance.
   *
   * @param config - Optional configuration; omitted fields fall back to the
   *   defaults (multiplier 1.0, sorting enabled, "Evaluation timed out").
   */
  constructor(config?: TieredEvalRunnerConfig) {
    this.tierConfigs = TieredEvalRunner.mergeTierConfigs(TIER_CONFIGS, config?.tierConfigs);
    this.timeoutMultiplier = config?.timeoutMultiplier ?? 1.0;
    this.sortByTier = config?.sortByTier ?? true;
    this.timeoutMessage = config?.timeoutMessage ?? "Evaluation timed out";
  }

  /**
   * Returns a copy of `base` with each override merged field by field over the
   * matching tier. Neither argument is mutated.
   *
   * @param base - Complete configuration for every tier
   * @param overrides - Per-tier overrides; entries may be partial
   * @returns A new record holding the merged configurations
   */
  private static mergeTierConfigs(
    base: Record<EvalTier, TierConfig>,
    overrides: Partial<Record<EvalTier, Partial<TierConfig>>> | undefined,
  ): Record<EvalTier, TierConfig> {
    const merged = { ...base };
    if (!overrides) {
      return merged;
    }

    for (const [tierKey, override] of Object.entries(overrides)) {
      const tier = Number(tierKey) as EvalTier;
      // Keys that do not name an existing tier (including non-numeric keys,
      // which become NaN) are ignored instead of creating stray entries.
      if (override && tier in merged) {
        merged[tier] = { ...merged[tier], ...override };
      }
    }
    return merged;
  }

  /**
   * Gets the effective timeout for a given tier.
   *
   * @param tier - The evaluation tier
   * @returns The timeout in milliseconds (after multiplier is applied)
   */
  getTimeout(tier: EvalTier): number {
    return this.tierConfigs[tier].timeoutMs * this.timeoutMultiplier;
  }

  /**
   * Gets the configuration for a specific tier.
   *
   * @param tier - The evaluation tier
   * @returns The tier configuration, including any overrides given at construction
   */
  getTierConfig(tier: EvalTier): TierConfig {
    return this.tierConfigs[tier];
  }

  /**
   * Runs one check, racing it against the timeout of its tier.
   *
   * The returned promise never rejects: a throwing or rejecting check is
   * reported as a failed result. The timer is always cleared once the race is
   * decided, so no timer outlives the call.
   *
   * @param check - The check function to run
   * @param tier - The tier this check belongs to
   * @param index - Value stored in the result's `index` field
   * @returns The result of the check
   */
  private async runCheckWithTimeout(
    check: () => Promise<boolean> | boolean,
    tier: EvalTier,
    index = -1,
  ): Promise<CheckResult> {
    const timeout = this.getTimeout(tier);
    const startTime = Date.now();

    let timeoutHandle: ReturnType<typeof setTimeout> | undefined;
    const timedOut = new Promise<CheckResult>((resolve) => {
      timeoutHandle = setTimeout(() => {
        resolve({
          tier,
          passed: false,
          durationMs: timeout,
          message: this.timeoutMessage,
          error: `Check timed out after ${timeout}ms`,
          index,
        });
      }, timeout);
    });

    // Calling check() inside an async function routes a synchronous throw
    // through the same catch block as a rejected promise.
    const settled = (async (): Promise<CheckResult> => {
      try {
        const result = await check();
        return {
          tier,
          passed: result === true,
          durationMs: Date.now() - startTime,
          index,
          error: result !== true ? "Check returned false" : undefined,
        };
      } catch (error: unknown) {
        const reason = error instanceof Error ? error.message : String(error);
        return {
          tier,
          passed: false,
          durationMs: Date.now() - startTime,
          message: reason,
          error: reason,
          index,
        };
      }
    })();

    try {
      return await Promise.race([settled, timedOut]);
    } finally {
      clearTimeout(timeoutHandle);
    }
  }

  /**
   * Runs the checks one after another and collects a structured result.
   * Shared implementation of `runTieredEval`, `runTieredEvalFailFast` and
   * `runEval`.
   *
   * @param checks - Checks to run
   * @param failFast - Whether to stop after the first failing check
   * @returns Results in execution order, each tagged with its original index
   */
  private async runSequential(
    checks: Array<{ tier: EvalTier; check: () => Promise<boolean> | boolean }>,
    failFast: boolean,
  ): Promise<TieredEvalResult> {
    // Remember every check's original position before any reordering.
    const ordered = checks.map((entry, originalIndex) => ({
      tier: entry.tier,
      check: entry.check,
      originalIndex,
    }));
    if (this.sortByTier) {
      // `ordered` is a fresh array, so sorting in place leaves `checks` untouched.
      ordered.sort((a, b) => a.tier - b.tier);
    }

    const results: CheckResult[] = [];
    let totalDurationMs = 0;

    for (const { tier, check, originalIndex } of ordered) {
      const result = await this.runCheckWithTimeout(check, tier, originalIndex);
      results.push(result);
      totalDurationMs += result.durationMs;

      if (failFast && !result.passed) {
        return {
          passed: false,
          results,
          totalDurationMs,
          failedAtTier: tier,
          failedAtIndex: originalIndex,
        };
      }
    }

    return {
      passed: results.every((r) => r.passed),
      results,
      totalDurationMs,
    };
  }

  /**
   * Runs all evaluation checks and returns results for each.
   *
   * Checks run sequentially, lower tiers first unless `sortByTier` is disabled.
   *
   * @param taskId - Identifier for the task being evaluated (currently unused)
   * @param checks - Array of checks to run, each with a tier assignment
   * @returns One result per check, in execution order
   *
   * @example
   * ```typescript
   * const results = await runner.runTieredEval('task-123', [
   *   { tier: 1, check: () => fs.existsSync('output.json') },
   *   { tier: 2, check: async () => (await grep('output.json', 'pattern')).found },
   *   { tier: 3, check: async () => llmJudge.evaluate(output) }
   * ]);
   *
   * // Check if all passed
   * const allPassed = results.every(r => r.passed);
   * ```
   */
  async runTieredEval(
    taskId: string,
    checks: Array<{ tier: EvalTier; check: () => Promise<boolean> | boolean }>,
  ): Promise<EvalResult[]> {
    const outcome = await this.runSequential(checks, false);
    return outcome.results;
  }

  /**
   * Runs evaluation checks in fail-fast mode, stopping at the first failure.
   *
   * This is useful for promotion gates where cheaper checks should run first
   * and any failure should stop the evaluation immediately.
   *
   * @param taskId - Identifier for the task being evaluated (currently unused)
   * @param checks - Array of checks to run, each with a tier assignment
   * @returns Results in execution order (shorter than the input if a check failed early)
   *
   * @example
   * ```typescript
   * const results = await runner.runTieredEvalFailFast('task-123', [
   *   { tier: 1, check: () => fs.existsSync('output.json') },
   *   { tier: 2, check: async () => (await grep('output.json', 'pattern')).found },
   *   { tier: 3, check: async () => llmJudge.evaluate(output) }
   * ]);
   *
   * if (results.length < checks.length) {
   *   console.log(`Failed at tier ${results[results.length - 1].tier}`);
   * }
   * ```
   */
  async runTieredEvalFailFast(
    taskId: string,
    checks: Array<{ tier: EvalTier; check: () => Promise<boolean> | boolean }>,
  ): Promise<EvalResult[]> {
    const outcome = await this.runSequential(checks, true);
    return outcome.results;
  }

  /**
   * Runs evaluation checks and returns a structured result object.
   *
   * `failedAtIndex` and every result's `index` refer to positions in `checks`
   * as passed in, not to the execution order.
   *
   * @param taskId - Identifier for the task being evaluated (currently unused)
   * @param checks - Array of checks to run, each with a tier assignment
   * @param failFast - Whether to stop at first failure (default: false)
   * @returns Structured evaluation result with metadata
   */
  async runEval(
    taskId: string,
    checks: Array<{ tier: EvalTier; check: () => Promise<boolean> | boolean }>,
    failFast = false,
  ): Promise<TieredEvalResult> {
    return this.runSequential(checks, failFast);
  }

  /**
   * Runs checks in parallel within each tier, but sequentially across tiers.
   *
   * Tiers are always processed in ascending order; `sortByTier` has no effect
   * here. `totalDurationMs` is the sum of the individual check durations, so
   * it can exceed the elapsed wall-clock time. Results are returned sorted by
   * original index on both the normal and the fail-fast path.
   *
   * @param taskId - Identifier for the task being evaluated (currently unused)
   * @param checks - Array of checks to run
   * @param failFast - Whether to stop after the first tier containing a failure (default: false)
   * @returns Structured evaluation result
   */
  async runTieredEvalParallel(
    taskId: string,
    checks: Array<{ tier: EvalTier; check: () => Promise<boolean> | boolean }>,
    failFast = false,
  ): Promise<TieredEvalResult> {
    // Group checks by tier, keeping each check's original position.
    const checksByTier = new Map<
      EvalTier,
      Array<{ check: () => Promise<boolean> | boolean; originalIndex: number }>
    >();
    checks.forEach((entry, originalIndex) => {
      const bucket = checksByTier.get(entry.tier);
      if (bucket) {
        bucket.push({ check: entry.check, originalIndex });
      } else {
        checksByTier.set(entry.tier, [{ check: entry.check, originalIndex }]);
      }
    });

    const results: CheckResult[] = [];
    let totalDurationMs = 0;
    const byOriginalIndex = (a: CheckResult, b: CheckResult): number => a.index - b.index;

    // Visit the tiers that actually occur, cheapest first.
    const tiersInOrder = [...checksByTier.keys()].sort((a, b) => a - b);
    for (const tier of tiersInOrder) {
      const tierChecks = checksByTier.get(tier) ?? [];

      // Run all checks for this tier in parallel.
      const tierResults = await Promise.all(
        tierChecks.map(({ check, originalIndex }) => this.runCheckWithTimeout(check, tier, originalIndex)),
      );

      for (const result of tierResults) {
        results.push(result);
        totalDurationMs += result.durationMs;
      }

      const firstFailure = tierResults.find((r) => !r.passed);
      if (failFast && firstFailure) {
        results.sort(byOriginalIndex);
        return {
          passed: false,
          results,
          totalDurationMs,
          failedAtTier: tier,
          failedAtIndex: firstFailure.index,
        };
      }
    }

    results.sort(byOriginalIndex);
    return {
      passed: results.every((r) => r.passed),
      results,
      totalDurationMs,
    };
  }

  /**
   * Creates a new runner with overridden tier configurations.
   *
   * The overrides are applied on top of this runner's current tier
   * configurations, so customisations made earlier are kept. The timeout
   * multiplier, sort setting and timeout message are carried over unchanged.
   *
   * @param overrides - Tier configurations to override
   * @returns A new TieredEvalRunner instance; this instance is not modified
   */
  withConfig(overrides: Partial<Record<EvalTier, TierConfig>>): TieredEvalRunner {
    return new TieredEvalRunner({
      tierConfigs: TieredEvalRunner.mergeTierConfigs(this.tierConfigs, overrides),
      timeoutMultiplier: this.timeoutMultiplier,
      sortByTier: this.sortByTier,
      timeoutMessage: this.timeoutMessage,
    });
  }
}
|
|
447
|
+
|
|
448
|
+
/**
 * Convenience factory: constructs a `TieredEvalRunner` from the given
 * configuration, or with the defaults when none is passed.
 *
 * @param config - Optional configuration overrides
 * @returns A new TieredEvalRunner instance
 *
 * @example
 * ```typescript
 * const runner = createRunner({
 *   timeoutMultiplier: 2.0, // Double all timeouts
 *   tierConfigs: {
 *     // Start from the default so the entry is a complete TierConfig
 *     3: { ...TIER_CONFIGS[3], timeoutMs: 120000 } // 2 minutes for LLM tier
 *   }
 * });
 * ```
 */
export function createRunner(config?: TieredEvalRunnerConfig): TieredEvalRunner {
  return new TieredEvalRunner(config);
}
|
|
467
|
+
|
|
468
|
+
/**
 * Shared runner instance constructed without any configuration overrides.
 */
export const defaultRunner = new TieredEvalRunner();
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Types for the Tiered Evaluation System
|
|
3
|
+
*
|
|
4
|
+
* Inspired by agent-eval's judge tiers for hierarchical evaluation.
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
/**
 * Evaluation tier, ordered by increasing computational cost:
 *
 * - `1`: deterministic, fast checks (file existence, parse errors)
 * - `2`: pattern matching and structural checks (grep, regex)
 * - `3`: LLM-based evaluation for natural language checks
 */
export type EvalTier = 1 | 2 | 3;
|
|
15
|
+
|
|
16
|
+
/**
 * Settings that describe one evaluation tier.
 */
export interface TierConfig {
  /** Tier level these settings apply to. */
  tier: EvalTier;
  /** Human-readable tier name. */
  name: string;
  /** Short description of what the tier evaluates. */
  description: string;
  /** Maximum time allowed for a check in this tier, in milliseconds. */
  timeoutMs: number;
}
|
|
29
|
+
|
|
30
|
+
/**
 * Outcome of a single evaluation check.
 */
export interface EvalResult {
  /** Tier the check belonged to. */
  tier: EvalTier;
  /** Whether the check passed. */
  passed: boolean;
  /** Optional message giving additional context. */
  message?: string;
  /** Time the check took, in milliseconds. */
  durationMs: number;
}
|
|
43
|
+
|
|
44
|
+
/**
 * Aggregate counts for a tiered evaluation run.
 */
export interface TieredEvalSummary {
  /** Number of checks that passed. */
  passed: number;
  /** Number of checks that failed. */
  failed: number;
  /** Number of checks that timed out. */
  timedOut: number;
  /** Total duration of all checks. */
  totalDurationMs: number;
  /** Whether every check passed. */
  allPassed: boolean;
}
|
package/src/state/types.ts
CHANGED
|
@@ -2,6 +2,9 @@ import type { TeamRunStatus, TeamTaskStatus } from "./contracts.ts";
|
|
|
2
2
|
import type { TaskClaimState } from "./task-claims.ts";
|
|
3
3
|
import type { WorkerHeartbeatState } from "../runtime/worker-heartbeat.ts";
|
|
4
4
|
import type { CrewAgentProgress } from "../runtime/crew-agent-runtime.ts";
|
|
5
|
+
import type { RolloutEntry, CoherenceMark } from "./decision-ledger.ts";
|
|
6
|
+
export type { RolloutEntry, CoherenceMark };
|
|
7
|
+
|
|
5
8
|
export type { TeamRunStatus, TeamTaskStatus } from "./contracts.ts";
|
|
6
9
|
|
|
7
10
|
export interface ArtifactDescriptor {
|
|
@@ -8,10 +8,9 @@
|
|
|
8
8
|
* 3. Or set env var: PI_CREW_SAFE_BASH=true
|
|
9
9
|
*/
|
|
10
10
|
|
|
11
|
-
import type { ExtensionAPI } from "@
|
|
12
|
-
import { createBashTool } from "@
|
|
11
|
+
import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";
|
|
12
|
+
import { createBashTool } from "@earendil-works/pi-coding-agent";
|
|
13
13
|
import { Type } from "@sinclair/typebox";
|
|
14
|
-
import * as path from "node:path";
|
|
15
14
|
|
|
16
15
|
// Dangerous command patterns to block
|
|
17
16
|
const DANGEROUS_PATTERNS = [
|
|
@@ -59,7 +58,7 @@ function isDangerous(command: string): string | null {
|
|
|
59
58
|
}
|
|
60
59
|
|
|
61
60
|
export default function safeBashExtension(pi: ExtensionAPI): void {
|
|
62
|
-
const cwd =
|
|
61
|
+
const cwd = process.cwd();
|
|
63
62
|
const bashTool = createBashTool(cwd);
|
|
64
63
|
|
|
65
64
|
pi.registerTool({
|
|
@@ -76,10 +75,11 @@ export default function safeBashExtension(pi: ExtensionAPI): void {
|
|
|
76
75
|
Type.String({ description: "Description of what this command does (optional)" }),
|
|
77
76
|
),
|
|
78
77
|
}),
|
|
79
|
-
async execute(toolCallId, params, signal, onUpdate) {
|
|
78
|
+
async execute(toolCallId, params, signal, onUpdate, ctx) {
|
|
80
79
|
const danger = isDangerous(params.command);
|
|
81
80
|
if (danger) {
|
|
82
81
|
return {
|
|
82
|
+
details: {},
|
|
83
83
|
content: [
|
|
84
84
|
{
|
|
85
85
|
type: "text" as const,
|
package/src/ui/crew-widget.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import type { ExtensionContext } from "@
|
|
1
|
+
import type { ExtensionContext } from "@earendil-works/pi-coding-agent";
|
|
2
2
|
import type { CrewUiConfig } from "../config/config.ts";
|
|
3
3
|
import { listRecentRuns } from "../extension/run-index.ts";
|
|
4
4
|
import { readCrewAgents } from "../runtime/crew-agent-records.ts";
|
package/src/ui/pi-ui-compat.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import type { ExtensionContext } from "@
|
|
1
|
+
import type { ExtensionContext } from "@earendil-works/pi-coding-agent";
|
|
2
2
|
import type { MetricRegistry } from "../observability/metric-registry.ts";
|
|
3
3
|
// Lazy-loaded: team-tool.ts pulls in entire runtime chain.
|
|
4
4
|
import type { handleTeamTool as HandleTeamToolFn } from "../extension/team-tool.ts";
|
package/src/ui/tool-render.ts
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Shared rendering for pi-crew's tool TUI display.
|
|
3
3
|
* Ports logic from pi-subagent4 adapted for pi-crew's data model.
|
|
4
|
-
* Uses @
|
|
4
|
+
* Uses @earendil-works/pi-tui Components (Container, Text, Spacer) directly.
|
|
5
5
|
*/
|
|
6
|
-
import { Container, Spacer, Text, visibleWidth } from "@
|
|
6
|
+
import { Container, Spacer, Text, visibleWidth } from "@earendil-works/pi-tui";
|
|
7
7
|
import type { CrewAgentRecord } from "../runtime/crew-agent-runtime.ts";
|
|
8
8
|
|
|
9
9
|
// ── Types ──────────────────────────────────────────────────────────────
|
package/src/utils/bm25-search.ts
CHANGED
|
@@ -140,7 +140,6 @@ export async function searchAgents(query: string, options?: { limit?: number }):
|
|
|
140
140
|
name: agent.name,
|
|
141
141
|
description: agent.description ?? "",
|
|
142
142
|
skills: (agent.skills ?? []).join(" "),
|
|
143
|
-
tags: (agent.tags ?? []).join(" "),
|
|
144
143
|
},
|
|
145
144
|
agent,
|
|
146
145
|
}));
|
|
@@ -149,7 +148,6 @@ export async function searchAgents(query: string, options?: { limit?: number }):
|
|
|
149
148
|
name: 3.0,
|
|
150
149
|
description: 1.5,
|
|
151
150
|
skills: 1.0,
|
|
152
|
-
tags: 1.0,
|
|
153
151
|
});
|
|
154
152
|
|
|
155
153
|
const results = engine.search(query, {
|