pi-crew 0.5.2 → 0.5.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +183 -0
- package/README.md +17 -1
- package/docs/architecture.md +2 -0
- package/docs/bugs/cross-session-notification-leakage.md +82 -0
- package/docs/coding-agent-optimization.md +268 -0
- package/docs/deep-review-report.md +384 -0
- package/docs/distillation/cybersecurity-patterns.md +294 -0
- package/docs/migration-v0.4-v0.5.md +208 -0
- package/docs/optimization-plan.md +642 -0
- package/docs/pi-crew-v0.5.5-audit-fix-plan.md +133 -0
- package/docs/pi-mono-opportunities.md +969 -0
- package/docs/pi-mono-review.md +291 -0
- package/docs/skills/REFERENCE.md +144 -0
- package/package.json +12 -9
- package/skills/artifact-analysis-loop/SKILL.md +302 -0
- package/skills/async-worker-recovery/SKILL.md +19 -1
- package/skills/child-pi-spawning/SKILL.md +19 -6
- package/skills/context-artifact-hygiene/SKILL.md +19 -2
- package/skills/delegation-patterns/SKILL.md +68 -3
- package/skills/detection-pipeline-design/SKILL.md +285 -0
- package/skills/event-log-tracing/SKILL.md +20 -6
- package/skills/git-master/SKILL.md +20 -6
- package/skills/hunting-investigation-loop/SKILL.md +401 -0
- package/skills/incident-playbook-construction/SKILL.md +383 -0
- package/skills/live-agent-lifecycle/SKILL.md +20 -6
- package/skills/mailbox-interactive/SKILL.md +19 -6
- package/skills/model-routing-context/SKILL.md +19 -1
- package/skills/multi-perspective-review/SKILL.md +19 -4
- package/skills/observability-reliability/SKILL.md +19 -2
- package/skills/orchestration/SKILL.md +20 -2
- package/skills/ownership-session-security/SKILL.md +20 -2
- package/skills/pi-extension-lifecycle/SKILL.md +20 -2
- package/skills/post-mortem/SKILL.md +7 -2
- package/skills/read-only-explorer/SKILL.md +20 -6
- package/skills/requirements-to-task-packet/SKILL.md +23 -3
- package/skills/resource-discovery-config/SKILL.md +20 -2
- package/skills/runtime-state-reader/SKILL.md +20 -2
- package/skills/safe-bash/SKILL.md +21 -6
- package/skills/scrutinize/SKILL.md +20 -2
- package/skills/secure-agent-orchestration-review/SKILL.md +29 -2
- package/skills/security-review/SKILL.md +560 -0
- package/skills/state-mutation-locking/SKILL.md +22 -2
- package/skills/systematic-debugging/SKILL.md +8 -6
- package/skills/threat-hypothesis-framework/SKILL.md +175 -0
- package/skills/ui-render-performance/SKILL.md +20 -2
- package/skills/verification-before-done/SKILL.md +17 -2
- package/skills/widget-rendering/SKILL.md +21 -6
- package/skills/workspace-isolation/SKILL.md +20 -6
- package/skills/worktree-isolation/SKILL.md +20 -6
- package/src/agents/agent-config.ts +40 -1
- package/src/benchmark/benchmark-runner.ts +45 -0
- package/src/benchmark/feedback-loop.ts +5 -0
- package/src/config/config.ts +32 -5
- package/src/config/role-tools.ts +82 -0
- package/src/config/suggestions.ts +8 -0
- package/src/config/types.ts +4 -0
- package/src/extension/async-notifier.ts +10 -1
- package/src/extension/crew-cleanup.ts +114 -0
- package/src/extension/cross-extension-rpc.ts +1 -1
- package/src/extension/notification-router.ts +18 -0
- package/src/extension/register.ts +27 -19
- package/src/extension/registration/subagent-tools.ts +1 -1
- package/src/extension/team-tool/anchor.ts +201 -0
- package/src/extension/team-tool/api.ts +2 -1
- package/src/extension/team-tool/auto-summarize.ts +154 -0
- package/src/extension/team-tool/run.ts +42 -7
- package/src/extension/team-tool.ts +44 -2
- package/src/hooks/registry.ts +1 -3
- package/src/observability/event-bus.ts +69 -0
- package/src/observability/event-to-metric.ts +0 -2
- package/src/runtime/anchor-manager.ts +473 -0
- package/src/runtime/async-runner.ts +8 -4
- package/src/runtime/auto-summarize.ts +350 -0
- package/src/runtime/background-runner.ts +10 -3
- package/src/runtime/budget-tracker.ts +354 -0
- package/src/runtime/chain-runner.ts +507 -0
- package/src/runtime/child-pi.ts +123 -35
- package/src/runtime/crash-recovery.ts +5 -4
- package/src/runtime/crew-agent-runtime.ts +1 -0
- package/src/runtime/custom-tools/irc-tool.ts +13 -0
- package/src/runtime/custom-tools/submit-result-tool.ts +3 -2
- package/src/runtime/delivery-coordinator.ts +10 -3
- package/src/runtime/dynamic-script-runner.ts +482 -0
- package/src/runtime/foreground-control.ts +87 -17
- package/src/runtime/handoff-manager.ts +589 -0
- package/src/runtime/hidden-handoff.ts +424 -0
- package/src/runtime/live-agent-manager.ts +20 -4
- package/src/runtime/live-session-runtime.ts +39 -4
- package/src/runtime/manifest-cache.ts +2 -1
- package/src/runtime/model-resolver.ts +16 -4
- package/src/runtime/phase-tracker.ts +373 -0
- package/src/runtime/pi-args.ts +11 -1
- package/src/runtime/pi-json-output.ts +31 -0
- package/src/runtime/pipeline-runner.ts +514 -0
- package/src/runtime/progress-tracker.ts +124 -0
- package/src/runtime/retry-runner.ts +354 -0
- package/src/runtime/sandbox.ts +252 -0
- package/src/runtime/scheduler.ts +7 -2
- package/src/runtime/skill-effectiveness.ts +473 -0
- package/src/runtime/skill-instructions.ts +37 -3
- package/src/runtime/subagent-manager.ts +1 -1
- package/src/runtime/task-graph.ts +11 -1
- package/src/runtime/task-runner.ts +92 -18
- package/src/runtime/team-runner.ts +13 -12
- package/src/runtime/tool-progress.ts +10 -3
- package/src/runtime/verification-gates.ts +367 -0
- package/src/schema/team-tool-schema.ts +37 -0
- package/src/skills/discover-skills.ts +5 -0
- package/src/state/active-run-registry.ts +9 -2
- package/src/state/contracts.ts +9 -0
- package/src/state/crew-init.ts +3 -3
- package/src/state/decision-ledger.ts +98 -55
- package/src/state/event-log-rotation.ts +2 -2
- package/src/state/event-log.ts +144 -10
- package/src/state/hook-instinct-bridge.ts +5 -5
- package/src/state/mailbox.ts +10 -0
- package/src/state/run-cache.ts +18 -8
- package/src/state/state-store.ts +3 -1
- package/src/state/types.ts +4 -0
- package/src/tools/safe-bash-extension.ts +1 -0
- package/src/tools/safe-bash.ts +152 -20
- package/src/types/new-api-types.ts +34 -0
- package/src/ui/agent-management-overlay.ts +5 -1
- package/src/ui/crew-widget.ts +29 -15
- package/src/ui/overlays/mailbox-detail-overlay.ts +13 -2
- package/src/ui/powerbar-publisher.ts +101 -7
- package/src/ui/tool-render.ts +15 -15
- package/src/ui/transcript-cache.ts +13 -0
- package/src/utils/bm25-search.ts +16 -8
- package/src/utils/env-filter.ts +8 -5
- package/src/utils/redaction.ts +169 -15
- package/src/utils/session-utils.ts +52 -0
- package/src/utils/sse-parser.ts +10 -1
- package/src/worktree/cleanup.ts +6 -1
- package/src/worktree/worktree-manager.ts +32 -13
- package/workflows/chain.workflow.md +252 -0
- package/workflows/pipeline.workflow.md +27 -0
|
@@ -0,0 +1,473 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Skill Effectiveness — ECC INSTINCT/CONFIDENCE Pattern Implementation
|
|
3
|
+
*
|
|
4
|
+
* Implements confidence-weighted skill activation based on ECC's instinct system.
|
|
5
|
+
* Tracks skill activation success and adjusts confidence scores.
|
|
6
|
+
*
|
|
7
|
+
* Based on: docs/distillation/ECC-hooks-instincts.md §2-3 (instinct system, confidence thresholds)
|
|
8
|
+
* Based on: docs/distillation/ECC-10-skills.md §8 (continuous-learning-v2)
|
|
9
|
+
*
|
|
10
|
+
* @module skill-effectiveness
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
|
|
14
|
+
import { dirname, join } from "path";
|
|
15
|
+
import { crewHooks } from "./crew-hooks.ts";
|
|
16
|
+
|
|
17
|
+
/**
 * Lower bounds of the confidence bands used by the ECC instinct system.
 * A confidence under the 0.3 `TENTATIVE` bound is considered tentative and
 * is not enforced.
 */
export const CONFIDENCE_THRESHOLDS = {
  /** Suggested but not enforced. */
  TENTATIVE: 0.3,
  /** Applied when relevant. */
  MODERATE: 0.5,
  /** Auto-approved for application. */
  STRONG: 0.7,
  /** Core behavior. */
  NEAR_CERTAIN: 0.9,
} as const;
|
|
27
|
+
|
|
28
|
+
/**
 * Starting confidence keyed by the number of observations (as a string),
 * following the ECC instinct system bands:
 *
 * - 1-2 observations  → 0.3 (tentative)
 * - 3-5 observations  → 0.5 (moderate)
 * - 6-10 observations → 0.7 (strong)
 * - 11 or more        → 0.85 (very strong), stored under the key "11+"
 */
export const INITIAL_CONFIDENCE_BY_FREQUENCY: Record<string, number> = {
  // tentative
  1: 0.3,
  2: 0.3,
  // moderate
  3: 0.5,
  4: 0.5,
  5: 0.5,
  // strong
  6: 0.7,
  7: 0.7,
  8: 0.7,
  9: 0.7,
  10: 0.7,
  // very strong
  "11+": 0.85,
} as const;
|
|
45
|
+
|
|
46
|
+
/**
 * Deltas applied to a confidence score, per the ECC instinct system.
 */
export const CONFIDENCE_ADJUSTMENTS = {
  /** Added for each confirming observation. */
  CONFIRMING: 0.05,
  /** Added (negative) for each contradicting observation. */
  CONTRADICTING: -0.1,
  /** Added (negative) for every week without an observation. */
  DECAY_PER_WEEK: -0.02,
} as const;
|
|
54
|
+
|
|
55
|
+
/**
 * Minimums a skill has to reach before it can be promoted to
 * "strong enforcement".
 */
export const PROMOTION_GATE_CRITERIA = {
  /** Required pass rate (80%). */
  MIN_CORRECTNESS: 0.8,
  /** Required number of recorded activations. */
  MIN_ACTIVATIONS: 5,
  /** Required average confidence across activations. */
  MIN_AVG_CONFIDENCE: 0.7,
} as const;
|
|
64
|
+
|
|
65
|
+
/**
 * One recorded use of a skill. Records are serialized one per line into the
 * run's `skill-activations.jsonl`.
 */
export interface SkillActivation {
  /** Unique activation ID. */
  id: string;
  /** Skill identifier, e.g. "verification-before-done". */
  skillId: string;
  /** Role that activated the skill. */
  role: string;
  /** Run the activation belongs to. */
  runId: string;
  /** Task the activation belongs to. */
  taskId: string;
  /** ISO timestamp of the activation. */
  timestamp: string;
  /** Whether the skill was successfully applied. */
  passed: boolean;
  /** Optional free-form outcome description. */
  outcome?: string;
  /** Confidence at the time of activation. */
  confidence: number;
}
|
|
79
|
+
|
|
80
|
+
/**
 * Aggregated statistics for a single skill, derived from its activations.
 */
export interface SkillMetrics {
  skillId: string;
  totalActivations: number;
  passedActivations: number;
  failedActivations: number;
  /** passedActivations / totalActivations. */
  passRate: number;
  /** Average of the confidence values stored on the activations. */
  avgConfidence: number;
  /** Current confidence score. */
  currentConfidence: number;
  trend: "improving" | "stable" | "declining";
  /** ISO timestamp of the most recent activation, when there is one. */
  lastActivation?: string;
  /** ISO timestamp of the earliest activation, when there is one. */
  firstActivation?: string;
  /** Number of activations per role. */
  roleBreakdown: Record<string, number>;
}
|
|
96
|
+
|
|
97
|
+
/**
 * A skill paired with its confidence and the band/behavior derived from it;
 * used when deciding how strongly to activate the skill.
 */
export interface WeightedSkill {
  skillId: string;
  confidence: number;
  /** Name of the confidence band the score falls into. */
  threshold: keyof typeof CONFIDENCE_THRESHOLDS;
  behavior: "suggest" | "apply_if_asked" | "apply_auto" | "act_autonomous";
  /** Human-readable evidence backing the confidence score. */
  evidence: string;
  metrics: SkillMetrics;
}
|
|
108
|
+
|
|
109
|
+
/**
 * Build the path of a run's `skill-metrics.jsonl`, located under
 * `.crew/state/runs/<runId>/` relative to the current working directory.
 */
function getSkillMetricsPath(runId: string): string {
  const workingDir = process.cwd();
  return join(workingDir, ".crew", "state", "runs", runId, "skill-metrics.jsonl");
}
|
|
118
|
+
|
|
119
|
+
/**
 * Build the path of a run's `skill-activations.jsonl`, located under
 * `.crew/state/runs/<runId>/` relative to the current working directory.
 */
function getSkillActivationsPath(runId: string): string {
  const workingDir = process.cwd();
  return join(workingDir, ".crew", "state", "runs", runId, "skill-activations.jsonl");
}
|
|
128
|
+
|
|
129
|
+
/**
 * Create the run's state directory (the parent of the skill metrics file),
 * including any missing ancestors, unless it already exists.
 */
function ensureSkillMetricsDir(runId: string): void {
  const runDir = dirname(getSkillMetricsPath(runId));
  if (existsSync(runDir)) return;
  mkdirSync(runDir, { recursive: true });
}
|
|
138
|
+
|
|
139
|
+
/**
 * Map an observation count to its starting confidence.
 *
 * Counts up to 2 use the "1" entry, up to 5 the "3" entry, up to 10 the "6"
 * entry, and everything else the "11+" entry of
 * `INITIAL_CONFIDENCE_BY_FREQUENCY`.
 *
 * @param observationCount Number of times the skill has been observed.
 * @returns The initial confidence for that frequency band.
 */
export function computeInitialConfidence(observationCount: number): number {
  let bandKey = "11+";
  if (observationCount <= 2) {
    bandKey = "1";
  } else if (observationCount <= 5) {
    bandKey = "3";
  } else if (observationCount <= 10) {
    bandKey = "6";
  }
  return INITIAL_CONFIDENCE_BY_FREQUENCY[bandKey];
}
|
|
148
|
+
|
|
149
|
+
/**
 * Move a confidence score after one observed outcome.
 *
 * A pass adds `CONFIDENCE_ADJUSTMENTS.CONFIRMING` (+0.05) and a failure adds
 * `CONFIDENCE_ADJUSTMENTS.CONTRADICTING` (-0.1). The result is kept within
 * [0.1, 0.95].
 *
 * @param current Confidence before the observation.
 * @param passed Whether the observation confirmed the skill.
 * @returns The adjusted, clamped confidence.
 */
export function adjustConfidence(current: number, passed: boolean): number {
  const { CONFIRMING, CONTRADICTING } = CONFIDENCE_ADJUSTMENTS;
  const adjusted = current + (passed ? CONFIRMING : CONTRADICTING);
  if (adjusted < 0.1) return 0.1;
  if (adjusted > 0.95) return 0.95;
  return adjusted;
}
|
|
159
|
+
|
|
160
|
+
/**
 * Apply decay to confidence for skills not observed recently.
 *
 * For every whole week elapsed since `lastActivation`,
 * `CONFIDENCE_ADJUSTMENTS.DECAY_PER_WEEK` is added to the score. The result
 * never drops below 0.1.
 *
 * Decay can only lower the score: a `lastActivation` that lies in the future
 * (e.g. clock skew between writer and reader) or that cannot be parsed as a
 * date counts as zero elapsed weeks, instead of raising the confidence or
 * yielding NaN.
 *
 * @param current Confidence before decay.
 * @param lastActivation ISO timestamp of the most recent activation; when
 *   omitted or empty, `current` is returned unchanged.
 * @returns The decayed confidence.
 */
export function applyDecay(current: number, lastActivation?: string): number {
  if (!lastActivation) return current;

  const msPerDay = 1000 * 60 * 60 * 24;
  // NaN when the timestamp is unparseable, negative when it is in the future.
  const daysSince = (Date.now() - new Date(lastActivation).getTime()) / msPerDay;
  const decayWeeks = Number.isFinite(daysSince) ? Math.max(0, Math.floor(daysSince / 7)) : 0;
  const decay = decayWeeks * CONFIDENCE_ADJUSTMENTS.DECAY_PER_WEEK;

  return Math.max(0.1, current + decay);
}
|
|
172
|
+
|
|
173
|
+
/**
 * Translate a confidence score into the activation behavior of its band.
 *
 * Bands are checked from highest to lowest: `NEAR_CERTAIN` → "act_autonomous",
 * `STRONG` → "apply_auto", `MODERATE` → "apply_if_asked"; any lower score
 * yields "suggest".
 */
export function confidenceToBehavior(confidence: number): WeightedSkill["behavior"] {
  const { NEAR_CERTAIN, STRONG, MODERATE } = CONFIDENCE_THRESHOLDS;
  let behavior: WeightedSkill["behavior"] = "suggest";
  if (confidence >= NEAR_CERTAIN) {
    behavior = "act_autonomous";
  } else if (confidence >= STRONG) {
    behavior = "apply_auto";
  } else if (confidence >= MODERATE) {
    behavior = "apply_if_asked";
  }
  return behavior;
}
|
|
182
|
+
|
|
183
|
+
/**
 * Determine the name of the confidence band a score falls into.
 *
 * Each band is compared against its own lower bound, mirroring
 * `confidenceToBehavior`: a score is only labelled "MODERATE" once it reaches
 * `CONFIDENCE_THRESHOLDS.MODERATE`. Everything below that bound, including
 * scores under `TENTATIVE`, is reported as "TENTATIVE".
 *
 * @param confidence Confidence score to classify.
 * @returns The key of the matching entry in `CONFIDENCE_THRESHOLDS`.
 */
export function confidenceToThreshold(confidence: number): keyof typeof CONFIDENCE_THRESHOLDS {
  if (confidence >= CONFIDENCE_THRESHOLDS.NEAR_CERTAIN) return "NEAR_CERTAIN";
  if (confidence >= CONFIDENCE_THRESHOLDS.STRONG) return "STRONG";
  if (confidence >= CONFIDENCE_THRESHOLDS.MODERATE) return "MODERATE";
  return "TENTATIVE";
}
|
|
192
|
+
|
|
193
|
+
/**
 * Persist one skill activation by appending it, as a single JSON line, to
 * the `skill-activations.jsonl` of the activation's run. The run directory
 * is created first when missing.
 *
 * @param activation Record to append.
 * @returns The same `activation` object that was passed in.
 */
export function recordSkillActivation(
  activation: SkillActivation,
): SkillActivation {
  const { runId } = activation;
  ensureSkillMetricsDir(runId);

  const target = getSkillActivationsPath(runId);
  const serialized = `${JSON.stringify(activation)}\n`;
  // flag "a" appends instead of truncating the existing log.
  writeFileSync(target, serialized, { encoding: "utf-8", flag: "a" });

  return activation;
}
|
|
208
|
+
|
|
209
|
+
/**
 * Get all skill activations for a run.
 *
 * Reads the run's `skill-activations.jsonl` and parses it line by line.
 * Blank lines are ignored. Lines that are not valid JSON (for example a line
 * truncated by an interrupted append) or that lack the fields the metrics
 * code reads are skipped, so one damaged record cannot make the whole
 * history unreadable.
 *
 * @param runId Run whose activation log should be read.
 * @returns Activations in file order; empty when the file is missing or
 *   holds no usable records.
 */
export function getSkillActivations(runId: string): SkillActivation[] {
  const path = getSkillActivationsPath(runId);

  if (!existsSync(path)) {
    return [];
  }

  const activations: SkillActivation[] = [];
  for (const rawLine of readFileSync(path, "utf-8").split("\n")) {
    const line = rawLine.trim();
    if (!line) continue;

    let parsed: unknown;
    try {
      parsed = JSON.parse(line);
    } catch {
      // Torn or corrupt line: drop it and keep the remaining records.
      continue;
    }

    if (typeof parsed !== "object" || parsed === null) continue;
    const candidate = parsed as Partial<SkillActivation>;
    if (
      typeof candidate.skillId === "string" &&
      typeof candidate.timestamp === "string" &&
      typeof candidate.passed === "boolean" &&
      typeof candidate.confidence === "number"
    ) {
      activations.push(candidate as SkillActivation);
    }
  }

  return activations;
}
|
|
229
|
+
|
|
230
|
+
/**
 * Aggregate the activations of one skill into a `SkillMetrics` summary.
 *
 * Only entries whose `skillId` matches are considered, and the last matching
 * entry is treated as the most recent one. The trend compares the pass rate
 * of the last five matching activations with the pass rate of all earlier
 * ones; a difference of more than 0.1 in either direction counts as
 * "improving" / "declining". The reported `currentConfidence` is the last
 * activation's confidence after `applyDecay`.
 *
 * @param skillId Skill to summarize.
 * @param activations Activation history, possibly covering several skills.
 * @returns Metrics for the skill; zeroed counters when it has no activations.
 */
export function computeSkillMetrics(
  skillId: string,
  activations: SkillActivation[],
): SkillMetrics {
  const own = activations.filter((entry) => entry.skillId === skillId);
  const total = own.length;

  if (total === 0) {
    return {
      skillId,
      totalActivations: 0,
      passedActivations: 0,
      failedActivations: 0,
      passRate: 0,
      avgConfidence: 0,
      currentConfidence: computeInitialConfidence(0),
      trend: "stable",
      roleBreakdown: {},
    };
  }

  // Single pass for the counters, the confidence sum and the per-role tally.
  let passedCount = 0;
  let confidenceSum = 0;
  const roleBreakdown: Record<string, number> = {};
  for (const entry of own) {
    if (entry.passed) passedCount += 1;
    confidenceSum += entry.confidence;
    roleBreakdown[entry.role] = (roleBreakdown[entry.role] ?? 0) + 1;
  }
  const avgConfidence = confidenceSum / total;

  const newest = own[total - 1];
  const lastActivation = newest?.timestamp;
  const undecayedConfidence = newest?.confidence ?? avgConfidence;

  // Trend: last five activations versus everything before them.
  const passRateOf = (slice: SkillActivation[]): number =>
    slice.filter((entry) => entry.passed).length / slice.length;
  const earlier = own.slice(0, -5);
  const recentPassRate = passRateOf(own.slice(-5));
  const earlierPassRate = earlier.length > 0 ? passRateOf(earlier) : recentPassRate;

  let trend: SkillMetrics["trend"];
  if (recentPassRate > earlierPassRate + 0.1) {
    trend = "improving";
  } else if (recentPassRate < earlierPassRate - 0.1) {
    trend = "declining";
  } else {
    trend = "stable";
  }

  return {
    skillId,
    totalActivations: total,
    passedActivations: passedCount,
    failedActivations: total - passedCount,
    passRate: passedCount / total,
    avgConfidence,
    currentConfidence: applyDecay(undecayedConfidence, lastActivation),
    trend,
    lastActivation,
    firstActivation: own[0]?.timestamp,
    roleBreakdown,
  };
}
|
|
302
|
+
|
|
303
|
+
/**
 * Check a skill's metrics against `PROMOTION_GATE_CRITERIA`.
 *
 * Four criteria are evaluated: `correctness` (pass rate), `evidence`
 * (number of activations), `rollback` (trend is not "declining") and
 * `encoding` (average confidence). The gate passes only when all four hold.
 *
 * @param metrics Aggregated metrics of the skill.
 * @returns The overall verdict, the per-criterion results and a
 *   human-readable reason naming the failed criteria, if any.
 */
export function evaluatePromotionGate(metrics: SkillMetrics): {
  passed: boolean;
  criteria: {
    correctness: boolean;
    evidence: boolean;
    rollback: boolean;
    encoding: boolean;
  };
  reason: string;
} {
  const { MIN_CORRECTNESS, MIN_ACTIVATIONS, MIN_AVG_CONFIDENCE } = PROMOTION_GATE_CRITERIA;
  const criteria = {
    correctness: metrics.passRate >= MIN_CORRECTNESS,
    evidence: metrics.totalActivations >= MIN_ACTIVATIONS,
    rollback: metrics.trend !== "declining",
    encoding: metrics.avgConfidence >= MIN_AVG_CONFIDENCE,
  };

  const unmet = Object.entries(criteria)
    .filter(([, ok]) => !ok)
    .map(([criterion]) => criterion);
  const passed = unmet.length === 0;

  const reason = passed
    ? `All promotion gate criteria met: ${metrics.passRate.toFixed(1)} pass rate, ${metrics.totalActivations} activations, ${metrics.trend} trend`
    : `Promotion gate not passed. Failed: ${unmet.join(", ")}`;

  return { passed, criteria, reason };
}
|
|
338
|
+
|
|
339
|
+
/**
 * Build confidence-weighted entries for the given skills from a run's
 * activation history.
 *
 * Skills whose current confidence is below `minConfidence` are left out; the
 * remaining entries are ordered by descending confidence.
 *
 * NOTE(review): `role` is accepted but not read by this function — the
 * history is not filtered by role. Confirm whether that is intended.
 *
 * @param role Role the skills are resolved for (currently unused).
 * @param skillIds Skills to evaluate.
 * @param runId Run whose activation log is read.
 * @param minConfidence Inclusive lower bound; defaults to the TENTATIVE band.
 * @returns Weighted skills, highest confidence first.
 */
export function getWeightedSkillsForRole(
  role: string,
  skillIds: string[],
  runId: string,
  minConfidence: number = CONFIDENCE_THRESHOLDS.TENTATIVE,
): WeightedSkill[] {
  const history = getSkillActivations(runId);
  const selected: WeightedSkill[] = [];

  for (const skillId of skillIds) {
    const metrics = computeSkillMetrics(skillId, history);
    const confidence = metrics.currentConfidence;
    if (confidence < minConfidence) continue;

    const passPercent = (metrics.passRate * 100).toFixed(0);
    selected.push({
      skillId,
      confidence,
      threshold: confidenceToThreshold(confidence),
      behavior: confidenceToBehavior(confidence),
      evidence: `${metrics.totalActivations} activations, ${passPercent}% pass rate`,
      metrics,
    });
  }

  return selected.sort((left, right) => right.confidence - left.confidence);
}
|
|
372
|
+
|
|
373
|
+
/**
 * Keep only the skills whose current confidence reaches the named band.
 *
 * Delegates to `getWeightedSkillsForRole` with the role "global" and the
 * band's lower bound as minimum; skills under that bound are omitted from
 * the result.
 *
 * @param skillIds Skills to evaluate.
 * @param runId Run whose activation log is read.
 * @param threshold Name of the minimum band; defaults to "MODERATE".
 * @returns Weighted skills at or above the band, highest confidence first.
 */
export function filterSkillsByConfidence(
  skillIds: string[],
  runId: string,
  threshold: keyof typeof CONFIDENCE_THRESHOLDS = "MODERATE",
): WeightedSkill[] {
  return getWeightedSkillsForRole(
    "global",
    skillIds,
    runId,
    CONFIDENCE_THRESHOLDS[threshold],
  );
}
|
|
385
|
+
|
|
386
|
+
/** Set on the first registration so the hooks are only installed once per process. */
let hooksRegistered = false;

/**
 * Register crew hooks for automatic skill activation tracking.
 *
 * Two hooks are installed, once per process lifetime:
 *
 * - `task_completed` records one activation per skill listed in the event
 *   data; it counts as passed only when `data.status` is "completed".
 * - `task_failed` records a failed activation per listed skill, so failures
 *   actually reach the activation log and lower pass rate and confidence.
 *
 * The confidence stored on each activation is the skill's current confidence
 * in this run (per `computeSkillMetrics`) moved by `adjustConfidence` for the
 * new outcome, so the score evolves with the recorded history.
 *
 * A failure is recorded at most once per task and skill. This avoids double
 * counting if both hooks report the same failed task; the trade-off is that
 * repeated failures under one task id are counted once.
 */
export function registerSkillEffectivenessHooks(): void {
  if (hooksRegistered) return;
  hooksRegistered = true;

  // Shared recorder for both hooks; `data` is validated because its shape is
  // not guaranteed by the event.
  const recordOutcome = (
    taskId: string | null | undefined,
    runId: string | null | undefined,
    data: unknown,
    passed: boolean,
  ): void => {
    if (!taskId || !runId) return;

    const payload: Record<string, unknown> =
      typeof data === "object" && data !== null ? (data as Record<string, unknown>) : {};
    const rawSkills = payload.skills;
    const skillNames = Array.isArray(rawSkills)
      ? rawSkills.filter((name): name is string => typeof name === "string" && name.length > 0)
      : [];
    if (skillNames.length === 0) return;

    const role = typeof payload.role === "string" ? payload.role : "unknown";
    // Read the log once; new records are pushed so later skills in this
    // event see an up-to-date history.
    const history = getSkillActivations(runId);

    for (const skillId of skillNames) {
      const failureAlreadyRecorded =
        !passed &&
        history.some((entry) => entry.taskId === taskId && entry.skillId === skillId && !entry.passed);
      if (failureAlreadyRecorded) continue;

      const priorConfidence = computeSkillMetrics(skillId, history).currentConfidence;
      const activation: SkillActivation = {
        id: `act-${Date.now()}-${Math.random().toString(36).slice(2)}`,
        skillId,
        role,
        runId,
        taskId,
        timestamp: new Date().toISOString(),
        passed,
        confidence: adjustConfidence(priorConfidence, passed),
      };
      history.push(recordSkillActivation(activation));
    }
  };

  // Track task completion for skill effectiveness
  crewHooks.register("task_completed", (event) => {
    const { taskId, runId, data } = event;
    recordOutcome(taskId, runId, data, (data?.status as string) === "completed");
  });

  // Track task failures
  crewHooks.register("task_failed", (event) => {
    const { taskId, runId, data } = event;
    recordOutcome(taskId, runId, data, false);
  });
}
|
|
430
|
+
|
|
431
|
+
/**
 * Render a Markdown report of skill effectiveness for one run.
 *
 * The report starts with a header (run id, generation time, total number of
 * activations). When the run has no activations it ends with a placeholder
 * line; otherwise each requested skill gets a section with its confidence,
 * pass rate, average confidence, promotion-gate verdict and, when present,
 * the per-role activation counts.
 *
 * @param runId Run whose activation log is read.
 * @param skillIds Skills to include, in output order.
 * @returns The report as a newline-joined string.
 */
export function generateSkillEffectivenessReport(
  runId: string,
  skillIds: string[],
): string {
  const history = getSkillActivations(runId);
  const report: string[] = [
    `# Skill Effectiveness Report: ${runId}`,
    "",
    `Generated: ${new Date().toISOString()}`,
    `Total Activations: ${history.length}`,
    "",
  ];

  if (history.length === 0) {
    report.push("*No skill activations recorded yet.*");
    return report.join("\n");
  }

  report.push("## Skill Metrics", "");

  for (const skillId of skillIds) {
    const metrics = computeSkillMetrics(skillId, history);
    const gateVerdict = evaluatePromotionGate(metrics).passed ? "PASSED ✅" : "NOT MET";

    report.push(
      `### ${skillId}`,
      `- **Confidence**: ${metrics.currentConfidence.toFixed(2)} (${metrics.trend})`,
      `- **Pass Rate**: ${(metrics.passRate * 100).toFixed(1)}% (${metrics.passedActivations}/${metrics.totalActivations})`,
      `- **Avg Confidence**: ${metrics.avgConfidence.toFixed(2)}`,
      `- **Promotion Gate**: ${gateVerdict}`,
    );

    const hasRoles = Object.keys(metrics.roleBreakdown).length > 0;
    if (hasRoles) {
      report.push(`- **By Role**: ${JSON.stringify(metrics.roleBreakdown)}`);
    }

    report.push("");
  }

  return report.join("\n");
}
|
|
@@ -5,6 +5,11 @@ import type { AgentConfig } from "../agents/agent-config.ts";
|
|
|
5
5
|
import type { TeamRole } from "../teams/team-config.ts";
|
|
6
6
|
import type { WorkflowStep } from "../workflows/workflow-config.ts";
|
|
7
7
|
import { isSafePathId, resolveContainedPath, resolveRealContainedPath } from "../utils/safe-paths.ts";
|
|
8
|
+
import {
|
|
9
|
+
getWeightedSkillsForRole,
|
|
10
|
+
registerSkillEffectivenessHooks,
|
|
11
|
+
CONFIDENCE_THRESHOLDS,
|
|
12
|
+
} from "./skill-effectiveness.ts";
|
|
8
13
|
|
|
9
14
|
const PACKAGE_SKILLS_DIR = path.resolve(path.dirname(fileURLToPath(import.meta.url)), "..", "..", "skills");
|
|
10
15
|
const MAX_SKILL_CHARS = 1500;
|
|
@@ -76,7 +81,8 @@ export function defaultSkillsForRole(role: string): string[] {
|
|
|
76
81
|
return DEFAULT_ROLE_SKILLS[role] ?? [];
|
|
77
82
|
}
|
|
78
83
|
|
|
79
|
-
function collectTaskSkillNames(input: ResolveTaskSkillsInput): string[] {
|
|
84
|
+
function collectTaskSkillNames(input: ResolveTaskSkillsInput | undefined): string[] {
|
|
85
|
+
if (!input) return [];
|
|
80
86
|
if (input.override === false) return [];
|
|
81
87
|
const roleDefaultsDisabled = input.teamRole?.skills === false || input.step?.skills === false;
|
|
82
88
|
const names = roleDefaultsDisabled ? [] : defaultSkillsForRole(input.role);
|
|
@@ -182,9 +188,16 @@ export interface RenderedSkillInstructions {
|
|
|
182
188
|
names: string[];
|
|
183
189
|
paths: string[];
|
|
184
190
|
block: string;
|
|
191
|
+
/** Confidence-weighted skills for this render, sorted by confidence */
|
|
192
|
+
weightedSkills?: Array<{
|
|
193
|
+
skillId: string;
|
|
194
|
+
confidence: number;
|
|
195
|
+
behavior: string;
|
|
196
|
+
threshold: string;
|
|
197
|
+
}>;
|
|
185
198
|
}
|
|
186
199
|
|
|
187
|
-
export function renderSkillInstructions(input: RenderSkillInstructionsInput): RenderedSkillInstructions {
|
|
200
|
+
export function renderSkillInstructions(input: RenderSkillInstructionsInput & { runId?: string } = {} as RenderSkillInstructionsInput & { runId?: string }): RenderedSkillInstructions {
|
|
188
201
|
const allNames = collectTaskSkillNames(input);
|
|
189
202
|
const names = allNames.slice(0, MAX_SELECTED_SKILLS);
|
|
190
203
|
const overflowCount = Math.max(0, allNames.length - names.length);
|
|
@@ -193,6 +206,21 @@ export function renderSkillInstructions(input: RenderSkillInstructionsInput): Re
|
|
|
193
206
|
const skillPaths: string[] = [];
|
|
194
207
|
let total = 0;
|
|
195
208
|
let omittedCount = overflowCount;
|
|
209
|
+
|
|
210
|
+
// ECC INSTINCT: Get confidence-weighted skills if runId is provided
|
|
211
|
+
let weightedSkills: RenderedSkillInstructions["weightedSkills"] = undefined;
|
|
212
|
+
if (input.runId) {
|
|
213
|
+
// Register effectiveness hooks once per process
|
|
214
|
+
registerSkillEffectivenessHooks();
|
|
215
|
+
const weighted = getWeightedSkillsForRole(input.role, names, input.runId, CONFIDENCE_THRESHOLDS.TENTATIVE);
|
|
216
|
+
weightedSkills = weighted.map(w => ({
|
|
217
|
+
skillId: w.skillId,
|
|
218
|
+
confidence: w.confidence,
|
|
219
|
+
behavior: w.behavior,
|
|
220
|
+
threshold: w.threshold,
|
|
221
|
+
}));
|
|
222
|
+
}
|
|
223
|
+
|
|
196
224
|
const pushSection = (section: string): boolean => {
|
|
197
225
|
if (total + section.length > MAX_TOTAL_CHARS) return false;
|
|
198
226
|
sections.push(section);
|
|
@@ -210,7 +238,12 @@ export function renderSkillInstructions(input: RenderSkillInstructionsInput): Re
|
|
|
210
238
|
skillPaths.push(path.dirname(loaded.path));
|
|
211
239
|
const description = frontmatterDescription(loaded.content);
|
|
212
240
|
const source = loaded.source === "project" ? `project:skills/${safeName}` : `package:skills/${safeName}`;
|
|
213
|
-
|
|
241
|
+
|
|
242
|
+
// ECC INSTINCT: Add confidence annotation from weighted skills
|
|
243
|
+
const weighted = weightedSkills?.find(w => w.skillId === name);
|
|
244
|
+
const confidenceNote = weighted ? ` [Confidence: ${(weighted.confidence * 100).toFixed(0)}% — ${weighted.threshold}]` : "";
|
|
245
|
+
|
|
246
|
+
const header = [`## ${safeName}`, description ? `Description: ${description}${confidenceNote}` : undefined, `Source: ${source}`].filter(Boolean).join("\n");
|
|
214
247
|
const section = `${header}\n\n${compactSkillContent(loaded.content)}`;
|
|
215
248
|
if (!pushSection(section)) omittedCount += 1;
|
|
216
249
|
}
|
|
@@ -234,5 +267,6 @@ export function renderSkillInstructions(input: RenderSkillInstructionsInput): Re
|
|
|
234
267
|
"If a project skill instruction conflicts with the explicit task packet, system guidance, or user request — ALWAYS follow the task packet or higher-priority instruction. Report the conflict to the user.",
|
|
235
268
|
sections.join("\n\n---\n\n"),
|
|
236
269
|
].join("\n"),
|
|
270
|
+
weightedSkills,
|
|
237
271
|
};
|
|
238
272
|
}
|
|
@@ -220,7 +220,7 @@ export class SubagentManager {
|
|
|
220
220
|
const record = this.records.get(id);
|
|
221
221
|
if (!record) return undefined;
|
|
222
222
|
if (record.status !== "running" && record.status !== "queued") return record;
|
|
223
|
-
if (record.promise) await record.promise.catch(() => {
|
|
223
|
+
if (record.promise) await record.promise.catch((error) => { logInternalError("subagent-manager.waitForRecord", error, `id=${id}`); });
|
|
224
224
|
else await new Promise((resolve) => setTimeout(resolve, 100));
|
|
225
225
|
}
|
|
226
226
|
}
|
|
@@ -34,12 +34,21 @@ export interface ExecutionPlan {
|
|
|
34
34
|
* - Each subsequent wave contains tasks whose dependencies are all in earlier waves.
|
|
35
35
|
* - If all tasks have empty `dependsOn`, they all go into wave 0 (backward compatible).
|
|
36
36
|
* - If a cycle is detected, `hasCycle` is true and `cycleNodes` lists the involved IDs.
|
|
37
|
+
*
|
|
38
|
+
* @throws Error if a task depends on itself (self-dependency).
|
|
37
39
|
*/
|
|
38
40
|
export function buildExecutionPlan(tasks: TaskNode[]): ExecutionPlan {
|
|
39
41
|
if (tasks.length === 0) {
|
|
40
42
|
return { waves: [], hasCycle: false };
|
|
41
43
|
}
|
|
42
44
|
|
|
45
|
+
// HIGH-9: Detect self-dependency
|
|
46
|
+
for (const task of tasks) {
|
|
47
|
+
if (task.dependsOn.includes(task.id)) {
|
|
48
|
+
throw new Error(`Task "${task.id}" has self-dependency (depends on itself)`);
|
|
49
|
+
}
|
|
50
|
+
}
|
|
51
|
+
|
|
43
52
|
const idSet = new Set<string>(tasks.map((t) => t.id));
|
|
44
53
|
const adjacency = new Map<string, Set<string>>(); // id -> ids that depend on it
|
|
45
54
|
const inDegree = new Map<string, number>();
|
|
@@ -108,7 +117,8 @@ export function buildExecutionPlan(tasks: TaskNode[]): ExecutionPlan {
|
|
|
108
117
|
*/
|
|
109
118
|
function buildWave(tasks: TaskNode[], ids: string[], index: number): ExecutionWave {
|
|
110
119
|
const taskMap = new Map(tasks.map((t) => [t.id, t]));
|
|
111
|
-
|
|
120
|
+
// MEDIUM-12: Filter out undefined values instead of using non-null assertion
|
|
121
|
+
const waveTasks = ids.map((id) => taskMap.get(id)).filter(Boolean) as TaskNode[];
|
|
112
122
|
|
|
113
123
|
let label: string | undefined;
|
|
114
124
|
if (waveTasks.length > 0 && waveTasks.every((t) => t.phase !== undefined)) {
|