@agentplate/cli 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +54 -0
- package/LICENSE +21 -0
- package/README.md +206 -0
- package/agents/architect.md +108 -0
- package/agents/builder.md +97 -0
- package/agents/coordinator.md +113 -0
- package/agents/deployer.md +117 -0
- package/agents/devops.md +114 -0
- package/agents/lead.md +107 -0
- package/agents/merger.md +103 -0
- package/agents/reviewer.md +90 -0
- package/agents/scout.md +95 -0
- package/agents/verifier.md +106 -0
- package/package.json +64 -0
- package/src/agents/guard-rules.ts +55 -0
- package/src/agents/identity.test.ts +161 -0
- package/src/agents/identity.ts +229 -0
- package/src/agents/manifest.test.ts +260 -0
- package/src/agents/manifest.ts +286 -0
- package/src/agents/overlay.test.ts +190 -0
- package/src/agents/overlay.ts +212 -0
- package/src/agents/system-prompt.test.ts +53 -0
- package/src/agents/system-prompt.ts +95 -0
- package/src/agents/turn-runner.ts +79 -0
- package/src/commands/coordinator.test.ts +75 -0
- package/src/commands/coordinator.ts +259 -0
- package/src/commands/deploy.test.ts +504 -0
- package/src/commands/deploy.ts +874 -0
- package/src/commands/doctor.test.ts +106 -0
- package/src/commands/doctor.ts +208 -0
- package/src/commands/init.ts +71 -0
- package/src/commands/log.ts +51 -0
- package/src/commands/mail.ts +197 -0
- package/src/commands/merge.ts +127 -0
- package/src/commands/model.ts +58 -0
- package/src/commands/prime.ts +61 -0
- package/src/commands/reap.ts +87 -0
- package/src/commands/serve.ts +61 -0
- package/src/commands/setup.ts +48 -0
- package/src/commands/ship.test.ts +106 -0
- package/src/commands/ship.ts +202 -0
- package/src/commands/skill.test.ts +458 -0
- package/src/commands/skill.ts +730 -0
- package/src/commands/sling.ts +365 -0
- package/src/commands/status.ts +60 -0
- package/src/commands/stop.ts +56 -0
- package/src/commands/tui.ts +199 -0
- package/src/commands/worktree.ts +77 -0
- package/src/config.test.ts +92 -0
- package/src/config.ts +202 -0
- package/src/db/sqlite.test.ts +77 -0
- package/src/db/sqlite.ts +102 -0
- package/src/deploy/audit.test.ts +233 -0
- package/src/deploy/audit.ts +245 -0
- package/src/deploy/context.test.ts +243 -0
- package/src/deploy/context.ts +72 -0
- package/src/deploy/registry.test.ts +101 -0
- package/src/deploy/registry.ts +86 -0
- package/src/deploy/secrets.test.ts +129 -0
- package/src/deploy/secrets.ts +69 -0
- package/src/deploy/targets/docker-gha.test.ts +323 -0
- package/src/deploy/targets/docker-gha.ts +841 -0
- package/src/deploy/types.ts +153 -0
- package/src/errors.test.ts +42 -0
- package/src/errors.ts +69 -0
- package/src/events/store.test.ts +183 -0
- package/src/events/store.ts +201 -0
- package/src/index.ts +137 -0
- package/src/insights/quality-gates.ts +73 -0
- package/src/json.test.ts +28 -0
- package/src/json.ts +50 -0
- package/src/logging/color.ts +62 -0
- package/src/logging/logger.ts +60 -0
- package/src/logging/sanitizer.test.ts +36 -0
- package/src/logging/sanitizer.ts +57 -0
- package/src/mail/client.test.ts +192 -0
- package/src/mail/client.ts +188 -0
- package/src/mail/store.test.ts +279 -0
- package/src/mail/store.ts +311 -0
- package/src/merge/lock.test.ts +88 -0
- package/src/merge/lock.ts +84 -0
- package/src/merge/queue.test.ts +136 -0
- package/src/merge/queue.ts +177 -0
- package/src/merge/resolver.test.ts +219 -0
- package/src/merge/resolver.ts +274 -0
- package/src/paths.ts +36 -0
- package/src/providers/apply.test.ts +90 -0
- package/src/providers/apply.ts +66 -0
- package/src/providers/registry.test.ts +74 -0
- package/src/providers/registry.ts +254 -0
- package/src/runtimes/claude.ts +313 -0
- package/src/runtimes/codex.ts +280 -0
- package/src/runtimes/cursor.ts +247 -0
- package/src/runtimes/gemini.ts +173 -0
- package/src/runtimes/mock.ts +71 -0
- package/src/runtimes/opencode.ts +259 -0
- package/src/runtimes/registry.test.ts +924 -0
- package/src/runtimes/registry.ts +63 -0
- package/src/runtimes/resolve.ts +45 -0
- package/src/runtimes/types.ts +97 -0
- package/src/scaffold.ts +68 -0
- package/src/secrets.test.ts +51 -0
- package/src/secrets.ts +78 -0
- package/src/serve/api.ts +667 -0
- package/src/serve/server.test.ts +433 -0
- package/src/serve/server.ts +271 -0
- package/src/serve/system.ts +90 -0
- package/src/serve/weather.ts +140 -0
- package/src/sessions/reaper.test.ts +162 -0
- package/src/sessions/reaper.ts +149 -0
- package/src/sessions/store.test.ts +351 -0
- package/src/sessions/store.ts +350 -0
- package/src/skills/distiller.test.ts +498 -0
- package/src/skills/distiller.ts +426 -0
- package/src/skills/feedback.test.ts +300 -0
- package/src/skills/feedback.ts +168 -0
- package/src/skills/lifecycle.ts +169 -0
- package/src/skills/retrieval.test.ts +421 -0
- package/src/skills/retrieval.ts +365 -0
- package/src/skills/safety.test.ts +335 -0
- package/src/skills/safety.ts +216 -0
- package/src/skills/store.test.ts +425 -0
- package/src/skills/store.ts +684 -0
- package/src/skills/types.ts +107 -0
- package/src/types.ts +442 -0
- package/src/utils/detect.test.ts +35 -0
- package/src/utils/detect.ts +82 -0
- package/src/version.test.ts +19 -0
- package/src/version.ts +7 -0
- package/src/wizard/setup.ts +254 -0
- package/src/worktree/manager.test.ts +181 -0
- package/src/worktree/manager.ts +229 -0
- package/templates/overlay.md.tmpl +102 -0
- package/ui/dist/assets/index-C7rXIMER.css +1 -0
- package/ui/dist/assets/index-W4kbr4by.js +4526 -0
- package/ui/dist/favicon.svg +21 -0
- package/ui/dist/index.html +16 -0
- package/ui/dist/logo-clay.svg +21 -0
- package/ui/dist/logo.svg +18 -0
|
@@ -0,0 +1,300 @@
|
|
|
1
|
+
import { describe, expect, test } from "bun:test";
|
|
2
|
+
|
|
3
|
+
import type { OutcomeStatus } from "../types.ts";
|
|
4
|
+
import {
|
|
5
|
+
computeConfidence,
|
|
6
|
+
evaluateLifecycle,
|
|
7
|
+
type LifecycleConfig,
|
|
8
|
+
wilsonLowerBound,
|
|
9
|
+
} from "./feedback.ts";
|
|
10
|
+
import type { Skill, SkillOutcome, SkillStatus } from "./types.ts";
|
|
11
|
+
|
|
12
|
+
// --- builders -------------------------------------------------------------
|
|
13
|
+
|
|
14
|
+
/** Monotonic counter that gives every generated outcome a unique task id and timestamp. */
let outcomeSeq = 0;

/**
 * Build a minimal {@link SkillOutcome} carrying just the status under test.
 *
 * Every call bumps `outcomeSeq`, so successive outcomes get distinct `taskId`s
 * and increasing `ts` values. The timestamp is assembled with `Date.UTC` so the
 * fixture is byte-identical on every machine; the local-time `Date`
 * constructor would shift it by the host's UTC offset.
 *
 * @param status Outcome status to record; it is also used as the `gates` value.
 * @returns A new outcome attributed to agent `builder-1`.
 */
function outcome(status: OutcomeStatus): SkillOutcome {
  outcomeSeq += 1;
  return {
    status,
    agent: "builder-1",
    taskId: `task-${outcomeSeq}`,
    gates: status,
    ts: new Date(Date.UTC(2026, 4, 31, 0, 0, outcomeSeq)).toISOString(),
  };
}
|
|
27
|
+
|
|
28
|
+
/**
 * Produce `n` freshly built outcomes that all share `status`.
 *
 * Each element comes from its own `outcome(status)` call, made in order.
 */
function outcomes(status: OutcomeStatus, n: number): SkillOutcome[] {
  const built: SkillOutcome[] = [];
  // `Array.from({ length: n })` normalises `n` into a valid array length.
  for (const _slot of Array.from({ length: n })) {
    built.push(outcome(status));
  }
  return built;
}
|
|
32
|
+
|
|
33
|
+
/**
 * Build a {@link Skill} fixture for lifecycle tests.
 *
 * Starts from a fully populated `active` skill with a 0.9 confidence and no
 * recorded usage, then applies `overrides` on top (override keys win).
 */
function makeSkill(overrides: Partial<Skill> = {}): Skill {
  const baseline: Skill = {
    id: "00000000-0000-0000-0000-000000000000",
    slug: "example-skill",
    title: "Example skill",
    version: 1,
    status: "active",
    goal: "Do the thing reliably.",
    whenToUse: ["when the thing needs doing"],
    filePatterns: ["src/**/*.ts"],
    tags: ["example"],
    created: "2026-05-31T00:00:00.000Z",
    updatedAt: "2026-05-31T00:00:00.000Z",
    relatesTo: [],
    supersedes: [],
    body: "Steps go here.",
    confidence: 0.9,
    appliedCount: 0,
    successCount: 0,
    lastOutcome: null,
  };
  return { ...baseline, ...overrides };
}
|
|
57
|
+
|
|
58
|
+
/** Lifecycle tunables shared by the `evaluateLifecycle` tests. */
const CFG: LifecycleConfig = {
  quarantineBelow: 0.3,
  minSamples: 5,
};
|
|
59
|
+
|
|
60
|
+
// --- wilsonLowerBound -----------------------------------------------------
|
|
61
|
+
|
|
62
|
+
/**
 * Tests for `wilsonLowerBound`: degenerate inputs, clamping, monotonicity in
 * both successes and sample size, sensitivity to `z`, and fractional
 * (weighted) success totals.
 */
describe("wilsonLowerBound", () => {
  test("n <= 0 returns 0 (no evidence, no confidence)", () => {
    expect(wilsonLowerBound(0, 0)).toBe(0);
    expect(wilsonLowerBound(5, 0)).toBe(0);
    expect(wilsonLowerBound(3, -2)).toBe(0);
  });

  test("0 successes out of n yields a lower bound of 0", () => {
    expect(wilsonLowerBound(0, 1)).toBe(0);
    expect(wilsonLowerBound(0, 10)).toBe(0);
    expect(wilsonLowerBound(0, 1000)).toBe(0);
  });

  test("all successes give a high bound but strictly below 1 for small n", () => {
    const small = wilsonLowerBound(1, 1);
    expect(small).toBeGreaterThan(0);
    expect(small).toBeLessThan(1);

    // More all-success evidence pushes the bound up, but never reaches 1.
    const more = wilsonLowerBound(20, 20);
    expect(more).toBeLessThan(1);
    expect(more).toBeGreaterThan(small);
  });

  test("result is always clamped to [0, 1]", () => {
    for (const [s, n] of [
      [0, 1],
      [1, 1],
      [5, 10],
      [10, 10],
      [100, 100],
      [3, 7],
    ] as const) {
      const lb = wilsonLowerBound(s, n);
      expect(lb).toBeGreaterThanOrEqual(0);
      expect(lb).toBeLessThanOrEqual(1);
    }
  });

  test("out-of-range successes are clamped, keeping the bound in [0,1]", () => {
    // successes > n is clamped to n -> behaves like an all-success run.
    expect(wilsonLowerBound(99, 10)).toBe(wilsonLowerBound(10, 10));
    // negative successes clamp to 0.
    expect(wilsonLowerBound(-5, 10)).toBe(wilsonLowerBound(0, 10));
  });

  test("monotonic in successes at fixed n", () => {
    const n = 10;
    let previous = -1;
    for (let s = 0; s <= n; s += 1) {
      const lb = wilsonLowerBound(s, n);
      expect(lb).toBeGreaterThan(previous);
      previous = lb;
    }
  });

  test("monotonic in n for a fixed perfect success rate (more evidence => higher bound)", () => {
    let previous = -1;
    for (const n of [1, 2, 5, 10, 25, 50, 100]) {
      const lb = wilsonLowerBound(n, n);
      expect(lb).toBeGreaterThan(previous);
      previous = lb;
    }
  });

  test("larger z (wider interval) lowers the bound", () => {
    const tight = wilsonLowerBound(8, 10, 1.0);
    const wide = wilsonLowerBound(8, 10, 2.576);
    expect(wide).toBeLessThan(tight);
  });

  test("supports fractional (weighted) successes", () => {
    // 5 partials over 10 trials -> weight 2.5, i.e. a raw proportion of
    // 2.5 / 10 = 0.25. For a non-zero proportion the Wilson lower bound sits
    // strictly below the raw proportion, so it must land between 0 and 0.25.
    const lb = wilsonLowerBound(2.5, 10);
    expect(lb).toBeGreaterThan(0);
    expect(lb).toBeLessThan(0.25);
  });
});
|
|
141
|
+
|
|
142
|
+
// --- computeConfidence ----------------------------------------------------
|
|
143
|
+
|
|
144
|
+
/**
 * Tests for `computeConfidence`: the empty log, outcome weighting, which
 * element becomes `lastOutcome`, and how sample size moves the confidence.
 */
describe("computeConfidence", () => {
  test("empty outcome log -> zeroed result with null lastOutcome", () => {
    const summary = computeConfidence([]);
    expect(summary).toEqual({
      confidence: 0,
      appliedCount: 0,
      successCount: 0,
      lastOutcome: null,
    });
  });

  test("weights outcomes: success=1, partial=0.5, failure=0", () => {
    const log: SkillOutcome[] = [
      outcome("success"),
      outcome("partial"),
      outcome("failure"),
      outcome("success"),
    ];
    const { appliedCount, successCount, confidence } = computeConfidence(log);

    expect(appliedCount).toBe(4);
    // The weight sum 1 + 0.5 + 0 + 1 = 2.5 is reported as-is, never rounded.
    expect(successCount).toBe(2.5);
    expect(confidence).toBeCloseTo(wilsonLowerBound(2.5, 4), 12);
  });

  test("lastOutcome reflects the final element in chronological order", () => {
    const endsInFailure = [outcome("success"), outcome("failure")];
    expect(computeConfidence(endsInFailure).lastOutcome).toBe("failure");

    const endsInPartial = [outcome("failure"), outcome("partial")];
    expect(computeConfidence(endsInPartial).lastOutcome).toBe("partial");

    const single = [outcome("success")];
    expect(computeConfidence(single).lastOutcome).toBe("success");
  });

  test("successCount is a SUM of weights, not a rounded integer", () => {
    const { successCount } = computeConfidence(outcomes("partial", 3));
    expect(successCount).toBe(1.5);
    expect(Number.isInteger(successCount)).toBe(false);
  });

  test("all-success confidence equals the Wilson bound for n/n", () => {
    const { successCount, confidence } = computeConfidence(outcomes("success", 8));
    expect(successCount).toBe(8);
    expect(confidence).toBeCloseTo(wilsonLowerBound(8, 8), 12);
    expect(confidence).toBeLessThan(1);
  });

  test("all-failure confidence is 0", () => {
    const { successCount, confidence } = computeConfidence(outcomes("failure", 6));
    expect(successCount).toBe(0);
    expect(confidence).toBe(0);
  });

  test("small-sample penalty: 1/1 success has LOWER confidence than 30/33", () => {
    const lone = computeConfidence(outcomes("success", 1));

    const wellTested = computeConfidence([
      ...outcomes("success", 30),
      ...outcomes("failure", 3),
    ]);

    expect(wellTested.appliedCount).toBe(33);
    expect(wellTested.successCount).toBe(30);
    expect(lone.confidence).toBeLessThan(wellTested.confidence);
  });

  test("more consistent successes raise confidence over time", () => {
    const early = computeConfidence(outcomes("success", 3));
    const later = computeConfidence(outcomes("success", 30));
    expect(later.confidence).toBeGreaterThan(early.confidence);
  });

  test("confidence stays within [0, 1] across mixed logs", () => {
    const mixedLog: SkillOutcome[] = [
      ...outcomes("success", 7),
      ...outcomes("partial", 4),
      ...outcomes("failure", 5),
    ];
    const { confidence, appliedCount, successCount } = computeConfidence(mixedLog);

    expect(confidence).toBeGreaterThanOrEqual(0);
    expect(confidence).toBeLessThanOrEqual(1);
    expect(appliedCount).toBe(16);
    // 7 successes + 4 partials at 0.5 each + 5 failures at 0 = 9.
    expect(successCount).toBe(9);
  });
});
|
|
223
|
+
|
|
224
|
+
// --- evaluateLifecycle ----------------------------------------------------
|
|
225
|
+
|
|
226
|
+
/**
 * Tests for `evaluateLifecycle`: the confidence-floor rule, the
 * three-consecutive-failures rule, and status preservation when neither fires.
 */
describe("evaluateLifecycle", () => {
  /** A well-established active skill whose confidence is far above the floor. */
  const trustedSkill = (): Skill =>
    makeSkill({ status: "active", appliedCount: 50, confidence: 0.95 });

  test("quarantines on low confidence WITH enough samples", () => {
    const weak = makeSkill({ status: "active", appliedCount: 10, confidence: 0.2 });
    const verdict = evaluateLifecycle(weak, [], CFG);
    expect(verdict).toBe("quarantined");
  });

  test("does NOT quarantine on low confidence when below minSamples", () => {
    const underSampled = makeSkill({ status: "active", appliedCount: 2, confidence: 0.1 });
    const verdict = evaluateLifecycle(underSampled, [], CFG);
    expect(verdict).toBe("active");
  });

  test("does NOT quarantine when confidence is at or above the floor", () => {
    const atFloor = makeSkill({ status: "active", appliedCount: 20, confidence: 0.3 });
    expect(evaluateLifecycle(atFloor, [], CFG)).toBe("active");

    const aboveFloor = makeSkill({ status: "active", appliedCount: 20, confidence: 0.8 });
    expect(evaluateLifecycle(aboveFloor, [], CFG)).toBe("active");
  });

  test("quarantines on three consecutive failures regardless of historical confidence", () => {
    const skill = trustedSkill();
    const threeFailures = outcomes("failure", 3);
    expect(evaluateLifecycle(skill, threeFailures, CFG)).toBe("quarantined");
  });

  test("inspects only the trailing 3 outcomes for the consecutive-failure rule", () => {
    const skill = trustedSkill();

    // Early failures followed by three successes: the tail is healthy.
    const recoveredTail: SkillOutcome[] = [
      ...outcomes("failure", 5),
      outcome("success"),
      outcome("success"),
      outcome("success"),
    ];
    expect(evaluateLifecycle(skill, recoveredTail, CFG)).toBe("active");

    // Early successes followed by three failures: the tail trips the rule.
    const regressedTail: SkillOutcome[] = [
      outcome("success"),
      outcome("success"),
      ...outcomes("failure", 3),
    ];
    expect(evaluateLifecycle(skill, regressedTail, CFG)).toBe("quarantined");
  });

  test("fewer than 3 trailing failures never trips the consecutive rule", () => {
    const skill = trustedSkill();

    const twoFailures = outcomes("failure", 2);
    expect(evaluateLifecycle(skill, twoFailures, CFG)).toBe("active");

    const interrupted = [outcome("failure"), outcome("success"), outcome("failure")];
    expect(evaluateLifecycle(skill, interrupted, CFG)).toBe("active");
  });

  test("preserves the current status when neither rule fires", () => {
    const healthy = makeSkill({ status: "active", appliedCount: 40, confidence: 0.7 });
    const verdict = evaluateLifecycle(healthy, outcomes("success", 3), CFG);
    expect(verdict).toBe("active");
  });

  test("never auto-resurrects a deprecated skill", () => {
    const deprecated = makeSkill({ status: "deprecated", appliedCount: 40, confidence: 0.99 });
    const verdict: SkillStatus = evaluateLifecycle(deprecated, outcomes("success", 3), CFG);
    expect(verdict).toBe("deprecated");
  });

  test("a deprecated skill can still be (re)quarantined by the rules", () => {
    const deprecated = makeSkill({ status: "deprecated", appliedCount: 40, confidence: 0.05 });
    const verdict = evaluateLifecycle(deprecated, [], CFG);
    expect(verdict).toBe("quarantined");
  });

  test("an already-quarantined skill stays quarantined when healthy again", () => {
    // The status is carried over, not promoted; reactivation is an explicit action.
    const quarantined = makeSkill({ status: "quarantined", appliedCount: 40, confidence: 0.9 });
    const verdict = evaluateLifecycle(quarantined, outcomes("success", 3), CFG);
    expect(verdict).toBe("quarantined");
  });
});
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Confidence + lifecycle scoring for skills — the canonical, pure implementation.
|
|
3
|
+
*
|
|
4
|
+
* A skill earns trust through use. Every time an agent applies a skill the
|
|
5
|
+
* session-end feedback step appends a {@link SkillOutcome} to its
|
|
6
|
+
* `outcomes.jsonl`. This module turns that outcome log into the derived
|
|
7
|
+
* confidence track record (Wilson lower bound) and decides when a skill should
|
|
8
|
+
* be quarantined out of retrieval.
|
|
9
|
+
*
|
|
10
|
+
* WHY the Wilson score lower bound (rather than a naive success ratio): a skill
|
|
11
|
+
* that succeeded once (1/1) should NOT outrank a skill that succeeded 30 of 33
|
|
12
|
+
* times. The naive ratio gives the 1/1 skill a "perfect" 1.0; Wilson penalises
|
|
13
|
+
* small samples by reporting the lower bound of the confidence interval on the
|
|
14
|
+
* true success rate, so 1/1 lands well below 30/33. Outcomes are weighted —
|
|
15
|
+
* `partial` counts as half a success — so the "successes" fed to Wilson is a
|
|
16
|
+
* real-valued weight, not an integer count.
|
|
17
|
+
*
|
|
18
|
+
* PURITY: this module performs no I/O and imports no store. The skill store and
|
|
19
|
+
* the session-end hook each recompute these values inline at write time, but
|
|
20
|
+
* they converge on the formulas defined here (this is the single tested source
|
|
21
|
+
* of truth). Do NOT import the store from here — that would create a cycle.
|
|
22
|
+
*/
|
|
23
|
+
|
|
24
|
+
import type { OutcomeStatus } from "../types.ts";
|
|
25
|
+
import type { Skill, SkillOutcome, SkillStatus } from "./types.ts";
|
|
26
|
+
|
|
27
|
+
/** z-score for a two-sided 95% confidence interval (equivalently a one-sided 97.5% lower bound); the conventional Wilson default. */
|
|
28
|
+
const DEFAULT_Z = 1.96;
|
|
29
|
+
|
|
30
|
+
/**
 * How much each outcome status contributes to the weighted success total:
 * a `partial` counts as half a success and a `failure` contributes nothing.
 */
const OUTCOME_WEIGHT: Record<OutcomeStatus, number> = {
  success: 1,
  partial: 0.5,
  failure: 0,
};
|
|
36
|
+
|
|
37
|
+
/** Confidence figures derived from a skill's complete outcome log. */
export interface ConfidenceResult {
  /** Wilson lower bound, in `[0, 1]`, of the weighted success proportion. */
  confidence: number;
  /** Number of outcomes in the log; this is the sample size `n`. */
  appliedCount: number;
  /**
   * Weighted success total (success=1, partial=0.5, failure=0). This is a
   * real number and may be fractional.
   */
  successCount: number;
  /** Status of the last outcome in the log, or `null` when the log is empty. */
  lastOutcome: OutcomeStatus | null;
}
|
|
48
|
+
|
|
49
|
+
/**
 * Wilson score interval lower bound for a proportion.
 *
 * Given `successes` out of `n` trials (successes may be fractional, since
 * outcomes are weighted), returns the lower limit of the Wilson score interval
 * at the given z-score. This is a small-sample-aware estimate of the true
 * success rate: it rewards more evidence and stays conservative when `n` is
 * tiny.
 *
 * Edge cases:
 * - `n <= 0` returns 0 (no evidence ⇒ no confidence).
 * - A `NaN` `successes`, or a non-finite `n` or `z`, also returns 0. Without
 *   this guard the arithmetic produces `NaN`, which `Math.min`/`Math.max`
 *   cannot clamp, and a `NaN` confidence compares `false` against every
 *   threshold.
 * - `successes` is clamped to `[0, n]` so a slightly out-of-range weight cannot
 *   push the bound outside the unit interval.
 * - The result is clamped to `[0, 1]`.
 *
 * Formula (lower limit of the interval):
 *
 *   p  = successes / n
 *   lb = (p + z²/2n − z·√( (p(1−p) + z²/4n) / n )) / (1 + z²/n)
 *
 * @param successes Weighted success total; may be fractional.
 * @param n Number of trials.
 * @param z z-score controlling the interval width; defaults to `DEFAULT_Z`.
 *   Expected to be non-negative: the margin is `z · √(…)`, so a negative value
 *   flips its sign.
 * @returns The lower bound, always a finite number in `[0, 1]`.
 */
export function wilsonLowerBound(successes: number, n: number, z: number = DEFAULT_Z): number {
  // Only inputs that would otherwise yield NaN are rejected here; ±Infinity
  // `successes` is still handled by the clamp below, exactly as before.
  if (Number.isNaN(successes) || !Number.isFinite(n) || !Number.isFinite(z)) {
    return 0;
  }
  if (n <= 0) {
    return 0;
  }
  // Defensive clamp: weighted successes must lie within [0, n].
  const s = Math.min(Math.max(successes, 0), n);
  const p = s / n;
  const z2 = z * z;
  const denominator = 1 + z2 / n;
  const centre = p + z2 / (2 * n);
  const margin = z * Math.sqrt((p * (1 - p) + z2 / (4 * n)) / n);
  const lb = (centre - margin) / denominator;
  // Clamp to the unit interval — guards against tiny floating-point overshoot.
  return Math.min(Math.max(lb, 0), 1);
}
|
|
83
|
+
|
|
84
|
+
/**
 * Derive a skill's confidence figures from its complete outcome log.
 *
 * Every outcome is converted to a weight via `OUTCOME_WEIGHT`; the weights are
 * summed into `successCount` and passed, together with the log length, to
 * `wilsonLowerBound`. An empty log short-circuits to a zeroed result with a
 * `null` `lastOutcome`.
 *
 * The log is assumed to be in chronological order, so `lastOutcome` is simply
 * the status of its final element.
 *
 * @param outcomes Outcome log, oldest first.
 * @returns Confidence, sample size, weighted success total, and latest status.
 */
export function computeConfidence(outcomes: SkillOutcome[]): ConfidenceResult {
  if (outcomes.length === 0) {
    return { confidence: 0, appliedCount: 0, successCount: 0, lastOutcome: null };
  }

  const appliedCount = outcomes.length;

  let weightSum = 0;
  for (const { status } of outcomes) {
    weightSum += OUTCOME_WEIGHT[status];
  }

  const newest = outcomes[appliedCount - 1];
  return {
    confidence: wilsonLowerBound(weightSum, appliedCount),
    appliedCount,
    successCount: weightSum,
    lastOutcome: newest ? newest.status : null,
  };
}
|
|
111
|
+
|
|
112
|
+
/** How many trailing failures in a row trigger an automatic quarantine. */
|
|
113
|
+
const CONSECUTIVE_FAILURE_LIMIT = 3;
|
|
114
|
+
|
|
115
|
+
/** Thresholds that decide when a skill is quarantined out of retrieval. */
export interface LifecycleConfig {
  /**
   * Confidence floor: a skill whose confidence is strictly below this value is
   * quarantined, provided it has at least `minSamples` applications.
   */
  quarantineBelow: number;
  /** Smallest `appliedCount` at which the confidence floor starts to apply. */
  minSamples: number;
}
|
|
122
|
+
|
|
123
|
+
/**
 * Decide which lifecycle status a skill should have, given its stored
 * confidence figures and its most recent outcomes.
 *
 * The result is `"quarantined"` when either rule fires:
 *   1. Confidence floor: `skill.appliedCount >= cfg.minSamples` and
 *      `skill.confidence < cfg.quarantineBelow`.
 *   2. Sudden regression: `hasConsecutiveFailures(recentOutcomes)` is true,
 *      whatever the historical confidence.
 *
 * When neither fires the skill's existing `status` is returned unchanged, so a
 * `deprecated` or already `quarantined` skill is never promoted back to
 * `active` here.
 *
 * @param skill Skill whose `appliedCount`, `confidence`, and `status` are read.
 * @param recentOutcomes Outcomes in chronological order; the full log or just
 *   its tail.
 * @param cfg Quarantine thresholds.
 * @returns `"quarantined"` or the skill's current status.
 */
export function evaluateLifecycle(
  skill: Skill,
  recentOutcomes: SkillOutcome[],
  cfg: LifecycleConfig,
): SkillStatus {
  const hasEnoughEvidence = skill.appliedCount >= cfg.minSamples;
  if (hasEnoughEvidence && skill.confidence < cfg.quarantineBelow) {
    return "quarantined";
  }

  if (hasConsecutiveFailures(recentOutcomes)) {
    return "quarantined";
  }

  return skill.status;
}
|
|
156
|
+
|
|
157
|
+
/**
 * Report whether the last `CONSECUTIVE_FAILURE_LIMIT` outcomes are all
 * failures. A log shorter than the limit can never satisfy the rule.
 *
 * @param outcomes Outcomes in chronological order.
 * @returns `true` only when every outcome in the trailing window failed.
 */
function hasConsecutiveFailures(outcomes: SkillOutcome[]): boolean {
  const hasFullWindow = outcomes.length >= CONSECUTIVE_FAILURE_LIMIT;
  if (!hasFullWindow) {
    return false;
  }
  const trailingWindow = outcomes.slice(-CONSECUTIVE_FAILURE_LIMIT);
  // The streak holds exactly when no outcome in the window is a non-failure.
  const streakBroken = trailingWindow.some((entry) => entry.status !== "failure");
  return !streakBroken;
}
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Skills lifecycle glue — the two integration points that close the learning
|
|
3
|
+
* loop, kept out of the command files so `sling`/`log` stay thin.
|
|
4
|
+
*
|
|
5
|
+
* - {@link retrieveSkillsForSpawn}: at spawn, select relevant skills, persist an
|
|
6
|
+
* `applied-skills.json` record, and return the overlay markdown block.
|
|
7
|
+
* - {@link runSkillFeedbackAndDistill}: at session-end, append outcomes to the
|
|
8
|
+
* applied skills (evolving confidence) and — when gates passed — distill a new
|
|
9
|
+
* or updated skill from the diff.
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
import { existsSync, mkdirSync, readFileSync, rmSync, writeFileSync } from "node:fs";
|
|
13
|
+
import { dirname } from "node:path";
|
|
14
|
+
import { appliedSkillsPath } from "../paths.ts";
|
|
15
|
+
import type { AgentRuntime } from "../runtimes/types.ts";
|
|
16
|
+
import type { OutcomeStatus, SkillsConfig } from "../types.ts";
|
|
17
|
+
import { distillSkill } from "./distiller.ts";
|
|
18
|
+
import { selectSkills } from "./retrieval.ts";
|
|
19
|
+
import { createSkillStore } from "./store.ts";
|
|
20
|
+
import type { AppliedSkillsRecord } from "./types.ts";
|
|
21
|
+
|
|
22
|
+
/** Inputs to {@link retrieveSkillsForSpawn}. */
export interface RetrieveSpawnArgs {
  /** Project root; used to open the skill store and locate the applied-skills record. */
  root: string;
  /** Name of the spawning agent; stored in the record and used to locate its file. */
  agentName: string;
  /** Agent capability; passed to skill selection and stored in the record. */
  capability: string;
  /** Task identifier stored in the record. */
  taskId: string;
  /** File scope passed to skill selection. */
  fileScope: string[];
  /** Task description text passed to skill selection. */
  taskText: string;
  /** Skills configuration: `enabled` gates retrieval; `retrieval` supplies `budgetChars` and `maxFull`. */
  skills: SkillsConfig;
}
|
|
31
|
+
|
|
32
|
+
/**
 * Select skills for a spawning agent.
 *
 * Returns the overlay markdown to inject (empty string when skills are
 * disabled or there are no active skills) and writes the `applied-skills.json`
 * record that the session-end feedback step consumes.
 *
 * When nothing is selected for this spawn, any record already on disk for the
 * same agent name is removed. Otherwise a record left behind by an earlier
 * session that never reached the feedback step would be read at session-end,
 * and this session's outcome would be appended to skills that were not
 * injected this time. The disabled path performs no I/O at all.
 *
 * The skill store is always closed before returning, including when selection
 * or the file write throws.
 *
 * @param args Spawn context and skills configuration.
 * @returns Overlay markdown produced by skill selection, or `""`.
 */
export function retrieveSkillsForSpawn(args: RetrieveSpawnArgs): string {
  if (!args.skills.enabled) return "";
  const store = createSkillStore(args.root);
  try {
    const recordPath = appliedSkillsPath(args.root, args.agentName);

    const all = store.list({ status: "active" });
    if (all.length === 0) {
      // `force: true` makes this a no-op when no stale record exists.
      rmSync(recordPath, { force: true });
      return "";
    }

    const result = selectSkills(all, {
      fileScope: args.fileScope,
      taskText: args.taskText,
      capability: args.capability,
      budgetChars: args.skills.retrieval.budgetChars,
      maxFull: args.skills.retrieval.maxFull,
    });

    const record: AppliedSkillsRecord = {
      taskId: args.taskId,
      agent: args.agentName,
      capability: args.capability,
      skills: [
        ...result.full.map((r) => ({
          id: r.skill.id,
          slug: r.skill.slug,
          injected: "full" as const,
        })),
        ...result.summarized.map((r) => ({
          id: r.skill.id,
          slug: r.skill.slug,
          injected: "summary" as const,
        })),
      ],
    };

    if (record.skills.length > 0) {
      mkdirSync(dirname(recordPath), { recursive: true });
      writeFileSync(recordPath, `${JSON.stringify(record, null, 2)}\n`, "utf8");
    } else {
      // Nothing was injected this time, so drop any record from a previous spawn.
      rmSync(recordPath, { force: true });
    }
    return result.overlayMarkdown;
  } finally {
    store.close();
  }
}
|
|
78
|
+
|
|
79
|
+
/** Inputs to {@link runSkillFeedbackAndDistill}. */
export interface FeedbackDistillArgs {
  /** Project root; used to open the skill store and locate the applied-skills record. */
  root: string;
  /** Agent whose session ended; stored on each appended outcome. */
  agentName: string;
  /** Agent capability; included in the outcome note and passed to the distiller. */
  capability: string;
  /** Task identifier stored on each outcome, or `null` when there is none. */
  taskId: string | null;
  /** Worktree path passed to the distiller. */
  worktreePath: string;
  /** Base ref passed to the distiller. */
  baseRef: string;
  /** Agent runtime passed to the distiller. */
  runtime: AgentRuntime;
  /**
   * Quality-gate status for the session (`null` when no gates ran). A `null`
   * is recorded as a `partial` outcome status, while the outcome's `gates`
   * field keeps the `null`.
   */
  outcomeStatus: OutcomeStatus | null;
  /** Skills configuration: `enabled` gates the whole step; `distill` controls distillation. */
  skills: SkillsConfig;
  /** Model used for distillation when `skills.distill.model` is not set. */
  model?: string;
}
|
|
92
|
+
|
|
93
|
+
/** Summary returned by {@link runSkillFeedbackAndDistill}. */
export interface FeedbackDistillResult {
  /** Number of outcomes appended to applied skills during the feedback step. */
  outcomesAppended: number;
  /** Distillation result; stays `{ action: "skipped" }` when distillation did not run. */
  distill: { action: "created" | "updated" | "skipped"; slug?: string };
}
|
|
97
|
+
|
|
98
|
+
/**
 * Session-end hook: record the session's outcome against every applied skill
 * (which evolves each skill's confidence), then optionally distill a skill
 * from the work.
 *
 * Steps, in order:
 *   1. Return the zeroed result immediately when skills are disabled.
 *   2. Read the applied slugs, and if the applied-skills record exists, append
 *      one outcome per slug still present in the store, then delete the record.
 *   3. Run the distiller unless `skills.distill.onlyOnGatesPass` is set and the
 *      session's gate status is not `"success"`.
 *
 * The store is closed on every exit path.
 *
 * NOTE(review): this function is meant to be best-effort for the caller's hook
 * path, but nothing here catches exceptions. A throw from the store, `rmSync`,
 * or `distillSkill` propagates to the caller once the store has been closed,
 * so callers that must not fail need their own `catch`.
 *
 * @param args Session context, gate status, and skills configuration.
 * @returns How many outcomes were appended and what the distiller did.
 */
export async function runSkillFeedbackAndDistill(
  args: FeedbackDistillArgs,
): Promise<FeedbackDistillResult> {
  const summary: FeedbackDistillResult = {
    outcomesAppended: 0,
    distill: { action: "skipped" },
  };
  if (!args.skills.enabled) {
    return summary;
  }

  const store = createSkillStore(args.root);
  try {
    // Feedback. The slugs are read before the record is deleted because the
    // distiller still needs them to target an UPDATE.
    const appliedSlugs = readAppliedSlugs(args.root, args.agentName);
    const recordPath = appliedSkillsPath(args.root, args.agentName);
    if (existsSync(recordPath)) {
      // Sessions without gates are recorded as a partial outcome.
      const recordedStatus: OutcomeStatus = args.outcomeStatus ?? "partial";
      const recordedAt = new Date().toISOString();
      for (const slug of appliedSlugs) {
        // Skip slugs the store no longer knows about.
        if (store.get(slug)) {
          store.appendOutcome(slug, {
            status: recordedStatus,
            agent: args.agentName,
            taskId: args.taskId,
            gates: args.outcomeStatus,
            ts: recordedAt,
            note: `Applied by ${args.capability} ${args.agentName}`,
          });
          summary.outcomesAppended += 1;
        }
      }
      rmSync(recordPath, { force: true });
    }

    // Distill. When configured, only work that passed its gates qualifies.
    const blockedByGates =
      args.skills.distill.onlyOnGatesPass && args.outcomeStatus !== "success";
    if (blockedByGates) {
      return summary;
    }

    summary.distill = await distillSkill({
      store,
      runtime: args.runtime,
      root: args.root,
      worktreePath: args.worktreePath,
      baseRef: args.baseRef,
      taskId: args.taskId,
      agentName: args.agentName,
      capability: args.capability,
      appliedSlugs,
      model: args.skills.distill.model ?? args.model,
    });
    return summary;
  } finally {
    store.close();
  }
}
|
|
158
|
+
|
|
159
|
+
/**
 * Read the applied-skill slugs recorded at spawn.
 *
 * The record is parsed as untrusted JSON and narrowed by hand rather than
 * cast, so the result really is a `string[]`:
 * - a missing file, unreadable file, or invalid JSON yields `[]`;
 * - a payload that is not an object with a `skills` array yields `[]`;
 * - entries that are not objects with a non-empty string `slug` are skipped,
 *   while well-formed entries are kept in their recorded order.
 *
 * @param root Project root used to locate the record.
 * @param agentName Agent whose record should be read.
 * @returns Slugs of the skills recorded for the agent; never throws.
 */
function readAppliedSlugs(root: string, agentName: string): string[] {
  const path = appliedSkillsPath(root, agentName);
  if (!existsSync(path)) return [];
  try {
    const parsed: unknown = JSON.parse(readFileSync(path, "utf8"));
    if (typeof parsed !== "object" || parsed === null) return [];

    const entries = (parsed as { skills?: unknown }).skills;
    if (!Array.isArray(entries)) return [];

    const slugs: string[] = [];
    for (const entry of entries) {
      if (typeof entry !== "object" || entry === null) continue;
      const slug = (entry as { slug?: unknown }).slug;
      if (typeof slug === "string" && slug.length > 0) {
        slugs.push(slug);
      }
    }
    return slugs;
  } catch {
    // The file vanished between the existence check and the read, or it is not valid JSON.
    return [];
  }
}
|