selftune 0.1.4 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/agents/diagnosis-analyst.md +156 -0
- package/.claude/agents/evolution-reviewer.md +180 -0
- package/.claude/agents/integration-guide.md +212 -0
- package/.claude/agents/pattern-analyst.md +160 -0
- package/CHANGELOG.md +46 -1
- package/README.md +105 -257
- package/apps/local-dashboard/dist/assets/geist-cyrillic-wght-normal-CHSlOQsW.woff2 +0 -0
- package/apps/local-dashboard/dist/assets/geist-latin-ext-wght-normal-DMtmJ5ZE.woff2 +0 -0
- package/apps/local-dashboard/dist/assets/geist-latin-wght-normal-Dm3htQBi.woff2 +0 -0
- package/apps/local-dashboard/dist/assets/index-C4EOTFZ2.js +15 -0
- package/apps/local-dashboard/dist/assets/index-bl-Webyd.css +1 -0
- package/apps/local-dashboard/dist/assets/vendor-react-U7zYD9Rg.js +60 -0
- package/apps/local-dashboard/dist/assets/vendor-table-B7VF2Ipl.js +26 -0
- package/apps/local-dashboard/dist/assets/vendor-ui-D7_zX_qy.js +346 -0
- package/apps/local-dashboard/dist/favicon.png +0 -0
- package/apps/local-dashboard/dist/index.html +17 -0
- package/apps/local-dashboard/dist/logo.png +0 -0
- package/apps/local-dashboard/dist/logo.svg +9 -0
- package/assets/BeforeAfter.gif +0 -0
- package/assets/FeedbackLoop.gif +0 -0
- package/assets/logo.svg +9 -0
- package/assets/skill-health-badge.svg +20 -0
- package/cli/selftune/activation-rules.ts +171 -0
- package/cli/selftune/badge/badge-data.ts +108 -0
- package/cli/selftune/badge/badge-svg.ts +212 -0
- package/cli/selftune/badge/badge.ts +99 -0
- package/cli/selftune/canonical-export.ts +183 -0
- package/cli/selftune/constants.ts +103 -1
- package/cli/selftune/contribute/bundle.ts +314 -0
- package/cli/selftune/contribute/contribute.ts +214 -0
- package/cli/selftune/contribute/sanitize.ts +162 -0
- package/cli/selftune/cron/setup.ts +266 -0
- package/cli/selftune/dashboard-contract.ts +202 -0
- package/cli/selftune/dashboard-server.ts +1049 -0
- package/cli/selftune/dashboard.ts +43 -156
- package/cli/selftune/eval/baseline.ts +248 -0
- package/cli/selftune/eval/composability-v2.ts +273 -0
- package/cli/selftune/eval/composability.ts +117 -0
- package/cli/selftune/eval/generate-unit-tests.ts +143 -0
- package/cli/selftune/eval/hooks-to-evals.ts +101 -16
- package/cli/selftune/eval/import-skillsbench.ts +221 -0
- package/cli/selftune/eval/synthetic-evals.ts +172 -0
- package/cli/selftune/eval/unit-test-cli.ts +152 -0
- package/cli/selftune/eval/unit-test.ts +196 -0
- package/cli/selftune/evolution/deploy-proposal.ts +142 -1
- package/cli/selftune/evolution/evidence.ts +26 -0
- package/cli/selftune/evolution/evolve-body.ts +586 -0
- package/cli/selftune/evolution/evolve.ts +825 -116
- package/cli/selftune/evolution/extract-patterns.ts +105 -16
- package/cli/selftune/evolution/pareto.ts +314 -0
- package/cli/selftune/evolution/propose-body.ts +171 -0
- package/cli/selftune/evolution/propose-description.ts +100 -2
- package/cli/selftune/evolution/propose-routing.ts +166 -0
- package/cli/selftune/evolution/refine-body.ts +141 -0
- package/cli/selftune/evolution/rollback.ts +21 -4
- package/cli/selftune/evolution/validate-body.ts +254 -0
- package/cli/selftune/evolution/validate-proposal.ts +257 -35
- package/cli/selftune/evolution/validate-routing.ts +177 -0
- package/cli/selftune/grading/auto-grade.ts +200 -0
- package/cli/selftune/grading/grade-session.ts +513 -42
- package/cli/selftune/grading/pre-gates.ts +104 -0
- package/cli/selftune/grading/results.ts +42 -0
- package/cli/selftune/hooks/auto-activate.ts +185 -0
- package/cli/selftune/hooks/evolution-guard.ts +165 -0
- package/cli/selftune/hooks/prompt-log.ts +172 -2
- package/cli/selftune/hooks/session-stop.ts +123 -3
- package/cli/selftune/hooks/skill-change-guard.ts +112 -0
- package/cli/selftune/hooks/skill-eval.ts +119 -3
- package/cli/selftune/index.ts +415 -48
- package/cli/selftune/ingestors/claude-replay.ts +377 -0
- package/cli/selftune/ingestors/codex-rollout.ts +345 -46
- package/cli/selftune/ingestors/codex-wrapper.ts +207 -39
- package/cli/selftune/ingestors/openclaw-ingest.ts +573 -0
- package/cli/selftune/ingestors/opencode-ingest.ts +193 -17
- package/cli/selftune/init.ts +376 -16
- package/cli/selftune/last.ts +14 -5
- package/cli/selftune/localdb/db.ts +63 -0
- package/cli/selftune/localdb/materialize.ts +428 -0
- package/cli/selftune/localdb/queries.ts +376 -0
- package/cli/selftune/localdb/schema.ts +204 -0
- package/cli/selftune/memory/writer.ts +447 -0
- package/cli/selftune/monitoring/watch.ts +90 -16
- package/cli/selftune/normalization.ts +682 -0
- package/cli/selftune/observability.ts +19 -44
- package/cli/selftune/orchestrate.ts +1073 -0
- package/cli/selftune/quickstart.ts +203 -0
- package/cli/selftune/repair/skill-usage.ts +576 -0
- package/cli/selftune/schedule.ts +561 -0
- package/cli/selftune/status.ts +59 -33
- package/cli/selftune/sync.ts +627 -0
- package/cli/selftune/types.ts +525 -5
- package/cli/selftune/utils/canonical-log.ts +45 -0
- package/cli/selftune/utils/frontmatter.ts +217 -0
- package/cli/selftune/utils/hooks.ts +41 -0
- package/cli/selftune/utils/html.ts +27 -0
- package/cli/selftune/utils/llm-call.ts +103 -19
- package/cli/selftune/utils/math.ts +10 -0
- package/cli/selftune/utils/query-filter.ts +139 -0
- package/cli/selftune/utils/skill-discovery.ts +340 -0
- package/cli/selftune/utils/skill-log.ts +68 -0
- package/cli/selftune/utils/skill-usage-confidence.ts +18 -0
- package/cli/selftune/utils/transcript.ts +307 -26
- package/cli/selftune/utils/trigger-check.ts +89 -0
- package/cli/selftune/utils/tui.ts +156 -0
- package/cli/selftune/workflows/discover.ts +254 -0
- package/cli/selftune/workflows/skill-md-writer.ts +288 -0
- package/cli/selftune/workflows/workflows.ts +188 -0
- package/package.json +28 -11
- package/packages/telemetry-contract/README.md +11 -0
- package/packages/telemetry-contract/fixtures/golden.json +87 -0
- package/packages/telemetry-contract/fixtures/golden.test.ts +42 -0
- package/packages/telemetry-contract/index.ts +1 -0
- package/packages/telemetry-contract/package.json +19 -0
- package/packages/telemetry-contract/src/index.ts +2 -0
- package/packages/telemetry-contract/src/types.ts +163 -0
- package/packages/telemetry-contract/src/validators.ts +109 -0
- package/skill/SKILL.md +180 -33
- package/skill/Workflows/AutoActivation.md +145 -0
- package/skill/Workflows/Badge.md +124 -0
- package/skill/Workflows/Baseline.md +144 -0
- package/skill/Workflows/Composability.md +107 -0
- package/skill/Workflows/Contribute.md +94 -0
- package/skill/Workflows/Cron.md +132 -0
- package/skill/Workflows/Dashboard.md +214 -0
- package/skill/Workflows/Doctor.md +63 -14
- package/skill/Workflows/Evals.md +110 -18
- package/skill/Workflows/EvolutionMemory.md +154 -0
- package/skill/Workflows/Evolve.md +181 -21
- package/skill/Workflows/EvolveBody.md +159 -0
- package/skill/Workflows/Grade.md +36 -31
- package/skill/Workflows/ImportSkillsBench.md +117 -0
- package/skill/Workflows/Ingest.md +142 -21
- package/skill/Workflows/Initialize.md +91 -23
- package/skill/Workflows/Orchestrate.md +139 -0
- package/skill/Workflows/Replay.md +91 -0
- package/skill/Workflows/Rollback.md +23 -4
- package/skill/Workflows/Schedule.md +61 -0
- package/skill/Workflows/Sync.md +88 -0
- package/skill/Workflows/UnitTest.md +150 -0
- package/skill/Workflows/Watch.md +33 -1
- package/skill/Workflows/Workflows.md +129 -0
- package/skill/assets/activation-rules-default.json +26 -0
- package/skill/assets/multi-skill-settings.json +63 -0
- package/skill/assets/single-skill-settings.json +57 -0
- package/skill/references/invocation-taxonomy.md +2 -2
- package/skill/references/logs.md +164 -2
- package/skill/references/setup-patterns.md +65 -0
- package/skill/references/version-history.md +40 -0
- package/skill/settings_snippet.json +23 -0
- package/templates/activation-rules-default.json +27 -0
- package/templates/multi-skill-settings.json +64 -0
- package/templates/single-skill-settings.json +58 -0
- package/dashboard/index.html +0 -1119
|
@@ -6,7 +6,16 @@
|
|
|
6
6
|
* similar queries together using Jaccard similarity.
|
|
7
7
|
*/
|
|
8
8
|
|
|
9
|
-
import type {
|
|
9
|
+
import type {
|
|
10
|
+
EvalEntry,
|
|
11
|
+
FailureFeedback,
|
|
12
|
+
FailurePattern,
|
|
13
|
+
GradingResult,
|
|
14
|
+
InvocationType,
|
|
15
|
+
SkillUsageRecord,
|
|
16
|
+
} from "../types.js";
|
|
17
|
+
import { filterActionableSkillUsageRecords } from "../utils/query-filter.js";
|
|
18
|
+
import { isHighConfidencePositiveSkillRecord } from "../utils/skill-usage-confidence.js";
|
|
10
19
|
|
|
11
20
|
// ---------------------------------------------------------------------------
|
|
12
21
|
// Jaccard similarity
|
|
@@ -93,37 +102,39 @@ export function extractFailurePatterns(
|
|
|
93
102
|
evalEntries: EvalEntry[],
|
|
94
103
|
skillUsage: SkillUsageRecord[],
|
|
95
104
|
skillName: string,
|
|
105
|
+
gradingResults?: GradingResult[],
|
|
96
106
|
): FailurePattern[] {
|
|
97
|
-
|
|
107
|
+
const actionableSkillUsage = filterActionableSkillUsageRecords(skillUsage);
|
|
98
108
|
const triggeredQueries = new Set<string>();
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
109
|
+
const skillUsageBySession = new Map<string, SkillUsageRecord[]>();
|
|
110
|
+
|
|
111
|
+
for (const record of actionableSkillUsage) {
|
|
112
|
+
if (!isHighConfidencePositiveSkillRecord(record, skillName)) continue;
|
|
113
|
+
triggeredQueries.add(record.query);
|
|
114
|
+
const sessionRecords = skillUsageBySession.get(record.session_id) ?? [];
|
|
115
|
+
sessionRecords.push(record);
|
|
116
|
+
skillUsageBySession.set(record.session_id, sessionRecords);
|
|
103
117
|
}
|
|
104
118
|
|
|
105
|
-
// 2. Find missed queries: should_trigger === true but NOT in the triggered set
|
|
106
119
|
const missedByType = new Map<InvocationType, string[]>();
|
|
107
|
-
|
|
108
120
|
for (const entry of evalEntries) {
|
|
109
121
|
if (!entry.should_trigger) continue;
|
|
110
122
|
if (triggeredQueries.has(entry.query)) continue;
|
|
111
123
|
|
|
112
124
|
const invType = entry.invocation_type ?? "implicit";
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
missedByType.get(invType)?.push(entry.query);
|
|
125
|
+
const queries = missedByType.get(invType) ?? [];
|
|
126
|
+
queries.push(entry.query);
|
|
127
|
+
missedByType.set(invType, queries);
|
|
117
128
|
}
|
|
118
129
|
|
|
119
|
-
// 3. For each group, cluster similar queries
|
|
120
130
|
const now = new Date().toISOString();
|
|
121
131
|
const allPatterns: FailurePattern[] = [];
|
|
122
132
|
let index = 0;
|
|
133
|
+
const feedbackMap = new Map<string, FailureFeedback[]>();
|
|
134
|
+
const sampleSessionsByQuery = new Map<string, Set<string>>();
|
|
123
135
|
|
|
124
136
|
for (const [invType, queries] of missedByType) {
|
|
125
137
|
const clusters = clusterQueries(queries);
|
|
126
|
-
|
|
127
138
|
for (const cluster of clusters) {
|
|
128
139
|
allPatterns.push({
|
|
129
140
|
pattern_id: `fp-${skillName}-${index}`,
|
|
@@ -138,8 +149,86 @@ export function extractFailurePatterns(
|
|
|
138
149
|
}
|
|
139
150
|
}
|
|
140
151
|
|
|
141
|
-
|
|
142
|
-
|
|
152
|
+
if (gradingResults && gradingResults.length > 0) {
|
|
153
|
+
for (const result of gradingResults) {
|
|
154
|
+
const hasExplicitFeedback = (result.failure_feedback?.length ?? 0) > 0;
|
|
155
|
+
const hasFailedSummary = (result.summary.failed ?? 0) > 0;
|
|
156
|
+
if (result.skill_name !== skillName || (!hasExplicitFeedback && !hasFailedSummary)) continue;
|
|
157
|
+
|
|
158
|
+
const failedQueries = new Set<string>();
|
|
159
|
+
|
|
160
|
+
if (result.failure_feedback) {
|
|
161
|
+
const sessionRecords = skillUsageBySession.get(result.session_id) ?? [];
|
|
162
|
+
for (const feedback of result.failure_feedback) {
|
|
163
|
+
if (!feedback.query) continue;
|
|
164
|
+
const existing = feedbackMap.get(feedback.query) ?? [];
|
|
165
|
+
existing.push(feedback);
|
|
166
|
+
feedbackMap.set(feedback.query, existing);
|
|
167
|
+
if (sessionRecords.some((record) => record.query === feedback.query)) {
|
|
168
|
+
failedQueries.add(feedback.query);
|
|
169
|
+
const sessions = sampleSessionsByQuery.get(feedback.query) ?? new Set<string>();
|
|
170
|
+
sessions.add(result.session_id);
|
|
171
|
+
sampleSessionsByQuery.set(feedback.query, sessions);
|
|
172
|
+
}
|
|
173
|
+
}
|
|
174
|
+
}
|
|
143
175
|
|
|
176
|
+
if (failedQueries.size === 0) {
|
|
177
|
+
const sessionRecords = skillUsageBySession.get(result.session_id) ?? [];
|
|
178
|
+
const failedExpectations = result.expectations.filter((expectation) => !expectation.passed);
|
|
179
|
+
for (const record of sessionRecords) {
|
|
180
|
+
failedQueries.add(record.query);
|
|
181
|
+
const sessions = sampleSessionsByQuery.get(record.query) ?? new Set<string>();
|
|
182
|
+
sessions.add(result.session_id);
|
|
183
|
+
sampleSessionsByQuery.set(record.query, sessions);
|
|
184
|
+
|
|
185
|
+
if (failedExpectations.length > 0) {
|
|
186
|
+
const feedback = feedbackMap.get(record.query) ?? [];
|
|
187
|
+
for (const expectation of failedExpectations) {
|
|
188
|
+
feedback.push({
|
|
189
|
+
query: record.query,
|
|
190
|
+
failure_reason: expectation.evidence || expectation.text,
|
|
191
|
+
improvement_hint: expectation.text,
|
|
192
|
+
invocation_type: "contextual",
|
|
193
|
+
});
|
|
194
|
+
}
|
|
195
|
+
feedbackMap.set(record.query, feedback);
|
|
196
|
+
}
|
|
197
|
+
}
|
|
198
|
+
}
|
|
199
|
+
}
|
|
200
|
+
|
|
201
|
+
const contextualQueries = [...sampleSessionsByQuery.keys()];
|
|
202
|
+
if (contextualQueries.length > 0) {
|
|
203
|
+
const clusters = clusterQueries(contextualQueries);
|
|
204
|
+
for (const cluster of clusters) {
|
|
205
|
+
allPatterns.push({
|
|
206
|
+
pattern_id: `fp-${skillName}-${index}`,
|
|
207
|
+
skill_name: skillName,
|
|
208
|
+
invocation_type: "contextual",
|
|
209
|
+
missed_queries: cluster,
|
|
210
|
+
frequency: cluster.length,
|
|
211
|
+
sample_sessions: [
|
|
212
|
+
...new Set(cluster.flatMap((query) => [...(sampleSessionsByQuery.get(query) ?? [])])),
|
|
213
|
+
],
|
|
214
|
+
extracted_at: now,
|
|
215
|
+
feedback: cluster.flatMap((query) => feedbackMap.get(query) ?? []),
|
|
216
|
+
});
|
|
217
|
+
index++;
|
|
218
|
+
}
|
|
219
|
+
}
|
|
220
|
+
}
|
|
221
|
+
|
|
222
|
+
for (const pattern of allPatterns) {
|
|
223
|
+
if (pattern.feedback && pattern.feedback.length > 0) continue;
|
|
224
|
+
const matchingFeedback = pattern.missed_queries.flatMap(
|
|
225
|
+
(query) => feedbackMap.get(query) ?? [],
|
|
226
|
+
);
|
|
227
|
+
if (matchingFeedback.length > 0) {
|
|
228
|
+
pattern.feedback = matchingFeedback;
|
|
229
|
+
}
|
|
230
|
+
}
|
|
231
|
+
|
|
232
|
+
allPatterns.sort((a, b) => b.frequency - a.frequency);
|
|
144
233
|
return allPatterns;
|
|
145
234
|
}
|
|
@@ -0,0 +1,314 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pareto.ts
|
|
3
|
+
*
|
|
4
|
+
* Pareto frontier computation for multi-candidate evolution.
|
|
5
|
+
* All functions are pure — no I/O, no LLM calls.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import type {
|
|
9
|
+
InvocationType,
|
|
10
|
+
InvocationTypeScores,
|
|
11
|
+
ParetoCandidate,
|
|
12
|
+
SessionTelemetryRecord,
|
|
13
|
+
TokenUsageMetrics,
|
|
14
|
+
} from "../types.js";
|
|
15
|
+
|
|
16
|
+
// ---------------------------------------------------------------------------
|
|
17
|
+
// Score computation
|
|
18
|
+
// ---------------------------------------------------------------------------
|
|
19
|
+
|
|
20
|
+
/**
 * Tally validation outcomes per invocation type.
 *
 * Every entry is counted under its `invocation_type`; entries without one are
 * counted as "implicit". All four dimensions are always present in the result,
 * and a dimension with no entries reports a `pass_rate` of 0.
 *
 * @param perEntryResults - One item per validated eval entry, carrying the
 *   entry's optional invocation type and whether it passed after the change.
 * @returns Passed/total counts and pass rate for each invocation type.
 */
export function computeInvocationScores(
  perEntryResults: Array<{ entry: { invocation_type?: InvocationType }; after_pass: boolean }>,
): InvocationTypeScores {
  const dims: InvocationType[] = ["explicit", "implicit", "contextual", "negative"];

  // Seed a zeroed bucket for each dimension so empty dimensions still appear.
  const tallies: Record<string, { passed: number; total: number }> = {};
  dims.forEach((dim) => {
    tallies[dim] = { passed: 0, total: 0 };
  });

  perEntryResults.forEach(({ entry, after_pass }) => {
    const bucket = tallies[entry.invocation_type ?? "implicit"];
    bucket.total += 1;
    if (after_pass) {
      bucket.passed += 1;
    }
  });

  const scores: Record<string, { passed: number; total: number; pass_rate: number }> = {};
  dims.forEach((dim) => {
    const { passed, total } = tallies[dim];
    const pass_rate = total > 0 ? passed / total : 0;
    scores[dim] = { passed, total, pass_rate };
  });

  return scores as unknown as InvocationTypeScores;
}
|
|
47
|
+
|
|
48
|
+
// ---------------------------------------------------------------------------
|
|
49
|
+
// Token efficiency scoring
|
|
50
|
+
// ---------------------------------------------------------------------------
|
|
51
|
+
|
|
52
|
+
/**
 * Restrict `value` to the closed interval [min, max].
 *
 * The upper bound is applied first, then the lower bound.
 */
function clamp(value: number, min: number, max: number): number {
  const cappedAbove = Math.min(max, value);
  const cappedBelow = Math.max(min, cappedAbove);
  return cappedBelow;
}
|
|
58
|
+
|
|
59
|
+
/**
 * Sum input and output token counts across telemetry records.
 *
 * A record with a missing (null/undefined) count contributes 0 for that field.
 *
 * @param records - Session telemetry records to aggregate.
 * @returns Total input tokens, total output tokens, and their combined sum.
 */
export function computeTokenUsageMetrics(records: SessionTelemetryRecord[]): TokenUsageMetrics {
  const totals = records.reduce(
    (acc, record) => {
      acc.input_tokens += record.input_tokens ?? 0;
      acc.output_tokens += record.output_tokens ?? 0;
      return acc;
    },
    { input_tokens: 0, output_tokens: 0 },
  );

  return {
    input_tokens: totals.input_tokens,
    output_tokens: totals.output_tokens,
    total_tokens: totals.input_tokens + totals.output_tokens,
  };
}
|
|
75
|
+
|
|
76
|
+
/**
 * Compute a token efficiency score for a skill.
 *
 * Sessions are split into those whose `skills_triggered` includes `skillName`
 * and those that do not (the baseline). The score is
 * `clamp(baseline_avg / with_skill_avg, 0, 1)`, where each average is the mean
 * of `input_tokens + output_tokens` per session.
 *
 * - 1.0 means sessions with the skill use no more tokens than the baseline
 *   (anything at or better than baseline saturates at 1.0).
 * - Values approaching 0.0 mean sessions with the skill use far more tokens
 *   than the baseline.
 *
 * Sessions without usable token data (a total that is zero, negative, or not a
 * finite number) are ignored so that a malformed record cannot turn the score
 * into NaN.
 *
 * @param skillName - Skill whose sessions are compared against the baseline.
 * @param telemetry - Session telemetry records to analyze.
 * @returns A score in [0, 1], or 0.5 (neutral) when either group has no
 *   usable sessions.
 */
export function computeTokenEfficiencyScore(
  skillName: string,
  telemetry: SessionTelemetryRecord[],
): number {
  const withSkill: number[] = [];
  const withoutSkill: number[] = [];

  for (const record of telemetry) {
    const total = (record.input_tokens ?? 0) + (record.output_tokens ?? 0);
    // Only strictly positive, finite totals are meaningful samples.
    if (!Number.isFinite(total) || total <= 0) continue;

    const group = record.skills_triggered.includes(skillName) ? withSkill : withoutSkill;
    group.push(total);
  }

  if (withSkill.length === 0 || withoutSkill.length === 0) {
    return 0.5; // neutral when insufficient data
  }

  const mean = (samples: number[]): number =>
    samples.reduce((sum, sample) => sum + sample, 0) / samples.length;

  // Every sample is > 0, so both means are > 0 and the division is always safe.
  return clamp(mean(withoutSkill) / mean(withSkill), 0, 1);
}
|
|
116
|
+
|
|
117
|
+
// ---------------------------------------------------------------------------
|
|
118
|
+
// Pareto dominance
|
|
119
|
+
// ---------------------------------------------------------------------------
|
|
120
|
+
|
|
121
|
+
/** Invocation-type dimensions whose pass rates are compared, in this order, by the dominance helpers in this file. */
const DIMS: InvocationType[] = ["explicit", "implicit", "contextual", "negative"];
|
|
122
|
+
|
|
123
|
+
/**
 * Pareto dominance test between two score sets.
 *
 * A dominates B when A's pass rate is at least B's on every invocation-type
 * dimension and strictly higher on at least one.
 *
 * Token efficiency takes part as an extra (5th) dimension only when a score is
 * supplied for BOTH candidates; if either is undefined it is ignored.
 *
 * @param a - Scores of the candidate being tested as the dominator.
 * @param b - Scores of the candidate being tested as the dominated one.
 * @param aTokenEfficiency - Optional token efficiency score for A.
 * @param bTokenEfficiency - Optional token efficiency score for B.
 * @returns True when A dominates B.
 */
export function dominates(
  a: InvocationTypeScores,
  b: InvocationTypeScores,
  aTokenEfficiency?: number,
  bTokenEfficiency?: number,
): boolean {
  // Number of dimensions on which A is strictly ahead of B.
  let strictWins = 0;

  for (const dim of DIMS) {
    const mine = a[dim].pass_rate;
    const theirs = b[dim].pass_rate;

    if (mine < theirs) {
      return false; // losing on any single dimension rules out dominance
    }
    if (mine > theirs) {
      strictWins += 1;
    }
  }

  if (aTokenEfficiency !== undefined && bTokenEfficiency !== undefined) {
    if (aTokenEfficiency < bTokenEfficiency) {
      return false;
    }
    if (aTokenEfficiency > bTokenEfficiency) {
      strictWins += 1;
    }
  }

  return strictWins > 0;
}
|
|
154
|
+
|
|
155
|
+
/**
 * List the invocation-type dimensions on which A's pass rate is strictly
 * higher than B's.
 *
 * @param a - Scores of the first candidate.
 * @param b - Scores of the candidate being compared against.
 * @returns The dimensions where A beats B, in `DIMS` order (possibly empty).
 */
export function getDominatedDimensions(
  a: InvocationTypeScores,
  b: InvocationTypeScores,
): InvocationType[] {
  return DIMS.filter((dim) => a[dim].pass_rate > b[dim].pass_rate);
}
|
|
170
|
+
|
|
171
|
+
// ---------------------------------------------------------------------------
|
|
172
|
+
// Pareto frontier
|
|
173
|
+
// ---------------------------------------------------------------------------
|
|
174
|
+
|
|
175
|
+
/**
 * Reduce candidates to the Pareto frontier (the non-dominated set).
 *
 * Candidates are processed in input order: one is dropped if a current
 * frontier member dominates it; otherwise it joins the frontier and evicts any
 * members it dominates. Surviving candidates keep their relative input order.
 *
 * Dominance checks pass each candidate's `token_efficiency_score`, so the 5th
 * dimension applies whenever both candidates in a comparison have one.
 *
 * Side effect: `dominates_on` is overwritten on every returned candidate with
 * the invocation-type dimensions on which it beats at least one other frontier
 * member (pass rates only; token efficiency is not reflected there).
 *
 * @param candidates - All evaluated candidates.
 * @returns The frontier members (same object references as the input).
 */
export function computeParetoFrontier(candidates: ParetoCandidate[]): ParetoCandidate[] {
  if (candidates.length === 0) return [];

  const beats = (winner: ParetoCandidate, loser: ParetoCandidate): boolean =>
    dominates(
      winner.invocation_scores,
      loser.invocation_scores,
      winner.token_efficiency_score,
      loser.token_efficiency_score,
    );

  let frontier: ParetoCandidate[] = [];

  for (const candidate of candidates) {
    // A candidate dominated by any current member never enters the frontier.
    if (frontier.some((member) => beats(member, candidate))) continue;

    // Otherwise it evicts every member it dominates, then joins.
    frontier = frontier.filter((member) => !beats(candidate, member));
    frontier.push(candidate);
  }

  // Record, per member, the dimensions where it beats some other frontier member.
  for (const member of frontier) {
    const strengths = new Set<InvocationType>();
    for (const rival of frontier) {
      if (rival === member) continue;
      getDominatedDimensions(member.invocation_scores, rival.invocation_scores).forEach((dim) =>
        strengths.add(dim),
      );
    }
    member.dominates_on = Array.from(strengths);
  }

  return frontier;
}
|
|
236
|
+
|
|
237
|
+
// ---------------------------------------------------------------------------
|
|
238
|
+
// Merge prompt
|
|
239
|
+
// ---------------------------------------------------------------------------
|
|
240
|
+
|
|
241
|
+
/**
 * Build the LLM prompt that asks for a merge of complementary frontier
 * candidates into one description.
 *
 * @param frontier - Frontier candidates with `dominates_on` already populated.
 * @param originalDescription - The skill description the candidates started from.
 * @returns The prompt text, or null when there are fewer than two candidates
 *   or no candidate has any entry in `dominates_on`.
 */
export function buildMergePrompt(
  frontier: ParetoCandidate[],
  originalDescription: string,
): string | null {
  if (frontier.length <= 1) return null;

  // Merging is only worthwhile when at least one candidate has a distinct strength.
  const anyStrengths = frontier.some((candidate) => candidate.dominates_on.length > 0);
  if (!anyStrengths) return null;

  const candidateBlocks: string[] = [];
  frontier.forEach((candidate, position) => {
    const strengthsLine =
      candidate.dominates_on.length > 0
        ? `Strengths: ${candidate.dominates_on.join(", ")}`
        : "No unique strengths";
    const passRatePercent = (candidate.validation.after_pass_rate * 100).toFixed(1);

    candidateBlocks.push(
      `Candidate ${position + 1} (${candidate.proposal.proposal_id}):\n` +
        `Description: ${candidate.proposal.proposed_description}\n` +
        `${strengthsLine}\n` +
        `Overall pass rate: ${passRatePercent}%`,
    );
  });
  const candidateDescriptions = candidateBlocks.join("\n\n");

  return `You are merging multiple skill descriptions that each excel on different invocation types.

Original description:
${originalDescription}

Candidates:
${candidateDescriptions}

Create a single merged description that combines the strengths of all candidates.
Output ONLY valid JSON with:
- "proposed_description": the merged description
- "rationale": explanation of what was combined
- "confidence": 0.0-1.0`;
}
|
|
279
|
+
|
|
280
|
+
// ---------------------------------------------------------------------------
|
|
281
|
+
// Selection
|
|
282
|
+
// ---------------------------------------------------------------------------
|
|
283
|
+
|
|
284
|
+
/**
 * Pick the strongest single candidate from a Pareto frontier and decide
 * whether a merge attempt is worthwhile.
 *
 * Ranking: higher `after_pass_rate` first; pass rates within 0.001 of each
 * other count as tied and are ordered by the larger number of `new_passes`.
 * The input array is not reordered.
 *
 * A merge is suggested when the frontier has more than one member and at
 * least one member has entries in `dominates_on`.
 *
 * @param frontier - Non-empty list of frontier candidates.
 * @returns The best candidate, the merge flag, and the merge prompt
 *   (null when no merge is suggested).
 * @throws Error when `frontier` is empty.
 */
export function selectFromFrontier(frontier: ParetoCandidate[]): {
  best: ParetoCandidate;
  shouldMerge: boolean;
  mergePrompt: string | null;
} {
  if (frontier.length === 0) {
    throw new Error("Cannot select from empty frontier");
  }

  const byQualityDesc = (left: ParetoCandidate, right: ParetoCandidate): number => {
    const rateGap = right.validation.after_pass_rate - left.validation.after_pass_rate;
    return Math.abs(rateGap) > 0.001
      ? rateGap
      : right.validation.new_passes.length - left.validation.new_passes.length;
  };

  // Sort a copy so the caller's frontier order is untouched.
  const ranked = frontier.slice().sort(byQualityDesc);
  const best = ranked[0];

  const hasDistinctStrengths = frontier.some((candidate) => candidate.dominates_on.length > 0);
  const shouldMerge = frontier.length > 1 && hasDistinctStrengths;

  const mergePrompt = shouldMerge
    ? buildMergePrompt(frontier, best.proposal.original_description)
    : null;

  return { best, shouldMerge, mergePrompt };
}
|
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* propose-body.ts
|
|
3
|
+
*
|
|
4
|
+
* Generates full body proposals for SKILL.md files using a teacher LLM.
|
|
5
|
+
* The teacher analyzes current content, failure patterns, and missed queries
|
|
6
|
+
* to produce an improved skill body.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import type { BodyEvolutionProposal, EvolutionTarget, FailurePattern } from "../types.js";
|
|
10
|
+
import { callLlm, stripMarkdownFences } from "../utils/llm-call.js";
|
|
11
|
+
|
|
12
|
+
// ---------------------------------------------------------------------------
|
|
13
|
+
// System prompt
|
|
14
|
+
// ---------------------------------------------------------------------------
|
|
15
|
+
|
|
16
|
+
/**
 * System prompt for the body generator (teacher) LLM.
 *
 * Passed as the system message by `generateBodyProposal`. It tells the model
 * to keep the SKILL.md structure and to answer with a JSON object holding
 * `proposed_body`, `rationale`, and `confidence` — the fields that
 * `parseBodyProposalResponse` validates.
 */
|
|
17
|
+
export const BODY_GENERATOR_SYSTEM = `You are an expert skill document author for an AI agent routing system.
|
|
18
|
+

|
|
19
|
+
Your task is to generate an improved SKILL.md body that better covers the semantic
|
|
20
|
+
space of queries that the skill should handle. The body includes everything after
|
|
21
|
+
the title line: the description, workflow routing table, instructions, examples, etc.
|
|
22
|
+

|
|
23
|
+
Rules:
|
|
24
|
+
- Preserve the overall structure: description paragraph, ## Workflow Routing table, and other ## sections.
|
|
25
|
+
- The ## Workflow Routing table must be a valid markdown table with | Trigger | Workflow | columns.
|
|
26
|
+
- Cover the semantic space of the missed queries without being too broad.
|
|
27
|
+
- Maintain the original intent and scope of the skill.
|
|
28
|
+
- Be specific and actionable in instructions.
|
|
29
|
+
- Output ONLY valid JSON with exactly these fields:
|
|
30
|
+
- "proposed_body" (string): the complete improved skill body (markdown, everything below the title)
|
|
31
|
+
- "rationale" (string): explanation of what changed and why
|
|
32
|
+
- "confidence" (number): 0.0-1.0 how confident you are this improves the skill
|
|
33
|
+

|
|
34
|
+
Do NOT include any text outside the JSON object.`;
|
|
35
|
+
|
|
36
|
+
// ---------------------------------------------------------------------------
|
|
37
|
+
// Prompt builder
|
|
38
|
+
// ---------------------------------------------------------------------------
|
|
39
|
+
|
|
40
|
+
/**
 * Build the user prompt for full body generation.
 *
 * The prompt contains, in order: the skill name, the current skill content,
 * one block per failure pattern with its missed queries, the flat list of all
 * missed queries, an optional "Structured Failure Analysis" section (only when
 * some pattern carries feedback), an optional "Reference Examples" section
 * (only when few-shot examples are given), and the closing instruction.
 *
 * @param currentContent - Current SKILL.md content shown to the model.
 * @param failurePatterns - Clustered failure patterns to address.
 * @param missedQueries - All missed queries, listed verbatim.
 * @param skillName - Name of the skill being improved.
 * @param fewShotExamples - Optional reference skill texts.
 * @returns The assembled prompt text.
 */
export function buildBodyGenerationPrompt(
  currentContent: string,
  failurePatterns: FailurePattern[],
  missedQueries: string[],
  skillName: string,
  fewShotExamples?: string[],
): string {
  const asBullet = (query: string): string => ` - "${query}"`;

  const patternBlocks = failurePatterns.map((pattern) => {
    const header = ` Pattern ${pattern.pattern_id} (frequency: ${pattern.frequency}, type: ${pattern.invocation_type}):`;
    const bullets = pattern.missed_queries.map((query) => asBullet(query)).join("\n");
    return `${header}\n${bullets}`;
  });

  const missedLines = missedQueries.map((query) => asBullet(query)).join("\n");

  // Three lines per feedback item, in pattern order.
  const feedbackLines = failurePatterns
    .flatMap((pattern) => pattern.feedback ?? [])
    .flatMap((fb) => [
      ` Query: "${fb.query}"`,
      ` Failure reason: ${fb.failure_reason}`,
      ` Improvement hint: ${fb.improvement_hint}`,
    ]);

  let feedbackSection = "";
  if (feedbackLines.length > 0) {
    feedbackSection = `\n\nStructured Failure Analysis:\n${feedbackLines.join("\n")}`;
  }

  let fewShotSection = "";
  if (fewShotExamples && fewShotExamples.length > 0) {
    const renderedExamples = fewShotExamples
      .map((example, position) => `--- Example ${position + 1} ---\n${example}`)
      .join("\n\n");
    fewShotSection = `\n\nReference Examples (other well-written skills):\n${renderedExamples}`;
  }

  return `Skill Name: ${skillName}

Current Skill Content:
${currentContent}

Failure Patterns:
${patternBlocks.join("\n\n")}

All Missed Queries:
${missedLines}${feedbackSection}${fewShotSection}

Generate an improved full body for the "${skillName}" skill that would correctly handle the missed queries listed above. The body should include everything below the # Title line: description, ## Workflow Routing table, and any other sections. Output ONLY a JSON object with "proposed_body", "rationale", and "confidence" fields.`;
}
|
|
88
|
+
|
|
89
|
+
// ---------------------------------------------------------------------------
|
|
90
|
+
// Response parser
|
|
91
|
+
// ---------------------------------------------------------------------------
|
|
92
|
+
|
|
93
|
+
/**
 * Parse LLM response text into structured body proposal data.
 *
 * The text is passed through `stripMarkdownFences`, parsed as JSON, and
 * checked for a string `proposed_body`, a string `rationale`, and a numeric
 * `confidence`. The confidence is clamped to [0, 1].
 *
 * @param raw - Raw text returned by the LLM.
 * @returns The validated proposal fields.
 * @throws Error when the text is not valid JSON, is not a JSON object, or a
 *   required field is missing or has the wrong type.
 */
export function parseBodyProposalResponse(raw: string): {
  proposed_body: string;
  rationale: string;
  confidence: number;
} {
  const jsonText = stripMarkdownFences(raw);

  let decoded: unknown;
  try {
    decoded = JSON.parse(jsonText);
  } catch {
    // Include a short prefix of the payload to make bad responses debuggable.
    throw new Error(`Failed to parse LLM response as JSON: ${jsonText.slice(0, 200)}`);
  }

  if (decoded === null || typeof decoded !== "object") {
    throw new Error("LLM response is not a JSON object");
  }

  const { proposed_body, rationale, confidence } = decoded as Record<string, unknown>;

  if (typeof proposed_body !== "string") {
    throw new Error("Missing or invalid 'proposed_body' field in LLM response");
  }
  if (typeof rationale !== "string") {
    throw new Error("Missing or invalid 'rationale' field in LLM response");
  }
  if (typeof confidence !== "number") {
    throw new Error("Missing or invalid 'confidence' field in LLM response");
  }

  const cappedAbove = Math.min(1.0, confidence);
  const boundedConfidence = Math.max(0.0, cappedAbove);

  return { proposed_body, rationale, confidence: boundedConfidence };
}
|
|
132
|
+
|
|
133
|
+
// ---------------------------------------------------------------------------
|
|
134
|
+
// Proposal generator
|
|
135
|
+
// ---------------------------------------------------------------------------
|
|
136
|
+
|
|
137
|
+
/**
 * Generate a full body evolution proposal using the teacher LLM.
 *
 * Builds the user prompt, sends it with `BODY_GENERATOR_SYSTEM` via `callLlm`,
 * parses the reply, and wraps it in a pending proposal record.
 *
 * @param currentContent - Current skill content; also stored as `original_body`.
 * @param failurePatterns - Failure patterns to address; their ids are recorded.
 * @param missedQueries - All missed queries included in the prompt.
 * @param skillName - Name of the skill being evolved.
 * @param skillPath - Path recorded on the proposal as `skill_path`.
 * @param agent - Agent identifier forwarded to `callLlm`.
 * @param modelFlag - Optional model flag forwarded to `callLlm`.
 * @param fewShotExamples - Optional reference skill texts for the prompt.
 * @returns A proposal with status "pending" and target "body".
 */
export async function generateBodyProposal(
  currentContent: string,
  failurePatterns: FailurePattern[],
  missedQueries: string[],
  skillName: string,
  skillPath: string,
  agent: string,
  modelFlag?: string,
  fewShotExamples?: string[],
): Promise<BodyEvolutionProposal> {
  const userPrompt = buildBodyGenerationPrompt(
    currentContent,
    failurePatterns,
    missedQueries,
    skillName,
    fewShotExamples,
  );

  const llmText = await callLlm(BODY_GENERATOR_SYSTEM, userPrompt, agent, modelFlag);
  const parsedReply = parseBodyProposalResponse(llmText);
  const patternIds = failurePatterns.map((pattern) => pattern.pattern_id);

  return {
    proposal_id: `evo-body-${skillName}-${Date.now()}`,
    skill_name: skillName,
    skill_path: skillPath,
    original_body: currentContent,
    proposed_body: parsedReply.proposed_body,
    rationale: parsedReply.rationale,
    target: "body" as EvolutionTarget,
    failure_patterns: patternIds,
    confidence: parsedReply.confidence,
    created_at: new Date().toISOString(),
    status: "pending",
  };
}
|