selftune 0.1.4 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (153) hide show
  1. package/.claude/agents/diagnosis-analyst.md +156 -0
  2. package/.claude/agents/evolution-reviewer.md +180 -0
  3. package/.claude/agents/integration-guide.md +212 -0
  4. package/.claude/agents/pattern-analyst.md +160 -0
  5. package/CHANGELOG.md +46 -1
  6. package/README.md +105 -257
  7. package/apps/local-dashboard/dist/assets/geist-cyrillic-wght-normal-CHSlOQsW.woff2 +0 -0
  8. package/apps/local-dashboard/dist/assets/geist-latin-ext-wght-normal-DMtmJ5ZE.woff2 +0 -0
  9. package/apps/local-dashboard/dist/assets/geist-latin-wght-normal-Dm3htQBi.woff2 +0 -0
  10. package/apps/local-dashboard/dist/assets/index-C4EOTFZ2.js +15 -0
  11. package/apps/local-dashboard/dist/assets/index-bl-Webyd.css +1 -0
  12. package/apps/local-dashboard/dist/assets/vendor-react-U7zYD9Rg.js +60 -0
  13. package/apps/local-dashboard/dist/assets/vendor-table-B7VF2Ipl.js +26 -0
  14. package/apps/local-dashboard/dist/assets/vendor-ui-D7_zX_qy.js +346 -0
  15. package/apps/local-dashboard/dist/favicon.png +0 -0
  16. package/apps/local-dashboard/dist/index.html +17 -0
  17. package/apps/local-dashboard/dist/logo.png +0 -0
  18. package/apps/local-dashboard/dist/logo.svg +9 -0
  19. package/assets/BeforeAfter.gif +0 -0
  20. package/assets/FeedbackLoop.gif +0 -0
  21. package/assets/logo.svg +9 -0
  22. package/assets/skill-health-badge.svg +20 -0
  23. package/cli/selftune/activation-rules.ts +171 -0
  24. package/cli/selftune/badge/badge-data.ts +108 -0
  25. package/cli/selftune/badge/badge-svg.ts +212 -0
  26. package/cli/selftune/badge/badge.ts +99 -0
  27. package/cli/selftune/canonical-export.ts +183 -0
  28. package/cli/selftune/constants.ts +103 -1
  29. package/cli/selftune/contribute/bundle.ts +314 -0
  30. package/cli/selftune/contribute/contribute.ts +214 -0
  31. package/cli/selftune/contribute/sanitize.ts +162 -0
  32. package/cli/selftune/cron/setup.ts +266 -0
  33. package/cli/selftune/dashboard-contract.ts +202 -0
  34. package/cli/selftune/dashboard-server.ts +1049 -0
  35. package/cli/selftune/dashboard.ts +43 -156
  36. package/cli/selftune/eval/baseline.ts +248 -0
  37. package/cli/selftune/eval/composability-v2.ts +273 -0
  38. package/cli/selftune/eval/composability.ts +117 -0
  39. package/cli/selftune/eval/generate-unit-tests.ts +143 -0
  40. package/cli/selftune/eval/hooks-to-evals.ts +101 -16
  41. package/cli/selftune/eval/import-skillsbench.ts +221 -0
  42. package/cli/selftune/eval/synthetic-evals.ts +172 -0
  43. package/cli/selftune/eval/unit-test-cli.ts +152 -0
  44. package/cli/selftune/eval/unit-test.ts +196 -0
  45. package/cli/selftune/evolution/deploy-proposal.ts +142 -1
  46. package/cli/selftune/evolution/evidence.ts +26 -0
  47. package/cli/selftune/evolution/evolve-body.ts +586 -0
  48. package/cli/selftune/evolution/evolve.ts +825 -116
  49. package/cli/selftune/evolution/extract-patterns.ts +105 -16
  50. package/cli/selftune/evolution/pareto.ts +314 -0
  51. package/cli/selftune/evolution/propose-body.ts +171 -0
  52. package/cli/selftune/evolution/propose-description.ts +100 -2
  53. package/cli/selftune/evolution/propose-routing.ts +166 -0
  54. package/cli/selftune/evolution/refine-body.ts +141 -0
  55. package/cli/selftune/evolution/rollback.ts +21 -4
  56. package/cli/selftune/evolution/validate-body.ts +254 -0
  57. package/cli/selftune/evolution/validate-proposal.ts +257 -35
  58. package/cli/selftune/evolution/validate-routing.ts +177 -0
  59. package/cli/selftune/grading/auto-grade.ts +200 -0
  60. package/cli/selftune/grading/grade-session.ts +513 -42
  61. package/cli/selftune/grading/pre-gates.ts +104 -0
  62. package/cli/selftune/grading/results.ts +42 -0
  63. package/cli/selftune/hooks/auto-activate.ts +185 -0
  64. package/cli/selftune/hooks/evolution-guard.ts +165 -0
  65. package/cli/selftune/hooks/prompt-log.ts +172 -2
  66. package/cli/selftune/hooks/session-stop.ts +123 -3
  67. package/cli/selftune/hooks/skill-change-guard.ts +112 -0
  68. package/cli/selftune/hooks/skill-eval.ts +119 -3
  69. package/cli/selftune/index.ts +415 -48
  70. package/cli/selftune/ingestors/claude-replay.ts +377 -0
  71. package/cli/selftune/ingestors/codex-rollout.ts +345 -46
  72. package/cli/selftune/ingestors/codex-wrapper.ts +207 -39
  73. package/cli/selftune/ingestors/openclaw-ingest.ts +573 -0
  74. package/cli/selftune/ingestors/opencode-ingest.ts +193 -17
  75. package/cli/selftune/init.ts +376 -16
  76. package/cli/selftune/last.ts +14 -5
  77. package/cli/selftune/localdb/db.ts +63 -0
  78. package/cli/selftune/localdb/materialize.ts +428 -0
  79. package/cli/selftune/localdb/queries.ts +376 -0
  80. package/cli/selftune/localdb/schema.ts +204 -0
  81. package/cli/selftune/memory/writer.ts +447 -0
  82. package/cli/selftune/monitoring/watch.ts +90 -16
  83. package/cli/selftune/normalization.ts +682 -0
  84. package/cli/selftune/observability.ts +19 -44
  85. package/cli/selftune/orchestrate.ts +1073 -0
  86. package/cli/selftune/quickstart.ts +203 -0
  87. package/cli/selftune/repair/skill-usage.ts +576 -0
  88. package/cli/selftune/schedule.ts +561 -0
  89. package/cli/selftune/status.ts +59 -33
  90. package/cli/selftune/sync.ts +627 -0
  91. package/cli/selftune/types.ts +525 -5
  92. package/cli/selftune/utils/canonical-log.ts +45 -0
  93. package/cli/selftune/utils/frontmatter.ts +217 -0
  94. package/cli/selftune/utils/hooks.ts +41 -0
  95. package/cli/selftune/utils/html.ts +27 -0
  96. package/cli/selftune/utils/llm-call.ts +103 -19
  97. package/cli/selftune/utils/math.ts +10 -0
  98. package/cli/selftune/utils/query-filter.ts +139 -0
  99. package/cli/selftune/utils/skill-discovery.ts +340 -0
  100. package/cli/selftune/utils/skill-log.ts +68 -0
  101. package/cli/selftune/utils/skill-usage-confidence.ts +18 -0
  102. package/cli/selftune/utils/transcript.ts +307 -26
  103. package/cli/selftune/utils/trigger-check.ts +89 -0
  104. package/cli/selftune/utils/tui.ts +156 -0
  105. package/cli/selftune/workflows/discover.ts +254 -0
  106. package/cli/selftune/workflows/skill-md-writer.ts +288 -0
  107. package/cli/selftune/workflows/workflows.ts +188 -0
  108. package/package.json +28 -11
  109. package/packages/telemetry-contract/README.md +11 -0
  110. package/packages/telemetry-contract/fixtures/golden.json +87 -0
  111. package/packages/telemetry-contract/fixtures/golden.test.ts +42 -0
  112. package/packages/telemetry-contract/index.ts +1 -0
  113. package/packages/telemetry-contract/package.json +19 -0
  114. package/packages/telemetry-contract/src/index.ts +2 -0
  115. package/packages/telemetry-contract/src/types.ts +163 -0
  116. package/packages/telemetry-contract/src/validators.ts +109 -0
  117. package/skill/SKILL.md +180 -33
  118. package/skill/Workflows/AutoActivation.md +145 -0
  119. package/skill/Workflows/Badge.md +124 -0
  120. package/skill/Workflows/Baseline.md +144 -0
  121. package/skill/Workflows/Composability.md +107 -0
  122. package/skill/Workflows/Contribute.md +94 -0
  123. package/skill/Workflows/Cron.md +132 -0
  124. package/skill/Workflows/Dashboard.md +214 -0
  125. package/skill/Workflows/Doctor.md +63 -14
  126. package/skill/Workflows/Evals.md +110 -18
  127. package/skill/Workflows/EvolutionMemory.md +154 -0
  128. package/skill/Workflows/Evolve.md +181 -21
  129. package/skill/Workflows/EvolveBody.md +159 -0
  130. package/skill/Workflows/Grade.md +36 -31
  131. package/skill/Workflows/ImportSkillsBench.md +117 -0
  132. package/skill/Workflows/Ingest.md +142 -21
  133. package/skill/Workflows/Initialize.md +91 -23
  134. package/skill/Workflows/Orchestrate.md +139 -0
  135. package/skill/Workflows/Replay.md +91 -0
  136. package/skill/Workflows/Rollback.md +23 -4
  137. package/skill/Workflows/Schedule.md +61 -0
  138. package/skill/Workflows/Sync.md +88 -0
  139. package/skill/Workflows/UnitTest.md +150 -0
  140. package/skill/Workflows/Watch.md +33 -1
  141. package/skill/Workflows/Workflows.md +129 -0
  142. package/skill/assets/activation-rules-default.json +26 -0
  143. package/skill/assets/multi-skill-settings.json +63 -0
  144. package/skill/assets/single-skill-settings.json +57 -0
  145. package/skill/references/invocation-taxonomy.md +2 -2
  146. package/skill/references/logs.md +164 -2
  147. package/skill/references/setup-patterns.md +65 -0
  148. package/skill/references/version-history.md +40 -0
  149. package/skill/settings_snippet.json +23 -0
  150. package/templates/activation-rules-default.json +27 -0
  151. package/templates/multi-skill-settings.json +64 -0
  152. package/templates/single-skill-settings.json +58 -0
  153. package/dashboard/index.html +0 -1119
@@ -6,7 +6,16 @@
6
6
  * similar queries together using Jaccard similarity.
7
7
  */
8
8
 
9
- import type { EvalEntry, FailurePattern, InvocationType, SkillUsageRecord } from "../types.js";
9
+ import type {
10
+ EvalEntry,
11
+ FailureFeedback,
12
+ FailurePattern,
13
+ GradingResult,
14
+ InvocationType,
15
+ SkillUsageRecord,
16
+ } from "../types.js";
17
+ import { filterActionableSkillUsageRecords } from "../utils/query-filter.js";
18
+ import { isHighConfidencePositiveSkillRecord } from "../utils/skill-usage-confidence.js";
10
19
 
11
20
  // ---------------------------------------------------------------------------
12
21
  // Jaccard similarity
@@ -93,37 +102,39 @@ export function extractFailurePatterns(
93
102
  evalEntries: EvalEntry[],
94
103
  skillUsage: SkillUsageRecord[],
95
104
  skillName: string,
105
+ gradingResults?: GradingResult[],
96
106
  ): FailurePattern[] {
97
- // 1. Build a set of triggered queries from skillUsage for the given skillName
107
+ const actionableSkillUsage = filterActionableSkillUsageRecords(skillUsage);
98
108
  const triggeredQueries = new Set<string>();
99
- for (const record of skillUsage) {
100
- if (record.skill_name === skillName && record.triggered) {
101
- triggeredQueries.add(record.query);
102
- }
109
+ const skillUsageBySession = new Map<string, SkillUsageRecord[]>();
110
+
111
+ for (const record of actionableSkillUsage) {
112
+ if (!isHighConfidencePositiveSkillRecord(record, skillName)) continue;
113
+ triggeredQueries.add(record.query);
114
+ const sessionRecords = skillUsageBySession.get(record.session_id) ?? [];
115
+ sessionRecords.push(record);
116
+ skillUsageBySession.set(record.session_id, sessionRecords);
103
117
  }
104
118
 
105
- // 2. Find missed queries: should_trigger === true but NOT in the triggered set
106
119
  const missedByType = new Map<InvocationType, string[]>();
107
-
108
120
  for (const entry of evalEntries) {
109
121
  if (!entry.should_trigger) continue;
110
122
  if (triggeredQueries.has(entry.query)) continue;
111
123
 
112
124
  const invType = entry.invocation_type ?? "implicit";
113
- if (!missedByType.has(invType)) {
114
- missedByType.set(invType, []);
115
- }
116
- missedByType.get(invType)?.push(entry.query);
125
+ const queries = missedByType.get(invType) ?? [];
126
+ queries.push(entry.query);
127
+ missedByType.set(invType, queries);
117
128
  }
118
129
 
119
- // 3. For each group, cluster similar queries
120
130
  const now = new Date().toISOString();
121
131
  const allPatterns: FailurePattern[] = [];
122
132
  let index = 0;
133
+ const feedbackMap = new Map<string, FailureFeedback[]>();
134
+ const sampleSessionsByQuery = new Map<string, Set<string>>();
123
135
 
124
136
  for (const [invType, queries] of missedByType) {
125
137
  const clusters = clusterQueries(queries);
126
-
127
138
  for (const cluster of clusters) {
128
139
  allPatterns.push({
129
140
  pattern_id: `fp-${skillName}-${index}`,
@@ -138,8 +149,86 @@ export function extractFailurePatterns(
138
149
  }
139
150
  }
140
151
 
141
- // 4. Sort by frequency descending
142
- allPatterns.sort((a, b) => b.frequency - a.frequency);
152
+ if (gradingResults && gradingResults.length > 0) {
153
+ for (const result of gradingResults) {
154
+ const hasExplicitFeedback = (result.failure_feedback?.length ?? 0) > 0;
155
+ const hasFailedSummary = (result.summary.failed ?? 0) > 0;
156
+ if (result.skill_name !== skillName || (!hasExplicitFeedback && !hasFailedSummary)) continue;
157
+
158
+ const failedQueries = new Set<string>();
159
+
160
+ if (result.failure_feedback) {
161
+ const sessionRecords = skillUsageBySession.get(result.session_id) ?? [];
162
+ for (const feedback of result.failure_feedback) {
163
+ if (!feedback.query) continue;
164
+ const existing = feedbackMap.get(feedback.query) ?? [];
165
+ existing.push(feedback);
166
+ feedbackMap.set(feedback.query, existing);
167
+ if (sessionRecords.some((record) => record.query === feedback.query)) {
168
+ failedQueries.add(feedback.query);
169
+ const sessions = sampleSessionsByQuery.get(feedback.query) ?? new Set<string>();
170
+ sessions.add(result.session_id);
171
+ sampleSessionsByQuery.set(feedback.query, sessions);
172
+ }
173
+ }
174
+ }
143
175
 
176
+ if (failedQueries.size === 0) {
177
+ const sessionRecords = skillUsageBySession.get(result.session_id) ?? [];
178
+ const failedExpectations = result.expectations.filter((expectation) => !expectation.passed);
179
+ for (const record of sessionRecords) {
180
+ failedQueries.add(record.query);
181
+ const sessions = sampleSessionsByQuery.get(record.query) ?? new Set<string>();
182
+ sessions.add(result.session_id);
183
+ sampleSessionsByQuery.set(record.query, sessions);
184
+
185
+ if (failedExpectations.length > 0) {
186
+ const feedback = feedbackMap.get(record.query) ?? [];
187
+ for (const expectation of failedExpectations) {
188
+ feedback.push({
189
+ query: record.query,
190
+ failure_reason: expectation.evidence || expectation.text,
191
+ improvement_hint: expectation.text,
192
+ invocation_type: "contextual",
193
+ });
194
+ }
195
+ feedbackMap.set(record.query, feedback);
196
+ }
197
+ }
198
+ }
199
+ }
200
+
201
+ const contextualQueries = [...sampleSessionsByQuery.keys()];
202
+ if (contextualQueries.length > 0) {
203
+ const clusters = clusterQueries(contextualQueries);
204
+ for (const cluster of clusters) {
205
+ allPatterns.push({
206
+ pattern_id: `fp-${skillName}-${index}`,
207
+ skill_name: skillName,
208
+ invocation_type: "contextual",
209
+ missed_queries: cluster,
210
+ frequency: cluster.length,
211
+ sample_sessions: [
212
+ ...new Set(cluster.flatMap((query) => [...(sampleSessionsByQuery.get(query) ?? [])])),
213
+ ],
214
+ extracted_at: now,
215
+ feedback: cluster.flatMap((query) => feedbackMap.get(query) ?? []),
216
+ });
217
+ index++;
218
+ }
219
+ }
220
+ }
221
+
222
+ for (const pattern of allPatterns) {
223
+ if (pattern.feedback && pattern.feedback.length > 0) continue;
224
+ const matchingFeedback = pattern.missed_queries.flatMap(
225
+ (query) => feedbackMap.get(query) ?? [],
226
+ );
227
+ if (matchingFeedback.length > 0) {
228
+ pattern.feedback = matchingFeedback;
229
+ }
230
+ }
231
+
232
+ allPatterns.sort((a, b) => b.frequency - a.frequency);
144
233
  return allPatterns;
145
234
  }
@@ -0,0 +1,314 @@
1
+ /**
2
+ * pareto.ts
3
+ *
4
+ * Pareto frontier computation for multi-candidate evolution.
5
+ * All functions are pure — no I/O, no LLM calls.
6
+ */
7
+
8
+ import type {
9
+ InvocationType,
10
+ InvocationTypeScores,
11
+ ParetoCandidate,
12
+ SessionTelemetryRecord,
13
+ TokenUsageMetrics,
14
+ } from "../types.js";
15
+
16
+ // ---------------------------------------------------------------------------
17
+ // Score computation
18
+ // ---------------------------------------------------------------------------
19
+
20
/**
 * Compute per-invocation-type scores from per-entry validation results.
 *
 * All four dimensions ("explicit", "implicit", "contextual", "negative") are
 * always present in the result; a dimension with no entries reports a
 * `pass_rate` of 0.
 *
 * Entries without an `invocation_type` are counted as "implicit". Entries whose
 * `invocation_type` is not one of the four known dimensions — something the
 * static type forbids but untyped runtime data can still contain — are also
 * counted as "implicit" rather than throwing on a missing bucket.
 *
 * @param perEntryResults - One record per eval entry with its post-change pass flag.
 * @returns Passed/total counts and the pass rate for each invocation type.
 */
export function computeInvocationScores(
  perEntryResults: Array<{ entry: { invocation_type?: InvocationType }; after_pass: boolean }>,
): InvocationTypeScores {
  const dims: InvocationType[] = ["explicit", "implicit", "contextual", "negative"];
  const counts: Record<string, { passed: number; total: number }> = {};

  for (const dim of dims) {
    counts[dim] = { passed: 0, total: 0 };
  }

  for (const r of perEntryResults) {
    const declared = r.entry.invocation_type ?? "implicit";
    // Membership is checked against the seeded dimensions (not `in counts`) so
    // that prototype keys such as "constructor" can never be used as a bucket.
    const type: InvocationType = dims.includes(declared) ? declared : "implicit";
    counts[type].total++;
    if (r.after_pass) counts[type].passed++;
  }

  const result: Record<string, { passed: number; total: number; pass_rate: number }> = {};
  for (const dim of dims) {
    const { passed, total } = counts[dim];
    // Guard the division so an empty dimension yields 0 instead of NaN.
    result[dim] = { passed, total, pass_rate: total > 0 ? passed / total : 0 };
  }

  return result as unknown as InvocationTypeScores;
}
47
+
48
+ // ---------------------------------------------------------------------------
49
+ // Token efficiency scoring
50
+ // ---------------------------------------------------------------------------
51
+
52
/**
 * Restrict `value` to the closed interval [min, max].
 *
 * The upper bound is applied first and the lower bound last, so when the
 * bounds are inverted (min > max) the result is `min`.
 */
function clamp(value: number, min: number, max: number): number {
  const cappedAbove = Math.min(max, value);
  return Math.max(min, cappedAbove);
}
58
+
59
/**
 * Sum input and output token counts across telemetry records.
 *
 * A record with no `input_tokens` or `output_tokens` value contributes 0 for
 * that field. An empty list yields all-zero metrics.
 *
 * @param records - Telemetry records to aggregate.
 * @returns Summed input tokens, summed output tokens, and their combined total.
 */
export function computeTokenUsageMetrics(records: SessionTelemetryRecord[]): TokenUsageMetrics {
  const sums = records.reduce(
    (running, record) => ({
      input: running.input + (record.input_tokens ?? 0),
      output: running.output + (record.output_tokens ?? 0),
    }),
    { input: 0, output: 0 },
  );

  return {
    input_tokens: sums.input,
    output_tokens: sums.output,
    total_tokens: sums.input + sums.output,
  };
}
75
+
76
/**
 * Compute a token efficiency score for a skill.
 *
 * Telemetry records are split into sessions where `skillName` appears in
 * `skills_triggered` and sessions where it does not (the baseline). The total
 * tokens (input + output) are averaged per group and the function returns
 * `clamp(baseline_avg / with_skill_avg, 0, 1)`:
 *
 * - 1.0 — sessions with the skill use no more tokens on average than the baseline.
 * - toward 0.0 — sessions with the skill use proportionally more tokens.
 * - 0.5 — neutral; returned when either group has no usable record.
 *
 * Records whose token total is zero, negative, or not a finite number are
 * ignored, so a single corrupt record cannot turn the score into NaN.
 *
 * @param skillName - Skill whose sessions are compared against the baseline.
 * @param telemetry - Session telemetry records to analyse.
 * @returns A score in [0, 1].
 */
export function computeTokenEfficiencyScore(
  skillName: string,
  telemetry: SessionTelemetryRecord[],
): number {
  const withSkill: number[] = [];
  const withoutSkill: number[] = [];

  for (const record of telemetry) {
    const total = (record.input_tokens ?? 0) + (record.output_tokens ?? 0);
    if (!Number.isFinite(total) || total <= 0) continue;

    if (record.skills_triggered.includes(skillName)) {
      withSkill.push(total);
    } else {
      withoutSkill.push(total);
    }
  }

  if (withSkill.length === 0 || withoutSkill.length === 0) {
    return 0.5; // neutral when insufficient data
  }

  const average = (totals: number[]): number =>
    totals.reduce((sum, value) => sum + value, 0) / totals.length;

  // Every collected total is finite and > 0, so both averages are strictly
  // positive and the division needs no zero guard.
  return clamp(average(withoutSkill) / average(withSkill), 0, 1);
}
116
+
117
+ // ---------------------------------------------------------------------------
118
+ // Pareto dominance
119
+ // ---------------------------------------------------------------------------
120
+
121
/** Invocation-type dimensions compared during dominance checks, in comparison order. */
const DIMS: InvocationType[] = ["explicit", "implicit", "contextual", "negative"];

/**
 * Pareto dominance test: true when A is at least as good as B on every
 * compared dimension and strictly better on at least one.
 *
 * The four invocation-type pass rates are always compared. Token efficiency is
 * compared as an additional dimension only when a score is supplied for BOTH
 * candidates; if either is undefined it is left out entirely.
 *
 * @param a - Scores of the candidate that might dominate.
 * @param b - Scores of the candidate that might be dominated.
 * @param aTokenEfficiency - Optional token efficiency score for A.
 * @param bTokenEfficiency - Optional token efficiency score for B.
 */
export function dominates(
  a: InvocationTypeScores,
  b: InvocationTypeScores,
  aTokenEfficiency?: number,
  bTokenEfficiency?: number,
): boolean {
  let sawStrictGain = false;

  // Returns false as soon as A loses a dimension; records any strict win.
  const holdsOn = (aValue: number, bValue: number): boolean => {
    if (aValue < bValue) return false;
    if (aValue > bValue) sawStrictGain = true;
    return true;
  };

  for (const dim of DIMS) {
    if (!holdsOn(a[dim].pass_rate, b[dim].pass_rate)) return false;
  }

  const bothHaveEfficiency = aTokenEfficiency !== undefined && bTokenEfficiency !== undefined;
  if (bothHaveEfficiency && !holdsOn(aTokenEfficiency, bTokenEfficiency)) {
    return false;
  }

  return sawStrictGain;
}
154
+
155
/**
 * List the invocation types on which candidate A has a strictly higher pass
 * rate than candidate B, in DIMS order. Ties are not included.
 */
export function getDominatedDimensions(
  a: InvocationTypeScores,
  b: InvocationTypeScores,
): InvocationType[] {
  return DIMS.filter((dim) => a[dim].pass_rate > b[dim].pass_rate);
}
170
+
171
+ // ---------------------------------------------------------------------------
172
+ // Pareto frontier
173
+ // ---------------------------------------------------------------------------
174
+
175
/**
 * Reduce a list of candidates to its Pareto frontier (the non-dominated set).
 *
 * Candidates are processed in input order: a candidate dominated by a current
 * frontier member is dropped; otherwise every frontier member it dominates is
 * removed and the candidate is appended. Dominance uses the four invocation
 * dimensions plus `token_efficiency_score` when both sides have one.
 *
 * Side effect: `dominates_on` is overwritten on each returned candidate with
 * the invocation types on which it is strictly better than at least one other
 * frontier member. The returned array holds the same object references that
 * were passed in.
 */
export function computeParetoFrontier(candidates: ParetoCandidate[]): ParetoCandidate[] {
  let frontier: ParetoCandidate[] = [];

  for (const candidate of candidates) {
    const isBeaten = frontier.some((member) =>
      dominates(
        member.invocation_scores,
        candidate.invocation_scores,
        member.token_efficiency_score,
        candidate.token_efficiency_score,
      ),
    );
    if (isBeaten) continue;

    // Keep only the members this candidate does not dominate, then admit it.
    frontier = frontier.filter(
      (member) =>
        !dominates(
          candidate.invocation_scores,
          member.invocation_scores,
          candidate.token_efficiency_score,
          member.token_efficiency_score,
        ),
    );
    frontier.push(candidate);
  }

  // Record, per survivor, where it beats any of the other survivors.
  for (const member of frontier) {
    const strengths = new Set<InvocationType>(
      frontier
        .filter((other) => other !== member)
        .flatMap((other) =>
          getDominatedDimensions(member.invocation_scores, other.invocation_scores),
        ),
    );
    member.dominates_on = [...strengths];
  }

  return frontier;
}
236
+
237
+ // ---------------------------------------------------------------------------
238
+ // Merge prompt
239
+ // ---------------------------------------------------------------------------
240
+
241
+ /**
+ * Build the LLM prompt that asks for one description merging the frontier candidates.
+ *
+ * Returns null when the frontier holds one candidate or none, or when no candidate
+ * has a non-empty `dominates_on` list (the list computeParetoFrontier fills in).
+ *
+ * Each candidate is rendered with its position, proposal id, proposed description,
+ * its strengths (`dominates_on` joined with ", ", or "No unique strengths" when the
+ * list is empty) and its overall pass rate as a percentage with one decimal place.
+ * The prompt asks the model to reply with JSON containing "proposed_description",
+ * "rationale" and "confidence".
+ *
+ * @param frontier - Frontier candidates to merge.
+ * @param originalDescription - The description the candidates were derived from.
+ * @returns The prompt text, or null when no merge should be attempted.
+ */
245
+ export function buildMergePrompt(
246
+ frontier: ParetoCandidate[],
247
+ originalDescription: string,
248
+ ): string | null {
249
+ if (frontier.length <= 1) return null;
250
+ 
251
+ // Treat the frontier as complementary when at least one candidate has a non-empty dominates_on list
252
+ const hasComplementarity = frontier.some((c) => c.dominates_on.length > 0);
253
+ if (!hasComplementarity) return null;
254
+ 
255
+ const candidateDescriptions = frontier
256
+ .map((c, i) => {
257
+ const strengths =
258
+ c.dominates_on.length > 0
259
+ ? `Strengths: ${c.dominates_on.join(", ")}`
260
+ : "No unique strengths";
261
+ return `Candidate ${i + 1} (${c.proposal.proposal_id}):\nDescription: ${c.proposal.proposed_description}\n${strengths}\nOverall pass rate: ${(c.validation.after_pass_rate * 100).toFixed(1)}%`;
262
+ })
263
+ .join("\n\n");
264
+ 
265
+ return `You are merging multiple skill descriptions that each excel on different invocation types.
266
+ 
267
+ Original description:
268
+ ${originalDescription}
269
+ 
270
+ Candidates:
271
+ ${candidateDescriptions}
272
+ 
273
+ Create a single merged description that combines the strengths of all candidates.
274
+ Output ONLY valid JSON with:
275
+ - "proposed_description": the merged description
276
+ - "rationale": explanation of what was combined
277
+ - "confidence": 0.0-1.0`;
278
+ }
279
+
280
+ // ---------------------------------------------------------------------------
281
+ // Selection
282
+ // ---------------------------------------------------------------------------
283
+
284
/**
 * Select the best candidate from a Pareto frontier.
 *
 * Ranking rule: the higher overall `after_pass_rate` wins; when two rates are
 * within 0.001 of each other, the candidate with more `new_passes` wins; when
 * that also ties, the candidate that appears earlier in `frontier` is kept.
 *
 * The winner is found in a single left-to-right pass. The tolerance-based rule
 * is not transitive, so it must not be used as an `Array.prototype.sort`
 * comparator (the resulting order would be implementation-defined); a linear
 * scan gives the same answer whenever the rule is consistent and a
 * deterministic one when it is not. `frontier` itself is never reordered.
 *
 * A merge is suggested when the frontier has more than one member and at least
 * one member has a non-empty `dominates_on`; in that case the merge prompt is
 * built from the winner's original description.
 *
 * @param frontier - Non-empty list of frontier candidates.
 * @returns The best candidate, whether a merge should be attempted, and the
 *   merge prompt (null when no merge is suggested).
 * @throws Error if `frontier` is empty.
 */
export function selectFromFrontier(frontier: ParetoCandidate[]): {
  best: ParetoCandidate;
  shouldMerge: boolean;
  mergePrompt: string | null;
} {
  if (frontier.length === 0) {
    throw new Error("Cannot select from empty frontier");
  }

  // True only when the challenger is strictly preferable to the incumbent.
  const outranks = (challenger: ParetoCandidate, incumbent: ParetoCandidate): boolean => {
    const rateGain = challenger.validation.after_pass_rate - incumbent.validation.after_pass_rate;
    if (Math.abs(rateGain) > 0.001) return rateGain > 0;
    return challenger.validation.new_passes.length > incumbent.validation.new_passes.length;
  };

  // reduce() without a seed starts from frontier[0], so ties keep the earliest candidate.
  const best = frontier.reduce((incumbent, challenger) =>
    outranks(challenger, incumbent) ? challenger : incumbent,
  );
  const shouldMerge = frontier.length > 1 && frontier.some((c) => c.dominates_on.length > 0);

  return {
    best,
    shouldMerge,
    mergePrompt: shouldMerge
      ? buildMergePrompt(frontier, best.proposal.original_description)
      : null,
  };
}
@@ -0,0 +1,171 @@
1
+ /**
2
+ * propose-body.ts
3
+ *
4
+ * Generates full body proposals for SKILL.md files using a teacher LLM.
5
+ * The teacher analyzes current content, failure patterns, and missed queries
6
+ * to produce an improved skill body.
7
+ */
8
+
9
+ import type { BodyEvolutionProposal, EvolutionTarget, FailurePattern } from "../types.js";
10
+ import { callLlm, stripMarkdownFences } from "../utils/llm-call.js";
11
+
12
+ // ---------------------------------------------------------------------------
13
+ // System prompt
14
+ // ---------------------------------------------------------------------------
15
+
16
+ /**
+ * System prompt for the body generator (teacher) LLM.
+ *
+ * generateBodyProposal passes this text as the system message to callLlm. It tells
+ * the model to keep the skill's structure (description paragraph, "## Workflow Routing"
+ * table, other "##" sections) and to answer with a single JSON object holding
+ * "proposed_body", "rationale" and "confidence", the three fields that
+ * parseBodyProposalResponse checks for.
+ *
+ * NOTE(review): this diff view collapses leading whitespace, so any indentation of
+ * the nested field bullets inside the literal cannot be confirmed from here.
+ */
17
+ export const BODY_GENERATOR_SYSTEM = `You are an expert skill document author for an AI agent routing system.
18
+ 
19
+ Your task is to generate an improved SKILL.md body that better covers the semantic
20
+ space of queries that the skill should handle. The body includes everything after
21
+ the title line: the description, workflow routing table, instructions, examples, etc.
22
+ 
23
+ Rules:
24
+ - Preserve the overall structure: description paragraph, ## Workflow Routing table, and other ## sections.
25
+ - The ## Workflow Routing table must be a valid markdown table with | Trigger | Workflow | columns.
26
+ - Cover the semantic space of the missed queries without being too broad.
27
+ - Maintain the original intent and scope of the skill.
28
+ - Be specific and actionable in instructions.
29
+ - Output ONLY valid JSON with exactly these fields:
30
+ - "proposed_body" (string): the complete improved skill body (markdown, everything below the title)
31
+ - "rationale" (string): explanation of what changed and why
32
+ - "confidence" (number): 0.0-1.0 how confident you are this improves the skill
33
+ 
34
+ Do NOT include any text outside the JSON object.`;
35
+
36
+ // ---------------------------------------------------------------------------
37
+ // Prompt builder
38
+ // ---------------------------------------------------------------------------
39
+
40
+ /**
+ * Build the user prompt for full body generation.
+ *
+ * The prompt contains, in order: the skill name, the current skill content, one entry
+ * per failure pattern (pattern id, frequency, invocation type and its missed queries),
+ * the flat list of all missed queries, an optional "Structured Failure Analysis"
+ * section, an optional "Reference Examples" section, and a closing instruction to
+ * output a JSON object with "proposed_body", "rationale" and "confidence".
+ *
+ * Queries, feedback text and examples are interpolated as-is, without escaping.
+ *
+ * @param currentContent - Current skill content shown to the model.
+ * @param failurePatterns - Patterns to list; their `feedback` entries, if any, feed the failure-analysis section.
+ * @param missedQueries - Queries listed under "All Missed Queries".
+ * @param skillName - Skill name used in the header and the closing instruction.
+ * @param fewShotExamples - Optional reference texts, numbered from 1 in the examples section.
+ * @returns The assembled prompt text.
+ */
41
+ export function buildBodyGenerationPrompt(
42
+ currentContent: string,
43
+ failurePatterns: FailurePattern[],
44
+ missedQueries: string[],
45
+ skillName: string,
46
+ fewShotExamples?: string[],
47
+ ): string {
48
+ const patternLines = failurePatterns.map((p) => {
49
+ const queries = p.missed_queries.map((q) => ` - "${q}"`).join("\n");
50
+ return ` Pattern ${p.pattern_id} (frequency: ${p.frequency}, type: ${p.invocation_type}):\n${queries}`;
51
+ });
52
+ 
53
+ const missedLines = missedQueries.map((q) => ` - "${q}"`).join("\n");
54
+ 
55
+ // Collect three lines (query, failure reason, improvement hint) per feedback entry; the section is omitted when no pattern has feedback
56
+ const feedbackLines: string[] = [];
57
+ for (const p of failurePatterns) {
58
+ if (p.feedback && p.feedback.length > 0) {
59
+ for (const fb of p.feedback) {
60
+ feedbackLines.push(` Query: "${fb.query}"`);
61
+ feedbackLines.push(` Failure reason: ${fb.failure_reason}`);
62
+ feedbackLines.push(` Improvement hint: ${fb.improvement_hint}`);
63
+ }
64
+ }
65
+ }
66
+ const feedbackSection =
67
+ feedbackLines.length > 0 ? `\n\nStructured Failure Analysis:\n${feedbackLines.join("\n")}` : "";
68
+ 
69
+ // Append the reference examples only when the caller passed a non-empty list; otherwise the section is an empty string
70
+ const fewShotSection =
71
+ fewShotExamples && fewShotExamples.length > 0
72
+ ? `\n\nReference Examples (other well-written skills):\n${fewShotExamples.map((ex, i) => `--- Example ${i + 1} ---\n${ex}`).join("\n\n")}`
73
+ : "";
74
+ 
75
+ return `Skill Name: ${skillName}
76
+ 
77
+ Current Skill Content:
78
+ ${currentContent}
79
+ 
80
+ Failure Patterns:
81
+ ${patternLines.join("\n\n")}
82
+ 
83
+ All Missed Queries:
84
+ ${missedLines}${feedbackSection}${fewShotSection}
85
+ 
86
+ Generate an improved full body for the "${skillName}" skill that would correctly handle the missed queries listed above. The body should include everything below the # Title line: description, ## Workflow Routing table, and any other sections. Output ONLY a JSON object with "proposed_body", "rationale", and "confidence" fields.`;
87
+ }
88
+
89
+ // ---------------------------------------------------------------------------
90
+ // Response parser
91
+ // ---------------------------------------------------------------------------
92
+
93
/**
 * Turn the raw text returned by the LLM into a validated body proposal.
 *
 * Markdown fences are stripped first, then the remainder is parsed as JSON and
 * checked field by field. `confidence` is clamped into [0, 1].
 *
 * @param raw - Raw LLM response text.
 * @returns The proposed body, the rationale and the clamped confidence.
 * @throws Error when the text is not valid JSON (the message includes the first
 *   200 characters of the cleaned text), when the parsed value is not an
 *   object, or when `proposed_body`, `rationale` or `confidence` is missing or
 *   has the wrong type.
 */
export function parseBodyProposalResponse(raw: string): {
  proposed_body: string;
  rationale: string;
  confidence: number;
} {
  const text = stripMarkdownFences(raw);

  let payload: unknown;
  try {
    payload = JSON.parse(text);
  } catch {
    throw new Error(`Failed to parse LLM response as JSON: ${text.slice(0, 200)}`);
  }

  if (payload === null || typeof payload !== "object") {
    throw new Error("LLM response is not a JSON object");
  }

  const { proposed_body, rationale, confidence } = payload as Record<string, unknown>;

  if (typeof proposed_body !== "string") {
    throw new Error("Missing or invalid 'proposed_body' field in LLM response");
  }
  if (typeof rationale !== "string") {
    throw new Error("Missing or invalid 'rationale' field in LLM response");
  }
  if (typeof confidence !== "number") {
    throw new Error("Missing or invalid 'confidence' field in LLM response");
  }

  const boundedConfidence = Math.max(0.0, Math.min(1.0, confidence));
  return { proposed_body, rationale, confidence: boundedConfidence };
}
132
+
133
+ // ---------------------------------------------------------------------------
134
+ // Proposal generator
135
+ // ---------------------------------------------------------------------------
136
+
137
/**
 * Ask the teacher LLM for an improved skill body and wrap the answer in a
 * pending evolution proposal.
 *
 * Steps: build the user prompt, call the LLM with BODY_GENERATOR_SYSTEM as the
 * system prompt, parse and validate the response, then assemble the proposal.
 * Errors from the LLM call or from response parsing are not caught here.
 *
 * @param currentContent - Current skill content; stored as `original_body`.
 * @param failurePatterns - Patterns shown to the model; their ids are recorded on the proposal.
 * @param missedQueries - Missed queries shown to the model.
 * @param skillName - Skill name; also embedded in the proposal id.
 * @param skillPath - Stored on the proposal as `skill_path`.
 * @param agent - Forwarded to callLlm.
 * @param modelFlag - Optional; forwarded to callLlm.
 * @param fewShotExamples - Optional reference examples included in the prompt.
 * @returns A proposal with target "body" and status "pending".
 */
export async function generateBodyProposal(
  currentContent: string,
  failurePatterns: FailurePattern[],
  missedQueries: string[],
  skillName: string,
  skillPath: string,
  agent: string,
  modelFlag?: string,
  fewShotExamples?: string[],
): Promise<BodyEvolutionProposal> {
  const userPrompt = buildBodyGenerationPrompt(
    currentContent,
    failurePatterns,
    missedQueries,
    skillName,
    fewShotExamples,
  );
  const llmOutput = await callLlm(BODY_GENERATOR_SYSTEM, userPrompt, agent, modelFlag);
  const parsed = parseBodyProposalResponse(llmOutput);

  const proposal: BodyEvolutionProposal = {
    proposal_id: `evo-body-${skillName}-${Date.now()}`,
    skill_name: skillName,
    skill_path: skillPath,
    original_body: currentContent,
    proposed_body: parsed.proposed_body,
    rationale: parsed.rationale,
    target: "body" as EvolutionTarget,
    failure_patterns: failurePatterns.map((pattern) => pattern.pattern_id),
    confidence: parsed.confidence,
    created_at: new Date().toISOString(),
    status: "pending",
  };
  return proposal;
}