selftune 0.2.18 → 0.2.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -4
- package/apps/local-dashboard/dist/assets/index-DnhnXQm6.js +60 -0
- package/apps/local-dashboard/dist/assets/index-_EcLywDg.css +1 -0
- package/apps/local-dashboard/dist/assets/vendor-table-BIiI3YhS.js +1 -0
- package/apps/local-dashboard/dist/assets/vendor-ui-CGEmUayx.js +12 -0
- package/apps/local-dashboard/dist/index.html +5 -5
- package/cli/selftune/alpha-upload/stage-canonical.ts +7 -6
- package/cli/selftune/constants.ts +10 -0
- package/cli/selftune/contribute/contribute.ts +30 -2
- package/cli/selftune/contribution-config.ts +249 -0
- package/cli/selftune/contribution-relay.ts +177 -0
- package/cli/selftune/contribution-signals.ts +219 -0
- package/cli/selftune/contribution-staging.ts +147 -0
- package/cli/selftune/contributions.ts +532 -0
- package/cli/selftune/creator-contributions.ts +333 -0
- package/cli/selftune/dashboard-contract.ts +205 -1
- package/cli/selftune/dashboard-server.ts +45 -11
- package/cli/selftune/eval/family-overlap.ts +395 -0
- package/cli/selftune/eval/hooks-to-evals.ts +182 -28
- package/cli/selftune/eval/synthetic-evals.ts +298 -11
- package/cli/selftune/export.ts +2 -2
- package/cli/selftune/index.ts +41 -5
- package/cli/selftune/ingestors/codex-rollout.ts +31 -35
- package/cli/selftune/ingestors/codex-wrapper.ts +32 -24
- package/cli/selftune/localdb/db.ts +2 -2
- package/cli/selftune/localdb/queries.ts +701 -30
- package/cli/selftune/localdb/schema.ts +20 -0
- package/cli/selftune/recover.ts +153 -0
- package/cli/selftune/repair/skill-usage.ts +363 -4
- package/cli/selftune/routes/actions.ts +35 -1
- package/cli/selftune/routes/analytics.ts +14 -0
- package/cli/selftune/routes/index.ts +1 -0
- package/cli/selftune/routes/overview.ts +112 -4
- package/cli/selftune/routes/skill-report.ts +569 -10
- package/cli/selftune/status.ts +81 -2
- package/cli/selftune/sync.ts +56 -2
- package/cli/selftune/trust-model.ts +66 -0
- package/cli/selftune/types.ts +49 -0
- package/cli/selftune/utils/skill-detection.ts +43 -0
- package/cli/selftune/watchlist.ts +65 -0
- package/package.json +1 -1
- package/packages/ui/src/components/ActivityTimeline.tsx +165 -150
- package/packages/ui/src/components/EvidenceViewer.tsx +335 -144
- package/packages/ui/src/components/EvolutionTimeline.tsx +58 -28
- package/packages/ui/src/components/OrchestrateRunsPanel.tsx +33 -16
- package/packages/ui/src/components/RecentActivityFeed.tsx +72 -41
- package/packages/ui/src/components/section-cards.tsx +12 -9
- package/packages/ui/src/primitives/card.tsx +1 -1
- package/skill/SKILL.md +11 -1
- package/skill/Workflows/AlphaUpload.md +4 -0
- package/skill/Workflows/Composability.md +64 -0
- package/skill/Workflows/Contribute.md +6 -3
- package/skill/Workflows/Contributions.md +97 -0
- package/skill/Workflows/CreatorContributions.md +74 -0
- package/skill/Workflows/Dashboard.md +31 -0
- package/skill/Workflows/Evals.md +57 -8
- package/skill/Workflows/Ingest.md +7 -0
- package/skill/Workflows/Initialize.md +20 -1
- package/skill/Workflows/Recover.md +84 -0
- package/skill/Workflows/RepairSkillUsage.md +12 -4
- package/skill/Workflows/Sync.md +18 -12
- package/apps/local-dashboard/dist/assets/index-BMIS6uUh.css +0 -2
- package/apps/local-dashboard/dist/assets/index-DOu3iLD9.js +0 -16
- package/apps/local-dashboard/dist/assets/vendor-table-pHbDxq36.js +0 -8
- package/apps/local-dashboard/dist/assets/vendor-ui-DIwlrGlb.js +0 -12
|
@@ -0,0 +1,395 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
|
|
3
|
+
import { parseArgs } from "node:util";
|
|
4
|
+
|
|
5
|
+
import { getDb } from "../localdb/db.js";
|
|
6
|
+
import { queryQueryLog, querySkillUsageRecords } from "../localdb/queries.js";
|
|
7
|
+
import type {
|
|
8
|
+
QueryLogRecord,
|
|
9
|
+
SkillFamilyOverlapMember,
|
|
10
|
+
SkillFamilyOverlapPair,
|
|
11
|
+
SkillFamilyOverlapReport,
|
|
12
|
+
SkillFamilyRefactorProposal,
|
|
13
|
+
SkillUsageRecord,
|
|
14
|
+
} from "../types.js";
|
|
15
|
+
import { CLIError } from "../utils/cli-error.js";
|
|
16
|
+
import {
|
|
17
|
+
findInstalledSkillNames,
|
|
18
|
+
findInstalledSkillPath,
|
|
19
|
+
findRepositoryClaudeSkillDirs,
|
|
20
|
+
findRepositorySkillDirs,
|
|
21
|
+
} from "../utils/skill-discovery.js";
|
|
22
|
+
import { buildEvalSet } from "./hooks-to-evals.js";
|
|
23
|
+
|
|
24
|
+
const DEFAULT_MIN_OVERLAP = 0.3;
|
|
25
|
+
const DEFAULT_MIN_SHARED = 2;
|
|
26
|
+
const DEFAULT_MAX_SHARED = 10;
|
|
27
|
+
|
|
28
|
+
interface FamilyOverlapOptions {
|
|
29
|
+
familyPrefix?: string;
|
|
30
|
+
parentSkillName?: string;
|
|
31
|
+
minOverlapPct?: number;
|
|
32
|
+
minSharedQueries?: number;
|
|
33
|
+
maxSharedQueries?: number;
|
|
34
|
+
searchDirs?: string[];
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
/**
 * Collect the directories that are searched for installed skills.
 *
 * Order: repository-level skill directories discovered from the current
 * working directory, then the per-user `.agents`, `.claude` and Codex skill
 * folders.
 *
 * The home directory is read from `HOME`, falling back to `USERPROFILE`
 * (the variable Windows sets) so the per-user entries do not silently
 * collapse to root-relative paths such as `/.claude/skills` when `HOME` is
 * unset or empty. `CODEX_HOME`, when set, replaces `<home>/.codex` as the
 * Codex base directory.
 *
 * @returns Candidate skill directories; entries are not checked for existence here.
 */
function getEvalSkillSearchDirs(): string[] {
  const cwd = process.cwd();
  // `||` rather than `??` so an empty HOME also falls through to USERPROFILE.
  const homeDir = process.env.HOME || process.env.USERPROFILE || "";
  const codexHome = process.env.CODEX_HOME ?? `${homeDir}/.codex`;
  return [
    ...findRepositorySkillDirs(cwd),
    ...findRepositoryClaudeSkillDirs(cwd),
    `${homeDir}/.agents/skills`,
    `${homeDir}/.claude/skills`,
    `${codexHome}/skills`,
  ];
}
|
|
49
|
+
|
|
50
|
+
/**
 * Canonicalize a query so that textually equivalent queries compare equal:
 * surrounding whitespace is removed, every internal whitespace run becomes a
 * single space, and the text is lower-cased.
 *
 * @param value Raw query text.
 * @returns The normalized query (empty string when `value` is only whitespace).
 */
function normalizeQuery(value: string): string {
  // After trim() there is no leading/trailing whitespace, so splitting on
  // whitespace runs and re-joining with one space collapses internal runs only.
  const collapsed = value.trim().split(/\s+/).join(" ");
  return collapsed.toLowerCase();
}
|
|
53
|
+
|
|
54
|
+
/**
 * Derive the shared family prefix of a set of skill names.
 *
 * Each name contributes its leading segment: everything up to and including
 * the first hyphen, or the whole name when it has no hyphen. The prefix is
 * returned only when every name contributes the same segment.
 *
 * @param skills Skill names to inspect.
 * @returns The common leading segment (e.g. `"sc-"`), or `undefined` when
 *   fewer than two names are given or the segments differ.
 */
function inferFamilyPrefix(skills: string[]): string | undefined {
  if (skills.length < 2) {
    return undefined;
  }

  const leadingSegments = skills.map((name) => {
    const dashAt = name.indexOf("-");
    return dashAt < 0 ? name : name.substring(0, dashAt + 1);
  });

  const [first, ...rest] = leadingSegments;
  for (const segment of rest) {
    if (segment !== first) {
      return undefined;
    }
  }
  return first;
}
|
|
63
|
+
|
|
64
|
+
/**
 * Choose the name of the parent skill a family would be consolidated under.
 *
 * Precedence: a non-blank explicit name (trimmed) wins; otherwise the given
 * family prefix, then the prefix inferred from the skill names, then the
 * literal `"family"`. A single trailing hyphen on the chosen prefix is removed.
 *
 * @param skills Skill names used to infer a prefix when none is supplied.
 * @param explicitParent Caller-provided parent name, if any.
 * @param familyPrefix Known family prefix, if any.
 * @returns The parent skill name.
 */
function inferParentSkillName(
  skills: string[],
  explicitParent?: string,
  familyPrefix?: string,
): string {
  const trimmedParent = explicitParent?.trim();
  if (trimmedParent) {
    return trimmedParent;
  }

  const prefix = familyPrefix ?? inferFamilyPrefix(skills) ?? "family";
  if (prefix.endsWith("-")) {
    return prefix.slice(0, -1);
  }
  return prefix;
}
|
|
73
|
+
|
|
74
|
+
/**
 * Turn a skill name into the name of its internal workflow by dropping the
 * family prefix when the name starts with it.
 *
 * @param skillName Full skill name.
 * @param familyPrefix Prefix to strip; ignored when absent, empty, or not a
 *   prefix of `skillName`.
 * @returns The trimmed remainder, or `"default"` when nothing is left.
 */
function toWorkflowName(skillName: string, familyPrefix?: string): string {
  let remainder = skillName;
  if (familyPrefix && skillName.startsWith(familyPrefix)) {
    remainder = skillName.slice(familyPrefix.length);
  }

  const cleaned = remainder.trim();
  return cleaned === "" ? "default" : cleaned;
}
|
|
81
|
+
|
|
82
|
+
/**
 * Build the set of normalized queries that count as positives
 * (`should_trigger` entries) for one skill.
 *
 * The eval set is produced by `buildEvalSet`; only entries flagged
 * `should_trigger` are kept, each query is normalized, and queries that
 * normalize to the empty string are dropped.
 *
 * @param skillName Skill whose positives are collected.
 * @param skillRecords Skill usage records passed through to `buildEvalSet`.
 * @param queryRecords Query log records passed through to `buildEvalSet`.
 * @returns Distinct normalized positive queries.
 */
function buildPositiveQuerySet(
  skillName: string,
  skillRecords: SkillUsageRecord[],
  queryRecords: QueryLogRecord[],
): Set<string> {
  // NOTE(review): the positional arguments after `skillName` are presumably
  // (max per side, <flag>, seed, taxonomy annotation) — confirm against
  // buildEvalSet's signature. MAX_SAFE_INTEGER effectively removes the cap.
  const entries = buildEvalSet(
    skillRecords,
    queryRecords,
    skillName,
    Number.MAX_SAFE_INTEGER,
    false,
    42,
    false,
  );

  const positives = new Set<string>();
  for (const entry of entries) {
    if (!entry.should_trigger) continue;
    const normalized = normalizeQuery(entry.query);
    if (normalized) {
      positives.add(normalized);
    }
  }
  return positives;
}
|
|
103
|
+
|
|
104
|
+
/**
 * Describe one skill of the family for the overlap report.
 *
 * @param skillName Name of the skill.
 * @param positiveQueries Normalized positive queries collected for the skill.
 * @param searchDirs Directories searched for the installed skill.
 * @returns The member entry: name, the path reported by
 *   `findInstalledSkillPath`, and the number of positive queries.
 */
function buildMember(
  skillName: string,
  positiveQueries: Set<string>,
  searchDirs: string[],
): SkillFamilyOverlapMember {
  const skillPath = findInstalledSkillPath(skillName, searchDirs);
  const member: SkillFamilyOverlapMember = {
    skill_name: skillName,
    skill_path: skillPath,
    positive_query_count: positiveQueries.size,
  };
  return member;
}
|
|
115
|
+
|
|
116
|
+
/**
 * Bucket an overlap ratio into a consolidation-pressure label.
 *
 * @param overlapPct Overlap ratio for a skill pair.
 * @returns `"high"` at 0.6 and above, `"medium"` at 0.4 and above, otherwise
 *   `"low"` (which is also the result for `NaN`).
 */
function scoreConsolidationPressure(overlapPct: number): "low" | "medium" | "high" {
  return overlapPct >= 0.6 ? "high" : overlapPct >= 0.4 ? "medium" : "low";
}
|
|
121
|
+
|
|
122
|
+
/**
 * Draft the refactor proposal for consolidating a skill family: one internal
 * workflow per existing skill, a compatibility alias from each existing skill
 * to its workflow, and a fixed list of migration notes.
 *
 * @param skills Skill names in the family.
 * @param familyPrefix Prefix stripped from each skill name to form its workflow name.
 * @param parentSkillName Name of the proposed parent skill.
 * @returns The proposal object.
 */
function buildRefactorProposal(
  skills: string[],
  familyPrefix: string | undefined,
  parentSkillName: string,
): SkillFamilyRefactorProposal {
  const internalWorkflows: {
    workflow_name: string;
    source_skill: string;
    suggested_path: string;
  }[] = [];
  const compatibilityAliases: { skill_name: string; target_workflow: string }[] = [];

  // Workflows and aliases are built in lock-step so they stay index-aligned.
  for (const sourceSkill of skills) {
    const workflowName = toWorkflowName(sourceSkill, familyPrefix);
    internalWorkflows.push({
      workflow_name: workflowName,
      source_skill: sourceSkill,
      suggested_path: `Workflows/${workflowName}.md`,
    });
    compatibilityAliases.push({
      skill_name: sourceSkill,
      target_workflow: workflowName,
    });
  }

  const migrationNotes = [
    `Create a parent skill \`${parentSkillName}\` whose SKILL.md routes into internal workflows instead of exposing each family member as a primary top-level trigger surface.`,
    "Keep the existing sibling skills as thin compatibility aliases for at least one release cycle while usage shifts to the parent skill.",
    "Move execution-specific instructions into internal Workflows/ or references/ files so the parent SKILL.md stays focused on routing and progressive disclosure.",
    "Use the compatibility aliases to measure whether trigger quality improves before removing the old skill entry points.",
  ];

  return {
    parent_skill_name: parentSkillName,
    family_prefix: familyPrefix,
    internal_workflows: internalWorkflows,
    compatibility_aliases: compatibilityAliases,
    migration_notes: migrationNotes,
  };
}
|
|
152
|
+
|
|
153
|
+
/**
 * Measure how much a family of sibling skills competes for the same queries
 * and decide whether the family looks like a consolidation candidate.
 *
 * For every unordered pair of skills the shared normalized positive queries
 * are counted and scored with the overlap coefficient
 * `|A ∩ B| / min(|A|, |B|)`. A pair is reported only when it reaches both
 * `minSharedQueries` and `minOverlapPct`; pairs where either skill has no
 * positives are skipped but still counted in `total_pairs_analyzed`.
 *
 * Duplicate names in `skills` are collapsed (first occurrence wins) before the
 * analysis: a repeated name would otherwise be paired with itself at 100%
 * overlap and distort the pair count, density and consolidation verdict.
 *
 * @param skills Names of the skills to compare; at least two distinct names.
 * @param skillRecords Skill usage records used to derive positives.
 * @param queryRecords Query log records used to derive positives.
 * @param options Optional thresholds, naming overrides and search directories.
 *   A negative `maxSharedQueries` is treated as 0 (no sample queries listed).
 * @returns The overlap report; `refactor_proposal` is present only when the
 *   family is a consolidation candidate.
 * @throws CLIError (`INVALID_FLAG`) when fewer than two distinct skills are given.
 */
export function analyzeSkillFamilyOverlap(
  skills: string[],
  skillRecords: SkillUsageRecord[],
  queryRecords: QueryLogRecord[],
  options: FamilyOverlapOptions = {},
): SkillFamilyOverlapReport {
  // Fresh, de-duplicated copy: protects the pair math from self-pairs and
  // keeps the report from aliasing the caller's array.
  const familySkills = [...new Set(skills)];
  if (familySkills.length < 2) {
    throw new CLIError(
      "Skill family overlap analysis requires at least 2 skills.",
      "INVALID_FLAG",
      "selftune eval family-overlap --skills skill-a,skill-b",
    );
  }

  const searchDirs = options.searchDirs ?? getEvalSkillSearchDirs();
  const familyPrefix = options.familyPrefix ?? inferFamilyPrefix(familySkills);
  const minOverlapPct = options.minOverlapPct ?? DEFAULT_MIN_OVERLAP;
  const minSharedQueries = options.minSharedQueries ?? DEFAULT_MIN_SHARED;
  // slice(0, n) with negative n counts from the end, so clamp the cap at 0.
  const maxSharedQueries = Math.max(0, options.maxSharedQueries ?? DEFAULT_MAX_SHARED);

  const positiveQueriesBySkill = new Map<string, Set<string>>();
  const members: SkillFamilyOverlapMember[] = [];
  for (const skillName of familySkills) {
    const positives = buildPositiveQuerySet(skillName, skillRecords, queryRecords);
    positiveQueriesBySkill.set(skillName, positives);
    members.push(buildMember(skillName, positives, searchDirs));
  }

  const pairs: SkillFamilyOverlapPair[] = [];
  for (const [index, skillA] of familySkills.entries()) {
    const positivesA = positiveQueriesBySkill.get(skillA) ?? new Set<string>();
    if (positivesA.size === 0) continue;

    // Only look at skills after `skillA` so each unordered pair is visited once.
    for (const skillB of familySkills.slice(index + 1)) {
      const positivesB = positiveQueriesBySkill.get(skillB) ?? new Set<string>();
      if (positivesB.size === 0) continue;

      const sharedQueries = [...positivesA].filter((query) => positivesB.has(query));
      const overlapPct = sharedQueries.length / Math.min(positivesA.size, positivesB.size);
      if (sharedQueries.length < minSharedQueries || overlapPct < minOverlapPct) continue;

      pairs.push({
        skill_a: skillA,
        skill_b: skillB,
        overlap_pct: overlapPct,
        shared_query_count: sharedQueries.length,
        shared_queries: sharedQueries.slice(0, maxSharedQueries),
        consolidation_pressure: scoreConsolidationPressure(overlapPct),
      });
    }
  }

  // Strongest overlap first; ties broken by the larger shared-query count.
  pairs.sort(
    (a, b) => b.overlap_pct - a.overlap_pct || b.shared_query_count - a.shared_query_count,
  );

  const totalPairsAnalyzed = (familySkills.length * (familySkills.length - 1)) / 2;
  const overlapCount = pairs.length;
  const overlapDensity = totalPairsAnalyzed > 0 ? overlapCount / totalPairsAnalyzed : 0;
  const averageOverlapPct =
    overlapCount > 0 ? pairs.reduce((sum, pair) => sum + pair.overlap_pct, 0) / overlapCount : 0;
  // A skill is "ready" when it has at least as many positives as a pair needs to share.
  const readySkillCount = members.filter(
    (member) => member.positive_query_count >= minSharedQueries,
  ).length;
  const consolidationCandidate =
    readySkillCount >= 2 &&
    familySkills.length >= 3 &&
    (overlapCount >= 2 || (overlapCount >= 1 && overlapDensity >= 0.5));

  const parentSkillName = inferParentSkillName(familySkills, options.parentSkillName, familyPrefix);
  const rationale = [
    `${familySkills.length} sibling skills analyzed with ${totalPairsAnalyzed} pairwise boundary checks.`,
    overlapCount === 0
      ? "No exact-query overlap crossed the current consolidation threshold."
      : `${overlapCount} skill pairs share at least ${Math.round(minOverlapPct * 100)}% of their trusted positive queries.`,
  ];

  if (pairs.some((pair) => pair.consolidation_pressure === "high")) {
    rationale.push(
      "High-overlap pairs suggest the current top-level routing surfaces are competing for the same real user intent.",
    );
  }

  if (readySkillCount < 2) {
    rationale.push(
      `Only ${readySkillCount} sibling skills currently have enough trusted positives to make a packaging call. Generate cold-start evals and gather real usage before treating this as evidence against consolidation.`,
    );
  }

  if (consolidationCandidate) {
    rationale.push(
      "This family looks like a packaging problem, not just a wording problem. Test a parent skill with internal workflows before continuing standalone description optimization.",
    );
  }

  return {
    family_prefix: familyPrefix,
    analyzed_skills: familySkills,
    members,
    pairs,
    total_pairs_analyzed: totalPairsAnalyzed,
    overlap_count: overlapCount,
    overlap_density: overlapDensity,
    average_overlap_pct: averageOverlapPct,
    consolidation_candidate: consolidationCandidate,
    recommendation:
      readySkillCount < 2
        ? "Insufficient trusted telemetry to make a family-packaging call yet. Use cold-start evals plus a few days of real usage before deciding whether to consolidate."
        : consolidationCandidate
          ? `Consider consolidating this family under a parent skill like \`${parentSkillName}\`.`
          : "Keep the skills separate for now and continue improving boundaries at the description/workflow level.",
    rationale,
    refactor_proposal: consolidationCandidate
      ? buildRefactorProposal(familySkills, familyPrefix, parentSkillName)
      : undefined,
    generated_at: new Date().toISOString(),
  };
}
|
|
271
|
+
|
|
272
|
+
/**
 * Split a comma-separated `--skills` value into individual skill names.
 *
 * @param raw The raw flag value, possibly undefined or empty.
 * @returns Trimmed, non-empty names in their original order (duplicates kept).
 */
function parseSkillList(raw: string | undefined): string[] {
  const names: string[] = [];
  if (!raw) {
    return names;
  }

  for (const piece of raw.split(",")) {
    const name = piece.trim();
    if (name.length > 0) {
      names.push(name);
    }
  }
  return names;
}
|
|
279
|
+
|
|
280
|
+
/**
 * Work out which skills belong to the family being analyzed.
 *
 * Explicit names take priority and are returned de-duplicated and sorted.
 * Otherwise every installed skill name and every skill name seen in the usage
 * records that starts with `familyPrefix` is collected.
 *
 * @param explicitSkills Names given on the command line (may be empty).
 * @param familyPrefix Prefix used for discovery when no explicit names exist.
 * @param skillRecords Usage records whose `skill_name` values are considered.
 * @param searchDirs Directories searched for installed skills.
 * @returns Distinct skill names sorted with `localeCompare`.
 * @throws CLIError (`MISSING_FLAG`) when neither explicit names nor a prefix is given.
 */
function resolveFamilySkills(
  explicitSkills: string[],
  familyPrefix: string | undefined,
  skillRecords: SkillUsageRecord[],
  searchDirs: string[],
): string[] {
  const byName = (left: string, right: string): number => left.localeCompare(right);

  if (explicitSkills.length > 0) {
    return Array.from(new Set(explicitSkills)).sort(byName);
  }

  if (!familyPrefix) {
    throw new CLIError(
      "Pass either --skills <a,b,c> or --prefix <family->.",
      "MISSING_FLAG",
      "selftune eval family-overlap --prefix sc-",
    );
  }

  const matches = new Set<string>();
  // Installed skills first, then names that only appear in telemetry.
  for (const installed of findInstalledSkillNames(searchDirs)) {
    if (installed.startsWith(familyPrefix)) {
      matches.add(installed);
    }
  }
  for (const record of skillRecords) {
    const observed = record.skill_name;
    if (observed && observed.startsWith(familyPrefix)) {
      matches.add(observed);
    }
  }

  return Array.from(matches).sort(byName);
}
|
|
308
|
+
|
|
309
|
+
/**
 * Parse a numeric flag strictly: the whole (trimmed) string must be a number.
 * Unlike `parseInt`/`parseFloat`, trailing garbage such as `"2abc"` is not
 * accepted and yields `NaN`; a blank string also yields `NaN`.
 */
function parseStrictNumber(raw: string): number {
  const text = raw.trim();
  return text === "" ? Number.NaN : Number(text);
}

/**
 * CLI entry point for `selftune eval family-overlap`.
 *
 * Parses the flags, loads skill usage and query records from the local
 * database, resolves the family (explicit `--skills` or `--prefix`
 * discovery), runs the overlap analysis and prints the report as JSON.
 *
 * `--min-overlap` must be a number greater than 0 and at most 1;
 * `--min-shared` must be a whole number of at least 1. Both are validated
 * strictly, so values like `0.5abc` or `2.9` are rejected rather than being
 * silently truncated.
 *
 * @throws CLIError (`INVALID_FLAG`) for unparseable arguments, out-of-range
 *   thresholds, or fewer than two resolved skills; resolving the family may
 *   also raise `MISSING_FLAG` when neither `--skills` nor `--prefix` is given.
 */
export async function cliMain(): Promise<void> {
  let values: ReturnType<typeof parseArgs>["values"];
  try {
    ({ values } = parseArgs({
      options: {
        help: { type: "boolean", short: "h", default: false },
        prefix: { type: "string" },
        skills: { type: "string" },
        "parent-skill": { type: "string" },
        "min-overlap": { type: "string" },
        "min-shared": { type: "string" },
      },
      strict: true,
    }));
  } catch (error) {
    // Re-wrap parseArgs failures (unknown flag, missing value) as a CLIError.
    const message = error instanceof Error ? error.message : String(error);
    throw new CLIError(
      `Invalid arguments: ${message}`,
      "INVALID_FLAG",
      "selftune eval family-overlap --help",
    );
  }

  if (values.help) {
    console.log(`Usage:
  selftune eval family-overlap --skills skill-a,skill-b[,skill-c]
  selftune eval family-overlap --prefix sc-

Options:
  --skills <a,b,c>       Explicit skill names
  --prefix <family->     Analyze installed or observed skills with this prefix
  --parent-skill <name>  Override the inferred parent skill name
  --min-overlap <0-1>    Minimum overlap percentage (default: 0.3)
  --min-shared <n>       Minimum shared queries (default: 2)
  -h, --help             Show this help
`);
    return;
  }

  const rawMinOverlap = values["min-overlap"] as string | undefined;
  const rawMinShared = values["min-shared"] as string | undefined;
  const minOverlapPct =
    rawMinOverlap === undefined ? DEFAULT_MIN_OVERLAP : parseStrictNumber(rawMinOverlap);
  const minSharedQueries =
    rawMinShared === undefined ? DEFAULT_MIN_SHARED : parseStrictNumber(rawMinShared);

  if (!Number.isFinite(minOverlapPct) || minOverlapPct <= 0 || minOverlapPct > 1) {
    throw new CLIError(
      "Invalid --min-overlap value. Use a number between 0 and 1.",
      "INVALID_FLAG",
      "selftune eval family-overlap --prefix sc- --min-overlap 0.3",
    );
  }

  // isInteger also rejects NaN and Infinity, and stops "2.9" from passing as 2.
  if (!Number.isInteger(minSharedQueries) || minSharedQueries < 1) {
    throw new CLIError(
      "Invalid --min-shared value. Use a positive integer.",
      "INVALID_FLAG",
      "selftune eval family-overlap --prefix sc- --min-shared 2",
    );
  }

  const searchDirs = getEvalSkillSearchDirs();
  const db = getDb();
  const skillRecords = querySkillUsageRecords(db) as SkillUsageRecord[];
  const queryRecords = queryQueryLog(db) as QueryLogRecord[];
  // Blank flag values are treated the same as an omitted flag.
  const familyPrefix = (values.prefix as string | undefined)?.trim() || undefined;
  const explicitSkills = parseSkillList(values.skills as string | undefined);
  const skills = resolveFamilySkills(explicitSkills, familyPrefix, skillRecords, searchDirs);

  if (skills.length < 2) {
    throw new CLIError(
      `Need at least 2 skills to analyze, found ${skills.length}.`,
      "INVALID_FLAG",
      "selftune eval family-overlap --prefix sc-",
    );
  }

  const report = analyzeSkillFamilyOverlap(skills, skillRecords, queryRecords, {
    familyPrefix,
    parentSkillName: (values["parent-skill"] as string | undefined)?.trim() || undefined,
    minOverlapPct,
    minSharedQueries,
    searchDirs,
  });
  console.log(JSON.stringify(report, null, 2));
}
|
|
@@ -2,7 +2,8 @@
|
|
|
2
2
|
/**
|
|
3
3
|
* hooks-to-evals.ts
|
|
4
4
|
*
|
|
5
|
-
* Converts hook logs into trigger eval sets compatible with
|
|
5
|
+
* Converts hook logs into trigger eval sets compatible with the current
|
|
6
|
+
* eval-generate -> evolve --dry-run validation loop.
|
|
6
7
|
*
|
|
7
8
|
* Default read path is SQLite (via localdb/queries). JSONL fallback is used only
|
|
8
9
|
* when custom --skill-log / --query-log / --telemetry-log paths are supplied
|
|
@@ -43,6 +44,13 @@ import {
|
|
|
43
44
|
filterActionableSkillUsageRecords,
|
|
44
45
|
} from "../utils/query-filter.js";
|
|
45
46
|
import { seededShuffle } from "../utils/seeded-random.js";
|
|
47
|
+
import {
|
|
48
|
+
escapeRegExp,
|
|
49
|
+
findInstalledSkillNames,
|
|
50
|
+
findInstalledSkillPath,
|
|
51
|
+
findRepositoryClaudeSkillDirs,
|
|
52
|
+
findRepositorySkillDirs,
|
|
53
|
+
} from "../utils/skill-discovery.js";
|
|
46
54
|
import { isHighConfidencePositiveSkillRecord } from "../utils/skill-usage-confidence.js";
|
|
47
55
|
import { generateSyntheticEvals } from "./synthetic-evals.js";
|
|
48
56
|
|
|
@@ -78,14 +86,14 @@ export function classifyInvocation(query: string, skillName: string): Invocation
|
|
|
78
86
|
// Handle hyphenated skill names: check if all parts appear
|
|
79
87
|
if (skillLower.includes("-")) {
|
|
80
88
|
const parts = skillLower.split("-");
|
|
81
|
-
if (parts.every((part) =>
|
|
89
|
+
if (parts.every((part) => new RegExp(`\\b${escapeRegExp(part)}\\b`, "i").test(query))) {
|
|
82
90
|
return "explicit";
|
|
83
91
|
}
|
|
84
92
|
}
|
|
85
93
|
|
|
86
94
|
// Convert skill-name to camelCase and check
|
|
87
95
|
const camelCase = skillLower.replace(/-([a-z])/g, (_, c) => c.toUpperCase());
|
|
88
|
-
if (camelCase !== skillLower && qLower.includes(camelCase)) {
|
|
96
|
+
if (camelCase !== skillLower && qLower.includes(camelCase.toLowerCase())) {
|
|
89
97
|
return "explicit";
|
|
90
98
|
}
|
|
91
99
|
|
|
@@ -207,6 +215,78 @@ export function buildEvalSet(
|
|
|
207
215
|
return [...shuffledPositives, ...negatives];
|
|
208
216
|
}
|
|
209
217
|
|
|
218
|
+
// ---------------------------------------------------------------------------
|
|
219
|
+
// Installed skill discovery / readiness
|
|
220
|
+
// ---------------------------------------------------------------------------
|
|
221
|
+
|
|
222
|
+
/**
 * Per-skill summary of how ready a skill is for eval generation, as produced
 * by `listEvalSkillReadiness`.
 */
export interface EvalSkillReadiness {
  /** Skill name (`"unknown"` for usage records that carry no skill name). */
  name: string;
  /** Number of actionable usage records that pass the high-confidence positive check. */
  trusted_trigger_count: number;
  /** Number of actionable usage records for the skill, trusted or not. */
  raw_trigger_count: number;
  /** Distinct session ids among the trusted records. */
  trusted_session_count: number;
  /** Distinct session ids among all actionable records. */
  raw_session_count: number;
  /** Whether the skill was found in the searched skill directories. */
  installed: boolean;
  /** Resolved path for the installed skill; only looked up when `installed` is true. */
  skill_path?: string;
  /**
   * `log_ready` when at least one trusted trigger exists; otherwise
   * `cold_start_ready` when the skill is installed; otherwise `telemetry_only`.
   */
  readiness: "log_ready" | "cold_start_ready" | "telemetry_only";
}
|
|
232
|
+
|
|
233
|
+
/**
 * Collect the directories that are searched for installed skills.
 *
 * Order: repository-level skill directories discovered from the current
 * working directory, then the per-user `.agents`, `.claude` and Codex skill
 * folders.
 *
 * The home directory is read from `HOME`, falling back to `USERPROFILE`
 * (the variable Windows sets) so the per-user entries do not silently
 * collapse to root-relative paths such as `/.claude/skills` when `HOME` is
 * unset or empty. `CODEX_HOME`, when set, replaces `<home>/.codex` as the
 * Codex base directory.
 *
 * @returns Candidate skill directories; entries are not checked for existence here.
 */
function getEvalSkillSearchDirs(): string[] {
  const cwd = process.cwd();
  // `||` rather than `??` so an empty HOME also falls through to USERPROFILE.
  const homeDir = process.env.HOME || process.env.USERPROFILE || "";
  const codexHome = process.env.CODEX_HOME ?? `${homeDir}/.codex`;
  return [
    ...findRepositorySkillDirs(cwd),
    ...findRepositoryClaudeSkillDirs(cwd),
    `${homeDir}/.agents/skills`,
    `${homeDir}/.claude/skills`,
    `${codexHome}/skills`,
  ];
}
|
|
245
|
+
|
|
246
|
+
/**
 * Summarize, for every skill that is either installed or present in the
 * actionable usage records, how much trigger data exists and whether the
 * skill is ready for eval generation.
 *
 * Records are first reduced with `filterActionableSkillUsageRecords`. Every
 * remaining record counts as a raw trigger; records that also pass
 * `isHighConfidencePositiveSkillRecord` count as trusted triggers. Session
 * counts are the number of distinct non-empty `session_id` values.
 *
 * @param skillRecords Skill usage records to tally.
 * @param searchDirs Directories searched for installed skills; defaults to the
 *   standard eval search directories.
 * @returns One entry per skill, sorted by name with `localeCompare`.
 */
export function listEvalSkillReadiness(
  skillRecords: SkillUsageRecord[],
  searchDirs: string[] = getEvalSkillSearchDirs(),
): EvalSkillReadiness[] {
  type Tally = {
    rawTriggers: number;
    rawSessions: Set<string>;
    trustedTriggers: number;
    trustedSessions: Set<string>;
  };
  const tallies = new Map<string, Tally>();

  for (const record of filterActionableSkillUsageRecords(skillRecords)) {
    const skillName = record.skill_name ?? "unknown";
    let tally = tallies.get(skillName);
    if (tally === undefined) {
      tally = {
        rawTriggers: 0,
        rawSessions: new Set<string>(),
        trustedTriggers: 0,
        trustedSessions: new Set<string>(),
      };
      tallies.set(skillName, tally);
    }

    tally.rawTriggers += 1;
    if (record.session_id) tally.rawSessions.add(record.session_id);

    if (isHighConfidencePositiveSkillRecord(record, skillName)) {
      tally.trustedTriggers += 1;
      if (record.session_id) tally.trustedSessions.add(record.session_id);
    }
  }

  const installedNames = findInstalledSkillNames(searchDirs);
  // Union of skills seen in telemetry and skills found on disk.
  const orderedNames = Array.from(new Set<string>([...tallies.keys(), ...installedNames]));
  orderedNames.sort((left, right) => left.localeCompare(right));

  const report: EvalSkillReadiness[] = [];
  for (const name of orderedNames) {
    const tally = tallies.get(name);
    const trustedTriggerCount = tally?.trustedTriggers ?? 0;
    const installed = installedNames.has(name);

    let readiness: EvalSkillReadiness["readiness"];
    if (trustedTriggerCount > 0) {
      readiness = "log_ready";
    } else if (installed) {
      readiness = "cold_start_ready";
    } else {
      readiness = "telemetry_only";
    }

    report.push({
      name,
      trusted_trigger_count: trustedTriggerCount,
      raw_trigger_count: tally?.rawTriggers ?? 0,
      trusted_session_count: tally?.trustedSessions.size ?? 0,
      raw_session_count: tally?.rawSessions.size ?? 0,
      installed,
      skill_path: installed ? findInstalledSkillPath(name, searchDirs) : undefined,
      readiness,
    });
  }
  return report;
}
|
|
289
|
+
|
|
210
290
|
// ---------------------------------------------------------------------------
|
|
211
291
|
// List skills
|
|
212
292
|
// ---------------------------------------------------------------------------
|
|
@@ -216,24 +296,37 @@ export function listSkills(
|
|
|
216
296
|
queryRecords: QueryLogRecord[],
|
|
217
297
|
telemetryRecords: SessionTelemetryRecord[],
|
|
218
298
|
): void {
|
|
219
|
-
const actionableSkillRecords = filterActionableSkillUsageRecords(skillRecords);
|
|
220
299
|
const actionableQueryRecords = filterActionableQueryRecords(queryRecords);
|
|
221
|
-
const
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
300
|
+
const readiness = listEvalSkillReadiness(skillRecords);
|
|
301
|
+
|
|
302
|
+
console.log(`Skills with eval readiness (${readiness.length} total):`);
|
|
303
|
+
if (readiness.length > 0) {
|
|
304
|
+
for (const skill of readiness) {
|
|
305
|
+
const readinessLabel =
|
|
306
|
+
skill.readiness === "log_ready"
|
|
307
|
+
? "log-ready"
|
|
308
|
+
: skill.readiness === "cold_start_ready"
|
|
309
|
+
? "cold-start"
|
|
310
|
+
: "telemetry-only";
|
|
311
|
+
const installLabel = skill.installed ? "installed" : "not installed";
|
|
312
|
+
const trustedLabel = `${String(skill.trusted_trigger_count).padStart(3)} trusted`;
|
|
313
|
+
const rawLabel =
|
|
314
|
+
skill.raw_trigger_count !== skill.trusted_trigger_count
|
|
315
|
+
? ` / ${String(skill.raw_trigger_count).padStart(3)} raw`
|
|
316
|
+
: "";
|
|
317
|
+
console.log(
|
|
318
|
+
` ${skill.name.padEnd(30)} ${trustedLabel}${rawLabel} ${String(skill.trusted_session_count).padStart(3)} trusted sessions ${readinessLabel} / ${installLabel}`,
|
|
319
|
+
);
|
|
234
320
|
}
|
|
321
|
+
console.log("");
|
|
322
|
+
console.log("Legend:");
|
|
323
|
+
console.log(" log-ready real triggers exist; run eval generate normally");
|
|
324
|
+
console.log(
|
|
325
|
+
" cold-start installed locally but no trusted triggers yet; use --auto-synthetic",
|
|
326
|
+
);
|
|
327
|
+
console.log(" telemetry-only trigger data exists but local SKILL.md was not found");
|
|
235
328
|
} else {
|
|
236
|
-
console.log(" (none yet --
|
|
329
|
+
console.log(" (none yet -- install skills or sync source data first)");
|
|
237
330
|
}
|
|
238
331
|
|
|
239
332
|
console.log(`\nActionable queries in all_queries_log: ${actionableQueryRecords.length}`);
|
|
@@ -370,15 +463,25 @@ export function printEvalStats(
|
|
|
370
463
|
}
|
|
371
464
|
|
|
372
465
|
console.log("Next steps:");
|
|
373
|
-
console.log(
|
|
466
|
+
console.log(` selftune evolve --skill ${skillName} \\`);
|
|
467
|
+
console.log(` --skill-path /path/to/skills/${skillName}/SKILL.md \\`);
|
|
374
468
|
console.log(` --eval-set ${outputPath} \\`);
|
|
375
|
-
console.log(
|
|
376
|
-
console.log(" --runs-per-query 3 --verbose");
|
|
469
|
+
console.log(" --dry-run --verbose");
|
|
377
470
|
console.log();
|
|
378
|
-
console.log(
|
|
379
|
-
console.log(` --
|
|
380
|
-
console.log(` --
|
|
381
|
-
|
|
471
|
+
console.log(` selftune evolve --skill ${skillName} \\`);
|
|
472
|
+
console.log(` --skill-path /path/to/skills/${skillName}/SKILL.md \\`);
|
|
473
|
+
console.log(` --eval-set ${outputPath}`);
|
|
474
|
+
}
|
|
475
|
+
|
|
476
|
+
/**
 * Print a tip telling the user that, although no trusted trigger data exists
 * for the skill, a cold-start eval set can still be generated from the
 * locally installed skill.
 *
 * @param skillName Skill the tip is about.
 * @param skillPath Path inserted into the suggested `--skill-path` argument.
 */
function printSyntheticFallbackHint(skillName: string, skillPath: string): void {
  const lines = [
    "",
    `[TIP] No trusted trigger data found yet for '${skillName}'.`,
    " This skill is installed locally, so you can still generate a cold-start eval set:",
    ` selftune eval generate --skill ${skillName} --auto-synthetic --skill-path ${skillPath}`,
  ];
  for (const line of lines) {
    console.log(line);
  }
}
|
|
383
486
|
|
|
384
487
|
// ---------------------------------------------------------------------------
|
|
@@ -401,6 +504,7 @@ export async function cliMain(): Promise<void> {
|
|
|
401
504
|
"query-log": { type: "string", default: QUERY_LOG },
|
|
402
505
|
"telemetry-log": { type: "string", default: TELEMETRY_LOG },
|
|
403
506
|
synthetic: { type: "boolean", default: false },
|
|
507
|
+
"auto-synthetic": { type: "boolean", default: false },
|
|
404
508
|
"skill-path": { type: "string" },
|
|
405
509
|
model: { type: "string" },
|
|
406
510
|
},
|
|
@@ -466,10 +570,10 @@ export async function cliMain(): Promise<void> {
|
|
|
466
570
|
}
|
|
467
571
|
|
|
468
572
|
console.log("\nNext steps:");
|
|
469
|
-
console.log(
|
|
470
|
-
console.log(` --eval-set ${outputPath} \\`);
|
|
573
|
+
console.log(` selftune evolve --skill ${values.skill} \\`);
|
|
471
574
|
console.log(` --skill-path ${values["skill-path"]} \\`);
|
|
472
|
-
console.log(
|
|
575
|
+
console.log(` --eval-set ${outputPath} \\`);
|
|
576
|
+
console.log(" --dry-run --verbose");
|
|
473
577
|
return;
|
|
474
578
|
}
|
|
475
579
|
|
|
@@ -504,6 +608,8 @@ export async function cliMain(): Promise<void> {
|
|
|
504
608
|
const maxPerSide = Number.parseInt(values.max ?? "50", 10);
|
|
505
609
|
const seed = Number.parseInt(values.seed ?? "42", 10);
|
|
506
610
|
const annotateTaxonomy = !values["no-taxonomy"];
|
|
611
|
+
const searchDirs = getEvalSkillSearchDirs();
|
|
612
|
+
const detectedSkillPath = findInstalledSkillPath(values.skill, searchDirs);
|
|
507
613
|
|
|
508
614
|
const evalSet = buildEvalSet(
|
|
509
615
|
skillRecords,
|
|
@@ -515,9 +621,57 @@ export async function cliMain(): Promise<void> {
|
|
|
515
621
|
annotateTaxonomy,
|
|
516
622
|
);
|
|
517
623
|
|
|
624
|
+
const positiveCount = evalSet.filter((entry) => entry.should_trigger).length;
|
|
625
|
+
if (positiveCount === 0 && values["auto-synthetic"]) {
|
|
626
|
+
const skillPath = values["skill-path"] ?? detectedSkillPath;
|
|
627
|
+
if (!skillPath) {
|
|
628
|
+
throw new CLIError(
|
|
629
|
+
`No trusted triggers found for '${values.skill}', and no SKILL.md path could be resolved for synthetic fallback.`,
|
|
630
|
+
"FILE_NOT_FOUND",
|
|
631
|
+
`Run 'selftune eval generate --list-skills' or rerun with --skill-path /path/to/SKILL.md`,
|
|
632
|
+
);
|
|
633
|
+
}
|
|
634
|
+
|
|
635
|
+
const agent = detectAgent();
|
|
636
|
+
if (!agent) {
|
|
637
|
+
throw new CLIError(
|
|
638
|
+
"No agent CLI found (claude/codex/opencode)",
|
|
639
|
+
"AGENT_NOT_FOUND",
|
|
640
|
+
"Install one of the supported agent CLIs",
|
|
641
|
+
);
|
|
642
|
+
}
|
|
643
|
+
|
|
644
|
+
console.log(
|
|
645
|
+
`No trusted triggers found for '${values.skill}'. Falling back to synthetic cold-start eval generation...`,
|
|
646
|
+
);
|
|
647
|
+
const effectiveMax = Number.isNaN(maxPerSide) || maxPerSide <= 0 ? 50 : maxPerSide;
|
|
648
|
+
const syntheticEvalSet = await generateSyntheticEvals(skillPath, values.skill, agent, {
|
|
649
|
+
maxPositives: effectiveMax,
|
|
650
|
+
maxNegatives: effectiveMax,
|
|
651
|
+
modelFlag: values.model,
|
|
652
|
+
});
|
|
653
|
+
const outputPath = values.output ?? values.out ?? `${values.skill}_trigger_eval.json`;
|
|
654
|
+
writeFileSync(outputPath, JSON.stringify(syntheticEvalSet, null, 2), "utf-8");
|
|
655
|
+
const pos = syntheticEvalSet.filter((e) => e.should_trigger);
|
|
656
|
+
const neg = syntheticEvalSet.filter((e) => !e.should_trigger);
|
|
657
|
+
|
|
658
|
+
console.log(`Wrote ${syntheticEvalSet.length} synthetic eval entries to ${outputPath}`);
|
|
659
|
+
console.log(` Positives (should_trigger=true) : ${pos.length}`);
|
|
660
|
+
console.log(` Negatives (should_trigger=false): ${neg.length}`);
|
|
661
|
+
console.log("\nNext steps:");
|
|
662
|
+
console.log(` selftune evolve --skill ${values.skill} \\`);
|
|
663
|
+
console.log(` --skill-path ${skillPath} \\`);
|
|
664
|
+
console.log(` --eval-set ${outputPath} \\`);
|
|
665
|
+
console.log(" --dry-run --verbose");
|
|
666
|
+
return;
|
|
667
|
+
}
|
|
668
|
+
|
|
518
669
|
const outputPath = values.output ?? values.out ?? `${values.skill}_trigger_eval.json`;
|
|
519
670
|
writeFileSync(outputPath, JSON.stringify(evalSet, null, 2), "utf-8");
|
|
520
671
|
printEvalStats(evalSet, values.skill, outputPath, skillRecords, queryRecords, annotateTaxonomy);
|
|
672
|
+
if (positiveCount === 0 && detectedSkillPath) {
|
|
673
|
+
printSyntheticFallbackHint(values.skill, detectedSkillPath);
|
|
674
|
+
}
|
|
521
675
|
}
|
|
522
676
|
|
|
523
677
|
if (import.meta.main) {
|