selftune 0.2.18 → 0.2.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -4
- package/apps/local-dashboard/dist/assets/index-DnhnXQm6.js +60 -0
- package/apps/local-dashboard/dist/assets/index-_EcLywDg.css +1 -0
- package/apps/local-dashboard/dist/assets/vendor-table-BIiI3YhS.js +1 -0
- package/apps/local-dashboard/dist/assets/vendor-ui-CGEmUayx.js +12 -0
- package/apps/local-dashboard/dist/index.html +5 -5
- package/cli/selftune/alpha-upload/stage-canonical.ts +7 -6
- package/cli/selftune/constants.ts +10 -0
- package/cli/selftune/contribute/contribute.ts +30 -2
- package/cli/selftune/contribution-config.ts +249 -0
- package/cli/selftune/contribution-relay.ts +177 -0
- package/cli/selftune/contribution-signals.ts +219 -0
- package/cli/selftune/contribution-staging.ts +147 -0
- package/cli/selftune/contributions.ts +532 -0
- package/cli/selftune/creator-contributions.ts +333 -0
- package/cli/selftune/dashboard-contract.ts +205 -1
- package/cli/selftune/dashboard-server.ts +45 -11
- package/cli/selftune/eval/family-overlap.ts +395 -0
- package/cli/selftune/eval/hooks-to-evals.ts +182 -28
- package/cli/selftune/eval/synthetic-evals.ts +298 -11
- package/cli/selftune/export.ts +2 -2
- package/cli/selftune/index.ts +41 -5
- package/cli/selftune/ingestors/codex-rollout.ts +31 -35
- package/cli/selftune/ingestors/codex-wrapper.ts +32 -24
- package/cli/selftune/localdb/db.ts +2 -2
- package/cli/selftune/localdb/queries.ts +701 -30
- package/cli/selftune/localdb/schema.ts +20 -0
- package/cli/selftune/recover.ts +153 -0
- package/cli/selftune/repair/skill-usage.ts +363 -4
- package/cli/selftune/routes/actions.ts +35 -1
- package/cli/selftune/routes/analytics.ts +14 -0
- package/cli/selftune/routes/index.ts +1 -0
- package/cli/selftune/routes/overview.ts +112 -4
- package/cli/selftune/routes/skill-report.ts +569 -10
- package/cli/selftune/status.ts +81 -2
- package/cli/selftune/sync.ts +56 -2
- package/cli/selftune/trust-model.ts +66 -0
- package/cli/selftune/types.ts +49 -0
- package/cli/selftune/utils/skill-detection.ts +43 -0
- package/cli/selftune/watchlist.ts +65 -0
- package/package.json +1 -1
- package/packages/ui/src/components/ActivityTimeline.tsx +165 -150
- package/packages/ui/src/components/EvidenceViewer.tsx +335 -144
- package/packages/ui/src/components/EvolutionTimeline.tsx +58 -28
- package/packages/ui/src/components/OrchestrateRunsPanel.tsx +33 -16
- package/packages/ui/src/components/RecentActivityFeed.tsx +72 -41
- package/packages/ui/src/components/section-cards.tsx +12 -9
- package/packages/ui/src/primitives/card.tsx +1 -1
- package/skill/SKILL.md +11 -1
- package/skill/Workflows/AlphaUpload.md +4 -0
- package/skill/Workflows/Composability.md +64 -0
- package/skill/Workflows/Contribute.md +6 -3
- package/skill/Workflows/Contributions.md +97 -0
- package/skill/Workflows/CreatorContributions.md +74 -0
- package/skill/Workflows/Dashboard.md +31 -0
- package/skill/Workflows/Evals.md +57 -8
- package/skill/Workflows/Ingest.md +7 -0
- package/skill/Workflows/Initialize.md +20 -1
- package/skill/Workflows/Recover.md +84 -0
- package/skill/Workflows/RepairSkillUsage.md +12 -4
- package/skill/Workflows/Sync.md +18 -12
- package/apps/local-dashboard/dist/assets/index-BMIS6uUh.css +0 -2
- package/apps/local-dashboard/dist/assets/index-DOu3iLD9.js +0 -16
- package/apps/local-dashboard/dist/assets/vendor-table-pHbDxq36.js +0 -8
- package/apps/local-dashboard/dist/assets/vendor-ui-DIwlrGlb.js +0 -12
|
@@ -10,6 +10,7 @@ import { readFileSync } from "node:fs";
|
|
|
10
10
|
|
|
11
11
|
import type { EvalEntry, InvocationType } from "../types.js";
|
|
12
12
|
import { callLlm, stripMarkdownFences } from "../utils/llm-call.js";
|
|
13
|
+
import { findInstalledSkillNames } from "../utils/skill-discovery.js";
|
|
13
14
|
import { classifyInvocation } from "./hooks-to-evals.js";
|
|
14
15
|
|
|
15
16
|
// ---------------------------------------------------------------------------
|
|
@@ -28,6 +29,181 @@ interface RawSyntheticEntry {
|
|
|
28
29
|
invocation_type?: string;
|
|
29
30
|
}
|
|
30
31
|
|
|
32
|
+
/**
 * Real user queries passed to the synthetic prompt builder as style and
 * phrasing references, grouped into a positive and a negative list.
 */
interface SyntheticPromptRealExamples {
  /** Reference queries from the positive group. */
  positive: Array<string>;
  /** Reference queries from the negative group. */
  negative: Array<string>;
}
|
|
36
|
+
|
|
37
|
+
/**
 * Per-category query counts requested from the model when building or
 * refining a synthetic eval benchmark.
 */
interface PromptFamilyTargets {
  // Positive queries (should_trigger: true), by invocation style.
  /** Number of explicit positives. */
  explicitCount: number;
  /** Number of implicit positives. */
  implicitCount: number;
  /** Number of contextual positives. */
  contextualCount: number;

  // Negative queries (should_trigger: false), by kind of control.
  /** Number of sibling-skill confusion negatives. */
  siblingNegativeCount: number;
  /** Number of adjacent but wrong-intent negatives. */
  adjacentNegativeCount: number;
  /** Number of clearly unrelated negatives. */
  unrelatedNegativeCount: number;
}
|
|
45
|
+
|
|
46
|
+
/**
 * Lists the directories to scan for installed skills when inferring siblings.
 *
 * Order of the result:
 *   1. `<cwd>/.agents/skills` and `<cwd>/.claude/skills`
 *   2. `<home>/.agents/skills` and `<home>/.claude/skills`
 *   3. `<codexHome>/skills`, where `codexHome` is `CODEX_HOME` or `<home>/.codex`
 *
 * The home directory is read from `HOME`, falling back to `USERPROFILE`.
 * Empty environment values are treated as unset. When no home directory can
 * be determined, the home-relative entries are omitted instead of being
 * resolved against the filesystem root (e.g. `/.agents/skills`).
 *
 * @returns Candidate skill directories; this function does not check that they exist.
 */
function getSyntheticSkillSearchDirs(): string[] {
  const cwd = process.cwd();
  // `||` (not `??`) so that an empty string counts as "not set".
  const homeDir = process.env.HOME || process.env.USERPROFILE || "";
  const codexHome = process.env.CODEX_HOME || (homeDir ? `${homeDir}/.codex` : "");

  const dirs = [`${cwd}/.agents/skills`, `${cwd}/.claude/skills`];
  if (homeDir) {
    dirs.push(`${homeDir}/.agents/skills`, `${homeDir}/.claude/skills`);
  }
  if (codexHome) {
    dirs.push(`${codexHome}/skills`);
  }
  return dirs;
}
|
|
58
|
+
|
|
59
|
+
/**
 * Picks up to five installed skills to use as "sibling" boundary references
 * for `skillName`.
 *
 * Skills whose lowercased name starts with the same `<prefix>-` (the part of
 * `skillName` before its first dash) come first, followed by every other
 * installed skill. Each group is sorted with `localeCompare`, and the skill
 * itself (compared case-insensitively) is always excluded.
 *
 * @param skillName Skill to find siblings for; blank names yield `[]`.
 * @param searchDirs Directories handed to `findInstalledSkillNames`.
 * @returns At most five sibling skill names.
 */
function inferSiblingSkills(
  skillName: string,
  searchDirs: string[] = getSyntheticSkillSearchDirs(),
): string[] {
  const target = skillName.trim().toLowerCase();
  if (target === "") return [];

  // A family only exists when there is a non-empty prefix before the first dash.
  const dashIndex = target.indexOf("-");
  const familyMarker = dashIndex > 0 ? `${target.slice(0, dashIndex)}-` : "";

  const relatives: string[] = [];
  const others: string[] = [];
  for (const candidate of findInstalledSkillNames(searchDirs)) {
    const lowered = candidate.toLowerCase();
    if (lowered === target) continue;
    if (familyMarker !== "" && lowered.startsWith(familyMarker)) {
      relatives.push(candidate);
    } else {
      others.push(candidate);
    }
  }

  const byName = (left: string, right: string): number => left.localeCompare(right);
  relatives.sort(byName);
  others.sort(byName);

  return [...relatives, ...others].slice(0, 5);
}
|
|
83
|
+
|
|
84
|
+
/**
 * Splits the positive and negative budgets into per-category targets.
 *
 * Positives: ~20% explicit, ~40% contextual, remainder implicit.
 * Negatives: ~40% sibling-skill confusion (only when sibling skills exist),
 * ~20% reserved for unrelated cases, remainder adjacent wrong-intent.
 *
 * Each share is at least 1 when the budget allows it, but the categories
 * never add up to more than the budget: positives always sum to
 * `maxPositives` and negatives to `maxNegatives`. For small budgets the
 * later categories drop to 0, and a zero budget yields all-zero targets.
 * Budgets of 3 or more produce the same split as the unclamped formula.
 *
 * @param maxPositives Total positive queries wanted; non-finite or negative
 *   values are treated as 0 and fractions are floored.
 * @param maxNegatives Total negative queries wanted; same coercion.
 * @param hasSiblingSkills Whether sibling-skill negatives should get a share.
 * @returns The per-category counts.
 */
function buildPromptFamilyTargets(
  maxPositives: number,
  maxNegatives: number,
  hasSiblingSkills: boolean,
): PromptFamilyTargets {
  const toBudget = (value: number): number =>
    Number.isFinite(value) ? Math.max(0, Math.floor(value)) : 0;
  // A category's nominal share: the rounded fraction, but never below 1.
  const share = (budget: number, fraction: number): number =>
    Math.max(1, Math.round(budget * fraction));

  const positiveBudget = toBudget(maxPositives);
  const explicitCount = Math.min(positiveBudget, share(positiveBudget, 0.2));
  const contextualCount = Math.min(positiveBudget - explicitCount, share(positiveBudget, 0.4));
  const implicitCount = positiveBudget - explicitCount - contextualCount;

  const negativeBudget = toBudget(maxNegatives);
  const siblingNegativeCount = hasSiblingSkills
    ? Math.min(negativeBudget, share(negativeBudget, 0.4))
    : 0;
  const afterSiblings = negativeBudget - siblingNegativeCount;
  // Reserve the unrelated share first; adjacent cases take what is left.
  const adjacentNegativeCount = Math.min(
    afterSiblings,
    Math.max(1, afterSiblings - share(negativeBudget, 0.2)),
  );
  const unrelatedNegativeCount = afterSiblings - adjacentNegativeCount;

  return {
    explicitCount,
    implicitCount,
    contextualCount,
    siblingNegativeCount,
    adjacentNegativeCount,
    unrelatedNegativeCount,
  };
}
|
|
113
|
+
|
|
114
|
+
/**
 * Canonical form of a query for duplicate detection: surrounding whitespace
 * removed, lowercased, and every internal whitespace run collapsed to a
 * single space.
 */
function normalizeEvalQuery(query: string): string {
  const lowered = query.trim().toLowerCase();
  const words = lowered.split(/\s+/);
  return words.join(" ");
}
|
|
117
|
+
|
|
118
|
+
/**
 * Drops entries whose normalized query was already seen with the same
 * `should_trigger` polarity, keeping the first occurrence and the input order.
 * A positive and a negative entry with the same query are both retained.
 */
function dedupeEvalEntries(entries: EvalEntry[]): EvalEntry[] {
  const fingerprints = new Set<string>();
  return entries.filter((candidate) => {
    const polarity = candidate.should_trigger ? "p" : "n";
    const fingerprint = `${polarity}:${normalizeEvalQuery(candidate.query)}`;
    if (fingerprints.has(fingerprint)) {
      return false;
    }
    fingerprints.add(fingerprint);
    return true;
  });
}
|
|
129
|
+
|
|
130
|
+
/**
 * Returns a new array with the first `count` entries. Zero or negative counts
 * yield an empty array, so a negative value is never passed to `slice` as an
 * end index.
 */
function takeEntries(entries: EvalEntry[], count: number): EvalEntry[] {
  return count > 0 ? entries.slice(0, count) : [];
}
|
|
134
|
+
|
|
135
|
+
/**
 * Selects a balanced benchmark of at most `maxPositives` positive and
 * `maxNegatives` negative entries from a candidate pool.
 *
 * Positives are seeded per invocation type (explicit, implicit, contextual)
 * according to `buildPromptFamilyTargets`, then topped up from the remaining
 * positives in input order. Negatives are seeded with sibling-skill mentions
 * up to the sibling quota, then filled with the other negatives, then topped
 * up from any negative not yet selected. Top-ups skip queries whose
 * normalized text is already selected.
 *
 * @param entries Candidate pool (positives and negatives mixed).
 * @param maxPositives Maximum positives to return; values below 0 count as 0.
 * @param maxNegatives Maximum negatives to return; values below 0 count as 0.
 * @param siblingSkills Either the sibling skill names (a negative counts as a
 *   sibling mention when its lowercased query contains one of them), or `true`
 *   to enable the sibling quota with the built-in pattern heuristic, or
 *   `false` for no sibling handling.
 * @returns Selected positives followed by selected negatives.
 */
export function selectBalancedEvalEntries(
  entries: EvalEntry[],
  maxPositives: number,
  maxNegatives: number,
  siblingSkills: string[] | boolean,
): EvalEntry[] {
  // Guard the final slices: a negative end index would count from the tail.
  const positiveLimit = Math.max(0, maxPositives);
  const negativeLimit = Math.max(0, maxNegatives);

  const normalizedSiblingSkills = Array.isArray(siblingSkills)
    ? siblingSkills.map((skill) => skill.trim().toLowerCase()).filter(Boolean)
    : [];
  const useSiblingHeuristic = siblingSkills === true;
  // The boolean form must also reserve a sibling quota; otherwise the
  // heuristic matches are excluded from the main pool and end up last.
  const hasSiblingSkills = normalizedSiblingSkills.length > 0 || useSiblingHeuristic;
  const targets = buildPromptFamilyTargets(maxPositives, maxNegatives, hasSiblingSkills);

  const positives = entries.filter((entry) => entry.should_trigger);
  const negatives = entries.filter((entry) => !entry.should_trigger);

  // Appends not-yet-selected entries (by normalized query) until `limit` is reached.
  const topUp = (selected: EvalEntry[], pool: EvalEntry[], limit: number): void => {
    const seenKeys = new Set(selected.map((entry) => normalizeEvalQuery(entry.query)));
    for (const entry of pool) {
      if (selected.length >= limit) break;
      const key = normalizeEvalQuery(entry.query);
      if (seenKeys.has(key)) continue;
      seenKeys.add(key);
      selected.push(entry);
    }
  };

  const positivesOfType = (invocationType: string): EvalEntry[] =>
    positives.filter((entry) => entry.invocation_type === invocationType);
  const selectedPositives = [
    ...takeEntries(positivesOfType("explicit"), targets.explicitCount),
    ...takeEntries(positivesOfType("implicit"), targets.implicitCount),
    ...takeEntries(positivesOfType("contextual"), targets.contextualCount),
  ];
  // Covers both leftover typed positives and positives with any other type.
  topUp(selectedPositives, positives, positiveLimit);

  const siblingHeuristic =
    /(^|[\s/$-])(sc-[a-z0-9-]+|mentor cli|State Change mentor CLI|resource\s+\d+|mental model)/i;
  let siblingMentions: EvalEntry[] = [];
  if (normalizedSiblingSkills.length > 0) {
    siblingMentions = negatives.filter((entry) => {
      const normalizedQuery = entry.query.toLowerCase();
      return normalizedSiblingSkills.some((skill) => normalizedQuery.includes(skill));
    });
  } else if (useSiblingHeuristic) {
    siblingMentions = negatives.filter((entry) => siblingHeuristic.test(entry.query));
  }

  const siblingMentionSet = new Set(siblingMentions);
  const nonSiblingNegatives = negatives.filter((entry) => !siblingMentionSet.has(entry));
  const siblingPicks = takeEntries(siblingMentions, targets.siblingNegativeCount);
  const selectedNegatives = [
    ...siblingPicks,
    ...takeEntries(nonSiblingNegatives, maxNegatives - siblingPicks.length),
  ];
  topUp(selectedNegatives, negatives, negativeLimit);

  return [
    ...selectedPositives.slice(0, positiveLimit),
    ...selectedNegatives.slice(0, negativeLimit),
  ];
}
|
|
206
|
+
|
|
31
207
|
// ---------------------------------------------------------------------------
|
|
32
208
|
// Prompt building
|
|
33
209
|
// ---------------------------------------------------------------------------
|
|
@@ -37,21 +213,38 @@ export function buildSyntheticPrompt(
|
|
|
37
213
|
skillName: string,
|
|
38
214
|
maxPositives: number,
|
|
39
215
|
maxNegatives: number,
|
|
40
|
-
realExamples?:
|
|
216
|
+
realExamples?: SyntheticPromptRealExamples,
|
|
217
|
+
siblingSkills: string[] = [],
|
|
41
218
|
): { system: string; user: string } {
|
|
219
|
+
const {
|
|
220
|
+
explicitCount,
|
|
221
|
+
implicitCount,
|
|
222
|
+
contextualCount,
|
|
223
|
+
siblingNegativeCount,
|
|
224
|
+
adjacentNegativeCount,
|
|
225
|
+
unrelatedNegativeCount,
|
|
226
|
+
} = buildPromptFamilyTargets(maxPositives, maxNegatives, siblingSkills.length > 0);
|
|
227
|
+
|
|
42
228
|
const system = `You are generating test queries for a coding agent skill. Given the skill description below, generate realistic user queries.
|
|
43
229
|
|
|
230
|
+
Your job is to create a SMALL, TARGETED benchmark for cold-start routing quality.
|
|
231
|
+
|
|
44
232
|
For POSITIVE queries (should trigger this skill):
|
|
45
|
-
- Generate a mix of:
|
|
233
|
+
- Generate a balanced mix of:
|
|
46
234
|
- Explicit: directly names the skill or uses $${skillName} syntax
|
|
47
235
|
- Implicit: describes the task without naming the skill
|
|
48
|
-
- Contextual: natural language with domain context, proper nouns,
|
|
49
|
-
-
|
|
236
|
+
- Contextual: realistic natural language with domain context, proper nouns, filenames, or setup noise
|
|
237
|
+
- Avoid merely paraphrasing bullet points from the skill
|
|
238
|
+
- Prefer realistic user phrasing over polished product copy
|
|
239
|
+
- Include at least a few prompts that test the edge of the skill's scope, not just the obvious center
|
|
50
240
|
|
|
51
241
|
For NEGATIVE queries (should NOT trigger this skill):
|
|
52
|
-
-
|
|
53
|
-
-
|
|
54
|
-
-
|
|
242
|
+
- Include hard negative controls:
|
|
243
|
+
- sibling-skill confusion cases
|
|
244
|
+
- topically adjacent but wrong-intent cases
|
|
245
|
+
- clearly unrelated cases
|
|
246
|
+
- Make the hard negatives plausible, not cartoonishly unrelated
|
|
247
|
+
- If a query belongs to another installed skill, make that obvious from the task itself
|
|
55
248
|
|
|
56
249
|
Output as JSON array with no surrounding text:
|
|
57
250
|
[{"query": "...", "should_trigger": true, "invocation_type": "explicit|implicit|contextual|negative"}]`;
|
|
@@ -61,7 +254,19 @@ Output as JSON array with no surrounding text:
|
|
|
61
254
|
Skill content:
|
|
62
255
|
${skillContent}
|
|
63
256
|
|
|
64
|
-
Generate exactly ${maxPositives} positive queries (should_trigger: true) and ${maxNegatives} negative queries (should_trigger: false).
|
|
257
|
+
Generate exactly ${maxPositives} positive queries (should_trigger: true) and ${maxNegatives} negative queries (should_trigger: false).
|
|
258
|
+
|
|
259
|
+
Required positive mix:
|
|
260
|
+
- ${explicitCount} explicit
|
|
261
|
+
- ${implicitCount} implicit
|
|
262
|
+
- ${contextualCount} contextual
|
|
263
|
+
|
|
264
|
+
Required negative mix:
|
|
265
|
+
- ${siblingNegativeCount} sibling-skill confusion cases
|
|
266
|
+
- ${adjacentNegativeCount} adjacent but wrong-intent cases
|
|
267
|
+
- ${unrelatedNegativeCount} clearly unrelated cases
|
|
268
|
+
|
|
269
|
+
Return ONLY the JSON array.`;
|
|
65
270
|
|
|
66
271
|
if (realExamples && (realExamples.positive.length > 0 || realExamples.negative.length > 0)) {
|
|
67
272
|
const parts: string[] = ["\n\nReal user queries for style and phrasing reference:"];
|
|
@@ -77,6 +282,61 @@ Generate exactly ${maxPositives} positive queries (should_trigger: true) and ${m
|
|
|
77
282
|
user += parts.join("\n");
|
|
78
283
|
}
|
|
79
284
|
|
|
285
|
+
if (siblingSkills.length > 0) {
|
|
286
|
+
user += `\n\nNearby installed skills to use for boundary-setting hard negatives:\n${siblingSkills
|
|
287
|
+
.map((skill) => `- ${skill}`)
|
|
288
|
+
.join(
|
|
289
|
+
"\n",
|
|
290
|
+
)}\n\nAt least ${siblingNegativeCount} negative queries should clearly belong to one of these sibling skills instead of ${skillName}.`;
|
|
291
|
+
}
|
|
292
|
+
|
|
293
|
+
return { system, user };
|
|
294
|
+
}
|
|
295
|
+
|
|
296
|
+
/**
 * Builds the system/user prompt pair for the refinement pass, which asks the
 * model to critique and prune a candidate pool into the final benchmark.
 *
 * The user prompt contains the skill name and content, the target number of
 * positives and negatives together with the per-category mix computed by
 * `buildPromptFamilyTargets`, the sibling skill list (only when non-empty),
 * the candidate pool serialized as indented JSON, and the pruning
 * instructions.
 *
 * @param skillContent Skill text embedded verbatim in the user prompt.
 * @param skillName Skill name embedded in the user prompt.
 * @param candidates Candidate entries to refine.
 * @param maxPositives Target number of positives in the final benchmark.
 * @param maxNegatives Target number of negatives in the final benchmark.
 * @param siblingSkills Sibling skill names; a non-empty list also enables the
 *   sibling-negative share of the target mix.
 * @returns The `system` and `user` prompt strings.
 */
export function buildSyntheticRefinementPrompt(
|
|
297
|
+
skillContent: string,
|
|
298
|
+
skillName: string,
|
|
299
|
+
candidates: EvalEntry[],
|
|
300
|
+
maxPositives: number,
|
|
301
|
+
maxNegatives: number,
|
|
302
|
+
siblingSkills: string[] = [],
|
|
303
|
+
): { system: string; user: string } {
|
|
304
|
+
// Per-category counts quoted in the "Target final benchmark" section below.
const targets = buildPromptFamilyTargets(maxPositives, maxNegatives, siblingSkills.length > 0);
|
|
305
|
+
const system = `You are refining a cold-start eval benchmark for a coding agent skill.
|
|
306
|
+
|
|
307
|
+
Your job is to critique and prune a candidate pool into a SMALL, SHARP benchmark.
|
|
308
|
+
|
|
309
|
+
For each candidate, reason using binary questions:
|
|
310
|
+
- Is this realistic user phrasing?
|
|
311
|
+
- Is this more than a trivial paraphrase of the skill bullets?
|
|
312
|
+
- Does this clearly test in-scope behavior, or clearly test a boundary?
|
|
313
|
+
- For negatives: does it clearly belong elsewhere or represent a plausible wrong-intent adjacent request?
|
|
314
|
+
- Is it sufficiently distinct from the other selected prompts?
|
|
315
|
+
|
|
316
|
+
Return ONLY a JSON array with the final benchmark.`;
|
|
317
|
+
|
|
318
|
+
const user = `Skill name: ${skillName}
|
|
319
|
+
|
|
320
|
+
Skill content:
|
|
321
|
+
${skillContent}
|
|
322
|
+
|
|
323
|
+
Target final benchmark:
|
|
324
|
+
- ${maxPositives} positives
|
|
325
|
+
- ${maxNegatives} negatives
|
|
326
|
+
- Positive mix: ${targets.explicitCount} explicit, ${targets.implicitCount} implicit, ${targets.contextualCount} contextual
|
|
327
|
+
- Negative mix: ${targets.siblingNegativeCount} sibling-skill confusion, ${targets.adjacentNegativeCount} adjacent wrong-intent, ${targets.unrelatedNegativeCount} unrelated
|
|
328
|
+
|
|
329
|
+
${siblingSkills.length > 0 ? `Sibling skills for hard-negative boundaries:\n${siblingSkills.map((skill) => `- ${skill}`).join("\n")}\n` : ""}
|
|
330
|
+
Candidate pool:
|
|
331
|
+
${JSON.stringify(candidates, null, 2)}
|
|
332
|
+
|
|
333
|
+
Instructions:
|
|
334
|
+
- Remove duplicates and near-duplicates
|
|
335
|
+
- Prefer prompts that test trigger boundaries, not just center-of-mass obvious usage
|
|
336
|
+
- Keep sibling-skill negatives if they are strong boundary tests
|
|
337
|
+
- Keep the final set compact, diverse, and realistic
|
|
338
|
+
- Return ONLY the final JSON array`;
|
|
339
|
+
|
|
80
340
|
return { system, user };
|
|
81
341
|
}
|
|
82
342
|
|
|
@@ -172,8 +432,10 @@ export async function generateSyntheticEvals(
|
|
|
172
432
|
): Promise<EvalEntry[]> {
|
|
173
433
|
const maxPositives = options.maxPositives ?? 15;
|
|
174
434
|
const maxNegatives = options.maxNegatives ?? 10;
|
|
435
|
+
const oversampleFactor = 2;
|
|
175
436
|
|
|
176
437
|
const skillContent = readFileSync(skillPath, "utf-8");
|
|
438
|
+
const siblingSkills = inferSiblingSkills(skillName);
|
|
177
439
|
|
|
178
440
|
// Load real query examples from the database for few-shot style guidance.
|
|
179
441
|
// Uses dynamic imports since SQLite may not be available in all contexts.
|
|
@@ -214,11 +476,36 @@ export async function generateSyntheticEvals(
|
|
|
214
476
|
const { system, user } = buildSyntheticPrompt(
|
|
215
477
|
skillContent,
|
|
216
478
|
skillName,
|
|
217
|
-
maxPositives,
|
|
218
|
-
maxNegatives,
|
|
479
|
+
maxPositives * oversampleFactor,
|
|
480
|
+
maxNegatives * oversampleFactor,
|
|
219
481
|
realExamples,
|
|
482
|
+
siblingSkills,
|
|
220
483
|
);
|
|
221
484
|
|
|
222
485
|
const raw = await callLlm(system, user, agent, options.modelFlag);
|
|
223
|
-
|
|
486
|
+
const firstPass = dedupeEvalEntries(parseSyntheticResponse(raw, skillName));
|
|
487
|
+
|
|
488
|
+
try {
|
|
489
|
+
const refinement = buildSyntheticRefinementPrompt(
|
|
490
|
+
skillContent,
|
|
491
|
+
skillName,
|
|
492
|
+
firstPass,
|
|
493
|
+
maxPositives,
|
|
494
|
+
maxNegatives,
|
|
495
|
+
siblingSkills,
|
|
496
|
+
);
|
|
497
|
+
const refinedRaw = await callLlm(refinement.system, refinement.user, agent, options.modelFlag);
|
|
498
|
+
const refined = dedupeEvalEntries(parseSyntheticResponse(refinedRaw, skillName));
|
|
499
|
+
const selected = selectBalancedEvalEntries(refined, maxPositives, maxNegatives, siblingSkills);
|
|
500
|
+
if (
|
|
501
|
+
selected.filter((entry) => entry.should_trigger).length >= maxPositives &&
|
|
502
|
+
selected.filter((entry) => !entry.should_trigger).length >= maxNegatives
|
|
503
|
+
) {
|
|
504
|
+
return selected;
|
|
505
|
+
}
|
|
506
|
+
} catch {
|
|
507
|
+
// fall through to first-pass selection
|
|
508
|
+
}
|
|
509
|
+
|
|
510
|
+
return selectBalancedEvalEntries(firstPass, maxPositives, maxNegatives, siblingSkills);
|
|
224
511
|
}
|
package/cli/selftune/export.ts
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Export SQLite data to JSONL format.
|
|
3
|
-
*
|
|
4
|
-
*
|
|
3
|
+
* Use this only when you explicitly need portable/debuggable JSONL snapshots
|
|
4
|
+
* for recovery, the contribute workflow, or external tools.
|
|
5
5
|
*/
|
|
6
6
|
import { mkdirSync, writeFileSync } from "node:fs";
|
|
7
7
|
import { join } from "node:path";
|
package/cli/selftune/index.ts
CHANGED
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
* selftune ingest <agent> — Ingest agent sessions (claude, codex, opencode, openclaw, wrap-codex)
|
|
7
7
|
* selftune grade [mode] — Grade skill sessions (auto, baseline)
|
|
8
8
|
* selftune evolve [target] — Evolve skill descriptions (body, rollback)
|
|
9
|
-
* selftune eval <action> — Evaluation tools (generate, unit-test, import, composability)
|
|
9
|
+
* selftune eval <action> — Evaluation tools (generate, unit-test, import, composability, family-overlap)
|
|
10
10
|
* selftune sync — Sync source-truth telemetry across supported agents
|
|
11
11
|
* selftune orchestrate — Run autonomous core loop (sync → status → evolve → watch)
|
|
12
12
|
* selftune init — Initialize agent identity and config
|
|
@@ -19,11 +19,14 @@
|
|
|
19
19
|
* selftune cron — Scheduling & automation (setup, list, remove)
|
|
20
20
|
* selftune badge — Generate skill health badges for READMEs
|
|
21
21
|
* selftune contribute — Export anonymized skill data for community
|
|
22
|
+
* selftune contributions — Manage creator-directed sharing preferences
|
|
23
|
+
* selftune creator-contributions — Manage creator-side contribution configs
|
|
22
24
|
* selftune workflows — Discover and manage multi-skill workflows
|
|
23
25
|
* selftune quickstart — Guided onboarding: init, ingest, status, and suggestions
|
|
24
26
|
* selftune repair-skill-usage — Rebuild trustworthy skill usage from transcripts
|
|
25
|
-
* selftune export — Export SQLite data to JSONL
|
|
27
|
+
* selftune export — Export SQLite data to JSONL snapshots
|
|
26
28
|
* selftune export-canonical — Export canonical telemetry for downstream ingestion
|
|
29
|
+
* selftune recover — Recover SQLite from legacy/exported JSONL
|
|
27
30
|
* selftune telemetry — Manage anonymous usage analytics (status, enable, disable)
|
|
28
31
|
* selftune alpha <subcommand> — Alpha program management (upload)
|
|
29
32
|
* selftune hook <name> — Run a hook by name (prompt-log, session-stop, etc.)
|
|
@@ -46,7 +49,7 @@ Commands:
|
|
|
46
49
|
ingest <agent> Ingest agent sessions (claude, codex, opencode, openclaw, wrap-codex)
|
|
47
50
|
grade [mode] Grade skill sessions (auto, baseline)
|
|
48
51
|
evolve [target] Evolve skill descriptions (body, rollback)
|
|
49
|
-
eval <action> Evaluation tools (generate, unit-test, import, composability)
|
|
52
|
+
eval <action> Evaluation tools (generate, unit-test, import, composability, family-overlap)
|
|
50
53
|
sync Sync source-truth telemetry across supported agents
|
|
51
54
|
orchestrate Run autonomous core loop (sync → status → evolve → watch)
|
|
52
55
|
init Initialize agent identity and config
|
|
@@ -59,11 +62,14 @@ Commands:
|
|
|
59
62
|
cron Scheduling & automation (setup, list, remove)
|
|
60
63
|
badge Generate skill health badges for READMEs
|
|
61
64
|
contribute Export anonymized skill data for community
|
|
65
|
+
contributions Manage creator-directed sharing preferences
|
|
66
|
+
creator-contributions Manage creator-side contribution configs
|
|
62
67
|
workflows Discover and manage multi-skill workflows
|
|
63
68
|
quickstart Guided onboarding: init, ingest, status, and suggestions
|
|
64
69
|
repair-skill-usage Rebuild trustworthy skill usage from transcripts
|
|
65
|
-
export Export SQLite data to JSONL
|
|
70
|
+
export Export SQLite data to JSONL snapshots
|
|
66
71
|
export-canonical Export canonical telemetry for downstream ingestion
|
|
72
|
+
recover Recover SQLite from legacy/exported JSONL
|
|
67
73
|
alpha <subcommand> Alpha program management (upload)
|
|
68
74
|
telemetry Manage anonymous usage analytics (status, enable, disable)
|
|
69
75
|
hook <name> Run a hook by name (prompt-log, session-stop, etc.)
|
|
@@ -254,6 +260,7 @@ Actions:
|
|
|
254
260
|
unit-test Run or generate skill unit tests
|
|
255
261
|
import Import SkillsBench task corpus as eval entries
|
|
256
262
|
composability Analyze skill co-occurrence conflicts
|
|
263
|
+
family-overlap Detect sibling-skill overlap and consolidation pressure
|
|
257
264
|
|
|
258
265
|
Run 'selftune eval <action> --help' for action-specific options.`);
|
|
259
266
|
process.exit(0);
|
|
@@ -341,6 +348,17 @@ Run 'selftune eval <action> --help' for action-specific options.`);
|
|
|
341
348
|
console.log(JSON.stringify(report, null, 2));
|
|
342
349
|
break;
|
|
343
350
|
}
|
|
351
|
+
case "family-overlap": {
|
|
352
|
+
if (process.argv[2] === "--help" || process.argv[2] === "-h") {
|
|
353
|
+
console.log(
|
|
354
|
+
"selftune eval family-overlap --prefix <family-> | --skills <a,b,c> [--parent-skill <name>] [--min-overlap 0.3] [--min-shared 2]",
|
|
355
|
+
);
|
|
356
|
+
process.exit(0);
|
|
357
|
+
}
|
|
358
|
+
const { cliMain } = await import("./eval/family-overlap.js");
|
|
359
|
+
await cliMain();
|
|
360
|
+
break;
|
|
361
|
+
}
|
|
344
362
|
default:
|
|
345
363
|
throw new CLIError(
|
|
346
364
|
`Unknown eval action: ${sub}`,
|
|
@@ -368,6 +386,16 @@ Run 'selftune eval <action> --help' for action-specific options.`);
|
|
|
368
386
|
await cliMain();
|
|
369
387
|
break;
|
|
370
388
|
}
|
|
389
|
+
case "contributions": {
|
|
390
|
+
const { cliMain } = await import("./contributions.js");
|
|
391
|
+
await cliMain();
|
|
392
|
+
break;
|
|
393
|
+
}
|
|
394
|
+
case "creator-contributions": {
|
|
395
|
+
const { cliMain } = await import("./creator-contributions.js");
|
|
396
|
+
await cliMain();
|
|
397
|
+
break;
|
|
398
|
+
}
|
|
371
399
|
case "watch": {
|
|
372
400
|
const { cliMain } = await import("./monitoring/watch.js");
|
|
373
401
|
await cliMain();
|
|
@@ -527,11 +555,14 @@ Run 'selftune cron <subcommand> --help' for subcommand-specific options.`);
|
|
|
527
555
|
throw new CLIError(`Invalid arguments: ${message}`, "INVALID_FLAG", "selftune export --help");
|
|
528
556
|
}
|
|
529
557
|
if (values.help) {
|
|
530
|
-
console.log(`selftune export — Export SQLite data to JSONL
|
|
558
|
+
console.log(`selftune export — Export SQLite data to JSONL snapshots
|
|
531
559
|
|
|
532
560
|
Usage:
|
|
533
561
|
selftune export [tables...] [options]
|
|
534
562
|
|
|
563
|
+
Use this for portability, debugging, contribute flows, or explicit recovery
|
|
564
|
+
snapshots. Normal runtime reads and writes stay in SQLite.
|
|
565
|
+
|
|
535
566
|
Tables (default: all):
|
|
536
567
|
telemetry Session telemetry records
|
|
537
568
|
skills Skill usage records
|
|
@@ -570,6 +601,11 @@ Options:
|
|
|
570
601
|
cliMain();
|
|
571
602
|
break;
|
|
572
603
|
}
|
|
604
|
+
case "recover": {
|
|
605
|
+
const { cliMain } = await import("./recover.js");
|
|
606
|
+
cliMain();
|
|
607
|
+
break;
|
|
608
|
+
}
|
|
573
609
|
case "orchestrate": {
|
|
574
610
|
const { cliMain } = await import("./orchestrate.js");
|
|
575
611
|
await cliMain();
|
|
@@ -52,9 +52,9 @@ import type {
|
|
|
52
52
|
import { handleCLIError } from "../utils/cli-error.js";
|
|
53
53
|
import { loadMarker, saveMarker } from "../utils/jsonl.js";
|
|
54
54
|
import { extractActionableQueryText } from "../utils/query-filter.js";
|
|
55
|
+
import { getInternalPromptTargetSkill, isWrappedNonUserPart } from "../utils/skill-detection.js";
|
|
55
56
|
import {
|
|
56
57
|
classifySkillPath,
|
|
57
|
-
containsWholeSkillMention,
|
|
58
58
|
extractExplicitSkillMentions,
|
|
59
59
|
extractSkillNamesFromInstructions,
|
|
60
60
|
extractSkillNamesFromPathReferences,
|
|
@@ -228,6 +228,15 @@ export function parseRolloutFile(path: string, skillNames: Set<string>): ParsedR
|
|
|
228
228
|
let observedCwd: string | undefined;
|
|
229
229
|
const sessionSkillNames = new Set(skillNames);
|
|
230
230
|
let hasActionablePrompt = false;
|
|
231
|
+
const markSkillTriggered = (skillName: string, evidence: "explicit" | "inferred"): void => {
|
|
232
|
+
if (!skillsTriggered.includes(skillName)) {
|
|
233
|
+
skillsTriggered.push(skillName);
|
|
234
|
+
}
|
|
235
|
+
const existingEvidence = skillEvidence.get(skillName);
|
|
236
|
+
if (existingEvidence !== "explicit") {
|
|
237
|
+
skillEvidence.set(skillName, evidence);
|
|
238
|
+
}
|
|
239
|
+
};
|
|
231
240
|
const rememberSessionSkillNames = (text: unknown): void => {
|
|
232
241
|
if (typeof text !== "string" || !text) return;
|
|
233
242
|
for (const skillName of extractSkillNamesFromInstructions(text, sessionSkillNames)) {
|
|
@@ -240,33 +249,23 @@ export function parseRolloutFile(path: string, skillNames: Set<string>): ParsedR
|
|
|
240
249
|
sessionSkillNames.add(skillName);
|
|
241
250
|
}
|
|
242
251
|
};
|
|
243
|
-
const detectTriggeredSkills = (text: unknown): void => {
|
|
244
|
-
if (typeof text !== "string" || !text) return;
|
|
245
|
-
for (const skillName of sessionSkillNames) {
|
|
246
|
-
if (containsWholeSkillMention(text, skillName) && !skillsTriggered.includes(skillName)) {
|
|
247
|
-
skillsTriggered.push(skillName);
|
|
248
|
-
}
|
|
249
|
-
if (containsWholeSkillMention(text, skillName) && !skillEvidence.has(skillName)) {
|
|
250
|
-
skillEvidence.set(skillName, "inferred");
|
|
251
|
-
}
|
|
252
|
-
}
|
|
253
|
-
};
|
|
254
252
|
const detectExplicitPromptSkillMentions = (text: unknown): void => {
|
|
255
253
|
if (typeof text !== "string" || !text) return;
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
254
|
+
if (isWrappedNonUserPart(text)) return;
|
|
255
|
+
const actionableText = extractActionableQueryText(text) ?? text;
|
|
256
|
+
const internalTargetSkill = getInternalPromptTargetSkill(actionableText, sessionSkillNames);
|
|
257
|
+
if (internalTargetSkill) {
|
|
258
|
+
markSkillTriggered(internalTargetSkill, "explicit");
|
|
259
|
+
return;
|
|
260
|
+
}
|
|
261
|
+
for (const skillName of extractExplicitSkillMentions(actionableText, sessionSkillNames)) {
|
|
262
|
+
markSkillTriggered(skillName, "explicit");
|
|
261
263
|
}
|
|
262
264
|
};
|
|
263
265
|
const detectExplicitSkillReads = (text: unknown): void => {
|
|
264
266
|
if (typeof text !== "string" || !text) return;
|
|
265
267
|
for (const skillName of extractSkillNamesFromPathReferences(text, sessionSkillNames)) {
|
|
266
|
-
|
|
267
|
-
skillsTriggered.push(skillName);
|
|
268
|
-
}
|
|
269
|
-
skillEvidence.set(skillName, "explicit");
|
|
268
|
+
markSkillTriggered(skillName, "explicit");
|
|
270
269
|
}
|
|
271
270
|
};
|
|
272
271
|
const rememberPromptCandidate = (value: unknown): void => {
|
|
@@ -352,27 +351,26 @@ export function parseRolloutFile(path: string, skillNames: Set<string>): ParsedR
|
|
|
352
351
|
if (itemType === "function_call") {
|
|
353
352
|
const fnName = (payload.name as string) ?? "function_call";
|
|
354
353
|
toolCalls[fnName] = (toolCalls[fnName] ?? 0) + 1;
|
|
355
|
-
//
|
|
354
|
+
// Only path-based skill references count as triggers here.
|
|
356
355
|
detectExplicitSkillReads(payload.arguments);
|
|
357
|
-
detectTriggeredSkills(payload.arguments);
|
|
358
356
|
} else if (itemType === "agent_reasoning") {
|
|
359
357
|
toolCalls.reasoning = (toolCalls.reasoning ?? 0) + 1;
|
|
360
|
-
detectTriggeredSkills(payload.text);
|
|
361
358
|
} else if (itemType === "message") {
|
|
362
|
-
const
|
|
359
|
+
const parts = Array.isArray(payload.content)
|
|
363
360
|
? payload.content
|
|
364
361
|
.map((part) =>
|
|
365
362
|
typeof part === "object" && part
|
|
366
363
|
? (((part as Record<string, unknown>).text as string | undefined) ?? "")
|
|
367
364
|
: "",
|
|
368
365
|
)
|
|
369
|
-
.
|
|
370
|
-
:
|
|
366
|
+
.filter(Boolean)
|
|
367
|
+
: [];
|
|
368
|
+
const content = parts.join("\n");
|
|
371
369
|
rememberSessionSkillNames(content);
|
|
372
|
-
if ((payload.role as string) === "
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
370
|
+
if ((payload.role as string) === "user") {
|
|
371
|
+
for (const part of parts) {
|
|
372
|
+
detectExplicitPromptSkillMentions(part);
|
|
373
|
+
}
|
|
376
374
|
}
|
|
377
375
|
}
|
|
378
376
|
} else if (etype === "turn.started") {
|
|
@@ -410,10 +408,8 @@ export function parseRolloutFile(path: string, skillNames: Set<string>): ParsedR
|
|
|
410
408
|
}
|
|
411
409
|
|
|
412
410
|
// Detect path-based skill references in executed commands
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
if (etype === "item.completed") {
|
|
416
|
-
detectTriggeredSkills(textContent);
|
|
411
|
+
if (itemType === "command_execution") {
|
|
412
|
+
detectExplicitSkillReads(item.command);
|
|
417
413
|
}
|
|
418
414
|
} else if (etype === "error") {
|
|
419
415
|
errors += 1;
|