selftune 0.2.31 → 0.2.32
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +83 -56
- package/apps/local-dashboard/dist/assets/index-B-ut4w0B.js +15 -0
- package/apps/local-dashboard/dist/assets/index-BFGfCVrL.css +1 -0
- package/apps/local-dashboard/dist/assets/vendor-ui-DfowE3Hu.js +1 -0
- package/apps/local-dashboard/dist/index.html +3 -3
- package/cli/selftune/command-surface.ts +613 -2
- package/cli/selftune/create/baseline.ts +429 -0
- package/cli/selftune/create/check.ts +35 -0
- package/cli/selftune/create/init.ts +115 -0
- package/cli/selftune/create/package-candidate-state.ts +771 -0
- package/cli/selftune/create/package-evaluator.ts +710 -0
- package/cli/selftune/create/package-fingerprint.ts +142 -0
- package/cli/selftune/create/package-search.ts +377 -0
- package/cli/selftune/create/publish.ts +431 -0
- package/cli/selftune/create/readiness.ts +495 -0
- package/cli/selftune/create/replay.ts +330 -0
- package/cli/selftune/create/report.ts +74 -0
- package/cli/selftune/create/scaffold.ts +121 -0
- package/cli/selftune/create/skills-ref-adapter.ts +177 -0
- package/cli/selftune/create/status.ts +33 -0
- package/cli/selftune/create/templates.ts +249 -0
- package/cli/selftune/cron/setup.ts +1 -1
- package/cli/selftune/dashboard-action-events.ts +4 -1
- package/cli/selftune/dashboard-action-result.ts +789 -24
- package/cli/selftune/dashboard-action-stream.ts +80 -0
- package/cli/selftune/dashboard-contract.ts +146 -3
- package/cli/selftune/dashboard-server.ts +5 -4
- package/cli/selftune/eval/hooks-to-evals.ts +58 -35
- package/cli/selftune/eval/synthetic-evals.ts +145 -17
- package/cli/selftune/evolution/bounded-mutations.ts +1045 -0
- package/cli/selftune/evolution/evolve-body.ts +9 -36
- package/cli/selftune/evolution/evolve.ts +8 -72
- package/cli/selftune/evolution/stopping-criteria.ts +5 -13
- package/cli/selftune/evolution/unblock-suggestions.ts +0 -16
- package/cli/selftune/evolution/validate-host-replay.ts +115 -15
- package/cli/selftune/improve.ts +206 -0
- package/cli/selftune/index.ts +123 -6
- package/cli/selftune/init.ts +1 -1
- package/cli/selftune/localdb/queries/dashboard.ts +30 -0
- package/cli/selftune/localdb/schema.ts +52 -0
- package/cli/selftune/monitoring/watch.ts +257 -23
- package/cli/selftune/orchestrate/execute.ts +300 -1
- package/cli/selftune/orchestrate/finalize.ts +14 -0
- package/cli/selftune/orchestrate/plan.ts +22 -5
- package/cli/selftune/orchestrate/prepare.ts +59 -4
- package/cli/selftune/orchestrate/report.ts +1 -1
- package/cli/selftune/orchestrate.ts +34 -1
- package/cli/selftune/publish.ts +35 -0
- package/cli/selftune/routes/actions.ts +81 -15
- package/cli/selftune/routes/overview.ts +1 -1
- package/cli/selftune/routes/skill-report.ts +147 -2
- package/cli/selftune/run.ts +18 -0
- package/cli/selftune/schedule.ts +3 -3
- package/cli/selftune/search-run.ts +703 -0
- package/cli/selftune/status.ts +35 -11
- package/cli/selftune/testing-readiness.ts +431 -40
- package/cli/selftune/types.ts +316 -0
- package/cli/selftune/utils/eval-readiness.ts +1 -0
- package/cli/selftune/utils/json-output.ts +11 -0
- package/cli/selftune/utils/lifecycle-surface.ts +48 -0
- package/cli/selftune/utils/query-filter.ts +82 -1
- package/cli/selftune/utils/tui.ts +85 -2
- package/cli/selftune/verify.ts +205 -0
- package/cli/selftune/workflows/proposals.ts +1 -1
- package/cli/selftune/workflows/skill-scaffold.ts +141 -63
- package/cli/selftune/workflows/workflows.ts +4 -4
- package/package.json +1 -1
- package/skill/SKILL.md +148 -85
- package/skill/references/cli-quick-reference.md +16 -1
- package/skill/references/creator-playbook.md +31 -10
- package/skill/workflows/Baseline.md +8 -9
- package/skill/workflows/Contributions.md +4 -4
- package/skill/workflows/Create.md +173 -0
- package/skill/workflows/CreateTestDeploy.md +34 -30
- package/skill/workflows/Cron.md +2 -2
- package/skill/workflows/Dashboard.md +3 -3
- package/skill/workflows/Evals.md +13 -7
- package/skill/workflows/Evolve.md +75 -32
- package/skill/workflows/EvolveBody.md +22 -15
- package/skill/workflows/Hook.md +1 -1
- package/skill/workflows/Improve.md +168 -0
- package/skill/workflows/Initialize.md +3 -3
- package/skill/workflows/Orchestrate.md +49 -12
- package/skill/workflows/Publish.md +100 -0
- package/skill/workflows/Run.md +72 -0
- package/skill/workflows/Schedule.md +2 -2
- package/skill/workflows/SearchRun.md +89 -0
- package/skill/workflows/SignalsDashboard.md +2 -2
- package/skill/workflows/UnitTest.md +13 -4
- package/skill/workflows/Verify.md +136 -0
- package/skill/workflows/Watch.md +114 -47
- package/skill/workflows/Workflows.md +13 -8
- package/apps/local-dashboard/dist/assets/index-B7v_o1WC.js +0 -15
- package/apps/local-dashboard/dist/assets/index-CrO77SVi.css +0 -1
- package/apps/local-dashboard/dist/assets/vendor-ui-B0H8s1mP.js +0 -1
|
@@ -8,9 +8,10 @@
|
|
|
8
8
|
|
|
9
9
|
import { readFileSync } from "node:fs";
|
|
10
10
|
|
|
11
|
-
import type { EvalEntry, InvocationType, SkillUsageRecord } from "../types.js";
|
|
11
|
+
import type { EvalEntry, InvocationType, QueryLogRecord, SkillUsageRecord } from "../types.js";
|
|
12
12
|
import { callLlm, stripMarkdownFences } from "../utils/llm-call.js";
|
|
13
13
|
import type { LlmCallObserver } from "../utils/llm-call.js";
|
|
14
|
+
import { extractActionableQueryText, extractPositiveEvalQueryText } from "../utils/query-filter.js";
|
|
14
15
|
import { findInstalledSkillNames } from "../utils/skill-discovery.js";
|
|
15
16
|
import { classifyInvocation } from "./invocation-classifier.js";
|
|
16
17
|
|
|
@@ -50,6 +51,25 @@ interface PromptFamilyTargets {
|
|
|
50
51
|
unrelatedNegativeCount: number;
|
|
51
52
|
}
|
|
52
53
|
|
|
54
|
+
/** Longest real-usage query, in UTF-16 code units, quoted as a prompt example before it is cut. */
const MAX_REAL_EXAMPLE_LENGTH = 220;

/** Skill content at or under this size is passed through unsummarized; summaries never exceed it. */
const MAX_SYNTHETIC_SKILL_CONTENT_CHARS = 6000;

/** Cap applied to each individual section kept in a summarized skill. */
const MAX_SYNTHETIC_SECTION_CHARS = 1200;

/** Cap applied to the body text that precedes the first heading. */
const MAX_SYNTHETIC_PREAMBLE_CHARS = 800;

/**
 * Heading patterns (case-insensitive) that mark a section as worth keeping when
 * a skill document has to be summarized. They are tested against the heading
 * text with the leading `#` markers removed.
 */
const PRIORITY_SYNTHETIC_SECTION_PATTERNS = [
  // Trigger / activation guidance
  /when this skill activates/i,
  /when to invoke/i,
  /when to use/i,
  /\buse when\b/i,
  // Routing and usage guidance
  /workflow routing/i,
  /\busage\b/i,
  /\bexamples?\b/i,
  /\bformat\b/i,
  /publish workflow/i,
  // Deliberately unanchored substring matches
  /input/i,
  /output/i,
  /activation/i,
] as const;
|
|
72
|
+
|
|
53
73
|
function getSyntheticSkillSearchDirs(): string[] {
|
|
54
74
|
const cwd = process.cwd();
|
|
55
75
|
const homeDir = process.env.HOME ?? "";
|
|
@@ -122,6 +142,115 @@ function normalizeEvalQuery(query: string): string {
|
|
|
122
142
|
return query.trim().toLowerCase().replace(/\s+/g, " ");
|
|
123
143
|
}
|
|
124
144
|
|
|
145
|
+
/**
 * Trims a real-usage query and caps it at MAX_REAL_EXAMPLE_LENGTH UTF-16 code
 * units so it can be quoted as a prompt example.
 *
 * @param query Raw query text.
 * @returns The trimmed query, or a shortened copy ending in "…" when it was too long.
 */
function truncatePromptExample(query: string): string {
  const trimmed = query.trim();
  if (trimmed.length <= MAX_REAL_EXAMPLE_LENGTH) return trimmed;

  // One code unit is reserved for the ellipsis.
  let cut = MAX_REAL_EXAMPLE_LENGTH - 1;

  // If the cut would land between the two halves of a surrogate pair, step
  // back one unit so the output never ends in a lone high surrogate.
  const lastKept = trimmed.charCodeAt(cut - 1);
  if (lastKept >= 0xd800 && lastKept <= 0xdbff) cut -= 1;

  return `${trimmed.slice(0, cut).trimEnd()}…`;
}
|
|
150
|
+
|
|
151
|
+
/**
 * Trims `text` and caps it at `limit` UTF-16 code units, appending "…" when
 * anything had to be cut.
 *
 * @param text Text to shorten.
 * @param limit Maximum length of the result, ellipsis included. Values below 1
 *   yield just "…" for non-empty text.
 * @returns The trimmed text when it already fits, otherwise a shortened copy
 *   ending in "…".
 */
function truncateSyntheticSection(text: string, limit: number): string {
  const trimmed = text.trim();
  if (trimmed.length <= limit) return trimmed;

  // Reserve one unit for the ellipsis. Clamp at zero: a negative slice end
  // would count from the end of the string and keep almost all of the text.
  let cut = Math.max(0, Math.floor(limit) - 1);

  // Never end on the first half of a surrogate pair; drop it instead.
  if (cut > 0) {
    const lastKept = trimmed.charCodeAt(cut - 1);
    if (lastKept >= 0xd800 && lastKept <= 0xdbff) cut -= 1;
  }

  return `${trimmed.slice(0, cut).trimEnd()}…`;
}
|
|
156
|
+
|
|
157
|
+
/**
 * Condenses a skill document so it fits the synthetic-eval prompt budget.
 *
 * Content at or under MAX_SYNTHETIC_SKILL_CONTENT_CHARS (after trimming) is
 * returned as-is. Content without any markdown heading is simply truncated.
 * Otherwise the summary is assembled, in document order, from:
 *   1. the leading `---`-fenced frontmatter block, if present;
 *   2. the text before the first heading, capped at MAX_SYNTHETIC_PREAMBLE_CHARS;
 *   3. the first section plus every section whose heading matches
 *      PRIORITY_SYNTHETIC_SECTION_PATTERNS, each capped at
 *      MAX_SYNTHETIC_SECTION_CHARS;
 *   4. a marker line stating that the content was summarized.
 * Parts are joined with blank lines. A part that would overflow the budget is
 * skipped whole (later, smaller parts may still be added).
 *
 * @param skillContent Raw skill document text.
 * @returns The trimmed content, or a summary no longer than
 *   MAX_SYNTHETIC_SKILL_CONTENT_CHARS.
 */
export function summarizeSkillContentForSyntheticPrompt(skillContent: string): string {
  const trimmed = skillContent.trim();
  if (trimmed.length <= MAX_SYNTHETIC_SKILL_CONTENT_CHARS) return trimmed;

  // Accept CRLF as well as LF around the frontmatter fences so documents with
  // Windows line endings do not have their frontmatter treated as preamble.
  const frontmatterMatch = trimmed.match(/^---\r?\n[\s\S]*?\r?\n---(?:\r?\n)*/);
  const frontmatter = frontmatterMatch?.[0]?.trim() ?? "";
  const body = frontmatterMatch ? trimmed.slice(frontmatterMatch[0].length).trim() : trimmed;
  const headingMatches = [...body.matchAll(/^#{1,6}\s+.+$/gm)];

  if (headingMatches.length === 0) {
    return truncateSyntheticSection(trimmed, MAX_SYNTHETIC_SKILL_CONTENT_CHARS);
  }

  const summaryMarker = "[skill content summarized for synthetic eval generation]";
  const separatorLength = 2; // parts are joined with "\n\n"

  // Hold back room for the marker so it is always emitted; previously it was
  // dropped without notice whenever the earlier parts had filled the budget.
  const contentBudget =
    MAX_SYNTHETIC_SKILL_CONTENT_CHARS - summaryMarker.length - separatorLength;

  const summaryParts: string[] = [];
  let usedLength = 0;

  /** Adds `part` if it fits in the remaining content budget; reports whether it did. */
  const appendPart = (part: string): boolean => {
    const normalized = part.trim();
    if (!normalized) return false;
    const separator = summaryParts.length > 0 ? separatorLength : 0;
    const nextLength = usedLength + separator + normalized.length;
    if (nextLength > contentBudget) return false;
    summaryParts.push(normalized);
    usedLength = nextLength;
    return true;
  };

  if (frontmatter) appendPart(frontmatter);

  const preamble = body.slice(0, headingMatches[0]?.index ?? 0).trim();
  if (preamble) {
    appendPart(truncateSyntheticSection(preamble, MAX_SYNTHETIC_PREAMBLE_CHARS));
  }

  // Each section runs from its heading up to the next heading (or end of body).
  headingMatches.forEach((match, index) => {
    const heading = match[0].replace(/^#{1,6}\s+/, "").trim();
    const isSelected =
      index === 0 || PRIORITY_SYNTHETIC_SECTION_PATTERNS.some((pattern) => pattern.test(heading));
    if (!isSelected) return;

    const start = match.index ?? 0;
    const end = headingMatches[index + 1]?.index ?? body.length;
    const content = body.slice(start, end).trim();
    appendPart(truncateSyntheticSection(content, MAX_SYNTHETIC_SECTION_CHARS));
  });

  // Space for the marker was reserved above, so this cannot exceed the budget.
  summaryParts.push(summaryMarker);
  return summaryParts.join("\n\n");
}
|
|
216
|
+
|
|
217
|
+
/**
 * Builds the "real examples" block for the synthetic-eval prompt from raw
 * logged queries.
 *
 * Positive candidates are cleaned with `extractPositiveEvalQueryText`, negative
 * candidates with `extractActionableQueryText`; candidates that clean to an
 * empty value are skipped. Every kept example is truncated with
 * `truncatePromptExample` and deduplicated on its normalized, truncated text,
 * so two long queries that read identically after truncation are listed once.
 * Negatives that collide with a kept positive are dropped.
 *
 * @param positiveCandidates Raw queries believed to have triggered the skill.
 * @param negativeCandidates Raw queries to draw non-triggering examples from.
 * @param skillName Skill name forwarded to the positive-query cleaner.
 * @param limit Maximum number of examples per polarity (default 5). A limit of
 *   zero or less yields no examples.
 * @returns The cleaned examples, or `undefined` when no positive example
 *   survives cleaning.
 */
export function buildSyntheticPromptRealExamples(
  positiveCandidates: string[],
  negativeCandidates: string[],
  skillName: string,
  limit = 5,
): SyntheticPromptRealExamples | undefined {
  const positives: string[] = [];
  const positiveKeys = new Set<string>();
  for (const candidate of positiveCandidates) {
    // Check before doing any work so a non-positive limit emits nothing.
    if (positives.length >= limit) break;
    const cleaned = extractPositiveEvalQueryText(candidate, skillName);
    if (!cleaned) continue;
    const example = truncatePromptExample(cleaned);
    // Key on the truncated text: that is what the prompt will actually show.
    const key = normalizeEvalQuery(example);
    if (positiveKeys.has(key)) continue;
    positiveKeys.add(key);
    positives.push(example);
  }

  if (positives.length === 0) return undefined;

  const negatives: string[] = [];
  const negativeKeys = new Set<string>();
  for (const candidate of negativeCandidates) {
    if (negatives.length >= limit) break;
    const cleaned = extractActionableQueryText(candidate);
    if (!cleaned) continue;
    const example = truncatePromptExample(cleaned);
    const key = normalizeEvalQuery(example);
    // A query already shown as a positive must not also appear as a negative.
    if (positiveKeys.has(key) || negativeKeys.has(key)) continue;
    negativeKeys.add(key);
    negatives.push(example);
  }

  return { positive: positives, negative: negatives };
}
|
|
253
|
+
|
|
125
254
|
function dedupeEvalEntries(entries: EvalEntry[]): EvalEntry[] {
|
|
126
255
|
const seen = new Set<string>();
|
|
127
256
|
const deduped: EvalEntry[] = [];
|
|
@@ -223,6 +352,7 @@ export function buildSyntheticPrompt(
|
|
|
223
352
|
realExamples?: SyntheticPromptRealExamples,
|
|
224
353
|
siblingSkills: string[] = [],
|
|
225
354
|
): { system: string; user: string } {
|
|
355
|
+
const summarizedSkillContent = summarizeSkillContentForSyntheticPrompt(skillContent);
|
|
226
356
|
const {
|
|
227
357
|
explicitCount,
|
|
228
358
|
implicitCount,
|
|
@@ -259,7 +389,7 @@ Output as JSON array with no surrounding text:
|
|
|
259
389
|
let user = `Skill name: ${skillName}
|
|
260
390
|
|
|
261
391
|
Skill content:
|
|
262
|
-
${
|
|
392
|
+
${summarizedSkillContent}
|
|
263
393
|
|
|
264
394
|
Generate exactly ${maxPositives} positive queries (should_trigger: true) and ${maxNegatives} negative queries (should_trigger: false).
|
|
265
395
|
|
|
@@ -308,6 +438,7 @@ export function buildSyntheticRefinementPrompt(
|
|
|
308
438
|
maxNegatives: number,
|
|
309
439
|
siblingSkills: string[] = [],
|
|
310
440
|
): { system: string; user: string } {
|
|
441
|
+
const summarizedSkillContent = summarizeSkillContentForSyntheticPrompt(skillContent);
|
|
311
442
|
const targets = buildPromptFamilyTargets(maxPositives, maxNegatives, siblingSkills.length > 0);
|
|
312
443
|
const system = `You are refining a cold-start eval benchmark for a coding agent skill.
|
|
313
444
|
|
|
@@ -325,7 +456,7 @@ Return ONLY a JSON array with the final benchmark.`;
|
|
|
325
456
|
const user = `Skill name: ${skillName}
|
|
326
457
|
|
|
327
458
|
Skill content:
|
|
328
|
-
${
|
|
459
|
+
${summarizedSkillContent}
|
|
329
460
|
|
|
330
461
|
Target final benchmark:
|
|
331
462
|
- ${maxPositives} positives
|
|
@@ -459,25 +590,22 @@ export async function generateSyntheticEvals(
|
|
|
459
590
|
|
|
460
591
|
// Positives: high-confidence triggered records for this skill
|
|
461
592
|
const skillRecords = querySkillUsageRecords(db) as SkillUsageRecord[];
|
|
462
|
-
const
|
|
593
|
+
const positiveCandidates = skillRecords
|
|
463
594
|
.filter((r) => isHighConfidencePositiveSkillRecord(r, skillName))
|
|
464
595
|
.map((r) => r.query)
|
|
465
|
-
.filter((q): q is string => typeof q === "string" && q.length > 0)
|
|
466
|
-
.slice(0, 5);
|
|
596
|
+
.filter((q): q is string => typeof q === "string" && q.length > 0);
|
|
467
597
|
|
|
468
|
-
// Negatives: from all_queries, excluding
|
|
469
|
-
const
|
|
470
|
-
const
|
|
471
|
-
const negative = allQueries
|
|
598
|
+
// Negatives: from all_queries, excluding cleaned positives later.
|
|
599
|
+
const allQueries = queryQueryLog(db) as QueryLogRecord[];
|
|
600
|
+
const negativeCandidates = allQueries
|
|
472
601
|
.map((r) => r.query)
|
|
473
|
-
.filter(
|
|
474
|
-
(q): q is string => typeof q === "string" && q.length > 0 && !posSet.has(q.toLowerCase()),
|
|
475
|
-
)
|
|
476
|
-
.slice(0, 5);
|
|
602
|
+
.filter((q): q is string => typeof q === "string" && q.length > 0);
|
|
477
603
|
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
604
|
+
realExamples = buildSyntheticPromptRealExamples(
|
|
605
|
+
positiveCandidates,
|
|
606
|
+
negativeCandidates,
|
|
607
|
+
skillName,
|
|
608
|
+
);
|
|
481
609
|
} catch {
|
|
482
610
|
// fail-open: synthetic gen works without real examples
|
|
483
611
|
}
|