selftune 0.2.0 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/agents/diagnosis-analyst.md +20 -10
- package/.claude/agents/evolution-reviewer.md +14 -1
- package/.claude/agents/integration-guide.md +18 -6
- package/.claude/agents/pattern-analyst.md +18 -5
- package/CHANGELOG.md +12 -4
- package/README.md +43 -35
- package/apps/local-dashboard/dist/assets/geist-cyrillic-wght-normal-CHSlOQsW.woff2 +0 -0
- package/apps/local-dashboard/dist/assets/geist-latin-ext-wght-normal-DMtmJ5ZE.woff2 +0 -0
- package/apps/local-dashboard/dist/assets/geist-latin-wght-normal-Dm3htQBi.woff2 +0 -0
- package/apps/local-dashboard/dist/assets/index-C4EOTFZ2.js +15 -0
- package/apps/local-dashboard/dist/assets/index-bl-Webyd.css +1 -0
- package/apps/local-dashboard/dist/assets/vendor-react-U7zYD9Rg.js +60 -0
- package/apps/local-dashboard/dist/assets/vendor-table-B7VF2Ipl.js +26 -0
- package/apps/local-dashboard/dist/assets/vendor-ui-D7_zX_qy.js +346 -0
- package/apps/local-dashboard/dist/favicon.png +0 -0
- package/apps/local-dashboard/dist/index.html +17 -0
- package/apps/local-dashboard/dist/logo.png +0 -0
- package/apps/local-dashboard/dist/logo.svg +9 -0
- package/cli/selftune/badge/badge-data.ts +1 -1
- package/cli/selftune/badge/badge.ts +4 -8
- package/cli/selftune/canonical-export.ts +183 -0
- package/cli/selftune/constants.ts +28 -0
- package/cli/selftune/contribute/contribute.ts +1 -1
- package/cli/selftune/cron/setup.ts +17 -17
- package/cli/selftune/dashboard-contract.ts +202 -0
- package/cli/selftune/dashboard-server.ts +653 -186
- package/cli/selftune/dashboard.ts +41 -176
- package/cli/selftune/eval/baseline.ts +5 -4
- package/cli/selftune/eval/composability-v2.ts +273 -0
- package/cli/selftune/eval/hooks-to-evals.ts +34 -15
- package/cli/selftune/eval/unit-test-cli.ts +1 -1
- package/cli/selftune/evolution/evidence.ts +26 -0
- package/cli/selftune/evolution/evolve-body.ts +105 -11
- package/cli/selftune/evolution/evolve.ts +371 -25
- package/cli/selftune/evolution/extract-patterns.ts +87 -29
- package/cli/selftune/evolution/rollback.ts +2 -2
- package/cli/selftune/grading/auto-grade.ts +200 -0
- package/cli/selftune/grading/grade-session.ts +448 -97
- package/cli/selftune/grading/results.ts +42 -0
- package/cli/selftune/hooks/prompt-log.ts +172 -2
- package/cli/selftune/hooks/session-stop.ts +123 -3
- package/cli/selftune/hooks/skill-eval.ts +119 -3
- package/cli/selftune/index.ts +395 -116
- package/cli/selftune/ingestors/claude-replay.ts +140 -114
- package/cli/selftune/ingestors/codex-rollout.ts +345 -46
- package/cli/selftune/ingestors/codex-wrapper.ts +207 -39
- package/cli/selftune/ingestors/openclaw-ingest.ts +141 -8
- package/cli/selftune/ingestors/opencode-ingest.ts +193 -17
- package/cli/selftune/init.ts +227 -14
- package/cli/selftune/last.ts +14 -5
- package/cli/selftune/localdb/db.ts +63 -0
- package/cli/selftune/localdb/materialize.ts +428 -0
- package/cli/selftune/localdb/queries.ts +376 -0
- package/cli/selftune/localdb/schema.ts +204 -0
- package/cli/selftune/monitoring/watch.ts +66 -15
- package/cli/selftune/normalization.ts +682 -0
- package/cli/selftune/observability.ts +19 -44
- package/cli/selftune/orchestrate.ts +1073 -0
- package/cli/selftune/quickstart.ts +203 -0
- package/cli/selftune/repair/skill-usage.ts +576 -0
- package/cli/selftune/schedule.ts +561 -0
- package/cli/selftune/status.ts +48 -26
- package/cli/selftune/sync.ts +627 -0
- package/cli/selftune/types.ts +148 -0
- package/cli/selftune/utils/canonical-log.ts +45 -0
- package/cli/selftune/utils/hooks.ts +41 -0
- package/cli/selftune/utils/html.ts +27 -0
- package/cli/selftune/utils/llm-call.ts +78 -20
- package/cli/selftune/utils/math.ts +10 -0
- package/cli/selftune/utils/query-filter.ts +139 -0
- package/cli/selftune/utils/skill-discovery.ts +340 -0
- package/cli/selftune/utils/skill-log.ts +68 -0
- package/cli/selftune/utils/skill-usage-confidence.ts +18 -0
- package/cli/selftune/utils/transcript.ts +272 -26
- package/cli/selftune/workflows/discover.ts +254 -0
- package/cli/selftune/workflows/skill-md-writer.ts +288 -0
- package/cli/selftune/workflows/workflows.ts +188 -0
- package/package.json +21 -8
- package/packages/telemetry-contract/README.md +11 -0
- package/packages/telemetry-contract/fixtures/golden.json +87 -0
- package/packages/telemetry-contract/fixtures/golden.test.ts +42 -0
- package/packages/telemetry-contract/index.ts +1 -0
- package/packages/telemetry-contract/package.json +19 -0
- package/packages/telemetry-contract/src/index.ts +2 -0
- package/packages/telemetry-contract/src/types.ts +163 -0
- package/packages/telemetry-contract/src/validators.ts +109 -0
- package/skill/SKILL.md +84 -53
- package/skill/Workflows/AutoActivation.md +17 -16
- package/skill/Workflows/Badge.md +6 -0
- package/skill/Workflows/Baseline.md +46 -23
- package/skill/Workflows/Composability.md +12 -5
- package/skill/Workflows/Contribute.md +17 -14
- package/skill/Workflows/Cron.md +56 -79
- package/skill/Workflows/Dashboard.md +45 -34
- package/skill/Workflows/Doctor.md +30 -17
- package/skill/Workflows/Evals.md +64 -40
- package/skill/Workflows/EvolutionMemory.md +2 -0
- package/skill/Workflows/Evolve.md +102 -47
- package/skill/Workflows/EvolveBody.md +6 -6
- package/skill/Workflows/Grade.md +36 -31
- package/skill/Workflows/ImportSkillsBench.md +11 -5
- package/skill/Workflows/Ingest.md +43 -36
- package/skill/Workflows/Initialize.md +44 -30
- package/skill/Workflows/Orchestrate.md +139 -0
- package/skill/Workflows/Replay.md +39 -18
- package/skill/Workflows/Rollback.md +3 -3
- package/skill/Workflows/Schedule.md +61 -0
- package/skill/Workflows/Sync.md +88 -0
- package/skill/Workflows/UnitTest.md +34 -22
- package/skill/Workflows/Watch.md +14 -4
- package/skill/Workflows/Workflows.md +129 -0
- package/skill/assets/activation-rules-default.json +26 -0
- package/skill/assets/multi-skill-settings.json +63 -0
- package/skill/assets/single-skill-settings.json +57 -0
- package/skill/references/invocation-taxonomy.md +2 -2
- package/skill/references/logs.md +164 -2
- package/skill/references/setup-patterns.md +65 -0
- package/skill/references/version-history.md +40 -0
- package/skill/settings_snippet.json +1 -1
- package/templates/multi-skill-settings.json +7 -7
- package/templates/single-skill-settings.json +6 -6
- package/dashboard/index.html +0 -1680
|
@@ -13,11 +13,14 @@ import { QUERY_LOG, SKILL_LOG, TELEMETRY_LOG } from "../constants.js";
|
|
|
13
13
|
import type { BaselineMeasurement } from "../eval/baseline.js";
|
|
14
14
|
import { measureBaseline } from "../eval/baseline.js";
|
|
15
15
|
import { buildEvalSet } from "../eval/hooks-to-evals.js";
|
|
16
|
+
import { readGradingResultsForSkill } from "../grading/results.js";
|
|
16
17
|
import { updateContextAfterEvolve } from "../memory/writer.js";
|
|
18
|
+
import type { SyncResult } from "../sync.js";
|
|
17
19
|
import type {
|
|
18
20
|
EvalEntry,
|
|
19
21
|
EvalPassRate,
|
|
20
22
|
EvolutionAuditEntry,
|
|
23
|
+
EvolutionEvidenceEntry,
|
|
21
24
|
EvolutionProposal,
|
|
22
25
|
EvolveResultSummary,
|
|
23
26
|
FailurePattern,
|
|
@@ -29,8 +32,10 @@ import type {
|
|
|
29
32
|
} from "../types.js";
|
|
30
33
|
import { parseFrontmatter, replaceFrontmatterDescription } from "../utils/frontmatter.js";
|
|
31
34
|
import { readJsonl } from "../utils/jsonl.js";
|
|
35
|
+
import { readEffectiveSkillUsageRecords } from "../utils/skill-log.js";
|
|
32
36
|
import { createEvolveTUI } from "../utils/tui.js";
|
|
33
37
|
import { appendAuditEntry } from "./audit.js";
|
|
38
|
+
import { appendEvidenceEntry } from "./evidence.js";
|
|
34
39
|
import { extractFailurePatterns } from "./extract-patterns.js";
|
|
35
40
|
import {
|
|
36
41
|
computeInvocationScores,
|
|
@@ -68,6 +73,8 @@ export interface EvolveOptions {
|
|
|
68
73
|
cheapLoop?: boolean;
|
|
69
74
|
gateModel?: string;
|
|
70
75
|
proposalModel?: string;
|
|
76
|
+
syncFirst?: boolean;
|
|
77
|
+
syncForce?: boolean;
|
|
71
78
|
}
|
|
72
79
|
|
|
73
80
|
export interface EvolveResult {
|
|
@@ -81,6 +88,7 @@ export interface EvolveResult {
|
|
|
81
88
|
elapsedMs: number;
|
|
82
89
|
baselineResult?: BaselineMeasurement;
|
|
83
90
|
gateValidation?: ValidationResult;
|
|
91
|
+
sync_result?: SyncResult;
|
|
84
92
|
}
|
|
85
93
|
|
|
86
94
|
/**
|
|
@@ -98,9 +106,12 @@ export interface EvolveDeps {
|
|
|
98
106
|
validateProposal?: typeof import("./validate-proposal.js").validateProposal;
|
|
99
107
|
gateValidateProposal?: typeof import("./validate-proposal.js").validateProposal;
|
|
100
108
|
appendAuditEntry?: typeof import("./audit.js").appendAuditEntry;
|
|
109
|
+
appendEvidenceEntry?: typeof import("./evidence.js").appendEvidenceEntry;
|
|
101
110
|
buildEvalSet?: typeof import("../eval/hooks-to-evals.js").buildEvalSet;
|
|
102
111
|
updateContextAfterEvolve?: typeof import("../memory/writer.js").updateContextAfterEvolve;
|
|
103
112
|
measureBaseline?: typeof import("../eval/baseline.js").measureBaseline;
|
|
113
|
+
readSkillUsageLog?: () => SkillUsageRecord[];
|
|
114
|
+
syncSources?: typeof import("../sync.js").syncSources;
|
|
104
115
|
}
|
|
105
116
|
|
|
106
117
|
// ---------------------------------------------------------------------------
|
|
@@ -124,6 +135,33 @@ function createAuditEntry(
|
|
|
124
135
|
};
|
|
125
136
|
}
|
|
126
137
|
|
|
138
|
+
// ---------------------------------------------------------------------------
|
|
139
|
+
// Diff helper
|
|
140
|
+
// ---------------------------------------------------------------------------
|
|
141
|
+
|
|
142
|
+
/**
|
|
143
|
+
* Produce a simple colored diff between two text strings.
|
|
144
|
+
* Red (removed) / Green (added) lines, skipping unchanged lines.
|
|
145
|
+
*/
|
|
146
|
+
function formatSimpleDiff(oldText: string, newText: string): string {
|
|
147
|
+
const oldLines = oldText.split("\n");
|
|
148
|
+
const newLines = newText.split("\n");
|
|
149
|
+
const output: string[] = [];
|
|
150
|
+
const maxLen = Math.max(oldLines.length, newLines.length);
|
|
151
|
+
for (let i = 0; i < maxLen; i++) {
|
|
152
|
+
const oldLine = oldLines[i];
|
|
153
|
+
const newLine = newLines[i];
|
|
154
|
+
if (oldLine === newLine) continue;
|
|
155
|
+
if (oldLine !== undefined) {
|
|
156
|
+
output.push(`\x1b[31m- ${oldLine}\x1b[0m`);
|
|
157
|
+
}
|
|
158
|
+
if (newLine !== undefined) {
|
|
159
|
+
output.push(`\x1b[32m+ ${newLine}\x1b[0m`);
|
|
160
|
+
}
|
|
161
|
+
}
|
|
162
|
+
return output.join("\n");
|
|
163
|
+
}
|
|
164
|
+
|
|
127
165
|
// ---------------------------------------------------------------------------
|
|
128
166
|
// Main orchestrator
|
|
129
167
|
// ---------------------------------------------------------------------------
|
|
@@ -148,11 +186,14 @@ export async function evolve(
|
|
|
148
186
|
const _validateProposal = _deps.validateProposal ?? validateProposal;
|
|
149
187
|
const _gateValidateProposal = _deps.gateValidateProposal ?? validateProposal;
|
|
150
188
|
const _appendAuditEntry = _deps.appendAuditEntry ?? appendAuditEntry;
|
|
189
|
+
const _appendEvidenceEntry = _deps.appendEvidenceEntry ?? appendEvidenceEntry;
|
|
151
190
|
const _buildEvalSet = _deps.buildEvalSet ?? buildEvalSet;
|
|
152
191
|
const _updateContextAfterEvolve = _deps.updateContextAfterEvolve ?? updateContextAfterEvolve;
|
|
153
192
|
const _measureBaseline = _deps.measureBaseline ?? measureBaseline;
|
|
193
|
+
const _readSkillUsageLog = _deps.readSkillUsageLog ?? (() => readEffectiveSkillUsageRecords());
|
|
154
194
|
|
|
155
195
|
const auditEntries: EvolutionAuditEntry[] = [];
|
|
196
|
+
let syncResult: SyncResult | undefined;
|
|
156
197
|
|
|
157
198
|
function recordAudit(
|
|
158
199
|
proposalId: string,
|
|
@@ -169,6 +210,14 @@ export async function evolve(
|
|
|
169
210
|
}
|
|
170
211
|
}
|
|
171
212
|
|
|
213
|
+
function recordEvidence(entry: EvolutionEvidenceEntry): void {
|
|
214
|
+
try {
|
|
215
|
+
_appendEvidenceEntry(entry);
|
|
216
|
+
} catch {
|
|
217
|
+
// Fail-open: evidence should not block the pipeline
|
|
218
|
+
}
|
|
219
|
+
}
|
|
220
|
+
|
|
172
221
|
const pipelineStart = Date.now();
|
|
173
222
|
let llmCallCount = 0;
|
|
174
223
|
const tui = createEvolveTUI({ skillName, model: options.proposalModel ?? "(default)" });
|
|
@@ -182,6 +231,7 @@ export async function evolve(
|
|
|
182
231
|
...r,
|
|
183
232
|
llmCallCount,
|
|
184
233
|
elapsedMs: Date.now() - pipelineStart,
|
|
234
|
+
...(syncResult ? { sync_result: syncResult } : {}),
|
|
185
235
|
});
|
|
186
236
|
|
|
187
237
|
// Hoisted so catch block can preserve partial results on error
|
|
@@ -209,21 +259,65 @@ export async function evolve(
|
|
|
209
259
|
const currentDescription = frontmatter.description || rawContent;
|
|
210
260
|
const skillVersion = frontmatter.version || undefined;
|
|
211
261
|
const versionTag = skillVersion ? `, v${skillVersion}` : "";
|
|
262
|
+
const createdAuditDetails = (message: string) =>
|
|
263
|
+
`original_description:${rawContent}\n${message}`;
|
|
212
264
|
tui.done(`Loaded SKILL.md (desc: ${currentDescription.length} chars${versionTag})`);
|
|
213
265
|
|
|
266
|
+
if (options.syncFirst) {
|
|
267
|
+
tui.step(`Syncing source-truth telemetry${options.syncForce ? " (force)" : ""}...`);
|
|
268
|
+
const { createDefaultSyncOptions, syncSources: realSyncSources } = await import("../sync.js");
|
|
269
|
+
const syncRunner = _deps.syncSources ?? realSyncSources;
|
|
270
|
+
syncResult = syncRunner(
|
|
271
|
+
createDefaultSyncOptions({
|
|
272
|
+
force: options.syncForce ?? false,
|
|
273
|
+
}),
|
|
274
|
+
);
|
|
275
|
+
const sourceSynced = Object.values(syncResult.sources).reduce(
|
|
276
|
+
(sum, source) => sum + source.synced,
|
|
277
|
+
0,
|
|
278
|
+
);
|
|
279
|
+
tui.done(
|
|
280
|
+
`Source sync complete (${sourceSynced} source sessions, ${syncResult.repair.repaired_records} repaired records)`,
|
|
281
|
+
);
|
|
282
|
+
}
|
|
283
|
+
|
|
214
284
|
// -----------------------------------------------------------------------
|
|
215
285
|
// Step 2: Load eval set
|
|
216
286
|
// -----------------------------------------------------------------------
|
|
287
|
+
const skillUsage = _readSkillUsageLog();
|
|
217
288
|
let evalSet: EvalEntry[];
|
|
218
289
|
|
|
219
290
|
if (evalSetPath && existsSync(evalSetPath)) {
|
|
220
|
-
|
|
221
|
-
|
|
291
|
+
try {
|
|
292
|
+
const raw = readFileSync(evalSetPath, "utf-8");
|
|
293
|
+
evalSet = JSON.parse(raw) as EvalEntry[];
|
|
294
|
+
} catch (parseErr) {
|
|
295
|
+
const msg = parseErr instanceof Error ? parseErr.message : String(parseErr);
|
|
296
|
+
tui.fail(`Failed to load eval set from ${evalSetPath}: ${msg}`);
|
|
297
|
+
finishTui();
|
|
298
|
+
return withStats({
|
|
299
|
+
proposal: null,
|
|
300
|
+
validation: null,
|
|
301
|
+
deployed: false,
|
|
302
|
+
auditEntries,
|
|
303
|
+
reason: `Failed to load eval set: ${msg}`,
|
|
304
|
+
});
|
|
305
|
+
}
|
|
306
|
+
if (!Array.isArray(evalSet)) {
|
|
307
|
+
tui.fail(`Eval set at ${evalSetPath} is not an array`);
|
|
308
|
+
finishTui();
|
|
309
|
+
return withStats({
|
|
310
|
+
proposal: null,
|
|
311
|
+
validation: null,
|
|
312
|
+
deployed: false,
|
|
313
|
+
auditEntries,
|
|
314
|
+
reason: `Eval set at ${evalSetPath} is not a JSON array`,
|
|
315
|
+
});
|
|
316
|
+
}
|
|
222
317
|
} else {
|
|
223
318
|
// Build from logs
|
|
224
|
-
const skillRecords = readJsonl<SkillUsageRecord>(SKILL_LOG);
|
|
225
319
|
const queryRecords = readJsonl<QueryLogRecord>(QUERY_LOG);
|
|
226
|
-
evalSet = _buildEvalSet(
|
|
320
|
+
evalSet = _buildEvalSet(skillUsage, queryRecords, skillName);
|
|
227
321
|
}
|
|
228
322
|
|
|
229
323
|
const posCount = evalSet.filter((e) => e.should_trigger).length;
|
|
@@ -233,8 +327,6 @@ export async function evolve(
|
|
|
233
327
|
// -----------------------------------------------------------------------
|
|
234
328
|
// Step 3: Load skill usage records
|
|
235
329
|
// -----------------------------------------------------------------------
|
|
236
|
-
const skillUsage = readJsonl<SkillUsageRecord>(SKILL_LOG);
|
|
237
|
-
|
|
238
330
|
// -----------------------------------------------------------------------
|
|
239
331
|
// Step 4: Extract failure patterns
|
|
240
332
|
// -----------------------------------------------------------------------
|
|
@@ -251,17 +343,38 @@ export async function evolve(
|
|
|
251
343
|
);
|
|
252
344
|
|
|
253
345
|
// -----------------------------------------------------------------------
|
|
254
|
-
// Step 5:
|
|
346
|
+
// Step 5: Cold-start bootstrap or early exit if no patterns
|
|
255
347
|
// -----------------------------------------------------------------------
|
|
256
348
|
if (failurePatterns.length === 0) {
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
349
|
+
// Cold-start: if the eval set has positive entries that the skill should
|
|
350
|
+
// match but there are zero skill usage records, treat the positive eval
|
|
351
|
+
// entries themselves as "missed queries" — they ARE the failure signal.
|
|
352
|
+
const positiveEvals = evalSet.filter((e) => e.should_trigger);
|
|
353
|
+
const hasSkillUsageHistory = skillUsage.some((record) => record.skill_name === skillName);
|
|
354
|
+
if (positiveEvals.length > 0 && !hasSkillUsageHistory) {
|
|
355
|
+
const coldStartPattern: FailurePattern = {
|
|
356
|
+
pattern_id: `fp-${skillName}-coldstart`,
|
|
357
|
+
skill_name: skillName,
|
|
358
|
+
invocation_type: "implicit",
|
|
359
|
+
missed_queries: positiveEvals.map((e) => e.query),
|
|
360
|
+
frequency: positiveEvals.length,
|
|
361
|
+
sample_sessions: [],
|
|
362
|
+
extracted_at: new Date().toISOString(),
|
|
363
|
+
};
|
|
364
|
+
failurePatterns.push(coldStartPattern);
|
|
365
|
+
tui.done(
|
|
366
|
+
`Cold-start bootstrap: ${positiveEvals.length} positive eval entries used as missed queries`,
|
|
367
|
+
);
|
|
368
|
+
} else {
|
|
369
|
+
finishTui();
|
|
370
|
+
return withStats({
|
|
371
|
+
proposal: null,
|
|
372
|
+
validation: null,
|
|
373
|
+
deployed: false,
|
|
374
|
+
auditEntries,
|
|
375
|
+
reason: "No failure patterns found",
|
|
376
|
+
});
|
|
377
|
+
}
|
|
265
378
|
}
|
|
266
379
|
|
|
267
380
|
// -----------------------------------------------------------------------
|
|
@@ -279,11 +392,14 @@ export async function evolve(
|
|
|
279
392
|
const paretoEnabled = options.paretoEnabled ?? false;
|
|
280
393
|
const candidateCount = options.candidateCount ?? 3;
|
|
281
394
|
const tokenEfficiencyEnabled = options.tokenEfficiencyEnabled ?? false;
|
|
395
|
+
const telemetryRecords =
|
|
396
|
+
options.telemetryRecords ??
|
|
397
|
+
(tokenEfficiencyEnabled ? readJsonl<SessionTelemetryRecord>(TELEMETRY_LOG) : undefined);
|
|
282
398
|
|
|
283
399
|
// Compute token efficiency score if enabled and telemetry is available
|
|
284
400
|
let tokenEffScore: number | undefined;
|
|
285
|
-
if (tokenEfficiencyEnabled &&
|
|
286
|
-
tokenEffScore = computeTokenEfficiencyScore(skillName,
|
|
401
|
+
if (tokenEfficiencyEnabled && telemetryRecords && telemetryRecords.length > 0) {
|
|
402
|
+
tokenEffScore = computeTokenEfficiencyScore(skillName, telemetryRecords);
|
|
287
403
|
recordAudit(
|
|
288
404
|
"system",
|
|
289
405
|
"created",
|
|
@@ -321,7 +437,25 @@ export async function evolve(
|
|
|
321
437
|
// Validate each candidate
|
|
322
438
|
const paretoCandidates: ParetoCandidate[] = [];
|
|
323
439
|
for (const proposal of viableCandidates) {
|
|
324
|
-
recordAudit(
|
|
440
|
+
recordAudit(
|
|
441
|
+
proposal.proposal_id,
|
|
442
|
+
"created",
|
|
443
|
+
createdAuditDetails(`Pareto candidate for ${skillName}`),
|
|
444
|
+
);
|
|
445
|
+
recordEvidence({
|
|
446
|
+
timestamp: new Date().toISOString(),
|
|
447
|
+
proposal_id: proposal.proposal_id,
|
|
448
|
+
skill_name: skillName,
|
|
449
|
+
skill_path: skillPath,
|
|
450
|
+
target: "description",
|
|
451
|
+
stage: "created",
|
|
452
|
+
rationale: proposal.rationale,
|
|
453
|
+
confidence: proposal.confidence,
|
|
454
|
+
details: `Pareto candidate for ${skillName}`,
|
|
455
|
+
original_text: proposal.original_description,
|
|
456
|
+
proposed_text: proposal.proposed_description,
|
|
457
|
+
eval_set: evalSet,
|
|
458
|
+
});
|
|
325
459
|
|
|
326
460
|
const validation = await _validateProposal(
|
|
327
461
|
proposal,
|
|
@@ -334,6 +468,26 @@ export async function evolve(
|
|
|
334
468
|
"validated",
|
|
335
469
|
`Pareto validation: improved=${validation.improved}`,
|
|
336
470
|
);
|
|
471
|
+
recordEvidence({
|
|
472
|
+
timestamp: new Date().toISOString(),
|
|
473
|
+
proposal_id: proposal.proposal_id,
|
|
474
|
+
skill_name: skillName,
|
|
475
|
+
skill_path: skillPath,
|
|
476
|
+
target: "description",
|
|
477
|
+
stage: "validated",
|
|
478
|
+
rationale: proposal.rationale,
|
|
479
|
+
confidence: proposal.confidence,
|
|
480
|
+
details: `Pareto validation: improved=${validation.improved}`,
|
|
481
|
+
validation: {
|
|
482
|
+
improved: validation.improved,
|
|
483
|
+
before_pass_rate: validation.before_pass_rate,
|
|
484
|
+
after_pass_rate: validation.after_pass_rate,
|
|
485
|
+
net_change: validation.net_change,
|
|
486
|
+
regressions: validation.regressions,
|
|
487
|
+
new_passes: validation.new_passes,
|
|
488
|
+
per_entry_results: validation.per_entry_results,
|
|
489
|
+
},
|
|
490
|
+
});
|
|
337
491
|
|
|
338
492
|
if (validation.improved && validation.per_entry_results) {
|
|
339
493
|
const invocationScores = computeInvocationScores(validation.per_entry_results);
|
|
@@ -398,8 +552,22 @@ export async function evolve(
|
|
|
398
552
|
recordAudit(
|
|
399
553
|
proposal.proposal_id,
|
|
400
554
|
"created",
|
|
401
|
-
`Proposal created for ${skillName} (iteration ${iteration + 1})
|
|
555
|
+
createdAuditDetails(`Proposal created for ${skillName} (iteration ${iteration + 1})`),
|
|
402
556
|
);
|
|
557
|
+
recordEvidence({
|
|
558
|
+
timestamp: new Date().toISOString(),
|
|
559
|
+
proposal_id: proposal.proposal_id,
|
|
560
|
+
skill_name: skillName,
|
|
561
|
+
skill_path: skillPath,
|
|
562
|
+
target: "description",
|
|
563
|
+
stage: "created",
|
|
564
|
+
rationale: proposal.rationale,
|
|
565
|
+
confidence: proposal.confidence,
|
|
566
|
+
details: `Proposal created for ${skillName} (iteration ${iteration + 1})`,
|
|
567
|
+
original_text: proposal.original_description,
|
|
568
|
+
proposed_text: proposal.proposed_description,
|
|
569
|
+
eval_set: evalSet,
|
|
570
|
+
});
|
|
403
571
|
|
|
404
572
|
// Step 9: Check confidence threshold
|
|
405
573
|
if (proposal.confidence < confidenceThreshold) {
|
|
@@ -409,6 +577,17 @@ export async function evolve(
|
|
|
409
577
|
"rejected",
|
|
410
578
|
`Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
|
|
411
579
|
);
|
|
580
|
+
recordEvidence({
|
|
581
|
+
timestamp: new Date().toISOString(),
|
|
582
|
+
proposal_id: proposal.proposal_id,
|
|
583
|
+
skill_name: skillName,
|
|
584
|
+
skill_path: skillPath,
|
|
585
|
+
target: "description",
|
|
586
|
+
stage: "rejected",
|
|
587
|
+
rationale: proposal.rationale,
|
|
588
|
+
confidence: proposal.confidence,
|
|
589
|
+
details: `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
|
|
590
|
+
});
|
|
412
591
|
|
|
413
592
|
// If this is the last iteration, return early with rejection
|
|
414
593
|
if (iteration === maxIterations - 1) {
|
|
@@ -455,6 +634,26 @@ export async function evolve(
|
|
|
455
634
|
`Validation complete: improved=${validation.improved}`,
|
|
456
635
|
evalSnapshot,
|
|
457
636
|
);
|
|
637
|
+
recordEvidence({
|
|
638
|
+
timestamp: new Date().toISOString(),
|
|
639
|
+
proposal_id: proposal.proposal_id,
|
|
640
|
+
skill_name: skillName,
|
|
641
|
+
skill_path: skillPath,
|
|
642
|
+
target: "description",
|
|
643
|
+
stage: "validated",
|
|
644
|
+
rationale: proposal.rationale,
|
|
645
|
+
confidence: proposal.confidence,
|
|
646
|
+
details: `Validation complete: improved=${validation.improved}`,
|
|
647
|
+
validation: {
|
|
648
|
+
improved: validation.improved,
|
|
649
|
+
before_pass_rate: validation.before_pass_rate,
|
|
650
|
+
after_pass_rate: validation.after_pass_rate,
|
|
651
|
+
net_change: validation.net_change,
|
|
652
|
+
regressions: validation.regressions,
|
|
653
|
+
new_passes: validation.new_passes,
|
|
654
|
+
per_entry_results: validation.per_entry_results,
|
|
655
|
+
},
|
|
656
|
+
});
|
|
458
657
|
|
|
459
658
|
// Step 12: Check validation result
|
|
460
659
|
if (!validation.improved) {
|
|
@@ -464,6 +663,26 @@ export async function evolve(
|
|
|
464
663
|
"rejected",
|
|
465
664
|
`Validation failed: net_change=${validation.net_change.toFixed(3)}`,
|
|
466
665
|
);
|
|
666
|
+
recordEvidence({
|
|
667
|
+
timestamp: new Date().toISOString(),
|
|
668
|
+
proposal_id: proposal.proposal_id,
|
|
669
|
+
skill_name: skillName,
|
|
670
|
+
skill_path: skillPath,
|
|
671
|
+
target: "description",
|
|
672
|
+
stage: "rejected",
|
|
673
|
+
rationale: proposal.rationale,
|
|
674
|
+
confidence: proposal.confidence,
|
|
675
|
+
details: `Validation failed: net_change=${validation.net_change.toFixed(3)}`,
|
|
676
|
+
validation: {
|
|
677
|
+
improved: validation.improved,
|
|
678
|
+
before_pass_rate: validation.before_pass_rate,
|
|
679
|
+
after_pass_rate: validation.after_pass_rate,
|
|
680
|
+
net_change: validation.net_change,
|
|
681
|
+
regressions: validation.regressions,
|
|
682
|
+
new_passes: validation.new_passes,
|
|
683
|
+
per_entry_results: validation.per_entry_results,
|
|
684
|
+
},
|
|
685
|
+
});
|
|
467
686
|
|
|
468
687
|
// If this is the last iteration, return with rejection
|
|
469
688
|
if (iteration === maxIterations - 1) {
|
|
@@ -583,12 +802,40 @@ export async function evolve(
|
|
|
583
802
|
writeFileSync(skillPath, updatedContent, "utf-8");
|
|
584
803
|
tui.done(`Deployed updated description to ${skillPath}`);
|
|
585
804
|
|
|
805
|
+
// Show what changed in the skill file
|
|
806
|
+
const diffOutput = formatSimpleDiff(rawContent, updatedContent);
|
|
807
|
+
if (diffOutput) {
|
|
808
|
+
console.error("\n--- Skill description diff ---");
|
|
809
|
+
console.error(diffOutput);
|
|
810
|
+
console.error("------------------------------\n");
|
|
811
|
+
}
|
|
812
|
+
|
|
586
813
|
recordAudit(lastProposal.proposal_id, "deployed", `Deployed proposal for ${skillName}`, {
|
|
587
814
|
total: evalSet.length,
|
|
588
815
|
passed: Math.round(lastValidation.after_pass_rate * evalSet.length),
|
|
589
816
|
failed: evalSet.length - Math.round(lastValidation.after_pass_rate * evalSet.length),
|
|
590
817
|
pass_rate: lastValidation.after_pass_rate,
|
|
591
818
|
});
|
|
819
|
+
recordEvidence({
|
|
820
|
+
timestamp: new Date().toISOString(),
|
|
821
|
+
proposal_id: lastProposal.proposal_id,
|
|
822
|
+
skill_name: skillName,
|
|
823
|
+
skill_path: skillPath,
|
|
824
|
+
target: "description",
|
|
825
|
+
stage: "deployed",
|
|
826
|
+
rationale: lastProposal.rationale,
|
|
827
|
+
confidence: lastProposal.confidence,
|
|
828
|
+
details: `Deployed proposal for ${skillName}`,
|
|
829
|
+
validation: {
|
|
830
|
+
improved: lastValidation.improved,
|
|
831
|
+
before_pass_rate: lastValidation.before_pass_rate,
|
|
832
|
+
after_pass_rate: lastValidation.after_pass_rate,
|
|
833
|
+
net_change: lastValidation.net_change,
|
|
834
|
+
regressions: lastValidation.regressions,
|
|
835
|
+
new_passes: lastValidation.new_passes,
|
|
836
|
+
per_entry_results: lastValidation.per_entry_results,
|
|
837
|
+
},
|
|
838
|
+
});
|
|
592
839
|
}
|
|
593
840
|
|
|
594
841
|
// -----------------------------------------------------------------------
|
|
@@ -654,9 +901,12 @@ export async function cliMain(): Promise<void> {
|
|
|
654
901
|
"token-efficiency": { type: "boolean", default: false },
|
|
655
902
|
"with-baseline": { type: "boolean", default: false },
|
|
656
903
|
"validation-model": { type: "string", default: "haiku" },
|
|
657
|
-
"cheap-loop": { type: "boolean", default:
|
|
904
|
+
"cheap-loop": { type: "boolean", default: true },
|
|
905
|
+
"full-model": { type: "boolean", default: false },
|
|
658
906
|
"gate-model": { type: "string" },
|
|
659
907
|
"proposal-model": { type: "string" },
|
|
908
|
+
"sync-first": { type: "boolean", default: false },
|
|
909
|
+
"sync-force": { type: "boolean", default: false },
|
|
660
910
|
verbose: { type: "boolean", default: false },
|
|
661
911
|
help: { type: "boolean", default: false },
|
|
662
912
|
},
|
|
@@ -682,9 +932,12 @@ Options:
|
|
|
682
932
|
--token-efficiency Enable 5D Pareto with token efficiency scoring
|
|
683
933
|
--with-baseline Gate deployment on baseline lift > 0.05
|
|
684
934
|
--validation-model Model for trigger-check validation calls (default: haiku)
|
|
685
|
-
--cheap-loop Use cheap models for loop, expensive
|
|
686
|
-
--
|
|
935
|
+
--cheap-loop Use cheap models for loop, expensive for gate (default: on)
|
|
936
|
+
--full-model Use same model for all stages (disables cheap-loop)
|
|
937
|
+
--gate-model Model for final gate validation (default: sonnet)
|
|
687
938
|
--proposal-model Model for proposal generation LLM calls
|
|
939
|
+
--sync-first Refresh source-truth telemetry before building evals/failure patterns
|
|
940
|
+
--sync-force Force a full rescan during --sync-first
|
|
688
941
|
--verbose Output full EvolveResult JSON (default: compact summary)
|
|
689
942
|
--help Show this help message`);
|
|
690
943
|
process.exit(0);
|
|
@@ -694,6 +947,10 @@ Options:
|
|
|
694
947
|
console.error("[ERROR] --skill and --skill-path are required");
|
|
695
948
|
process.exit(1);
|
|
696
949
|
}
|
|
950
|
+
if ((values["sync-force"] ?? false) && !(values["sync-first"] ?? false)) {
|
|
951
|
+
console.error("[ERROR] --sync-force requires --sync-first");
|
|
952
|
+
process.exit(1);
|
|
953
|
+
}
|
|
697
954
|
|
|
698
955
|
const { detectAgent } = await import("../utils/llm-call.js");
|
|
699
956
|
const requestedAgent = values.agent;
|
|
@@ -721,11 +978,60 @@ Options:
|
|
|
721
978
|
process.exit(1);
|
|
722
979
|
}
|
|
723
980
|
|
|
981
|
+
// -------------------------------------------------------------------------
|
|
982
|
+
// Pre-flight validation: catch common misconfigurations before evolve()
|
|
983
|
+
// -------------------------------------------------------------------------
|
|
984
|
+
const skillPath = values["skill-path"];
|
|
985
|
+
if (!skillPath) {
|
|
986
|
+
console.error("[ERROR] --skill-path is required.");
|
|
987
|
+
process.exit(1);
|
|
988
|
+
}
|
|
989
|
+
if (!existsSync(skillPath)) {
|
|
990
|
+
console.error(`[ERROR] SKILL.md not found at: ${skillPath}`);
|
|
991
|
+
console.error(" Verify the --skill-path argument points to an existing SKILL.md file.");
|
|
992
|
+
process.exit(1);
|
|
993
|
+
}
|
|
994
|
+
|
|
995
|
+
const evalSetPath = values["eval-set"];
|
|
996
|
+
if (evalSetPath && !existsSync(evalSetPath)) {
|
|
997
|
+
console.error(`[ERROR] Eval set file not found at: ${evalSetPath}`);
|
|
998
|
+
console.error(" Verify the --eval-set argument points to an existing JSON file.");
|
|
999
|
+
process.exit(1);
|
|
1000
|
+
}
|
|
1001
|
+
|
|
1002
|
+
// If no eval-set provided, check that log files exist for auto-generation
|
|
1003
|
+
if (!evalSetPath && !(values["sync-first"] ?? false)) {
|
|
1004
|
+
const hasSkillLog = readEffectiveSkillUsageRecords().length > 0;
|
|
1005
|
+
const hasQueryLog = existsSync(QUERY_LOG);
|
|
1006
|
+
if (!hasSkillLog && !hasQueryLog) {
|
|
1007
|
+
console.error("[ERROR] No eval set provided and no telemetry logs found.");
|
|
1008
|
+
console.error(
|
|
1009
|
+
" Either pass --eval-set <path> or generate logs first by using selftune-enabled skills.",
|
|
1010
|
+
);
|
|
1011
|
+
console.error(` Expected logs at: ${SKILL_LOG} and ${QUERY_LOG}`);
|
|
1012
|
+
process.exit(1);
|
|
1013
|
+
}
|
|
1014
|
+
}
|
|
1015
|
+
|
|
724
1016
|
const tokenEfficiencyEnabled = values["token-efficiency"] ?? false;
|
|
725
1017
|
let telemetryRecords: SessionTelemetryRecord[] | undefined;
|
|
726
|
-
if (tokenEfficiencyEnabled) {
|
|
1018
|
+
if (tokenEfficiencyEnabled && !(values["sync-first"] ?? false)) {
|
|
727
1019
|
telemetryRecords = readJsonl<SessionTelemetryRecord>(TELEMETRY_LOG);
|
|
728
1020
|
}
|
|
1021
|
+
const gradingResults = readGradingResultsForSkill(values.skill);
|
|
1022
|
+
|
|
1023
|
+
if (values.verbose) {
|
|
1024
|
+
console.error("[verbose] Pre-flight checks passed");
|
|
1025
|
+
console.error(`[verbose] Skill: ${values.skill}`);
|
|
1026
|
+
console.error(`[verbose] Skill path: ${skillPath}`);
|
|
1027
|
+
console.error(`[verbose] Agent: ${agent}`);
|
|
1028
|
+
console.error(`[verbose] Eval set: ${evalSetPath ?? "(auto-generated from logs)"}`);
|
|
1029
|
+
console.error(`[verbose] Loaded grading results: ${gradingResults.length}`);
|
|
1030
|
+
console.error(`[verbose] Cheap loop: ${values["cheap-loop"] ?? false}`);
|
|
1031
|
+
console.error(`[verbose] Dry run: ${values["dry-run"] ?? false}`);
|
|
1032
|
+
console.error(`[verbose] Sync first: ${values["sync-first"] ?? false}`);
|
|
1033
|
+
console.error(`[verbose] Sync force: ${values["sync-force"] ?? false}`);
|
|
1034
|
+
}
|
|
729
1035
|
|
|
730
1036
|
const result = await evolve({
|
|
731
1037
|
skillName: values.skill,
|
|
@@ -741,9 +1047,12 @@ Options:
|
|
|
741
1047
|
telemetryRecords,
|
|
742
1048
|
withBaseline: values["with-baseline"] ?? false,
|
|
743
1049
|
validationModel: values["validation-model"],
|
|
744
|
-
cheapLoop: values["cheap-loop"] ?? false,
|
|
1050
|
+
cheapLoop: (values["cheap-loop"] ?? true) && !(values["full-model"] ?? false),
|
|
745
1051
|
gateModel: values["gate-model"],
|
|
746
1052
|
proposalModel: values["proposal-model"],
|
|
1053
|
+
gradingResults,
|
|
1054
|
+
syncFirst: values["sync-first"] ?? false,
|
|
1055
|
+
syncForce: values["sync-force"] ?? false,
|
|
747
1056
|
});
|
|
748
1057
|
|
|
749
1058
|
if (values.verbose) {
|
|
@@ -769,12 +1078,49 @@ Options:
|
|
|
769
1078
|
};
|
|
770
1079
|
console.log(JSON.stringify(summary, null, 2));
|
|
771
1080
|
}
|
|
1081
|
+
|
|
1082
|
+
// Print human-readable status to stderr so users always see outcome
|
|
1083
|
+
if (!result.deployed) {
|
|
1084
|
+
console.error(`\n[NOT DEPLOYED] ${result.reason}`);
|
|
1085
|
+
if (result.validation && !result.validation.improved) {
|
|
1086
|
+
console.error(
|
|
1087
|
+
` Pass rate: ${(result.validation.before_pass_rate * 100).toFixed(1)}% -> ${(result.validation.after_pass_rate * 100).toFixed(1)}% (net: ${result.validation.net_change >= 0 ? "+" : ""}${(result.validation.net_change * 100).toFixed(1)}%)`,
|
|
1088
|
+
);
|
|
1089
|
+
if (result.validation.regressions.length > 0) {
|
|
1090
|
+
console.error(` Regressions: ${result.validation.regressions.length} entries`);
|
|
1091
|
+
}
|
|
1092
|
+
}
|
|
1093
|
+
if (
|
|
1094
|
+
result.proposal &&
|
|
1095
|
+
result.proposal.confidence < Number.parseFloat(values.confidence ?? "0.6")
|
|
1096
|
+
) {
|
|
1097
|
+
console.error(
|
|
1098
|
+
` Confidence ${result.proposal.confidence.toFixed(2)} below threshold ${values.confidence ?? "0.6"}`,
|
|
1099
|
+
);
|
|
1100
|
+
}
|
|
1101
|
+
console.error(" Re-run with --verbose for full diagnostic output.");
|
|
1102
|
+
} else {
|
|
1103
|
+
console.error(`\n[DEPLOYED] ${result.reason}`);
|
|
1104
|
+
}
|
|
1105
|
+
|
|
772
1106
|
process.exit(result.deployed ? 0 : 1);
|
|
773
1107
|
}
|
|
774
1108
|
|
|
775
1109
|
if (import.meta.main) {
|
|
776
1110
|
cliMain().catch((err) => {
|
|
777
|
-
|
|
1111
|
+
const message = err instanceof Error ? err.message : String(err);
|
|
1112
|
+
const stack = err instanceof Error ? err.stack : undefined;
|
|
1113
|
+
console.error(`[FATAL] ${message}`);
|
|
1114
|
+
if (stack && process.env.SELFTUNE_VERBOSE === "1") {
|
|
1115
|
+
console.error(stack);
|
|
1116
|
+
}
|
|
1117
|
+
console.error(
|
|
1118
|
+
"\nTroubleshooting:\n" +
|
|
1119
|
+
" - Verify --skill-path points to a valid SKILL.md file\n" +
|
|
1120
|
+
" - Ensure eval data exists (run `selftune evals` first) or pass --eval-set\n" +
|
|
1121
|
+
" - Check that ANTHROPIC_API_KEY is set if using Claude\n" +
|
|
1122
|
+
" - Re-run with --verbose for full diagnostic output",
|
|
1123
|
+
);
|
|
778
1124
|
process.exit(1);
|
|
779
1125
|
});
|
|
780
1126
|
}
|