selftune 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/agents/diagnosis-analyst.md +20 -10
- package/.claude/agents/evolution-reviewer.md +14 -1
- package/.claude/agents/integration-guide.md +18 -6
- package/.claude/agents/pattern-analyst.md +18 -5
- package/CHANGELOG.md +12 -4
- package/README.md +43 -35
- package/apps/local-dashboard/dist/assets/geist-cyrillic-wght-normal-CHSlOQsW.woff2 +0 -0
- package/apps/local-dashboard/dist/assets/geist-latin-ext-wght-normal-DMtmJ5ZE.woff2 +0 -0
- package/apps/local-dashboard/dist/assets/geist-latin-wght-normal-Dm3htQBi.woff2 +0 -0
- package/apps/local-dashboard/dist/assets/index-C4EOTFZ2.js +15 -0
- package/apps/local-dashboard/dist/assets/index-bl-Webyd.css +1 -0
- package/apps/local-dashboard/dist/assets/vendor-react-U7zYD9Rg.js +60 -0
- package/apps/local-dashboard/dist/assets/vendor-table-B7VF2Ipl.js +26 -0
- package/apps/local-dashboard/dist/assets/vendor-ui-D7_zX_qy.js +346 -0
- package/apps/local-dashboard/dist/favicon.png +0 -0
- package/apps/local-dashboard/dist/index.html +17 -0
- package/apps/local-dashboard/dist/logo.png +0 -0
- package/apps/local-dashboard/dist/logo.svg +9 -0
- package/cli/selftune/badge/badge-data.ts +1 -1
- package/cli/selftune/badge/badge.ts +4 -8
- package/cli/selftune/canonical-export.ts +183 -0
- package/cli/selftune/constants.ts +28 -0
- package/cli/selftune/contribute/contribute.ts +1 -1
- package/cli/selftune/cron/setup.ts +17 -17
- package/cli/selftune/dashboard-contract.ts +202 -0
- package/cli/selftune/dashboard-server.ts +653 -186
- package/cli/selftune/dashboard.ts +41 -176
- package/cli/selftune/eval/baseline.ts +5 -4
- package/cli/selftune/eval/composability-v2.ts +273 -0
- package/cli/selftune/eval/hooks-to-evals.ts +34 -15
- package/cli/selftune/eval/unit-test-cli.ts +1 -1
- package/cli/selftune/evolution/evidence.ts +26 -0
- package/cli/selftune/evolution/evolve-body.ts +105 -11
- package/cli/selftune/evolution/evolve.ts +371 -25
- package/cli/selftune/evolution/extract-patterns.ts +87 -29
- package/cli/selftune/evolution/rollback.ts +2 -2
- package/cli/selftune/grading/auto-grade.ts +200 -0
- package/cli/selftune/grading/grade-session.ts +448 -97
- package/cli/selftune/grading/results.ts +42 -0
- package/cli/selftune/hooks/prompt-log.ts +172 -2
- package/cli/selftune/hooks/session-stop.ts +123 -3
- package/cli/selftune/hooks/skill-eval.ts +119 -3
- package/cli/selftune/index.ts +395 -116
- package/cli/selftune/ingestors/claude-replay.ts +140 -114
- package/cli/selftune/ingestors/codex-rollout.ts +345 -46
- package/cli/selftune/ingestors/codex-wrapper.ts +207 -39
- package/cli/selftune/ingestors/openclaw-ingest.ts +141 -8
- package/cli/selftune/ingestors/opencode-ingest.ts +193 -17
- package/cli/selftune/init.ts +227 -14
- package/cli/selftune/last.ts +14 -5
- package/cli/selftune/localdb/db.ts +63 -0
- package/cli/selftune/localdb/materialize.ts +428 -0
- package/cli/selftune/localdb/queries.ts +376 -0
- package/cli/selftune/localdb/schema.ts +204 -0
- package/cli/selftune/monitoring/watch.ts +66 -15
- package/cli/selftune/normalization.ts +682 -0
- package/cli/selftune/observability.ts +19 -44
- package/cli/selftune/orchestrate.ts +1073 -0
- package/cli/selftune/quickstart.ts +203 -0
- package/cli/selftune/repair/skill-usage.ts +576 -0
- package/cli/selftune/schedule.ts +561 -0
- package/cli/selftune/status.ts +48 -26
- package/cli/selftune/sync.ts +627 -0
- package/cli/selftune/types.ts +148 -0
- package/cli/selftune/utils/canonical-log.ts +45 -0
- package/cli/selftune/utils/hooks.ts +41 -0
- package/cli/selftune/utils/html.ts +27 -0
- package/cli/selftune/utils/llm-call.ts +78 -20
- package/cli/selftune/utils/math.ts +10 -0
- package/cli/selftune/utils/query-filter.ts +139 -0
- package/cli/selftune/utils/skill-discovery.ts +340 -0
- package/cli/selftune/utils/skill-log.ts +68 -0
- package/cli/selftune/utils/skill-usage-confidence.ts +18 -0
- package/cli/selftune/utils/transcript.ts +272 -26
- package/cli/selftune/workflows/discover.ts +254 -0
- package/cli/selftune/workflows/skill-md-writer.ts +288 -0
- package/cli/selftune/workflows/workflows.ts +188 -0
- package/package.json +21 -8
- package/packages/telemetry-contract/README.md +11 -0
- package/packages/telemetry-contract/fixtures/golden.json +87 -0
- package/packages/telemetry-contract/fixtures/golden.test.ts +42 -0
- package/packages/telemetry-contract/index.ts +1 -0
- package/packages/telemetry-contract/package.json +19 -0
- package/packages/telemetry-contract/src/index.ts +2 -0
- package/packages/telemetry-contract/src/types.ts +163 -0
- package/packages/telemetry-contract/src/validators.ts +109 -0
- package/skill/SKILL.md +84 -53
- package/skill/Workflows/AutoActivation.md +17 -16
- package/skill/Workflows/Badge.md +6 -0
- package/skill/Workflows/Baseline.md +46 -23
- package/skill/Workflows/Composability.md +12 -5
- package/skill/Workflows/Contribute.md +17 -14
- package/skill/Workflows/Cron.md +56 -79
- package/skill/Workflows/Dashboard.md +45 -34
- package/skill/Workflows/Doctor.md +30 -17
- package/skill/Workflows/Evals.md +64 -40
- package/skill/Workflows/EvolutionMemory.md +2 -0
- package/skill/Workflows/Evolve.md +102 -47
- package/skill/Workflows/EvolveBody.md +6 -6
- package/skill/Workflows/Grade.md +36 -31
- package/skill/Workflows/ImportSkillsBench.md +11 -5
- package/skill/Workflows/Ingest.md +43 -36
- package/skill/Workflows/Initialize.md +44 -30
- package/skill/Workflows/Orchestrate.md +139 -0
- package/skill/Workflows/Replay.md +39 -18
- package/skill/Workflows/Rollback.md +3 -3
- package/skill/Workflows/Schedule.md +61 -0
- package/skill/Workflows/Sync.md +88 -0
- package/skill/Workflows/UnitTest.md +34 -22
- package/skill/Workflows/Watch.md +14 -4
- package/skill/Workflows/Workflows.md +129 -0
- package/skill/assets/activation-rules-default.json +26 -0
- package/skill/assets/multi-skill-settings.json +63 -0
- package/skill/assets/single-skill-settings.json +57 -0
- package/skill/references/invocation-taxonomy.md +2 -2
- package/skill/references/logs.md +164 -2
- package/skill/references/setup-patterns.md +65 -0
- package/skill/references/version-history.md +40 -0
- package/skill/settings_snippet.json +1 -1
- package/templates/multi-skill-settings.json +7 -7
- package/templates/single-skill-settings.json +6 -6
- package/dashboard/index.html +0 -1680
|
@@ -27,7 +27,13 @@ import type {
|
|
|
27
27
|
} from "../types.js";
|
|
28
28
|
import { readJsonl } from "../utils/jsonl.js";
|
|
29
29
|
import { detectAgent } from "../utils/llm-call.js";
|
|
30
|
+
import {
|
|
31
|
+
filterActionableQueryRecords,
|
|
32
|
+
filterActionableSkillUsageRecords,
|
|
33
|
+
} from "../utils/query-filter.js";
|
|
30
34
|
import { seededShuffle } from "../utils/seeded-random.js";
|
|
35
|
+
import { readEffectiveSkillUsageRecords } from "../utils/skill-log.js";
|
|
36
|
+
import { isHighConfidencePositiveSkillRecord } from "../utils/skill-usage-confidence.js";
|
|
31
37
|
import { generateSyntheticEvals } from "./synthetic-evals.js";
|
|
32
38
|
|
|
33
39
|
// ---------------------------------------------------------------------------
|
|
@@ -116,14 +122,16 @@ export function buildEvalSet(
|
|
|
116
122
|
seed = 42,
|
|
117
123
|
annotateTaxonomy = true,
|
|
118
124
|
): EvalEntry[] {
|
|
125
|
+
const actionableSkillRecords = filterActionableSkillUsageRecords(skillRecords);
|
|
126
|
+
const actionableQueryRecords = filterActionableQueryRecords(queryRecords);
|
|
119
127
|
const effectiveMaxPerSide = Number.isNaN(maxPerSide) || maxPerSide <= 0 ? 50 : maxPerSide;
|
|
120
128
|
const effectiveSeed = Number.isNaN(seed) ? 42 : seed;
|
|
121
129
|
|
|
122
130
|
// Build set of positive query texts (for exclusion from negatives)
|
|
123
131
|
const positiveQueries = new Set<string>();
|
|
124
|
-
for (const r of
|
|
132
|
+
for (const r of actionableSkillRecords) {
|
|
125
133
|
if (!r || typeof r.skill_name !== "string" || typeof r.query !== "string") continue;
|
|
126
|
-
if (r
|
|
134
|
+
if (isHighConfidencePositiveSkillRecord(r, skillName)) {
|
|
127
135
|
const q = (r.query ?? "").trim();
|
|
128
136
|
if (q && q !== "(query not found)") {
|
|
129
137
|
positiveQueries.add(q);
|
|
@@ -134,9 +142,9 @@ export function buildEvalSet(
|
|
|
134
142
|
// Build deduplicated positives with taxonomy classification
|
|
135
143
|
const seen = new Set<string>();
|
|
136
144
|
const positives: EvalEntry[] = [];
|
|
137
|
-
for (const r of
|
|
145
|
+
for (const r of actionableSkillRecords) {
|
|
138
146
|
if (!r || typeof r.skill_name !== "string" || typeof r.query !== "string") continue;
|
|
139
|
-
if (r
|
|
147
|
+
if (!isHighConfidencePositiveSkillRecord(r, skillName)) continue;
|
|
140
148
|
const q = (r.query ?? "").trim();
|
|
141
149
|
if (!q || q === "(query not found)" || seen.has(q)) continue;
|
|
142
150
|
seen.add(q);
|
|
@@ -153,7 +161,7 @@ export function buildEvalSet(
|
|
|
153
161
|
if (includeNegatives) {
|
|
154
162
|
const negCandidates: string[] = [];
|
|
155
163
|
const negSeen = new Set<string>();
|
|
156
|
-
for (const r of
|
|
164
|
+
for (const r of actionableQueryRecords) {
|
|
157
165
|
if (!r || typeof r.query !== "string") continue;
|
|
158
166
|
const q = (r.query ?? "").trim();
|
|
159
167
|
if (!q || positiveQueries.has(q) || negSeen.has(q)) continue;
|
|
@@ -198,13 +206,17 @@ export function listSkills(
|
|
|
198
206
|
queryRecords: QueryLogRecord[],
|
|
199
207
|
telemetryRecords: SessionTelemetryRecord[],
|
|
200
208
|
): void {
|
|
209
|
+
const actionableSkillRecords = filterActionableSkillUsageRecords(skillRecords);
|
|
210
|
+
const actionableQueryRecords = filterActionableQueryRecords(queryRecords);
|
|
201
211
|
const counts = new Map<string, number>();
|
|
202
|
-
for (const r of
|
|
212
|
+
for (const r of actionableSkillRecords) {
|
|
203
213
|
const name = r.skill_name ?? "unknown";
|
|
204
214
|
counts.set(name, (counts.get(name) ?? 0) + 1);
|
|
205
215
|
}
|
|
206
216
|
|
|
207
|
-
console.log(
|
|
217
|
+
console.log(
|
|
218
|
+
`Skill triggers in skill_usage_log (${actionableSkillRecords.length} actionable records):`,
|
|
219
|
+
);
|
|
208
220
|
if (counts.size > 0) {
|
|
209
221
|
const sorted = [...counts.entries()].sort((a, b) => b[1] - a[1]);
|
|
210
222
|
for (const [name, count] of sorted) {
|
|
@@ -214,8 +226,8 @@ export function listSkills(
|
|
|
214
226
|
console.log(" (none yet -- trigger some skills in Claude Code to populate)");
|
|
215
227
|
}
|
|
216
228
|
|
|
217
|
-
console.log(`\
|
|
218
|
-
if (
|
|
229
|
+
console.log(`\nActionable queries in all_queries_log: ${actionableQueryRecords.length}`);
|
|
230
|
+
if (actionableQueryRecords.length === 0) {
|
|
219
231
|
console.log(" (none yet -- make sure prompt_log_hook is installed)");
|
|
220
232
|
}
|
|
221
233
|
|
|
@@ -303,14 +315,16 @@ export function printEvalStats(
|
|
|
303
315
|
): void {
|
|
304
316
|
const pos = evalSet.filter((e) => e.should_trigger);
|
|
305
317
|
const neg = evalSet.filter((e) => !e.should_trigger);
|
|
306
|
-
const
|
|
318
|
+
const actionableSkillRecords = filterActionableSkillUsageRecords(skillRecords);
|
|
319
|
+
const actionableQueryRecords = filterActionableQueryRecords(queryRecords);
|
|
320
|
+
const totalTriggers = actionableSkillRecords.filter((r) => r.skill_name === skillName).length;
|
|
307
321
|
|
|
308
322
|
console.log(`Wrote ${evalSet.length} eval entries to ${outputPath}`);
|
|
309
323
|
console.log(
|
|
310
324
|
` Positives (should_trigger=true) : ${pos.length} (from ${totalTriggers} logged triggers)`,
|
|
311
325
|
);
|
|
312
326
|
console.log(
|
|
313
|
-
` Negatives (should_trigger=false): ${neg.length} (from ${
|
|
327
|
+
` Negatives (should_trigger=false): ${neg.length} (from ${actionableQueryRecords.length} actionable logged queries)`,
|
|
314
328
|
);
|
|
315
329
|
|
|
316
330
|
if (annotateTaxonomy && pos.length > 0) {
|
|
@@ -336,7 +350,7 @@ export function printEvalStats(
|
|
|
336
350
|
console.log();
|
|
337
351
|
if (pos.length === 0) {
|
|
338
352
|
console.log(`[WARN] No positives for skill '${skillName}'.`);
|
|
339
|
-
const names = [...new Set(
|
|
353
|
+
const names = [...new Set(actionableSkillRecords.map((r) => r.skill_name))].sort();
|
|
340
354
|
if (names.length > 0) {
|
|
341
355
|
console.log(` Known skills: ${names.join(", ")}`);
|
|
342
356
|
}
|
|
@@ -366,6 +380,7 @@ export async function cliMain(): Promise<void> {
|
|
|
366
380
|
options: {
|
|
367
381
|
skill: { type: "string" },
|
|
368
382
|
output: { type: "string" },
|
|
383
|
+
out: { type: "string" },
|
|
369
384
|
max: { type: "string", default: "50" },
|
|
370
385
|
seed: { type: "string", default: "42" },
|
|
371
386
|
"list-skills": { type: "boolean", default: false },
|
|
@@ -409,7 +424,7 @@ export async function cliMain(): Promise<void> {
|
|
|
409
424
|
modelFlag: values.model,
|
|
410
425
|
});
|
|
411
426
|
|
|
412
|
-
const outputPath = values.output ?? `${values.skill}_trigger_eval.json`;
|
|
427
|
+
const outputPath = values.output ?? values.out ?? `${values.skill}_trigger_eval.json`;
|
|
413
428
|
writeFileSync(outputPath, JSON.stringify(evalSet, null, 2), "utf-8");
|
|
414
429
|
|
|
415
430
|
const pos = evalSet.filter((e) => e.should_trigger);
|
|
@@ -440,7 +455,11 @@ export async function cliMain(): Promise<void> {
|
|
|
440
455
|
}
|
|
441
456
|
|
|
442
457
|
// --- Log-based mode (original behavior) ---
|
|
443
|
-
const
|
|
458
|
+
const skillLogPath = values["skill-log"] ?? SKILL_LOG;
|
|
459
|
+
const skillRecords =
|
|
460
|
+
skillLogPath === SKILL_LOG
|
|
461
|
+
? readEffectiveSkillUsageRecords()
|
|
462
|
+
: readJsonl<SkillUsageRecord>(skillLogPath);
|
|
444
463
|
const queryRecords = readJsonl<QueryLogRecord>(values["query-log"] ?? QUERY_LOG);
|
|
445
464
|
const telemetryRecords = readJsonl<SessionTelemetryRecord>(
|
|
446
465
|
values["telemetry-log"] ?? TELEMETRY_LOG,
|
|
@@ -475,7 +494,7 @@ export async function cliMain(): Promise<void> {
|
|
|
475
494
|
annotateTaxonomy,
|
|
476
495
|
);
|
|
477
496
|
|
|
478
|
-
const outputPath = values.output ?? `${values.skill}_trigger_eval.json`;
|
|
497
|
+
const outputPath = values.output ?? values.out ?? `${values.skill}_trigger_eval.json`;
|
|
479
498
|
writeFileSync(outputPath, JSON.stringify(evalSet, null, 2), "utf-8");
|
|
480
499
|
printEvalStats(evalSet, values.skill, outputPath, skillRecords, queryRecords, annotateTaxonomy);
|
|
481
500
|
}
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
* CLI entrypoint for skill unit tests.
|
|
3
3
|
*
|
|
4
4
|
* Usage:
|
|
5
|
-
* selftune unit-test --skill <name> --tests <path> [--run-agent] [--generate]
|
|
5
|
+
* selftune eval unit-test --skill <name> --tests <path> [--run-agent] [--generate]
|
|
6
6
|
*
|
|
7
7
|
* --skill <name> Skill name (required)
|
|
8
8
|
* --tests <path> Path to unit test JSON file (default: ~/.selftune/unit-tests/<skill>.json)
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Evolution evidence trail: append and read proposal/eval artifacts that power
|
|
3
|
+
* explainable dashboard drill-downs.
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import { EVOLUTION_EVIDENCE_LOG } from "../constants.js";
|
|
7
|
+
import type { EvolutionEvidenceEntry } from "../types.js";
|
|
8
|
+
import { appendJsonl, readJsonl } from "../utils/jsonl.js";
|
|
9
|
+
|
|
10
|
+
/** Append a structured evidence artifact to the evolution evidence log. */
|
|
11
|
+
export function appendEvidenceEntry(
|
|
12
|
+
entry: EvolutionEvidenceEntry,
|
|
13
|
+
logPath: string = EVOLUTION_EVIDENCE_LOG,
|
|
14
|
+
): void {
|
|
15
|
+
appendJsonl(logPath, entry);
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
/** Read all evidence entries, optionally filtered by exact skill name. */
|
|
19
|
+
export function readEvidenceTrail(
|
|
20
|
+
skillName?: string,
|
|
21
|
+
logPath: string = EVOLUTION_EVIDENCE_LOG,
|
|
22
|
+
): EvolutionEvidenceEntry[] {
|
|
23
|
+
const entries = readJsonl<EvolutionEvidenceEntry>(logPath);
|
|
24
|
+
if (!skillName) return entries;
|
|
25
|
+
return entries.filter((entry) => entry.skill_name === skillName);
|
|
26
|
+
}
|
|
@@ -9,13 +9,15 @@
|
|
|
9
9
|
import { existsSync, readFileSync } from "node:fs";
|
|
10
10
|
import { parseArgs } from "node:util";
|
|
11
11
|
|
|
12
|
-
import { QUERY_LOG
|
|
12
|
+
import { QUERY_LOG } from "../constants.js";
|
|
13
13
|
import { buildEvalSet } from "../eval/hooks-to-evals.js";
|
|
14
|
+
import { readGradingResultsForSkill } from "../grading/results.js";
|
|
14
15
|
import type {
|
|
15
16
|
BodyEvolutionProposal,
|
|
16
17
|
BodyValidationResult,
|
|
17
18
|
EvalEntry,
|
|
18
19
|
EvolutionAuditEntry,
|
|
20
|
+
EvolutionEvidenceEntry,
|
|
19
21
|
EvolutionTarget,
|
|
20
22
|
FailurePattern,
|
|
21
23
|
GradingResult,
|
|
@@ -23,8 +25,10 @@ import type {
|
|
|
23
25
|
SkillUsageRecord,
|
|
24
26
|
} from "../types.js";
|
|
25
27
|
import { readJsonl } from "../utils/jsonl.js";
|
|
28
|
+
import { readEffectiveSkillUsageRecords } from "../utils/skill-log.js";
|
|
26
29
|
import { appendAuditEntry } from "./audit.js";
|
|
27
30
|
import { parseSkillSections, replaceBody, replaceSection } from "./deploy-proposal.js";
|
|
31
|
+
import { appendEvidenceEntry } from "./evidence.js";
|
|
28
32
|
import { extractFailurePatterns } from "./extract-patterns.js";
|
|
29
33
|
import { generateBodyProposal } from "./propose-body.js";
|
|
30
34
|
import { generateRoutingProposal } from "./propose-routing.js";
|
|
@@ -79,7 +83,9 @@ export interface EvolveBodyDeps {
|
|
|
79
83
|
validateRoutingProposal?: typeof import("./validate-routing.js").validateRoutingProposal;
|
|
80
84
|
refineBodyProposal?: typeof import("./refine-body.js").refineBodyProposal;
|
|
81
85
|
appendAuditEntry?: typeof import("./audit.js").appendAuditEntry;
|
|
86
|
+
appendEvidenceEntry?: typeof import("./evidence.js").appendEvidenceEntry;
|
|
82
87
|
buildEvalSet?: typeof import("../eval/hooks-to-evals.js").buildEvalSet;
|
|
88
|
+
readEffectiveSkillUsageRecords?: typeof import("../utils/skill-log.js").readEffectiveSkillUsageRecords;
|
|
83
89
|
readFileSync?: typeof readFileSync;
|
|
84
90
|
writeFileSync?: (path: string, data: string, encoding: string) => void;
|
|
85
91
|
}
|
|
@@ -134,7 +140,10 @@ export async function evolveBody(
|
|
|
134
140
|
const _validateRoutingProposal = _deps.validateRoutingProposal ?? validateRoutingProposal;
|
|
135
141
|
const _refineBodyProposal = _deps.refineBodyProposal ?? refineBodyProposal;
|
|
136
142
|
const _appendAuditEntry = _deps.appendAuditEntry ?? appendAuditEntry;
|
|
143
|
+
const _appendEvidenceEntry = _deps.appendEvidenceEntry ?? appendEvidenceEntry;
|
|
137
144
|
const _buildEvalSet = _deps.buildEvalSet ?? buildEvalSet;
|
|
145
|
+
const _readEffectiveSkillUsageRecords =
|
|
146
|
+
_deps.readEffectiveSkillUsageRecords ?? readEffectiveSkillUsageRecords;
|
|
138
147
|
const _readFileSync = _deps.readFileSync ?? readFileSync;
|
|
139
148
|
const _writeFileSync = _deps.writeFileSync ?? (await import("node:fs")).writeFileSync;
|
|
140
149
|
|
|
@@ -154,6 +163,14 @@ export async function evolveBody(
|
|
|
154
163
|
}
|
|
155
164
|
}
|
|
156
165
|
|
|
166
|
+
function recordEvidence(entry: EvolutionEvidenceEntry): void {
|
|
167
|
+
try {
|
|
168
|
+
_appendEvidenceEntry(entry);
|
|
169
|
+
} catch {
|
|
170
|
+
// Fail-open
|
|
171
|
+
}
|
|
172
|
+
}
|
|
173
|
+
|
|
157
174
|
try {
|
|
158
175
|
// Step 1: Read current SKILL.md
|
|
159
176
|
if (!existsSync(skillPath)) {
|
|
@@ -168,6 +185,8 @@ export async function evolveBody(
|
|
|
168
185
|
|
|
169
186
|
const currentContent = _readFileSync(skillPath, "utf-8");
|
|
170
187
|
const parsed = parseSkillSections(currentContent);
|
|
188
|
+
const createdAuditDetails = (): string => `original_description:${currentContent}`;
|
|
189
|
+
const skillUsage = _readEffectiveSkillUsageRecords();
|
|
171
190
|
|
|
172
191
|
// Step 2: Load eval set
|
|
173
192
|
let evalSet: EvalEntry[];
|
|
@@ -179,13 +198,11 @@ export async function evolveBody(
|
|
|
179
198
|
}
|
|
180
199
|
evalSet = parsed as EvalEntry[];
|
|
181
200
|
} else {
|
|
182
|
-
const skillRecords = readJsonl<SkillUsageRecord>(SKILL_LOG);
|
|
183
201
|
const queryRecords = readJsonl<QueryLogRecord>(QUERY_LOG);
|
|
184
|
-
evalSet = _buildEvalSet(
|
|
202
|
+
evalSet = _buildEvalSet(skillUsage, queryRecords, skillName);
|
|
185
203
|
}
|
|
186
204
|
|
|
187
205
|
// Step 3: Load skill usage and extract failure patterns
|
|
188
|
-
const skillUsage = readJsonl<SkillUsageRecord>(SKILL_LOG);
|
|
189
206
|
const failurePatterns = _extractFailurePatterns(
|
|
190
207
|
evalSet,
|
|
191
208
|
skillUsage,
|
|
@@ -252,11 +269,21 @@ export async function evolveBody(
|
|
|
252
269
|
|
|
253
270
|
lastProposal = proposal;
|
|
254
271
|
|
|
255
|
-
recordAudit(
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
272
|
+
recordAudit(proposal.proposal_id, "created", createdAuditDetails());
|
|
273
|
+
recordEvidence({
|
|
274
|
+
timestamp: new Date().toISOString(),
|
|
275
|
+
proposal_id: proposal.proposal_id,
|
|
276
|
+
skill_name: skillName,
|
|
277
|
+
skill_path: skillPath,
|
|
278
|
+
target,
|
|
279
|
+
stage: "created",
|
|
280
|
+
rationale: proposal.rationale,
|
|
281
|
+
confidence: proposal.confidence,
|
|
282
|
+
details: `${target} proposal created for ${skillName} (iteration ${iteration + 1})`,
|
|
283
|
+
original_text: proposal.original_body,
|
|
284
|
+
proposed_text: proposal.proposed_body,
|
|
285
|
+
eval_set: evalSet,
|
|
286
|
+
});
|
|
260
287
|
|
|
261
288
|
// Check confidence threshold
|
|
262
289
|
if (proposal.confidence < confidenceThreshold) {
|
|
@@ -265,6 +292,17 @@ export async function evolveBody(
|
|
|
265
292
|
"rejected",
|
|
266
293
|
`Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
|
|
267
294
|
);
|
|
295
|
+
recordEvidence({
|
|
296
|
+
timestamp: new Date().toISOString(),
|
|
297
|
+
proposal_id: proposal.proposal_id,
|
|
298
|
+
skill_name: skillName,
|
|
299
|
+
skill_path: skillPath,
|
|
300
|
+
target,
|
|
301
|
+
stage: "rejected",
|
|
302
|
+
rationale: proposal.rationale,
|
|
303
|
+
confidence: proposal.confidence,
|
|
304
|
+
details: `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
|
|
305
|
+
});
|
|
268
306
|
|
|
269
307
|
if (iteration === maxIterations - 1) {
|
|
270
308
|
return {
|
|
@@ -303,6 +341,24 @@ export async function evolveBody(
|
|
|
303
341
|
"validated",
|
|
304
342
|
`Validation: ${validation.gates_passed}/${validation.gates_total} gates passed`,
|
|
305
343
|
);
|
|
344
|
+
recordEvidence({
|
|
345
|
+
timestamp: new Date().toISOString(),
|
|
346
|
+
proposal_id: proposal.proposal_id,
|
|
347
|
+
skill_name: skillName,
|
|
348
|
+
skill_path: skillPath,
|
|
349
|
+
target,
|
|
350
|
+
stage: "validated",
|
|
351
|
+
rationale: proposal.rationale,
|
|
352
|
+
confidence: proposal.confidence,
|
|
353
|
+
details: `Validation: ${validation.gates_passed}/${validation.gates_total} gates passed`,
|
|
354
|
+
validation: {
|
|
355
|
+
improved: validation.improved,
|
|
356
|
+
gates_passed: validation.gates_passed,
|
|
357
|
+
gates_total: validation.gates_total,
|
|
358
|
+
gate_results: validation.gate_results,
|
|
359
|
+
regressions: validation.regressions,
|
|
360
|
+
},
|
|
361
|
+
});
|
|
306
362
|
|
|
307
363
|
if (validation.improved) {
|
|
308
364
|
break;
|
|
@@ -313,6 +369,24 @@ export async function evolveBody(
|
|
|
313
369
|
"rejected",
|
|
314
370
|
`Validation failed: ${validation.gates_passed}/${validation.gates_total} gates`,
|
|
315
371
|
);
|
|
372
|
+
recordEvidence({
|
|
373
|
+
timestamp: new Date().toISOString(),
|
|
374
|
+
proposal_id: proposal.proposal_id,
|
|
375
|
+
skill_name: skillName,
|
|
376
|
+
skill_path: skillPath,
|
|
377
|
+
target,
|
|
378
|
+
stage: "rejected",
|
|
379
|
+
rationale: proposal.rationale,
|
|
380
|
+
confidence: proposal.confidence,
|
|
381
|
+
details: `Validation failed: ${validation.gates_passed}/${validation.gates_total} gates`,
|
|
382
|
+
validation: {
|
|
383
|
+
improved: validation.improved,
|
|
384
|
+
gates_passed: validation.gates_passed,
|
|
385
|
+
gates_total: validation.gates_total,
|
|
386
|
+
gate_results: validation.gate_results,
|
|
387
|
+
regressions: validation.regressions,
|
|
388
|
+
},
|
|
389
|
+
});
|
|
316
390
|
|
|
317
391
|
if (iteration === maxIterations - 1) {
|
|
318
392
|
return {
|
|
@@ -355,6 +429,24 @@ export async function evolveBody(
|
|
|
355
429
|
"deployed",
|
|
356
430
|
`Deployed ${target} proposal for ${skillName}`,
|
|
357
431
|
);
|
|
432
|
+
recordEvidence({
|
|
433
|
+
timestamp: new Date().toISOString(),
|
|
434
|
+
proposal_id: lastProposal.proposal_id,
|
|
435
|
+
skill_name: skillName,
|
|
436
|
+
skill_path: skillPath,
|
|
437
|
+
target,
|
|
438
|
+
stage: "deployed",
|
|
439
|
+
rationale: lastProposal.rationale,
|
|
440
|
+
confidence: lastProposal.confidence,
|
|
441
|
+
details: `Deployed ${target} proposal for ${skillName}`,
|
|
442
|
+
validation: {
|
|
443
|
+
improved: lastValidation.improved,
|
|
444
|
+
gates_passed: lastValidation.gates_passed,
|
|
445
|
+
gates_total: lastValidation.gates_total,
|
|
446
|
+
gate_results: lastValidation.gate_results,
|
|
447
|
+
regressions: lastValidation.regressions,
|
|
448
|
+
},
|
|
449
|
+
});
|
|
358
450
|
|
|
359
451
|
return {
|
|
360
452
|
proposal: lastProposal,
|
|
@@ -411,10 +503,10 @@ export async function cliMain(): Promise<void> {
|
|
|
411
503
|
});
|
|
412
504
|
|
|
413
505
|
if (values.help) {
|
|
414
|
-
console.log(`selftune evolve
|
|
506
|
+
console.log(`selftune evolve body — Evolve a skill body or routing table
|
|
415
507
|
|
|
416
508
|
Usage:
|
|
417
|
-
selftune evolve
|
|
509
|
+
selftune evolve body --skill <name> --skill-path <path> [options]
|
|
418
510
|
|
|
419
511
|
Options:
|
|
420
512
|
--skill Skill name (required)
|
|
@@ -462,6 +554,7 @@ Options:
|
|
|
462
554
|
const paths = values["few-shot"].split(",").map((p) => p.trim());
|
|
463
555
|
fewShotExamples = paths.filter((p) => existsSync(p)).map((p) => readFileSync(p, "utf-8"));
|
|
464
556
|
}
|
|
557
|
+
const gradingResults = readGradingResultsForSkill(values.skill);
|
|
465
558
|
|
|
466
559
|
const result = await evolveBody({
|
|
467
560
|
skillName: values.skill,
|
|
@@ -477,6 +570,7 @@ Options:
|
|
|
477
570
|
confidenceThreshold: Number.parseFloat(values.confidence ?? "0.6"),
|
|
478
571
|
taskDescription: values["task-description"],
|
|
479
572
|
fewShotExamples,
|
|
573
|
+
gradingResults,
|
|
480
574
|
validationModel: values["validation-model"],
|
|
481
575
|
});
|
|
482
576
|
|