selftune 0.2.19 → 0.2.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/apps/local-dashboard/dist/assets/{index-DnhnXQm6.js → index-D8O-RG1I.js} +2 -2
- package/apps/local-dashboard/dist/index.html +1 -1
- package/cli/selftune/dashboard-contract.ts +4 -0
- package/cli/selftune/eval/family-overlap.ts +320 -1
- package/cli/selftune/evolution/evidence.ts +5 -0
- package/cli/selftune/evolution/evolve-body.ts +86 -2
- package/cli/selftune/evolution/evolve.ts +58 -1
- package/cli/selftune/evolution/validate-body.ts +10 -0
- package/cli/selftune/evolution/validate-host-replay.ts +624 -0
- package/cli/selftune/evolution/validate-proposal.ts +10 -0
- package/cli/selftune/evolution/validate-routing.ts +112 -5
- package/cli/selftune/localdb/direct-write.ts +8 -3
- package/cli/selftune/localdb/materialize.ts +7 -2
- package/cli/selftune/localdb/queries.ts +11 -1
- package/cli/selftune/localdb/schema.ts +10 -1
- package/cli/selftune/routes/skill-report.ts +6 -1
- package/cli/selftune/types.ts +54 -0
- package/cli/selftune/utils/text-similarity.ts +73 -0
- package/package.json +1 -1
- package/packages/ui/src/components/EvidenceViewer.tsx +85 -2
- package/packages/ui/src/components/EvolutionTimeline.tsx +23 -1
- package/packages/ui/src/types.ts +4 -0
- package/skill/Workflows/Composability.md +15 -1
- package/skill/Workflows/Evolve.md +39 -0
|
@@ -43,7 +43,7 @@ import { createEvolveTUI } from "../utils/tui.js";
|
|
|
43
43
|
import { appendAuditEntry } from "./audit.js";
|
|
44
44
|
import { checkConstitution } from "./constitutional.js";
|
|
45
45
|
import { scoreDescription } from "./description-quality.js";
|
|
46
|
-
import { appendEvidenceEntry } from "./evidence.js";
|
|
46
|
+
import { appendEvidenceEntry, buildValidationEvidenceRef } from "./evidence.js";
|
|
47
47
|
import { extractFailurePatterns } from "./extract-patterns.js";
|
|
48
48
|
import {
|
|
49
49
|
computeInvocationScores,
|
|
@@ -139,6 +139,10 @@ function createAuditEntry(
|
|
|
139
139
|
evalSnapshot?: EvalPassRate,
|
|
140
140
|
skillName?: string,
|
|
141
141
|
iterationsUsed?: number,
|
|
142
|
+
provenance?: Pick<
|
|
143
|
+
EvolutionAuditEntry,
|
|
144
|
+
"validation_mode" | "validation_agent" | "validation_fixture_id" | "validation_evidence_ref"
|
|
145
|
+
>,
|
|
142
146
|
): EvolutionAuditEntry {
|
|
143
147
|
return {
|
|
144
148
|
timestamp: new Date().toISOString(),
|
|
@@ -148,6 +152,14 @@ function createAuditEntry(
|
|
|
148
152
|
...(skillName ? { skill_name: skillName } : {}),
|
|
149
153
|
...(evalSnapshot ? { eval_snapshot: evalSnapshot } : {}),
|
|
150
154
|
...(iterationsUsed != null ? { iterations_used: iterationsUsed } : {}),
|
|
155
|
+
...(provenance?.validation_mode ? { validation_mode: provenance.validation_mode } : {}),
|
|
156
|
+
...(provenance?.validation_agent ? { validation_agent: provenance.validation_agent } : {}),
|
|
157
|
+
...(provenance?.validation_fixture_id
|
|
158
|
+
? { validation_fixture_id: provenance.validation_fixture_id }
|
|
159
|
+
: {}),
|
|
160
|
+
...(provenance?.validation_evidence_ref
|
|
161
|
+
? { validation_evidence_ref: provenance.validation_evidence_ref }
|
|
162
|
+
: {}),
|
|
151
163
|
};
|
|
152
164
|
}
|
|
153
165
|
|
|
@@ -289,6 +301,10 @@ export async function evolve(
|
|
|
289
301
|
details: string,
|
|
290
302
|
evalSnapshot?: EvalPassRate,
|
|
291
303
|
iterationsUsed?: number,
|
|
304
|
+
provenance?: Pick<
|
|
305
|
+
EvolutionAuditEntry,
|
|
306
|
+
"validation_mode" | "validation_agent" | "validation_fixture_id" | "validation_evidence_ref"
|
|
307
|
+
>,
|
|
292
308
|
): void {
|
|
293
309
|
const entry = createAuditEntry(
|
|
294
310
|
proposalId,
|
|
@@ -297,6 +313,7 @@ export async function evolve(
|
|
|
297
313
|
evalSnapshot,
|
|
298
314
|
skillName,
|
|
299
315
|
iterationsUsed,
|
|
316
|
+
provenance,
|
|
300
317
|
);
|
|
301
318
|
auditEntries.push(entry);
|
|
302
319
|
try {
|
|
@@ -637,10 +654,18 @@ export async function evolve(
|
|
|
637
654
|
options.validationModel,
|
|
638
655
|
);
|
|
639
656
|
llmCallCount += countValidationLlmCalls(evalSet.length);
|
|
657
|
+
const evidenceRef = buildValidationEvidenceRef(proposal.proposal_id, "validated");
|
|
640
658
|
recordAudit(
|
|
641
659
|
proposal.proposal_id,
|
|
642
660
|
"validated",
|
|
643
661
|
`Pareto validation: improved=${validation.improved}`,
|
|
662
|
+
undefined,
|
|
663
|
+
undefined,
|
|
664
|
+
{
|
|
665
|
+
validation_mode: validation.validation_mode,
|
|
666
|
+
validation_agent: validation.validation_agent,
|
|
667
|
+
validation_evidence_ref: evidenceRef,
|
|
668
|
+
},
|
|
644
669
|
);
|
|
645
670
|
recordEvidence({
|
|
646
671
|
timestamp: new Date().toISOString(),
|
|
@@ -660,6 +685,9 @@ export async function evolve(
|
|
|
660
685
|
regressions: validation.regressions,
|
|
661
686
|
new_passes: validation.new_passes,
|
|
662
687
|
per_entry_results: validation.per_entry_results,
|
|
688
|
+
validation_mode: validation.validation_mode,
|
|
689
|
+
validation_agent: validation.validation_agent,
|
|
690
|
+
validation_evidence_ref: evidenceRef,
|
|
663
691
|
},
|
|
664
692
|
});
|
|
665
693
|
|
|
@@ -866,11 +894,18 @@ export async function evolve(
|
|
|
866
894
|
failed: evalSet.length - Math.round(validation.after_pass_rate * evalSet.length),
|
|
867
895
|
pass_rate: validation.after_pass_rate,
|
|
868
896
|
};
|
|
897
|
+
const validatedEvidenceRef = buildValidationEvidenceRef(proposal.proposal_id, "validated");
|
|
869
898
|
recordAudit(
|
|
870
899
|
proposal.proposal_id,
|
|
871
900
|
"validated",
|
|
872
901
|
`Validation complete: improved=${validation.improved}`,
|
|
873
902
|
evalSnapshot,
|
|
903
|
+
undefined,
|
|
904
|
+
{
|
|
905
|
+
validation_mode: validation.validation_mode,
|
|
906
|
+
validation_agent: validation.validation_agent,
|
|
907
|
+
validation_evidence_ref: validatedEvidenceRef,
|
|
908
|
+
},
|
|
874
909
|
);
|
|
875
910
|
recordEvidence({
|
|
876
911
|
timestamp: new Date().toISOString(),
|
|
@@ -890,6 +925,9 @@ export async function evolve(
|
|
|
890
925
|
regressions: validation.regressions,
|
|
891
926
|
new_passes: validation.new_passes,
|
|
892
927
|
per_entry_results: validation.per_entry_results,
|
|
928
|
+
validation_mode: validation.validation_mode,
|
|
929
|
+
validation_agent: validation.validation_agent,
|
|
930
|
+
validation_evidence_ref: validatedEvidenceRef,
|
|
893
931
|
},
|
|
894
932
|
});
|
|
895
933
|
|
|
@@ -906,10 +944,18 @@ export async function evolve(
|
|
|
906
944
|
|
|
907
945
|
if (!validation.improved) {
|
|
908
946
|
feedbackReason = `Validation failed: net_change=${validation.net_change.toFixed(3)}, improved=false`;
|
|
947
|
+
const rejectedEvidenceRef = buildValidationEvidenceRef(proposal.proposal_id, "rejected");
|
|
909
948
|
recordAudit(
|
|
910
949
|
proposal.proposal_id,
|
|
911
950
|
"rejected",
|
|
912
951
|
`Validation failed: net_change=${validation.net_change.toFixed(3)} (stopping: ${stopping.reason})`,
|
|
952
|
+
undefined,
|
|
953
|
+
undefined,
|
|
954
|
+
{
|
|
955
|
+
validation_mode: validation.validation_mode,
|
|
956
|
+
validation_agent: validation.validation_agent,
|
|
957
|
+
validation_evidence_ref: rejectedEvidenceRef,
|
|
958
|
+
},
|
|
913
959
|
);
|
|
914
960
|
recordEvidence({
|
|
915
961
|
timestamp: new Date().toISOString(),
|
|
@@ -929,6 +975,9 @@ export async function evolve(
|
|
|
929
975
|
regressions: validation.regressions,
|
|
930
976
|
new_passes: validation.new_passes,
|
|
931
977
|
per_entry_results: validation.per_entry_results,
|
|
978
|
+
validation_mode: validation.validation_mode,
|
|
979
|
+
validation_agent: validation.validation_agent,
|
|
980
|
+
validation_evidence_ref: rejectedEvidenceRef,
|
|
932
981
|
},
|
|
933
982
|
});
|
|
934
983
|
|
|
@@ -1138,6 +1187,11 @@ export async function evolve(
|
|
|
1138
1187
|
pass_rate: lastValidation.after_pass_rate,
|
|
1139
1188
|
},
|
|
1140
1189
|
iterationsCompleted,
|
|
1190
|
+
{
|
|
1191
|
+
validation_mode: lastValidation.validation_mode,
|
|
1192
|
+
validation_agent: lastValidation.validation_agent,
|
|
1193
|
+
validation_evidence_ref: buildValidationEvidenceRef(lastProposal.proposal_id, "deployed"),
|
|
1194
|
+
},
|
|
1141
1195
|
);
|
|
1142
1196
|
recordEvidence({
|
|
1143
1197
|
timestamp: new Date().toISOString(),
|
|
@@ -1157,6 +1211,9 @@ export async function evolve(
|
|
|
1157
1211
|
regressions: lastValidation.regressions,
|
|
1158
1212
|
new_passes: lastValidation.new_passes,
|
|
1159
1213
|
per_entry_results: lastValidation.per_entry_results,
|
|
1214
|
+
validation_mode: lastValidation.validation_mode,
|
|
1215
|
+
validation_agent: lastValidation.validation_agent,
|
|
1216
|
+
validation_evidence_ref: buildValidationEvidenceRef(lastProposal.proposal_id, "deployed"),
|
|
1160
1217
|
},
|
|
1161
1218
|
});
|
|
1162
1219
|
}
|
|
@@ -209,6 +209,8 @@ export async function validateBodyProposal(
|
|
|
209
209
|
gate_results: gateResults,
|
|
210
210
|
improved: false,
|
|
211
211
|
regressions: [],
|
|
212
|
+
validation_mode: "structural_guard",
|
|
213
|
+
validation_agent: agent,
|
|
212
214
|
};
|
|
213
215
|
}
|
|
214
216
|
|
|
@@ -250,5 +252,13 @@ export async function validateBodyProposal(
|
|
|
250
252
|
gate_results: gateResults,
|
|
251
253
|
improved: gatesPassed === 3,
|
|
252
254
|
regressions: accuracy.regressions,
|
|
255
|
+
validation_mode: "llm_judge",
|
|
256
|
+
validation_agent: agent,
|
|
257
|
+
...(evalSet.length > 0
|
|
258
|
+
? {
|
|
259
|
+
before_pass_rate: accuracy.before_pass_rate,
|
|
260
|
+
after_pass_rate: accuracy.after_pass_rate,
|
|
261
|
+
}
|
|
262
|
+
: {}),
|
|
253
263
|
};
|
|
254
264
|
}
|