selftune 0.2.19 → 0.2.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/apps/local-dashboard/dist/assets/{index-DnhnXQm6.js → index-D8O-RG1I.js} +2 -2
- package/apps/local-dashboard/dist/index.html +1 -1
- package/cli/selftune/dashboard-contract.ts +4 -0
- package/cli/selftune/eval/family-overlap.ts +320 -1
- package/cli/selftune/evolution/evidence.ts +5 -0
- package/cli/selftune/evolution/evolve-body.ts +86 -2
- package/cli/selftune/evolution/evolve.ts +58 -1
- package/cli/selftune/evolution/validate-body.ts +10 -0
- package/cli/selftune/evolution/validate-host-replay.ts +624 -0
- package/cli/selftune/evolution/validate-proposal.ts +10 -0
- package/cli/selftune/evolution/validate-routing.ts +112 -5
- package/cli/selftune/localdb/direct-write.ts +8 -3
- package/cli/selftune/localdb/materialize.ts +7 -2
- package/cli/selftune/localdb/queries.ts +11 -1
- package/cli/selftune/localdb/schema.ts +10 -1
- package/cli/selftune/routes/skill-report.ts +6 -1
- package/cli/selftune/types.ts +54 -0
- package/cli/selftune/utils/text-similarity.ts +73 -0
- package/package.json +1 -1
- package/packages/ui/src/components/EvidenceViewer.tsx +85 -2
- package/packages/ui/src/components/EvolutionTimeline.tsx +23 -1
- package/packages/ui/src/types.ts +4 -0
- package/skill/Workflows/Composability.md +15 -1
- package/skill/Workflows/Evolve.md +39 -0
|
@@ -5,9 +5,43 @@
|
|
|
5
5
|
* and running trigger accuracy checks against an eval set.
|
|
6
6
|
*/
|
|
7
7
|
|
|
8
|
-
import type {
|
|
8
|
+
import type {
|
|
9
|
+
BodyEvolutionProposal,
|
|
10
|
+
BodyValidationResult,
|
|
11
|
+
EvalEntry,
|
|
12
|
+
RoutingReplayEntryResult,
|
|
13
|
+
RoutingReplayFixture,
|
|
14
|
+
ValidationMode,
|
|
15
|
+
} from "../types.js";
|
|
9
16
|
import { callLlm } from "../utils/llm-call.js";
|
|
10
17
|
import { buildTriggerCheckPrompt, parseTriggerResponse } from "../utils/trigger-check.js";
|
|
18
|
+
import { runHostReplayFixture } from "./validate-host-replay.js";
|
|
19
|
+
|
|
20
|
+
/**
 * Arguments passed to a routing replay runner for a single replay pass.
 *
 * The validator invokes the runner twice with otherwise identical input:
 * once with the original routing text and once with the proposed one.
 */
export interface RoutingReplayRunnerInput {
  /** Routing text being evaluated in this pass (original or proposed). */
  routing: string;
  /** Eval entries to replay against the routing text. */
  evalSet: EvalEntry[];
  /** Agent identifier forwarded unchanged from the validation call. */
  agent: string;
  /** Replay fixture the pass runs against. */
  fixture: RoutingReplayFixture;
}
|
|
26
|
+
|
|
27
|
+
/**
 * Pluggable async replay implementation.
 *
 * Receives the routing text, eval set, agent and fixture for one pass and
 * resolves to the per-entry replay results; the validator counts entries
 * whose `passed` flag is true.
 */
export type RoutingReplayRunner = (
  input: RoutingReplayRunnerInput,
) => Promise<RoutingReplayEntryResult[]>;
|
|
30
|
+
|
|
31
|
+
/**
 * Optional knobs that select how routing trigger accuracy is validated.
 *
 * - `replayFixture` and `replayRunner` both set: the custom runner is used.
 * - only `replayFixture` set: the built-in host replay fixture runner is used.
 * - `replayFixture` omitted: validation falls back to the LLM trigger check,
 *   and a `replayRunner` supplied on its own is not consulted.
 */
export interface RoutingValidationOptions {
  /** Fixture to replay against; its presence enables replay-based validation. */
  replayFixture?: RoutingReplayFixture;
  /** Custom replay implementation; only used together with `replayFixture`. */
  replayRunner?: RoutingReplayRunner;
}
|
|
35
|
+
|
|
36
|
+
/**
 * Outcome of comparing trigger accuracy of the original routing text against
 * the proposed routing text.
 */
export interface RoutingTriggerAccuracyResult {
  /** Fraction of eval entries that passed with the original routing. */
  before_pass_rate: number;
  /** Fraction of eval entries that passed with the proposed routing. */
  after_pass_rate: number;
  /** True only when the proposed routing passed strictly more than the original. */
  improved: boolean;
  /** Which validation path produced this result. */
  validation_mode: ValidationMode;
  /** The agent argument of the validation call, echoed back. */
  validation_agent: string;
  /** Id of the replay fixture; populated only by the replay paths. */
  validation_fixture_id?: string;
  /** Per-entry results for the proposed routing; populated only by the replay paths. */
  per_entry_results?: RoutingReplayEntryResult[];
}
|
|
11
45
|
|
|
12
46
|
// ---------------------------------------------------------------------------
|
|
13
47
|
// Structural validation
|
|
@@ -77,9 +111,70 @@ export async function validateRoutingTriggerAccuracy(
|
|
|
77
111
|
evalSet: EvalEntry[],
|
|
78
112
|
agent: string,
|
|
79
113
|
modelFlag?: string,
|
|
80
|
-
|
|
114
|
+
options: RoutingValidationOptions = {},
|
|
115
|
+
): Promise<RoutingTriggerAccuracyResult> {
|
|
81
116
|
if (evalSet.length === 0) {
|
|
82
|
-
return {
|
|
117
|
+
return {
|
|
118
|
+
before_pass_rate: 0,
|
|
119
|
+
after_pass_rate: 0,
|
|
120
|
+
improved: false,
|
|
121
|
+
validation_mode: "structural_guard",
|
|
122
|
+
validation_agent: agent,
|
|
123
|
+
};
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
if (options.replayFixture && options.replayRunner) {
|
|
127
|
+
const beforeResults = await options.replayRunner({
|
|
128
|
+
routing: originalRouting,
|
|
129
|
+
evalSet,
|
|
130
|
+
agent,
|
|
131
|
+
fixture: options.replayFixture,
|
|
132
|
+
});
|
|
133
|
+
const afterResults = await options.replayRunner({
|
|
134
|
+
routing: proposedRouting,
|
|
135
|
+
evalSet,
|
|
136
|
+
agent,
|
|
137
|
+
fixture: options.replayFixture,
|
|
138
|
+
});
|
|
139
|
+
const beforePassed = beforeResults.filter((result) => result.passed).length;
|
|
140
|
+
const afterPassed = afterResults.filter((result) => result.passed).length;
|
|
141
|
+
const total = evalSet.length;
|
|
142
|
+
|
|
143
|
+
return {
|
|
144
|
+
before_pass_rate: beforePassed / total,
|
|
145
|
+
after_pass_rate: afterPassed / total,
|
|
146
|
+
improved: afterPassed > beforePassed,
|
|
147
|
+
validation_mode: "host_replay",
|
|
148
|
+
validation_agent: agent,
|
|
149
|
+
validation_fixture_id: options.replayFixture.fixture_id,
|
|
150
|
+
per_entry_results: afterResults,
|
|
151
|
+
};
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
if (options.replayFixture) {
|
|
155
|
+
const beforeResults = runHostReplayFixture({
|
|
156
|
+
routing: originalRouting,
|
|
157
|
+
evalSet,
|
|
158
|
+
fixture: options.replayFixture,
|
|
159
|
+
});
|
|
160
|
+
const afterResults = runHostReplayFixture({
|
|
161
|
+
routing: proposedRouting,
|
|
162
|
+
evalSet,
|
|
163
|
+
fixture: options.replayFixture,
|
|
164
|
+
});
|
|
165
|
+
const beforePassed = beforeResults.filter((result) => result.passed).length;
|
|
166
|
+
const afterPassed = afterResults.filter((result) => result.passed).length;
|
|
167
|
+
const total = evalSet.length;
|
|
168
|
+
|
|
169
|
+
return {
|
|
170
|
+
before_pass_rate: beforePassed / total,
|
|
171
|
+
after_pass_rate: afterPassed / total,
|
|
172
|
+
improved: afterPassed > beforePassed,
|
|
173
|
+
validation_mode: "host_replay",
|
|
174
|
+
validation_agent: agent,
|
|
175
|
+
validation_fixture_id: options.replayFixture.fixture_id,
|
|
176
|
+
per_entry_results: afterResults,
|
|
177
|
+
};
|
|
83
178
|
}
|
|
84
179
|
|
|
85
180
|
const systemPrompt = "You are an evaluation assistant. Answer only YES or NO.";
|
|
@@ -113,6 +208,8 @@ export async function validateRoutingTriggerAccuracy(
|
|
|
113
208
|
before_pass_rate: beforePassRate,
|
|
114
209
|
after_pass_rate: afterPassRate,
|
|
115
210
|
improved: afterPassRate > beforePassRate,
|
|
211
|
+
validation_mode: "llm_judge",
|
|
212
|
+
validation_agent: agent,
|
|
116
213
|
};
|
|
117
214
|
}
|
|
118
215
|
|
|
@@ -126,6 +223,7 @@ export async function validateRoutingProposal(
|
|
|
126
223
|
evalSet: EvalEntry[],
|
|
127
224
|
agent: string,
|
|
128
225
|
modelFlag?: string,
|
|
226
|
+
options: RoutingValidationOptions = {},
|
|
129
227
|
): Promise<BodyValidationResult> {
|
|
130
228
|
const gateResults: Array<{ gate: string; passed: boolean; reason: string }> = [];
|
|
131
229
|
|
|
@@ -145,6 +243,8 @@ export async function validateRoutingProposal(
|
|
|
145
243
|
gate_results: gateResults,
|
|
146
244
|
improved: false,
|
|
147
245
|
regressions: [],
|
|
246
|
+
validation_mode: "structural_guard",
|
|
247
|
+
validation_agent: agent,
|
|
148
248
|
};
|
|
149
249
|
}
|
|
150
250
|
|
|
@@ -155,13 +255,14 @@ export async function validateRoutingProposal(
|
|
|
155
255
|
evalSet,
|
|
156
256
|
agent,
|
|
157
257
|
modelFlag,
|
|
258
|
+
options,
|
|
158
259
|
);
|
|
159
260
|
gateResults.push({
|
|
160
261
|
gate: "trigger_accuracy",
|
|
161
262
|
passed: accuracy.improved,
|
|
162
263
|
reason: accuracy.improved
|
|
163
|
-
? `Improved: ${(accuracy.before_pass_rate * 100).toFixed(1)}% -> ${(accuracy.after_pass_rate * 100).toFixed(1)}%`
|
|
164
|
-
: `Not improved: ${(accuracy.before_pass_rate * 100).toFixed(1)}% -> ${(accuracy.after_pass_rate * 100).toFixed(1)}%`,
|
|
264
|
+
? `Improved via ${accuracy.validation_mode}: ${(accuracy.before_pass_rate * 100).toFixed(1)}% -> ${(accuracy.after_pass_rate * 100).toFixed(1)}%`
|
|
265
|
+
: `Not improved via ${accuracy.validation_mode}: ${(accuracy.before_pass_rate * 100).toFixed(1)}% -> ${(accuracy.after_pass_rate * 100).toFixed(1)}%`,
|
|
165
266
|
});
|
|
166
267
|
|
|
167
268
|
const gatesPassed = gateResults.filter((g) => g.passed).length;
|
|
@@ -173,5 +274,11 @@ export async function validateRoutingProposal(
|
|
|
173
274
|
gate_results: gateResults,
|
|
174
275
|
improved: gatesPassed === 2,
|
|
175
276
|
regressions: [],
|
|
277
|
+
validation_mode: accuracy.validation_mode,
|
|
278
|
+
validation_agent: accuracy.validation_agent,
|
|
279
|
+
validation_fixture_id: accuracy.validation_fixture_id,
|
|
280
|
+
before_pass_rate: accuracy.before_pass_rate,
|
|
281
|
+
after_pass_rate: accuracy.after_pass_rate,
|
|
282
|
+
per_entry_results: accuracy.per_entry_results,
|
|
176
283
|
};
|
|
177
284
|
}
|
|
@@ -285,11 +285,12 @@ export function writeEvolutionAuditToDb(record: EvolutionAuditEntry): boolean {
|
|
|
285
285
|
return safeWrite("evolution-audit", (db) => {
|
|
286
286
|
getStmt(
|
|
287
287
|
db,
|
|
288
|
-
"evolution-audit-
|
|
288
|
+
"evolution-audit-v3",
|
|
289
289
|
`
|
|
290
290
|
INSERT OR IGNORE INTO evolution_audit
|
|
291
|
-
(timestamp, proposal_id, skill_name, action, details, eval_snapshot_json, iterations_used
|
|
292
|
-
|
|
291
|
+
(timestamp, proposal_id, skill_name, action, details, eval_snapshot_json, iterations_used,
|
|
292
|
+
validation_mode, validation_agent, validation_fixture_id, validation_evidence_ref)
|
|
293
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
|
293
294
|
`,
|
|
294
295
|
).run(
|
|
295
296
|
record.timestamp,
|
|
@@ -299,6 +300,10 @@ export function writeEvolutionAuditToDb(record: EvolutionAuditEntry): boolean {
|
|
|
299
300
|
record.details,
|
|
300
301
|
record.eval_snapshot ? JSON.stringify(record.eval_snapshot) : null,
|
|
301
302
|
record.iterations_used ?? null,
|
|
303
|
+
record.validation_mode ?? null,
|
|
304
|
+
record.validation_agent ?? null,
|
|
305
|
+
record.validation_fixture_id ?? null,
|
|
306
|
+
record.validation_evidence_ref ?? null,
|
|
302
307
|
);
|
|
303
308
|
});
|
|
304
309
|
}
|
|
@@ -600,8 +600,9 @@ function insertEvolutionAudit(db: Database, records: EvolutionAuditEntry[]): num
|
|
|
600
600
|
// (idx_evo_audit_dedup defined in schema.ts).
|
|
601
601
|
const stmt = db.prepare(`
|
|
602
602
|
INSERT OR IGNORE INTO evolution_audit
|
|
603
|
-
(timestamp, proposal_id, skill_name, action, details, eval_snapshot_json, iterations_used
|
|
604
|
-
|
|
603
|
+
(timestamp, proposal_id, skill_name, action, details, eval_snapshot_json, iterations_used,
|
|
604
|
+
validation_mode, validation_agent, validation_fixture_id, validation_evidence_ref)
|
|
605
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
|
605
606
|
`);
|
|
606
607
|
|
|
607
608
|
let count = 0;
|
|
@@ -614,6 +615,10 @@ function insertEvolutionAudit(db: Database, records: EvolutionAuditEntry[]): num
|
|
|
614
615
|
r.details,
|
|
615
616
|
r.eval_snapshot ? JSON.stringify(r.eval_snapshot) : null,
|
|
616
617
|
r.iterations_used ?? null,
|
|
618
|
+
r.validation_mode ?? null,
|
|
619
|
+
r.validation_agent ?? null,
|
|
620
|
+
r.validation_fixture_id ?? null,
|
|
621
|
+
r.validation_evidence_ref ?? null,
|
|
617
622
|
);
|
|
618
623
|
count++;
|
|
619
624
|
}
|
|
@@ -1130,6 +1130,10 @@ export function queryEvolutionAudit(
|
|
|
1130
1130
|
action: string;
|
|
1131
1131
|
details: string;
|
|
1132
1132
|
eval_snapshot?: Record<string, unknown>;
|
|
1133
|
+
validation_mode?: string;
|
|
1134
|
+
validation_agent?: string;
|
|
1135
|
+
validation_fixture_id?: string;
|
|
1136
|
+
validation_evidence_ref?: string;
|
|
1133
1137
|
}> {
|
|
1134
1138
|
const sql = skillName
|
|
1135
1139
|
? `SELECT * FROM evolution_audit
|
|
@@ -1143,12 +1147,18 @@ export function queryEvolutionAudit(
|
|
|
1143
1147
|
return rows.map((r) => ({
|
|
1144
1148
|
timestamp: r.timestamp as string,
|
|
1145
1149
|
proposal_id: r.proposal_id as string,
|
|
1146
|
-
skill_name: r.skill_name
|
|
1150
|
+
skill_name: typeof r.skill_name === "string" ? r.skill_name : undefined,
|
|
1147
1151
|
action: r.action as string,
|
|
1148
1152
|
details: r.details as string,
|
|
1149
1153
|
eval_snapshot: r.eval_snapshot_json
|
|
1150
1154
|
? (safeParseJson(r.eval_snapshot_json as string) as Record<string, unknown>)
|
|
1151
1155
|
: undefined,
|
|
1156
|
+
validation_mode: typeof r.validation_mode === "string" ? r.validation_mode : undefined,
|
|
1157
|
+
validation_agent: typeof r.validation_agent === "string" ? r.validation_agent : undefined,
|
|
1158
|
+
validation_fixture_id:
|
|
1159
|
+
typeof r.validation_fixture_id === "string" ? r.validation_fixture_id : undefined,
|
|
1160
|
+
validation_evidence_ref:
|
|
1161
|
+
typeof r.validation_evidence_ref === "string" ? r.validation_evidence_ref : undefined,
|
|
1152
1162
|
}));
|
|
1153
1163
|
}
|
|
1154
1164
|
|
|
@@ -121,7 +121,12 @@ CREATE TABLE IF NOT EXISTS evolution_audit (
|
|
|
121
121
|
skill_name TEXT,
|
|
122
122
|
action TEXT NOT NULL,
|
|
123
123
|
details TEXT,
|
|
124
|
-
eval_snapshot_json TEXT
|
|
124
|
+
eval_snapshot_json TEXT,
|
|
125
|
+
iterations_used INTEGER,
|
|
126
|
+
validation_mode TEXT,
|
|
127
|
+
validation_agent TEXT,
|
|
128
|
+
validation_fixture_id TEXT,
|
|
129
|
+
validation_evidence_ref TEXT
|
|
125
130
|
)`;
|
|
126
131
|
|
|
127
132
|
// -- Local telemetry tables (from JSONL logs) ---------------------------------
|
|
@@ -369,6 +374,10 @@ export const MIGRATIONS = [
|
|
|
369
374
|
`ALTER TABLE skill_invocations ADD COLUMN source TEXT`,
|
|
370
375
|
// Track how many iteration loops each evolution run used
|
|
371
376
|
`ALTER TABLE evolution_audit ADD COLUMN iterations_used INTEGER`,
|
|
377
|
+
`ALTER TABLE evolution_audit ADD COLUMN validation_mode TEXT`,
|
|
378
|
+
`ALTER TABLE evolution_audit ADD COLUMN validation_agent TEXT`,
|
|
379
|
+
`ALTER TABLE evolution_audit ADD COLUMN validation_fixture_id TEXT`,
|
|
380
|
+
`ALTER TABLE evolution_audit ADD COLUMN validation_evidence_ref TEXT`,
|
|
372
381
|
// Canonical contract fields for upload staging (sessions already has schema_version, platform, normalized_at)
|
|
373
382
|
`ALTER TABLE sessions ADD COLUMN normalizer_version TEXT`,
|
|
374
383
|
`ALTER TABLE sessions ADD COLUMN capture_mode TEXT`,
|
|
@@ -28,7 +28,8 @@ export function handleSkillReport(
|
|
|
28
28
|
// 1. Evolution audit with eval_snapshot
|
|
29
29
|
const evolution = db
|
|
30
30
|
.query(
|
|
31
|
-
`SELECT timestamp, proposal_id, skill_name, action, details, eval_snapshot_json
|
|
31
|
+
`SELECT timestamp, proposal_id, skill_name, action, details, eval_snapshot_json,
|
|
32
|
+
validation_mode, validation_agent, validation_fixture_id, validation_evidence_ref
|
|
32
33
|
FROM evolution_audit
|
|
33
34
|
WHERE skill_name = ? OR (skill_name IS NULL AND proposal_id LIKE 'evo-' || ? || '-%')
|
|
34
35
|
ORDER BY timestamp DESC
|
|
@@ -41,6 +42,10 @@ export function handleSkillReport(
|
|
|
41
42
|
action: string;
|
|
42
43
|
details: string;
|
|
43
44
|
eval_snapshot_json: string | null;
|
|
45
|
+
validation_mode: string | null;
|
|
46
|
+
validation_agent: string | null;
|
|
47
|
+
validation_fixture_id: string | null;
|
|
48
|
+
validation_evidence_ref: string | null;
|
|
44
49
|
}>;
|
|
45
50
|
const evolutionWithSnapshot = evolution.map((e) => ({
|
|
46
51
|
...e,
|
package/cli/selftune/types.ts
CHANGED
|
@@ -400,6 +400,10 @@ export interface EvolutionAuditEntry {
|
|
|
400
400
|
details: string;
|
|
401
401
|
eval_snapshot?: EvalPassRate;
|
|
402
402
|
iterations_used?: number;
|
|
403
|
+
validation_mode?: ValidationMode;
|
|
404
|
+
validation_agent?: string;
|
|
405
|
+
validation_fixture_id?: string;
|
|
406
|
+
validation_evidence_ref?: string;
|
|
403
407
|
}
|
|
404
408
|
|
|
405
409
|
export interface EvolutionEvidenceValidation {
|
|
@@ -413,6 +417,10 @@ export interface EvolutionEvidenceValidation {
|
|
|
413
417
|
gates_passed?: number;
|
|
414
418
|
gates_total?: number;
|
|
415
419
|
gate_results?: Array<{ gate: ValidationGate; passed: boolean; reason: string }>;
|
|
420
|
+
validation_mode?: ValidationMode;
|
|
421
|
+
validation_agent?: string;
|
|
422
|
+
validation_fixture_id?: string;
|
|
423
|
+
validation_evidence_ref?: string;
|
|
416
424
|
}
|
|
417
425
|
|
|
418
426
|
export interface EvolutionEvidenceEntry {
|
|
@@ -697,6 +705,25 @@ export interface BodyEvolutionProposal {
|
|
|
697
705
|
/** Closed union of gate names used in the validation pipeline. */
|
|
698
706
|
export type ValidationGate = "structural" | "trigger_accuracy" | "quality";
|
|
699
707
|
|
|
708
|
+
/**
 * Closed union naming the validation path that produced a result:
 * structural checks only, a replay fixture, or an LLM trigger check.
 */
export type ValidationMode =
  | "structural_guard"
  | "host_replay"
  | "llm_judge";
|
|
709
|
+
|
|
710
|
+
/**
 * Describes the fixture a routing replay runs against.
 *
 * NOTE(review): field meanings beyond `fixture_id` are inferred from their
 * names; confirm against the host replay implementation.
 */
export interface RoutingReplayFixture {
  /** Identifier recorded on validation results as `validation_fixture_id`. */
  fixture_id: string;
  /** Host platform the fixture targets. */
  platform: "claude_code" | "codex";
  /** Name of the skill whose routing is being validated. */
  target_skill_name: string;
  /** Path of the target skill (presumably its skill file or directory — TODO confirm). */
  target_skill_path: string;
  /** Paths of other skills that compete with the target during replay. */
  competing_skill_paths: string[];
  /** Optional workspace root (presumably the base for the paths above — TODO confirm). */
  workspace_root?: string;
}
|
|
718
|
+
|
|
719
|
+
/** Replay outcome for a single eval entry. */
export interface RoutingReplayEntryResult {
  /** The eval query that was replayed. */
  query: string;
  /** Whether the query was expected to trigger the skill. */
  should_trigger: boolean;
  /** Whether the skill triggered during replay. */
  triggered: boolean;
  /** Pass flag for this entry; pass rates are tallied from it. */
  passed: boolean;
  /** Optional free-form evidence supporting the outcome. */
  evidence?: string;
}
|
|
726
|
+
|
|
700
727
|
/** Result of validating a body evolution proposal. */
|
|
701
728
|
export interface BodyValidationResult {
|
|
702
729
|
proposal_id: string;
|
|
@@ -705,6 +732,12 @@ export interface BodyValidationResult {
|
|
|
705
732
|
gate_results: Array<{ gate: ValidationGate; passed: boolean; reason: string }>;
|
|
706
733
|
improved: boolean;
|
|
707
734
|
regressions: string[];
|
|
735
|
+
validation_mode?: ValidationMode;
|
|
736
|
+
validation_agent?: string;
|
|
737
|
+
validation_fixture_id?: string;
|
|
738
|
+
before_pass_rate?: number;
|
|
739
|
+
after_pass_rate?: number;
|
|
740
|
+
per_entry_results?: RoutingReplayEntryResult[];
|
|
708
741
|
}
|
|
709
742
|
|
|
710
743
|
/** Configuration for which LLM model a role should use. */
|
|
@@ -873,6 +906,26 @@ export interface SkillFamilyOverlapPair {
|
|
|
873
906
|
consolidation_pressure: "low" | "medium" | "high";
|
|
874
907
|
}
|
|
875
908
|
|
|
909
|
+
/**
 * Static (cold-start) comparison of two skills in a family.
 *
 * NOTE(review): the producer of these values is not visible here; the field
 * notes below are inferred from names and should be confirmed against
 * eval/family-overlap.ts.
 */
export interface SkillFamilyColdStartPair {
  /** First skill of the pair. */
  skill_a: string;
  /** Second skill of the pair. */
  skill_b: string;
  /** Similarity score between the two skills' descriptions. */
  description_similarity: number;
  /** Similarity score between the two skills' "when to use" content. */
  when_to_use_similarity: number;
  /** Command surfaces both skills share. */
  shared_command_surfaces: string[];
  /** Terms both skills share. */
  shared_terms: string[];
  /** Synthetic queries that could plausibly be confused between the two skills. */
  synthetic_confusion_queries: string[];
  /** Coarse suspicion rating for this pair. */
  suspicion_level: "low" | "medium" | "high";
}
|
|
919
|
+
|
|
920
|
+
/**
 * Family-level summary of the cold-start pair analysis.
 *
 * NOTE(review): the producer of these values is not visible here; the field
 * notes below are inferred from names and should be confirmed against
 * eval/family-overlap.ts.
 */
export interface SkillFamilyColdStartSuspicion {
  /** Whether the family is flagged as a candidate by the cold-start analysis. */
  candidate: boolean;
  /** Number of skill pairs that were analyzed. */
  analyzed_pairs: number;
  /** Number of analyzed pairs considered suspicious. */
  suspicious_pair_count: number;
  /** Average static similarity (presumably across the analyzed pairs — TODO confirm). */
  average_static_similarity: number;
  /** Per-pair details. */
  pairs: SkillFamilyColdStartPair[];
  /** Human-readable reasons behind the assessment. */
  rationale: string[];
}
|
|
928
|
+
|
|
876
929
|
export interface SkillFamilyRefactorWorkflow {
|
|
877
930
|
workflow_name: string;
|
|
878
931
|
source_skill: string;
|
|
@@ -892,6 +945,7 @@ export interface SkillFamilyOverlapReport {
|
|
|
892
945
|
analyzed_skills: string[];
|
|
893
946
|
members: SkillFamilyOverlapMember[];
|
|
894
947
|
pairs: SkillFamilyOverlapPair[];
|
|
948
|
+
cold_start_suspicion?: SkillFamilyColdStartSuspicion;
|
|
895
949
|
total_pairs_analyzed: number;
|
|
896
950
|
overlap_count: number;
|
|
897
951
|
overlap_density: number;
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
/**
 * Default stopwords ignored when tokenizing text for similarity scoring.
 *
 * Typed as `ReadonlySet` because this instance is shared: it is the default
 * `stopwords` argument of `tokenizeText` and the seed for `buildStopwordSet`,
 * so an accidental `.add()`/`.delete()` would silently affect every caller.
 */
const BASE_TEXT_SIMILARITY_STOPWORDS: ReadonlySet<string> = new Set([
  "a",
  "an",
  "and",
  "are",
  "as",
  "at",
  "be",
  "for",
  "from",
  "how",
  "in",
  "into",
  "is",
  "it",
  "of",
  "on",
  "or",
  "that",
  "the",
  "this",
  "to",
  "use",
  "user",
  "when",
  "with",
]);
|
|
28
|
+
|
|
29
|
+
/**
 * Builds a fresh stopword set containing the base stopwords plus any extras.
 *
 * A new `Set` is returned on every call, so callers may mutate the result
 * without affecting the shared base set.
 *
 * @param additionalStopwords Extra stopwords to include. Readonly arrays
 *   (for example `as const` tuples) are accepted. Entries are added verbatim.
 * @returns The union of the base stopwords and `additionalStopwords`.
 */
export function buildStopwordSet(additionalStopwords: readonly string[] = []): Set<string> {
  return new Set([...BASE_TEXT_SIMILARITY_STOPWORDS, ...additionalStopwords]);
}
|
|
32
|
+
|
|
33
|
+
/**
 * Splits text into a set of unique, lowercased alphanumeric tokens.
 *
 * The text is lowercased and split on every run of characters outside
 * `a-z0-9`. Tokens shorter than three characters and tokens present in
 * `stopwords` are dropped. Because the split keeps only ASCII letters and
 * digits, non-ASCII characters act as separators.
 *
 * @param text Input text to tokenize.
 * @param stopwords Tokens to exclude; defaults to the base stopword set.
 *   Entries are compared against lowercased tokens.
 * @returns The set of remaining tokens.
 */
export function tokenizeText(
  text: string,
  stopwords: ReadonlySet<string> = BASE_TEXT_SIMILARITY_STOPWORDS,
): Set<string> {
  // The input is already lowercased, so no case-insensitive flag is needed,
  // and the split can never yield tokens with surrounding whitespace.
  const tokens = text.toLowerCase().split(/[^a-z0-9]+/);
  return new Set(tokens.filter((token) => token.length >= 3 && !stopwords.has(token)));
}
|
|
45
|
+
|
|
46
|
+
/**
 * Computes the Jaccard similarity |A ∩ B| / |A ∪ B| of two token sets.
 *
 * @param left First token set.
 * @param right Second token set.
 * @returns A value in [0, 1]; 0 when either set is empty.
 */
export function jaccardSimilarity(left: Set<string>, right: Set<string>): number {
  if (left.size === 0 || right.size === 0) return 0;

  // Intersection is symmetric, so walk the smaller set and probe the larger.
  const [smaller, larger] = left.size <= right.size ? [left, right] : [right, left];
  let shared = 0;
  for (const token of smaller) {
    if (larger.has(token)) shared += 1;
  }

  const union = left.size + right.size - shared;
  return union > 0 ? shared / union : 0;
}
|
|
55
|
+
|
|
56
|
+
/**
 * Extracts the content lines of a Markdown "When to use" section.
 *
 * The section starts at the first heading of level two or deeper whose text
 * is exactly "when to use" (case-insensitive) and ends at the next heading of
 * any level, or at the end of the document. Blank lines are skipped and a
 * leading bullet marker (`-`, `*` or `+`) is stripped from list items; other
 * lines are returned trimmed but otherwise unchanged.
 *
 * @param body Markdown text to scan; `\n` and `\r\n` line endings are accepted.
 * @returns The section's non-empty lines in document order, or an empty array
 *   when no matching heading exists.
 */
export function extractWhenToUseLines(body: string): string[] {
  const sectionHeading = /^##+\s+when to use\s*$/i;
  // Any heading closes the section, including an H1 that follows it.
  const anyHeading = /^#+\s+/;
  const bulletMarker = /^[-*+]\s+/;

  const lines = body.split(/\r?\n/);
  const start = lines.findIndex((line) => sectionHeading.test(line.trim()));
  if (start === -1) return [];

  const extracted: string[] = [];
  for (const rawLine of lines.slice(start + 1)) {
    const line = rawLine.trim();
    if (!line) continue;
    if (anyHeading.test(line)) break;
    // For non-bullet lines the replace is a no-op, so they pass through as-is.
    extracted.push(line.replace(bulletMarker, "").trim());
  }
  return extracted;
}
|
package/package.json
CHANGED
|
@@ -12,6 +12,7 @@ import {
|
|
|
12
12
|
ListChecksIcon,
|
|
13
13
|
} from "lucide-react";
|
|
14
14
|
import { useMemo, useState } from "react";
|
|
15
|
+
import type { ReactNode } from "react";
|
|
15
16
|
import Markdown from "react-markdown";
|
|
16
17
|
|
|
17
18
|
import { formatRate, timeAgo } from "../lib/format";
|
|
@@ -34,6 +35,37 @@ interface Props {
|
|
|
34
35
|
showContextBanner?: boolean;
|
|
35
36
|
}
|
|
36
37
|
|
|
38
|
+
/**
 * Maps a validation mode to the badge label, badge variant and explanatory
 * text shown for it.
 *
 * @param mode Validation mode recorded for the step; may be absent.
 * @returns Presentation metadata for a known mode, or `null` when the mode is
 *   missing or not recognized.
 */
function getValidationModeMeta(mode?: string | null): {
  label: string;
  variant: "default" | "secondary" | "destructive" | "outline";
  description: string;
} | null {
  if (mode === "host_replay") {
    return {
      label: "Replay-backed validation",
      variant: "default",
      description:
        "Validated against a controlled replay fixture instead of a free-form judge prompt.",
    };
  }

  if (mode === "llm_judge") {
    return {
      label: "Model judgment",
      variant: "secondary",
      description: "Validated by an LLM trigger check rather than a replay fixture.",
    };
  }

  if (mode === "structural_guard") {
    return {
      label: "Structural guard",
      variant: "outline",
      description:
        "Only deterministic structural checks ran; no replay or judge validation was needed.",
    };
  }

  return null;
}
|
|
68
|
+
|
|
37
69
|
/**
 * Replaces every underscore in `value` with a space.
 * Letter casing is left untouched.
 */
function sentenceCase(value: string): string {
  return value.split("_").join(" ");
}
|
|
@@ -42,7 +74,7 @@ function getOutcomePresentation(action?: string | null): {
|
|
|
42
74
|
title: string;
|
|
43
75
|
summary: string;
|
|
44
76
|
tone: string;
|
|
45
|
-
icon:
|
|
77
|
+
icon: ReactNode;
|
|
46
78
|
liveSkillNote: string;
|
|
47
79
|
} {
|
|
48
80
|
switch (action) {
|
|
@@ -180,7 +212,7 @@ function SkillContentBlock({
|
|
|
180
212
|
}
|
|
181
213
|
|
|
182
214
|
/** Smart formatting for a single validation value */
|
|
183
|
-
function formatValidationValue(key: string, val: unknown):
|
|
215
|
+
function formatValidationValue(key: string, val: unknown): ReactNode {
|
|
184
216
|
// Booleans
|
|
185
217
|
if (typeof val === "boolean") {
|
|
186
218
|
return val ? (
|
|
@@ -279,12 +311,18 @@ function ValidationResults({ validation }: { validation: Record<string, unknown>
|
|
|
279
311
|
regressions,
|
|
280
312
|
new_passes,
|
|
281
313
|
per_entry_results,
|
|
314
|
+
validation_mode,
|
|
315
|
+
validation_agent,
|
|
316
|
+
validation_fixture_id,
|
|
317
|
+
validation_evidence_ref,
|
|
282
318
|
...rest
|
|
283
319
|
} = validation;
|
|
284
320
|
|
|
285
321
|
const regressionsArr = Array.isArray(regressions) ? regressions : [];
|
|
286
322
|
const newPassesArr = Array.isArray(new_passes) ? new_passes : [];
|
|
287
323
|
const perEntryArr = Array.isArray(per_entry_results) ? per_entry_results : [];
|
|
324
|
+
const validationMeta =
|
|
325
|
+
typeof validation_mode === "string" ? getValidationModeMeta(validation_mode) : null;
|
|
288
326
|
|
|
289
327
|
return (
|
|
290
328
|
<div className="rounded-md border bg-muted/30 p-3 space-y-3">
|
|
@@ -295,6 +333,34 @@ function ValidationResults({ validation }: { validation: Record<string, unknown>
|
|
|
295
333
|
</span>
|
|
296
334
|
</p>
|
|
297
335
|
|
|
336
|
+
{validationMeta && (
|
|
337
|
+
<div className="rounded-md border bg-card px-3 py-2">
|
|
338
|
+
<div className="flex flex-wrap items-center gap-2">
|
|
339
|
+
<Badge variant={validationMeta.variant} className="text-[10px]">
|
|
340
|
+
{validationMeta.label}
|
|
341
|
+
</Badge>
|
|
342
|
+
{typeof validation_agent === "string" && validation_agent.trim() && (
|
|
343
|
+
<Badge variant="outline" className="text-[10px]">
|
|
344
|
+
agent: {validation_agent}
|
|
345
|
+
</Badge>
|
|
346
|
+
)}
|
|
347
|
+
{typeof validation_fixture_id === "string" && validation_fixture_id.trim() && (
|
|
348
|
+
<Badge variant="outline" className="text-[10px]">
|
|
349
|
+
fixture: {validation_fixture_id}
|
|
350
|
+
</Badge>
|
|
351
|
+
)}
|
|
352
|
+
</div>
|
|
353
|
+
<p className="mt-1 text-[11px] leading-relaxed text-muted-foreground">
|
|
354
|
+
{validationMeta.description}
|
|
355
|
+
</p>
|
|
356
|
+
{typeof validation_evidence_ref === "string" && validation_evidence_ref.trim() && (
|
|
357
|
+
<p className="mt-1 text-[10px] font-mono text-muted-foreground/70">
|
|
358
|
+
{validation_evidence_ref}
|
|
359
|
+
</p>
|
|
360
|
+
)}
|
|
361
|
+
</div>
|
|
362
|
+
)}
|
|
363
|
+
|
|
298
364
|
{/* Summary bar */}
|
|
299
365
|
<div className="flex items-center gap-3 flex-wrap">
|
|
300
366
|
{improved !== undefined && (
|
|
@@ -730,6 +796,7 @@ export function EvidenceViewer({
|
|
|
730
796
|
const latestStep = steps[steps.length - 1] ?? null;
|
|
731
797
|
const lifecycleLabel = steps.map((step) => step.action.replace("_", " ")).join(" -> ");
|
|
732
798
|
const outcome = getOutcomePresentation(latestStep?.action);
|
|
799
|
+
const validationMeta = getValidationModeMeta(latestStep?.validation_mode);
|
|
733
800
|
const latestProposalConfidence = useMemo(() => {
|
|
734
801
|
for (let i = proposalEntries.length - 1; i >= 0; i--) {
|
|
735
802
|
if (proposalEntries[i].confidence !== null) {
|
|
@@ -824,6 +891,16 @@ export function EvidenceViewer({
|
|
|
824
891
|
<Badge variant="outline" className="text-[10px]">
|
|
825
892
|
{entries.length} evidence {entries.length === 1 ? "row" : "rows"}
|
|
826
893
|
</Badge>
|
|
894
|
+
{validationMeta && (
|
|
895
|
+
<Badge variant={validationMeta.variant} className="text-[10px]">
|
|
896
|
+
{validationMeta.label}
|
|
897
|
+
</Badge>
|
|
898
|
+
)}
|
|
899
|
+
{latestStep?.validation_fixture_id && (
|
|
900
|
+
<Badge variant="outline" className="text-[10px]">
|
|
901
|
+
fixture: {latestStep.validation_fixture_id}
|
|
902
|
+
</Badge>
|
|
903
|
+
)}
|
|
827
904
|
{latestProposalConfidence != null && (
|
|
828
905
|
<Badge variant="secondary" className="text-[10px]">
|
|
829
906
|
{Math.round(latestProposalConfidence * 100)}% confidence
|
|
@@ -831,6 +908,12 @@ export function EvidenceViewer({
|
|
|
831
908
|
)}
|
|
832
909
|
</div>
|
|
833
910
|
|
|
911
|
+
{validationMeta && (
|
|
912
|
+
<p className="text-[11px] leading-relaxed text-muted-foreground">
|
|
913
|
+
{validationMeta.description}
|
|
914
|
+
</p>
|
|
915
|
+
)}
|
|
916
|
+
|
|
834
917
|
<div className="flex flex-wrap items-center gap-2 text-[11px] text-muted-foreground">
|
|
835
918
|
<span className="font-headline uppercase tracking-[0.16em] text-muted-foreground/80">
|
|
836
919
|
Lifecycle
|
|
@@ -10,13 +10,14 @@ import {
|
|
|
10
10
|
ChevronRightIcon,
|
|
11
11
|
} from "lucide-react";
|
|
12
12
|
import { useState } from "react";
|
|
13
|
+
import type { ReactNode } from "react";
|
|
13
14
|
|
|
14
15
|
import { timeAgo } from "../lib/format";
|
|
15
16
|
import { cn } from "../lib/utils";
|
|
16
17
|
import { Badge } from "../primitives/badge";
|
|
17
18
|
import type { EvalSnapshot, EvolutionEntry } from "../types";
|
|
18
19
|
|
|
19
|
-
const ACTION_ICON: Record<string,
|
|
20
|
+
const ACTION_ICON: Record<string, ReactNode> = {
|
|
20
21
|
created: <CircleDotIcon className="size-3.5" />,
|
|
21
22
|
validated: <ShieldCheckIcon className="size-3.5" />,
|
|
22
23
|
deployed: <RocketIcon className="size-3.5" />,
|
|
@@ -70,6 +71,21 @@ interface Props {
|
|
|
70
71
|
onSelect: (proposalId: string) => void;
|
|
71
72
|
}
|
|
72
73
|
|
|
74
|
+
/**
 * Maps a validation mode to the compact badge shown on a timeline entry.
 *
 * @param mode Validation mode recorded for the step; may be absent.
 * @returns The badge label and variant for a known mode, or `null` when the
 *   mode is missing or not recognized.
 */
function validationModeBadge(
  mode?: string | null,
): { label: string; variant: "default" | "secondary" | "outline" } | null {
  if (mode === "host_replay") {
    return { label: "replay", variant: "default" };
  }
  if (mode === "llm_judge") {
    return { label: "judge", variant: "secondary" };
  }
  if (mode === "structural_guard") {
    return { label: "structural", variant: "outline" };
  }
  return null;
}
|
|
88
|
+
|
|
73
89
|
/** Group evolution entries by proposal_id, ordered newest-first. */
|
|
74
90
|
function groupByProposal(entries: EvolutionEntry[]) {
|
|
75
91
|
const map = new Map<string, EvolutionEntry[]>();
|
|
@@ -201,6 +217,7 @@ export function EvolutionTimeline({ entries, selectedProposalId, onSelect }: Pro
|
|
|
201
217
|
const lineColor = ACTION_LINE[terminal] ?? "bg-border";
|
|
202
218
|
const isLast = groupIdx === groups.length - 1;
|
|
203
219
|
const snapshot = findEvalSnapshot(steps);
|
|
220
|
+
const validationBadge = validationModeBadge(lastStep.validation_mode);
|
|
204
221
|
|
|
205
222
|
return (
|
|
206
223
|
<div key={proposalId} className="relative flex gap-3">
|
|
@@ -245,6 +262,11 @@ export function EvolutionTimeline({ entries, selectedProposalId, onSelect }: Pro
|
|
|
245
262
|
<span className="text-[10px] text-muted-foreground">
|
|
246
263
|
{timeAgo(lastStep.timestamp)}
|
|
247
264
|
</span>
|
|
265
|
+
{validationBadge && (
|
|
266
|
+
<Badge variant={validationBadge.variant} className="text-[9px] uppercase">
|
|
267
|
+
{validationBadge.label}
|
|
268
|
+
</Badge>
|
|
269
|
+
)}
|
|
248
270
|
</div>
|
|
249
271
|
{/* Pass rate delta from eval snapshot */}
|
|
250
272
|
{snapshot && (
|