selftune 0.2.19 → 0.2.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/apps/local-dashboard/dist/assets/{index-DnhnXQm6.js → index-D8O-RG1I.js} +2 -2
- package/apps/local-dashboard/dist/index.html +1 -1
- package/cli/selftune/dashboard-contract.ts +4 -0
- package/cli/selftune/eval/family-overlap.ts +320 -1
- package/cli/selftune/evolution/evidence.ts +5 -0
- package/cli/selftune/evolution/evolve-body.ts +86 -2
- package/cli/selftune/evolution/evolve.ts +58 -1
- package/cli/selftune/evolution/validate-body.ts +10 -0
- package/cli/selftune/evolution/validate-host-replay.ts +624 -0
- package/cli/selftune/evolution/validate-proposal.ts +10 -0
- package/cli/selftune/evolution/validate-routing.ts +112 -5
- package/cli/selftune/localdb/direct-write.ts +8 -3
- package/cli/selftune/localdb/materialize.ts +7 -2
- package/cli/selftune/localdb/queries.ts +11 -1
- package/cli/selftune/localdb/schema.ts +10 -1
- package/cli/selftune/routes/skill-report.ts +6 -1
- package/cli/selftune/types.ts +54 -0
- package/cli/selftune/utils/text-similarity.ts +73 -0
- package/package.json +1 -1
- package/packages/ui/src/components/EvidenceViewer.tsx +85 -2
- package/packages/ui/src/components/EvolutionTimeline.tsx +23 -1
- package/packages/ui/src/types.ts +4 -0
- package/skill/Workflows/Composability.md +15 -1
- package/skill/Workflows/Evolve.md +39 -0
|
@@ -0,0 +1,624 @@
|
|
|
1
|
+
import {
|
|
2
|
+
existsSync,
|
|
3
|
+
mkdirSync,
|
|
4
|
+
mkdtempSync,
|
|
5
|
+
readFileSync,
|
|
6
|
+
readdirSync,
|
|
7
|
+
realpathSync,
|
|
8
|
+
rmSync,
|
|
9
|
+
statSync,
|
|
10
|
+
writeFileSync,
|
|
11
|
+
} from "node:fs";
|
|
12
|
+
import { tmpdir } from "node:os";
|
|
13
|
+
import { basename, dirname, isAbsolute, join } from "node:path";
|
|
14
|
+
|
|
15
|
+
import type { EvalEntry, RoutingReplayEntryResult, RoutingReplayFixture } from "../types.js";
|
|
16
|
+
import { parseFrontmatter } from "../utils/frontmatter.js";
|
|
17
|
+
import { containsWholeSkillMention } from "../utils/skill-discovery.js";
|
|
18
|
+
import { findGitRepositoryRoot } from "../utils/skill-discovery.js";
|
|
19
|
+
import {
|
|
20
|
+
extractWhenToUseLines,
|
|
21
|
+
jaccardSimilarity,
|
|
22
|
+
tokenizeText,
|
|
23
|
+
} from "../utils/text-similarity.js";
|
|
24
|
+
import { replaceSection } from "./deploy-proposal.js";
|
|
25
|
+
|
|
26
|
+
/**
 * Tokenized view of one skill file used by the fixture-based replay scorer.
 */
interface ReplaySkillSurface {
  /** Frontmatter name of the skill, or its directory name when the name is empty or the file is unreadable. */
  skillName: string;
  /** Tokens produced from the frontmatter description. */
  descriptionTokens: Set<string>;
  /** Tokens produced from the "when to use" lines found in the skill body. */
  whenToUseTokens: Set<string>;
}
|
|
31
|
+
|
|
32
|
+
/**
 * Throwaway on-disk workspace that a runtime replay runs against.
 */
interface ReplayWorkspace {
  /** Temporary directory that contains the staged `.claude/skills` registry. */
  rootDir: string;
  /** Path of the staged copy of the target skill's SKILL.md. */
  targetSkillPath: string;
  /** Paths of the staged copies of every competing skill's SKILL.md. */
  competingSkillPaths: string[];
}
|
|
37
|
+
|
|
38
|
+
/**
 * Everything a runtime invoker receives for a single replayed eval query.
 */
export interface ClaudeRuntimeReplayInvokerInput {
  /** The eval query to route. */
  query: string;
  /** Root of the staged workspace; the default invoker uses it as the working directory. */
  workspaceRoot: string;
  /** Name of the skill under evaluation. */
  targetSkillName: string;
  /** Staged SKILL.md path of the skill under evaluation. */
  targetSkillPath: string;
  /** Staged SKILL.md paths of the competing skills. */
  competingSkillPaths: string[];
}
|
|
45
|
+
|
|
46
|
+
/**
 * What a runtime invoker observed while the host processed one query.
 */
export interface ClaudeRuntimeReplayObservation {
  /** Skill names passed to `Skill` tool calls. */
  invokedSkillNames: string[];
  /** File paths passed to `Read` tool calls. */
  readSkillPaths: string[];
  /** Unparsed output captured from the run. */
  rawOutput: string;
  /** Session identifier reported by the run, when one was seen. */
  sessionId?: string;
  /** Error text reported by the run, when any was seen. */
  runtimeError?: string;
}
|
|
53
|
+
|
|
54
|
+
/**
 * Pluggable function that runs one query against the host runtime and reports
 * which skills were invoked or read. Callers may supply their own in place of
 * the default `claude` CLI invoker.
 */
export type ClaudeRuntimeReplayInvoker = (
  input: ClaudeRuntimeReplayInvokerInput,
) => Promise<ClaudeRuntimeReplayObservation>;
|
|
57
|
+
|
|
58
|
+
/**
 * Lowest score at which the fixture replay accepts routing-text or
 * skill-surface overlap as a genuine match. Chosen to filter out weak false
 * positives while still keeping recall for short routing phrases and sparse
 * skill surfaces.
 */
const HOST_REPLAY_MATCH_THRESHOLD = 0.18;

/** Milliseconds the spawned `claude` process may run before it is killed. */
const CLAUDE_RUNTIME_REPLAY_TIMEOUT_MS = 30 * 1000;

/** Text appended to the system prompt so the host only makes a routing decision. */
const CLAUDE_RUNTIME_ROUTING_PROMPT =
  "You are being evaluated only on skill routing. Do not solve the user's task. If a local project skill is relevant, invoke exactly one skill immediately. If no local project skill fits, respond with NO_SKILL and do not browse unrelated files.";
|
|
67
|
+
|
|
68
|
+
/**
 * Canonicalizes a path through `realpathSync`.
 *
 * @param path - Path to canonicalize.
 * @returns The canonical path, or `path` unchanged when it cannot be resolved
 *   (for example because it does not exist).
 */
function resolveReplayPath(path: string): string {
  let canonical = path;
  try {
    canonical = realpathSync(path);
  } catch {
    // Best effort: keep the caller's spelling when resolution fails.
  }
  return canonical;
}
|
|
75
|
+
|
|
76
|
+
/**
 * Canonicalizes a path reported by the runtime. Relative paths are anchored at
 * the workspace root before being resolved.
 *
 * @param path - Path as observed in a tool call.
 * @param workspaceRoot - Directory that relative paths are joined onto.
 * @returns The canonicalized path (see `resolveReplayPath` for the fallback).
 */
function resolveObservedReplayPath(path: string, workspaceRoot: string): string {
  const anchored = isAbsolute(path) ? path : join(workspaceRoot, path);
  return resolveReplayPath(anchored);
}
|
|
79
|
+
|
|
80
|
+
/**
 * Finds the SKILL.md files of the skills that sit next to the target skill.
 *
 * The registry is taken to be the parent of the target skill's directory.
 * Every other sub-directory of it that contains a `SKILL.md` counts as a
 * competitor.
 *
 * @param targetSkillPath - Path of the target skill's SKILL.md.
 * @returns Canonicalized competitor paths sorted with `localeCompare`; empty
 *   when the registry cannot be listed.
 */
function listCompetingSkillPaths(targetSkillPath: string): string[] {
  const targetDir = dirname(resolveReplayPath(targetSkillPath));
  const registryRoot = dirname(targetDir);
  const ownDirName = basename(targetDir);

  let siblingNames: string[];
  try {
    siblingNames = readdirSync(registryRoot);
  } catch {
    // Ignore unreadable registries and treat the fixture as target-only.
    return [];
  }

  // Entries that cannot be stat'ed are treated like non-directories.
  const isDirectory = (candidate: string): boolean => {
    try {
      return statSync(candidate).isDirectory();
    } catch {
      return false;
    }
  };

  const found: string[] = [];
  for (const name of siblingNames) {
    if (name === ownDirName) continue;

    const siblingDir = join(registryRoot, name);
    if (!isDirectory(siblingDir)) continue;

    const skillFile = join(siblingDir, "SKILL.md");
    if (existsSync(skillFile)) {
      found.push(resolveReplayPath(skillFile));
    }
  }

  found.sort((left, right) => left.localeCompare(right));
  return found;
}
|
|
107
|
+
|
|
108
|
+
/**
 * Builds a routing replay fixture for a skill from its location on disk.
 *
 * @param options.skillName - Name of the target skill.
 * @param options.skillPath - Path of the target SKILL.md; canonicalized when possible.
 * @param options.platform - Host platform; defaults to `"claude_code"`.
 * @param options.fixtureId - Explicit id; defaults to `auto-<platform>-<skillName>`.
 * @param options.workspaceRoot - Explicit workspace root; otherwise looked up via
 *   `findGitRepositoryRoot` starting at the skill registry directory.
 * @returns The fixture. `workspace_root` is only present when a root was found.
 */
export function buildRoutingReplayFixture(options: {
  skillName: string;
  skillPath: string;
  platform?: RoutingReplayFixture["platform"];
  fixtureId?: string;
  workspaceRoot?: string;
}): RoutingReplayFixture {
  const { skillName, skillPath, fixtureId } = options;
  const targetSkillPath = resolveReplayPath(skillPath);
  const registryDir = dirname(dirname(targetSkillPath));
  const workspaceRoot = options.workspaceRoot ?? findGitRepositoryRoot(registryDir);
  const platform = options.platform ?? "claude_code";

  const fixtureId_ = fixtureId ?? `auto-${platform}-${skillName}`;
  const competingSkillPaths = listCompetingSkillPaths(targetSkillPath);
  const workspaceFields = workspaceRoot ? { workspace_root: workspaceRoot } : {};

  return {
    fixture_id: fixtureId_,
    platform,
    target_skill_name: skillName,
    target_skill_path: targetSkillPath,
    competing_skill_paths: competingSkillPaths,
    ...workspaceFields,
  };
}
|
|
129
|
+
|
|
130
|
+
/**
 * Produces the SKILL.md text to stage for the target skill: the file currently
 * on disk with its "Workflow Routing" section swapped for the candidate routing.
 *
 * @param skillPath - Path of the target SKILL.md to read.
 * @param routing - Candidate routing text; surrounding whitespace is trimmed.
 * @returns The rewritten file content.
 */
function buildRuntimeReplayTargetContent(skillPath: string, routing: string): string {
  const onDisk = readFileSync(skillPath, "utf8");
  const candidateRouting = routing.trim();
  return replaceSection(onDisk, "Workflow Routing", candidateRouting);
}
|
|
134
|
+
|
|
135
|
+
/**
 * Copies one skill into the staged registry as `<registryDir>/<skill dir>/SKILL.md`.
 *
 * @param registryDir - Staged registry directory to write into.
 * @param sourceSkillPath - Original SKILL.md; its parent directory name is reused
 *   (or `"unknown-skill"` when that name is empty).
 * @param overrideContent - Content to write instead of the source file's content.
 *   An empty string is honored as an override.
 * @returns Path of the staged SKILL.md.
 */
function stageReplaySkill(
  registryDir: string,
  sourceSkillPath: string,
  overrideContent?: string,
): string {
  const sourceDirName = basename(dirname(sourceSkillPath));
  const stagedDir = join(registryDir, sourceDirName || "unknown-skill");
  mkdirSync(stagedDir, { recursive: true });

  const stagedFile = join(stagedDir, "SKILL.md");
  const body = overrideContent ?? readFileSync(sourceSkillPath, "utf8");
  writeFileSync(stagedFile, body, "utf8");

  return stagedFile;
}
|
|
148
|
+
|
|
149
|
+
/**
 * Creates a temporary workspace for a runtime replay.
 *
 * The workspace contains an empty `.git` directory and a `.claude/skills`
 * registry holding the target skill (with the candidate routing applied) and
 * unmodified copies of every competing skill.
 *
 * @param fixture - Fixture naming the target and competing skill files.
 * @param routing - Candidate routing text to apply to the target skill.
 * @returns Locations of the staged workspace and skill files.
 * @throws Re-throws any staging error after removing the temporary directory.
 */
function buildRuntimeReplayWorkspace(
  fixture: RoutingReplayFixture,
  routing: string,
): ReplayWorkspace {
  const rootDir = mkdtempSync(join(tmpdir(), "selftune-runtime-replay-"));

  try {
    const registryDir = join(rootDir, ".claude", "skills");
    for (const dir of [join(rootDir, ".git"), registryDir]) {
      mkdirSync(dir, { recursive: true });
    }

    const targetContent = buildRuntimeReplayTargetContent(fixture.target_skill_path, routing);
    const targetSkillPath = stageReplaySkill(registryDir, fixture.target_skill_path, targetContent);

    const competingSkillPaths: string[] = [];
    for (const competitorPath of fixture.competing_skill_paths) {
      competingSkillPaths.push(stageReplaySkill(registryDir, competitorPath));
    }

    return { rootDir, targetSkillPath, competingSkillPaths };
  } catch (error) {
    // Do not leave a half-built workspace behind.
    rmSync(rootDir, { recursive: true, force: true });
    throw error;
  }
}
|
|
178
|
+
|
|
179
|
+
/**
 * Recursively deletes a staged replay workspace. A root directory that is
 * already gone is not an error.
 *
 * @param workspace - Workspace whose `rootDir` is removed.
 */
function cleanupRuntimeReplayWorkspace(workspace: ReplayWorkspace): void {
  const { rootDir } = workspace;
  rmSync(rootDir, { force: true, recursive: true });
}
|
|
182
|
+
|
|
183
|
+
/**
 * Extracts routing signals from newline-delimited JSON output.
 *
 * Each non-empty line is parsed on its own. Lines that are not valid JSON, or
 * that parse to something other than an object (`null`, numbers, strings), are
 * skipped. From the remaining events the parser collects:
 * - the last non-empty string `session_id`,
 * - the last non-empty string `error`,
 * - from `assistant` events, every `tool_use` block named `Skill` (its
 *   `input.skill`) or `Read` (its `input.file_path`).
 *
 * @param rawOutput - Captured output, one JSON event per line.
 * @returns De-duplicated skill names and read paths in first-seen order,
 *   together with the raw output and any session id / error that was found.
 */
function parseClaudeRuntimeReplayOutput(rawOutput: string): ClaudeRuntimeReplayObservation {
  const invokedSkillNames = new Set<string>();
  const readSkillPaths = new Set<string>();
  let sessionId: string | undefined;
  let runtimeError: string | undefined;

  for (const line of rawOutput.split("\n")) {
    const trimmed = line.trim();
    if (!trimmed) continue;

    let decoded: unknown;
    try {
      decoded = JSON.parse(trimmed);
    } catch {
      continue;
    }
    // JSON.parse accepts bare `null` and primitives; property access on `null`
    // would throw and abort the whole parse, so skip anything that is not an object.
    if (typeof decoded !== "object" || decoded === null) continue;
    const parsed = decoded as Record<string, unknown>;

    const maybeSessionId = parsed.session_id;
    if (typeof maybeSessionId === "string" && maybeSessionId) {
      sessionId = maybeSessionId;
    }

    if (typeof parsed.error === "string" && parsed.error) {
      runtimeError = parsed.error;
    }

    const assistantMessage =
      parsed.type === "assistant" && typeof parsed.message === "object" && parsed.message !== null
        ? (parsed.message as Record<string, unknown>)
        : undefined;
    const content = assistantMessage?.content;
    if (!Array.isArray(content)) continue;

    for (const block of content) {
      if (typeof block !== "object" || block === null) continue;
      const typedBlock = block as Record<string, unknown>;
      if (typedBlock.type !== "tool_use") continue;

      const toolName = typedBlock.name;
      const input =
        typeof typedBlock.input === "object" && typedBlock.input !== null
          ? (typedBlock.input as Record<string, unknown>)
          : {};

      if (toolName === "Skill") {
        const skillName = input.skill;
        if (typeof skillName === "string" && skillName.trim()) {
          invokedSkillNames.add(skillName.trim());
        }
      }

      if (toolName === "Read") {
        const filePath = input.file_path;
        if (typeof filePath === "string" && filePath.trim()) {
          readSkillPaths.add(resolveReplayPath(filePath.trim()));
        }
      }
    }
  }

  return {
    invokedSkillNames: [...invokedSkillNames],
    readSkillPaths: [...readSkillPaths],
    rawOutput,
    ...(sessionId ? { sessionId } : {}),
    ...(runtimeError ? { runtimeError } : {}),
  };
}
|
|
251
|
+
|
|
252
|
+
/**
 * Default runtime invoker: runs the `claude` CLI once for a query inside the
 * staged workspace and parses its `stream-json` output.
 *
 * The process is limited to the `Skill` and `Read` tools and a single turn,
 * reads settings only from project/local sources, and is killed after
 * `CLAUDE_RUNTIME_REPLAY_TIMEOUT_MS`.
 *
 * @param input - Query and staged workspace to run against.
 * @returns The parsed observation. `runtimeError` combines any error found in
 *   the stream, stderr text, and a timeout note, joined with `" | "`.
 * @throws Error when the process exits non-zero without any routing signal
 *   (no skill invoked and no file read).
 */
async function invokeClaudeRuntimeReplay(
  input: ClaudeRuntimeReplayInvokerInput,
): Promise<ClaudeRuntimeReplayObservation> {
  const command = [
    "claude",
    "-p",
    "--verbose",
    "--output-format",
    "stream-json",
    "--dangerously-skip-permissions",
    "--no-session-persistence",
    "--setting-sources",
    "project,local",
    "--tools",
    "Skill,Read",
    "--max-turns",
    "1",
    "--append-system-prompt",
    CLAUDE_RUNTIME_ROUTING_PROMPT,
    input.query,
  ];

  const proc = Bun.spawn(command, {
    cwd: input.workspaceRoot,
    stdout: "pipe",
    stderr: "pipe",
    // NOTE(review): CLAUDECODE is blanked for the child; presumably so the CLI
    // does not treat this as a nested session - confirm.
    env: { ...process.env, CLAUDECODE: "" },
  });

  // Remember whether the kill came from the timer so the failure is diagnosable.
  let timedOut = false;
  const timeout = setTimeout(() => {
    timedOut = true;
    proc.kill();
  }, CLAUDE_RUNTIME_REPLAY_TIMEOUT_MS);

  try {
    const [stdoutText, stderrText, exitCode] = await Promise.all([
      new Response(proc.stdout).text(),
      new Response(proc.stderr).text(),
      proc.exited,
    ]);

    const observation = parseClaudeRuntimeReplayOutput(stdoutText);
    const timeoutNote = timedOut
      ? `claude runtime replay timed out after ${CLAUDE_RUNTIME_REPLAY_TIMEOUT_MS}ms`
      : "";
    const combinedError = [observation.runtimeError, stderrText.trim(), timeoutNote]
      .filter(Boolean)
      .join(" | ");
    const hasRoutingSignal =
      observation.invokedSkillNames.length > 0 || observation.readSkillPaths.length > 0;

    // A failing exit is tolerated when the run still produced a routing decision.
    if (exitCode !== 0 && !hasRoutingSignal) {
      throw new Error(combinedError || `claude runtime replay exited with code ${exitCode}`);
    }

    return {
      ...observation,
      ...(combinedError ? { runtimeError: combinedError } : {}),
    };
  } finally {
    // Always release the timer, including when reading the streams rejects.
    clearTimeout(timeout);
  }
}
|
|
303
|
+
|
|
304
|
+
/**
 * Returns copies of the results with `prefix` placed in front of each
 * evidence string.
 *
 * @param results - Results to annotate; the input objects are not modified.
 * @param prefix - Text to prepend. When a result has no (or empty) evidence,
 *   the prefix alone becomes the evidence; otherwise the two are joined by `"; "`.
 * @returns A new array of annotated results.
 */
function prefixReplayEvidence(
  results: RoutingReplayEntryResult[],
  prefix: string,
): RoutingReplayEntryResult[] {
  const annotated: RoutingReplayEntryResult[] = [];
  for (const entry of results) {
    const evidence = entry.evidence ? `${prefix}; ${entry.evidence}` : prefix;
    annotated.push({ ...entry, evidence });
  }
  return annotated;
}
|
|
313
|
+
|
|
314
|
+
/**
 * Turns one runtime observation into a pass/fail result for an eval entry.
 *
 * Checks run in this order; the first that applies decides the result:
 * 1. more than one skill invoked: not triggered, failed;
 * 2. target skill invoked: triggered, passes when the entry should trigger;
 * 3. a competing skill invoked: not triggered, passes when it should not trigger;
 * 4. some other skill invoked: not triggered, failed;
 * 5. a file outside the staged skill set was read: not triggered, failed;
 * 6. target skill only read: not triggered, passes when it should not trigger;
 * 7. a competing skill only read: same as 6;
 * 8. a runtime error with no decision: throws;
 * 9. otherwise: not triggered, passes when it should not trigger.
 *
 * Invoked skill names are compared after trimming, so padded names coming
 * from a custom invoker are matched the same way for target, competing and
 * unrelated skills.
 *
 * @param entry - Eval entry that was replayed.
 * @param fixture - Fixture describing the target and competing skills.
 *   Competing skills are identified by the directory name of their path.
 * @param observation - What the runtime invoker reported.
 * @param workspace - Staged workspace the run used.
 * @returns The evaluated result with human-readable evidence.
 * @throws Error when the observation has a runtime error and no routing decision.
 */
function evaluateRuntimeReplayObservation(
  entry: EvalEntry,
  fixture: RoutingReplayFixture,
  observation: ClaudeRuntimeReplayObservation,
  workspace: ReplayWorkspace,
): RoutingReplayEntryResult {
  const sessionPrefix = observation.sessionId
    ? `runtime replay session ${observation.sessionId}`
    : "runtime replay";

  // Single place that shapes every result returned below.
  const verdict = (
    triggered: boolean,
    passed: boolean,
    detail: string,
  ): RoutingReplayEntryResult => ({
    query: entry.query,
    should_trigger: entry.should_trigger,
    triggered,
    passed,
    evidence: `${sessionPrefix} ${detail}`,
  });

  if (observation.invokedSkillNames.length > 1) {
    return verdict(
      false,
      false,
      `invoked multiple skills: ${observation.invokedSkillNames.join(", ")}`,
    );
  }

  // Normalize once so every comparison below sees the same spelling.
  const invokedNames = observation.invokedSkillNames.map((skillName) => skillName.trim());
  const targetSkillName = fixture.target_skill_name.trim();
  const competingSkillNames = new Set(
    fixture.competing_skill_paths.map((skillPath) => basename(dirname(skillPath)).trim()),
  );

  if (invokedNames.includes(targetSkillName)) {
    return verdict(true, entry.should_trigger, `invoked target skill: ${targetSkillName}`);
  }

  const competingInvoked = invokedNames.find((skillName) => competingSkillNames.has(skillName));
  if (competingInvoked) {
    return verdict(
      false,
      !entry.should_trigger,
      `invoked competing skill: ${competingInvoked}`,
    );
  }

  const unrelatedInvoked = invokedNames.find(
    (skillName) => skillName !== targetSkillName && skillName !== competingInvoked,
  );
  if (unrelatedInvoked) {
    return verdict(false, false, `invoked unrelated skill: ${unrelatedInvoked}`);
  }

  // No skill was invoked; fall back to what the run read from disk.
  const targetStagedPath = resolveReplayPath(workspace.targetSkillPath);
  const competingStagedPaths = workspace.competingSkillPaths.map((skillPath) =>
    resolveReplayPath(skillPath),
  );
  const allowedReadPaths = new Set([targetStagedPath, ...competingStagedPaths]);
  const normalizedReadPaths = new Set(
    observation.readSkillPaths.map((path) => resolveObservedReplayPath(path, workspace.rootDir)),
  );

  const unrelatedReadPaths = [...normalizedReadPaths].filter(
    (path) => !allowedReadPaths.has(path),
  );
  if (unrelatedReadPaths.length > 0) {
    return verdict(
      false,
      false,
      `read files outside staged skill set: ${unrelatedReadPaths.join(", ")}`,
    );
  }

  if (normalizedReadPaths.has(targetStagedPath)) {
    return verdict(
      false,
      !entry.should_trigger,
      "only read the target skill without invoking it",
    );
  }

  if (competingStagedPaths.some((skillPath) => normalizedReadPaths.has(skillPath))) {
    return verdict(
      false,
      !entry.should_trigger,
      "only read a competing skill without invoking it",
    );
  }

  if (observation.runtimeError) {
    throw new Error(`${sessionPrefix} did not reach a skill decision: ${observation.runtimeError}`);
  }

  return verdict(false, !entry.should_trigger, "did not invoke any local project skill");
}
|
|
427
|
+
|
|
428
|
+
/**
 * Reads a SKILL.md file and tokenizes the parts used for replay scoring.
 *
 * @param skillPath - Path of the SKILL.md file.
 * @returns The skill's name and token sets. If the file cannot be read or
 *   processed, the name falls back to the parent directory name (or
 *   `"unknown-skill"`) and both token sets are empty.
 */
function loadReplaySkillSurface(skillPath: string): ReplaySkillSurface {
  const directoryName = basename(dirname(skillPath)) || "unknown-skill";

  try {
    const { name, description, body } = parseFrontmatter(readFileSync(skillPath, "utf8"));
    const whenToUseText = extractWhenToUseLines(body).join(" ");
    return {
      skillName: name.trim() || directoryName,
      descriptionTokens: tokenizeText(description),
      whenToUseTokens: tokenizeText(whenToUseText),
    };
  } catch {
    // Any failure degrades to a name-only surface.
    return {
      skillName: directoryName,
      descriptionTokens: new Set<string>(),
      whenToUseTokens: new Set<string>(),
    };
  }
}
|
|
446
|
+
|
|
447
|
+
/**
 * Pulls trigger phrases out of the first column of a Markdown routing table.
 *
 * Blank lines are ignored. Data rows are the rows after the table's separator
 * row (`| --- | --- |`); when no separator is found, the first two non-blank
 * lines are assumed to be header and separator. Separator rows never produce
 * phrases. Each first-column cell is split on `,`, `/` and the word "or"; one
 * surrounding quote or backtick is stripped from each part, and parts shorter
 * than three characters are dropped.
 *
 * @param routing - Routing section text containing the table.
 * @returns Trigger phrases in table order; empty when there are fewer than
 *   three non-blank lines.
 */
function extractRoutingTriggerPhrases(routing: string): string[] {
  const lines = routing
    .trim()
    .split("\n")
    .map((line) => line.trim())
    .filter(Boolean);
  if (lines.length < 3) return [];

  const separatorCell = /^:?-+:?$/;
  const isTableRow = (row: string): boolean => row.startsWith("|") && row.endsWith("|");
  const splitCells = (row: string): string[] => row.split("|").map((cell) => cell.trim());
  const isSeparatorRow = (row: string): boolean => {
    if (!isTableRow(row)) return false;
    // Drop the empty strings produced by the leading and trailing pipes.
    const inner = splitCells(row).slice(1, -1);
    return inner.length > 0 && inner.every((cell) => separatorCell.test(cell));
  };

  // A heading or prose line before the table shifts the separator down; start
  // after it so neither the header nor the separator is read as a trigger.
  const separatorIndex = lines.findIndex(isSeparatorRow);
  const firstDataRow = separatorIndex >= 1 ? separatorIndex + 1 : 2;

  const phrases: string[] = [];
  for (const row of lines.slice(firstDataRow)) {
    if (!isTableRow(row) || isSeparatorRow(row)) continue;
    const triggerCell = splitCells(row)[1];
    if (!triggerCell) continue;
    for (const part of triggerCell.split(/,|\/| or /i)) {
      // Trim again after unquoting so `"deploy "` yields `deploy`.
      const phrase = part
        .trim()
        .replace(/^["'`]|["'`]$/g, "")
        .trim();
      if (phrase.length >= 3) phrases.push(phrase);
    }
  }
  return phrases;
}
|
|
468
|
+
|
|
469
|
+
/**
 * Scores a query against a list of routing trigger phrases.
 *
 * A phrase that occurs in the query as a case-insensitive substring scores 1;
 * any other phrase scores the Jaccard similarity between the query's tokens
 * and the phrase's tokens.
 *
 * @param query - Query text.
 * @param triggerPhrases - Phrases to compare against.
 * @returns The best score over all phrases, or 0 when there are none.
 */
function scoreQueryAgainstTriggerPhrases(query: string, triggerPhrases: string[]): number {
  const loweredQuery = query.toLowerCase();
  const queryTokens = tokenizeText(query);

  return triggerPhrases.reduce((best, phrase) => {
    const phraseScore = loweredQuery.includes(phrase.toLowerCase())
      ? 1
      : jaccardSimilarity(queryTokens, tokenizeText(phrase));
    return Math.max(best, phraseScore);
  }, 0);
}
|
|
483
|
+
|
|
484
|
+
/**
 * Scores a query against a skill surface.
 *
 * @param query - Query text.
 * @param surface - Tokenized skill description and "when to use" lines.
 * @returns The larger of the Jaccard similarities between the query tokens and
 *   each of the surface's two token sets.
 */
function scoreQueryAgainstSkillSurface(query: string, surface: ReplaySkillSurface): number {
  const tokens = tokenizeText(query);
  const descriptionScore = jaccardSimilarity(tokens, surface.descriptionTokens);
  const whenToUseScore = jaccardSimilarity(tokens, surface.whenToUseTokens);
  return Math.max(descriptionScore, whenToUseScore);
}
|
|
491
|
+
|
|
492
|
+
/**
 * Simulates whether a query would trigger the target skill, without running
 * the host.
 *
 * Decision order:
 * 1. the query names the target skill: triggered;
 * 2. the query names a competing skill: not triggered;
 * 3. the target score (best of routing-phrase score and skill-surface score)
 *    is below `HOST_REPLAY_MATCH_THRESHOLD`: not triggered;
 * 4. the best competing surface scores at least as high as the target: not triggered;
 * 5. otherwise triggered, with evidence naming whichever signal was stronger.
 *
 * @param query - Query text; surrounding whitespace is ignored.
 * @param routing - Candidate routing table text.
 * @param targetSurface - Surface of the skill under evaluation.
 * @param competingSurfaces - Surfaces of the competing skills.
 * @returns The simulated decision and a short explanation.
 */
function evaluateReplayTrigger(
  query: string,
  routing: string,
  targetSurface: ReplaySkillSurface,
  competingSurfaces: ReplaySkillSurface[],
): { triggered: boolean; evidence: string } {
  const normalizedQuery = query.trim();

  if (containsWholeSkillMention(normalizedQuery, targetSurface.skillName)) {
    return { triggered: true, evidence: `explicit target mention: ${targetSurface.skillName}` };
  }

  const mentionedCompetitor = competingSurfaces.find((surface) =>
    containsWholeSkillMention(normalizedQuery, surface.skillName),
  );
  if (mentionedCompetitor) {
    return {
      triggered: false,
      evidence: `explicit competing skill mention: ${mentionedCompetitor.skillName}`,
    };
  }

  const triggerScore = scoreQueryAgainstTriggerPhrases(
    normalizedQuery,
    extractRoutingTriggerPhrases(routing),
  );
  const targetSurfaceScore = scoreQueryAgainstSkillSurface(normalizedQuery, targetSurface);
  const targetScore = Math.max(triggerScore, targetSurfaceScore);

  // Highest-scoring competitor; on ties the earliest one in the list is kept.
  let bestCompetitor: { skillName: string; score: number } | undefined;
  for (const surface of competingSurfaces) {
    const score = scoreQueryAgainstSkillSurface(normalizedQuery, surface);
    if (!bestCompetitor || score > bestCompetitor.score) {
      bestCompetitor = { skillName: surface.skillName, score };
    }
  }

  if (targetScore < HOST_REPLAY_MATCH_THRESHOLD) {
    return {
      triggered: false,
      evidence: "target routing and skill surface did not clear replay threshold",
    };
  }

  // Ties go to the competitor.
  if (bestCompetitor && bestCompetitor.score >= targetScore) {
    return {
      triggered: false,
      evidence: `competing skill surface scored higher: ${bestCompetitor.skillName}`,
    };
  }

  if (triggerScore < targetSurfaceScore) {
    return {
      triggered: true,
      evidence: "query aligned with target skill surface in replay fixture",
    };
  }

  const routingEvidence =
    triggerScore === 1
      ? "query matched a routing trigger phrase exactly"
      : "query aligned with routing trigger language";
  return { triggered: true, evidence: routingEvidence };
}
|
|
555
|
+
|
|
556
|
+
/**
 * Evaluates an eval set against a fixture using the text-similarity
 * simulation only; no host process is started.
 *
 * @param options.routing - Candidate routing table text.
 * @param options.evalSet - Entries to evaluate.
 * @param options.fixture - Fixture naming the target and competing skill files.
 * @returns One result per entry, in input order. An entry passes when the
 *   simulated trigger decision equals its `should_trigger`.
 */
export function runHostReplayFixture(options: {
  routing: string;
  evalSet: EvalEntry[];
  fixture: RoutingReplayFixture;
}): RoutingReplayEntryResult[] {
  const { routing, evalSet, fixture } = options;
  const targetSurface = loadReplaySkillSurface(fixture.target_skill_path);
  const competingSurfaces = fixture.competing_skill_paths.map((skillPath) =>
    loadReplaySkillSurface(skillPath),
  );

  return evalSet.map(({ query, should_trigger }) => {
    const { triggered, evidence } = evaluateReplayTrigger(
      query,
      routing,
      targetSurface,
      competingSurfaces,
    );
    return {
      query,
      should_trigger,
      triggered,
      passed: triggered === should_trigger,
      evidence,
    };
  });
}
|
|
580
|
+
|
|
581
|
+
/**
 * Evaluates an eval set by replaying each query through the host runtime,
 * falling back to the fixture simulation when that is not possible.
 *
 * Queries are run one at a time against a single staged workspace, which is
 * removed before returning. The simulation (`runHostReplayFixture`) is used for
 * the whole eval set, with the reason prefixed to every evidence string, when
 * the fixture platform is not `"claude_code"` or when staging, invoking or
 * evaluating any entry throws.
 *
 * @param options.routing - Candidate routing table text.
 * @param options.evalSet - Entries to evaluate.
 * @param options.fixture - Fixture naming the target and competing skill files.
 * @param options.runtimeInvoker - Replacement for the default `claude` CLI invoker.
 * @returns One result per entry, in input order.
 */
export async function runClaudeRuntimeReplayFixture(options: {
  routing: string;
  evalSet: EvalEntry[];
  fixture: RoutingReplayFixture;
  runtimeInvoker?: ClaudeRuntimeReplayInvoker;
}): Promise<RoutingReplayEntryResult[]> {
  const { routing, evalSet, fixture } = options;

  const simulateBecause = (reason: string): RoutingReplayEntryResult[] =>
    prefixReplayEvidence(
      runHostReplayFixture(options),
      `runtime replay unavailable; fell back to fixture simulation (${reason})`,
    );

  if (fixture.platform !== "claude_code") {
    return simulateBecause(`unsupported platform ${fixture.platform}`);
  }

  const invokeRuntime = options.runtimeInvoker ?? invokeClaudeRuntimeReplay;
  let workspace: ReplayWorkspace | undefined;

  try {
    workspace = buildRuntimeReplayWorkspace(fixture, routing);
    const collected: RoutingReplayEntryResult[] = [];

    for (const entry of evalSet) {
      const observation = await invokeRuntime({
        query: entry.query,
        workspaceRoot: workspace.rootDir,
        targetSkillName: fixture.target_skill_name,
        targetSkillPath: workspace.targetSkillPath,
        competingSkillPaths: workspace.competingSkillPaths,
      });
      const evaluated = evaluateRuntimeReplayObservation(entry, fixture, observation, workspace);
      collected.push(evaluated);
    }

    return collected;
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    return simulateBecause(reason);
  } finally {
    if (workspace) {
      cleanupRuntimeReplayWorkspace(workspace);
    }
  }
}
|
|
@@ -40,6 +40,8 @@ export interface ValidationResult {
|
|
|
40
40
|
net_change: number; // after - before pass rate
|
|
41
41
|
by_invocation_type?: InvocationTypeScores;
|
|
42
42
|
per_entry_results?: Array<{ entry: EvalEntry; before_pass: boolean; after_pass: boolean }>;
|
|
43
|
+
validation_mode?: "llm_judge";
|
|
44
|
+
validation_agent?: string;
|
|
43
45
|
}
|
|
44
46
|
|
|
45
47
|
// ---------------------------------------------------------------------------
|
|
@@ -63,6 +65,8 @@ export async function validateProposalSequential(
|
|
|
63
65
|
regressions: [],
|
|
64
66
|
new_passes: [],
|
|
65
67
|
net_change: 0,
|
|
68
|
+
validation_mode: "llm_judge",
|
|
69
|
+
validation_agent: agent,
|
|
66
70
|
};
|
|
67
71
|
}
|
|
68
72
|
|
|
@@ -174,6 +178,8 @@ export async function validateProposalSequential(
|
|
|
174
178
|
net_change: netChange,
|
|
175
179
|
by_invocation_type: invocationScores,
|
|
176
180
|
per_entry_results: perEntryResults,
|
|
181
|
+
validation_mode: "llm_judge",
|
|
182
|
+
validation_agent: agent,
|
|
177
183
|
};
|
|
178
184
|
}
|
|
179
185
|
|
|
@@ -220,6 +226,8 @@ export async function validateProposalBatched(
|
|
|
220
226
|
regressions: [],
|
|
221
227
|
new_passes: [],
|
|
222
228
|
net_change: 0,
|
|
229
|
+
validation_mode: "llm_judge",
|
|
230
|
+
validation_agent: agent,
|
|
223
231
|
};
|
|
224
232
|
}
|
|
225
233
|
|
|
@@ -342,6 +350,8 @@ export async function validateProposalBatched(
|
|
|
342
350
|
net_change: netChange,
|
|
343
351
|
by_invocation_type: invocationScores,
|
|
344
352
|
per_entry_results: perEntryResults,
|
|
353
|
+
validation_mode: "llm_judge",
|
|
354
|
+
validation_agent: agent,
|
|
345
355
|
};
|
|
346
356
|
}
|
|
347
357
|
|