selftune 0.2.19 → 0.2.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,624 @@
1
+ import {
2
+ existsSync,
3
+ mkdirSync,
4
+ mkdtempSync,
5
+ readFileSync,
6
+ readdirSync,
7
+ realpathSync,
8
+ rmSync,
9
+ statSync,
10
+ writeFileSync,
11
+ } from "node:fs";
12
+ import { tmpdir } from "node:os";
13
+ import { basename, dirname, isAbsolute, join } from "node:path";
14
+
15
+ import type { EvalEntry, RoutingReplayEntryResult, RoutingReplayFixture } from "../types.js";
16
+ import { parseFrontmatter } from "../utils/frontmatter.js";
17
+ import { containsWholeSkillMention } from "../utils/skill-discovery.js";
18
+ import { findGitRepositoryRoot } from "../utils/skill-discovery.js";
19
+ import {
20
+ extractWhenToUseLines,
21
+ jaccardSimilarity,
22
+ tokenizeText,
23
+ } from "../utils/text-similarity.js";
24
+ import { replaceSection } from "./deploy-proposal.js";
25
+
26
/**
 * Tokenized view of one skill file used by the host (simulated) replay.
 * Built by `loadReplaySkillSurface`.
 */
interface ReplaySkillSurface {
  /** Frontmatter name, or the skill's directory name when that is unavailable. */
  skillName: string;
  /** Tokens produced from the frontmatter description. */
  descriptionTokens: Set<string>;
  /** Tokens produced from the "when to use" lines extracted from the body. */
  whenToUseTokens: Set<string>;
}
31
+
32
/** Temporary on-disk workspace staged for a runtime replay. */
interface ReplayWorkspace {
  /** Root of the temporary directory; removed as a whole during cleanup. */
  rootDir: string;
  /** Path of the staged copy of the target skill's SKILL.md. */
  targetSkillPath: string;
  /** Paths of the staged copies of each competing skill's SKILL.md. */
  competingSkillPaths: string[];
}
37
+
38
/** Arguments handed to a runtime replay invoker for a single eval query. */
export interface ClaudeRuntimeReplayInvokerInput {
  /** The eval query to route. */
  query: string;
  /** Directory the runtime is executed in (the staged workspace root). */
  workspaceRoot: string;
  /** Name of the skill under evaluation. */
  targetSkillName: string;
  /** Staged SKILL.md path of the skill under evaluation. */
  targetSkillPath: string;
  /** Staged SKILL.md paths of the competing skills. */
  competingSkillPaths: string[];
}
45
+
46
/** What a runtime replay invoker observed for a single query. */
export interface ClaudeRuntimeReplayObservation {
  /** Names passed to the `Skill` tool during the run. */
  invokedSkillNames: string[];
  /** File paths passed to the `Read` tool during the run. */
  readSkillPaths: string[];
  /** Unparsed output captured from the runtime. */
  rawOutput: string;
  /** Session identifier reported by the runtime, when one was seen. */
  sessionId?: string;
  /** Error text reported by the runtime, when any was seen. */
  runtimeError?: string;
}
53
+
54
/**
 * Function that runs one query against a runtime and reports what it observed.
 * `runClaudeRuntimeReplayFixture` accepts one of these to replace the default
 * `claude` CLI invocation.
 */
export type ClaudeRuntimeReplayInvoker = (
  input: ClaudeRuntimeReplayInvokerInput,
) => Promise<ClaudeRuntimeReplayObservation>;
57
+
58
/**
 * Score floor for the host replay: a routing-phrase or skill-surface score
 * below this value is not counted as a match. Chosen to filter out weak false
 * positives while still letting short routing phrases and sparse skill
 * surfaces match.
 */
const HOST_REPLAY_MATCH_THRESHOLD = 0.18;

/** Milliseconds a spawned runtime replay may run before it is killed. */
const CLAUDE_RUNTIME_REPLAY_TIMEOUT_MS = 30_000;

/** System prompt appended to every runtime replay so the model only routes. */
const CLAUDE_RUNTIME_ROUTING_PROMPT =
  "You are being evaluated only on skill routing. Do not solve the user's task. If a local project skill is relevant, invoke exactly one skill immediately. If no local project skill fits, respond with NO_SKILL and do not browse unrelated files.";
67
+
68
/**
 * Returns the canonical form of `path` as reported by `realpathSync`.
 * When the path cannot be resolved (for example it does not exist), the input
 * is returned unchanged instead of throwing.
 */
function resolveReplayPath(path: string): string {
  let canonical = path;
  try {
    canonical = realpathSync(path);
  } catch {
    // Unresolvable path: keep the caller's value as-is.
  }
  return canonical;
}
75
+
76
/**
 * Canonicalizes a path reported by the runtime. Relative paths are anchored
 * at `workspaceRoot` first; absolute paths are canonicalized directly.
 */
function resolveObservedReplayPath(path: string, workspaceRoot: string): string {
  if (isAbsolute(path)) {
    return resolveReplayPath(path);
  }
  return resolveReplayPath(join(workspaceRoot, path));
}
79
+
80
/**
 * Lists the SKILL.md files of every sibling skill directory that sits next to
 * the target skill's directory.
 *
 * @param targetSkillPath Path of the target skill's SKILL.md.
 * @returns Canonicalized sibling SKILL.md paths, sorted with `localeCompare`.
 *   Empty when the parent registry directory cannot be read.
 */
function listCompetingSkillPaths(targetSkillPath: string): string[] {
  const targetDir = dirname(resolveReplayPath(targetSkillPath));
  const registryRoot = dirname(targetDir);
  const ownDirName = basename(targetDir);

  let siblingNames: string[];
  try {
    siblingNames = readdirSync(registryRoot);
  } catch {
    // Ignore unreadable registries and treat the fixture as target-only.
    return [];
  }

  // Entries that cannot be stat'ed are treated as "not a directory".
  const isDirectory = (candidate: string): boolean => {
    try {
      return statSync(candidate).isDirectory();
    } catch {
      return false;
    }
  };

  return siblingNames
    .filter((name) => name !== ownDirName)
    .map((name) => join(registryRoot, name))
    .filter(isDirectory)
    .map((dir) => join(dir, "SKILL.md"))
    .filter((skillFile) => existsSync(skillFile))
    .map((skillFile) => resolveReplayPath(skillFile))
    .sort((left, right) => left.localeCompare(right));
}
107
+
108
/**
 * Builds a routing replay fixture for one skill.
 *
 * @param options.skillName Name recorded as the fixture's target skill.
 * @param options.skillPath Path of the target SKILL.md; canonicalized when possible.
 * @param options.platform Platform tag; defaults to `"claude_code"`.
 * @param options.fixtureId Explicit fixture id; defaults to `auto-<platform>-<skillName>`.
 * @param options.workspaceRoot Explicit workspace root; otherwise looked up with
 *   `findGitRepositoryRoot` starting two directory levels above the skill file.
 * @returns The fixture. `workspace_root` is present only when a root was
 *   supplied or found.
 */
export function buildRoutingReplayFixture(options: {
  skillName: string;
  skillPath: string;
  platform?: RoutingReplayFixture["platform"];
  fixtureId?: string;
  workspaceRoot?: string;
}): RoutingReplayFixture {
  const { skillName, skillPath, fixtureId } = options;
  const targetSkillPath = resolveReplayPath(skillPath);
  const registryDir = dirname(dirname(targetSkillPath));
  const workspaceRoot = options.workspaceRoot ?? findGitRepositoryRoot(registryDir);
  const platform = options.platform ?? "claude_code";
  const defaultFixtureId = `auto-${platform}-${skillName}`;
  const competingSkillPaths = listCompetingSkillPaths(targetSkillPath);
  const workspaceField = workspaceRoot ? { workspace_root: workspaceRoot } : {};

  return {
    fixture_id: fixtureId ?? defaultFixtureId,
    platform,
    target_skill_name: skillName,
    target_skill_path: targetSkillPath,
    competing_skill_paths: competingSkillPaths,
    ...workspaceField,
  };
}
129
+
130
/**
 * Reads the skill file at `skillPath` and returns its text with the
 * "Workflow Routing" section replaced by the trimmed `routing` text.
 */
function buildRuntimeReplayTargetContent(skillPath: string, routing: string): string {
  const skillMarkdown = readFileSync(skillPath, "utf8");
  const proposedRouting = routing.trim();
  return replaceSection(skillMarkdown, "Workflow Routing", proposedRouting);
}
134
+
135
/**
 * Copies one skill into a staged registry as `<registryDir>/<skill dir>/SKILL.md`.
 *
 * @param registryDir Destination registry directory.
 * @param sourceSkillPath Source SKILL.md; its parent directory name becomes the
 *   staged directory name (`"unknown-skill"` when that name is empty).
 * @param overrideContent Text to write instead of the source file's content.
 * @returns Path of the staged SKILL.md.
 */
function stageReplaySkill(
  registryDir: string,
  sourceSkillPath: string,
  overrideContent?: string,
): string {
  const sourceDirName = basename(dirname(sourceSkillPath));
  const stagedDir = join(registryDir, sourceDirName || "unknown-skill");
  mkdirSync(stagedDir, { recursive: true });

  const stagedSkillPath = join(stagedDir, "SKILL.md");
  writeFileSync(stagedSkillPath, overrideContent ?? readFileSync(sourceSkillPath, "utf8"), "utf8");
  return stagedSkillPath;
}
148
+
149
/**
 * Creates a temporary workspace for a runtime replay: an empty `.git`
 * directory plus a `.claude/skills` registry holding the target skill (with
 * its "Workflow Routing" section replaced by `routing`) and unmodified copies
 * of every competing skill.
 *
 * The temporary directory is removed again if staging fails, and the original
 * error is rethrown.
 */
function buildRuntimeReplayWorkspace(
  fixture: RoutingReplayFixture,
  routing: string,
): ReplayWorkspace {
  const rootDir = mkdtempSync(join(tmpdir(), "selftune-runtime-replay-"));
  try {
    const registryDir = join(rootDir, ".claude", "skills");
    for (const dir of [join(rootDir, ".git"), registryDir]) {
      mkdirSync(dir, { recursive: true });
    }

    const patchedTargetContent = buildRuntimeReplayTargetContent(
      fixture.target_skill_path,
      routing,
    );
    const targetSkillPath = stageReplaySkill(
      registryDir,
      fixture.target_skill_path,
      patchedTargetContent,
    );

    const competingSkillPaths: string[] = [];
    for (const competingSource of fixture.competing_skill_paths) {
      competingSkillPaths.push(stageReplaySkill(registryDir, competingSource));
    }

    return { rootDir, targetSkillPath, competingSkillPaths };
  } catch (error) {
    rmSync(rootDir, { recursive: true, force: true });
    throw error;
  }
}
178
+
179
/** Recursively deletes the workspace's root directory; a missing directory is not an error. */
function cleanupRuntimeReplayWorkspace(workspace: ReplayWorkspace): void {
  const { rootDir } = workspace;
  rmSync(rootDir, { recursive: true, force: true });
}
182
+
183
/** Narrows a parsed JSON value to a plain, non-array object. */
function isReplayJsonRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === "object" && value !== null && !Array.isArray(value);
}

/**
 * Extracts routing signals from line-delimited JSON output.
 *
 * Each non-empty line is parsed as JSON on its own. Lines that are not valid
 * JSON, or that do not parse to a plain object, are skipped. From the
 * remaining events the parser keeps:
 * - the last non-empty string `session_id`,
 * - the last non-empty string `error`,
 * - for `type: "assistant"` events, every `tool_use` content block named
 *   `Skill` (its `input.skill`) or `Read` (its `input.file_path`).
 *
 * @param rawOutput Captured stdout of the runtime.
 * @returns De-duplicated skill names and read paths, the raw output, and the
 *   session id / error when present. Absolute read paths are canonicalized;
 *   relative read paths are returned trimmed but otherwise untouched so the
 *   caller can resolve them against the replay workspace.
 */
function parseClaudeRuntimeReplayOutput(rawOutput: string): ClaudeRuntimeReplayObservation {
  const invokedSkillNames = new Set<string>();
  const readSkillPaths = new Set<string>();
  let sessionId: string | undefined;
  let runtimeError: string | undefined;

  for (const line of rawOutput.split("\n")) {
    const trimmed = line.trim();
    if (!trimmed) continue;

    let parsed: unknown;
    try {
      parsed = JSON.parse(trimmed);
    } catch {
      continue;
    }
    // `JSON.parse("null")` succeeds; without this guard the property reads
    // below would throw and abort the whole replay.
    if (!isReplayJsonRecord(parsed)) continue;

    const maybeSessionId = parsed.session_id;
    if (typeof maybeSessionId === "string" && maybeSessionId) {
      sessionId = maybeSessionId;
    }

    const maybeError = parsed.error;
    if (typeof maybeError === "string" && maybeError) {
      runtimeError = maybeError;
    }

    const message = parsed.message;
    if (parsed.type !== "assistant" || !isReplayJsonRecord(message)) continue;
    const content = message.content;
    if (!Array.isArray(content)) continue;

    for (const block of content) {
      if (!isReplayJsonRecord(block) || block.type !== "tool_use") continue;

      const toolName = block.name;
      const rawInput = block.input;
      const input: Record<string, unknown> = isReplayJsonRecord(rawInput) ? rawInput : {};

      if (toolName === "Skill") {
        const skillName = input.skill;
        if (typeof skillName === "string" && skillName.trim()) {
          invokedSkillNames.add(skillName.trim());
        }
      }

      if (toolName === "Read") {
        const filePath = input.file_path;
        if (typeof filePath === "string" && filePath.trim()) {
          const observedPath = filePath.trim();
          // realpathSync resolves relative paths against this process's cwd,
          // not the replay workspace, so only absolute paths are canonicalized here.
          readSkillPaths.add(
            isAbsolute(observedPath) ? resolveReplayPath(observedPath) : observedPath,
          );
        }
      }
    }
  }

  return {
    invokedSkillNames: [...invokedSkillNames],
    readSkillPaths: [...readSkillPaths],
    rawOutput,
    ...(sessionId ? { sessionId } : {}),
    ...(runtimeError ? { runtimeError } : {}),
  };
}
251
+
252
/**
 * Default runtime invoker: runs the `claude` CLI once for `input.query` inside
 * `input.workspaceRoot` and parses its stream-json stdout.
 *
 * The process is killed after `CLAUDE_RUNTIME_REPLAY_TIMEOUT_MS`. The kill
 * timer is always cleared once the process output has been collected, even if
 * collecting it fails.
 *
 * @returns The parsed observation. `runtimeError` combines any error found in
 *   the output, trimmed stderr, and a timeout note, joined with " | ".
 * @throws Error when the process exits non-zero without any Skill/Read signal.
 */
async function invokeClaudeRuntimeReplay(
  input: ClaudeRuntimeReplayInvokerInput,
): Promise<ClaudeRuntimeReplayObservation> {
  const command = [
    "claude",
    "-p",
    "--verbose",
    "--output-format",
    "stream-json",
    "--dangerously-skip-permissions",
    "--no-session-persistence",
    "--setting-sources",
    "project,local",
    "--tools",
    "Skill,Read",
    "--max-turns",
    "1",
    "--append-system-prompt",
    CLAUDE_RUNTIME_ROUTING_PROMPT,
    input.query,
  ];

  const proc = Bun.spawn(command, {
    cwd: input.workspaceRoot,
    stdout: "pipe",
    stderr: "pipe",
    // NOTE(review): CLAUDECODE is blanked for the child — presumably so the CLI
    // does not treat this as a nested session; confirm against the CLI docs.
    env: { ...process.env, CLAUDECODE: "" },
  });

  let timedOut = false;
  const timeout = setTimeout(() => {
    timedOut = true;
    proc.kill();
  }, CLAUDE_RUNTIME_REPLAY_TIMEOUT_MS);

  // `.finally` guarantees the timer is released on both success and failure,
  // so a rejected read cannot leave a pending kill timer behind.
  const [stdoutText, stderrText, exitCode] = await Promise.all([
    new Response(proc.stdout).text(),
    new Response(proc.stderr).text(),
    proc.exited,
  ]).finally(() => clearTimeout(timeout));

  const observation = parseClaudeRuntimeReplayOutput(stdoutText);
  const timeoutNote = timedOut
    ? `claude runtime replay timed out after ${CLAUDE_RUNTIME_REPLAY_TIMEOUT_MS}ms`
    : "";
  const combinedError = [observation.runtimeError, stderrText.trim(), timeoutNote]
    .filter(Boolean)
    .join(" | ");
  const hasRoutingSignal =
    observation.invokedSkillNames.length > 0 || observation.readSkillPaths.length > 0;

  // A failed exit is tolerated when the run still produced a routing signal.
  if (exitCode !== 0 && !hasRoutingSignal) {
    throw new Error(combinedError || `claude runtime replay exited with code ${exitCode}`);
  }

  return {
    ...observation,
    ...(combinedError ? { runtimeError: combinedError } : {}),
  };
}
303
+
304
/**
 * Returns copies of `results` whose evidence starts with `prefix`. Existing
 * evidence is appended after "; "; entries without evidence get the prefix alone.
 */
function prefixReplayEvidence(
  results: RoutingReplayEntryResult[],
  prefix: string,
): RoutingReplayEntryResult[] {
  const prefixed: RoutingReplayEntryResult[] = [];
  for (const result of results) {
    const evidence = result.evidence ? `${prefix}; ${result.evidence}` : prefix;
    prefixed.push({ ...result, evidence });
  }
  return prefixed;
}
313
+
314
/**
 * Turns one runtime observation into a pass/fail result for an eval entry.
 *
 * Invoked skill names are normalized once (trimmed, blanks dropped,
 * de-duplicated) so every comparison uses the same form. Decisions are taken
 * in this order:
 * 1. more than one distinct skill invoked: failed;
 * 2. target skill invoked: triggered, passes when the entry should trigger;
 * 3. competing skill invoked (matched by staged directory name): not
 *    triggered, passes when the entry should not trigger;
 * 4. any other skill invoked: failed;
 * 5. a file outside the staged skill set was read: failed;
 * 6. only the target or a competing skill file was read: not triggered;
 * 7. nothing observed: not triggered.
 *
 * @throws Error when no decision was observed and the runtime reported an error.
 */
function evaluateRuntimeReplayObservation(
  entry: EvalEntry,
  fixture: RoutingReplayFixture,
  observation: ClaudeRuntimeReplayObservation,
  workspace: ReplayWorkspace,
): RoutingReplayEntryResult {
  const sessionPrefix = observation.sessionId
    ? `runtime replay session ${observation.sessionId}`
    : "runtime replay";

  // Every outcome shares the same shape; only these three fields vary.
  const verdict = (
    triggered: boolean,
    passed: boolean,
    detail: string,
  ): RoutingReplayEntryResult => ({
    query: entry.query,
    should_trigger: entry.should_trigger,
    triggered,
    passed,
    evidence: `${sessionPrefix} ${detail}`,
  });

  // Injected invokers are not guaranteed to trim or de-duplicate names.
  const invokedSkillNames = [
    ...new Set(observation.invokedSkillNames.map((name) => name.trim()).filter(Boolean)),
  ];

  if (invokedSkillNames.length > 1) {
    return verdict(false, false, `invoked multiple skills: ${invokedSkillNames.join(", ")}`);
  }

  const invokedSkillName = invokedSkillNames[0];
  if (invokedSkillName !== undefined) {
    const targetSkillName = fixture.target_skill_name.trim();
    if (invokedSkillName === targetSkillName) {
      return verdict(true, entry.should_trigger, `invoked target skill: ${targetSkillName}`);
    }

    const competingSkillNames = new Set(
      fixture.competing_skill_paths.map((skillPath) => basename(dirname(skillPath)).trim()),
    );
    if (competingSkillNames.has(invokedSkillName)) {
      return verdict(
        false,
        !entry.should_trigger,
        `invoked competing skill: ${invokedSkillName}`,
      );
    }

    return verdict(false, false, `invoked unrelated skill: ${invokedSkillName}`);
  }

  const stagedTargetPath = resolveReplayPath(workspace.targetSkillPath);
  const stagedCompetingPaths = workspace.competingSkillPaths.map((skillPath) =>
    resolveReplayPath(skillPath),
  );
  const allowedReadPaths = new Set([stagedTargetPath, ...stagedCompetingPaths]);
  const observedReadPaths = new Set(
    observation.readSkillPaths.map((path) => resolveObservedReplayPath(path, workspace.rootDir)),
  );

  const unrelatedReadPaths = [...observedReadPaths].filter((path) => !allowedReadPaths.has(path));
  if (unrelatedReadPaths.length > 0) {
    return verdict(
      false,
      false,
      `read files outside staged skill set: ${unrelatedReadPaths.join(", ")}`,
    );
  }

  if (observedReadPaths.has(stagedTargetPath)) {
    return verdict(false, !entry.should_trigger, "only read the target skill without invoking it");
  }

  if (stagedCompetingPaths.some((skillPath) => observedReadPaths.has(skillPath))) {
    return verdict(
      false,
      !entry.should_trigger,
      "only read a competing skill without invoking it",
    );
  }

  // No signal at all plus a runtime error means the run never reached a decision.
  if (observation.runtimeError) {
    throw new Error(`${sessionPrefix} did not reach a skill decision: ${observation.runtimeError}`);
  }

  return verdict(false, !entry.should_trigger, "did not invoke any local project skill");
}
427
+
428
/**
 * Loads the tokenized surface of the skill file at `skillPath`.
 *
 * If reading or parsing fails for any reason, the result carries the skill's
 * directory name (or `"unknown-skill"`) and empty token sets.
 */
function loadReplaySkillSurface(skillPath: string): ReplaySkillSurface {
  const directoryName = basename(dirname(skillPath)) || "unknown-skill";
  let surface: ReplaySkillSurface = {
    skillName: directoryName,
    descriptionTokens: new Set<string>(),
    whenToUseTokens: new Set<string>(),
  };

  try {
    const frontmatter = parseFrontmatter(readFileSync(skillPath, "utf8"));
    const declaredName = frontmatter.name.trim();
    const descriptionTokens = tokenizeText(frontmatter.description);
    const whenToUseText = extractWhenToUseLines(frontmatter.body).join(" ");
    surface = {
      skillName: declaredName || directoryName,
      descriptionTokens,
      whenToUseTokens: tokenizeText(whenToUseText),
    };
  } catch {
    // Keep the directory-name fallback prepared above.
  }

  return surface;
}
446
+
447
/**
 * Pulls trigger phrases out of a pipe-delimited routing table.
 *
 * The first two non-empty lines are skipped (header and separator). For every
 * later line that both starts and ends with "|", the first cell is split on
 * ",", "/" or " or " (case-insensitive); one surrounding quote or backtick is
 * stripped from each end, and pieces shorter than three characters are dropped.
 *
 * @returns Phrases in table order; empty when there are fewer than three
 *   non-empty lines.
 */
function extractRoutingTriggerPhrases(routing: string): string[] {
  const rows = routing
    .trim()
    .split("\n")
    .map((line) => line.trim())
    .filter((line) => line.length > 0);
  if (rows.length < 3) return [];

  return rows
    .slice(2)
    .filter((row) => row.startsWith("|") && row.endsWith("|"))
    .flatMap((row) => {
      // Index 0 is the empty text before the leading pipe.
      const firstCell = row.split("|")[1]?.trim();
      if (!firstCell) return [];
      return firstCell
        .split(/,|\/| or /i)
        .map((fragment) => fragment.trim().replace(/^["'`]|["'`]$/g, ""))
        .filter((phrase) => phrase.length >= 3);
    });
}
468
+
469
/**
 * Scores a query against routing trigger phrases and returns the best score.
 * A phrase contained in the query (case-insensitive) scores 1; otherwise the
 * score is the Jaccard similarity of the two token sets. Returns 0 when there
 * are no phrases.
 */
function scoreQueryAgainstTriggerPhrases(query: string, triggerPhrases: string[]): number {
  const loweredQuery = query.toLowerCase();
  const queryTokens = tokenizeText(query);

  return triggerPhrases.reduce((bestSoFar, phrase) => {
    const phraseScore = loweredQuery.includes(phrase.toLowerCase())
      ? 1
      : jaccardSimilarity(queryTokens, tokenizeText(phrase));
    return Math.max(bestSoFar, phraseScore);
  }, 0);
}
483
+
484
/**
 * Scores a query against a skill surface: the larger of its Jaccard similarity
 * to the description tokens and to the when-to-use tokens.
 */
function scoreQueryAgainstSkillSurface(query: string, surface: ReplaySkillSurface): number {
  const queryTokens = tokenizeText(query);
  const descriptionScore = jaccardSimilarity(queryTokens, surface.descriptionTokens);
  const whenToUseScore = jaccardSimilarity(queryTokens, surface.whenToUseTokens);
  return Math.max(descriptionScore, whenToUseScore);
}
491
+
492
/**
 * Simulates whether `query` would route to the target skill.
 *
 * Order of checks:
 * 1. an explicit mention of the target skill name triggers;
 * 2. an explicit mention of a competing skill name does not;
 * 3. otherwise the target score (best of routing-phrase score and skill
 *    surface score) must reach `HOST_REPLAY_MATCH_THRESHOLD` and be strictly
 *    greater than the best competing surface score.
 *
 * @returns Whether the target triggered, plus a human-readable reason.
 */
function evaluateReplayTrigger(
  query: string,
  routing: string,
  targetSurface: ReplaySkillSurface,
  competingSurfaces: ReplaySkillSurface[],
): { triggered: boolean; evidence: string } {
  const trimmedQuery = query.trim();

  if (containsWholeSkillMention(trimmedQuery, targetSurface.skillName)) {
    return {
      triggered: true,
      evidence: `explicit target mention: ${targetSurface.skillName}`,
    };
  }

  const mentionedCompetitor = competingSurfaces.find((surface) =>
    containsWholeSkillMention(trimmedQuery, surface.skillName),
  );
  if (mentionedCompetitor) {
    return {
      triggered: false,
      evidence: `explicit competing skill mention: ${mentionedCompetitor.skillName}`,
    };
  }

  const triggerScore = scoreQueryAgainstTriggerPhrases(
    trimmedQuery,
    extractRoutingTriggerPhrases(routing),
  );
  const targetSurfaceScore = scoreQueryAgainstSkillSurface(trimmedQuery, targetSurface);
  const targetScore = Math.max(triggerScore, targetSurfaceScore);

  // Highest-scoring competitor first; undefined when there are no competitors.
  const rankedCompetitors = competingSurfaces
    .map((surface) => ({
      skillName: surface.skillName,
      score: scoreQueryAgainstSkillSurface(trimmedQuery, surface),
    }))
    .sort((a, b) => b.score - a.score);
  const topCompetitor = rankedCompetitors[0];

  if (targetScore < HOST_REPLAY_MATCH_THRESHOLD) {
    return {
      triggered: false,
      evidence: "target routing and skill surface did not clear replay threshold",
    };
  }

  // Ties go to the competitor.
  if (topCompetitor && topCompetitor.score >= targetScore) {
    return {
      triggered: false,
      evidence: `competing skill surface scored higher: ${topCompetitor.skillName}`,
    };
  }

  let evidence: string;
  if (triggerScore < targetSurfaceScore) {
    evidence = "query aligned with target skill surface in replay fixture";
  } else if (triggerScore === 1) {
    evidence = "query matched a routing trigger phrase exactly";
  } else {
    evidence = "query aligned with routing trigger language";
  }
  return { triggered: true, evidence };
}
555
+
556
/**
 * Runs the simulated (host) replay for every eval entry, without spawning a runtime.
 *
 * @param options.routing Proposed routing table text.
 * @param options.evalSet Entries to evaluate.
 * @param options.fixture Fixture naming the target and competing skill files.
 * @returns One result per entry, in input order. An entry passes when the
 *   simulated trigger decision equals its `should_trigger`.
 */
export function runHostReplayFixture(options: {
  routing: string;
  evalSet: EvalEntry[];
  fixture: RoutingReplayFixture;
}): RoutingReplayEntryResult[] {
  const { routing, evalSet, fixture } = options;
  const targetSurface = loadReplaySkillSurface(fixture.target_skill_path);
  const competingSurfaces = fixture.competing_skill_paths.map((skillPath) =>
    loadReplaySkillSurface(skillPath),
  );

  const results: RoutingReplayEntryResult[] = [];
  for (const entry of evalSet) {
    const { triggered, evidence } = evaluateReplayTrigger(
      entry.query,
      routing,
      targetSurface,
      competingSurfaces,
    );
    results.push({
      query: entry.query,
      should_trigger: entry.should_trigger,
      triggered,
      passed: triggered === entry.should_trigger,
      evidence,
    });
  }
  return results;
}
580
+
581
/**
 * Runs the eval set through a real runtime, one query at a time, inside a
 * freshly staged temporary workspace.
 *
 * Falls back to the simulated host replay, with the reason prefixed to each
 * result's evidence, when the fixture's platform is not `"claude_code"` or
 * when staging, invoking, or evaluating throws. The temporary workspace is
 * removed in all cases.
 *
 * @param options.runtimeInvoker Replacement for the default `claude` CLI invoker.
 * @returns One result per eval entry, in input order.
 */
export async function runClaudeRuntimeReplayFixture(options: {
  routing: string;
  evalSet: EvalEntry[];
  fixture: RoutingReplayFixture;
  runtimeInvoker?: ClaudeRuntimeReplayInvoker;
}): Promise<RoutingReplayEntryResult[]> {
  const { fixture, routing, evalSet } = options;

  const simulateWithReason = (reason: string): RoutingReplayEntryResult[] =>
    prefixReplayEvidence(
      runHostReplayFixture(options),
      `runtime replay unavailable; fell back to fixture simulation (${reason})`,
    );

  if (fixture.platform !== "claude_code") {
    return simulateWithReason(`unsupported platform ${fixture.platform}`);
  }

  const invokeRuntime = options.runtimeInvoker ?? invokeClaudeRuntimeReplay;
  let workspace: ReplayWorkspace | undefined;

  try {
    workspace = buildRuntimeReplayWorkspace(fixture, routing);
    const results: RoutingReplayEntryResult[] = [];

    // Queries run sequentially against the same staged workspace.
    for (const entry of evalSet) {
      const observation = await invokeRuntime({
        query: entry.query,
        workspaceRoot: workspace.rootDir,
        targetSkillName: fixture.target_skill_name,
        targetSkillPath: workspace.targetSkillPath,
        competingSkillPaths: workspace.competingSkillPaths,
      });
      results.push(evaluateRuntimeReplayObservation(entry, fixture, observation, workspace));
    }

    return results;
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    return simulateWithReason(reason);
  } finally {
    if (workspace) cleanupRuntimeReplayWorkspace(workspace);
  }
}
@@ -40,6 +40,8 @@ export interface ValidationResult {
40
40
  net_change: number; // after - before pass rate
41
41
  by_invocation_type?: InvocationTypeScores;
42
42
  per_entry_results?: Array<{ entry: EvalEntry; before_pass: boolean; after_pass: boolean }>;
43
+ validation_mode?: "llm_judge";
44
+ validation_agent?: string;
43
45
  }
44
46
 
45
47
  // ---------------------------------------------------------------------------
@@ -63,6 +65,8 @@ export async function validateProposalSequential(
63
65
  regressions: [],
64
66
  new_passes: [],
65
67
  net_change: 0,
68
+ validation_mode: "llm_judge",
69
+ validation_agent: agent,
66
70
  };
67
71
  }
68
72
 
@@ -174,6 +178,8 @@ export async function validateProposalSequential(
174
178
  net_change: netChange,
175
179
  by_invocation_type: invocationScores,
176
180
  per_entry_results: perEntryResults,
181
+ validation_mode: "llm_judge",
182
+ validation_agent: agent,
177
183
  };
178
184
  }
179
185
 
@@ -220,6 +226,8 @@ export async function validateProposalBatched(
220
226
  regressions: [],
221
227
  new_passes: [],
222
228
  net_change: 0,
229
+ validation_mode: "llm_judge",
230
+ validation_agent: agent,
223
231
  };
224
232
  }
225
233
 
@@ -342,6 +350,8 @@ export async function validateProposalBatched(
342
350
  net_change: netChange,
343
351
  by_invocation_type: invocationScores,
344
352
  per_entry_results: perEntryResults,
353
+ validation_mode: "llm_judge",
354
+ validation_agent: agent,
345
355
  };
346
356
  }
347
357