selftune 0.2.20 → 0.2.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -23,6 +23,7 @@ import type {
23
23
  FailurePattern,
24
24
  GradingResult,
25
25
  QueryLogRecord,
26
+ RoutingReplayFixture,
26
27
  SkillUsageRecord,
27
28
  } from "../types.js";
28
29
  import { CLIError, handleCLIError } from "../utils/cli-error.js";
@@ -37,7 +38,10 @@ import { type ExecutionContext, generateBodyProposal } from "./propose-body.js";
37
38
  import { generateRoutingProposal } from "./propose-routing.js";
38
39
  import { refineBodyProposal } from "./refine-body.js";
39
40
  import { validateBodyProposal } from "./validate-body.js";
40
- import { buildRoutingReplayFixture } from "./validate-host-replay.js";
41
+ import {
42
+ buildRoutingReplayFixture,
43
+ runClaudeRuntimeReplayFixture,
44
+ } from "./validate-host-replay.js";
41
45
  import { validateRoutingProposal } from "./validate-routing.js";
42
46
 
43
47
  // ---------------------------------------------------------------------------
@@ -465,12 +469,32 @@ export async function evolveBody(
465
469
  skillPath,
466
470
  platform: studentAgent === "codex" ? "codex" : "claude_code",
467
471
  });
472
+ const replayRunner =
473
+ replayFixture.platform === "claude_code" && studentAgent === "claude"
474
+ ? async ({
475
+ routing,
476
+ evalSet,
477
+ fixture,
478
+ }: {
479
+ routing: string;
480
+ evalSet: EvalEntry[];
481
+ fixture: RoutingReplayFixture;
482
+ }) =>
483
+ await runClaudeRuntimeReplayFixture({
484
+ routing,
485
+ evalSet,
486
+ fixture,
487
+ })
488
+ : undefined;
468
489
  validation = await _validateRoutingProposal(
469
490
  proposal,
470
491
  evalSet,
471
492
  studentAgent,
472
493
  validationModelFlag,
473
- { replayFixture },
494
+ {
495
+ replayFixture,
496
+ ...(replayRunner ? { replayRunner } : {}),
497
+ },
474
498
  );
475
499
  } else {
476
500
  validation = await _validateBodyProposal(
@@ -1,5 +1,16 @@
1
- import { existsSync, readFileSync, readdirSync, realpathSync, statSync } from "node:fs";
2
- import { basename, dirname, join } from "node:path";
1
+ import {
2
+ existsSync,
3
+ mkdirSync,
4
+ mkdtempSync,
5
+ readFileSync,
6
+ readdirSync,
7
+ realpathSync,
8
+ rmSync,
9
+ statSync,
10
+ writeFileSync,
11
+ } from "node:fs";
12
+ import { tmpdir } from "node:os";
13
+ import { basename, dirname, isAbsolute, join } from "node:path";
3
14
 
4
15
  import type { EvalEntry, RoutingReplayEntryResult, RoutingReplayFixture } from "../types.js";
5
16
  import { parseFrontmatter } from "../utils/frontmatter.js";
@@ -10,6 +21,7 @@ import {
10
21
  jaccardSimilarity,
11
22
  tokenizeText,
12
23
  } from "../utils/text-similarity.js";
24
+ import { replaceSection } from "./deploy-proposal.js";
13
25
 
14
26
  interface ReplaySkillSurface {
15
27
  skillName: string;
@@ -17,12 +29,41 @@ interface ReplaySkillSurface {
17
29
  whenToUseTokens: Set<string>;
18
30
  }
19
31
 
32
/** Locations of a temporary replay workspace staged on disk. */
interface ReplayWorkspace {
  /** Root directory of the temporary workspace. */
  rootDir: string;
  /** Path of the staged copy of the target skill file. */
  targetSkillPath: string;
  /** Paths of the staged copies of the competing skill files. */
  competingSkillPaths: string[];
}

/** Everything a runtime invoker receives for a single replayed query. */
export interface ClaudeRuntimeReplayInvokerInput {
  /** The eval query to send to the runtime. */
  query: string;
  /** Directory the runtime is expected to run in (the staged workspace root). */
  workspaceRoot: string;
  /** Name of the skill under evaluation. */
  targetSkillName: string;
  /** Staged path of the skill under evaluation. */
  targetSkillPath: string;
  /** Staged paths of the skills competing with the target. */
  competingSkillPaths: string[];
}

/** Routing signals extracted from a single runtime run. */
export interface ClaudeRuntimeReplayObservation {
  /** Skill names the runtime invoked. */
  invokedSkillNames: string[];
  /** File paths the runtime read. */
  readSkillPaths: string[];
  /** Unparsed runtime output the observation was derived from. */
  rawOutput: string;
  /** Session identifier reported by the runtime, when one was seen. */
  sessionId?: string;
  /** Error text reported by the runtime, when one was seen. */
  runtimeError?: string;
}

/** Pluggable function that runs one query and reports what the runtime did. */
export type ClaudeRuntimeReplayInvoker = (
  input: ClaudeRuntimeReplayInvokerInput,
) => Promise<ClaudeRuntimeReplayObservation>;
57
+
20
58
  /**
21
59
  * Minimum score needed before replay treats routing text or skill-surface overlap
22
60
  * as a real match. Tuned to suppress weak false positives without killing recall
23
61
  * for short routing phrases and sparse skill surfaces.
24
62
  */
25
63
  const HOST_REPLAY_MATCH_THRESHOLD = 0.18;
64
/** Milliseconds a runtime replay process may run before it is killed. */
const CLAUDE_RUNTIME_REPLAY_TIMEOUT_MS = 30 * 1000;

/**
 * Text appended to the runtime's system prompt so the session only makes a
 * routing decision instead of working on the user's task.
 */
const CLAUDE_RUNTIME_ROUTING_PROMPT = [
  "You are being evaluated only on skill routing.",
  "Do not solve the user's task.",
  "If a local project skill is relevant, invoke exactly one skill immediately.",
  "If no local project skill fits, respond with NO_SKILL and do not browse unrelated files.",
].join(" ");
26
67
 
27
68
  function resolveReplayPath(path: string): string {
28
69
  try {
@@ -32,6 +73,10 @@ function resolveReplayPath(path: string): string {
32
73
  }
33
74
  }
34
75
 
76
/**
 * Normalizes a path observed during a runtime replay.
 *
 * Absolute paths are handed to `resolveReplayPath` unchanged; relative paths
 * are first anchored at `workspaceRoot`.
 *
 * @param path - Path as reported by the runtime.
 * @param workspaceRoot - Root of the staged replay workspace.
 * @returns The result of `resolveReplayPath` for the anchored path.
 */
function resolveObservedReplayPath(path: string, workspaceRoot: string): string {
  if (isAbsolute(path)) {
    return resolveReplayPath(path);
  }
  const anchoredPath = join(workspaceRoot, path);
  return resolveReplayPath(anchoredPath);
}
79
+
35
80
  function listCompetingSkillPaths(targetSkillPath: string): string[] {
36
81
  const normalizedTargetPath = resolveReplayPath(targetSkillPath);
37
82
  const targetSkillDir = dirname(normalizedTargetPath);
@@ -82,6 +127,304 @@ export function buildRoutingReplayFixture(options: {
82
127
  };
83
128
  }
84
129
 
130
/**
 * Produces the text of the target skill file with the candidate routing
 * swapped in.
 *
 * Reads `skillPath` as UTF-8 and passes the content, the section name
 * "Workflow Routing", and the trimmed `routing` text to `replaceSection`.
 *
 * @param skillPath - Path of the installed target skill file.
 * @param routing - Candidate routing text; surrounding whitespace is dropped.
 * @returns Whatever `replaceSection` returns for those inputs.
 */
function buildRuntimeReplayTargetContent(skillPath: string, routing: string): string {
  const installedSkillSource = readFileSync(skillPath, "utf8");
  const candidateRouting = routing.trim();
  return replaceSection(installedSkillSource, "Workflow Routing", candidateRouting);
}
134
+
135
/**
 * Copies one skill file into `<registryDir>/<skill directory name>/SKILL.md`.
 *
 * The skill directory name is the name of the directory that contains
 * `sourceSkillPath`. When that name is empty, "." or ".." (for example for
 * "/SKILL.md", "SKILL.md" or "../SKILL.md") the skill is staged under
 * "unknown-skill" instead, so the file always lands in its own directory
 * inside the registry and never in the registry root or outside of it.
 *
 * @param registryDir - Registry directory to stage into; created on demand.
 * @param sourceSkillPath - Path of the skill file to copy.
 * @param overrideContent - When provided, written instead of the source file's
 *   content; the source file is then not read at all.
 * @returns Path of the staged `SKILL.md`.
 * @throws Whatever the underlying `node:fs` calls throw (unreadable source,
 *   unwritable destination).
 */
function stageReplaySkill(
  registryDir: string,
  sourceSkillPath: string,
  overrideContent?: string,
): string {
  const parentDirName = basename(dirname(sourceSkillPath));
  // "." and ".." are what basename(dirname(...)) yields when the source path
  // has no usable parent directory name; joining them onto the registry would
  // resolve to the registry root or its parent.
  const hasUsableDirName =
    parentDirName !== "" && parentDirName !== "." && parentDirName !== "..";
  const skillDirName = hasUsableDirName ? parentDirName : "unknown-skill";

  const destinationDir = join(registryDir, skillDirName);
  mkdirSync(destinationDir, { recursive: true });

  const destinationPath = join(destinationDir, "SKILL.md");
  // `??` (not `||`) so an intentionally empty override is still honored.
  const content = overrideContent ?? readFileSync(sourceSkillPath, "utf8");
  writeFileSync(destinationPath, content, "utf8");
  return destinationPath;
}
148
+
149
/**
 * Stages a throwaway workspace for a runtime replay.
 *
 * Creates a fresh temporary directory containing an empty `.git` directory
 * and a `.claude/skills` registry, then stages the target skill (with the
 * candidate routing applied) and every competing skill into that registry.
 *
 * @param fixture - Replay fixture naming the target and competing skill files.
 * @param routing - Candidate routing text to apply to the target skill.
 * @returns Root directory and staged skill paths of the new workspace.
 * @throws Rethrows any staging error after removing the temporary directory.
 */
function buildRuntimeReplayWorkspace(
  fixture: RoutingReplayFixture,
  routing: string,
): ReplayWorkspace {
  const rootDir = mkdtempSync(join(tmpdir(), "selftune-runtime-replay-"));
  try {
    const registryDir = join(rootDir, ".claude", "skills");
    // NOTE(review): the empty .git directory presumably makes the runtime treat
    // the temp dir as a project root — confirm against the runtime's behavior.
    mkdirSync(join(rootDir, ".git"), { recursive: true });
    mkdirSync(registryDir, { recursive: true });

    const candidateTargetContent = buildRuntimeReplayTargetContent(
      fixture.target_skill_path,
      routing,
    );
    const targetSkillPath = stageReplaySkill(
      registryDir,
      fixture.target_skill_path,
      candidateTargetContent,
    );

    const competingSkillPaths: string[] = [];
    for (const installedSkillPath of fixture.competing_skill_paths) {
      competingSkillPaths.push(stageReplaySkill(registryDir, installedSkillPath));
    }

    return { rootDir, targetSkillPath, competingSkillPaths };
  } catch (error) {
    // Do not leave a half-built workspace behind when staging fails.
    rmSync(rootDir, { recursive: true, force: true });
    throw error;
  }
}
178
+
179
/**
 * Deletes a staged replay workspace.
 *
 * Removes the workspace root recursively with `force: true`.
 *
 * @param workspace - Workspace whose `rootDir` should be removed.
 */
function cleanupRuntimeReplayWorkspace(workspace: ReplayWorkspace): void {
  const { rootDir } = workspace;
  rmSync(rootDir, { recursive: true, force: true });
}
182
+
183
/**
 * Extracts routing signals from line-delimited JSON runtime output.
 *
 * Each non-empty line is parsed as JSON on its own. Lines that are not valid
 * JSON, or whose JSON value is not a plain object (`null`, numbers, strings,
 * arrays), are skipped. From the remaining objects the parser collects:
 *  - `session_id` (last non-empty string wins),
 *  - `error` (last non-empty string wins),
 *  - for `type: "assistant"` messages, every `tool_use` content block:
 *    `Skill` blocks contribute `input.skill`, `Read` blocks contribute
 *    `input.file_path` (both trimmed, empty values ignored, duplicates dropped).
 *
 * `Read` paths are returned as reported (trimmed only). A relative path is
 * only meaningful relative to the replay workspace, which this parser does not
 * know, so resolving it here could anchor it at the wrong directory; callers
 * that know the workspace root normalize the paths (see
 * `resolveObservedReplayPath`).
 *
 * @param rawOutput - Complete stdout text of one runtime run.
 * @returns The collected signals plus the untouched `rawOutput`; `sessionId`
 *   and `runtimeError` are present only when a value was seen.
 */
function parseClaudeRuntimeReplayOutput(rawOutput: string): ClaudeRuntimeReplayObservation {
  const isJsonObject = (value: unknown): value is Record<string, unknown> =>
    typeof value === "object" && value !== null && !Array.isArray(value);

  const invokedSkillNames = new Set<string>();
  const readSkillPaths = new Set<string>();
  let sessionId: string | undefined;
  let runtimeError: string | undefined;

  for (const line of rawOutput.split("\n")) {
    const trimmed = line.trim();
    if (!trimmed) continue;

    let parsed: unknown;
    try {
      parsed = JSON.parse(trimmed);
    } catch {
      // Non-JSON noise in the stream is expected; ignore it.
      continue;
    }
    // JSON.parse may legally return null or a primitive; property access on
    // null would throw, so only plain objects are inspected.
    if (!isJsonObject(parsed)) continue;

    const maybeSessionId = parsed.session_id;
    if (typeof maybeSessionId === "string" && maybeSessionId) {
      sessionId = maybeSessionId;
    }

    const maybeError = parsed.error;
    if (typeof maybeError === "string" && maybeError) {
      runtimeError = maybeError;
    }

    const message = parsed.message;
    if (parsed.type !== "assistant" || !isJsonObject(message)) continue;

    const content = message.content;
    if (!Array.isArray(content)) continue;

    for (const block of content) {
      if (!isJsonObject(block) || block.type !== "tool_use") continue;

      const toolName = block.name;
      const toolInput = isJsonObject(block.input) ? block.input : {};

      if (toolName === "Skill") {
        const skillName = toolInput.skill;
        if (typeof skillName === "string" && skillName.trim()) {
          invokedSkillNames.add(skillName.trim());
        }
      }

      if (toolName === "Read") {
        const filePath = toolInput.file_path;
        if (typeof filePath === "string" && filePath.trim()) {
          readSkillPaths.add(filePath.trim());
        }
      }
    }
  }

  return {
    invokedSkillNames: [...invokedSkillNames],
    readSkillPaths: [...readSkillPaths],
    rawOutput,
    ...(sessionId ? { sessionId } : {}),
    ...(runtimeError ? { runtimeError } : {}),
  };
}
251
+
252
/**
 * Runs one query through the `claude` CLI inside the staged workspace and
 * reports the routing signals found in its output.
 *
 * The CLI is started in print mode with `stream-json` output, only the
 * `Skill` and `Read` tools, a single turn, project/local settings, and the
 * routing-only system prompt appended. The process is killed once
 * `CLAUDE_RUNTIME_REPLAY_TIMEOUT_MS` elapses.
 *
 * @param input - Query and workspace to run against; only `query` and
 *   `workspaceRoot` are used here.
 * @returns The parsed observation. `runtimeError` combines the error reported
 *   in the stream, a timeout notice (when the process was killed for running
 *   too long), and trimmed stderr, joined with " | ".
 * @throws Error when the process exits non-zero and the output contains no
 *   skill invocation and no file read.
 */
async function invokeClaudeRuntimeReplay(
  input: ClaudeRuntimeReplayInvokerInput,
): Promise<ClaudeRuntimeReplayObservation> {
  const command = [
    "claude",
    "-p",
    "--verbose",
    "--output-format",
    "stream-json",
    "--dangerously-skip-permissions",
    "--no-session-persistence",
    "--setting-sources",
    "project,local",
    "--tools",
    "Skill,Read",
    "--max-turns",
    "1",
    "--append-system-prompt",
    CLAUDE_RUNTIME_ROUTING_PROMPT,
    input.query,
  ];

  const proc = Bun.spawn(command, {
    cwd: input.workspaceRoot,
    stdout: "pipe",
    stderr: "pipe",
    // NOTE(review): CLAUDECODE is blanked presumably so the child does not
    // think it is nested inside another session — confirm.
    env: { ...process.env, CLAUDECODE: "" },
  });

  let timedOut = false;
  const timeout = setTimeout(() => {
    timedOut = true;
    proc.kill();
  }, CLAUDE_RUNTIME_REPLAY_TIMEOUT_MS);

  try {
    const [stdoutText, stderrText, exitCode] = await Promise.all([
      new Response(proc.stdout).text(),
      new Response(proc.stderr).text(),
      proc.exited,
    ]);

    const observation = parseClaudeRuntimeReplayOutput(stdoutText);
    // Name the timeout explicitly; otherwise a killed process only shows up as
    // an unexplained non-zero exit code.
    const timeoutNotice = timedOut
      ? `claude runtime replay timed out after ${CLAUDE_RUNTIME_REPLAY_TIMEOUT_MS}ms`
      : undefined;
    const combinedError = [observation.runtimeError, timeoutNotice, stderrText.trim()]
      .filter(Boolean)
      .join(" | ");
    const hasRoutingSignal =
      observation.invokedSkillNames.length > 0 || observation.readSkillPaths.length > 0;

    // A failed run that still produced a routing signal is kept: the signal is
    // what the replay is after.
    if (exitCode !== 0 && !hasRoutingSignal) {
      throw new Error(combinedError || `claude runtime replay exited with code ${exitCode}`);
    }

    return {
      ...observation,
      ...(combinedError ? { runtimeError: combinedError } : {}),
    };
  } finally {
    // Always disarm the kill timer, including when reading the streams fails.
    clearTimeout(timeout);
  }
}
303
+
304
/**
 * Returns copies of the replay results with `prefix` placed in front of each
 * result's evidence.
 *
 * A result with non-empty evidence gets `"<prefix>; <evidence>"`; a result
 * with missing or empty evidence gets just `prefix`. The input array and its
 * entries are not modified.
 *
 * @param results - Replay results to annotate.
 * @param prefix - Text to put in front of every evidence string.
 * @returns A new array of new result objects, in the same order.
 */
function prefixReplayEvidence(
  results: RoutingReplayEntryResult[],
  prefix: string,
): RoutingReplayEntryResult[] {
  const annotated: RoutingReplayEntryResult[] = [];
  for (const result of results) {
    let evidence = prefix;
    if (result.evidence) {
      evidence = `${prefix}; ${result.evidence}`;
    }
    annotated.push({ ...result, evidence });
  }
  return annotated;
}
313
+
314
/**
 * Turns one runtime observation into the replay verdict for one eval entry.
 *
 * Observed read paths are normalized against the workspace root and compared
 * with the staged target/competing skill paths. Competing skill names are the
 * names of the directories containing `fixture.competing_skill_paths`.
 *
 * The checks run in this order and the first match decides:
 *  1. more than one skill invoked: failed, not triggered;
 *  2. target skill invoked: triggered, passes when the entry should trigger;
 *  3. competing skill invoked: not triggered, passes when it should not trigger;
 *  4. any other skill invoked: failed, not triggered;
 *  5. a file outside the staged skill set was read: failed, not triggered;
 *  6. only the target skill file was read: not triggered, passes when it
 *     should not trigger;
 *  7. only a competing skill file was read: same as 6;
 *  8. no signal but a runtime error: throws;
 *  9. no signal at all: not triggered, passes when it should not trigger.
 *
 * @param entry - Eval entry the observation belongs to.
 * @param fixture - Replay fixture describing the installed skills.
 * @param observation - What the runtime did for the entry's query.
 * @param workspace - Staged workspace the runtime ran in.
 * @returns The verdict with a human-readable evidence string.
 * @throws Error when no routing signal was observed and the observation
 *   carries a runtime error.
 */
function evaluateRuntimeReplayObservation(
  entry: EvalEntry,
  fixture: RoutingReplayFixture,
  observation: ClaudeRuntimeReplayObservation,
  workspace: ReplayWorkspace,
): RoutingReplayEntryResult {
  const { invokedSkillNames } = observation;
  const stagedTargetPath = resolveReplayPath(workspace.targetSkillPath);

  const normalizedReadPaths = new Set<string>();
  for (const observedPath of observation.readSkillPaths) {
    normalizedReadPaths.add(resolveObservedReplayPath(observedPath, workspace.rootDir));
  }

  const allowedReadPaths = new Set<string>([stagedTargetPath]);
  for (const stagedCompetingPath of workspace.competingSkillPaths) {
    allowedReadPaths.add(resolveReplayPath(stagedCompetingPath));
  }

  const targetSkillName = fixture.target_skill_name.trim();
  const competingSkillNames = fixture.competing_skill_paths.map((skillPath) =>
    basename(dirname(skillPath)).trim(),
  );

  const targetInvoked = invokedSkillNames.includes(targetSkillName);
  const competingInvoked = invokedSkillNames.find((skillName) =>
    competingSkillNames.includes(skillName.trim()),
  );
  const unrelatedInvoked = invokedSkillNames.find((skillName) => {
    const trimmedName = skillName.trim();
    return trimmedName !== targetSkillName && trimmedName !== competingInvoked;
  });

  const unrelatedReadPaths = [...normalizedReadPaths].filter(
    (readPath) => !allowedReadPaths.has(readPath),
  );
  const targetRead = normalizedReadPaths.has(stagedTargetPath);
  const competingRead = workspace.competingSkillPaths.find((stagedCompetingPath) =>
    normalizedReadPaths.has(resolveReplayPath(stagedCompetingPath)),
  );

  const sessionPrefix = observation.sessionId
    ? `runtime replay session ${observation.sessionId}`
    : "runtime replay";

  // Builds the result for this entry; only the outcome and the wording vary.
  const verdict = (
    triggered: boolean,
    passed: boolean,
    detail: string,
  ): RoutingReplayEntryResult => ({
    query: entry.query,
    should_trigger: entry.should_trigger,
    triggered,
    passed,
    evidence: `${sessionPrefix} ${detail}`,
  });
  const passesAsNegative = !entry.should_trigger;

  if (invokedSkillNames.length > 1) {
    return verdict(false, false, `invoked multiple skills: ${invokedSkillNames.join(", ")}`);
  }
  if (targetInvoked) {
    return verdict(true, entry.should_trigger, `invoked target skill: ${targetSkillName}`);
  }
  if (competingInvoked) {
    return verdict(false, passesAsNegative, `invoked competing skill: ${competingInvoked}`);
  }
  if (unrelatedInvoked) {
    return verdict(false, false, `invoked unrelated skill: ${unrelatedInvoked}`);
  }
  if (unrelatedReadPaths.length > 0) {
    return verdict(
      false,
      false,
      `read files outside staged skill set: ${unrelatedReadPaths.join(", ")}`,
    );
  }
  if (targetRead) {
    return verdict(false, passesAsNegative, "only read the target skill without invoking it");
  }
  if (competingRead) {
    return verdict(false, passesAsNegative, "only read a competing skill without invoking it");
  }

  // Without any routing signal a runtime error means the run is inconclusive,
  // not a negative result.
  if (observation.runtimeError) {
    throw new Error(`${sessionPrefix} did not reach a skill decision: ${observation.runtimeError}`);
  }

  return verdict(false, passesAsNegative, "did not invoke any local project skill");
}
427
+
85
428
  function loadReplaySkillSurface(skillPath: string): ReplaySkillSurface {
86
429
  const fallbackName = basename(dirname(skillPath)) || "unknown-skill";
87
430
  try {
@@ -234,3 +577,48 @@ export function runHostReplayFixture(options: {
234
577
  };
235
578
  });
236
579
  }
580
+
581
/**
 * Replays an eval set against a real runtime inside a staged workspace.
 *
 * For fixtures whose platform is not "claude_code" the fixture simulation
 * (`runHostReplayFixture`) is used directly. Otherwise a temporary workspace is
 * staged, every eval entry is sent to the runtime invoker one after another,
 * and each observation is evaluated into a verdict. If staging, invoking, or
 * evaluating throws for any entry, all runtime results are discarded and the
 * fixture simulation is returned for the whole eval set instead. Every
 * fallback result has its evidence prefixed with the reason. The workspace is
 * removed in all cases.
 *
 * @param options.routing - Candidate routing text under evaluation.
 * @param options.evalSet - Entries to replay.
 * @param options.fixture - Replay fixture describing the installed skills.
 * @param options.runtimeInvoker - Optional replacement for the default CLI
 *   invoker.
 * @returns One result per eval entry, from the runtime or from the fallback.
 */
export async function runClaudeRuntimeReplayFixture(options: {
  routing: string;
  evalSet: EvalEntry[];
  fixture: RoutingReplayFixture;
  runtimeInvoker?: ClaudeRuntimeReplayInvoker;
}): Promise<RoutingReplayEntryResult[]> {
  const { routing, evalSet, fixture } = options;

  // Runs the fixture simulation and marks every result as a fallback.
  const simulateWithReason = (reason: string): RoutingReplayEntryResult[] =>
    prefixReplayEvidence(
      runHostReplayFixture(options),
      `runtime replay unavailable; fell back to fixture simulation (${reason})`,
    );

  if (fixture.platform !== "claude_code") {
    return simulateWithReason(`unsupported platform ${fixture.platform}`);
  }

  const invokeRuntime = options.runtimeInvoker ?? invokeClaudeRuntimeReplay;
  let workspace: ReplayWorkspace | undefined;

  try {
    const stagedWorkspace = buildRuntimeReplayWorkspace(fixture, routing);
    workspace = stagedWorkspace;

    const results: RoutingReplayEntryResult[] = [];
    for (const entry of evalSet) {
      const observation = await invokeRuntime({
        query: entry.query,
        workspaceRoot: stagedWorkspace.rootDir,
        targetSkillName: fixture.target_skill_name,
        targetSkillPath: stagedWorkspace.targetSkillPath,
        competingSkillPaths: stagedWorkspace.competingSkillPaths,
      });
      const verdict = evaluateRuntimeReplayObservation(
        entry,
        fixture,
        observation,
        stagedWorkspace,
      );
      results.push(verdict);
    }
    return results;
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    return simulateWithReason(message);
  } finally {
    if (workspace) cleanupRuntimeReplayWorkspace(workspace);
  }
}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "selftune",
3
- "version": "0.2.20",
3
+ "version": "0.2.21",
4
4
  "description": "Self-improving skills CLI for AI agents",
5
5
  "keywords": [
6
6
  "agent",
@@ -89,15 +89,31 @@ skills in the same registry, so replay-backed validation is preferred whenever
89
89
  that local fixture can be constructed because it captures host-style routing
90
90
  behavior instead of model judgment.
91
91
 
92
- The current replay path is fixture-backed: it evaluates the target routing table
93
- against the installed target/competing skill surfaces in a controlled replay
94
- fixture and records per-entry evidence. That is still a stronger signal than a
95
- free-form judge prompt, but you should describe it as replay-backed validation,
96
- not as live operator telemetry.
92
+ For Claude Code, the replay path now stages a temporary project-local
93
+ `.claude/skills` registry, swaps in the candidate routing table, and runs a
94
+ one-turn Claude print-mode session with project/local settings only. Validation
95
+ records whether Claude actually invoked the target skill, invoked a competing
96
+ skill, invoked an unrelated skill, or made no routing decision at all.
97
+ Unrelated skill use is treated as a replay failure even on negative evals,
98
+ because it still indicates the runtime routed somewhere unexpected. If that
99
+ runtime path is unavailable or fails to reach a runtime decision for any eval entry,
100
+ selftune falls back to the existing fixture-backed surface simulation for the whole eval
101
+ set and notes the fallback in each entry's replay evidence instead of pretending it was a runtime result.
102
+
103
+ For non-Claude platforms today, replay remains fixture-backed: it evaluates the
104
+ target routing table against the installed target/competing skill surfaces in a
105
+ controlled replay fixture and records per-entry evidence. That is still a
106
+ stronger signal than a free-form judge prompt, but you should describe it as
107
+ replay-backed validation, not as live operator telemetry.
97
108
 
98
109
  Replay parsing is intentionally conservative: unreadable skill files degrade to
99
110
  empty surfaces instead of throwing, and malformed routing rows with empty
100
- trigger cells are ignored rather than treated as valid triggers.
111
+ trigger cells are ignored rather than treated as valid triggers. Claude replay
112
+ also normalizes observed `Read` paths against the staged workspace, so relative
113
+ skill reads still count as read-only evidence for the target or competing
114
+ skill. Reads outside the staged skill set are treated as replay failures rather
115
+ than benign negatives, because they indicate the runtime left the controlled
116
+ evaluation surface.
101
117
 
102
118
  ## Parsing Instructions
103
119