selftune 0.2.19 → 0.2.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1130,6 +1130,10 @@ export function queryEvolutionAudit(
1130
1130
  action: string;
1131
1131
  details: string;
1132
1132
  eval_snapshot?: Record<string, unknown>;
1133
+ validation_mode?: string;
1134
+ validation_agent?: string;
1135
+ validation_fixture_id?: string;
1136
+ validation_evidence_ref?: string;
1133
1137
  }> {
1134
1138
  const sql = skillName
1135
1139
  ? `SELECT * FROM evolution_audit
@@ -1143,12 +1147,18 @@ export function queryEvolutionAudit(
1143
1147
  return rows.map((r) => ({
1144
1148
  timestamp: r.timestamp as string,
1145
1149
  proposal_id: r.proposal_id as string,
1146
- skill_name: r.skill_name as string | undefined,
1150
+ skill_name: typeof r.skill_name === "string" ? r.skill_name : undefined,
1147
1151
  action: r.action as string,
1148
1152
  details: r.details as string,
1149
1153
  eval_snapshot: r.eval_snapshot_json
1150
1154
  ? (safeParseJson(r.eval_snapshot_json as string) as Record<string, unknown>)
1151
1155
  : undefined,
1156
+ validation_mode: typeof r.validation_mode === "string" ? r.validation_mode : undefined,
1157
+ validation_agent: typeof r.validation_agent === "string" ? r.validation_agent : undefined,
1158
+ validation_fixture_id:
1159
+ typeof r.validation_fixture_id === "string" ? r.validation_fixture_id : undefined,
1160
+ validation_evidence_ref:
1161
+ typeof r.validation_evidence_ref === "string" ? r.validation_evidence_ref : undefined,
1152
1162
  }));
1153
1163
  }
1154
1164
 
@@ -121,7 +121,12 @@ CREATE TABLE IF NOT EXISTS evolution_audit (
121
121
  skill_name TEXT,
122
122
  action TEXT NOT NULL,
123
123
  details TEXT,
124
- eval_snapshot_json TEXT
124
+ eval_snapshot_json TEXT,
125
+ iterations_used INTEGER,
126
+ validation_mode TEXT,
127
+ validation_agent TEXT,
128
+ validation_fixture_id TEXT,
129
+ validation_evidence_ref TEXT
125
130
  )`;
126
131
 
127
132
  // -- Local telemetry tables (from JSONL logs) ---------------------------------
@@ -369,6 +374,10 @@ export const MIGRATIONS = [
369
374
  `ALTER TABLE skill_invocations ADD COLUMN source TEXT`,
370
375
  // Track how many iteration loops each evolution run used
371
376
  `ALTER TABLE evolution_audit ADD COLUMN iterations_used INTEGER`,
377
+ `ALTER TABLE evolution_audit ADD COLUMN validation_mode TEXT`,
378
+ `ALTER TABLE evolution_audit ADD COLUMN validation_agent TEXT`,
379
+ `ALTER TABLE evolution_audit ADD COLUMN validation_fixture_id TEXT`,
380
+ `ALTER TABLE evolution_audit ADD COLUMN validation_evidence_ref TEXT`,
372
381
  // Canonical contract fields for upload staging (sessions already has schema_version, platform, normalized_at)
373
382
  `ALTER TABLE sessions ADD COLUMN normalizer_version TEXT`,
374
383
  `ALTER TABLE sessions ADD COLUMN capture_mode TEXT`,
@@ -28,7 +28,8 @@ export function handleSkillReport(
28
28
  // 1. Evolution audit with eval_snapshot
29
29
  const evolution = db
30
30
  .query(
31
- `SELECT timestamp, proposal_id, skill_name, action, details, eval_snapshot_json
31
+ `SELECT timestamp, proposal_id, skill_name, action, details, eval_snapshot_json,
32
+ validation_mode, validation_agent, validation_fixture_id, validation_evidence_ref
32
33
  FROM evolution_audit
33
34
  WHERE skill_name = ? OR (skill_name IS NULL AND proposal_id LIKE 'evo-' || ? || '-%')
34
35
  ORDER BY timestamp DESC
@@ -41,6 +42,10 @@ export function handleSkillReport(
41
42
  action: string;
42
43
  details: string;
43
44
  eval_snapshot_json: string | null;
45
+ validation_mode: string | null;
46
+ validation_agent: string | null;
47
+ validation_fixture_id: string | null;
48
+ validation_evidence_ref: string | null;
44
49
  }>;
45
50
  const evolutionWithSnapshot = evolution.map((e) => ({
46
51
  ...e,
@@ -400,6 +400,10 @@ export interface EvolutionAuditEntry {
400
400
  details: string;
401
401
  eval_snapshot?: EvalPassRate;
402
402
  iterations_used?: number;
403
+ validation_mode?: ValidationMode;
404
+ validation_agent?: string;
405
+ validation_fixture_id?: string;
406
+ validation_evidence_ref?: string;
403
407
  }
404
408
 
405
409
  export interface EvolutionEvidenceValidation {
@@ -413,6 +417,10 @@ export interface EvolutionEvidenceValidation {
413
417
  gates_passed?: number;
414
418
  gates_total?: number;
415
419
  gate_results?: Array<{ gate: ValidationGate; passed: boolean; reason: string }>;
420
+ validation_mode?: ValidationMode;
421
+ validation_agent?: string;
422
+ validation_fixture_id?: string;
423
+ validation_evidence_ref?: string;
416
424
  }
417
425
 
418
426
  export interface EvolutionEvidenceEntry {
@@ -697,6 +705,25 @@ export interface BodyEvolutionProposal {
697
705
  /** Closed union of gate names used in the validation pipeline. */
698
706
  export type ValidationGate = "structural" | "trigger_accuracy" | "quality";
699
707
 
708
/**
 * How an evolution proposal was validated:
 * - `structural_guard`: only deterministic structural checks ran.
 * - `host_replay`: validated against a controlled replay fixture.
 * - `llm_judge`: validated by an LLM trigger check.
 */
+ export type ValidationMode = "structural_guard" | "host_replay" | "llm_judge";
709
+
710
/**
 * Describes a replay fixture for routing validation: one target skill, the
 * competing skill files it is replayed against, and the host platform.
 * NOTE(review): `workspace_root` is presumably the directory the skill paths
 * are resolved against — confirm against the code that builds the fixture.
 */
+ export interface RoutingReplayFixture {
711
+ fixture_id: string;
712
+ platform: "claude_code" | "codex";
713
+ target_skill_name: string;
714
+ target_skill_path: string;
715
+ competing_skill_paths: string[];
716
+ workspace_root?: string;
717
+ }
718
+
719
/**
 * Outcome of replaying a single query: whether the skill was expected to
 * trigger, whether it did, and the resulting pass/fail verdict, with optional
 * free-form evidence.
 * NOTE(review): `passed` presumably means `triggered === should_trigger` —
 * confirm against the replay implementation.
 */
+ export interface RoutingReplayEntryResult {
720
+ query: string;
721
+ should_trigger: boolean;
722
+ triggered: boolean;
723
+ passed: boolean;
724
+ evidence?: string;
725
+ }
726
+
700
727
  /** Result of validating a body evolution proposal. */
701
728
  export interface BodyValidationResult {
702
729
  proposal_id: string;
@@ -705,6 +732,12 @@ export interface BodyValidationResult {
705
732
  gate_results: Array<{ gate: ValidationGate; passed: boolean; reason: string }>;
706
733
  improved: boolean;
707
734
  regressions: string[];
735
+ validation_mode?: ValidationMode;
736
+ validation_agent?: string;
737
+ validation_fixture_id?: string;
738
+ before_pass_rate?: number;
739
+ after_pass_rate?: number;
740
+ per_entry_results?: RoutingReplayEntryResult[];
708
741
  }
709
742
 
710
743
  /** Configuration for which LLM model a role should use. */
@@ -873,6 +906,26 @@ export interface SkillFamilyOverlapPair {
873
906
  consolidation_pressure: "low" | "medium" | "high";
874
907
  }
875
908
 
909
/**
 * Cold-start comparison of two sibling skills: similarity of their
 * descriptions and "When to Use" text, the command surfaces and terms they
 * share, synthetic queries that could confuse the two, and an overall
 * suspicion level.
 * NOTE(review): the similarity fields are presumably 0–1 scores — confirm
 * against the code that computes them.
 */
+ export interface SkillFamilyColdStartPair {
910
+ skill_a: string;
911
+ skill_b: string;
912
+ description_similarity: number;
913
+ when_to_use_similarity: number;
914
+ shared_command_surfaces: string[];
915
+ shared_terms: string[];
916
+ synthetic_confusion_queries: string[];
917
+ suspicion_level: "low" | "medium" | "high";
918
+ }
919
+
920
/**
 * Family-level cold-start summary built from per-pair comparisons.
 * `candidate` is meant as architecture suspicion, not proof: it flags that
 * the family may want a parent skill before real usage can confirm it.
 */
+ export interface SkillFamilyColdStartSuspicion {
921
+ candidate: boolean;
922
+ analyzed_pairs: number;
923
+ suspicious_pair_count: number;
924
+ average_static_similarity: number;
925
+ pairs: SkillFamilyColdStartPair[];
926
+ rationale: string[];
927
+ }
928
+
876
929
  export interface SkillFamilyRefactorWorkflow {
877
930
  workflow_name: string;
878
931
  source_skill: string;
@@ -892,6 +945,7 @@ export interface SkillFamilyOverlapReport {
892
945
  analyzed_skills: string[];
893
946
  members: SkillFamilyOverlapMember[];
894
947
  pairs: SkillFamilyOverlapPair[];
948
+ cold_start_suspicion?: SkillFamilyColdStartSuspicion;
895
949
  total_pairs_analyzed: number;
896
950
  overlap_count: number;
897
951
  overlap_density: number;
@@ -0,0 +1,73 @@
1
/**
 * Text-similarity helpers: stopword-filtered tokenization, Jaccard overlap
 * between token sets, and extraction of the "When to Use" section from a
 * markdown body.
 */

/** Filler words that are dropped during tokenization. All entries are lowercase. */
const BASE_TEXT_SIMILARITY_STOPWORDS = new Set([
  "a",
  "an",
  "and",
  "are",
  "as",
  "at",
  "be",
  "for",
  "from",
  "how",
  "in",
  "into",
  "is",
  "it",
  "of",
  "on",
  "or",
  "that",
  "the",
  "this",
  "to",
  "use",
  "user",
  "when",
  "with",
]);

/**
 * Token separator: any run of characters that are not letters, combining
 * marks, or numbers. For pure-ASCII input this is the same as splitting on
 * `[^a-z0-9]+` after lowercasing; unlike that pattern it keeps accented and
 * non-Latin words intact instead of cutting them at every non-ASCII letter.
 */
const TOKEN_SEPARATOR = /[^\p{L}\p{M}\p{N}]+/u;

/** Matches a level-2+ markdown heading whose text is exactly "when to use". */
const WHEN_TO_USE_HEADING = /^##+\s+when to use\s*$/i;

/** Matches any level-2+ markdown heading; used to detect the end of the section. */
const SECTION_HEADING = /^##+\s+/;

/** Matches a leading `-` or `*` list marker followed by whitespace. */
const BULLET_PREFIX = /^[-*]\s+/;

/**
 * Returns a fresh stopword set containing the base stopwords plus any extras.
 *
 * @param additionalStopwords Extra words to exclude. They are added verbatim,
 *   so they should be lowercase to match the lowercased tokens produced by
 *   `tokenizeText`.
 * @returns A new `Set`; the shared base set is never mutated.
 */
export function buildStopwordSet(additionalStopwords: string[] = []): Set<string> {
  return new Set([...BASE_TEXT_SIMILARITY_STOPWORDS, ...additionalStopwords]);
}

/**
 * Splits text into a set of lowercase word tokens.
 *
 * Tokens shorter than three characters and tokens present in `stopwords` are
 * discarded. Insertion order follows first occurrence in `text`.
 *
 * @param text Free-form text to tokenize.
 * @param stopwords Words to exclude; defaults to the base stopword set.
 * @returns The distinct remaining tokens.
 */
export function tokenizeText(
  text: string,
  stopwords = BASE_TEXT_SIMILARITY_STOPWORDS,
): Set<string> {
  const tokens = new Set<string>();
  for (const token of text.toLowerCase().split(TOKEN_SEPARATOR)) {
    if (token.length >= 3 && !stopwords.has(token)) tokens.add(token);
  }
  return tokens;
}

/**
 * Computes the Jaccard similarity |A ∩ B| / |A ∪ B| of two token sets.
 *
 * @returns A value in [0, 1]; 0 when either set is empty.
 */
export function jaccardSimilarity(left: Set<string>, right: Set<string>): number {
  if (left.size === 0 || right.size === 0) return 0;
  let shared = 0;
  for (const token of left) {
    if (right.has(token)) shared += 1;
  }
  // Both sets are non-empty here, so the union size is at least 1.
  return shared / (left.size + right.size - shared);
}

/**
 * Extracts the non-empty lines of the first "When to Use" section of a
 * markdown body.
 *
 * The section starts after a level-2+ heading reading "when to use"
 * (case-insensitive) and ends at the next level-2+ heading or end of input.
 * Leading `-`/`*` list markers are stripped; other lines are returned trimmed.
 *
 * @param body Markdown text to scan.
 * @returns The section's lines in order, or an empty array when no such heading exists.
 */
export function extractWhenToUseLines(body: string): string[] {
  const lines = body.split("\n");
  const start = lines.findIndex((line) => WHEN_TO_USE_HEADING.test(line.trim()));
  if (start === -1) return [];

  const extracted: string[] = [];
  for (const rawLine of lines.slice(start + 1)) {
    const line = rawLine.trim();
    if (!line) continue;
    if (SECTION_HEADING.test(line)) break;
    // For non-bullet lines the replace is a no-op, so one path covers both cases.
    extracted.push(line.replace(BULLET_PREFIX, "").trim());
  }
  return extracted;
}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "selftune",
3
- "version": "0.2.19",
3
+ "version": "0.2.20",
4
4
  "description": "Self-improving skills CLI for AI agents",
5
5
  "keywords": [
6
6
  "agent",
@@ -12,6 +12,7 @@ import {
12
12
  ListChecksIcon,
13
13
  } from "lucide-react";
14
14
  import { useMemo, useState } from "react";
15
+ import type { ReactNode } from "react";
15
16
  import Markdown from "react-markdown";
16
17
 
17
18
  import { formatRate, timeAgo } from "../lib/format";
@@ -34,6 +35,37 @@ interface Props {
34
35
  showContextBanner?: boolean;
35
36
  }
36
37
 
38
/**
 * Maps a validation mode to the badge label, badge variant, and explanatory
 * text shown in the evidence viewer. Returns `null` for a missing or
 * unrecognized mode so callers can skip rendering.
 */
function getValidationModeMeta(mode?: string | null): {
  label: string;
  variant: "default" | "secondary" | "destructive" | "outline";
  description: string;
} | null {
  if (mode === "host_replay") {
    return {
      label: "Replay-backed validation",
      variant: "default",
      description:
        "Validated against a controlled replay fixture instead of a free-form judge prompt.",
    };
  }

  if (mode === "llm_judge") {
    return {
      label: "Model judgment",
      variant: "secondary",
      description: "Validated by an LLM trigger check rather than a replay fixture.",
    };
  }

  if (mode === "structural_guard") {
    return {
      label: "Structural guard",
      variant: "outline",
      description:
        "Only deterministic structural checks ran; no replay or judge validation was needed.",
    };
  }

  return null;
}
68
+
37
69
  function sentenceCase(value: string): string {
38
70
  return value.replace(/_/g, " ");
39
71
  }
@@ -42,7 +74,7 @@ function getOutcomePresentation(action?: string | null): {
42
74
  title: string;
43
75
  summary: string;
44
76
  tone: string;
45
- icon: React.ReactNode;
77
+ icon: ReactNode;
46
78
  liveSkillNote: string;
47
79
  } {
48
80
  switch (action) {
@@ -180,7 +212,7 @@ function SkillContentBlock({
180
212
  }
181
213
 
182
214
  /** Smart formatting for a single validation value */
183
- function formatValidationValue(key: string, val: unknown): React.ReactNode {
215
+ function formatValidationValue(key: string, val: unknown): ReactNode {
184
216
  // Booleans
185
217
  if (typeof val === "boolean") {
186
218
  return val ? (
@@ -279,12 +311,18 @@ function ValidationResults({ validation }: { validation: Record<string, unknown>
279
311
  regressions,
280
312
  new_passes,
281
313
  per_entry_results,
314
+ validation_mode,
315
+ validation_agent,
316
+ validation_fixture_id,
317
+ validation_evidence_ref,
282
318
  ...rest
283
319
  } = validation;
284
320
 
285
321
  const regressionsArr = Array.isArray(regressions) ? regressions : [];
286
322
  const newPassesArr = Array.isArray(new_passes) ? new_passes : [];
287
323
  const perEntryArr = Array.isArray(per_entry_results) ? per_entry_results : [];
324
+ const validationMeta =
325
+ typeof validation_mode === "string" ? getValidationModeMeta(validation_mode) : null;
288
326
 
289
327
  return (
290
328
  <div className="rounded-md border bg-muted/30 p-3 space-y-3">
@@ -295,6 +333,34 @@ function ValidationResults({ validation }: { validation: Record<string, unknown>
295
333
  </span>
296
334
  </p>
297
335
 
336
+ {validationMeta && (
337
+ <div className="rounded-md border bg-card px-3 py-2">
338
+ <div className="flex flex-wrap items-center gap-2">
339
+ <Badge variant={validationMeta.variant} className="text-[10px]">
340
+ {validationMeta.label}
341
+ </Badge>
342
+ {typeof validation_agent === "string" && validation_agent.trim() && (
343
+ <Badge variant="outline" className="text-[10px]">
344
+ agent: {validation_agent}
345
+ </Badge>
346
+ )}
347
+ {typeof validation_fixture_id === "string" && validation_fixture_id.trim() && (
348
+ <Badge variant="outline" className="text-[10px]">
349
+ fixture: {validation_fixture_id}
350
+ </Badge>
351
+ )}
352
+ </div>
353
+ <p className="mt-1 text-[11px] leading-relaxed text-muted-foreground">
354
+ {validationMeta.description}
355
+ </p>
356
+ {typeof validation_evidence_ref === "string" && validation_evidence_ref.trim() && (
357
+ <p className="mt-1 text-[10px] font-mono text-muted-foreground/70">
358
+ {validation_evidence_ref}
359
+ </p>
360
+ )}
361
+ </div>
362
+ )}
363
+
298
364
  {/* Summary bar */}
299
365
  <div className="flex items-center gap-3 flex-wrap">
300
366
  {improved !== undefined && (
@@ -730,6 +796,7 @@ export function EvidenceViewer({
730
796
  const latestStep = steps[steps.length - 1] ?? null;
731
797
  const lifecycleLabel = steps.map((step) => step.action.replace("_", " ")).join(" -> ");
732
798
  const outcome = getOutcomePresentation(latestStep?.action);
799
+ const validationMeta = getValidationModeMeta(latestStep?.validation_mode);
733
800
  const latestProposalConfidence = useMemo(() => {
734
801
  for (let i = proposalEntries.length - 1; i >= 0; i--) {
735
802
  if (proposalEntries[i].confidence !== null) {
@@ -824,6 +891,16 @@ export function EvidenceViewer({
824
891
  <Badge variant="outline" className="text-[10px]">
825
892
  {entries.length} evidence {entries.length === 1 ? "row" : "rows"}
826
893
  </Badge>
894
+ {validationMeta && (
895
+ <Badge variant={validationMeta.variant} className="text-[10px]">
896
+ {validationMeta.label}
897
+ </Badge>
898
+ )}
899
+ {latestStep?.validation_fixture_id && (
900
+ <Badge variant="outline" className="text-[10px]">
901
+ fixture: {latestStep.validation_fixture_id}
902
+ </Badge>
903
+ )}
827
904
  {latestProposalConfidence != null && (
828
905
  <Badge variant="secondary" className="text-[10px]">
829
906
  {Math.round(latestProposalConfidence * 100)}% confidence
@@ -831,6 +908,12 @@ export function EvidenceViewer({
831
908
  )}
832
909
  </div>
833
910
 
911
+ {validationMeta && (
912
+ <p className="text-[11px] leading-relaxed text-muted-foreground">
913
+ {validationMeta.description}
914
+ </p>
915
+ )}
916
+
834
917
  <div className="flex flex-wrap items-center gap-2 text-[11px] text-muted-foreground">
835
918
  <span className="font-headline uppercase tracking-[0.16em] text-muted-foreground/80">
836
919
  Lifecycle
@@ -10,13 +10,14 @@ import {
10
10
  ChevronRightIcon,
11
11
  } from "lucide-react";
12
12
  import { useState } from "react";
13
+ import type { ReactNode } from "react";
13
14
 
14
15
  import { timeAgo } from "../lib/format";
15
16
  import { cn } from "../lib/utils";
16
17
  import { Badge } from "../primitives/badge";
17
18
  import type { EvalSnapshot, EvolutionEntry } from "../types";
18
19
 
19
- const ACTION_ICON: Record<string, React.ReactNode> = {
20
+ const ACTION_ICON: Record<string, ReactNode> = {
20
21
  created: <CircleDotIcon className="size-3.5" />,
21
22
  validated: <ShieldCheckIcon className="size-3.5" />,
22
23
  deployed: <RocketIcon className="size-3.5" />,
@@ -70,6 +71,21 @@ interface Props {
70
71
  onSelect: (proposalId: string) => void;
71
72
  }
72
73
 
74
/**
 * Picks the compact timeline badge (short label plus badge variant) for a
 * validation mode. Returns `null` when the mode is absent or not recognized.
 */
function validationModeBadge(
  mode?: string | null,
): { label: string; variant: "default" | "secondary" | "outline" } | null {
  if (mode === "host_replay") return { label: "replay", variant: "default" };
  if (mode === "llm_judge") return { label: "judge", variant: "secondary" };
  if (mode === "structural_guard") return { label: "structural", variant: "outline" };
  return null;
}
88
+
73
89
  /** Group evolution entries by proposal_id, ordered newest-first. */
74
90
  function groupByProposal(entries: EvolutionEntry[]) {
75
91
  const map = new Map<string, EvolutionEntry[]>();
@@ -201,6 +217,7 @@ export function EvolutionTimeline({ entries, selectedProposalId, onSelect }: Pro
201
217
  const lineColor = ACTION_LINE[terminal] ?? "bg-border";
202
218
  const isLast = groupIdx === groups.length - 1;
203
219
  const snapshot = findEvalSnapshot(steps);
220
+ const validationBadge = validationModeBadge(lastStep.validation_mode);
204
221
 
205
222
  return (
206
223
  <div key={proposalId} className="relative flex gap-3">
@@ -245,6 +262,11 @@ export function EvolutionTimeline({ entries, selectedProposalId, onSelect }: Pro
245
262
  <span className="text-[10px] text-muted-foreground">
246
263
  {timeAgo(lastStep.timestamp)}
247
264
  </span>
265
+ {validationBadge && (
266
+ <Badge variant={validationBadge.variant} className="text-[9px] uppercase">
267
+ {validationBadge.label}
268
+ </Badge>
269
+ )}
248
270
  </div>
249
271
  {/* Pass rate delta from eval snapshot */}
250
272
  {snapshot && (
@@ -31,6 +31,10 @@ export interface EvolutionEntry {
31
31
  action: string;
32
32
  details: string;
33
33
  eval_snapshot?: EvalSnapshot | null;
34
+ validation_mode?: "structural_guard" | "host_replay" | "llm_judge" | null;
35
+ validation_agent?: string | null;
36
+ validation_fixture_id?: string | null;
37
+ validation_evidence_ref?: string | null;
34
38
  }
35
39
 
36
40
  export interface UnmatchedQuery {
@@ -104,6 +104,19 @@ This is for packaging questions like:
104
104
  - "Are my sibling skills competing for the same user intent?"
105
105
  - "Should I stop evolving these independently and redesign the family?"
106
106
 
107
+ When trusted telemetry is sparse, the same command also emits a
108
+ `cold_start_suspicion` block. That is a weaker, earlier signal based on the
109
+ installed skill surfaces:
110
+
111
+ 1. Frontmatter / top-level description similarity
112
+ 2. Overlap in `## When to Use` language
113
+ 3. Shared command surface (for example, siblings that both wrap `mentor search`)
114
+ 4. Synthetic sibling-confusion probes derived from those overlapping surfaces
115
+
116
+ Treat `cold_start_suspicion.candidate` as architecture suspicion, not proof.
117
+ It is meant to tell you "this family may want a parent skill" before enough
118
+ real usage exists to confirm it through trusted positive-query overlap.
119
+
107
120
  ## Steps
108
121
 
109
122
  ### 1. Run Analysis
@@ -140,6 +153,7 @@ Interpretation:
140
153
 
141
154
  - `consolidation_candidate: false` means keep improving the sibling descriptions/workflows separately
142
155
  - `consolidation_candidate: true` means the problem is likely packaging, not just wording
156
+ - `cold_start_suspicion.candidate: true` means installed skill surfaces already look suspicious even though trusted telemetry is still sparse
143
157
  - `refactor_proposal` is a draft for human review only; do not auto-deploy a family rewrite
144
158
 
145
159
  ## Subagent Escalation
@@ -173,4 +187,4 @@ resolution plan with trigger ownership recommendations.
173
187
 
174
188
  **"Should I consolidate this sibling skill family?"**
175
189
 
176
- > Run `selftune eval family-overlap` and look for `consolidation_candidate` plus the `refactor_proposal`.
190
+ > Run `selftune eval family-overlap` and look for `consolidation_candidate` when you have live evidence, or `cold_start_suspicion` when you only have installed skill surfaces plus cold-start evals.
@@ -76,6 +76,29 @@ The evolution process writes multiple audit entries:
76
76
  | `validated` | Proposal tested against eval set | `eval_snapshot` with before/after pass rates |
77
77
  | `deployed` | Updated SKILL.md written to disk | `eval_snapshot` with final rates |
78
78
 
79
+ Routing/body validation may also carry provenance fields such as:
80
+
81
+ - `validation_mode` — `llm_judge`, `host_replay`, or `structural_guard`
82
+ - `validation_agent` — which host/agent performed the validation
83
+ - `validation_fixture_id` — fixture identifier when replay-backed validation is used
84
+ - `before_pass_rate` / `after_pass_rate` — only present when trigger validation actually ran; structural-guard exits do not emit synthetic pass rates
85
+
86
+ Most evolve runs today still validate through `llm_judge`. Routing evolution now
87
+ auto-builds a replay fixture from the target skill plus installed sibling
88
+ skills in the same registry. Replay-backed validation is preferred whenever
89
+ that local fixture can be constructed, because it captures host-style routing
90
+ behavior instead of model judgment.
91
+
92
+ The current replay path is fixture-backed: it evaluates the target routing table
93
+ against the installed target/competing skill surfaces in a controlled replay
94
+ fixture and records per-entry evidence. That is still a stronger signal than a
95
+ free-form judge prompt, but you should describe it as replay-backed validation,
96
+ not as live operator telemetry.
97
+
98
+ Replay parsing is intentionally conservative: unreadable skill files degrade to
99
+ empty surfaces instead of throwing, and malformed routing rows with empty
100
+ trigger cells are ignored rather than treated as valid triggers.
101
+
79
102
  ## Parsing Instructions
80
103
 
81
104
  ### Track Evolution Progress