selftune 0.1.4 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. package/.claude/agents/diagnosis-analyst.md +146 -0
  2. package/.claude/agents/evolution-reviewer.md +167 -0
  3. package/.claude/agents/integration-guide.md +200 -0
  4. package/.claude/agents/pattern-analyst.md +147 -0
  5. package/CHANGELOG.md +37 -0
  6. package/README.md +96 -256
  7. package/assets/BeforeAfter.gif +0 -0
  8. package/assets/FeedbackLoop.gif +0 -0
  9. package/assets/logo.svg +9 -0
  10. package/assets/skill-health-badge.svg +20 -0
  11. package/cli/selftune/activation-rules.ts +171 -0
  12. package/cli/selftune/badge/badge-data.ts +108 -0
  13. package/cli/selftune/badge/badge-svg.ts +212 -0
  14. package/cli/selftune/badge/badge.ts +103 -0
  15. package/cli/selftune/constants.ts +75 -1
  16. package/cli/selftune/contribute/bundle.ts +314 -0
  17. package/cli/selftune/contribute/contribute.ts +214 -0
  18. package/cli/selftune/contribute/sanitize.ts +162 -0
  19. package/cli/selftune/cron/setup.ts +266 -0
  20. package/cli/selftune/dashboard-server.ts +582 -0
  21. package/cli/selftune/dashboard.ts +25 -3
  22. package/cli/selftune/eval/baseline.ts +247 -0
  23. package/cli/selftune/eval/composability.ts +117 -0
  24. package/cli/selftune/eval/generate-unit-tests.ts +143 -0
  25. package/cli/selftune/eval/hooks-to-evals.ts +68 -2
  26. package/cli/selftune/eval/import-skillsbench.ts +221 -0
  27. package/cli/selftune/eval/synthetic-evals.ts +172 -0
  28. package/cli/selftune/eval/unit-test-cli.ts +152 -0
  29. package/cli/selftune/eval/unit-test.ts +196 -0
  30. package/cli/selftune/evolution/deploy-proposal.ts +142 -1
  31. package/cli/selftune/evolution/evolve-body.ts +492 -0
  32. package/cli/selftune/evolution/evolve.ts +466 -103
  33. package/cli/selftune/evolution/extract-patterns.ts +32 -1
  34. package/cli/selftune/evolution/pareto.ts +314 -0
  35. package/cli/selftune/evolution/propose-body.ts +171 -0
  36. package/cli/selftune/evolution/propose-description.ts +100 -2
  37. package/cli/selftune/evolution/propose-routing.ts +166 -0
  38. package/cli/selftune/evolution/refine-body.ts +141 -0
  39. package/cli/selftune/evolution/rollback.ts +19 -2
  40. package/cli/selftune/evolution/validate-body.ts +254 -0
  41. package/cli/selftune/evolution/validate-proposal.ts +257 -35
  42. package/cli/selftune/evolution/validate-routing.ts +177 -0
  43. package/cli/selftune/grading/grade-session.ts +138 -18
  44. package/cli/selftune/grading/pre-gates.ts +104 -0
  45. package/cli/selftune/hooks/auto-activate.ts +185 -0
  46. package/cli/selftune/hooks/evolution-guard.ts +165 -0
  47. package/cli/selftune/hooks/skill-change-guard.ts +112 -0
  48. package/cli/selftune/index.ts +88 -0
  49. package/cli/selftune/ingestors/claude-replay.ts +351 -0
  50. package/cli/selftune/ingestors/openclaw-ingest.ts +440 -0
  51. package/cli/selftune/init.ts +150 -3
  52. package/cli/selftune/memory/writer.ts +447 -0
  53. package/cli/selftune/monitoring/watch.ts +25 -2
  54. package/cli/selftune/status.ts +17 -13
  55. package/cli/selftune/types.ts +377 -5
  56. package/cli/selftune/utils/frontmatter.ts +217 -0
  57. package/cli/selftune/utils/llm-call.ts +29 -3
  58. package/cli/selftune/utils/transcript.ts +35 -0
  59. package/cli/selftune/utils/trigger-check.ts +89 -0
  60. package/cli/selftune/utils/tui.ts +156 -0
  61. package/dashboard/index.html +569 -8
  62. package/package.json +8 -4
  63. package/skill/SKILL.md +124 -8
  64. package/skill/Workflows/AutoActivation.md +144 -0
  65. package/skill/Workflows/Badge.md +118 -0
  66. package/skill/Workflows/Baseline.md +121 -0
  67. package/skill/Workflows/Composability.md +100 -0
  68. package/skill/Workflows/Contribute.md +91 -0
  69. package/skill/Workflows/Cron.md +155 -0
  70. package/skill/Workflows/Dashboard.md +203 -0
  71. package/skill/Workflows/Doctor.md +37 -1
  72. package/skill/Workflows/Evals.md +69 -1
  73. package/skill/Workflows/EvolutionMemory.md +152 -0
  74. package/skill/Workflows/Evolve.md +111 -6
  75. package/skill/Workflows/EvolveBody.md +159 -0
  76. package/skill/Workflows/ImportSkillsBench.md +111 -0
  77. package/skill/Workflows/Ingest.md +117 -3
  78. package/skill/Workflows/Initialize.md +57 -3
  79. package/skill/Workflows/Replay.md +70 -0
  80. package/skill/Workflows/Rollback.md +20 -1
  81. package/skill/Workflows/UnitTest.md +138 -0
  82. package/skill/Workflows/Watch.md +22 -0
  83. package/skill/settings_snippet.json +23 -0
  84. package/templates/activation-rules-default.json +27 -0
  85. package/templates/multi-skill-settings.json +64 -0
  86. package/templates/single-skill-settings.json +58 -0
@@ -0,0 +1,166 @@
1
+ /**
2
+ * propose-routing.ts
3
+ *
4
+ * Generates improved routing table proposals using LLM analysis of failure
5
+ * patterns. Targets the `## Workflow Routing` section of a SKILL.md file.
6
+ */
7
+
8
+ import type { BodyEvolutionProposal, EvolutionTarget, FailurePattern } from "../types.js";
9
+ import { callLlm, stripMarkdownFences } from "../utils/llm-call.js";
10
+
11
+ // ---------------------------------------------------------------------------
12
+ // System prompt
13
+ // ---------------------------------------------------------------------------
14
+
15
/**
 * System prompt for the routing table proposer LLM.
 *
 * Frames the model as a routing optimizer, lists the rules the proposed
 * routing table must follow (markdown table with Trigger / Workflow columns),
 * and pins the reply to a JSON object with exactly three fields:
 * "proposed_routing", "rationale" and "confidence".
 */
export const ROUTING_PROPOSER_SYSTEM = `You are a workflow routing optimizer for an AI agent skill system.

Your task is to analyze the current routing table and its failure patterns,
then propose an improved routing table that would correctly route missed queries
while preserving correct routing for existing queries.

Rules:
- The routing table must be a valid markdown table with | Trigger | Workflow | columns.
- Each row maps a trigger pattern to the workflow it should activate.
- Cover the semantic space of the missed queries without being too broad.
- Maintain the original intent and scope of the skill routing.
- Output ONLY valid JSON with exactly these fields:
- "proposed_routing" (string): the improved routing table in markdown format
- "rationale" (string): explanation of what changed and why
- "confidence" (number): 0.0-1.0 how confident you are this improves routing

Do NOT include any text outside the JSON object.`;
33
+
34
+ // ---------------------------------------------------------------------------
35
+ // Prompt builder
36
+ // ---------------------------------------------------------------------------
37
+
38
/**
 * Build the user prompt for a routing table proposal.
 *
 * @param currentRouting - Current routing table text, inserted verbatim.
 * @param fullSkillContent - Full skill content, inserted verbatim.
 * @param failurePatterns - Patterns to list; each contributes its id, frequency,
 *   invocation type and missed queries, plus any structured feedback entries.
 * @param missedQueries - Flat list of missed queries, rendered as quoted bullets.
 * @param skillName - Skill name used in the header and in the closing instruction.
 * @returns The assembled prompt text.
 */
export function buildRoutingProposalPrompt(
  currentRouting: string,
  fullSkillContent: string,
  failurePatterns: FailurePattern[],
  missedQueries: string[],
  skillName: string,
): string {
  // Renders a list of queries as quoted bullet lines.
  const quotedBullets = (queries: string[]): string =>
    queries.map((query) => ` - "${query}"`).join("\n");

  const patternBlocks: string[] = [];
  const analysisLines: string[] = [];
  for (const pattern of failurePatterns) {
    const heading = ` Pattern ${pattern.pattern_id} (frequency: ${pattern.frequency}, type: ${pattern.invocation_type}):`;
    patternBlocks.push(`${heading}\n${quotedBullets(pattern.missed_queries)}`);

    // Structured feedback is optional; patterns without it add nothing here.
    for (const item of pattern.feedback ?? []) {
      analysisLines.push(
        ` Query: "${item.query}"`,
        ` Failure reason: ${item.failure_reason}`,
        ` Improvement hint: ${item.improvement_hint}`,
      );
    }
  }

  // The analysis section is appended only when at least one feedback entry exists.
  let analysisSection = "";
  if (analysisLines.length > 0) {
    analysisSection = `\n\nStructured Failure Analysis:\n${analysisLines.join("\n")}`;
  }

  const sections = [
    `Skill Name: ${skillName}`,
    "",
    "Current Routing Table:",
    currentRouting,
    "",
    "Full Skill Content:",
    fullSkillContent,
    "",
    "Failure Patterns:",
    patternBlocks.join("\n\n"),
    "",
    "All Missed Queries:",
    `${quotedBullets(missedQueries)}${analysisSection}`,
    "",
    `Propose an improved routing table for the "${skillName}" skill that would correctly route the missed queries listed above. Output ONLY a JSON object with "proposed_routing", "rationale", and "confidence" fields.`,
  ];
  return sections.join("\n");
}
83
+
84
+ // ---------------------------------------------------------------------------
85
+ // Response parser
86
+ // ---------------------------------------------------------------------------
87
+
88
/**
 * Parse the LLM's raw reply into structured routing proposal data.
 *
 * The reply is passed through `stripMarkdownFences` and then decoded as JSON.
 *
 * @param raw - Raw text returned by the LLM.
 * @returns The `proposed_routing` and `rationale` strings plus `confidence`
 *   clamped into the range 0.0-1.0.
 * @throws Error when the text is not valid JSON, is not a JSON object, or any
 *   of the three fields is missing or has the wrong type.
 */
export function parseRoutingProposalResponse(raw: string): {
  proposed_routing: string;
  rationale: string;
  confidence: number;
} {
  const jsonText = stripMarkdownFences(raw);

  let decoded: unknown;
  try {
    decoded = JSON.parse(jsonText);
  } catch {
    // Include only a prefix of the payload so the error message stays bounded.
    throw new Error(`Failed to parse LLM response as JSON: ${jsonText.slice(0, 200)}`);
  }

  if (decoded === null || typeof decoded !== "object") {
    throw new Error("LLM response is not a JSON object");
  }

  const { proposed_routing, rationale, confidence } = decoded as Record<string, unknown>;

  if (typeof proposed_routing !== "string") {
    throw new Error("Missing or invalid 'proposed_routing' field in LLM response");
  }
  if (typeof rationale !== "string") {
    throw new Error("Missing or invalid 'rationale' field in LLM response");
  }
  if (typeof confidence !== "number") {
    throw new Error("Missing or invalid 'confidence' field in LLM response");
  }

  return {
    proposed_routing,
    rationale,
    // Out-of-range confidences are clamped rather than rejected.
    confidence: Math.min(1.0, Math.max(0.0, confidence)),
  };
}
127
+
128
+ // ---------------------------------------------------------------------------
129
+ // Proposal generator
130
+ // ---------------------------------------------------------------------------
131
+
132
/**
 * Generate a routing table evolution proposal using the LLM.
 *
 * Builds the user prompt, sends it together with `ROUTING_PROPOSER_SYSTEM`
 * through `callLlm`, parses the reply, and wraps the result in a pending
 * proposal whose `original_body` is the current routing table and whose
 * `proposed_body` is the LLM's proposed table.
 *
 * @param currentRouting - Current routing table text.
 * @param fullSkillContent - Full skill content given to the LLM as context.
 * @param failurePatterns - Failure patterns motivating the change; their ids
 *   are recorded on the proposal.
 * @param missedQueries - Queries the current routing missed.
 * @param skillName - Skill name; also embedded in the proposal id.
 * @param skillPath - Skill path stored on the proposal.
 * @param agent - Agent identifier forwarded to `callLlm`.
 * @param modelFlag - Optional model flag forwarded to `callLlm`.
 * @returns The new proposal with status "pending".
 */
export async function generateRoutingProposal(
  currentRouting: string,
  fullSkillContent: string,
  failurePatterns: FailurePattern[],
  missedQueries: string[],
  skillName: string,
  skillPath: string,
  agent: string,
  modelFlag?: string,
): Promise<BodyEvolutionProposal> {
  const userPrompt = buildRoutingProposalPrompt(
    currentRouting,
    fullSkillContent,
    failurePatterns,
    missedQueries,
    skillName,
  );
  const llmReply = await callLlm(ROUTING_PROPOSER_SYSTEM, userPrompt, agent, modelFlag);
  const suggestion = parseRoutingProposalResponse(llmReply);

  const proposal: BodyEvolutionProposal = {
    // Millisecond timestamp suffix distinguishes successive proposals for one skill.
    proposal_id: `evo-routing-${skillName}-${Date.now()}`,
    skill_name: skillName,
    skill_path: skillPath,
    original_body: currentRouting,
    proposed_body: suggestion.proposed_routing,
    rationale: suggestion.rationale,
    target: "routing" as EvolutionTarget,
    failure_patterns: failurePatterns.map((pattern) => pattern.pattern_id),
    confidence: suggestion.confidence,
    created_at: new Date().toISOString(),
    status: "pending",
  };
  return proposal;
}
@@ -0,0 +1,141 @@
1
+ /**
2
+ * refine-body.ts
3
+ *
4
+ * Takes failure feedback from a validation pass and asks the teacher LLM
5
+ * to revise specific sections of a body proposal.
6
+ */
7
+
8
+ import type { BodyEvolutionProposal, BodyValidationResult } from "../types.js";
9
+ import { callLlm, stripMarkdownFences } from "../utils/llm-call.js";
10
+
11
+ // ---------------------------------------------------------------------------
12
+ // System prompt
13
+ // ---------------------------------------------------------------------------
14
+
15
/**
 * System prompt for the body refiner (teacher) LLM.
 *
 * Tells the model it is revising a SKILL.md body that failed one or more
 * validation gates, lists the rules for the revision, and pins the reply to a
 * JSON object with exactly three fields: "refined_body", "changes_made" and
 * "confidence".
 */
export const BODY_REFINER_SYSTEM = `You are an expert skill document refiner for an AI agent routing system.

You are given a proposed SKILL.md body that failed one or more validation gates.
Your task is to revise the body to address the specific failures while preserving
the parts that passed validation.

Rules:
- Address each failure reason specifically.
- Preserve structural elements: ## Workflow Routing table, ## sections.
- Keep the routing table as a valid markdown table with | Trigger | Workflow | columns.
- Do not make unnecessary changes to parts that passed validation.
- Output ONLY valid JSON with exactly these fields:
- "refined_body" (string): the revised skill body (markdown, everything below the title)
- "changes_made" (string): summary of what was changed
- "confidence" (number): 0.0-1.0 how confident you are this addresses the failures

Do NOT include any text outside the JSON object.`;
33
+
34
+ // ---------------------------------------------------------------------------
35
+ // Prompt builder
36
+ // ---------------------------------------------------------------------------
37
+
38
/**
 * Build the refinement prompt from validation feedback.
 *
 * @param proposedBody - The proposed body that failed validation, inserted verbatim.
 * @param validationResult - Validation outcome; only gates with `passed` false
 *   are listed, each as "gate: reason".
 * @param skillName - Skill name shown in the prompt header.
 * @param regressionQueries - Optional queries that worked before but broke
 *   after; when present and non-empty they are appended as a separate section.
 * @returns The assembled prompt text.
 */
export function buildRefinementPrompt(
  proposedBody: string,
  validationResult: BodyValidationResult,
  skillName: string,
  regressionQueries?: string[],
): string {
  const failureBullets: string[] = [];
  for (const gateResult of validationResult.gate_results) {
    if (!gateResult.passed) {
      failureBullets.push(` - ${gateResult.gate}: ${gateResult.reason}`);
    }
  }

  // Stays empty when there are no regression queries to report.
  let regressionBlock = "";
  if (regressionQueries && regressionQueries.length > 0) {
    const quoted = regressionQueries.map((query) => ` - "${query}"`).join("\n");
    regressionBlock = `\n\nRegression Queries (these worked before but broke after):\n${quoted}`;
  }

  const sections = [
    `Skill Name: ${skillName}`,
    "",
    "Current Proposed Body:",
    proposedBody,
    "",
    "Failed Validation Gates:",
    failureBullets.join("\n"),
    regressionBlock,
    "",
    `Revise the proposed body to address the failed validation gates. Preserve what works, fix what doesn't. Output ONLY a JSON object with "refined_body", "changes_made", and "confidence" fields.`,
  ];
  return sections.join("\n");
}
66
+
67
+ // ---------------------------------------------------------------------------
68
+ // Response parser
69
+ // ---------------------------------------------------------------------------
70
+
71
/**
 * Parse the LLM's raw reply into structured refinement data.
 *
 * The reply is passed through `stripMarkdownFences` and then decoded as JSON.
 *
 * @param raw - Raw text returned by the LLM.
 * @returns The `refined_body` and `changes_made` strings plus `confidence`
 *   clamped into the range 0.0-1.0.
 * @throws Error when the text is not valid JSON, is not a JSON object, or any
 *   of the three fields is missing or has the wrong type.
 */
export function parseRefinementResponse(raw: string): {
  refined_body: string;
  changes_made: string;
  confidence: number;
} {
  const jsonText = stripMarkdownFences(raw);

  let decoded: unknown;
  try {
    decoded = JSON.parse(jsonText);
  } catch {
    // Include only a prefix of the payload so the error message stays bounded.
    throw new Error(`Failed to parse LLM response as JSON: ${jsonText.slice(0, 200)}`);
  }

  if (decoded === null || typeof decoded !== "object") {
    throw new Error("LLM response is not a JSON object");
  }

  const { refined_body, changes_made, confidence } = decoded as Record<string, unknown>;

  if (typeof refined_body !== "string") {
    throw new Error("Missing or invalid 'refined_body' field in LLM response");
  }
  if (typeof changes_made !== "string") {
    throw new Error("Missing or invalid 'changes_made' field in LLM response");
  }
  if (typeof confidence !== "number") {
    throw new Error("Missing or invalid 'confidence' field in LLM response");
  }

  return {
    refined_body,
    changes_made,
    // Out-of-range confidences are clamped rather than rejected.
    confidence: Math.min(1.0, Math.max(0.0, confidence)),
  };
}
110
+
111
+ // ---------------------------------------------------------------------------
112
+ // Refinement function
113
+ // ---------------------------------------------------------------------------
114
+
115
/**
 * Refine a body proposal based on validation feedback.
 *
 * Builds a refinement prompt from the proposal's current body and the
 * validation result (including its regressions), sends it with
 * `BODY_REFINER_SYSTEM` through `callLlm`, and returns a copy of the proposal
 * carrying the refined body.
 *
 * @param proposal - The proposal that failed validation; it is not mutated.
 * @param validationResult - Gate results and regressions to feed back to the LLM.
 * @param agent - Agent identifier forwarded to `callLlm`.
 * @param modelFlag - Optional model flag forwarded to `callLlm`.
 * @returns A new pending proposal with a derived id, the refined body, the
 *   original rationale extended by the refinement summary, and the LLM's
 *   confidence.
 */
export async function refineBodyProposal(
  proposal: BodyEvolutionProposal,
  validationResult: BodyValidationResult,
  agent: string,
  modelFlag?: string,
): Promise<BodyEvolutionProposal> {
  const userPrompt = buildRefinementPrompt(
    proposal.proposed_body,
    validationResult,
    proposal.skill_name,
    validationResult.regressions,
  );

  const llmReply = await callLlm(BODY_REFINER_SYSTEM, userPrompt, agent, modelFlag);
  const refinement = parseRefinementResponse(llmReply);

  const refined: BodyEvolutionProposal = {
    ...proposal,
    // Derive the id from the parent proposal so the lineage stays visible.
    proposal_id: `${proposal.proposal_id}-refined-${Date.now()}`,
    proposed_body: refinement.refined_body,
    rationale: `${proposal.rationale}\n\nRefinement: ${refinement.changes_made}`,
    confidence: refinement.confidence,
    created_at: new Date().toISOString(),
    status: "pending",
  };
  return refined;
}
@@ -11,6 +11,7 @@ import { existsSync, readdirSync, readFileSync, unlinkSync, writeFileSync } from
11
11
  import { basename, dirname, join } from "node:path";
12
12
  import { parseArgs } from "node:util";
13
13
 
14
+ import { updateContextAfterRollback } from "../memory/writer.js";
14
15
  import type { EvolutionAuditEntry } from "../types.js";
15
16
  import { appendAuditEntry, getLastDeployedProposal, readAuditTrail } from "./audit.js";
16
17
  import { replaceDescription } from "./deploy-proposal.js";
@@ -153,11 +154,19 @@ export async function rollback(options: RollbackOptions): Promise<RollbackResult
153
154
  };
154
155
  appendAuditEntry(auditEntry, logPath);
155
156
 
156
- return {
157
+ const backupResult: RollbackResult = {
157
158
  rolledBack: true,
158
159
  restoredDescription: originalContent,
159
160
  reason: "Restored from backup file",
160
161
  };
162
+
163
+ try {
164
+ updateContextAfterRollback(skillName, backupResult);
165
+ } catch {
166
+ // Memory writes should never fail the main operation
167
+ }
168
+
169
+ return backupResult;
161
170
  }
162
171
 
163
172
  // Strategy 2: Restore from audit trail's created entry (description only)
@@ -177,11 +186,19 @@ export async function rollback(options: RollbackOptions): Promise<RollbackResult
177
186
  };
178
187
  appendAuditEntry(auditEntry, logPath);
179
188
 
180
- return {
189
+ const auditResult: RollbackResult = {
181
190
  rolledBack: true,
182
191
  restoredDescription: originalFromAudit,
183
192
  reason: "Restored from audit trail",
184
193
  };
194
+
195
+ try {
196
+ updateContextAfterRollback(skillName, auditResult);
197
+ } catch {
198
+ // Memory writes should never fail the main operation
199
+ }
200
+
201
+ return auditResult;
185
202
  }
186
203
 
187
204
  // No restoration source available
@@ -0,0 +1,254 @@
1
+ /**
2
+ * validate-body.ts
3
+ *
4
+ * 3-gate validation for full body evolution proposals:
5
+ * Gate 1 (structural): Pure code — YAML frontmatter, # Title, ## Workflow Routing preserved
6
+ * Gate 2 (trigger accuracy): Student model YES/NO per eval entry
7
+ * Gate 3 (quality): Student model rates body clarity/completeness 0.0-1.0
8
+ */
9
+
10
+ import type { BodyEvolutionProposal, BodyValidationResult, EvalEntry } from "../types.js";
11
+ import { callLlm, stripMarkdownFences } from "../utils/llm-call.js";
12
+ import { buildTriggerCheckPrompt, parseTriggerResponse } from "../utils/trigger-check.js";
13
+
14
+ // ---------------------------------------------------------------------------
15
+ // Gate 1: Structural validation (pure code, no LLM)
16
+ // ---------------------------------------------------------------------------
17
+
18
/**
 * Gate 1: check that a proposed body preserves the required structural elements.
 *
 * Verifies, in order:
 *   - the body is non-empty after trimming;
 *   - it contains a `## Workflow Routing` heading at the start of a line (up to
 *     three leading spaces allowed), so a deeper heading such as
 *     `### Workflow Routing` or an inline mention does not count;
 *   - that section (up to the next `## ` heading or end of text) holds a
 *     markdown table: a header row, a delimiter row (`| --- | --- |`), and at
 *     least one data row.
 *
 * @param proposedBody - Proposed skill body (markdown).
 * @returns `valid` plus a human-readable `reason`; `reason` names the first
 *   failed check, or reports success.
 */
export function validateBodyStructure(proposedBody: string): { valid: boolean; reason: string } {
  if (!proposedBody || proposedBody.trim().length === 0) {
    return { valid: false, reason: "Proposed body is empty" };
  }

  // Anchor to the start of a line: a plain substring search would also accept
  // "### Workflow Routing" or a mention of the heading inside a sentence.
  const heading = /^[ \t]{0,3}## Workflow Routing/m.exec(proposedBody);
  if (heading === null) {
    return { valid: false, reason: "Missing required '## Workflow Routing' section" };
  }

  // The section runs until the next level-2 heading, or to the end of the text.
  const afterRouting = proposedBody.slice(heading.index + heading[0].length);
  const nextSection = /\n## /.exec(afterRouting);
  const routingContent = nextSection ? afterRouting.slice(0, nextSection.index) : afterRouting;

  // Collect pipe-delimited rows (trim also drops a trailing "\r" from CRLF input).
  const tableLines = routingContent
    .split("\n")
    .map((line) => line.trim())
    .filter((line) => line.startsWith("|") && line.endsWith("|"));

  // A usable routing table needs header + delimiter + at least one route, and
  // the second row must be the delimiter (cells of dashes, optional colons).
  const delimiterRow = /^\|(\s*:?-+:?\s*\|)+$/;
  const hasTable = tableLines.length >= 3 && delimiterRow.test(tableLines[1] ?? "");
  if (!hasTable) {
    return {
      valid: false,
      reason:
        "Workflow Routing section lacks a valid markdown table (need header + separator + rows)",
    };
  }

  return { valid: true, reason: "Structural validation passed" };
}
58
+
59
+ // ---------------------------------------------------------------------------
60
+ // Gate 2: Trigger accuracy (student model YES/NO)
61
+ // ---------------------------------------------------------------------------
62
+
63
/**
 * Gate 2: run trigger checks on the eval set against both the original and
 * the proposed body and compare pass rates.
 *
 * For every eval entry the model is asked (YES/NO) once with the original body
 * and once with the proposed body, one call after the other. An entry passes
 * when the parsed answer agrees with the entry's `should_trigger`.
 *
 * @param originalBody - Body currently in use.
 * @param proposedBody - Candidate replacement body.
 * @param evalSet - Entries to check; an empty set yields zero rates and
 *   `improved: false` without calling the LLM.
 * @param agent - Agent identifier forwarded to `callLlm`.
 * @param modelFlag - Optional model flag forwarded to `callLlm`.
 * @returns Before/after pass rates, whether the rate strictly increased, and
 *   the queries that passed before but fail after.
 */
export async function validateBodyTriggerAccuracy(
  originalBody: string,
  proposedBody: string,
  evalSet: EvalEntry[],
  agent: string,
  modelFlag?: string,
): Promise<{
  before_pass_rate: number;
  after_pass_rate: number;
  improved: boolean;
  regressions: string[];
}> {
  const entryCount = evalSet.length;
  if (entryCount === 0) {
    return { before_pass_rate: 0, after_pass_rate: 0, improved: false, regressions: [] };
  }

  const yesNoSystemPrompt = "You are an evaluation assistant. Answer only YES or NO.";

  // Asks the model about one body/entry pair and reports whether its answer
  // matches the expected trigger decision.
  const answersAsExpected = async (body: string, entry: EvalEntry): Promise<boolean> => {
    const question = buildTriggerCheckPrompt(body, entry.query);
    const reply = await callLlm(yesNoSystemPrompt, question, agent, modelFlag);
    return Boolean(entry.should_trigger) === Boolean(parseTriggerResponse(reply));
  };

  const passCounts = { before: 0, after: 0 };
  const regressions: string[] = [];

  for (const entry of evalSet) {
    // Original body first, then the proposed body.
    const passedBefore = await answersAsExpected(originalBody, entry);
    const passedAfter = await answersAsExpected(proposedBody, entry);

    if (passedBefore) passCounts.before += 1;
    if (passedAfter) passCounts.after += 1;

    // A regression is an entry that was right before and is wrong now.
    if (passedBefore && !passedAfter) {
      regressions.push(entry.query);
    }
  }

  const beforeRate = passCounts.before / entryCount;
  const afterRate = passCounts.after / entryCount;

  return {
    before_pass_rate: beforeRate,
    after_pass_rate: afterRate,
    improved: afterRate > beforeRate,
    regressions,
  };
}
123
+
124
+ // ---------------------------------------------------------------------------
125
+ // Gate 3: Quality assessment (student model 0.0-1.0)
126
+ // ---------------------------------------------------------------------------
127
+
128
/**
 * System prompt for the quality assessment model.
 *
 * Lists the dimensions to rate (clarity, completeness, structure, routing
 * accuracy) and pins the reply to a JSON object with exactly two fields:
 * "score" (0.0-1.0) and "reason".
 */
const QUALITY_ASSESSMENT_SYSTEM = `You are a skill document quality assessor for an AI agent system.

Rate the quality of the provided skill document body on these dimensions:
- Clarity: Is the description clear and unambiguous?
- Completeness: Does it cover the expected use cases?
- Structure: Is it well-organized with proper sections?
- Routing accuracy: Does the routing table seem comprehensive?

Output ONLY valid JSON with exactly these fields:
- "score" (number): Overall quality score 0.0-1.0
- "reason" (string): Brief explanation of the score

Do NOT include any text outside the JSON object.`;
142
+
143
/**
 * Gate 3 helper: ask the model to rate the quality of a proposed body.
 *
 * Unlike the proposal parsers, a malformed reply does not throw: it falls
 * back to a score of 0.5 with an explanatory reason.
 *
 * @param proposedBody - Proposed skill body to rate.
 * @param skillName - Skill name shown in the prompt.
 * @param agent - Agent identifier forwarded to `callLlm`.
 * @param modelFlag - Optional model flag forwarded to `callLlm`.
 * @returns A score clamped into 0.0-1.0 and the model's reason (or a
 *   placeholder when the reply carried none).
 */
export async function assessBodyQuality(
  proposedBody: string,
  skillName: string,
  agent: string,
  modelFlag?: string,
): Promise<{ score: number; reason: string }> {
  const FALLBACK_SCORE = 0.5;

  const userPrompt = [
    `Skill Name: ${skillName}`,
    "",
    "Proposed Skill Body:",
    proposedBody,
    "",
    `Rate the quality of this skill document body. Output ONLY a JSON object with "score" (0.0-1.0) and "reason" fields.`,
  ].join("\n");

  const llmReply = await callLlm(QUALITY_ASSESSMENT_SYSTEM, userPrompt, agent, modelFlag);
  const jsonText = stripMarkdownFences(llmReply);

  let decoded: unknown;
  try {
    decoded = JSON.parse(jsonText);
  } catch {
    // Unparseable reply: fall back to the default score instead of failing.
    return { score: FALLBACK_SCORE, reason: "Failed to parse quality assessment response" };
  }

  if (decoded === null || typeof decoded !== "object") {
    return { score: FALLBACK_SCORE, reason: "Quality assessment response is not a JSON object" };
  }

  const fields = decoded as Record<string, unknown>;

  let score = FALLBACK_SCORE;
  if (typeof fields.score === "number") {
    score = Math.min(1.0, Math.max(0.0, fields.score));
  }

  let reason = "No reason provided";
  if (typeof fields.reason === "string") {
    reason = fields.reason;
  }

  return { score, reason };
}
178
+
179
+ // ---------------------------------------------------------------------------
180
+ // Full 3-gate body validation
181
+ // ---------------------------------------------------------------------------
182
+
183
/** Minimum quality score to pass Gate 3. */
const QUALITY_THRESHOLD = 0.6;

/**
 * Validate a body proposal through all 3 gates.
 *
 * Gate 1 (structural) runs first; if it fails, the function returns
 * immediately with zero gates passed and no LLM calls. Otherwise Gate 2
 * (trigger accuracy, passes only when the pass rate strictly improved) and
 * Gate 3 (quality score at or above the threshold) both run, in that order.
 *
 * @param proposal - Proposal whose `original_body` / `proposed_body` are compared.
 * @param evalSet - Eval entries used for the trigger accuracy gate.
 * @param agent - Agent identifier forwarded to the LLM-backed gates.
 * @param modelFlag - Optional model flag forwarded to the LLM-backed gates.
 * @param qualityThreshold - Minimum Gate 3 score; defaults to `QUALITY_THRESHOLD`.
 * @returns Per-gate results, the number of gates passed out of 3, `improved`
 *   (true only when all 3 passed) and the regressions found by Gate 2.
 */
export async function validateBodyProposal(
  proposal: BodyEvolutionProposal,
  evalSet: EvalEntry[],
  agent: string,
  modelFlag?: string,
  qualityThreshold = QUALITY_THRESHOLD,
): Promise<BodyValidationResult> {
  const TOTAL_GATES = 3;
  const gateResults: Array<{ gate: string; passed: boolean; reason: string }> = [];

  // Gate 1: structural validation (pure code)
  const { valid: structureOk, reason: structureReason } = validateBodyStructure(
    proposal.proposed_body,
  );
  gateResults.push({ gate: "structural", passed: structureOk, reason: structureReason });

  if (!structureOk) {
    // Skip the LLM-backed gates entirely for a structurally broken body.
    return {
      proposal_id: proposal.proposal_id,
      gates_passed: 0,
      gates_total: TOTAL_GATES,
      gate_results: gateResults,
      improved: false,
      regressions: [],
    };
  }

  // Gate 2: trigger accuracy (student model)
  const accuracy = await validateBodyTriggerAccuracy(
    proposal.original_body,
    proposal.proposed_body,
    evalSet,
    agent,
    modelFlag,
  );
  const asPercent = (rate: number): string => (rate * 100).toFixed(1);
  const verdict = accuracy.improved ? "Improved" : "Not improved";
  gateResults.push({
    gate: "trigger_accuracy",
    passed: accuracy.improved,
    reason: `${verdict}: ${asPercent(accuracy.before_pass_rate)}% -> ${asPercent(accuracy.after_pass_rate)}%`,
  });

  // Gate 3: quality assessment (student model)
  const quality = await assessBodyQuality(
    proposal.proposed_body,
    proposal.skill_name,
    agent,
    modelFlag,
  );
  gateResults.push({
    gate: "quality",
    passed: quality.score >= qualityThreshold,
    reason: `Quality score: ${quality.score.toFixed(2)} (threshold: ${qualityThreshold}) — ${quality.reason}`,
  });

  let passedCount = 0;
  for (const gateResult of gateResults) {
    if (gateResult.passed) passedCount += 1;
  }

  return {
    proposal_id: proposal.proposal_id,
    gates_passed: passedCount,
    gates_total: TOTAL_GATES,
    gate_results: gateResults,
    improved: passedCount === TOTAL_GATES,
    regressions: accuracy.regressions,
  };
}