selftune 0.1.4 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/agents/diagnosis-analyst.md +146 -0
- package/.claude/agents/evolution-reviewer.md +167 -0
- package/.claude/agents/integration-guide.md +200 -0
- package/.claude/agents/pattern-analyst.md +147 -0
- package/CHANGELOG.md +37 -0
- package/README.md +96 -256
- package/assets/BeforeAfter.gif +0 -0
- package/assets/FeedbackLoop.gif +0 -0
- package/assets/logo.svg +9 -0
- package/assets/skill-health-badge.svg +20 -0
- package/cli/selftune/activation-rules.ts +171 -0
- package/cli/selftune/badge/badge-data.ts +108 -0
- package/cli/selftune/badge/badge-svg.ts +212 -0
- package/cli/selftune/badge/badge.ts +103 -0
- package/cli/selftune/constants.ts +75 -1
- package/cli/selftune/contribute/bundle.ts +314 -0
- package/cli/selftune/contribute/contribute.ts +214 -0
- package/cli/selftune/contribute/sanitize.ts +162 -0
- package/cli/selftune/cron/setup.ts +266 -0
- package/cli/selftune/dashboard-server.ts +582 -0
- package/cli/selftune/dashboard.ts +25 -3
- package/cli/selftune/eval/baseline.ts +247 -0
- package/cli/selftune/eval/composability.ts +117 -0
- package/cli/selftune/eval/generate-unit-tests.ts +143 -0
- package/cli/selftune/eval/hooks-to-evals.ts +68 -2
- package/cli/selftune/eval/import-skillsbench.ts +221 -0
- package/cli/selftune/eval/synthetic-evals.ts +172 -0
- package/cli/selftune/eval/unit-test-cli.ts +152 -0
- package/cli/selftune/eval/unit-test.ts +196 -0
- package/cli/selftune/evolution/deploy-proposal.ts +142 -1
- package/cli/selftune/evolution/evolve-body.ts +492 -0
- package/cli/selftune/evolution/evolve.ts +466 -103
- package/cli/selftune/evolution/extract-patterns.ts +32 -1
- package/cli/selftune/evolution/pareto.ts +314 -0
- package/cli/selftune/evolution/propose-body.ts +171 -0
- package/cli/selftune/evolution/propose-description.ts +100 -2
- package/cli/selftune/evolution/propose-routing.ts +166 -0
- package/cli/selftune/evolution/refine-body.ts +141 -0
- package/cli/selftune/evolution/rollback.ts +19 -2
- package/cli/selftune/evolution/validate-body.ts +254 -0
- package/cli/selftune/evolution/validate-proposal.ts +257 -35
- package/cli/selftune/evolution/validate-routing.ts +177 -0
- package/cli/selftune/grading/grade-session.ts +138 -18
- package/cli/selftune/grading/pre-gates.ts +104 -0
- package/cli/selftune/hooks/auto-activate.ts +185 -0
- package/cli/selftune/hooks/evolution-guard.ts +165 -0
- package/cli/selftune/hooks/skill-change-guard.ts +112 -0
- package/cli/selftune/index.ts +88 -0
- package/cli/selftune/ingestors/claude-replay.ts +351 -0
- package/cli/selftune/ingestors/openclaw-ingest.ts +440 -0
- package/cli/selftune/init.ts +150 -3
- package/cli/selftune/memory/writer.ts +447 -0
- package/cli/selftune/monitoring/watch.ts +25 -2
- package/cli/selftune/status.ts +17 -13
- package/cli/selftune/types.ts +377 -5
- package/cli/selftune/utils/frontmatter.ts +217 -0
- package/cli/selftune/utils/llm-call.ts +29 -3
- package/cli/selftune/utils/transcript.ts +35 -0
- package/cli/selftune/utils/trigger-check.ts +89 -0
- package/cli/selftune/utils/tui.ts +156 -0
- package/dashboard/index.html +569 -8
- package/package.json +8 -4
- package/skill/SKILL.md +124 -8
- package/skill/Workflows/AutoActivation.md +144 -0
- package/skill/Workflows/Badge.md +118 -0
- package/skill/Workflows/Baseline.md +121 -0
- package/skill/Workflows/Composability.md +100 -0
- package/skill/Workflows/Contribute.md +91 -0
- package/skill/Workflows/Cron.md +155 -0
- package/skill/Workflows/Dashboard.md +203 -0
- package/skill/Workflows/Doctor.md +37 -1
- package/skill/Workflows/Evals.md +69 -1
- package/skill/Workflows/EvolutionMemory.md +152 -0
- package/skill/Workflows/Evolve.md +111 -6
- package/skill/Workflows/EvolveBody.md +159 -0
- package/skill/Workflows/ImportSkillsBench.md +111 -0
- package/skill/Workflows/Ingest.md +117 -3
- package/skill/Workflows/Initialize.md +57 -3
- package/skill/Workflows/Replay.md +70 -0
- package/skill/Workflows/Rollback.md +20 -1
- package/skill/Workflows/UnitTest.md +138 -0
- package/skill/Workflows/Watch.md +22 -0
- package/skill/settings_snippet.json +23 -0
- package/templates/activation-rules-default.json +27 -0
- package/templates/multi-skill-settings.json +64 -0
- package/templates/single-skill-settings.json +58 -0
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* propose-routing.ts
|
|
3
|
+
*
|
|
4
|
+
* Generates improved routing table proposals using LLM analysis of failure
|
|
5
|
+
* patterns. Targets the `## Workflow Routing` section of a SKILL.md file.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import type { BodyEvolutionProposal, EvolutionTarget, FailurePattern } from "../types.js";
|
|
9
|
+
import { callLlm, stripMarkdownFences } from "../utils/llm-call.js";
|
|
10
|
+
|
|
11
|
+
// ---------------------------------------------------------------------------
|
|
12
|
+
// System prompt
|
|
13
|
+
// ---------------------------------------------------------------------------
|
|
14
|
+
|
|
15
|
+
/**
 * System prompt for the LLM that proposes routing tables.
 *
 * It asks for a JSON object carrying `proposed_routing`, `rationale` and
 * `confidence`. The prompt is assembled from individual lines so that every
 * line break in the final text is explicit.
 */
export const ROUTING_PROPOSER_SYSTEM = [
  "You are a workflow routing optimizer for an AI agent skill system.",
  "",
  "Your task is to analyze the current routing table and its failure patterns,",
  "then propose an improved routing table that would correctly route missed queries",
  "while preserving correct routing for existing queries.",
  "",
  "Rules:",
  "- The routing table must be a valid markdown table with | Trigger | Workflow | columns.",
  "- Each row maps a trigger pattern to the workflow it should activate.",
  "- Cover the semantic space of the missed queries without being too broad.",
  "- Maintain the original intent and scope of the skill routing.",
  "- Output ONLY valid JSON with exactly these fields:",
  '- "proposed_routing" (string): the improved routing table in markdown format',
  '- "rationale" (string): explanation of what changed and why',
  '- "confidence" (number): 0.0-1.0 how confident you are this improves routing',
  "",
  "Do NOT include any text outside the JSON object.",
].join("\n");
|
|
33
|
+
|
|
34
|
+
// ---------------------------------------------------------------------------
|
|
35
|
+
// Prompt builder
|
|
36
|
+
// ---------------------------------------------------------------------------
|
|
37
|
+
|
|
38
|
+
/**
 * Assemble the user prompt that asks the LLM for an improved routing table.
 *
 * Sections appear in this order: skill name, current routing table, full
 * skill content, failure patterns (each with its missed queries), the flat
 * list of missed queries and, only when at least one pattern carries
 * feedback, a "Structured Failure Analysis" section. The prompt ends with
 * the instruction to answer with a JSON object.
 *
 * @param currentRouting - Current routing table text, inserted verbatim.
 * @param fullSkillContent - Full skill content, inserted verbatim.
 * @param failurePatterns - Patterns to render (id, frequency, invocation type,
 *   missed queries and optional feedback entries).
 * @param missedQueries - Queries listed under "All Missed Queries".
 * @param skillName - Skill name used in the header and the closing instruction.
 * @returns The complete prompt text.
 */
export function buildRoutingProposalPrompt(
  currentRouting: string,
  fullSkillContent: string,
  failurePatterns: FailurePattern[],
  missedQueries: string[],
  skillName: string,
): string {
  const asBullet = (query: string): string => ` - "${query}"`;

  // One block per pattern: a header line followed by its quoted queries.
  const patternBlocks = failurePatterns.map((pattern) => {
    const header = ` Pattern ${pattern.pattern_id} (frequency: ${pattern.frequency}, type: ${pattern.invocation_type}):`;
    const bullets = pattern.missed_queries.map((query) => asBullet(query)).join("\n");
    return `${header}\n${bullets}`;
  });

  // Three lines per feedback entry; patterns without feedback contribute nothing.
  const feedbackLines = failurePatterns.flatMap((pattern) =>
    (pattern.feedback ?? []).flatMap((item) => [
      ` Query: "${item.query}"`,
      ` Failure reason: ${item.failure_reason}`,
      ` Improvement hint: ${item.improvement_hint}`,
    ]),
  );

  let feedbackSection = "";
  if (feedbackLines.length > 0) {
    feedbackSection = `\n\nStructured Failure Analysis:\n${feedbackLines.join("\n")}`;
  }

  const allMissed = missedQueries.map((query) => asBullet(query)).join("\n");

  return [
    `Skill Name: ${skillName}`,
    "",
    "Current Routing Table:",
    `${currentRouting}`,
    "",
    "Full Skill Content:",
    `${fullSkillContent}`,
    "",
    "Failure Patterns:",
    patternBlocks.join("\n\n"),
    "",
    "All Missed Queries:",
    `${allMissed}${feedbackSection}`,
    "",
    `Propose an improved routing table for the "${skillName}" skill that would correctly route the missed queries listed above. Output ONLY a JSON object with "proposed_routing", "rationale", and "confidence" fields.`,
  ].join("\n");
}
|
|
83
|
+
|
|
84
|
+
// ---------------------------------------------------------------------------
|
|
85
|
+
// Response parser
|
|
86
|
+
// ---------------------------------------------------------------------------
|
|
87
|
+
|
|
88
|
+
/**
 * Turn the raw LLM reply into a validated routing proposal payload.
 *
 * The reply is passed through `stripMarkdownFences`, parsed as JSON and then
 * checked field by field. `confidence` is clamped into the range [0, 1].
 *
 * @param raw - Raw text returned by the LLM.
 * @returns The `proposed_routing`, `rationale` and clamped `confidence`.
 * @throws Error when the text is not valid JSON, is not a JSON object, or a
 *   field is missing or has the wrong type.
 */
export function parseRoutingProposalResponse(raw: string): {
  proposed_routing: string;
  rationale: string;
  confidence: number;
} {
  const text = stripMarkdownFences(raw);

  let payload: unknown;
  try {
    payload = JSON.parse(text);
  } catch {
    // Include a short excerpt so the offending reply can be identified.
    throw new Error(`Failed to parse LLM response as JSON: ${text.slice(0, 200)}`);
  }

  if (payload === null || typeof payload !== "object") {
    throw new Error("LLM response is not a JSON object");
  }

  const { proposed_routing, rationale, confidence } = payload as Record<string, unknown>;

  if (typeof proposed_routing !== "string") {
    throw new Error("Missing or invalid 'proposed_routing' field in LLM response");
  }
  if (typeof rationale !== "string") {
    throw new Error("Missing or invalid 'rationale' field in LLM response");
  }
  if (typeof confidence !== "number") {
    throw new Error("Missing or invalid 'confidence' field in LLM response");
  }

  const upperBounded = Math.min(1.0, confidence);
  return {
    proposed_routing,
    rationale,
    confidence: Math.max(0.0, upperBounded),
  };
}
|
|
127
|
+
|
|
128
|
+
// ---------------------------------------------------------------------------
|
|
129
|
+
// Proposal generator
|
|
130
|
+
// ---------------------------------------------------------------------------
|
|
131
|
+
|
|
132
|
+
/**
 * Ask the LLM for an improved routing table and wrap the answer as a
 * pending proposal.
 *
 * Builds the prompt with `buildRoutingProposalPrompt`, sends it together with
 * `ROUTING_PROPOSER_SYSTEM` through `callLlm`, and parses the reply with
 * `parseRoutingProposalResponse` (whose errors propagate to the caller).
 *
 * @param currentRouting - Current routing table; stored as `original_body`.
 * @param fullSkillContent - Full skill content given to the LLM as context.
 * @param failurePatterns - Patterns driving the proposal; their ids are recorded.
 * @param missedQueries - Missed queries listed in the prompt.
 * @param skillName - Skill name; also embedded in the proposal id.
 * @param skillPath - Stored as `skill_path` on the proposal.
 * @param agent - Forwarded to `callLlm`.
 * @param modelFlag - Optional value forwarded to `callLlm`.
 * @returns A proposal with `target` "routing" and `status` "pending".
 */
export async function generateRoutingProposal(
  currentRouting: string,
  fullSkillContent: string,
  failurePatterns: FailurePattern[],
  missedQueries: string[],
  skillName: string,
  skillPath: string,
  agent: string,
  modelFlag?: string,
): Promise<BodyEvolutionProposal> {
  const userPrompt = buildRoutingProposalPrompt(
    currentRouting,
    fullSkillContent,
    failurePatterns,
    missedQueries,
    skillName,
  );
  const llmReply = await callLlm(ROUTING_PROPOSER_SYSTEM, userPrompt, agent, modelFlag);
  const answer = parseRoutingProposalResponse(llmReply);

  const proposal: BodyEvolutionProposal = {
    // The timestamp suffix distinguishes successive proposals for the same skill.
    proposal_id: `evo-routing-${skillName}-${Date.now()}`,
    skill_name: skillName,
    skill_path: skillPath,
    original_body: currentRouting,
    proposed_body: answer.proposed_routing,
    rationale: answer.rationale,
    target: "routing" as EvolutionTarget,
    failure_patterns: failurePatterns.map(({ pattern_id }) => pattern_id),
    confidence: answer.confidence,
    created_at: new Date().toISOString(),
    status: "pending",
  };
  return proposal;
}
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* refine-body.ts
|
|
3
|
+
*
|
|
4
|
+
* Takes failure feedback from a validation pass and asks the teacher LLM
|
|
5
|
+
* to revise specific sections of a body proposal.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import type { BodyEvolutionProposal, BodyValidationResult } from "../types.js";
|
|
9
|
+
import { callLlm, stripMarkdownFences } from "../utils/llm-call.js";
|
|
10
|
+
|
|
11
|
+
// ---------------------------------------------------------------------------
|
|
12
|
+
// System prompt
|
|
13
|
+
// ---------------------------------------------------------------------------
|
|
14
|
+
|
|
15
|
+
/**
 * System prompt for the "teacher" LLM that revises a body proposal after it
 * failed validation.
 *
 * It asks for a JSON object carrying `refined_body`, `changes_made` and
 * `confidence`. The prompt is assembled from individual lines so that every
 * line break in the final text is explicit.
 */
export const BODY_REFINER_SYSTEM = [
  "You are an expert skill document refiner for an AI agent routing system.",
  "",
  "You are given a proposed SKILL.md body that failed one or more validation gates.",
  "Your task is to revise the body to address the specific failures while preserving",
  "the parts that passed validation.",
  "",
  "Rules:",
  "- Address each failure reason specifically.",
  "- Preserve structural elements: ## Workflow Routing table, ## sections.",
  "- Keep the routing table as a valid markdown table with | Trigger | Workflow | columns.",
  "- Do not make unnecessary changes to parts that passed validation.",
  "- Output ONLY valid JSON with exactly these fields:",
  '- "refined_body" (string): the revised skill body (markdown, everything below the title)',
  '- "changes_made" (string): summary of what was changed',
  '- "confidence" (number): 0.0-1.0 how confident you are this addresses the failures',
  "",
  "Do NOT include any text outside the JSON object.",
].join("\n");
|
|
33
|
+
|
|
34
|
+
// ---------------------------------------------------------------------------
|
|
35
|
+
// Prompt builder
|
|
36
|
+
// ---------------------------------------------------------------------------
|
|
37
|
+
|
|
38
|
+
/**
 * Assemble the user prompt that asks the refiner LLM to repair a body
 * proposal.
 *
 * The prompt contains the skill name, the body that failed, one line per
 * failed gate (`gate: reason`) and, when regression queries are supplied, a
 * list of them. It ends with the instruction to answer with a JSON object.
 *
 * @param proposedBody - Body text that failed validation, inserted verbatim.
 * @param validationResult - Validation outcome; only gates with a falsy
 *   `passed` are listed.
 * @param skillName - Skill name shown in the header.
 * @param regressionQueries - Optional queries that passed before and fail now.
 * @returns The complete prompt text.
 */
export function buildRefinementPrompt(
  proposedBody: string,
  validationResult: BodyValidationResult,
  skillName: string,
  regressionQueries?: string[],
): string {
  const failedGateLines: string[] = [];
  for (const gate of validationResult.gate_results) {
    if (!gate.passed) {
      failedGateLines.push(` - ${gate.gate}: ${gate.reason}`);
    }
  }

  // Stays empty when there are no regressions, leaving only blank lines in its place.
  let regressionSection = "";
  if (regressionQueries && regressionQueries.length > 0) {
    const bullets = regressionQueries.map((query) => ` - "${query}"`).join("\n");
    regressionSection = `\n\nRegression Queries (these worked before but broke after):\n${bullets}`;
  }

  return [
    `Skill Name: ${skillName}`,
    "",
    "Current Proposed Body:",
    `${proposedBody}`,
    "",
    "Failed Validation Gates:",
    failedGateLines.join("\n"),
    regressionSection,
    "",
    `Revise the proposed body to address the failed validation gates. Preserve what works, fix what doesn't. Output ONLY a JSON object with "refined_body", "changes_made", and "confidence" fields.`,
  ].join("\n");
}
|
|
66
|
+
|
|
67
|
+
// ---------------------------------------------------------------------------
|
|
68
|
+
// Response parser
|
|
69
|
+
// ---------------------------------------------------------------------------
|
|
70
|
+
|
|
71
|
+
/**
 * Turn the raw refiner reply into a validated refinement payload.
 *
 * The reply is passed through `stripMarkdownFences`, parsed as JSON and then
 * checked field by field. `confidence` is clamped into the range [0, 1].
 *
 * @param raw - Raw text returned by the LLM.
 * @returns The `refined_body`, `changes_made` and clamped `confidence`.
 * @throws Error when the text is not valid JSON, is not a JSON object, or a
 *   field is missing or has the wrong type.
 */
export function parseRefinementResponse(raw: string): {
  refined_body: string;
  changes_made: string;
  confidence: number;
} {
  const text = stripMarkdownFences(raw);

  let payload: unknown;
  try {
    payload = JSON.parse(text);
  } catch {
    // Include a short excerpt so the offending reply can be identified.
    throw new Error(`Failed to parse LLM response as JSON: ${text.slice(0, 200)}`);
  }

  if (payload === null || typeof payload !== "object") {
    throw new Error("LLM response is not a JSON object");
  }

  const { refined_body, changes_made, confidence } = payload as Record<string, unknown>;

  if (typeof refined_body !== "string") {
    throw new Error("Missing or invalid 'refined_body' field in LLM response");
  }
  if (typeof changes_made !== "string") {
    throw new Error("Missing or invalid 'changes_made' field in LLM response");
  }
  if (typeof confidence !== "number") {
    throw new Error("Missing or invalid 'confidence' field in LLM response");
  }

  const upperBounded = Math.min(1.0, confidence);
  return {
    refined_body,
    changes_made,
    confidence: Math.max(0.0, upperBounded),
  };
}
|
|
110
|
+
|
|
111
|
+
// ---------------------------------------------------------------------------
|
|
112
|
+
// Refinement function
|
|
113
|
+
// ---------------------------------------------------------------------------
|
|
114
|
+
|
|
115
|
+
/**
 * Ask the refiner LLM to revise a body proposal that failed validation.
 *
 * The returned proposal copies every field of `proposal` and then overrides
 * the id (original id plus a `-refined-<timestamp>` suffix), the proposed
 * body, the rationale (original rationale with the refinement summary
 * appended), the confidence, the creation time and the status.
 *
 * @param proposal - Proposal whose `proposed_body` should be revised.
 * @param validationResult - Validation outcome; its failed gates and
 *   `regressions` are included in the prompt.
 * @param agent - Forwarded to `callLlm`.
 * @param modelFlag - Optional value forwarded to `callLlm`.
 * @returns A new pending proposal; the input object is not modified here.
 */
export async function refineBodyProposal(
  proposal: BodyEvolutionProposal,
  validationResult: BodyValidationResult,
  agent: string,
  modelFlag?: string,
): Promise<BodyEvolutionProposal> {
  const { proposed_body, skill_name, proposal_id, rationale } = proposal;

  const userPrompt = buildRefinementPrompt(
    proposed_body,
    validationResult,
    skill_name,
    validationResult.regressions,
  );
  const llmReply = await callLlm(BODY_REFINER_SYSTEM, userPrompt, agent, modelFlag);
  const refinement = parseRefinementResponse(llmReply);

  const refined: BodyEvolutionProposal = {
    ...proposal,
    proposal_id: `${proposal_id}-refined-${Date.now()}`,
    proposed_body: refinement.refined_body,
    rationale: `${rationale}\n\nRefinement: ${refinement.changes_made}`,
    confidence: refinement.confidence,
    created_at: new Date().toISOString(),
    status: "pending",
  };
  return refined;
}
|
|
@@ -11,6 +11,7 @@ import { existsSync, readdirSync, readFileSync, unlinkSync, writeFileSync } from
|
|
|
11
11
|
import { basename, dirname, join } from "node:path";
|
|
12
12
|
import { parseArgs } from "node:util";
|
|
13
13
|
|
|
14
|
+
import { updateContextAfterRollback } from "../memory/writer.js";
|
|
14
15
|
import type { EvolutionAuditEntry } from "../types.js";
|
|
15
16
|
import { appendAuditEntry, getLastDeployedProposal, readAuditTrail } from "./audit.js";
|
|
16
17
|
import { replaceDescription } from "./deploy-proposal.js";
|
|
@@ -153,11 +154,19 @@ export async function rollback(options: RollbackOptions): Promise<RollbackResult
|
|
|
153
154
|
};
|
|
154
155
|
appendAuditEntry(auditEntry, logPath);
|
|
155
156
|
|
|
156
|
-
|
|
157
|
+
const backupResult: RollbackResult = {
|
|
157
158
|
rolledBack: true,
|
|
158
159
|
restoredDescription: originalContent,
|
|
159
160
|
reason: "Restored from backup file",
|
|
160
161
|
};
|
|
162
|
+
|
|
163
|
+
try {
|
|
164
|
+
updateContextAfterRollback(skillName, backupResult);
|
|
165
|
+
} catch {
|
|
166
|
+
// Memory writes should never fail the main operation
|
|
167
|
+
}
|
|
168
|
+
|
|
169
|
+
return backupResult;
|
|
161
170
|
}
|
|
162
171
|
|
|
163
172
|
// Strategy 2: Restore from audit trail's created entry (description only)
|
|
@@ -177,11 +186,19 @@ export async function rollback(options: RollbackOptions): Promise<RollbackResult
|
|
|
177
186
|
};
|
|
178
187
|
appendAuditEntry(auditEntry, logPath);
|
|
179
188
|
|
|
180
|
-
|
|
189
|
+
const auditResult: RollbackResult = {
|
|
181
190
|
rolledBack: true,
|
|
182
191
|
restoredDescription: originalFromAudit,
|
|
183
192
|
reason: "Restored from audit trail",
|
|
184
193
|
};
|
|
194
|
+
|
|
195
|
+
try {
|
|
196
|
+
updateContextAfterRollback(skillName, auditResult);
|
|
197
|
+
} catch {
|
|
198
|
+
// Memory writes should never fail the main operation
|
|
199
|
+
}
|
|
200
|
+
|
|
201
|
+
return auditResult;
|
|
185
202
|
}
|
|
186
203
|
|
|
187
204
|
// No restoration source available
|
|
@@ -0,0 +1,254 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* validate-body.ts
|
|
3
|
+
*
|
|
4
|
+
* 3-gate validation for full body evolution proposals:
|
|
5
|
+
* Gate 1 (structural): Pure code — YAML frontmatter, # Title, ## Workflow Routing preserved
|
|
6
|
+
* Gate 2 (trigger accuracy): Student model YES/NO per eval entry
|
|
7
|
+
* Gate 3 (quality): Student model rates body clarity/completeness 0.0-1.0
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
import type { BodyEvolutionProposal, BodyValidationResult, EvalEntry } from "../types.js";
|
|
11
|
+
import { callLlm, stripMarkdownFences } from "../utils/llm-call.js";
|
|
12
|
+
import { buildTriggerCheckPrompt, parseTriggerResponse } from "../utils/trigger-check.js";
|
|
13
|
+
|
|
14
|
+
// ---------------------------------------------------------------------------
|
|
15
|
+
// Gate 1: Structural validation (pure code, no LLM)
|
|
16
|
+
// ---------------------------------------------------------------------------
|
|
17
|
+
|
|
18
|
+
/**
 * Gate 1: check that a proposed body keeps the required structure.
 *
 * The body passes when all of the following hold:
 *   - it is not empty or whitespace-only;
 *   - it contains a `## Workflow Routing` heading at the start of a line
 *     (so `### Workflow Routing` or a mention inside a sentence does not count);
 *   - between that heading and the next `## ` heading (or end of text) there
 *     is a pipe-delimited table made of a header row, a separator row such as
 *     `| --- | --- |`, and at least one data row.
 *
 * Earlier behavior accepted any two pipe-delimited lines, which let through
 * tables with no data rows or no separator even though the failure message
 * already described the stricter requirement.
 *
 * @param proposedBody - Markdown body of the proposal.
 * @returns `valid` plus a human-readable `reason` for the outcome.
 */
export function validateBodyStructure(proposedBody: string): { valid: boolean; reason: string } {
  if (!proposedBody || proposedBody.trim().length === 0) {
    return { valid: false, reason: "Proposed body is empty" };
  }

  // Line-anchored so deeper headings and inline mentions are not mistaken for the section.
  const heading = /^[ \t]*## Workflow Routing/m.exec(proposedBody);
  if (!heading) {
    return { valid: false, reason: "Missing required '## Workflow Routing' section" };
  }

  // The section runs until the next level-2 heading or the end of the text.
  const afterHeading = proposedBody.slice(heading.index + heading[0].length);
  const nextSectionAt = afterHeading.search(/\n## /);
  const routingContent = nextSectionAt === -1 ? afterHeading : afterHeading.slice(0, nextSectionAt);

  const tableLines = routingContent
    .split("\n")
    .map((line) => line.trim())
    .filter((line) => line.startsWith("|") && line.endsWith("|"));

  // Cells of a separator row contain only dashes with optional alignment colons.
  const separatorRow = /^\|(\s*:?-+:?\s*\|)+$/;
  const hasHeaderAndSeparator = tableLines.length >= 2 && separatorRow.test(tableLines[1]);
  const hasDataRow = tableLines.length >= 3;

  if (!hasHeaderAndSeparator || !hasDataRow) {
    return {
      valid: false,
      reason:
        "Workflow Routing section lacks a valid markdown table (need header + separator + rows)",
    };
  }

  return { valid: true, reason: "Structural validation passed" };
}
|
|
58
|
+
|
|
59
|
+
// ---------------------------------------------------------------------------
|
|
60
|
+
// Gate 2: Trigger accuracy (student model YES/NO)
|
|
61
|
+
// ---------------------------------------------------------------------------
|
|
62
|
+
|
|
63
|
+
/**
 * Gate 2: compare trigger accuracy of the original and the proposed body.
 *
 * For every eval entry the model is asked twice, first with the original
 * body and then with the proposed body, whether the query triggers the skill.
 * An entry passes when the parsed answer agrees with `should_trigger`. Calls
 * are awaited one after another.
 *
 * @param originalBody - Body used for the "before" measurement.
 * @param proposedBody - Body used for the "after" measurement.
 * @param evalSet - Entries to check; an empty set short-circuits to all zeros
 *   with `improved` false.
 * @param agent - Forwarded to `callLlm`.
 * @param modelFlag - Optional value forwarded to `callLlm`.
 * @returns Pass rates in [0, 1], whether the "after" rate is strictly higher,
 *   and the queries that passed before but fail after.
 */
export async function validateBodyTriggerAccuracy(
  originalBody: string,
  proposedBody: string,
  evalSet: EvalEntry[],
  agent: string,
  modelFlag?: string,
): Promise<{
  before_pass_rate: number;
  after_pass_rate: number;
  improved: boolean;
  regressions: string[];
}> {
  if (evalSet.length === 0) {
    return { before_pass_rate: 0, after_pass_rate: 0, improved: false, regressions: [] };
  }

  const systemPrompt = "You are an evaluation assistant. Answer only YES or NO.";

  // True when the model's verdict for `body` agrees with the entry's expectation.
  const verdictMatches = async (body: string, entry: EvalEntry): Promise<boolean> => {
    const question = buildTriggerCheckPrompt(body, entry.query);
    const answer = await callLlm(systemPrompt, question, agent, modelFlag);
    const triggered = parseTriggerResponse(answer);
    return Boolean(entry.should_trigger) === Boolean(triggered);
  };

  const tally = { before: 0, after: 0 };
  const regressions: string[] = [];

  for (const entry of evalSet) {
    const passedBefore = await verdictMatches(originalBody, entry);
    const passedAfter = await verdictMatches(proposedBody, entry);

    if (passedBefore) tally.before += 1;
    if (passedAfter) tally.after += 1;

    if (passedBefore && !passedAfter) {
      regressions.push(entry.query);
    }
  }

  const before_pass_rate = tally.before / evalSet.length;
  const after_pass_rate = tally.after / evalSet.length;

  return {
    before_pass_rate,
    after_pass_rate,
    improved: after_pass_rate > before_pass_rate,
    regressions,
  };
}
|
|
123
|
+
|
|
124
|
+
// ---------------------------------------------------------------------------
|
|
125
|
+
// Gate 3: Quality assessment (student model 0.0-1.0)
|
|
126
|
+
// ---------------------------------------------------------------------------
|
|
127
|
+
|
|
128
|
+
/**
 * System prompt for the model that scores a proposed skill body.
 *
 * It asks for a JSON object with a numeric `score` (0.0-1.0) and a textual
 * `reason`. The prompt is assembled from individual lines so that every line
 * break in the final text is explicit.
 */
const QUALITY_ASSESSMENT_SYSTEM = [
  "You are a skill document quality assessor for an AI agent system.",
  "",
  "Rate the quality of the provided skill document body on these dimensions:",
  "- Clarity: Is the description clear and unambiguous?",
  "- Completeness: Does it cover the expected use cases?",
  "- Structure: Is it well-organized with proper sections?",
  "- Routing accuracy: Does the routing table seem comprehensive?",
  "",
  "Output ONLY valid JSON with exactly these fields:",
  '- "score" (number): Overall quality score 0.0-1.0',
  '- "reason" (string): Brief explanation of the score',
  "",
  "Do NOT include any text outside the JSON object.",
].join("\n");
|
|
142
|
+
|
|
143
|
+
/**
 * Gate 3: ask the model to score the quality of a proposed body.
 *
 * Unlike the proposal parsers, this function never throws on a malformed
 * reply: if the reply is not valid JSON, is not an object, or lacks a numeric
 * `score`, the score falls back to 0.5. A numeric score is clamped into
 * [0, 1]. A missing or non-string `reason` becomes "No reason provided".
 *
 * @param proposedBody - Body text to assess, inserted verbatim in the prompt.
 * @param skillName - Skill name shown in the prompt header.
 * @param agent - Forwarded to `callLlm`.
 * @param modelFlag - Optional value forwarded to `callLlm`.
 * @returns The score and the accompanying explanation.
 */
export async function assessBodyQuality(
  proposedBody: string,
  skillName: string,
  agent: string,
  modelFlag?: string,
): Promise<{ score: number; reason: string }> {
  const userPrompt = [
    `Skill Name: ${skillName}`,
    "",
    "Proposed Skill Body:",
    `${proposedBody}`,
    "",
    'Rate the quality of this skill document body. Output ONLY a JSON object with "score" (0.0-1.0) and "reason" fields.',
  ].join("\n");

  const llmReply = await callLlm(QUALITY_ASSESSMENT_SYSTEM, userPrompt, agent, modelFlag);
  const text = stripMarkdownFences(llmReply);

  let payload: unknown;
  try {
    payload = JSON.parse(text);
  } catch {
    // An unreadable reply yields the fallback score instead of an error.
    return { score: 0.5, reason: "Failed to parse quality assessment response" };
  }

  if (payload === null || typeof payload !== "object") {
    return { score: 0.5, reason: "Quality assessment response is not a JSON object" };
  }

  const { score: rawScore, reason: rawReason } = payload as Record<string, unknown>;

  let score = 0.5;
  if (typeof rawScore === "number") {
    score = Math.max(0.0, Math.min(1.0, rawScore));
  }

  let reason = "No reason provided";
  if (typeof rawReason === "string") {
    reason = rawReason;
  }

  return { score, reason };
}
|
|
178
|
+
|
|
179
|
+
// ---------------------------------------------------------------------------
|
|
180
|
+
// Full 3-gate body validation
|
|
181
|
+
// ---------------------------------------------------------------------------
|
|
182
|
+
|
|
183
|
+
/**
 * Default pass mark for Gate 3: a quality score at or above this value
 * passes. Callers of `validateBodyProposal` may supply a different threshold.
 */
const QUALITY_THRESHOLD = 0.6;
|
|
185
|
+
|
|
186
|
+
/**
 * Run a body proposal through the three validation gates.
 *
 *   1. structural       — `validateBodyStructure`; a failure returns at once
 *                         with `gates_passed` 0 and no LLM calls.
 *   2. trigger_accuracy — `validateBodyTriggerAccuracy`; passes only when the
 *                         pass rate strictly improves.
 *   3. quality          — `assessBodyQuality`; passes when the score is at
 *                         least `qualityThreshold`.
 *
 * Gate 3 is still evaluated when gate 2 fails, so the result lists every
 * failure that was found.
 *
 * @param proposal - Proposal providing `original_body`, `proposed_body`,
 *   `skill_name` and `proposal_id`.
 * @param evalSet - Eval entries used for the trigger-accuracy gate.
 * @param agent - Forwarded to the LLM-backed gates.
 * @param modelFlag - Optional value forwarded to the LLM-backed gates.
 * @param qualityThreshold - Minimum quality score; defaults to `QUALITY_THRESHOLD`.
 * @returns Per-gate results, the count of passed gates, `improved` (true only
 *   when all three gates pass) and the regression queries from gate 2.
 */
export async function validateBodyProposal(
  proposal: BodyEvolutionProposal,
  evalSet: EvalEntry[],
  agent: string,
  modelFlag?: string,
  qualityThreshold = QUALITY_THRESHOLD,
): Promise<BodyValidationResult> {
  const gates: Array<{ gate: string; passed: boolean; reason: string }> = [];
  const { proposal_id, original_body, proposed_body, skill_name } = proposal;

  // Gate 1: structural validation (pure code)
  const structure = validateBodyStructure(proposed_body);
  gates.push({ gate: "structural", passed: structure.valid, reason: structure.reason });

  if (!structure.valid) {
    return {
      proposal_id,
      gates_passed: 0,
      gates_total: 3,
      gate_results: gates,
      improved: false,
      regressions: [],
    };
  }

  // Gate 2: trigger accuracy (student model)
  const accuracy = await validateBodyTriggerAccuracy(
    original_body,
    proposed_body,
    evalSet,
    agent,
    modelFlag,
  );
  const asPercent = (rate: number): string => `${(rate * 100).toFixed(1)}%`;
  const verdict = accuracy.improved ? "Improved" : "Not improved";
  gates.push({
    gate: "trigger_accuracy",
    passed: accuracy.improved,
    reason: `${verdict}: ${asPercent(accuracy.before_pass_rate)} -> ${asPercent(accuracy.after_pass_rate)}`,
  });

  // Gate 3: quality assessment (student model)
  const { score, reason: scoreReason } = await assessBodyQuality(
    proposed_body,
    skill_name,
    agent,
    modelFlag,
  );
  gates.push({
    gate: "quality",
    passed: score >= qualityThreshold,
    reason: `Quality score: ${score.toFixed(2)} (threshold: ${qualityThreshold}) — ${scoreReason}`,
  });

  let passedCount = 0;
  for (const gate of gates) {
    if (gate.passed) passedCount += 1;
  }

  return {
    proposal_id,
    gates_passed: passedCount,
    gates_total: 3,
    gate_results: gates,
    improved: passedCount === 3,
    regressions: accuracy.regressions,
  };
}
|