selftune 0.1.4 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/agents/diagnosis-analyst.md +156 -0
- package/.claude/agents/evolution-reviewer.md +180 -0
- package/.claude/agents/integration-guide.md +212 -0
- package/.claude/agents/pattern-analyst.md +160 -0
- package/CHANGELOG.md +46 -1
- package/README.md +105 -257
- package/apps/local-dashboard/dist/assets/geist-cyrillic-wght-normal-CHSlOQsW.woff2 +0 -0
- package/apps/local-dashboard/dist/assets/geist-latin-ext-wght-normal-DMtmJ5ZE.woff2 +0 -0
- package/apps/local-dashboard/dist/assets/geist-latin-wght-normal-Dm3htQBi.woff2 +0 -0
- package/apps/local-dashboard/dist/assets/index-C4EOTFZ2.js +15 -0
- package/apps/local-dashboard/dist/assets/index-bl-Webyd.css +1 -0
- package/apps/local-dashboard/dist/assets/vendor-react-U7zYD9Rg.js +60 -0
- package/apps/local-dashboard/dist/assets/vendor-table-B7VF2Ipl.js +26 -0
- package/apps/local-dashboard/dist/assets/vendor-ui-D7_zX_qy.js +346 -0
- package/apps/local-dashboard/dist/favicon.png +0 -0
- package/apps/local-dashboard/dist/index.html +17 -0
- package/apps/local-dashboard/dist/logo.png +0 -0
- package/apps/local-dashboard/dist/logo.svg +9 -0
- package/assets/BeforeAfter.gif +0 -0
- package/assets/FeedbackLoop.gif +0 -0
- package/assets/logo.svg +9 -0
- package/assets/skill-health-badge.svg +20 -0
- package/cli/selftune/activation-rules.ts +171 -0
- package/cli/selftune/badge/badge-data.ts +108 -0
- package/cli/selftune/badge/badge-svg.ts +212 -0
- package/cli/selftune/badge/badge.ts +99 -0
- package/cli/selftune/canonical-export.ts +183 -0
- package/cli/selftune/constants.ts +103 -1
- package/cli/selftune/contribute/bundle.ts +314 -0
- package/cli/selftune/contribute/contribute.ts +214 -0
- package/cli/selftune/contribute/sanitize.ts +162 -0
- package/cli/selftune/cron/setup.ts +266 -0
- package/cli/selftune/dashboard-contract.ts +202 -0
- package/cli/selftune/dashboard-server.ts +1049 -0
- package/cli/selftune/dashboard.ts +43 -156
- package/cli/selftune/eval/baseline.ts +248 -0
- package/cli/selftune/eval/composability-v2.ts +273 -0
- package/cli/selftune/eval/composability.ts +117 -0
- package/cli/selftune/eval/generate-unit-tests.ts +143 -0
- package/cli/selftune/eval/hooks-to-evals.ts +101 -16
- package/cli/selftune/eval/import-skillsbench.ts +221 -0
- package/cli/selftune/eval/synthetic-evals.ts +172 -0
- package/cli/selftune/eval/unit-test-cli.ts +152 -0
- package/cli/selftune/eval/unit-test.ts +196 -0
- package/cli/selftune/evolution/deploy-proposal.ts +142 -1
- package/cli/selftune/evolution/evidence.ts +26 -0
- package/cli/selftune/evolution/evolve-body.ts +586 -0
- package/cli/selftune/evolution/evolve.ts +825 -116
- package/cli/selftune/evolution/extract-patterns.ts +105 -16
- package/cli/selftune/evolution/pareto.ts +314 -0
- package/cli/selftune/evolution/propose-body.ts +171 -0
- package/cli/selftune/evolution/propose-description.ts +100 -2
- package/cli/selftune/evolution/propose-routing.ts +166 -0
- package/cli/selftune/evolution/refine-body.ts +141 -0
- package/cli/selftune/evolution/rollback.ts +21 -4
- package/cli/selftune/evolution/validate-body.ts +254 -0
- package/cli/selftune/evolution/validate-proposal.ts +257 -35
- package/cli/selftune/evolution/validate-routing.ts +177 -0
- package/cli/selftune/grading/auto-grade.ts +200 -0
- package/cli/selftune/grading/grade-session.ts +513 -42
- package/cli/selftune/grading/pre-gates.ts +104 -0
- package/cli/selftune/grading/results.ts +42 -0
- package/cli/selftune/hooks/auto-activate.ts +185 -0
- package/cli/selftune/hooks/evolution-guard.ts +165 -0
- package/cli/selftune/hooks/prompt-log.ts +172 -2
- package/cli/selftune/hooks/session-stop.ts +123 -3
- package/cli/selftune/hooks/skill-change-guard.ts +112 -0
- package/cli/selftune/hooks/skill-eval.ts +119 -3
- package/cli/selftune/index.ts +415 -48
- package/cli/selftune/ingestors/claude-replay.ts +377 -0
- package/cli/selftune/ingestors/codex-rollout.ts +345 -46
- package/cli/selftune/ingestors/codex-wrapper.ts +207 -39
- package/cli/selftune/ingestors/openclaw-ingest.ts +573 -0
- package/cli/selftune/ingestors/opencode-ingest.ts +193 -17
- package/cli/selftune/init.ts +376 -16
- package/cli/selftune/last.ts +14 -5
- package/cli/selftune/localdb/db.ts +63 -0
- package/cli/selftune/localdb/materialize.ts +428 -0
- package/cli/selftune/localdb/queries.ts +376 -0
- package/cli/selftune/localdb/schema.ts +204 -0
- package/cli/selftune/memory/writer.ts +447 -0
- package/cli/selftune/monitoring/watch.ts +90 -16
- package/cli/selftune/normalization.ts +682 -0
- package/cli/selftune/observability.ts +19 -44
- package/cli/selftune/orchestrate.ts +1073 -0
- package/cli/selftune/quickstart.ts +203 -0
- package/cli/selftune/repair/skill-usage.ts +576 -0
- package/cli/selftune/schedule.ts +561 -0
- package/cli/selftune/status.ts +59 -33
- package/cli/selftune/sync.ts +627 -0
- package/cli/selftune/types.ts +525 -5
- package/cli/selftune/utils/canonical-log.ts +45 -0
- package/cli/selftune/utils/frontmatter.ts +217 -0
- package/cli/selftune/utils/hooks.ts +41 -0
- package/cli/selftune/utils/html.ts +27 -0
- package/cli/selftune/utils/llm-call.ts +103 -19
- package/cli/selftune/utils/math.ts +10 -0
- package/cli/selftune/utils/query-filter.ts +139 -0
- package/cli/selftune/utils/skill-discovery.ts +340 -0
- package/cli/selftune/utils/skill-log.ts +68 -0
- package/cli/selftune/utils/skill-usage-confidence.ts +18 -0
- package/cli/selftune/utils/transcript.ts +307 -26
- package/cli/selftune/utils/trigger-check.ts +89 -0
- package/cli/selftune/utils/tui.ts +156 -0
- package/cli/selftune/workflows/discover.ts +254 -0
- package/cli/selftune/workflows/skill-md-writer.ts +288 -0
- package/cli/selftune/workflows/workflows.ts +188 -0
- package/package.json +28 -11
- package/packages/telemetry-contract/README.md +11 -0
- package/packages/telemetry-contract/fixtures/golden.json +87 -0
- package/packages/telemetry-contract/fixtures/golden.test.ts +42 -0
- package/packages/telemetry-contract/index.ts +1 -0
- package/packages/telemetry-contract/package.json +19 -0
- package/packages/telemetry-contract/src/index.ts +2 -0
- package/packages/telemetry-contract/src/types.ts +163 -0
- package/packages/telemetry-contract/src/validators.ts +109 -0
- package/skill/SKILL.md +180 -33
- package/skill/Workflows/AutoActivation.md +145 -0
- package/skill/Workflows/Badge.md +124 -0
- package/skill/Workflows/Baseline.md +144 -0
- package/skill/Workflows/Composability.md +107 -0
- package/skill/Workflows/Contribute.md +94 -0
- package/skill/Workflows/Cron.md +132 -0
- package/skill/Workflows/Dashboard.md +214 -0
- package/skill/Workflows/Doctor.md +63 -14
- package/skill/Workflows/Evals.md +110 -18
- package/skill/Workflows/EvolutionMemory.md +154 -0
- package/skill/Workflows/Evolve.md +181 -21
- package/skill/Workflows/EvolveBody.md +159 -0
- package/skill/Workflows/Grade.md +36 -31
- package/skill/Workflows/ImportSkillsBench.md +117 -0
- package/skill/Workflows/Ingest.md +142 -21
- package/skill/Workflows/Initialize.md +91 -23
- package/skill/Workflows/Orchestrate.md +139 -0
- package/skill/Workflows/Replay.md +91 -0
- package/skill/Workflows/Rollback.md +23 -4
- package/skill/Workflows/Schedule.md +61 -0
- package/skill/Workflows/Sync.md +88 -0
- package/skill/Workflows/UnitTest.md +150 -0
- package/skill/Workflows/Watch.md +33 -1
- package/skill/Workflows/Workflows.md +129 -0
- package/skill/assets/activation-rules-default.json +26 -0
- package/skill/assets/multi-skill-settings.json +63 -0
- package/skill/assets/single-skill-settings.json +57 -0
- package/skill/references/invocation-taxonomy.md +2 -2
- package/skill/references/logs.md +164 -2
- package/skill/references/setup-patterns.md +65 -0
- package/skill/references/version-history.md +40 -0
- package/skill/settings_snippet.json +23 -0
- package/templates/activation-rules-default.json +27 -0
- package/templates/multi-skill-settings.json +64 -0
- package/templates/single-skill-settings.json +58 -0
- package/dashboard/index.html +0 -1119
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* CLI entrypoint for skill unit tests.
|
|
3
|
+
*
|
|
4
|
+
* Usage:
|
|
5
|
+
* selftune eval unit-test --skill <name> --tests <path> [--run-agent] [--generate]
|
|
6
|
+
*
|
|
7
|
+
* --skill <name> Skill name (required)
|
|
8
|
+
* --tests <path> Path to unit test JSON file (default: ~/.selftune/unit-tests/<skill>.json)
|
|
9
|
+
* --run-agent Actually run tests through an agent (otherwise dry-run with static checks)
|
|
10
|
+
* --generate Generate tests from skill content using LLM (requires agent)
|
|
11
|
+
* --skill-path <p> Path to skill file (used with --generate for content)
|
|
12
|
+
* --eval-set <p> Path to eval set JSON (used with --generate for failure context)
|
|
13
|
+
* --model <m> Model flag for LLM calls
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
|
|
17
|
+
import { join } from "node:path";
|
|
18
|
+
import { parseArgs } from "node:util";
|
|
19
|
+
|
|
20
|
+
import { SELFTUNE_CONFIG_DIR } from "../constants.js";
|
|
21
|
+
import type { EvalEntry } from "../types.js";
|
|
22
|
+
import { callLlm, detectAgent } from "../utils/llm-call.js";
|
|
23
|
+
import { generateUnitTests } from "./generate-unit-tests.js";
|
|
24
|
+
import type { AgentRunner } from "./unit-test.js";
|
|
25
|
+
import { loadUnitTests, runUnitTestSuite } from "./unit-test.js";
|
|
26
|
+
|
|
27
|
+
// ---------------------------------------------------------------------------
|
|
28
|
+
// CLI
|
|
29
|
+
// ---------------------------------------------------------------------------
|
|
30
|
+
|
|
31
|
+
/**
 * CLI entry point for `selftune eval unit-test`.
 *
 * Two modes, selected by flags:
 *  - `--generate`: ask the detected agent CLI to produce unit tests for the
 *    skill and write them as JSON to the tests path, then return.
 *  - otherwise: load the tests from the tests path and run them, either through
 *    the detected agent (`--run-agent`) or in dry-run mode where the query
 *    itself is used as the transcript.
 *
 * The tests path is `--tests` when given, else
 * `<SELFTUNE_CONFIG_DIR>/unit-tests/<skill>.json`.
 *
 * Exits the process with code 1 on usage errors, when no agent is available,
 * when no tests are generated/found, or when any test fails; exits 0 when all
 * loaded tests pass.
 */
export async function cliMain(): Promise<void> {
  const { values } = parseArgs({
    options: {
      skill: { type: "string" },
      tests: { type: "string" },
      "run-agent": { type: "boolean", default: false },
      generate: { type: "boolean", default: false },
      "skill-path": { type: "string" },
      "eval-set": { type: "string" },
      model: { type: "string" },
    },
    strict: true,
  });

  if (!values.skill) {
    console.error("[ERROR] --skill <name> is required.");
    process.exit(1);
  }

  const skillName = values.skill;
  const unitTestDir = join(SELFTUNE_CONFIG_DIR, "unit-tests");
  const defaultTestsPath = join(unitTestDir, `${skillName}.json`);
  const testsPath = values.tests ?? defaultTestsPath;

  // --generate: create tests from skill content
  if (values.generate) {
    const agent = detectAgent();
    if (!agent) {
      console.error("[ERROR] No agent CLI found (claude/codex/opencode). Cannot generate tests.");
      process.exit(1);
    }

    // Fall back to just the skill name when no readable skill file is supplied.
    let skillContent = `Skill: ${skillName}`;
    if (values["skill-path"] && existsSync(values["skill-path"])) {
      skillContent = readFileSync(values["skill-path"], "utf-8");
    } else if (values["skill-path"]) {
      console.warn(`[WARN] Skill path not found: ${values["skill-path"]}. Using skill name only.`);
    }

    let evalFailures: EvalEntry[] = [];
    if (values["eval-set"] && existsSync(values["eval-set"])) {
      try {
        const raw = readFileSync(values["eval-set"], "utf-8");
        const entries: EvalEntry[] = JSON.parse(raw);
        // Keeps the entries flagged `should_trigger`.
        // NOTE(review): the variable name suggests "failures" — confirm this filter is the intended one.
        evalFailures = entries.filter((e) => e.should_trigger);
      } catch {
        console.warn("[WARN] Failed to parse eval set. Proceeding without failure context.");
      }
    } else if (values["eval-set"]) {
      // Mirror the --skill-path handling: a bad path should not be ignored silently.
      console.warn(
        `[WARN] Eval set not found: ${values["eval-set"]}. Proceeding without failure context.`,
      );
    }

    const modelFlag = values.model;
    const llmCaller = (systemPrompt: string, userPrompt: string) =>
      callLlm(systemPrompt, userPrompt, agent, modelFlag);

    console.log(`Generating unit tests for skill '${skillName}'...`);
    const tests = await generateUnitTests(skillName, skillContent, evalFailures, llmCaller);

    if (tests.length === 0) {
      console.error("[ERROR] No tests generated. Check agent/LLM availability.");
      process.exit(1);
    }

    // Ensure the directory that will actually receive the file exists. With a
    // custom --tests path this is not necessarily the default unit-tests dir;
    // join(path, "..") normalizes to the parent directory of the file.
    mkdirSync(join(testsPath, ".."), { recursive: true });
    writeFileSync(testsPath, JSON.stringify(tests, null, 2), "utf-8");
    console.log(`Generated ${tests.length} unit tests -> ${testsPath}`);
    return;
  }

  // Load and run tests
  const tests = loadUnitTests(testsPath);
  if (tests.length === 0) {
    console.error(`[ERROR] No tests found at ${testsPath}`);
    console.error("  Use --generate to create tests, or provide --tests <path>.");
    process.exit(1);
  }

  console.log(`Loaded ${tests.length} unit tests for skill '${skillName}'`);

  let agentRunner: AgentRunner;

  if (values["run-agent"]) {
    const agent = detectAgent();
    if (!agent) {
      console.error("[ERROR] No agent CLI found. Cannot run agent-based tests.");
      process.exit(1);
    }
    const modelFlag = values.model;
    agentRunner = async (query: string): Promise<string> => {
      return callLlm("You are a helpful assistant.", query, agent, modelFlag);
    };
  } else {
    // Dry-run: use query as transcript (only static assertions like contains work meaningfully)
    console.log("(dry-run mode — use --run-agent for full agent execution)\n");
    agentRunner = async (query: string): Promise<string> => query;
  }

  const suite = await runUnitTestSuite(tests, skillName, agentRunner);

  // Print results
  console.log(`\nResults for '${suite.skill_name}':`);
  console.log(`  Total: ${suite.total}  Passed: ${suite.passed}  Failed: ${suite.failed}`);
  console.log(`  Pass rate: ${(suite.pass_rate * 100).toFixed(1)}%`);

  if (suite.failed > 0) {
    console.log("\nFailed tests:");
    for (const r of suite.results.filter((r) => !r.passed)) {
      console.log(`  [FAIL] ${r.test_id} (${r.duration_ms}ms)`);
      if (r.error) {
        console.log(`    Error: ${r.error}`);
      }
      for (const a of r.assertion_results.filter((a) => !a.passed)) {
        console.log(
          `    - ${a.assertion.type}: expected "${a.assertion.value}", got "${a.actual}"`,
        );
      }
    }
  }

  // Machine-readable dump of the full suite result, then a CI-friendly exit code.
  console.log(`\n${JSON.stringify(suite, null, 2)}`);
  process.exit(suite.failed > 0 ? 1 : 0);
}
|
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Skill unit test runner.
|
|
3
|
+
*
|
|
4
|
+
* Loads, runs, and reports on skill-level unit tests.
|
|
5
|
+
* Tests are stored as JSON arrays of SkillUnitTest objects.
|
|
6
|
+
*
|
|
7
|
+
* Assertion types:
|
|
8
|
+
* - contains / not_contains: check transcript for substring
|
|
9
|
+
* - regex: check transcript against a regex pattern
|
|
10
|
+
* - tool_called / tool_not_called: check transcript for tool usage
|
|
11
|
+
* - json_path: check key=value in parsed JSON from transcript
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
import { existsSync, readFileSync } from "node:fs";
|
|
15
|
+
import type {
|
|
16
|
+
SkillAssertion,
|
|
17
|
+
SkillUnitTest,
|
|
18
|
+
UnitTestResult,
|
|
19
|
+
UnitTestSuiteResult,
|
|
20
|
+
} from "../types.js";
|
|
21
|
+
|
|
22
|
+
// ---------------------------------------------------------------------------
|
|
23
|
+
// Assertion checker (deterministic, no agent needed)
|
|
24
|
+
// ---------------------------------------------------------------------------
|
|
25
|
+
|
|
26
|
+
/**
 * Check a single assertion against a transcript string.
 *
 * Supported assertion types:
 *  - `contains` / `not_contains`: substring presence / absence
 *  - `regex`: the pattern in `assertion.value` must match somewhere
 *  - `tool_called` / `tool_not_called`: substring presence / absence of the tool name
 *  - `json_path`: `key=value`; the transcript (or the first brace-delimited
 *    fragment without nested braces found in it) is parsed as JSON and the
 *    stringified top-level `key` is compared to `value`
 *
 * Never throws for a malformed regex pattern: that is reported as a failed
 * assertion so one bad pattern does not abort the other assertions of a test.
 *
 * @param assertion  The assertion to evaluate (`type` and `value` are read).
 * @param transcript Text to evaluate the assertion against.
 * @returns `passed` plus a short human-readable `actual` describing what was found.
 */
export function checkAssertion(
  assertion: SkillAssertion,
  transcript: string,
): { passed: boolean; actual?: string } {
  switch (assertion.type) {
    case "contains": {
      const found = transcript.includes(assertion.value);
      return { passed: found, actual: found ? assertion.value : "(not found)" };
    }

    case "not_contains": {
      const found = transcript.includes(assertion.value);
      return { passed: !found, actual: found ? `found: ${assertion.value}` : "(absent)" };
    }

    case "regex": {
      let re: RegExp;
      try {
        re = new RegExp(assertion.value);
      } catch (err) {
        // An invalid pattern is a problem with the test definition, not a crash.
        const reason = err instanceof Error ? err.message : String(err);
        return { passed: false, actual: `(invalid regex: ${reason})` };
      }
      const match = re.exec(transcript);
      return {
        passed: match !== null,
        actual: match ? match[0] : "(no match)",
      };
    }

    case "tool_called": {
      const found = transcript.includes(assertion.value);
      return { passed: found, actual: found ? assertion.value : "(tool not found)" };
    }

    case "tool_not_called": {
      const found = transcript.includes(assertion.value);
      return { passed: !found, actual: found ? `found: ${assertion.value}` : "(absent)" };
    }

    case "json_path": {
      // Simple key=value check: "status=ok" looks for {"status":"ok"} in transcript
      const eqIdx = assertion.value.indexOf("=");
      if (eqIdx < 0) {
        return { passed: false, actual: "invalid json_path format (expected key=value)" };
      }
      const key = assertion.value.slice(0, eqIdx);
      const expected = assertion.value.slice(eqIdx + 1);
      try {
        // First attempt: the whole transcript is a JSON document.
        const parsed = JSON.parse(transcript);
        const actual = String(parsed[key] ?? "");
        return { passed: actual === expected, actual };
      } catch {
        // Fallback: pull the first flat `{...}` fragment out of surrounding text.
        const jsonMatch = transcript.match(/\{[^}]+\}/);
        if (jsonMatch) {
          try {
            const parsed = JSON.parse(jsonMatch[0]);
            const actual = String(parsed[key] ?? "");
            return { passed: actual === expected, actual };
          } catch {
            return { passed: false, actual: "(json parse error)" };
          }
        }
        return { passed: false, actual: "(no json found)" };
      }
    }

    default:
      return { passed: false, actual: `unknown assertion type: ${assertion.type}` };
  }
}
|
|
97
|
+
|
|
98
|
+
// ---------------------------------------------------------------------------
|
|
99
|
+
// Load unit tests from JSON file
|
|
100
|
+
// ---------------------------------------------------------------------------
|
|
101
|
+
|
|
102
|
+
/**
 * Read a unit test file and return its contents as a list of tests.
 *
 * Best-effort loader: a missing file, unreadable/invalid JSON, or a JSON value
 * that is not an array each produce a `[WARN]` on the console and an empty
 * list instead of an exception. Array elements are not validated.
 *
 * @param testsPath Path to the JSON file holding an array of unit tests.
 * @returns The parsed array, or `[]` when nothing usable could be loaded.
 */
export function loadUnitTests(testsPath: string): SkillUnitTest[] {
  let data: unknown;

  try {
    if (!existsSync(testsPath)) {
      console.warn(`[WARN] Unit test file not found: ${testsPath}`);
      return [];
    }
    data = JSON.parse(readFileSync(testsPath, "utf-8"));
  } catch (err) {
    console.warn(`[WARN] Failed to load unit tests from ${testsPath}:`, err);
    return [];
  }

  if (Array.isArray(data)) {
    return data as SkillUnitTest[];
  }

  console.warn(`[WARN] Unit test file is not an array: ${testsPath}`);
  return [];
}
|
|
121
|
+
|
|
122
|
+
// ---------------------------------------------------------------------------
|
|
123
|
+
// Run a single unit test
|
|
124
|
+
// ---------------------------------------------------------------------------
|
|
125
|
+
|
|
126
|
+
/** Agent function type: takes a query, returns transcript text. */
export type AgentRunner = (query: string) => Promise<string>;

/**
 * Execute one unit test: send its query to the agent runner and evaluate every
 * assertion of the test against the returned transcript.
 *
 * Never rejects because of the agent or an assertion check: any error thrown
 * while obtaining the transcript or checking assertions yields a failed result
 * whose assertions are all marked failed with `actual: "error"` and whose
 * `error` field carries the message.
 *
 * @param test  The test to run (`id`, `query` and `assertions` are read).
 * @param agent Function producing a transcript for the test's query.
 * @returns The per-test result, including wall-clock duration in milliseconds.
 */
export async function runUnitTest(
  test: SkillUnitTest,
  agent: AgentRunner,
): Promise<UnitTestResult> {
  const startedAt = Date.now();
  const elapsedMs = (): number => Date.now() - startedAt;

  try {
    const transcript = await agent(test.query);

    const outcomes = test.assertions.map((assertion) => {
      const { passed, actual } = checkAssertion(assertion, transcript);
      return { assertion, passed, actual };
    });

    return {
      test_id: test.id,
      // The test passes only when every assertion passed.
      passed: outcomes.every((outcome) => outcome.passed),
      assertion_results: outcomes,
      duration_ms: elapsedMs(),
    };
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    const failedOutcomes = test.assertions.map((assertion) => ({
      assertion,
      passed: false,
      actual: "error",
    }));

    return {
      test_id: test.id,
      passed: false,
      assertion_results: failedOutcomes,
      duration_ms: elapsedMs(),
      error: message,
    };
  }
}
|
|
165
|
+
|
|
166
|
+
// ---------------------------------------------------------------------------
|
|
167
|
+
// Run a full unit test suite
|
|
168
|
+
// ---------------------------------------------------------------------------
|
|
169
|
+
|
|
170
|
+
/**
 * Run every test in order and summarize the outcome.
 *
 * Tests are executed one at a time, each awaited before the next starts, so
 * the agent runner is never invoked concurrently.
 *
 * @param tests     Tests to execute.
 * @param skillName Skill the suite belongs to; copied into the result.
 * @param agent     Function producing a transcript for each test query.
 * @returns Totals, pass rate (0 for an empty suite), per-test results and an
 *          ISO-8601 timestamp taken when the summary is built.
 */
export async function runUnitTestSuite(
  tests: SkillUnitTest[],
  skillName: string,
  agent: AgentRunner,
): Promise<UnitTestSuiteResult> {
  const results: UnitTestResult[] = [];
  for (let i = 0; i < tests.length; i++) {
    results.push(await runUnitTest(tests[i], agent));
  }

  const total = results.length;
  let passed = 0;
  for (const outcome of results) {
    if (outcome.passed) passed += 1;
  }
  // Every result is either passed or not, so the remainder is the failures.
  const failed = total - passed;
  const passRate = total > 0 ? passed / total : 0;

  return {
    skill_name: skillName,
    total,
    passed,
    failed,
    pass_rate: passRate,
    results,
    run_at: new Date().toISOString(),
  };
}
|
|
@@ -7,7 +7,7 @@
|
|
|
7
7
|
*/
|
|
8
8
|
|
|
9
9
|
import { copyFileSync, existsSync, readFileSync, writeFileSync } from "node:fs";
|
|
10
|
-
import type { EvolutionProposal } from "../types.js";
|
|
10
|
+
import type { EvolutionProposal, SkillSections } from "../types.js";
|
|
11
11
|
import type { ValidationResult } from "./validate-proposal.js";
|
|
12
12
|
|
|
13
13
|
// ---------------------------------------------------------------------------
|
|
@@ -93,6 +93,147 @@ export function replaceDescription(currentContent: string, newDescription: strin
|
|
|
93
93
|
return `${preamble}${headingLine}\n${descriptionBlock}\n${afterSubHeading}`;
|
|
94
94
|
}
|
|
95
95
|
|
|
96
|
+
// ---------------------------------------------------------------------------
|
|
97
|
+
// Structured SKILL.md parsing
|
|
98
|
+
// ---------------------------------------------------------------------------
|
|
99
|
+
|
|
100
|
+
/**
 * Parse a SKILL.md file into named sections.
 *
 * Splits the content into:
 * - frontmatter: YAML frontmatter block (if present, including delimiters)
 * - title: the first `# Heading` line found after the frontmatter
 * - description: content between the title and the first `## ` heading
 * - sections: map of `## Name` -> content (up to next `##` or EOF)
 *
 * Both frontmatter delimiters are recognized after trimming whitespace, so a
 * closing `---` followed by spaces or a carriage return still closes the block.
 * When the document has no `# Heading` line, `title` is empty and the
 * description/sections are parsed from the first line after the frontmatter
 * rather than being discarded.
 *
 * Anything between the frontmatter and the title line is not returned. Lines
 * are split on `\n` only; headings are detected by line prefix, including
 * inside fenced code blocks.
 *
 * @param content Full text of the SKILL.md file.
 * @returns The parsed pieces; missing pieces are empty strings / an empty map.
 */
export function parseSkillSections(content: string): SkillSections {
  const lines = content.split("\n");
  let idx = 0;

  // --- frontmatter ---
  let frontmatter = "";
  if (lines[0]?.trim() === "---") {
    // Match the closing delimiter the same way as the opening one (trimmed).
    const endIdx = lines.findIndex((line, i) => i > 0 && line.trim() === "---");
    if (endIdx > 0) {
      frontmatter = lines.slice(0, endIdx + 1).join("\n");
      idx = endIdx + 1;
      // skip blank line after frontmatter
      if (idx < lines.length && lines[idx].trim() === "") idx++;
    }
  }

  // --- title ---
  // Only advance past the title when one exists; otherwise keep `idx` where it
  // is so the rest of the document is still parsed.
  let title = "";
  const bodyStart = idx;
  const titleIdx = lines.findIndex((line, i) => i >= bodyStart && line.startsWith("# "));
  if (titleIdx >= 0) {
    title = lines[titleIdx];
    idx = titleIdx + 1;
  }

  // --- description (between title and first ## heading) ---
  const descLines: string[] = [];
  while (idx < lines.length && !lines[idx].startsWith("## ")) {
    descLines.push(lines[idx]);
    idx++;
  }
  // Trim leading/trailing blank lines from description
  const description = descLines.join("\n").trim();

  // --- remaining ## sections ---
  const sections: Record<string, string> = {};
  let currentSection = "";
  const sectionLines: string[] = [];

  while (idx < lines.length) {
    if (lines[idx].startsWith("## ")) {
      // Flush previous section
      if (currentSection) {
        sections[currentSection] = sectionLines.join("\n").trim();
        sectionLines.length = 0;
      }
      currentSection = lines[idx].replace(/^## /, "").trim();
      idx++;
    } else {
      sectionLines.push(lines[idx]);
      idx++;
    }
  }
  // Flush last section
  if (currentSection) {
    sections[currentSection] = sectionLines.join("\n").trim();
  }

  return { frontmatter, title, description, sections };
}
|
|
171
|
+
|
|
172
|
+
// ---------------------------------------------------------------------------
|
|
173
|
+
// Section replacement
|
|
174
|
+
// ---------------------------------------------------------------------------
|
|
175
|
+
|
|
176
|
+
/**
 * Replace a named `## Section` block in a SKILL.md file.
 *
 * The section is located by its heading line: a line starting with `## `
 * whose remaining text, trimmed, equals the trimmed `sectionName`. The match
 * is on the whole name, so replacing `Foo` never touches `## Foo Bar`, and a
 * heading with trailing whitespace or a carriage return is still found. Only
 * the first matching heading is replaced; the block extends to the next line
 * starting with `## ` or to EOF.
 *
 * If the section does not exist, appends it at the end.
 *
 * @param content     Full text of the SKILL.md file.
 * @param sectionName Section name without the `## ` prefix.
 * @param newContent  Replacement body for the section (heading excluded).
 * @returns The updated document text.
 */
export function replaceSection(content: string, sectionName: string, newContent: string): string {
  const lines = content.split("\n");
  const heading = `## ${sectionName}`;
  const wantedName = sectionName.trim();

  const startIdx = lines.findIndex(
    (line) => line.startsWith("## ") && line.slice(3).trim() === wantedName,
  );

  if (startIdx === -1) {
    // Section not found — append
    const trimmed = content.trimEnd();
    return `${trimmed}\n\n${heading}\n\n${newContent}\n`;
  }

  // Find end: next ## heading or EOF
  const nextHeadingIdx = lines.findIndex((line, i) => i > startIdx && line.startsWith("## "));
  const endIdx = nextHeadingIdx === -1 ? lines.length : nextHeadingIdx;

  const before = lines.slice(0, startIdx);
  const after = lines.slice(endIdx);
  return [...before, heading, "", newContent, "", ...after].join("\n");
}
|
|
214
|
+
|
|
215
|
+
/**
 * Swap everything below the title for a proposed body.
 *
 * The frontmatter block and the `# Title` line reported by
 * `parseSkillSections` are kept (each followed by one blank line) when they
 * are present; the proposed body follows. Trailing whitespace is stripped and
 * the result always ends with exactly one newline.
 *
 * @param currentContent Current full text of the SKILL.md file.
 * @param proposedBody   New body text to place after the title.
 * @returns The reassembled document.
 */
export function replaceBody(currentContent: string, proposedBody: string): string {
  const { frontmatter, title } = parseSkillSections(currentContent);

  let assembled = "";
  if (frontmatter) {
    assembled += `${frontmatter}\n\n`;
  }
  if (title) {
    assembled += `${title}\n\n`;
  }
  assembled += proposedBody;

  return `${assembled.trimEnd()}\n`;
}
|
|
236
|
+
|
|
96
237
|
// ---------------------------------------------------------------------------
|
|
97
238
|
// Commit message builder
|
|
98
239
|
// ---------------------------------------------------------------------------
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Evolution evidence trail: append and read proposal/eval artifacts that power
|
|
3
|
+
* explainable dashboard drill-downs.
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import { EVOLUTION_EVIDENCE_LOG } from "../constants.js";
|
|
7
|
+
import type { EvolutionEvidenceEntry } from "../types.js";
|
|
8
|
+
import { appendJsonl, readJsonl } from "../utils/jsonl.js";
|
|
9
|
+
|
|
10
|
+
/**
 * Record one evidence artifact in the evolution evidence log.
 *
 * Thin wrapper that hands the entry to `appendJsonl` for the given log file.
 *
 * @param entry   Evidence entry to record.
 * @param logPath Log file to append to; defaults to `EVOLUTION_EVIDENCE_LOG`.
 */
export function appendEvidenceEntry(
  entry: EvolutionEvidenceEntry,
  logPath: string = EVOLUTION_EVIDENCE_LOG,
): void {
  const destination = logPath;
  appendJsonl(destination, entry);
}
|
|
17
|
+
|
|
18
|
+
/**
 * Load the evidence trail from the evolution evidence log.
 *
 * @param skillName When given (and non-empty), only entries whose `skill_name`
 *                  is exactly this value are returned; otherwise every entry
 *                  read from the log is returned as-is.
 * @param logPath   Log file to read; defaults to `EVOLUTION_EVIDENCE_LOG`.
 * @returns Evidence entries in the order `readJsonl` returned them.
 */
export function readEvidenceTrail(
  skillName?: string,
  logPath: string = EVOLUTION_EVIDENCE_LOG,
): EvolutionEvidenceEntry[] {
  const allEntries = readJsonl<EvolutionEvidenceEntry>(logPath);
  return skillName
    ? allEntries.filter((candidate) => candidate.skill_name === skillName)
    : allEntries;
}
|