selftune 0.1.4 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/agents/diagnosis-analyst.md +146 -0
- package/.claude/agents/evolution-reviewer.md +167 -0
- package/.claude/agents/integration-guide.md +200 -0
- package/.claude/agents/pattern-analyst.md +147 -0
- package/CHANGELOG.md +37 -0
- package/README.md +96 -256
- package/assets/BeforeAfter.gif +0 -0
- package/assets/FeedbackLoop.gif +0 -0
- package/assets/logo.svg +9 -0
- package/assets/skill-health-badge.svg +20 -0
- package/cli/selftune/activation-rules.ts +171 -0
- package/cli/selftune/badge/badge-data.ts +108 -0
- package/cli/selftune/badge/badge-svg.ts +212 -0
- package/cli/selftune/badge/badge.ts +103 -0
- package/cli/selftune/constants.ts +75 -1
- package/cli/selftune/contribute/bundle.ts +314 -0
- package/cli/selftune/contribute/contribute.ts +214 -0
- package/cli/selftune/contribute/sanitize.ts +162 -0
- package/cli/selftune/cron/setup.ts +266 -0
- package/cli/selftune/dashboard-server.ts +582 -0
- package/cli/selftune/dashboard.ts +25 -3
- package/cli/selftune/eval/baseline.ts +247 -0
- package/cli/selftune/eval/composability.ts +117 -0
- package/cli/selftune/eval/generate-unit-tests.ts +143 -0
- package/cli/selftune/eval/hooks-to-evals.ts +68 -2
- package/cli/selftune/eval/import-skillsbench.ts +221 -0
- package/cli/selftune/eval/synthetic-evals.ts +172 -0
- package/cli/selftune/eval/unit-test-cli.ts +152 -0
- package/cli/selftune/eval/unit-test.ts +196 -0
- package/cli/selftune/evolution/deploy-proposal.ts +142 -1
- package/cli/selftune/evolution/evolve-body.ts +492 -0
- package/cli/selftune/evolution/evolve.ts +466 -103
- package/cli/selftune/evolution/extract-patterns.ts +32 -1
- package/cli/selftune/evolution/pareto.ts +314 -0
- package/cli/selftune/evolution/propose-body.ts +171 -0
- package/cli/selftune/evolution/propose-description.ts +100 -2
- package/cli/selftune/evolution/propose-routing.ts +166 -0
- package/cli/selftune/evolution/refine-body.ts +141 -0
- package/cli/selftune/evolution/rollback.ts +19 -2
- package/cli/selftune/evolution/validate-body.ts +254 -0
- package/cli/selftune/evolution/validate-proposal.ts +257 -35
- package/cli/selftune/evolution/validate-routing.ts +177 -0
- package/cli/selftune/grading/grade-session.ts +138 -18
- package/cli/selftune/grading/pre-gates.ts +104 -0
- package/cli/selftune/hooks/auto-activate.ts +185 -0
- package/cli/selftune/hooks/evolution-guard.ts +165 -0
- package/cli/selftune/hooks/skill-change-guard.ts +112 -0
- package/cli/selftune/index.ts +88 -0
- package/cli/selftune/ingestors/claude-replay.ts +351 -0
- package/cli/selftune/ingestors/openclaw-ingest.ts +440 -0
- package/cli/selftune/init.ts +150 -3
- package/cli/selftune/memory/writer.ts +447 -0
- package/cli/selftune/monitoring/watch.ts +25 -2
- package/cli/selftune/status.ts +17 -13
- package/cli/selftune/types.ts +377 -5
- package/cli/selftune/utils/frontmatter.ts +217 -0
- package/cli/selftune/utils/llm-call.ts +29 -3
- package/cli/selftune/utils/transcript.ts +35 -0
- package/cli/selftune/utils/trigger-check.ts +89 -0
- package/cli/selftune/utils/tui.ts +156 -0
- package/dashboard/index.html +569 -8
- package/package.json +8 -4
- package/skill/SKILL.md +124 -8
- package/skill/Workflows/AutoActivation.md +144 -0
- package/skill/Workflows/Badge.md +118 -0
- package/skill/Workflows/Baseline.md +121 -0
- package/skill/Workflows/Composability.md +100 -0
- package/skill/Workflows/Contribute.md +91 -0
- package/skill/Workflows/Cron.md +155 -0
- package/skill/Workflows/Dashboard.md +203 -0
- package/skill/Workflows/Doctor.md +37 -1
- package/skill/Workflows/Evals.md +69 -1
- package/skill/Workflows/EvolutionMemory.md +152 -0
- package/skill/Workflows/Evolve.md +111 -6
- package/skill/Workflows/EvolveBody.md +159 -0
- package/skill/Workflows/ImportSkillsBench.md +111 -0
- package/skill/Workflows/Ingest.md +117 -3
- package/skill/Workflows/Initialize.md +57 -3
- package/skill/Workflows/Replay.md +70 -0
- package/skill/Workflows/Rollback.md +20 -1
- package/skill/Workflows/UnitTest.md +138 -0
- package/skill/Workflows/Watch.md +22 -0
- package/skill/settings_snippet.json +23 -0
- package/templates/activation-rules-default.json +27 -0
- package/templates/multi-skill-settings.json +64 -0
- package/templates/single-skill-settings.json +58 -0
|
@@ -16,6 +16,7 @@ import { TELEMETRY_LOG } from "../constants.js";
|
|
|
16
16
|
import type {
|
|
17
17
|
ExecutionMetrics,
|
|
18
18
|
GraderOutput,
|
|
19
|
+
GradingExpectation,
|
|
19
20
|
GradingResult,
|
|
20
21
|
SessionTelemetryRecord,
|
|
21
22
|
} from "../types.js";
|
|
@@ -26,6 +27,7 @@ import {
|
|
|
26
27
|
callViaAgent,
|
|
27
28
|
} from "../utils/llm-call.js";
|
|
28
29
|
import { readExcerpt } from "../utils/transcript.js";
|
|
30
|
+
import { type PreGateContext, runPreGates } from "./pre-gates.js";
|
|
29
31
|
|
|
30
32
|
// Re-export for backward compatibility
|
|
31
33
|
export { detectAgent, stripMarkdownFences } from "../utils/llm-call.js";
|
|
@@ -48,24 +50,36 @@ export const GRADER_SYSTEM = `You are a rigorous skill session evaluator. You re
|
|
|
48
50
|
Grade each expectation and output ONLY valid JSON matching this schema:
|
|
49
51
|
{
|
|
50
52
|
"expectations": [
|
|
51
|
-
{"text": "...", "passed": true/false, "evidence": "specific quote or metric"}
|
|
53
|
+
{"text": "...", "passed": true/false, "evidence": "specific quote or metric", "score": 0.0-1.0}
|
|
52
54
|
],
|
|
53
|
-
"summary": {"passed": N, "failed": N, "total": N, "pass_rate": 0.0},
|
|
55
|
+
"summary": {"passed": N, "failed": N, "total": N, "pass_rate": 0.0, "mean_score": 0.0},
|
|
54
56
|
"claims": [
|
|
55
57
|
{"claim": "...", "type": "factual|process|quality", "verified": true/false, "evidence": "..."}
|
|
56
58
|
],
|
|
57
59
|
"eval_feedback": {
|
|
58
60
|
"suggestions": [{"assertion": "...", "reason": "..."}],
|
|
59
61
|
"overall": "one sentence"
|
|
60
|
-
}
|
|
62
|
+
},
|
|
63
|
+
"failure_feedback": [
|
|
64
|
+
{"query": "the user query that failed", "failure_reason": "why it failed", "improvement_hint": "how to fix", "invocation_type": "explicit|implicit|contextual|negative"}
|
|
65
|
+
]
|
|
61
66
|
}
|
|
62
67
|
|
|
68
|
+
Score guide:
|
|
69
|
+
- 1.0: Clear, specific evidence of full completion
|
|
70
|
+
- 0.7-0.9: Strong evidence with minor gaps
|
|
71
|
+
- 0.4-0.6: Partial evidence or partial completion
|
|
72
|
+
- 0.1-0.3: Weak evidence, mostly not met
|
|
73
|
+
- 0.0: No evidence or clearly not met
|
|
74
|
+
|
|
63
75
|
Rules:
|
|
64
76
|
- PASS only when there is clear, specific evidence — not assumptions
|
|
65
77
|
- FAIL when evidence is absent or contradictory
|
|
66
78
|
- Cite exact quotes or specific metric values
|
|
67
79
|
- Extract 2-4 implicit claims from the transcript and verify them
|
|
68
|
-
- Suggest eval improvements only for clear gaps
|
|
80
|
+
- Suggest eval improvements only for clear gaps
|
|
81
|
+
- Set score to reflect confidence level (0.0-1.0)
|
|
82
|
+
- For each FAILED expectation, provide a failure_feedback entry with the relevant query, specific reason for failure, and actionable improvement hint`;
|
|
69
83
|
|
|
70
84
|
// ---------------------------------------------------------------------------
|
|
71
85
|
// Data lookup helpers
|
|
@@ -159,6 +173,39 @@ export function buildExecutionMetrics(telemetry: SessionTelemetryRecord): Execut
|
|
|
159
173
|
};
|
|
160
174
|
}
|
|
161
175
|
|
|
176
|
+
// ---------------------------------------------------------------------------
|
|
177
|
+
// Graduated scoring
|
|
178
|
+
// ---------------------------------------------------------------------------
|
|
179
|
+
|
|
180
|
+
/**
 * Summarise per-expectation scores as a mean and a population standard deviation.
 *
 * Each expectation contributes its `score` when one is present and finite,
 * clamped into [0, 1]; otherwise it contributes 1.0 if it passed and 0.0 if it
 * failed. Both results are rounded to three decimal places. An empty list
 * yields zeros for both fields.
 *
 * @param expectations Graded expectations to summarise.
 * @returns `mean_score` and `score_std_dev`, each rounded to 3 decimals.
 */
export function buildGraduatedSummary(expectations: GradingExpectation[]): {
  mean_score: number;
  score_std_dev: number;
} {
  const count = expectations.length;
  if (count === 0) {
    return { mean_score: 0, score_std_dev: 0 };
  }

  const toThreeDecimals = (value: number): number => Math.round(value * 1000) / 1000;

  // Normalise every expectation to a number in [0, 1] while accumulating the total.
  const normalized: number[] = [];
  let total = 0;
  for (const expectation of expectations) {
    const passFail = expectation.passed ? 1.0 : 0.0;
    const candidate = expectation.score ?? passFail;
    const value = Number.isFinite(candidate) ? Math.min(1, Math.max(0, candidate)) : passFail;
    normalized.push(value);
    total += value;
  }
  const average = total / count;

  // Population variance: divide by the number of scores, not count - 1.
  let squaredDeviation = 0;
  for (const value of normalized) {
    squaredDeviation += (value - average) ** 2;
  }
  const deviation = Math.sqrt(squaredDeviation / count);

  return {
    mean_score: toThreeDecimals(average),
    score_std_dev: toThreeDecimals(deviation),
  };
}
|
|
208
|
+
|
|
162
209
|
// ---------------------------------------------------------------------------
|
|
163
210
|
// Prompt building
|
|
164
211
|
// ---------------------------------------------------------------------------
|
|
@@ -234,16 +281,31 @@ export function assembleResult(
|
|
|
234
281
|
skillName: string,
|
|
235
282
|
transcriptPath: string,
|
|
236
283
|
): GradingResult {
|
|
284
|
+
// Default missing scores on expectations
|
|
285
|
+
const expectations = (graderOutput?.expectations ?? []).map((e) => ({
|
|
286
|
+
...e,
|
|
287
|
+
score: e.score ?? (e.passed ? 1.0 : 0.0),
|
|
288
|
+
source: e.source ?? ("llm" as const),
|
|
289
|
+
}));
|
|
290
|
+
|
|
291
|
+
const baseSummary = graderOutput?.summary ?? { passed: 0, failed: 0, total: 0, pass_rate: 0 };
|
|
292
|
+
const graduated = buildGraduatedSummary(expectations);
|
|
293
|
+
|
|
237
294
|
return {
|
|
238
295
|
session_id: sessionId ?? "unknown",
|
|
239
296
|
skill_name: skillName ?? "unknown",
|
|
240
297
|
transcript_path: transcriptPath ?? "",
|
|
241
298
|
graded_at: new Date().toISOString(),
|
|
242
|
-
expectations
|
|
243
|
-
summary:
|
|
299
|
+
expectations,
|
|
300
|
+
summary: {
|
|
301
|
+
...baseSummary,
|
|
302
|
+
mean_score: graduated.mean_score,
|
|
303
|
+
score_std_dev: graduated.score_std_dev,
|
|
304
|
+
},
|
|
244
305
|
execution_metrics: buildExecutionMetrics(telemetry ?? ({} as SessionTelemetryRecord)),
|
|
245
306
|
claims: graderOutput?.claims ?? [],
|
|
246
307
|
eval_feedback: graderOutput?.eval_feedback ?? { suggestions: [], overall: "" },
|
|
308
|
+
failure_feedback: graderOutput?.failure_feedback,
|
|
247
309
|
};
|
|
248
310
|
}
|
|
249
311
|
|
|
@@ -254,10 +316,16 @@ export function assembleResult(
|
|
|
254
316
|
function printSummary(result: GradingResult): void {
|
|
255
317
|
const { summary } = result;
|
|
256
318
|
const rate = summary.pass_rate ?? 0;
|
|
257
|
-
|
|
319
|
+
const meanStr =
|
|
320
|
+
summary.mean_score != null ? ` | mean score: ${summary.mean_score.toFixed(2)}` : "";
|
|
321
|
+
console.log(
|
|
322
|
+
`\nResults: ${summary.passed}/${summary.total} passed (${Math.round(rate * 100)}%)${meanStr}`,
|
|
323
|
+
);
|
|
258
324
|
for (const exp of result.expectations ?? []) {
|
|
259
325
|
const icon = exp.passed ? "\u2713" : "\u2717";
|
|
260
|
-
|
|
326
|
+
const scoreStr = exp.score != null ? ` [${exp.score.toFixed(1)}]` : "";
|
|
327
|
+
const sourceStr = exp.source ? ` (${exp.source})` : "";
|
|
328
|
+
console.log(` ${icon}${scoreStr}${sourceStr} ${String(exp.text ?? "").slice(0, 70)}`);
|
|
261
329
|
if (!exp.passed) {
|
|
262
330
|
console.log(` -> ${String(exp.evidence ?? "").slice(0, 100)}`);
|
|
263
331
|
}
|
|
@@ -380,20 +448,72 @@ export async function cliMain(): Promise<void> {
|
|
|
380
448
|
console.log("==========================\n");
|
|
381
449
|
}
|
|
382
450
|
|
|
383
|
-
// ---
|
|
384
|
-
const
|
|
451
|
+
// --- Run pre-gates first ---
|
|
452
|
+
const preGateCtx: PreGateContext = {
|
|
453
|
+
telemetry,
|
|
454
|
+
skillName: skill,
|
|
455
|
+
transcriptExcerpt,
|
|
456
|
+
};
|
|
457
|
+
const preGateResult = runPreGates(expectations, preGateCtx);
|
|
385
458
|
|
|
386
|
-
|
|
459
|
+
let allExpectations: GradingExpectation[];
|
|
387
460
|
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
461
|
+
if (preGateResult.remaining.length === 0) {
|
|
462
|
+
// All expectations resolved by pre-gates — skip LLM entirely
|
|
463
|
+
console.error(
|
|
464
|
+
`[INFO] All ${expectations.length} expectations resolved by pre-gates, skipping LLM`,
|
|
465
|
+
);
|
|
466
|
+
allExpectations = preGateResult.resolved;
|
|
467
|
+
} else {
|
|
468
|
+
// Build prompt and grade remaining via LLM
|
|
469
|
+
console.error(
|
|
470
|
+
`[INFO] Pre-gates resolved ${preGateResult.resolved.length}/${expectations.length} expectations`,
|
|
471
|
+
);
|
|
472
|
+
const prompt = buildGradingPrompt(preGateResult.remaining, telemetry, transcriptExcerpt, skill);
|
|
473
|
+
console.error(`Grading ${preGateResult.remaining.length} expectations for skill '${skill}'...`);
|
|
474
|
+
|
|
475
|
+
let graderOutput: GraderOutput;
|
|
476
|
+
try {
|
|
477
|
+
graderOutput = await gradeViaAgent(prompt, agent);
|
|
478
|
+
} catch (e) {
|
|
479
|
+
console.error(`[ERROR] Grading failed: ${e}`);
|
|
480
|
+
process.exit(1);
|
|
481
|
+
}
|
|
482
|
+
|
|
483
|
+
// Default scores on LLM results
|
|
484
|
+
const llmExpectations = (graderOutput.expectations ?? []).map((e) => ({
|
|
485
|
+
...e,
|
|
486
|
+
score: e.score ?? (e.passed ? 1.0 : 0.0),
|
|
487
|
+
source: e.source ?? ("llm" as const),
|
|
488
|
+
}));
|
|
489
|
+
|
|
490
|
+
// Merge pre-gate + LLM results
|
|
491
|
+
allExpectations = [...preGateResult.resolved, ...llmExpectations];
|
|
394
492
|
}
|
|
395
493
|
|
|
396
|
-
|
|
494
|
+
// Compute graduated summary
|
|
495
|
+
const graduated = buildGraduatedSummary(allExpectations);
|
|
496
|
+
const passedCount = allExpectations.filter((e) => e.passed).length;
|
|
497
|
+
const totalCount = allExpectations.length;
|
|
498
|
+
|
|
499
|
+
const result: GradingResult = {
|
|
500
|
+
session_id: sessionId,
|
|
501
|
+
skill_name: skill,
|
|
502
|
+
transcript_path: transcriptPath,
|
|
503
|
+
graded_at: new Date().toISOString(),
|
|
504
|
+
expectations: allExpectations,
|
|
505
|
+
summary: {
|
|
506
|
+
passed: passedCount,
|
|
507
|
+
failed: totalCount - passedCount,
|
|
508
|
+
total: totalCount,
|
|
509
|
+
pass_rate: totalCount > 0 ? passedCount / totalCount : 0,
|
|
510
|
+
mean_score: graduated.mean_score,
|
|
511
|
+
score_std_dev: graduated.score_std_dev,
|
|
512
|
+
},
|
|
513
|
+
execution_metrics: buildExecutionMetrics(telemetry),
|
|
514
|
+
claims: [],
|
|
515
|
+
eval_feedback: { suggestions: [], overall: "" },
|
|
516
|
+
};
|
|
397
517
|
|
|
398
518
|
const outputPath = values.output ?? "grading.json";
|
|
399
519
|
const outputDir = dirname(outputPath);
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pre-gates.ts
|
|
3
|
+
*
|
|
4
|
+
* Deterministic pre-gate checks that resolve grading expectations without LLM.
|
|
5
|
+
* Each gate matches an expectation text pattern and resolves it using telemetry data.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import type { GradingExpectation, SessionTelemetryRecord } from "../types.js";
|
|
9
|
+
|
|
10
|
+
// ---------------------------------------------------------------------------
|
|
11
|
+
// Gate definitions
|
|
12
|
+
// ---------------------------------------------------------------------------
|
|
13
|
+
|
|
14
|
+
export interface PreGate {
|
|
15
|
+
name: string;
|
|
16
|
+
pattern: RegExp;
|
|
17
|
+
check: (ctx: PreGateContext) => boolean;
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
export interface PreGateContext {
|
|
21
|
+
telemetry: SessionTelemetryRecord;
|
|
22
|
+
skillName: string;
|
|
23
|
+
transcriptExcerpt?: string;
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
export interface PreGateResult {
|
|
27
|
+
resolved: GradingExpectation[];
|
|
28
|
+
remaining: string[];
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
/** Default set of pre-gates. */
|
|
32
|
+
export const DEFAULT_GATES: PreGate[] = [
|
|
33
|
+
{
|
|
34
|
+
name: "skill_md_read",
|
|
35
|
+
pattern: /(read.*skill\.md|skill\.md.*read)/i,
|
|
36
|
+
check: (ctx) => {
|
|
37
|
+
// Check if skills_triggered contains the skill name
|
|
38
|
+
const triggered = ctx.telemetry.skills_triggered ?? [];
|
|
39
|
+
if (triggered.includes(ctx.skillName)) return true;
|
|
40
|
+
// Also check if transcript mentions reading SKILL.md
|
|
41
|
+
if (ctx.transcriptExcerpt && /Read.*SKILL\.md/i.test(ctx.transcriptExcerpt)) return true;
|
|
42
|
+
return false;
|
|
43
|
+
},
|
|
44
|
+
},
|
|
45
|
+
{
|
|
46
|
+
name: "expected_tools_called",
|
|
47
|
+
pattern: /tool[s]?\s+(were\s+)?called/i,
|
|
48
|
+
check: (ctx) => (ctx.telemetry.total_tool_calls ?? 0) > 0,
|
|
49
|
+
},
|
|
50
|
+
{
|
|
51
|
+
name: "error_count",
|
|
52
|
+
pattern: /error[s]?\s*(count|encountered)/i,
|
|
53
|
+
check: (ctx) => (ctx.telemetry.errors_encountered ?? 0) <= 2,
|
|
54
|
+
},
|
|
55
|
+
{
|
|
56
|
+
name: "session_completed",
|
|
57
|
+
pattern: /session\s*(completed|finished)/i,
|
|
58
|
+
check: (ctx) => (ctx.telemetry.assistant_turns ?? 0) > 0,
|
|
59
|
+
},
|
|
60
|
+
];
|
|
61
|
+
|
|
62
|
+
// ---------------------------------------------------------------------------
|
|
63
|
+
// Pre-gate runner
|
|
64
|
+
// ---------------------------------------------------------------------------
|
|
65
|
+
|
|
66
|
+
/**
 * Run deterministic pre-gate checks against expectation texts.
 *
 * For each expectation, the first gate whose `pattern` matches decides the
 * outcome: its `check` result is recorded as a resolved expectation with
 * `source: "pre-gate"` and a score of 1.0 (pass) or 0.0 (fail). Expectations
 * that match no gate are returned in `remaining`.
 *
 * If the matching gate's `check` throws (for example because the telemetry
 * record is missing), the expectation is left in `remaining` rather than
 * aborting the whole run, so the caller can still grade it another way.
 *
 * @param expectations Expectation texts to resolve.
 * @param ctx Context handed to each gate's `check`.
 * @param gates Gates to try, in priority order. Defaults to `DEFAULT_GATES`.
 * @returns Resolved expectations plus the texts no gate could resolve.
 */
export function runPreGates(
  expectations: string[],
  ctx: PreGateContext,
  gates: PreGate[] = DEFAULT_GATES,
): PreGateResult {
  const resolved: GradingExpectation[] = [];
  const remaining: string[] = [];

  for (const text of expectations) {
    // First matching gate wins.
    const gate = gates.find((candidate) => {
      // Global/sticky regexes keep state between test() calls; reset it so a
      // previous expectation cannot influence this match.
      if (candidate.pattern.global || candidate.pattern.sticky) {
        candidate.pattern.lastIndex = 0;
      }
      return candidate.pattern.test(text);
    });

    if (gate === undefined) {
      remaining.push(text);
      continue;
    }

    let passed: boolean;
    try {
      passed = gate.check(ctx);
    } catch {
      // A gate that cannot evaluate has not resolved anything.
      remaining.push(text);
      continue;
    }

    resolved.push({
      text,
      passed,
      evidence: `Pre-gate "${gate.name}": ${passed ? "PASS" : "FAIL"}`,
      score: passed ? 1.0 : 0.0,
      source: "pre-gate",
    });
  }

  return { resolved, remaining };
}
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
/**
|
|
3
|
+
* Claude Code UserPromptSubmit hook: auto-activate.ts
|
|
4
|
+
*
|
|
5
|
+
* Evaluates activation rules against the current session context and
|
|
6
|
+
* outputs suggestions to stderr (shown to Claude as system messages).
|
|
7
|
+
* Suggestions are advisory — exit code is always 0.
|
|
8
|
+
*
|
|
9
|
+
* Session state is tracked to avoid repeated nags within a session.
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
|
|
13
|
+
import { dirname } from "node:path";
|
|
14
|
+
import {
|
|
15
|
+
CLAUDE_SETTINGS_PATH,
|
|
16
|
+
EVOLUTION_AUDIT_LOG,
|
|
17
|
+
QUERY_LOG,
|
|
18
|
+
SELFTUNE_CONFIG_DIR,
|
|
19
|
+
sessionStatePath,
|
|
20
|
+
TELEMETRY_LOG,
|
|
21
|
+
} from "../constants.js";
|
|
22
|
+
import type {
|
|
23
|
+
ActivationContext,
|
|
24
|
+
ActivationRule,
|
|
25
|
+
PromptSubmitPayload,
|
|
26
|
+
SessionState,
|
|
27
|
+
} from "../types.js";
|
|
28
|
+
|
|
29
|
+
// ---------------------------------------------------------------------------
|
|
30
|
+
// Session state persistence
|
|
31
|
+
// ---------------------------------------------------------------------------
|
|
32
|
+
|
|
33
|
+
/**
 * Load the persisted suggestion state for a session.
 *
 * The stored state is returned only when the file exists, parses as JSON,
 * carries the requested `session_id`, and has an array in `suggestions_shown`.
 * In every other case (missing file, corrupt file, different session) a fresh
 * empty state stamped with the current time is returned.
 *
 * @param path Location of the session state file.
 * @param sessionId Session the caller is interested in.
 * @returns The stored state for that session, or a fresh empty one.
 */
export function loadSessionState(path: string, sessionId: string): SessionState {
  const freshState = (): SessionState => ({
    session_id: sessionId,
    suggestions_shown: [],
    updated_at: new Date().toISOString(),
  });

  if (existsSync(path)) {
    try {
      const stored = JSON.parse(readFileSync(path, "utf-8")) as SessionState;
      const belongsToSession = stored.session_id === sessionId;
      if (belongsToSession && Array.isArray(stored.suggestions_shown)) {
        return stored;
      }
    } catch {
      // Unreadable or corrupt file: fall through and start fresh.
    }
  }

  return freshState();
}
|
|
49
|
+
|
|
50
|
+
/**
 * Write the session state to disk as pretty-printed (2-space) JSON.
 *
 * The parent directory is created, including intermediate directories, when
 * it does not exist yet.
 *
 * @param path Destination file.
 * @param state State to serialise.
 */
export function saveSessionState(path: string, state: SessionState): void {
  const parentDir = dirname(path);
  const parentMissing = !existsSync(parentDir);
  if (parentMissing) {
    mkdirSync(parentDir, { recursive: true });
  }

  const serialized = JSON.stringify(state, null, 2);
  writeFileSync(path, serialized, "utf-8");
}
|
|
57
|
+
|
|
58
|
+
// ---------------------------------------------------------------------------
|
|
59
|
+
// PAI coexistence check
|
|
60
|
+
// ---------------------------------------------------------------------------
|
|
61
|
+
|
|
62
|
+
/**
 * Check whether a hook whose command mentions `skill-activation-prompt` is
 * registered in the given settings file.
 *
 * Every hook list under `settings.hooks` is scanned. Both shapes are
 * recognised: a flat entry with its own `command`, and an entry with a nested
 * `hooks` array whose items carry `command`.
 *
 * The scan tolerates malformed data: entries that are `null`, primitives, or
 * otherwise lack a string `command` are skipped individually, so one bad
 * entry cannot hide a matching hook that appears later in the file.
 *
 * Fail-open: a missing, unreadable, or unparseable settings file yields `false`.
 *
 * @param settingsPath Path to the settings JSON file.
 * @returns `true` when a matching hook command is found, otherwise `false`.
 */
export function checkPaiCoexistence(settingsPath: string): boolean {
  const PAI_HOOK_MARKER = "skill-activation-prompt";

  // True when `candidate` is an object whose `command` string mentions the marker.
  const referencesPai = (candidate: unknown): boolean => {
    if (typeof candidate !== "object" || candidate === null) return false;
    const { command } = candidate as { command?: unknown };
    return typeof command === "string" && command.includes(PAI_HOOK_MARKER);
  };

  if (!existsSync(settingsPath)) return false;

  let hooks: unknown;
  try {
    const settings: unknown = JSON.parse(readFileSync(settingsPath, "utf-8"));
    if (typeof settings !== "object" || settings === null) return false;
    hooks = (settings as { hooks?: unknown }).hooks;
  } catch {
    // fail-open: unreadable or invalid settings are treated as "no PAI hook"
    return false;
  }

  if (typeof hooks !== "object" || hooks === null) return false;

  for (const hookEntries of Object.values(hooks)) {
    if (!Array.isArray(hookEntries)) continue;

    for (const entry of hookEntries) {
      // Flat shape: entry.command
      if (referencesPai(entry)) return true;

      // Nested shape: entry.hooks[].command
      const nested: unknown =
        typeof entry === "object" && entry !== null ? (entry as { hooks?: unknown }).hooks : undefined;
      if (Array.isArray(nested) && nested.some(referencesPai)) return true;
    }
  }

  return false;
}
|
|
106
|
+
|
|
107
|
+
// ---------------------------------------------------------------------------
|
|
108
|
+
// Rule evaluation engine
|
|
109
|
+
// ---------------------------------------------------------------------------
|
|
110
|
+
|
|
111
|
+
/**
 * Evaluate activation rules against the current context, honouring the
 * per-session record of suggestions that were already shown.
 *
 * Rules whose id is already listed in the stored session state are skipped.
 * A rule that throws, or that returns anything other than a string, produces
 * no suggestion. Ids of rules that fired are appended to the session state,
 * which is then written back to `statePath`.
 *
 * Persistence is best-effort: if the state cannot be written, the suggestions
 * computed in this call are still returned. The cost is that they may be
 * offered again on a later call.
 *
 * @param rules Rules to evaluate, in order.
 * @param ctx Context passed to each rule's `evaluate`.
 * @param statePath File holding the session's suggestion state.
 * @returns Suggestion strings for the rules that fired, in rule order.
 */
export function evaluateRules(
  rules: ActivationRule[],
  ctx: ActivationContext,
  statePath: string,
): string[] {
  const state = loadSessionState(statePath, ctx.session_id);
  const alreadyShown = new Set(state.suggestions_shown);
  const suggestions: string[] = [];
  const newlyShown: string[] = [];

  for (const rule of rules) {
    // Skip rules already shown this session
    if (alreadyShown.has(rule.id)) continue;

    let outcome: unknown;
    try {
      outcome = rule.evaluate(ctx);
    } catch {
      // fail-open: skip rules that throw
      continue;
    }

    // Only a string is a suggestion; null/undefined mean "did not fire".
    if (typeof outcome !== "string") continue;

    suggestions.push(outcome);
    newlyShown.push(rule.id);
  }

  if (newlyShown.length > 0) {
    state.suggestions_shown.push(...newlyShown);
    state.updated_at = new Date().toISOString();
    try {
      saveSessionState(statePath, state);
    } catch {
      // Best-effort: an unwritable state file must not suppress the
      // suggestions that were just computed.
    }
  }

  return suggestions;
}
|
|
148
|
+
|
|
149
|
+
// ---------------------------------------------------------------------------
|
|
150
|
+
// stdin main (only when executed directly, not when imported)
|
|
151
|
+
// ---------------------------------------------------------------------------
|
|
152
|
+
|
|
153
|
+
if (import.meta.main) {
|
|
154
|
+
try {
|
|
155
|
+
const payload: PromptSubmitPayload = JSON.parse(await Bun.stdin.text());
|
|
156
|
+
const sessionId = payload.session_id ?? "unknown";
|
|
157
|
+
|
|
158
|
+
// Dynamically import default rules (keeps hook file lightweight)
|
|
159
|
+
const { DEFAULT_RULES } = await import("../activation-rules.js");
|
|
160
|
+
|
|
161
|
+
const ctx: ActivationContext = {
|
|
162
|
+
session_id: sessionId,
|
|
163
|
+
query_log_path: QUERY_LOG,
|
|
164
|
+
telemetry_log_path: TELEMETRY_LOG,
|
|
165
|
+
evolution_audit_log_path: EVOLUTION_AUDIT_LOG,
|
|
166
|
+
selftune_dir: SELFTUNE_CONFIG_DIR,
|
|
167
|
+
settings_path: CLAUDE_SETTINGS_PATH,
|
|
168
|
+
};
|
|
169
|
+
|
|
170
|
+
// Check PAI coexistence — if PAI is active, skip selftune suggestions
|
|
171
|
+
// (PAI handles skill-level activation; selftune handles observability)
|
|
172
|
+
if (!checkPaiCoexistence(CLAUDE_SETTINGS_PATH)) {
|
|
173
|
+
const statePath = sessionStatePath(sessionId);
|
|
174
|
+
const suggestions = evaluateRules(DEFAULT_RULES, ctx, statePath);
|
|
175
|
+
|
|
176
|
+
for (const s of suggestions) {
|
|
177
|
+
// Output to stderr — Claude Code shows stderr as system messages
|
|
178
|
+
process.stderr.write(`[selftune] 💡 Suggestion: ${s}\n`);
|
|
179
|
+
}
|
|
180
|
+
}
|
|
181
|
+
} catch {
|
|
182
|
+
// silent — hooks must never block Claude
|
|
183
|
+
}
|
|
184
|
+
process.exit(0);
|
|
185
|
+
}
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
/**
|
|
3
|
+
* Claude Code PreToolUse hook: evolution-guard.ts
|
|
4
|
+
*
|
|
5
|
+
* Fires before Write/Edit tool calls. If the target is a SKILL.md file
|
|
6
|
+
* that has a deployed evolution (i.e., is under active monitoring), and
|
|
7
|
+
* no recent `selftune watch` snapshot exists, this hook BLOCKS the write
|
|
8
|
+
* with exit code 2 and a message suggesting to run watch first.
|
|
9
|
+
*
|
|
10
|
+
* Exit codes:
|
|
11
|
+
* 0 = allow (not a SKILL.md, not monitored, or watch is recent)
|
|
12
|
+
* 2 = block with message (Claude Code convention for PreToolUse hooks)
|
|
13
|
+
*
|
|
14
|
+
* Fail-open: any error → exit 0 (never block accidentally).
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
import { existsSync, readFileSync } from "node:fs";
|
|
18
|
+
import { basename, dirname, join } from "node:path";
|
|
19
|
+
import { EVOLUTION_AUDIT_LOG, SELFTUNE_CONFIG_DIR } from "../constants.js";
|
|
20
|
+
import type { PreToolUsePayload } from "../types.js";
|
|
21
|
+
import { readJsonl } from "../utils/jsonl.js";
|
|
22
|
+
|
|
23
|
+
// ---------------------------------------------------------------------------
|
|
24
|
+
// Detection helpers (same pattern as skill-change-guard)
|
|
25
|
+
// ---------------------------------------------------------------------------
|
|
26
|
+
|
|
27
|
+
/**
 * True when the tool is `Write` or `Edit` and the target file's base name is
 * `SKILL.md`, compared case-insensitively.
 */
function isSkillMdWrite(toolName: string, filePath: string): boolean {
  const isWriteTool = toolName === "Write" || toolName === "Edit";
  if (!isWriteTool) {
    return false;
  }

  const fileName = basename(filePath);
  return fileName.toUpperCase() === "SKILL.MD";
}
|
|
31
|
+
|
|
32
|
+
/**
 * Derive the skill name from a file path: the name of the directory that
 * directly contains the file, or "unknown" when that name is empty.
 */
function extractSkillName(filePath: string): string {
  const containingDir = dirname(filePath);
  const dirName = basename(containingDir);
  return dirName === "" ? "unknown" : dirName;
}
|
|
35
|
+
|
|
36
|
+
// ---------------------------------------------------------------------------
|
|
37
|
+
// Active monitoring check (reads audit log directly — no evolution imports)
|
|
38
|
+
// ---------------------------------------------------------------------------
|
|
39
|
+
|
|
40
|
+
/**
 * Report whether a skill currently has a deployed evolution.
 *
 * The evolution audit JSONL at `auditLogPath` is read and only entries whose
 * `skill_name` equals `skillName` are considered. The skill counts as actively
 * monitored when the last such entry has `action === "deployed"`; any other
 * last action (for example "rolled_back"), or no entries at all, yields `false`.
 *
 * @param skillName Skill to look up.
 * @param auditLogPath Path to the evolution audit JSONL file.
 * @returns `true` when the skill's most recent audit action is "deployed".
 */
export function checkActiveMonitoring(skillName: string, auditLogPath: string): boolean {
  const auditEntries = readJsonl<{
    skill_name?: string;
    action: string;
  }>(auditLogPath);

  // Walk the log in order, remembering the action of the latest entry
  // recorded for this skill.
  let sawSkill = false;
  let latestAction: string | undefined;
  for (const entry of auditEntries) {
    if (entry.skill_name === skillName) {
      sawSkill = true;
      latestAction = entry.action;
    }
  }

  if (!sawSkill) return false;
  return latestAction === "deployed";
}
|
|
60
|
+
|
|
61
|
+
// ---------------------------------------------------------------------------
|
|
62
|
+
// Recent watch snapshot check (reads monitoring dir directly)
|
|
63
|
+
// ---------------------------------------------------------------------------
|
|
64
|
+
|
|
65
|
+
/**
 * Report whether `<selftuneDir>/monitoring/latest-snapshot.json` is a recent
 * snapshot for the given skill.
 *
 * The snapshot qualifies when its `skill_name` equals `skillName` and the time
 * elapsed since its `timestamp` is at most `maxAgeHours` hours. A missing
 * file, unparseable content, or an unparseable timestamp all yield `false`.
 *
 * @param skillName Skill the snapshot must belong to.
 * @param selftuneDir Selftune directory containing the `monitoring` folder.
 * @param maxAgeHours Maximum accepted snapshot age, in hours.
 * @returns `true` only for a matching snapshot within the age limit.
 */
export function hasRecentWatchSnapshot(
  skillName: string,
  selftuneDir: string,
  maxAgeHours: number,
): boolean {
  const latestSnapshotFile = join(selftuneDir, "monitoring", "latest-snapshot.json");
  if (!existsSync(latestSnapshotFile)) {
    return false;
  }

  try {
    const { timestamp, skill_name: snapshotSkill } = JSON.parse(
      readFileSync(latestSnapshotFile, "utf-8"),
    ) as {
      timestamp: string;
      skill_name?: string;
    };

    // A snapshot taken for another skill says nothing about this one.
    if (snapshotSkill !== skillName) {
      return false;
    }

    // An invalid timestamp makes the age NaN, and NaN <= limit is false.
    const elapsedMs = Date.now() - new Date(timestamp).getTime();
    const limitMs = maxAgeHours * 60 * 60 * 1000;
    return elapsedMs <= limitMs;
  } catch {
    return false;
  }
}
|
|
94
|
+
|
|
95
|
+
// ---------------------------------------------------------------------------
|
|
96
|
+
// Guard result type
|
|
97
|
+
// ---------------------------------------------------------------------------
|
|
98
|
+
|
|
99
|
+
export interface GuardResult {
|
|
100
|
+
exitCode: number;
|
|
101
|
+
message: string;
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
// ---------------------------------------------------------------------------
|
|
105
|
+
// Core processing logic
|
|
106
|
+
// ---------------------------------------------------------------------------
|
|
107
|
+
|
|
108
|
+
export interface GuardOptions {
|
|
109
|
+
auditLogPath: string;
|
|
110
|
+
selftuneDir: string;
|
|
111
|
+
maxSnapshotAgeHours?: number;
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
/**
 * Decide whether a PreToolUse payload should be blocked.
 *
 * The write is allowed (`null`) unless all of the following hold:
 *  - the payload is a Write/Edit targeting a SKILL.md file,
 *  - the skill (named after the file's parent directory) is under active
 *    monitoring according to the audit log, and
 *  - no watch snapshot for that skill exists within `maxSnapshotAgeHours`
 *    (24 hours when the option is omitted).
 *
 * @param payload PreToolUse payload; only `tool_name` and
 *   `tool_input.file_path` are read.
 * @param options Audit log path, selftune directory and optional age limit.
 * @returns `null` to allow, or a `GuardResult` with `exitCode: 2` and a
 *   message telling the user to run `selftune watch` first.
 */
export function processEvolutionGuard(
  payload: PreToolUsePayload,
  options: GuardOptions,
): GuardResult | null {
  // A non-string file_path is treated as "no path", which never matches SKILL.md.
  const rawPath = payload.tool_input?.file_path;
  const filePath = typeof rawPath === "string" ? rawPath : "";

  if (!isSkillMdWrite(payload.tool_name, filePath)) {
    return null;
  }

  const skillName = extractSkillName(filePath);
  const maxSnapshotAgeHours =
    options.maxSnapshotAgeHours === undefined ? 24 : options.maxSnapshotAgeHours;

  // Unmonitored skills, and monitored skills with a fresh snapshot, may be edited.
  const isMonitored = checkActiveMonitoring(skillName, options.auditLogPath);
  if (!isMonitored) {
    return null;
  }
  const hasFreshSnapshot = hasRecentWatchSnapshot(skillName, options.selftuneDir, maxSnapshotAgeHours);
  if (hasFreshSnapshot) {
    return null;
  }

  // Block: skill is monitored but no recent watch
  const message = `[selftune] Skill "${skillName}" has a deployed evolution and is under active monitoring. Run \`selftune watch --skill ${skillName}\` before modifying SKILL.md to check current health.`;
  return { exitCode: 2, message };
}
|
|
142
|
+
|
|
143
|
+
// ---------------------------------------------------------------------------
|
|
144
|
+
// stdin main (only when executed directly, not when imported)
|
|
145
|
+
// ---------------------------------------------------------------------------
|
|
146
|
+
|
|
147
|
+
if (import.meta.main) {
|
|
148
|
+
try {
|
|
149
|
+
const payload: PreToolUsePayload = JSON.parse(await Bun.stdin.text());
|
|
150
|
+
|
|
151
|
+
const result = processEvolutionGuard(payload, {
|
|
152
|
+
auditLogPath: EVOLUTION_AUDIT_LOG,
|
|
153
|
+
selftuneDir: SELFTUNE_CONFIG_DIR,
|
|
154
|
+
});
|
|
155
|
+
|
|
156
|
+
if (result) {
|
|
157
|
+
// Exit code 2 = block with message
|
|
158
|
+
process.stderr.write(`${result.message}\n`);
|
|
159
|
+
process.exit(2);
|
|
160
|
+
}
|
|
161
|
+
} catch {
|
|
162
|
+
// Fail-open: any error → allow the write
|
|
163
|
+
}
|
|
164
|
+
process.exit(0);
|
|
165
|
+
}
|