selftune 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. package/CHANGELOG.md +23 -0
  2. package/README.md +259 -0
  3. package/bin/selftune.cjs +29 -0
  4. package/cli/selftune/constants.ts +71 -0
  5. package/cli/selftune/eval/hooks-to-evals.ts +422 -0
  6. package/cli/selftune/evolution/audit.ts +44 -0
  7. package/cli/selftune/evolution/deploy-proposal.ts +244 -0
  8. package/cli/selftune/evolution/evolve.ts +406 -0
  9. package/cli/selftune/evolution/extract-patterns.ts +145 -0
  10. package/cli/selftune/evolution/propose-description.ts +146 -0
  11. package/cli/selftune/evolution/rollback.ts +242 -0
  12. package/cli/selftune/evolution/stopping-criteria.ts +69 -0
  13. package/cli/selftune/evolution/validate-proposal.ts +137 -0
  14. package/cli/selftune/grading/grade-session.ts +459 -0
  15. package/cli/selftune/hooks/prompt-log.ts +52 -0
  16. package/cli/selftune/hooks/session-stop.ts +54 -0
  17. package/cli/selftune/hooks/skill-eval.ts +73 -0
  18. package/cli/selftune/index.ts +104 -0
  19. package/cli/selftune/ingestors/codex-rollout.ts +416 -0
  20. package/cli/selftune/ingestors/codex-wrapper.ts +332 -0
  21. package/cli/selftune/ingestors/opencode-ingest.ts +565 -0
  22. package/cli/selftune/init.ts +297 -0
  23. package/cli/selftune/monitoring/watch.ts +328 -0
  24. package/cli/selftune/observability.ts +255 -0
  25. package/cli/selftune/types.ts +255 -0
  26. package/cli/selftune/utils/jsonl.ts +75 -0
  27. package/cli/selftune/utils/llm-call.ts +192 -0
  28. package/cli/selftune/utils/logging.ts +40 -0
  29. package/cli/selftune/utils/schema-validator.ts +47 -0
  30. package/cli/selftune/utils/seeded-random.ts +31 -0
  31. package/cli/selftune/utils/transcript.ts +260 -0
  32. package/package.json +29 -0
  33. package/skill/SKILL.md +120 -0
  34. package/skill/Workflows/Doctor.md +145 -0
  35. package/skill/Workflows/Evals.md +193 -0
  36. package/skill/Workflows/Evolve.md +159 -0
  37. package/skill/Workflows/Grade.md +157 -0
  38. package/skill/Workflows/Ingest.md +159 -0
  39. package/skill/Workflows/Initialize.md +125 -0
  40. package/skill/Workflows/Rollback.md +131 -0
  41. package/skill/Workflows/Watch.md +128 -0
  42. package/skill/references/grading-methodology.md +176 -0
  43. package/skill/references/invocation-taxonomy.md +144 -0
  44. package/skill/references/logs.md +168 -0
  45. package/skill/settings_snippet.json +41 -0
@@ -0,0 +1,422 @@
1
+ #!/usr/bin/env bun
2
+ /**
3
+ * hooks-to-evals.ts
4
+ *
5
+ * Converts hook logs into trigger eval sets compatible with run_eval / run_loop.
6
+ *
7
+ * Three input logs (all written automatically by hooks):
8
+ * ~/.claude/skill_usage_log.jsonl - queries that DID trigger a skill
9
+ * ~/.claude/all_queries_log.jsonl - ALL queries, triggered or not
10
+ * ~/.claude/session_telemetry_log.jsonl - per-session process metrics (Stop hook)
11
+ *
12
+ * For a given skill:
13
+ * Positives (should_trigger=true) -> queries in skill_usage_log for that skill
14
+ * Negatives (should_trigger=false) -> queries in all_queries_log that never triggered
15
+ * that skill (cross-skill AND untriggered queries)
16
+ */
17
+
18
+ import { writeFileSync } from "node:fs";
19
+ import { parseArgs } from "node:util";
20
+ import { GENERIC_NEGATIVES, QUERY_LOG, SKILL_LOG, TELEMETRY_LOG } from "../constants.js";
21
+ import type {
22
+ EvalEntry,
23
+ InvocationType,
24
+ QueryLogRecord,
25
+ SessionTelemetryRecord,
26
+ SkillUsageRecord,
27
+ } from "../types.js";
28
+ import { readJsonl } from "../utils/jsonl.js";
29
+ import { seededShuffle } from "../utils/seeded-random.js";
30
+
31
+ // ---------------------------------------------------------------------------
32
+ // Query truncation
33
+ // ---------------------------------------------------------------------------
34
+
35
/** Maximum number of UTF-16 code units of a query that are kept in an eval entry. */
export const MAX_QUERY_LENGTH = 500;

/**
 * Clamp a query to at most MAX_QUERY_LENGTH UTF-16 code units.
 *
 * Queries at or under the limit are returned unchanged. When the cut point
 * would land between the two halves of a surrogate pair, the cut is moved
 * back by one code unit so the result never ends in a dangling high surrogate.
 *
 * @param query - The full query text.
 * @returns The query, shortened if it exceeded the limit.
 */
function truncateQuery(query: string): string {
  if (query.length <= MAX_QUERY_LENGTH) return query;

  let end = MAX_QUERY_LENGTH;
  const lastKept = query.charCodeAt(end - 1);
  const firstDropped = query.charCodeAt(end);
  const endsOnHighSurrogate = lastKept >= 0xd800 && lastKept <= 0xdbff;
  const dropsLowSurrogate = firstDropped >= 0xdc00 && firstDropped <= 0xdfff;
  // Only back off when a real pair is being split; a lone surrogate is kept as-is.
  if (endsOnHighSurrogate && dropsLowSurrogate) end -= 1;

  return query.slice(0, end);
}
40
+
41
+ // ---------------------------------------------------------------------------
42
+ // Invocation taxonomy classifier
43
+ // ---------------------------------------------------------------------------
44
+
45
/**
 * Classify how a query relates to a skill using text heuristics.
 *
 * - "explicit": the query names the skill (directly, via `$skill`, via all of
 *   its hyphen-separated parts, or via its hyphen-less camelCase spelling).
 * - "contextual": the query is long (> 15 words) or contains a capitalized
 *   word, a temporal reference, a filename-like token, or an email-like token;
 *   or it has at least 10 words plus a multi-digit number or uppercase run.
 * - "implicit": none of the above.
 *
 * @param query - The user query text.
 * @param skillName - The skill the query is being classified against.
 * @returns The invocation type for the query.
 */
export function classifyInvocation(query: string, skillName: string): InvocationType {
  const qLower = query.toLowerCase();
  const skillLower = skillName.toLowerCase();

  // --- Explicit checks ---

  // Explicit: mentions the skill name. The lowercase name check also covers the
  // `$skill` syntax, because any text containing "$name" contains "name".
  if (query.includes(`$${skillName}`) || qLower.includes(skillLower)) {
    return "explicit";
  }

  // Handle hyphenated skill names: check if all parts appear
  if (skillLower.includes("-")) {
    const parts = skillLower.split("-");
    if (parts.every((part) => qLower.includes(part))) {
      return "explicit";
    }
  }

  // Convert skill-name to camelCase and check. The candidate contains uppercase
  // letters, so it must be compared case-insensitively: testing it verbatim
  // against the lowercased query could never match.
  const camelCase = skillLower.replace(/-([a-z])/g, (_, c) => c.toUpperCase());
  if (camelCase !== skillLower && qLower.includes(camelCase.toLowerCase())) {
    return "explicit";
  }

  // --- Contextual checks ---

  // Trim first so leading/trailing whitespace does not add empty tokens
  // to the word count.
  const trimmed = query.trim();
  const wordCount = trimmed === "" ? 0 : trimmed.split(/\s+/).length;
  const hasProperNoun = /\b[A-Z][a-z]{2,}\b/.test(query);

  // Temporal references suggest domain context
  const hasTemporalRef =
    /\b(next week|last week|tomorrow|yesterday|Q[1-4]|monday|tuesday|wednesday|thursday|friday|january|february|march|april|may|june|july|august|september|october|november|december)\b/i.test(
      query,
    );

  // Filenames suggest contextual usage
  const hasFilename = /\b\w+\.\w{2,4}\b/.test(query);

  // Email addresses suggest contextual usage
  const hasEmail = /\b\S+@\S+\.\S+\b/.test(query);

  if (wordCount > 15 || hasProperNoun || hasTemporalRef || hasFilename || hasEmail) {
    return "contextual";
  }

  // Borderline: 10-15 words with domain signals (multi-digit numbers, uppercase acronyms)
  const hasDomainSignal = /\b\d{2,}\b/.test(query) || /[A-Z]{2,}/.test(query);
  if (wordCount >= 10 && hasDomainSignal) {
    return "contextual";
  }

  return "implicit";
}
103
+
104
+ // ---------------------------------------------------------------------------
105
+ // Build eval set
106
+ // ---------------------------------------------------------------------------
107
+
108
/**
 * Assemble a trigger eval set for one skill from the hook logs.
 *
 * Positives are the distinct, trimmed queries logged for `skillName`
 * (skipping empty text and the "(query not found)" placeholder). Negatives are
 * distinct, trimmed queries from the query log that are not positives; when
 * there are fewer negatives than positives, entries from GENERIC_NEGATIVES are
 * appended to make up the difference where available. Each side is shuffled
 * with the seed and capped at `maxPerSide`.
 *
 * @param skillRecords - Records of queries that triggered a skill.
 * @param queryRecords - Records of all logged queries.
 * @param skillName - Skill to build the eval set for.
 * @param maxPerSide - Cap per side; NaN or non-positive values fall back to 50.
 * @param includeNegatives - When false, only positives are returned.
 * @param seed - Shuffle seed; NaN falls back to 42.
 * @param annotateTaxonomy - When true, sets `invocation_type` on every entry.
 * @returns Positives followed by negatives.
 */
export function buildEvalSet(
  skillRecords: SkillUsageRecord[],
  queryRecords: QueryLogRecord[],
  skillName: string,
  maxPerSide = 50,
  includeNegatives = true,
  seed = 42,
  annotateTaxonomy = true,
): EvalEntry[] {
  const sideLimit = !Number.isNaN(maxPerSide) && maxPerSide > 0 ? maxPerSide : 50;
  const shuffleSeed = Number.isNaN(seed) ? 42 : seed;

  // One pass collects both the positive texts (used later to exclude them from
  // the negatives) and the positive entries in first-seen order.
  const positiveTexts = new Set<string>();
  const positiveEntries: EvalEntry[] = [];
  for (const record of skillRecords) {
    if (!record || typeof record.skill_name !== "string" || typeof record.query !== "string") {
      continue;
    }
    if (record.skill_name !== skillName) continue;

    const text = record.query.trim();
    if (text === "" || text === "(query not found)" || positiveTexts.has(text)) continue;
    positiveTexts.add(text);

    const positive: EvalEntry = { query: truncateQuery(text), should_trigger: true };
    if (annotateTaxonomy) {
      // Classification uses the untruncated text.
      positive.invocation_type = classifyInvocation(text, skillName);
    }
    positiveEntries.push(positive);
  }

  const chosenPositives = seededShuffle(positiveEntries, shuffleSeed).slice(0, sideLimit);
  if (!includeNegatives) {
    return [...chosenPositives];
  }

  // Shared constructor for both logged and generic negatives.
  const asNegative = (text: string): EvalEntry => {
    const negative: EvalEntry = { query: text, should_trigger: false };
    if (annotateTaxonomy) {
      negative.invocation_type = "negative";
    }
    return negative;
  };

  const negativeTexts = new Set<string>();
  const negativePool: string[] = [];
  for (const record of queryRecords) {
    if (!record || typeof record.query !== "string") continue;
    const text = record.query.trim();
    if (text === "" || positiveTexts.has(text) || negativeTexts.has(text)) continue;
    negativeTexts.add(text);
    negativePool.push(text);
  }

  const chosenNegatives = seededShuffle(negativePool, shuffleSeed)
    .slice(0, sideLimit)
    .map((text) => asNegative(truncateQuery(text)));

  // Pad with generic fallbacks if needed
  const shortfall = chosenPositives.length - chosenNegatives.length;
  if (shortfall > 0) {
    let added = 0;
    for (const text of GENERIC_NEGATIVES) {
      if (added >= shortfall) break;
      if (negativeTexts.has(text) || positiveTexts.has(text)) continue;
      chosenNegatives.push(asNegative(text));
      added += 1;
    }
  }

  return [...chosenPositives, ...chosenNegatives];
}
189
+
190
+ // ---------------------------------------------------------------------------
191
+ // List skills
192
+ // ---------------------------------------------------------------------------
193
+
194
/**
 * Print a summary of what the three hook logs currently contain.
 *
 * Shows per-skill trigger counts (highest first; records without a skill name
 * are counted under "unknown"), then the number of logged queries and the
 * number of telemetry sessions, with a hint whenever a log is empty.
 *
 * @param skillRecords - Records from the skill usage log.
 * @param queryRecords - Records from the all-queries log.
 * @param telemetryRecords - Records from the session telemetry log.
 */
export function listSkills(
  skillRecords: SkillUsageRecord[],
  queryRecords: QueryLogRecord[],
  telemetryRecords: SessionTelemetryRecord[],
): void {
  const triggerTotals = new Map<string, number>();
  for (const record of skillRecords) {
    const key = record.skill_name ?? "unknown";
    const previous = triggerTotals.get(key) ?? 0;
    triggerTotals.set(key, previous + 1);
  }

  console.log(`Skill triggers in skill_usage_log (${skillRecords.length} total records):`);
  if (triggerTotals.size === 0) {
    console.log(" (none yet -- trigger some skills in Claude Code to populate)");
  } else {
    const ranked = Array.from(triggerTotals).sort((left, right) => right[1] - left[1]);
    for (const [skill, total] of ranked) {
      console.log(` ${skill.padEnd(30)} ${String(total).padStart(4)} triggers`);
    }
  }

  const queryTotal = queryRecords.length;
  console.log(`\nAll queries in all_queries_log: ${queryTotal}`);
  if (queryTotal === 0) {
    console.log(" (none yet -- make sure prompt_log_hook is installed)");
  }

  const sessionTotal = telemetryRecords.length;
  console.log(`\nSessions in session_telemetry_log: ${sessionTotal}`);
  if (sessionTotal === 0) {
    console.log(" (none yet -- make sure session_stop_hook is installed)");
  }
}
225
+
226
+ // ---------------------------------------------------------------------------
227
+ // Telemetry stats
228
+ // ---------------------------------------------------------------------------
229
+
230
/**
 * Print process telemetry for the sessions in which a skill was triggered.
 *
 * Reports average/min/max assistant turns and errors, the average number of
 * bash commands, per-tool call averages (highest first), and a warning list of
 * sessions with more than two errors. Missing record fields are treated as
 * zero / empty; a missing session id or transcript path is printed as "?".
 *
 * @param telemetryRecords - Records from the session telemetry log.
 * @param skillName - Skill whose sessions should be summarized.
 */
export function showTelemetryStats(
  telemetryRecords: SessionTelemetryRecord[],
  skillName: string,
): void {
  const sessions = telemetryRecords.filter((r) => (r.skills_triggered ?? []).includes(skillName));

  if (sessions.length === 0) {
    console.log(`No telemetry sessions found for skill '${skillName}'.`);
    console.log("Make sure session_stop_hook is installed.");
    return;
  }

  console.log(`Process telemetry for skill '${skillName}' (${sessions.length} sessions):\n`);

  const allTools = new Map<string, number[]>();
  const allTurns: number[] = [];
  const allErrors: number[] = [];
  const allBashCounts: number[] = [];

  for (const s of sessions) {
    for (const [tool, count] of Object.entries(s.tool_calls ?? {})) {
      if (!allTools.has(tool)) allTools.set(tool, []);
      allTools.get(tool)?.push(count);
    }
    allTurns.push(s.assistant_turns ?? 0);
    allErrors.push(s.errors_encountered ?? 0);
    allBashCounts.push((s.bash_commands ?? []).length);
  }

  const avg = (lst: number[]) => (lst.length > 0 ? lst.reduce((a, b) => a + b, 0) / lst.length : 0);
  // Folding instead of spreading avoids passing one call argument per session.
  const min = (lst: number[]) => lst.reduce((a, b) => Math.min(a, b), Number.POSITIVE_INFINITY);
  const max = (lst: number[]) => lst.reduce((a, b) => Math.max(a, b), Number.NEGATIVE_INFINITY);

  console.log(
    ` Assistant turns: avg ${avg(allTurns).toFixed(1)} (min ${min(allTurns)}, max ${max(allTurns)})`,
  );
  console.log(
    ` Errors: avg ${avg(allErrors).toFixed(1)} (min ${min(allErrors)}, max ${max(allErrors)})`,
  );
  console.log(` Bash commands: avg ${avg(allBashCounts).toFixed(1)}`);
  console.log();
  console.log(" Tool call averages:");

  const sortedTools = [...allTools.entries()].sort((a, b) => avg(b[1]) - avg(a[1]));
  for (const [tool, counts] of sortedTools) {
    console.log(` ${tool.padEnd(20)} avg ${avg(counts).toFixed(1)}`);
  }

  // Flag high-error sessions
  const highError = sessions.filter((s) => (s.errors_encountered ?? 0) > 2);
  if (highError.length > 0) {
    console.log(
      `\n WARNING: ${highError.length} session(s) had >2 errors -- inspect transcripts:`,
    );
    for (const s of highError) {
      // Every other field is read defensively; do the same for the id so a
      // record without one cannot crash the report.
      const sessionId = typeof s.session_id === "string" ? s.session_id : "?";
      console.log(
        ` session ${sessionId.slice(0, 12)}... -- ${s.errors_encountered} errors, transcript: ${s.transcript_path ?? "?"}`,
      );
    }
  }
}
289
+
290
+ // ---------------------------------------------------------------------------
291
+ // Print eval stats
292
+ // ---------------------------------------------------------------------------
293
+
294
/**
 * Print a summary of a freshly written eval set plus suggested next commands.
 *
 * Reports how many positives and negatives were written, optionally breaks the
 * positives down by invocation type (with tips for missing "explicit" or
 * "contextual" types), warns when either side is empty, and finishes with
 * example run-eval / run-loop invocations.
 *
 * @param evalSet - The entries that were written.
 * @param skillName - Skill the eval set targets.
 * @param outputPath - Where the eval set was written.
 * @param skillRecords - Records from the skill usage log.
 * @param queryRecords - Records from the all-queries log.
 * @param annotateTaxonomy - Whether entries carry `invocation_type`.
 */
export function printEvalStats(
  evalSet: EvalEntry[],
  skillName: string,
  outputPath: string,
  skillRecords: SkillUsageRecord[],
  queryRecords: QueryLogRecord[],
  annotateTaxonomy: boolean,
): void {
  const positiveEntries = evalSet.filter((item) => item.should_trigger);
  const negativeCount = evalSet.filter((item) => !item.should_trigger).length;
  const loggedTriggers = skillRecords.reduce(
    (total, record) => (record.skill_name === skillName ? total + 1 : total),
    0,
  );

  console.log(`Wrote ${evalSet.length} eval entries to ${outputPath}`);
  console.log(
    ` Positives (should_trigger=true) : ${positiveEntries.length} (from ${loggedTriggers} logged triggers)`,
  );
  console.log(
    ` Negatives (should_trigger=false): ${negativeCount} (from ${queryRecords.length} total logged queries)`,
  );

  // Breakdown of positives by invocation type, with tips for missing types.
  const reportTaxonomy = (): void => {
    const tally = new Map<string, number>();
    positiveEntries.forEach((item) => {
      const label = item.invocation_type ?? "?";
      tally.set(label, (tally.get(label) ?? 0) + 1);
    });

    console.log("\n Positive invocation types:");
    for (const [label, total] of Array.from(tally).sort()) {
      console.log(` ${label.padEnd(15)} ${total}`);
    }
    if (!tally.has("explicit")) {
      console.log("\n [TIP] No explicit positives (queries naming the skill directly).");
      console.log(" Consider adding some for a complete taxonomy.");
    }
    if (!tally.has("contextual")) {
      console.log("\n [TIP] No contextual positives (implicit + domain noise).");
      console.log(" These are important for realistic triggering tests.");
    }
  };

  if (annotateTaxonomy && positiveEntries.length > 0) {
    reportTaxonomy();
  }

  console.log();
  if (positiveEntries.length === 0) {
    console.log(`[WARN] No positives for skill '${skillName}'.`);
    const knownSkills = Array.from(new Set(skillRecords.map((record) => record.skill_name))).sort();
    if (knownSkills.length > 0) {
      console.log(` Known skills: ${knownSkills.join(", ")}`);
    }
  }
  if (negativeCount === 0) {
    console.log("[WARN] No negatives -- install prompt_log_hook for real negatives.");
  }

  const evalSetFlag = ` --eval-set ${outputPath} \\`;
  const skillPathFlag = ` --skill-path /path/to/skills/${skillName} \\`;

  console.log("Next steps:");
  console.log(" bun run cli/selftune/eval/run-eval.ts \\");
  console.log(evalSetFlag);
  console.log(skillPathFlag);
  console.log(" --runs-per-query 3 --verbose");
  console.log();
  console.log(" bun run cli/selftune/eval/run-loop.ts \\");
  console.log(evalSetFlag);
  console.log(skillPathFlag);
  console.log(" --max-iterations 5 --verbose");
}
357
+
358
+ // ---------------------------------------------------------------------------
359
+ // CLI entry point
360
+ // ---------------------------------------------------------------------------
361
+
362
/**
 * CLI entry point.
 *
 * Parses the command-line flags, loads the three hook logs, and then does one
 * of the following:
 *   --list-skills : print what the logs contain and exit 0
 *   --stats       : print telemetry for --skill and exit 0
 *   otherwise     : build the eval set for --skill, write it as JSON, and print
 *                   a summary
 *
 * Exits with status 1 when --skill is missing and --list-skills was not given.
 */
export function cliMain(): void {
  const { values: flags } = parseArgs({
    strict: true,
    options: {
      skill: { type: "string" },
      output: { type: "string" },
      max: { type: "string", default: "50" },
      seed: { type: "string", default: "42" },
      "list-skills": { type: "boolean", default: false },
      stats: { type: "boolean", default: false },
      "no-negatives": { type: "boolean", default: false },
      "no-taxonomy": { type: "boolean", default: false },
      "skill-log": { type: "string", default: SKILL_LOG },
      "query-log": { type: "string", default: QUERY_LOG },
      "telemetry-log": { type: "string", default: TELEMETRY_LOG },
    },
  });

  // All three logs are loaded up front, before any mode is selected.
  const skillLogPath = flags["skill-log"] ?? SKILL_LOG;
  const queryLogPath = flags["query-log"] ?? QUERY_LOG;
  const telemetryLogPath = flags["telemetry-log"] ?? TELEMETRY_LOG;
  const skillRecords = readJsonl<SkillUsageRecord>(skillLogPath);
  const queryRecords = readJsonl<QueryLogRecord>(queryLogPath);
  const telemetryRecords = readJsonl<SessionTelemetryRecord>(telemetryLogPath);

  if (flags["list-skills"]) {
    listSkills(skillRecords, queryRecords, telemetryRecords);
    process.exit(0);
  }

  const skillName = flags.skill;
  if (!skillName) {
    console.error("[ERROR] --skill required (or use --list-skills)");
    process.exit(1);
  }

  if (flags.stats) {
    showTelemetryStats(telemetryRecords, skillName);
    process.exit(0);
  }

  // Unparseable numbers become NaN here; buildEvalSet substitutes its defaults.
  const perSideCap = Number.parseInt(flags.max ?? "50", 10);
  const shuffleSeed = Number.parseInt(flags.seed ?? "42", 10);
  const withNegatives = !flags["no-negatives"];
  const withTaxonomy = !flags["no-taxonomy"];

  const evalSet = buildEvalSet(
    skillRecords,
    queryRecords,
    skillName,
    perSideCap,
    withNegatives,
    shuffleSeed,
    withTaxonomy,
  );

  const destination = flags.output ?? `${skillName}_trigger_eval.json`;
  writeFileSync(destination, JSON.stringify(evalSet, null, 2), "utf-8");
  printEvalStats(evalSet, skillName, destination, skillRecords, queryRecords, withTaxonomy);
}

// Run the CLI only when this file is the program entry point.
if (import.meta.main) {
  cliMain();
}
@@ -0,0 +1,44 @@
1
+ /**
2
+ * Evolution audit trail: append, read, and query audit entries.
3
+ */
4
+
5
+ import { EVOLUTION_AUDIT_LOG } from "../constants.js";
6
+ import type { EvolutionAuditEntry } from "../types.js";
7
+ import { appendJsonl, readJsonl } from "../utils/jsonl.js";
8
+
9
/**
 * Record one entry in the evolution audit log.
 *
 * @param entry - The audit entry to append.
 * @param logPath - Log file to append to; defaults to EVOLUTION_AUDIT_LOG.
 */
export function appendAuditEntry(
  entry: EvolutionAuditEntry,
  logPath: string = EVOLUTION_AUDIT_LOG,
): void {
  const destination = logPath;
  appendJsonl(destination, entry);
}
16
+
17
/**
 * Load the audit trail, optionally narrowed to one skill.
 *
 * Without a skill name (undefined or empty) every entry is returned. With one,
 * only entries whose `details` text contains the name are kept; the comparison
 * ignores case, and an entry with no `details` is treated as empty text.
 *
 * @param skillName - Optional skill name to filter by.
 * @param logPath - Log file to read; defaults to EVOLUTION_AUDIT_LOG.
 * @returns The matching audit entries, in log order.
 */
export function readAuditTrail(
  skillName?: string,
  logPath: string = EVOLUTION_AUDIT_LOG,
): EvolutionAuditEntry[] {
  const trail = readJsonl<EvolutionAuditEntry>(logPath);
  if (!skillName) {
    return trail;
  }

  const wanted = skillName.toLowerCase();
  const mentionsSkill = (entry: EvolutionAuditEntry): boolean => {
    const haystack = (entry.details ?? "").toLowerCase();
    return haystack.includes(wanted);
  };
  return trail.filter(mentionsSkill);
}
32
+
33
/**
 * Find the latest audit entry with action "deployed" for a skill.
 *
 * Entries are taken from readAuditTrail for the skill; the last one in log
 * order whose action is "deployed" wins.
 *
 * @param skillName - Skill to look up.
 * @param logPath - Log file to read; defaults to EVOLUTION_AUDIT_LOG.
 * @returns The most recent deployed entry, or null when there is none.
 */
export function getLastDeployedProposal(
  skillName: string,
  logPath: string = EVOLUTION_AUDIT_LOG,
): EvolutionAuditEntry | null {
  const trail = readAuditTrail(skillName, logPath);
  // Fold left-to-right so a later deployed entry replaces an earlier one.
  return trail.reduce<EvolutionAuditEntry | null>(
    (latest, entry) => (entry.action === "deployed" ? entry : latest),
    null,
  );
}