selftune 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +23 -0
- package/README.md +259 -0
- package/bin/selftune.cjs +29 -0
- package/cli/selftune/constants.ts +71 -0
- package/cli/selftune/eval/hooks-to-evals.ts +422 -0
- package/cli/selftune/evolution/audit.ts +44 -0
- package/cli/selftune/evolution/deploy-proposal.ts +244 -0
- package/cli/selftune/evolution/evolve.ts +406 -0
- package/cli/selftune/evolution/extract-patterns.ts +145 -0
- package/cli/selftune/evolution/propose-description.ts +146 -0
- package/cli/selftune/evolution/rollback.ts +242 -0
- package/cli/selftune/evolution/stopping-criteria.ts +69 -0
- package/cli/selftune/evolution/validate-proposal.ts +137 -0
- package/cli/selftune/grading/grade-session.ts +459 -0
- package/cli/selftune/hooks/prompt-log.ts +52 -0
- package/cli/selftune/hooks/session-stop.ts +54 -0
- package/cli/selftune/hooks/skill-eval.ts +73 -0
- package/cli/selftune/index.ts +104 -0
- package/cli/selftune/ingestors/codex-rollout.ts +416 -0
- package/cli/selftune/ingestors/codex-wrapper.ts +332 -0
- package/cli/selftune/ingestors/opencode-ingest.ts +565 -0
- package/cli/selftune/init.ts +297 -0
- package/cli/selftune/monitoring/watch.ts +328 -0
- package/cli/selftune/observability.ts +255 -0
- package/cli/selftune/types.ts +255 -0
- package/cli/selftune/utils/jsonl.ts +75 -0
- package/cli/selftune/utils/llm-call.ts +192 -0
- package/cli/selftune/utils/logging.ts +40 -0
- package/cli/selftune/utils/schema-validator.ts +47 -0
- package/cli/selftune/utils/seeded-random.ts +31 -0
- package/cli/selftune/utils/transcript.ts +260 -0
- package/package.json +29 -0
- package/skill/SKILL.md +120 -0
- package/skill/Workflows/Doctor.md +145 -0
- package/skill/Workflows/Evals.md +193 -0
- package/skill/Workflows/Evolve.md +159 -0
- package/skill/Workflows/Grade.md +157 -0
- package/skill/Workflows/Ingest.md +159 -0
- package/skill/Workflows/Initialize.md +125 -0
- package/skill/Workflows/Rollback.md +131 -0
- package/skill/Workflows/Watch.md +128 -0
- package/skill/references/grading-methodology.md +176 -0
- package/skill/references/invocation-taxonomy.md +144 -0
- package/skill/references/logs.md +168 -0
- package/skill/settings_snippet.json +41 -0
|
@@ -0,0 +1,297 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
/**
|
|
3
|
+
* selftune init — Bootstrap agent identity and write config.
|
|
4
|
+
*
|
|
5
|
+
* Detects the coding agent environment, resolves the CLI path,
|
|
6
|
+
* determines LLM mode, checks hook installation, and writes
|
|
7
|
+
* the result to ~/.selftune/config.json.
|
|
8
|
+
*
|
|
9
|
+
* Usage:
|
|
10
|
+
* selftune init [--agent <type>] [--cli-path <path>] [--llm-mode <mode>] [--force]
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
|
|
14
|
+
import { homedir } from "node:os";
|
|
15
|
+
import { dirname, join, resolve } from "node:path";
|
|
16
|
+
import { fileURLToPath } from "node:url";
|
|
17
|
+
import { parseArgs } from "node:util";
|
|
18
|
+
|
|
19
|
+
import { SELFTUNE_CONFIG_DIR, SELFTUNE_CONFIG_PATH } from "./constants.js";
|
|
20
|
+
import type { SelftuneConfig } from "./types.js";
|
|
21
|
+
import { detectAgent } from "./utils/llm-call.js";
|
|
22
|
+
|
|
23
|
+
// ---------------------------------------------------------------------------
|
|
24
|
+
// Agent type detection
|
|
25
|
+
// ---------------------------------------------------------------------------
|
|
26
|
+
|
|
27
|
+
/** Agent identifiers accepted as an explicit override and as return values of detection. */
const VALID_AGENT_TYPES: readonly SelftuneConfig["agent_type"][] = [
  "claude_code",
  "codex",
  "opencode",
  "unknown",
];

/** Type guard: true when `value` is one of the entries of `VALID_AGENT_TYPES`. */
function isAgentType(value: string): value is SelftuneConfig["agent_type"] {
  return (VALID_AGENT_TYPES as readonly string[]).includes(value);
}

/**
 * Detect which coding agent environment we are running inside.
 *
 * Detection order:
 *   1. Claude Code — `<home>/.claude` exists AND (`claude` is on PATH OR
 *      `$CLAUDE_CODE_ENTRYPOINT` is set)
 *   2. Codex — `$CODEX_HOME` is set OR `codex` is on PATH
 *   3. OpenCode — `<home>/.local/share/opencode/opencode.db` exists OR
 *      `opencode` is on PATH
 *   4. `"unknown"` fallback
 *
 * @param override - Explicit agent type. Returned as-is when it is a valid
 *   agent type; otherwise a warning is written to stderr and detection runs.
 * @param homeOverride - Home directory to probe instead of `os.homedir()`.
 * @returns The overridden or detected agent type.
 */
export function detectAgentType(
  override?: string,
  homeOverride?: string,
): SelftuneConfig["agent_type"] {
  if (override) {
    if (isAgentType(override)) {
      return override;
    }
    console.error(`[WARN] Unknown agent type "${override}", falling back to detection`);
  }

  const home = homeOverride ?? homedir();

  // Claude Code: the .claude directory alone is not enough; a binary or env signal is also required.
  const claudeDir = join(home, ".claude");
  if (existsSync(claudeDir)) {
    if (Bun.which("claude") || process.env.CLAUDE_CODE_ENTRYPOINT) {
      return "claude_code";
    }
  }

  // Codex: env var or binary
  if (process.env.CODEX_HOME || Bun.which("codex")) {
    return "codex";
  }

  // OpenCode: db file or binary
  const opencodeDb = join(home, ".local", "share", "opencode", "opencode.db");
  if (existsSync(opencodeDb) || Bun.which("opencode")) {
    return "opencode";
  }

  return "unknown";
}
|
|
77
|
+
|
|
78
|
+
// ---------------------------------------------------------------------------
|
|
79
|
+
// CLI path resolution
|
|
80
|
+
// ---------------------------------------------------------------------------
|
|
81
|
+
|
|
82
|
+
/**
 * Resolve the path to cli/selftune/index.ts.
 *
 * @param override - Explicit path; when non-empty it is returned verbatim
 *   (it is not normalised or made absolute).
 * @returns `override` if given, otherwise the absolute path of `index.ts`
 *   in the directory that contains this module (init.ts lives alongside index.ts).
 */
export function determineCliPath(override?: string): string {
  if (override) return override;
  // Derive this module's location from `import.meta.url` rather than the
  // Bun-only `import.meta.path`, so the default also resolves on runtimes
  // that do not provide `import.meta.path`.
  const thisFile = fileURLToPath(import.meta.url);
  return resolve(dirname(thisFile), "index.ts");
}
|
|
90
|
+
|
|
91
|
+
// ---------------------------------------------------------------------------
|
|
92
|
+
// LLM mode determination
|
|
93
|
+
// ---------------------------------------------------------------------------
|
|
94
|
+
|
|
95
|
+
/**
 * Choose the LLM mode and the agent CLI to record in the config.
 *
 * Precedence:
 *   1. A valid `modeOverride` wins; `agentCli` is passed through unchanged.
 *   2. A detected agent CLI selects `"agent"` mode.
 *   3. An available API key selects `"api"` mode with no agent CLI.
 *   4. Otherwise `"agent"` mode with a null CLI (will need setup).
 *
 * @param agentCli - Detected agent CLI, or null when none was found.
 * @param hasApiKey - Whether an API key is available.
 * @param modeOverride - Explicit mode; must be `"agent"` or `"api"` when non-empty.
 * @throws Error when `modeOverride` is non-empty and not an allowed mode.
 */
export function determineLlmMode(
  agentCli: string | null,
  hasApiKey?: boolean,
  modeOverride?: string,
): { llm_mode: "agent" | "api"; agent_cli: string | null } {
  const validModes = ["agent", "api"] as const;
  type Mode = (typeof validModes)[number];
  const isMode = (candidate: string): candidate is Mode =>
    (validModes as readonly string[]).includes(candidate);

  if (modeOverride) {
    if (!isMode(modeOverride)) {
      throw new Error(
        `Invalid --llm-mode "${modeOverride}". Allowed values: ${validModes.join(", ")}`,
      );
    }
    return { llm_mode: modeOverride, agent_cli: agentCli };
  }

  if (agentCli) {
    return { llm_mode: "agent", agent_cli: agentCli };
  }

  // No agent CLI: use the API when a key exists, else fall back to agent mode pending setup.
  return { llm_mode: hasApiKey ? "api" : "agent", agent_cli: null };
}
|
|
127
|
+
|
|
128
|
+
// ---------------------------------------------------------------------------
|
|
129
|
+
// Hook detection (Claude Code only)
|
|
130
|
+
// ---------------------------------------------------------------------------
|
|
131
|
+
|
|
132
|
+
/** Hook keys that must each carry at least one selftune command. */
const REQUIRED_HOOK_KEYS = ["prompt-submit", "post-tool-use", "session-stop"] as const;

/**
 * True when `entry` is an object whose `command` string mentions "selftune",
 * or whose nested `hooks` array contains such an entry.
 *
 * Non-object entries (null, strings, numbers) are simply not a match, so one
 * malformed entry cannot mask a valid sibling.
 */
function entryReferencesSelftune(entry: unknown): boolean {
  if (typeof entry !== "object" || entry === null) return false;
  const { command, hooks } = entry as { command?: unknown; hooks?: unknown };
  if (typeof command === "string" && command.includes("selftune")) return true;
  // Claude Code's documented layout groups commands under a matcher object's
  // `hooks` array; accept that shape in addition to the flat `{ command }` one.
  return Array.isArray(hooks) && hooks.some((nested) => entryReferencesSelftune(nested));
}

/**
 * Check if the selftune hooks are configured in Claude Code settings.
 *
 * @param settingsPath - Path to the settings JSON file to inspect.
 * @returns true only when every key in `REQUIRED_HOOK_KEYS` maps to a
 *   non-empty array with at least one entry referencing selftune. Returns
 *   false when the file is missing, unreadable, not valid JSON, or has no
 *   `hooks` object.
 */
export function checkClaudeCodeHooks(settingsPath: string): boolean {
  if (!existsSync(settingsPath)) return false;

  try {
    const settings: unknown = JSON.parse(readFileSync(settingsPath, "utf-8"));
    if (typeof settings !== "object" || settings === null) return false;

    const hooks = (settings as { hooks?: unknown }).hooks;
    if (typeof hooks !== "object" || hooks === null) return false;

    const hooksByKey = hooks as Record<string, unknown>;
    return REQUIRED_HOOK_KEYS.every((key) => {
      const entries = hooksByKey[key];
      return Array.isArray(entries) && entries.some((entry) => entryReferencesSelftune(entry));
    });
  } catch {
    // Read or parse failure: treat as "hooks not installed" (best-effort check).
    return false;
  }
}
|
|
162
|
+
|
|
163
|
+
// ---------------------------------------------------------------------------
|
|
164
|
+
// Init options (for testability)
|
|
165
|
+
// ---------------------------------------------------------------------------
|
|
166
|
+
|
|
167
|
+
/** Inputs to `runInit`; paths and overrides are injectable so the flow can be tested. */
export interface InitOptions {
  /** Directory that holds the config file; created (recursively) before writing. */
  configDir: string;
  /** Path of the config JSON file that is read and/or written. */
  configPath: string;
  /** When false and a config already exists at `configPath`, that config is returned instead of a new one being written. */
  force: boolean;
  /** Explicit agent type, forwarded to `detectAgentType`. */
  agentOverride?: string;
  /** Explicit CLI path, forwarded to `determineCliPath`. */
  cliPathOverride?: string;
  /** Explicit LLM mode, forwarded to `determineLlmMode`. */
  llmModeOverride?: string;
  /** Home directory to use instead of `os.homedir()` for agent detection and the Claude settings lookup. */
  homeDir?: string;
}
|
|
176
|
+
|
|
177
|
+
// ---------------------------------------------------------------------------
|
|
178
|
+
// Core init logic
|
|
179
|
+
// ---------------------------------------------------------------------------
|
|
180
|
+
|
|
181
|
+
/**
 * Run the init flow and return the resulting config.
 *
 * When `force` is false and a config file already exists it is parsed and
 * returned untouched. Otherwise the agent type, CLI path, LLM mode and hook
 * status are determined, and the new config is written to `configPath`
 * (creating `configDir` if needed).
 *
 * Kept separate from the CLI entry point so it can be driven with injected
 * paths in tests; note that it reads and writes the file system.
 *
 * @throws Error when an existing config is reused but is not valid JSON, or
 *   when `llmModeOverride` is not an allowed mode.
 */
export function runInit(opts: InitOptions): SelftuneConfig {
  const { configDir, configPath, force } = opts;

  // Reuse path: an existing config is authoritative unless --force was given.
  const reuseExisting = !force && existsSync(configPath);
  if (reuseExisting) {
    const text = readFileSync(configPath, "utf-8");
    let existing: SelftuneConfig;
    try {
      existing = JSON.parse(text) as SelftuneConfig;
    } catch (err) {
      throw new Error(
        `Config file at ${configPath} contains invalid JSON. Delete it or use --force to reinitialize. Cause: ${err instanceof Error ? err.message : String(err)}`,
      );
    }
    return existing;
  }

  // Gather environment facts in the same order as the config fields.
  const detectedType = detectAgentType(opts.agentOverride, opts.homeDir);
  const resolvedCliPath = determineCliPath(opts.cliPathOverride);
  const mode = determineLlmMode(
    detectAgent(),
    Boolean(process.env.ANTHROPIC_API_KEY),
    opts.llmModeOverride,
  );

  // Hook status is only meaningful for Claude Code; every other agent reports false.
  const claudeSettings = join(opts.homeDir ?? homedir(), ".claude", "settings.json");
  const hooksPresent = detectedType === "claude_code" && checkClaudeCodeHooks(claudeSettings);

  const config: SelftuneConfig = {
    agent_type: detectedType,
    cli_path: resolvedCliPath,
    llm_mode: mode.llm_mode,
    agent_cli: mode.agent_cli,
    hooks_installed: hooksPresent,
    initialized_at: new Date().toISOString(),
  };

  // Persist the config.
  mkdirSync(configDir, { recursive: true });
  writeFileSync(configPath, JSON.stringify(config, null, 2), "utf-8");

  return config;
}
|
|
233
|
+
|
|
234
|
+
// ---------------------------------------------------------------------------
|
|
235
|
+
// CLI entry point
|
|
236
|
+
// ---------------------------------------------------------------------------
|
|
237
|
+
|
|
238
|
+
/**
 * CLI entry point for `selftune init`.
 *
 * Parses `--agent`, `--cli-path`, `--llm-mode` and `--force`. If a readable
 * config already exists and `--force` was not given, prints it to stdout and
 * exits 0. If the existing config cannot be read or parsed, warns on stderr
 * and reinitializes it. Otherwise runs the init flow, prints the new config
 * to stdout, and reports the doctor check summary on stderr.
 */
export async function cliMain(): Promise<void> {
  const { values } = parseArgs({
    options: {
      agent: { type: "string" },
      "cli-path": { type: "string" },
      "llm-mode": { type: "string" },
      force: { type: "boolean", default: false },
    },
    strict: true,
  });

  const configDir = SELFTUNE_CONFIG_DIR;
  const configPath = SELFTUNE_CONFIG_PATH;
  let force = values.force ?? false;

  // Check for existing config without force
  if (!force && existsSync(configPath)) {
    let existing: SelftuneConfig | undefined;
    let readable = false;
    try {
      existing = JSON.parse(readFileSync(configPath, "utf-8")) as SelftuneConfig;
      readable = true;
    } catch (err) {
      console.error(
        `[WARN] Config at ${configPath} is corrupted: ${err instanceof Error ? err.message : String(err)}. Reinitializing...`,
      );
      // Without forcing, runInit would re-read the same corrupted file and
      // throw, so the announced reinitialization would never happen.
      force = true;
    }

    if (readable) {
      console.log(JSON.stringify(existing, null, 2));
      console.error("Already initialized. Use --force to reinitialize.");
      process.exit(0);
    }
  }

  const config = runInit({
    configDir,
    configPath,
    force,
    agentOverride: values.agent,
    cliPathOverride: values["cli-path"],
    llmModeOverride: values["llm-mode"],
  });

  console.log(JSON.stringify(config, null, 2));

  // Run doctor as post-check; loaded lazily so it is only imported when init actually runs.
  const { doctor } = await import("./observability.js");
  const doctorResult = doctor();
  console.error(
    `\n[doctor] ${doctorResult.summary.pass}/${doctorResult.summary.total} checks pass`,
  );
}
|
|
286
|
+
|
|
287
|
+
// Entry-point guard: start the CLI only when this module is the program being
// run (runtime-provided `import.meta.main`, or argv[1] pointing at this file),
// never when it is merely imported.
const isMain =
  (import.meta as Record<string, unknown>).main === true ||
  process.argv[1] === fileURLToPath(import.meta.url);

if (isMain) {
  const onFatal = (err: unknown): never => {
    console.error(`[FATAL] ${err}`);
    process.exit(1);
  };
  cliMain().catch(onFatal);
}
|
|
@@ -0,0 +1,328 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Post-deploy monitoring: compute snapshots and detect regressions (TASK-16).
|
|
3
|
+
*
|
|
4
|
+
* Exports:
|
|
5
|
+
* - computeMonitoringSnapshot (no I/O; deterministic apart from the `timestamp` field)
|
|
6
|
+
* - watch (reads log files, computes snapshot, optionally rolls back)
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import { parseArgs } from "node:util";
|
|
10
|
+
|
|
11
|
+
import { QUERY_LOG, SKILL_LOG, TELEMETRY_LOG } from "../constants.js";
|
|
12
|
+
import { getLastDeployedProposal } from "../evolution/audit.js";
|
|
13
|
+
import type {
|
|
14
|
+
InvocationType,
|
|
15
|
+
MonitoringSnapshot,
|
|
16
|
+
QueryLogRecord,
|
|
17
|
+
SessionTelemetryRecord,
|
|
18
|
+
SkillUsageRecord,
|
|
19
|
+
} from "../types.js";
|
|
20
|
+
import { readJsonl } from "../utils/jsonl.js";
|
|
21
|
+
|
|
22
|
+
// ---------------------------------------------------------------------------
|
|
23
|
+
// Public interfaces
|
|
24
|
+
// ---------------------------------------------------------------------------
|
|
25
|
+
|
|
26
|
+
/** Options accepted by `watch`. Underscore-prefixed members are injection points for tests. */
export interface WatchOptions {
  /** Name of the skill whose usage records are monitored. */
  skillName: string;
  /** Skill path; forwarded to the rollback function and echoed in the rollback recommendation. */
  skillPath: string;
  /** Number of most recent telemetry entries that define the monitoring window. */
  windowSessions: number;
  /** Allowed drop below the baseline pass rate before a regression is reported. */
  regressionThreshold: number;
  /** When true, the rollback function is invoked as soon as a regression is detected. */
  autoRollback: boolean;
  /** Injected log paths for testing (override defaults). */
  _telemetryLogPath?: string;
  _skillLogPath?: string;
  _queryLogPath?: string;
  _auditLogPath?: string;
  /** Injected rollback function for testing. */
  _rollbackFn?: (opts: {
    skillName: string;
    skillPath: string;
    proposalId?: string;
  }) => Promise<{
    rolledBack: boolean;
    restoredDescription: string;
    reason: string;
  }>;
}
|
|
44
|
+
|
|
45
|
+
/** Outcome of a single `watch` run. */
export interface WatchResult {
  /** Snapshot computed for the monitored skill. */
  snapshot: MonitoringSnapshot;
  /** Regression message, or null when no regression was detected. */
  alert: string | null;
  /** Whether a rollback was actually performed during this run. */
  rolledBack: boolean;
  /** Human-readable next step or status summary. */
  recommendation: string;
}
|
|
51
|
+
|
|
52
|
+
// ---------------------------------------------------------------------------
|
|
53
|
+
// Constants
|
|
54
|
+
// ---------------------------------------------------------------------------
|
|
55
|
+
|
|
56
|
+
/** Baseline pass rate used when no baseline can be taken from a deployed proposal. */
const DEFAULT_BASELINE_PASS_RATE = 0.5;
/** Default allowed drop below the baseline before a regression is flagged. */
const DEFAULT_REGRESSION_THRESHOLD = 0.1;

// ---------------------------------------------------------------------------
// computeMonitoringSnapshot
// ---------------------------------------------------------------------------

/**
 * Build a monitoring snapshot for one skill from already-loaded log records.
 *
 * The last `windowSessions` telemetry entries (array order is taken to be
 * chronological) define a set of session IDs. Skill records for `skillName`
 * and all query records are restricted to those sessions; when the window is
 * empty or none of the records fall inside it, the records are used
 * unfiltered instead.
 *
 * Performs no I/O; the only non-deterministic field is `timestamp`.
 *
 * @param skillName - The skill to monitor
 * @param telemetry - All session telemetry records
 * @param skillRecords - All skill usage records
 * @param queryRecords - All query log records
 * @param windowSessions - Max number of recent sessions to consider
 * @param baselinePassRate - The baseline pass rate for regression detection
 * @param regressionThreshold - A pass rate below baseline minus this value is a regression (default 0.10)
 * @returns Snapshot with pass rate (triggered skill records / query records,
 *   1.0 when there are no queries), false-negative rate, per-invocation-type
 *   counts and the regression flag.
 */
export function computeMonitoringSnapshot(
  skillName: string,
  telemetry: SessionTelemetryRecord[],
  skillRecords: SkillUsageRecord[],
  queryRecords: QueryLogRecord[],
  windowSessions: number,
  baselinePassRate: number,
  regressionThreshold: number = DEFAULT_REGRESSION_THRESHOLD,
): MonitoringSnapshot {
  // Session IDs of the most recent telemetry entries.
  const recentIds = new Set(telemetry.slice(-windowSessions).map((entry) => entry.session_id));
  const inWindow = (record: SkillUsageRecord | QueryLogRecord): boolean =>
    recentIds.has(record.session_id);

  // Records belonging to the monitored skill.
  const forSkill = skillRecords.filter((record) => record.skill_name === skillName);

  // Only scope by session when the window actually intersects the data.
  const scopeToWindow =
    recentIds.size > 0 && (forSkill.some(inWindow) || queryRecords.some(inWindow));
  const scopedSkill = scopeToWindow ? forSkill.filter(inWindow) : forSkill;
  const scopedQueries = scopeToWindow ? queryRecords.filter(inWindow) : queryRecords;

  // Tally triggered vs. untriggered skill checks in one pass.
  let triggered = 0;
  for (const record of scopedSkill) {
    if (record.triggered) triggered += 1;
  }
  const checks = scopedSkill.length;
  const missed = checks - triggered;

  // Pass rate is triggered checks over total queries; no queries counts as a perfect score.
  const queryCount = scopedQueries.length;
  const passRate = queryCount === 0 ? 1.0 : triggered / queryCount;
  const falseNegativeRate = checks === 0 ? 0 : missed / checks;

  // MVP: every check is attributed to the "implicit" invocation type.
  const byInvocationType: Record<InvocationType, { passed: number; total: number }> = {
    explicit: { passed: 0, total: 0 },
    implicit: { passed: triggered, total: checks },
    contextual: { passed: 0, total: 0 },
    negative: { passed: 0, total: 0 },
  };

  // Round both sides to 10 decimals so float noise (e.g. 0.8 - 0.1 =
  // 0.7000000000000001) cannot flip the comparison at the boundary.
  const toTenDecimals = (value: number): number => Math.round(value * 1e10) / 1e10;
  const floor = toTenDecimals(baselinePassRate - regressionThreshold);
  const regressionDetected = toTenDecimals(passRate) < floor;

  return {
    timestamp: new Date().toISOString(),
    skill_name: skillName,
    window_sessions: windowSessions,
    pass_rate: passRate,
    false_negative_rate: falseNegativeRate,
    by_invocation_type: byInvocationType,
    regression_detected: regressionDetected,
    baseline_pass_rate: baselinePassRate,
  };
}
|
|
146
|
+
|
|
147
|
+
// ---------------------------------------------------------------------------
|
|
148
|
+
// watch - reads logs, computes snapshot, optionally rolls back
|
|
149
|
+
// ---------------------------------------------------------------------------
|
|
150
|
+
|
|
151
|
+
/**
 * Run the post-deploy monitoring check for a skill.
 *
 * Reads the telemetry, skill and query logs, takes the baseline pass rate
 * from the last deployed proposal's eval snapshot (falling back to
 * `DEFAULT_BASELINE_PASS_RATE`), computes a snapshot and, when a regression is
 * detected and `autoRollback` is set, invokes the rollback function.
 *
 * @param options - See `WatchOptions`; omitted values fall back to the
 *   defaults in the destructuring below.
 * @returns The snapshot, an alert message (null when stable), whether a
 *   rollback was performed, and a recommendation string.
 */
export async function watch(options: WatchOptions): Promise<WatchResult> {
  const {
    skillName,
    skillPath,
    windowSessions = 20,
    regressionThreshold = DEFAULT_REGRESSION_THRESHOLD,
    autoRollback = false,
    _telemetryLogPath = TELEMETRY_LOG,
    _skillLogPath = SKILL_LOG,
    _queryLogPath = QUERY_LOG,
    _auditLogPath,
    _rollbackFn,
  } = options;

  // Load the three logs the snapshot is derived from.
  const telemetry = readJsonl<SessionTelemetryRecord>(_telemetryLogPath);
  const skillRecords = readJsonl<SkillUsageRecord>(_skillLogPath);
  const queryRecords = readJsonl<QueryLogRecord>(_queryLogPath);

  // Baseline comes from the most recent deployed proposal, when there is one.
  const lastDeployed = getLastDeployedProposal(skillName, _auditLogPath);
  const baselinePassRate = lastDeployed?.eval_snapshot?.pass_rate ?? DEFAULT_BASELINE_PASS_RATE;

  const snapshot = computeMonitoringSnapshot(
    skillName,
    telemetry,
    skillRecords,
    queryRecords,
    windowSessions,
    baselinePassRate,
    regressionThreshold,
  );

  // Stable case: nothing to alert on, nothing to roll back.
  if (!snapshot.regression_detected) {
    return {
      snapshot,
      alert: null,
      rolledBack: false,
      recommendation: `Skill "${skillName}" is stable. Pass rate ${snapshot.pass_rate.toFixed(2)} is within acceptable range of baseline ${baselinePassRate.toFixed(2)}.`,
    };
  }

  const alert = `regression detected for "${skillName}": pass_rate=${snapshot.pass_rate.toFixed(2)} below baseline=${baselinePassRate.toFixed(2)} minus threshold=${regressionThreshold.toFixed(2)}`;

  // Regression: roll back only when explicitly enabled.
  let rolledBack = false;
  if (autoRollback) {
    const performRollback = _rollbackFn ?? (await loadRollbackFn());
    const outcome = await performRollback({
      skillName,
      skillPath,
      proposalId: lastDeployed?.proposal_id,
    });
    rolledBack = outcome.rolledBack;
  }

  let recommendation: string;
  if (rolledBack) {
    recommendation = `Rolled back "${skillName}" to previous version. Monitor to confirm recovery.`;
  } else {
    recommendation = `Consider running: selftune rollback --skill "${skillName}" --skill-path "${skillPath}"`;
  }

  return { snapshot, alert, rolledBack, recommendation };
}
|
|
222
|
+
|
|
223
|
+
// ---------------------------------------------------------------------------
|
|
224
|
+
// Lazy rollback loader (avoids import if rollback.ts doesn't exist yet)
|
|
225
|
+
// ---------------------------------------------------------------------------
|
|
226
|
+
|
|
227
|
+
/**
 * Load the rollback implementation on demand.
 *
 * If the rollback module cannot be resolved, a stand-in is returned that
 * reports `rolledBack: false` without doing anything. Any other import
 * failure (syntax or runtime error in the module) is rethrown.
 */
async function loadRollbackFn(): Promise<
  (opts: {
    skillName: string;
    skillPath: string;
    proposalId?: string;
  }) => Promise<{ rolledBack: boolean; restoredDescription: string; reason: string }>
> {
  // Error codes that mean "module could not be resolved".
  const notFoundCodes = ["ERR_MODULE_NOT_FOUND", "MODULE_NOT_FOUND"];

  try {
    const { rollback } = await import("../evolution/rollback.js");
    return rollback;
  } catch (error: unknown) {
    const code = (error as NodeJS.ErrnoException)?.code;
    const moduleMissing = code !== undefined && notFoundCodes.includes(code);
    if (!moduleMissing) {
      throw error;
    }
    // Module absent: degrade to a no-op that explains why nothing was rolled back.
    const unavailable = async () => ({
      rolledBack: false,
      restoredDescription: "",
      reason: "Rollback module not available",
    });
    return unavailable;
  }
}
|
|
250
|
+
|
|
251
|
+
// ---------------------------------------------------------------------------
|
|
252
|
+
// CLI entry point
|
|
253
|
+
// ---------------------------------------------------------------------------
|
|
254
|
+
|
|
255
|
+
/**
 * CLI entry point for `selftune watch`.
 *
 * Parses and validates the flags, runs `watch`, prints the result as JSON on
 * stdout, and exits with status 1 when an alert was raised (0 otherwise).
 * Invalid or missing flags print an `[ERROR]` line on stderr and exit 1.
 */
export async function cliMain(): Promise<void> {
  /** Report a usage error on stderr and terminate with status 1. */
  function fail(message: string): never {
    console.error(message);
    process.exit(1);
  }

  const { values } = parseArgs({
    options: {
      skill: { type: "string" },
      "skill-path": { type: "string" },
      window: { type: "string", default: "20" },
      threshold: { type: "string", default: "0.1" },
      "auto-rollback": { type: "boolean", default: false },
      help: { type: "boolean", default: false },
    },
    strict: true,
  });

  if (values.help) {
    console.log(`selftune watch — Monitor post-deploy skill health

Usage:
selftune watch --skill <name> --skill-path <path> [options]

Options:
--skill Skill name (required)
--skill-path Path to SKILL.md (required)
--window Number of recent sessions to consider (default: 20)
--threshold Regression threshold below baseline (default: 0.1)
--auto-rollback Automatically rollback on regression detection
--help Show this help message`);
    process.exit(0);
  }

  if (!values.skill || !values["skill-path"]) {
    fail("[ERROR] --skill and --skill-path are required");
  }

  // --window: digits only, and at least 1. NaN marks a malformed value.
  const rawWindow = values.window ?? "20";
  const windowSessions = /^\d+$/.test(rawWindow) ? Number.parseInt(rawWindow, 10) : Number.NaN;
  if (!(windowSessions >= 1)) {
    fail("[ERROR] --window must be a positive integer >= 1");
  }

  // --threshold: plain decimal notation within [0, 1]. NaN marks a malformed value.
  const rawThreshold = values.threshold ?? "0.1";
  const regressionThreshold = /^\d+(\.\d+)?$/.test(rawThreshold)
    ? Number.parseFloat(rawThreshold)
    : Number.NaN;
  if (!(regressionThreshold >= 0 && regressionThreshold <= 1)) {
    fail("[ERROR] --threshold must be a finite number between 0 and 1");
  }

  const result = await watch({
    skillName: values.skill,
    skillPath: values["skill-path"],
    windowSessions,
    regressionThreshold,
    autoRollback: values["auto-rollback"] ?? false,
  });

  console.log(JSON.stringify(result, null, 2));
  // A non-zero exit status signals the alert to calling scripts.
  process.exit(result.alert ? 1 : 0);
}
|
|
322
|
+
|
|
323
|
+
// Start the CLI only when this module is the program entry point.
if (import.meta.main) {
  const onFatal = (err: unknown): never => {
    console.error(`[FATAL] ${err}`);
    process.exit(1);
  };
  cliMain().catch(onFatal);
}
|