selftune 0.2.0 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/agents/diagnosis-analyst.md +20 -10
- package/.claude/agents/evolution-reviewer.md +14 -1
- package/.claude/agents/integration-guide.md +18 -6
- package/.claude/agents/pattern-analyst.md +18 -5
- package/CHANGELOG.md +12 -4
- package/README.md +43 -35
- package/apps/local-dashboard/dist/assets/geist-cyrillic-wght-normal-CHSlOQsW.woff2 +0 -0
- package/apps/local-dashboard/dist/assets/geist-latin-ext-wght-normal-DMtmJ5ZE.woff2 +0 -0
- package/apps/local-dashboard/dist/assets/geist-latin-wght-normal-Dm3htQBi.woff2 +0 -0
- package/apps/local-dashboard/dist/assets/index-C4EOTFZ2.js +15 -0
- package/apps/local-dashboard/dist/assets/index-bl-Webyd.css +1 -0
- package/apps/local-dashboard/dist/assets/vendor-react-U7zYD9Rg.js +60 -0
- package/apps/local-dashboard/dist/assets/vendor-table-B7VF2Ipl.js +26 -0
- package/apps/local-dashboard/dist/assets/vendor-ui-D7_zX_qy.js +346 -0
- package/apps/local-dashboard/dist/favicon.png +0 -0
- package/apps/local-dashboard/dist/index.html +17 -0
- package/apps/local-dashboard/dist/logo.png +0 -0
- package/apps/local-dashboard/dist/logo.svg +9 -0
- package/cli/selftune/badge/badge-data.ts +1 -1
- package/cli/selftune/badge/badge.ts +4 -8
- package/cli/selftune/canonical-export.ts +183 -0
- package/cli/selftune/constants.ts +28 -0
- package/cli/selftune/contribute/contribute.ts +1 -1
- package/cli/selftune/cron/setup.ts +17 -17
- package/cli/selftune/dashboard-contract.ts +202 -0
- package/cli/selftune/dashboard-server.ts +653 -186
- package/cli/selftune/dashboard.ts +41 -176
- package/cli/selftune/eval/baseline.ts +5 -4
- package/cli/selftune/eval/composability-v2.ts +273 -0
- package/cli/selftune/eval/hooks-to-evals.ts +34 -15
- package/cli/selftune/eval/unit-test-cli.ts +1 -1
- package/cli/selftune/evolution/evidence.ts +26 -0
- package/cli/selftune/evolution/evolve-body.ts +105 -11
- package/cli/selftune/evolution/evolve.ts +371 -25
- package/cli/selftune/evolution/extract-patterns.ts +87 -29
- package/cli/selftune/evolution/rollback.ts +2 -2
- package/cli/selftune/grading/auto-grade.ts +200 -0
- package/cli/selftune/grading/grade-session.ts +448 -97
- package/cli/selftune/grading/results.ts +42 -0
- package/cli/selftune/hooks/prompt-log.ts +172 -2
- package/cli/selftune/hooks/session-stop.ts +123 -3
- package/cli/selftune/hooks/skill-eval.ts +119 -3
- package/cli/selftune/index.ts +395 -116
- package/cli/selftune/ingestors/claude-replay.ts +140 -114
- package/cli/selftune/ingestors/codex-rollout.ts +345 -46
- package/cli/selftune/ingestors/codex-wrapper.ts +207 -39
- package/cli/selftune/ingestors/openclaw-ingest.ts +141 -8
- package/cli/selftune/ingestors/opencode-ingest.ts +193 -17
- package/cli/selftune/init.ts +227 -14
- package/cli/selftune/last.ts +14 -5
- package/cli/selftune/localdb/db.ts +63 -0
- package/cli/selftune/localdb/materialize.ts +428 -0
- package/cli/selftune/localdb/queries.ts +376 -0
- package/cli/selftune/localdb/schema.ts +204 -0
- package/cli/selftune/monitoring/watch.ts +66 -15
- package/cli/selftune/normalization.ts +682 -0
- package/cli/selftune/observability.ts +19 -44
- package/cli/selftune/orchestrate.ts +1073 -0
- package/cli/selftune/quickstart.ts +203 -0
- package/cli/selftune/repair/skill-usage.ts +576 -0
- package/cli/selftune/schedule.ts +561 -0
- package/cli/selftune/status.ts +48 -26
- package/cli/selftune/sync.ts +627 -0
- package/cli/selftune/types.ts +148 -0
- package/cli/selftune/utils/canonical-log.ts +45 -0
- package/cli/selftune/utils/hooks.ts +41 -0
- package/cli/selftune/utils/html.ts +27 -0
- package/cli/selftune/utils/llm-call.ts +78 -20
- package/cli/selftune/utils/math.ts +10 -0
- package/cli/selftune/utils/query-filter.ts +139 -0
- package/cli/selftune/utils/skill-discovery.ts +340 -0
- package/cli/selftune/utils/skill-log.ts +68 -0
- package/cli/selftune/utils/skill-usage-confidence.ts +18 -0
- package/cli/selftune/utils/transcript.ts +272 -26
- package/cli/selftune/workflows/discover.ts +254 -0
- package/cli/selftune/workflows/skill-md-writer.ts +288 -0
- package/cli/selftune/workflows/workflows.ts +188 -0
- package/package.json +21 -8
- package/packages/telemetry-contract/README.md +11 -0
- package/packages/telemetry-contract/fixtures/golden.json +87 -0
- package/packages/telemetry-contract/fixtures/golden.test.ts +42 -0
- package/packages/telemetry-contract/index.ts +1 -0
- package/packages/telemetry-contract/package.json +19 -0
- package/packages/telemetry-contract/src/index.ts +2 -0
- package/packages/telemetry-contract/src/types.ts +163 -0
- package/packages/telemetry-contract/src/validators.ts +109 -0
- package/skill/SKILL.md +84 -53
- package/skill/Workflows/AutoActivation.md +17 -16
- package/skill/Workflows/Badge.md +6 -0
- package/skill/Workflows/Baseline.md +46 -23
- package/skill/Workflows/Composability.md +12 -5
- package/skill/Workflows/Contribute.md +17 -14
- package/skill/Workflows/Cron.md +56 -79
- package/skill/Workflows/Dashboard.md +45 -34
- package/skill/Workflows/Doctor.md +30 -17
- package/skill/Workflows/Evals.md +64 -40
- package/skill/Workflows/EvolutionMemory.md +2 -0
- package/skill/Workflows/Evolve.md +102 -47
- package/skill/Workflows/EvolveBody.md +6 -6
- package/skill/Workflows/Grade.md +36 -31
- package/skill/Workflows/ImportSkillsBench.md +11 -5
- package/skill/Workflows/Ingest.md +43 -36
- package/skill/Workflows/Initialize.md +44 -30
- package/skill/Workflows/Orchestrate.md +139 -0
- package/skill/Workflows/Replay.md +39 -18
- package/skill/Workflows/Rollback.md +3 -3
- package/skill/Workflows/Schedule.md +61 -0
- package/skill/Workflows/Sync.md +88 -0
- package/skill/Workflows/UnitTest.md +34 -22
- package/skill/Workflows/Watch.md +14 -4
- package/skill/Workflows/Workflows.md +129 -0
- package/skill/assets/activation-rules-default.json +26 -0
- package/skill/assets/multi-skill-settings.json +63 -0
- package/skill/assets/single-skill-settings.json +57 -0
- package/skill/references/invocation-taxonomy.md +2 -2
- package/skill/references/logs.md +164 -2
- package/skill/references/setup-patterns.md +65 -0
- package/skill/references/version-history.md +40 -0
- package/skill/settings_snippet.json +1 -1
- package/templates/multi-skill-settings.json +7 -7
- package/templates/single-skill-settings.json +6 -6
- package/dashboard/index.html +0 -1680
|
@@ -12,6 +12,7 @@ import { QUERY_LOG, SKILL_LOG, TELEMETRY_LOG } from "../constants.js";
|
|
|
12
12
|
import { classifyInvocation } from "../eval/hooks-to-evals.js";
|
|
13
13
|
import { getLastDeployedProposal } from "../evolution/audit.js";
|
|
14
14
|
import { updateContextAfterWatch } from "../memory/writer.js";
|
|
15
|
+
import type { SyncResult } from "../sync.js";
|
|
15
16
|
import type {
|
|
16
17
|
InvocationType,
|
|
17
18
|
MonitoringSnapshot,
|
|
@@ -20,6 +21,11 @@ import type {
|
|
|
20
21
|
SkillUsageRecord,
|
|
21
22
|
} from "../types.js";
|
|
22
23
|
import { readJsonl } from "../utils/jsonl.js";
|
|
24
|
+
import {
|
|
25
|
+
filterActionableQueryRecords,
|
|
26
|
+
filterActionableSkillUsageRecords,
|
|
27
|
+
} from "../utils/query-filter.js";
|
|
28
|
+
import { readEffectiveSkillUsageRecords } from "../utils/skill-log.js";
|
|
23
29
|
|
|
24
30
|
// ---------------------------------------------------------------------------
|
|
25
31
|
// Public interfaces
|
|
@@ -42,6 +48,10 @@ export interface WatchOptions {
|
|
|
42
48
|
skillPath: string;
|
|
43
49
|
proposalId?: string;
|
|
44
50
|
}) => Promise<{ rolledBack: boolean; restoredDescription: string; reason: string }>;
|
|
51
|
+
/** Source-truth refresh before reading logs. */
|
|
52
|
+
syncFirst?: boolean;
|
|
53
|
+
syncForce?: boolean;
|
|
54
|
+
_syncFn?: typeof import("../sync.js").syncSources;
|
|
45
55
|
}
|
|
46
56
|
|
|
47
57
|
export interface WatchResult {
|
|
@@ -49,6 +59,7 @@ export interface WatchResult {
|
|
|
49
59
|
alert: string | null;
|
|
50
60
|
rolledBack: boolean;
|
|
51
61
|
recommendation: string;
|
|
62
|
+
sync_result?: SyncResult;
|
|
52
63
|
}
|
|
53
64
|
|
|
54
65
|
// ---------------------------------------------------------------------------
|
|
@@ -57,6 +68,7 @@ export interface WatchResult {
|
|
|
57
68
|
|
|
58
69
|
const DEFAULT_BASELINE_PASS_RATE = 0.5;
|
|
59
70
|
const DEFAULT_REGRESSION_THRESHOLD = 0.1;
|
|
71
|
+
export const MIN_MONITORING_SKILL_CHECKS = 3;
|
|
60
72
|
|
|
61
73
|
// ---------------------------------------------------------------------------
|
|
62
74
|
// computeMonitoringSnapshot - pure function
|
|
@@ -66,9 +78,9 @@ const DEFAULT_REGRESSION_THRESHOLD = 0.1;
|
|
|
66
78
|
* Compute a monitoring snapshot from raw log records.
|
|
67
79
|
*
|
|
68
80
|
* The function windows telemetry to the last `windowSessions` entries, then
|
|
69
|
-
* scopes skill and query records to those sessions. If telemetry is
|
|
70
|
-
* no records match the windowed session IDs, all provided skill/query
|
|
71
|
-
* are used directly (unfiltered by session).
|
|
81
|
+
* scopes skill and actionable query records to those sessions. If telemetry is
|
|
82
|
+
* empty or no records match the windowed session IDs, all provided skill/query
|
|
83
|
+
* records are used directly (unfiltered by session).
|
|
72
84
|
*
|
|
73
85
|
* @param skillName - The skill to monitor
|
|
74
86
|
* @param telemetry - All session telemetry records
|
|
@@ -88,33 +100,33 @@ export function computeMonitoringSnapshot(
|
|
|
88
100
|
regressionThreshold: number = DEFAULT_REGRESSION_THRESHOLD,
|
|
89
101
|
): MonitoringSnapshot {
|
|
90
102
|
// 1. Window the telemetry to the last N sessions (by array order, assumed chronological)
|
|
103
|
+
const actionableSkillRecords = filterActionableSkillUsageRecords(skillRecords);
|
|
104
|
+
const actionableQueryRecords = filterActionableQueryRecords(queryRecords);
|
|
91
105
|
const windowedTelemetry = telemetry.slice(-windowSessions);
|
|
92
106
|
const windowedSessionIds = new Set(windowedTelemetry.map((t) => t.session_id));
|
|
93
107
|
|
|
94
108
|
// 2. Filter skill records by skill name first
|
|
95
|
-
const skillNameFiltered =
|
|
109
|
+
const skillNameFiltered = actionableSkillRecords.filter((r) => r.skill_name === skillName);
|
|
96
110
|
|
|
97
111
|
// 3. Apply session ID windowing only if telemetry is present and overlaps
|
|
98
112
|
const hasSessionOverlap =
|
|
99
113
|
windowedSessionIds.size > 0 &&
|
|
100
114
|
(skillNameFiltered.some((r) => windowedSessionIds.has(r.session_id)) ||
|
|
101
|
-
|
|
115
|
+
actionableQueryRecords.some((r) => windowedSessionIds.has(r.session_id)));
|
|
102
116
|
|
|
103
117
|
const filteredSkillRecords = hasSessionOverlap
|
|
104
118
|
? skillNameFiltered.filter((r) => windowedSessionIds.has(r.session_id))
|
|
105
119
|
: skillNameFiltered;
|
|
106
|
-
|
|
107
120
|
const filteredQueryRecords = hasSessionOverlap
|
|
108
|
-
?
|
|
109
|
-
:
|
|
121
|
+
? actionableQueryRecords.filter((r) => windowedSessionIds.has(r.session_id))
|
|
122
|
+
: actionableQueryRecords;
|
|
110
123
|
|
|
111
|
-
// 4. Compute pass rate
|
|
124
|
+
// 4. Compute pass rate from explicit skill checks, not from all queries.
|
|
112
125
|
const triggeredCount = filteredSkillRecords.filter((r) => r.triggered).length;
|
|
113
|
-
const
|
|
114
|
-
const passRate =
|
|
126
|
+
const totalSkillChecks = filteredSkillRecords.length;
|
|
127
|
+
const passRate = totalSkillChecks === 0 ? 0 : triggeredCount / totalSkillChecks;
|
|
115
128
|
|
|
116
129
|
// 5. Compute false negative rate from skill usage records
|
|
117
|
-
const totalSkillChecks = filteredSkillRecords.length;
|
|
118
130
|
const falseNegatives = filteredSkillRecords.filter((r) => !r.triggered).length;
|
|
119
131
|
const falseNegativeRate = totalSkillChecks === 0 ? 0 : falseNegatives / totalSkillChecks;
|
|
120
132
|
|
|
@@ -126,7 +138,10 @@ export function computeMonitoringSnapshot(
|
|
|
126
138
|
negative: { passed: 0, total: 0 },
|
|
127
139
|
};
|
|
128
140
|
for (const record of filteredSkillRecords) {
|
|
129
|
-
const invType = classifyInvocation(
|
|
141
|
+
const invType = classifyInvocation(
|
|
142
|
+
typeof record.query === "string" ? record.query : "",
|
|
143
|
+
skillName,
|
|
144
|
+
);
|
|
130
145
|
byInvocationType[invType].total++;
|
|
131
146
|
if (record.triggered) {
|
|
132
147
|
byInvocationType[invType].passed++;
|
|
@@ -139,12 +154,16 @@ export function computeMonitoringSnapshot(
|
|
|
139
154
|
const adjustedThreshold =
|
|
140
155
|
Math.round((baselinePassRate - regressionThreshold) * precision) / precision;
|
|
141
156
|
const roundedPassRate = Math.round(passRate * precision) / precision;
|
|
142
|
-
const
|
|
157
|
+
const hasEnoughSignalForRegression =
|
|
158
|
+
totalSkillChecks >= MIN_MONITORING_SKILL_CHECKS ||
|
|
159
|
+
(totalSkillChecks === 0 && filteredQueryRecords.length >= MIN_MONITORING_SKILL_CHECKS);
|
|
160
|
+
const regressionDetected = hasEnoughSignalForRegression && roundedPassRate < adjustedThreshold;
|
|
143
161
|
|
|
144
162
|
return {
|
|
145
163
|
timestamp: new Date().toISOString(),
|
|
146
164
|
skill_name: skillName,
|
|
147
165
|
window_sessions: windowSessions,
|
|
166
|
+
skill_checks: totalSkillChecks,
|
|
148
167
|
pass_rate: passRate,
|
|
149
168
|
false_negative_rate: falseNegativeRate,
|
|
150
169
|
by_invocation_type: byInvocationType,
|
|
@@ -172,11 +191,28 @@ export async function watch(options: WatchOptions): Promise<WatchResult> {
|
|
|
172
191
|
_queryLogPath = QUERY_LOG,
|
|
173
192
|
_auditLogPath,
|
|
174
193
|
_rollbackFn,
|
|
194
|
+
syncFirst = false,
|
|
195
|
+
syncForce = false,
|
|
196
|
+
_syncFn,
|
|
175
197
|
} = options;
|
|
176
198
|
|
|
199
|
+
let syncResult: SyncResult | undefined;
|
|
200
|
+
if (syncFirst) {
|
|
201
|
+
const { createDefaultSyncOptions, syncSources: realSyncSources } = await import("../sync.js");
|
|
202
|
+
const syncRunner = _syncFn ?? realSyncSources;
|
|
203
|
+
syncResult = syncRunner(
|
|
204
|
+
createDefaultSyncOptions({
|
|
205
|
+
force: syncForce,
|
|
206
|
+
}),
|
|
207
|
+
);
|
|
208
|
+
}
|
|
209
|
+
|
|
177
210
|
// 1. Read log files
|
|
178
211
|
const telemetry = readJsonl<SessionTelemetryRecord>(_telemetryLogPath);
|
|
179
|
-
const skillRecords =
|
|
212
|
+
const skillRecords =
|
|
213
|
+
_skillLogPath === SKILL_LOG
|
|
214
|
+
? readEffectiveSkillUsageRecords()
|
|
215
|
+
: readJsonl<SkillUsageRecord>(_skillLogPath);
|
|
180
216
|
const queryRecords = readJsonl<QueryLogRecord>(_queryLogPath);
|
|
181
217
|
|
|
182
218
|
// 2. Determine baseline pass rate from last deployed audit entry
|
|
@@ -217,6 +253,10 @@ export async function watch(options: WatchOptions): Promise<WatchResult> {
|
|
|
217
253
|
recommendation = rolledBack
|
|
218
254
|
? `Rolled back "${skillName}" to previous version. Monitor to confirm recovery.`
|
|
219
255
|
: `Consider running: selftune rollback --skill "${skillName}" --skill-path "${skillPath}"`;
|
|
256
|
+
} else if (snapshot.skill_checks < MIN_MONITORING_SKILL_CHECKS) {
|
|
257
|
+
recommendation =
|
|
258
|
+
`Skill "${skillName}" has only ${snapshot.skill_checks} actionable check(s) in the current window. ` +
|
|
259
|
+
`Need at least ${MIN_MONITORING_SKILL_CHECKS} before calling it stable.`;
|
|
220
260
|
} else {
|
|
221
261
|
recommendation = `Skill "${skillName}" is stable. Pass rate ${snapshot.pass_rate.toFixed(2)} is within acceptable range of baseline ${baselinePassRate.toFixed(2)}.`;
|
|
222
262
|
}
|
|
@@ -240,6 +280,7 @@ export async function watch(options: WatchOptions): Promise<WatchResult> {
|
|
|
240
280
|
alert,
|
|
241
281
|
rolledBack,
|
|
242
282
|
recommendation,
|
|
283
|
+
...(syncResult ? { sync_result: syncResult } : {}),
|
|
243
284
|
};
|
|
244
285
|
}
|
|
245
286
|
|
|
@@ -283,6 +324,8 @@ export async function cliMain(): Promise<void> {
|
|
|
283
324
|
window: { type: "string", default: "20" },
|
|
284
325
|
threshold: { type: "string", default: "0.1" },
|
|
285
326
|
"auto-rollback": { type: "boolean", default: false },
|
|
327
|
+
"sync-first": { type: "boolean", default: false },
|
|
328
|
+
"sync-force": { type: "boolean", default: false },
|
|
286
329
|
help: { type: "boolean", default: false },
|
|
287
330
|
},
|
|
288
331
|
strict: true,
|
|
@@ -300,6 +343,8 @@ Options:
|
|
|
300
343
|
--window Number of recent sessions to consider (default: 20)
|
|
301
344
|
--threshold Regression threshold below baseline (default: 0.1)
|
|
302
345
|
--auto-rollback Automatically rollback on regression detection
|
|
346
|
+
--sync-first Refresh source-truth telemetry before reading watch inputs
|
|
347
|
+
--sync-force Force a full rescan during --sync-first
|
|
303
348
|
--help Show this help message`);
|
|
304
349
|
process.exit(0);
|
|
305
350
|
}
|
|
@@ -308,6 +353,10 @@ Options:
|
|
|
308
353
|
console.error("[ERROR] --skill and --skill-path are required");
|
|
309
354
|
process.exit(1);
|
|
310
355
|
}
|
|
356
|
+
if ((values["sync-force"] ?? false) && !(values["sync-first"] ?? false)) {
|
|
357
|
+
console.error("[ERROR] --sync-force requires --sync-first");
|
|
358
|
+
process.exit(1);
|
|
359
|
+
}
|
|
311
360
|
|
|
312
361
|
const rawWindow = values.window ?? "20";
|
|
313
362
|
if (!/^\d+$/.test(rawWindow)) {
|
|
@@ -337,6 +386,8 @@ Options:
|
|
|
337
386
|
windowSessions,
|
|
338
387
|
regressionThreshold,
|
|
339
388
|
autoRollback: values["auto-rollback"] ?? false,
|
|
389
|
+
syncFirst: values["sync-first"] ?? false,
|
|
390
|
+
syncForce: values["sync-force"] ?? false,
|
|
340
391
|
});
|
|
341
392
|
|
|
342
393
|
console.log(JSON.stringify(result, null, 2));
|