selftune 0.2.0 → 0.2.2

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (122)
  1. package/.claude/agents/diagnosis-analyst.md +20 -10
  2. package/.claude/agents/evolution-reviewer.md +14 -1
  3. package/.claude/agents/integration-guide.md +18 -6
  4. package/.claude/agents/pattern-analyst.md +18 -5
  5. package/CHANGELOG.md +12 -4
  6. package/README.md +43 -35
  7. package/apps/local-dashboard/dist/assets/geist-cyrillic-wght-normal-CHSlOQsW.woff2 +0 -0
  8. package/apps/local-dashboard/dist/assets/geist-latin-ext-wght-normal-DMtmJ5ZE.woff2 +0 -0
  9. package/apps/local-dashboard/dist/assets/geist-latin-wght-normal-Dm3htQBi.woff2 +0 -0
  10. package/apps/local-dashboard/dist/assets/index-C4EOTFZ2.js +15 -0
  11. package/apps/local-dashboard/dist/assets/index-bl-Webyd.css +1 -0
  12. package/apps/local-dashboard/dist/assets/vendor-react-U7zYD9Rg.js +60 -0
  13. package/apps/local-dashboard/dist/assets/vendor-table-B7VF2Ipl.js +26 -0
  14. package/apps/local-dashboard/dist/assets/vendor-ui-D7_zX_qy.js +346 -0
  15. package/apps/local-dashboard/dist/favicon.png +0 -0
  16. package/apps/local-dashboard/dist/index.html +17 -0
  17. package/apps/local-dashboard/dist/logo.png +0 -0
  18. package/apps/local-dashboard/dist/logo.svg +9 -0
  19. package/cli/selftune/badge/badge-data.ts +1 -1
  20. package/cli/selftune/badge/badge.ts +4 -8
  21. package/cli/selftune/canonical-export.ts +183 -0
  22. package/cli/selftune/constants.ts +28 -0
  23. package/cli/selftune/contribute/contribute.ts +1 -1
  24. package/cli/selftune/cron/setup.ts +17 -17
  25. package/cli/selftune/dashboard-contract.ts +202 -0
  26. package/cli/selftune/dashboard-server.ts +653 -186
  27. package/cli/selftune/dashboard.ts +41 -176
  28. package/cli/selftune/eval/baseline.ts +5 -4
  29. package/cli/selftune/eval/composability-v2.ts +273 -0
  30. package/cli/selftune/eval/hooks-to-evals.ts +34 -15
  31. package/cli/selftune/eval/unit-test-cli.ts +1 -1
  32. package/cli/selftune/evolution/evidence.ts +26 -0
  33. package/cli/selftune/evolution/evolve-body.ts +105 -11
  34. package/cli/selftune/evolution/evolve.ts +371 -25
  35. package/cli/selftune/evolution/extract-patterns.ts +87 -29
  36. package/cli/selftune/evolution/rollback.ts +2 -2
  37. package/cli/selftune/grading/auto-grade.ts +200 -0
  38. package/cli/selftune/grading/grade-session.ts +448 -97
  39. package/cli/selftune/grading/results.ts +42 -0
  40. package/cli/selftune/hooks/prompt-log.ts +172 -2
  41. package/cli/selftune/hooks/session-stop.ts +123 -3
  42. package/cli/selftune/hooks/skill-eval.ts +119 -3
  43. package/cli/selftune/index.ts +395 -116
  44. package/cli/selftune/ingestors/claude-replay.ts +140 -114
  45. package/cli/selftune/ingestors/codex-rollout.ts +345 -46
  46. package/cli/selftune/ingestors/codex-wrapper.ts +207 -39
  47. package/cli/selftune/ingestors/openclaw-ingest.ts +141 -8
  48. package/cli/selftune/ingestors/opencode-ingest.ts +193 -17
  49. package/cli/selftune/init.ts +227 -14
  50. package/cli/selftune/last.ts +14 -5
  51. package/cli/selftune/localdb/db.ts +63 -0
  52. package/cli/selftune/localdb/materialize.ts +428 -0
  53. package/cli/selftune/localdb/queries.ts +376 -0
  54. package/cli/selftune/localdb/schema.ts +204 -0
  55. package/cli/selftune/monitoring/watch.ts +66 -15
  56. package/cli/selftune/normalization.ts +682 -0
  57. package/cli/selftune/observability.ts +19 -44
  58. package/cli/selftune/orchestrate.ts +1073 -0
  59. package/cli/selftune/quickstart.ts +203 -0
  60. package/cli/selftune/repair/skill-usage.ts +576 -0
  61. package/cli/selftune/schedule.ts +561 -0
  62. package/cli/selftune/status.ts +48 -26
  63. package/cli/selftune/sync.ts +627 -0
  64. package/cli/selftune/types.ts +148 -0
  65. package/cli/selftune/utils/canonical-log.ts +45 -0
  66. package/cli/selftune/utils/hooks.ts +41 -0
  67. package/cli/selftune/utils/html.ts +27 -0
  68. package/cli/selftune/utils/llm-call.ts +78 -20
  69. package/cli/selftune/utils/math.ts +10 -0
  70. package/cli/selftune/utils/query-filter.ts +139 -0
  71. package/cli/selftune/utils/skill-discovery.ts +340 -0
  72. package/cli/selftune/utils/skill-log.ts +68 -0
  73. package/cli/selftune/utils/skill-usage-confidence.ts +18 -0
  74. package/cli/selftune/utils/transcript.ts +272 -26
  75. package/cli/selftune/workflows/discover.ts +254 -0
  76. package/cli/selftune/workflows/skill-md-writer.ts +288 -0
  77. package/cli/selftune/workflows/workflows.ts +188 -0
  78. package/package.json +21 -8
  79. package/packages/telemetry-contract/README.md +11 -0
  80. package/packages/telemetry-contract/fixtures/golden.json +87 -0
  81. package/packages/telemetry-contract/fixtures/golden.test.ts +42 -0
  82. package/packages/telemetry-contract/index.ts +1 -0
  83. package/packages/telemetry-contract/package.json +19 -0
  84. package/packages/telemetry-contract/src/index.ts +2 -0
  85. package/packages/telemetry-contract/src/types.ts +163 -0
  86. package/packages/telemetry-contract/src/validators.ts +109 -0
  87. package/skill/SKILL.md +84 -53
  88. package/skill/Workflows/AutoActivation.md +17 -16
  89. package/skill/Workflows/Badge.md +6 -0
  90. package/skill/Workflows/Baseline.md +46 -23
  91. package/skill/Workflows/Composability.md +12 -5
  92. package/skill/Workflows/Contribute.md +17 -14
  93. package/skill/Workflows/Cron.md +56 -79
  94. package/skill/Workflows/Dashboard.md +45 -34
  95. package/skill/Workflows/Doctor.md +30 -17
  96. package/skill/Workflows/Evals.md +64 -40
  97. package/skill/Workflows/EvolutionMemory.md +2 -0
  98. package/skill/Workflows/Evolve.md +102 -47
  99. package/skill/Workflows/EvolveBody.md +6 -6
  100. package/skill/Workflows/Grade.md +36 -31
  101. package/skill/Workflows/ImportSkillsBench.md +11 -5
  102. package/skill/Workflows/Ingest.md +43 -36
  103. package/skill/Workflows/Initialize.md +44 -30
  104. package/skill/Workflows/Orchestrate.md +139 -0
  105. package/skill/Workflows/Replay.md +39 -18
  106. package/skill/Workflows/Rollback.md +3 -3
  107. package/skill/Workflows/Schedule.md +61 -0
  108. package/skill/Workflows/Sync.md +88 -0
  109. package/skill/Workflows/UnitTest.md +34 -22
  110. package/skill/Workflows/Watch.md +14 -4
  111. package/skill/Workflows/Workflows.md +129 -0
  112. package/skill/assets/activation-rules-default.json +26 -0
  113. package/skill/assets/multi-skill-settings.json +63 -0
  114. package/skill/assets/single-skill-settings.json +57 -0
  115. package/skill/references/invocation-taxonomy.md +2 -2
  116. package/skill/references/logs.md +164 -2
  117. package/skill/references/setup-patterns.md +65 -0
  118. package/skill/references/version-history.md +40 -0
  119. package/skill/settings_snippet.json +1 -1
  120. package/templates/multi-skill-settings.json +7 -7
  121. package/templates/single-skill-settings.json +6 -6
  122. package/dashboard/index.html +0 -1680
@@ -12,6 +12,7 @@ import { QUERY_LOG, SKILL_LOG, TELEMETRY_LOG } from "../constants.js";
12
12
  import { classifyInvocation } from "../eval/hooks-to-evals.js";
13
13
  import { getLastDeployedProposal } from "../evolution/audit.js";
14
14
  import { updateContextAfterWatch } from "../memory/writer.js";
15
+ import type { SyncResult } from "../sync.js";
15
16
  import type {
16
17
  InvocationType,
17
18
  MonitoringSnapshot,
@@ -20,6 +21,11 @@ import type {
20
21
  SkillUsageRecord,
21
22
  } from "../types.js";
22
23
  import { readJsonl } from "../utils/jsonl.js";
24
+ import {
25
+ filterActionableQueryRecords,
26
+ filterActionableSkillUsageRecords,
27
+ } from "../utils/query-filter.js";
28
+ import { readEffectiveSkillUsageRecords } from "../utils/skill-log.js";
23
29
 
24
30
  // ---------------------------------------------------------------------------
25
31
  // Public interfaces
@@ -42,6 +48,10 @@ export interface WatchOptions {
42
48
  skillPath: string;
43
49
  proposalId?: string;
44
50
  }) => Promise<{ rolledBack: boolean; restoredDescription: string; reason: string }>;
51
+ /** Source-truth refresh before reading logs. */
52
+ syncFirst?: boolean;
53
+ syncForce?: boolean;
54
+ _syncFn?: typeof import("../sync.js").syncSources;
45
55
  }
46
56
 
47
57
  export interface WatchResult {
@@ -49,6 +59,7 @@ export interface WatchResult {
49
59
  alert: string | null;
50
60
  rolledBack: boolean;
51
61
  recommendation: string;
62
+ sync_result?: SyncResult;
52
63
  }
53
64
 
54
65
  // ---------------------------------------------------------------------------
@@ -57,6 +68,7 @@ export interface WatchResult {
57
68
 
58
69
  const DEFAULT_BASELINE_PASS_RATE = 0.5;
59
70
  const DEFAULT_REGRESSION_THRESHOLD = 0.1;
71
+ export const MIN_MONITORING_SKILL_CHECKS = 3;
60
72
 
61
73
  // ---------------------------------------------------------------------------
62
74
  // computeMonitoringSnapshot - pure function
@@ -66,9 +78,9 @@ const DEFAULT_REGRESSION_THRESHOLD = 0.1;
66
78
  * Compute a monitoring snapshot from raw log records.
67
79
  *
68
80
  * The function windows telemetry to the last `windowSessions` entries, then
69
- * scopes skill and query records to those sessions. If telemetry is empty or
70
- * no records match the windowed session IDs, all provided skill/query records
71
- * are used directly (unfiltered by session).
81
+ * scopes skill and actionable query records to those sessions. If telemetry is
82
+ * empty or no records match the windowed session IDs, all provided skill/query
83
+ * records are used directly (unfiltered by session).
72
84
  *
73
85
  * @param skillName - The skill to monitor
74
86
  * @param telemetry - All session telemetry records
@@ -88,33 +100,33 @@ export function computeMonitoringSnapshot(
88
100
  regressionThreshold: number = DEFAULT_REGRESSION_THRESHOLD,
89
101
  ): MonitoringSnapshot {
90
102
  // 1. Window the telemetry to the last N sessions (by array order, assumed chronological)
103
+ const actionableSkillRecords = filterActionableSkillUsageRecords(skillRecords);
104
+ const actionableQueryRecords = filterActionableQueryRecords(queryRecords);
91
105
  const windowedTelemetry = telemetry.slice(-windowSessions);
92
106
  const windowedSessionIds = new Set(windowedTelemetry.map((t) => t.session_id));
93
107
 
94
108
  // 2. Filter skill records by skill name first
95
- const skillNameFiltered = skillRecords.filter((r) => r.skill_name === skillName);
109
+ const skillNameFiltered = actionableSkillRecords.filter((r) => r.skill_name === skillName);
96
110
 
97
111
  // 3. Apply session ID windowing only if telemetry is present and overlaps
98
112
  const hasSessionOverlap =
99
113
  windowedSessionIds.size > 0 &&
100
114
  (skillNameFiltered.some((r) => windowedSessionIds.has(r.session_id)) ||
101
- queryRecords.some((r) => windowedSessionIds.has(r.session_id)));
115
+ actionableQueryRecords.some((r) => windowedSessionIds.has(r.session_id)));
102
116
 
103
117
  const filteredSkillRecords = hasSessionOverlap
104
118
  ? skillNameFiltered.filter((r) => windowedSessionIds.has(r.session_id))
105
119
  : skillNameFiltered;
106
-
107
120
  const filteredQueryRecords = hasSessionOverlap
108
- ? queryRecords.filter((r) => windowedSessionIds.has(r.session_id))
109
- : queryRecords;
121
+ ? actionableQueryRecords.filter((r) => windowedSessionIds.has(r.session_id))
122
+ : actionableQueryRecords;
110
123
 
111
- // 4. Compute pass rate: triggered_count / total_query_count
124
+ // 4. Compute pass rate from explicit skill checks, not from all queries.
112
125
  const triggeredCount = filteredSkillRecords.filter((r) => r.triggered).length;
113
- const totalQueries = filteredQueryRecords.length;
114
- const passRate = totalQueries === 0 ? 1.0 : triggeredCount / totalQueries;
126
+ const totalSkillChecks = filteredSkillRecords.length;
127
+ const passRate = totalSkillChecks === 0 ? 0 : triggeredCount / totalSkillChecks;
115
128
 
116
129
  // 5. Compute false negative rate from skill usage records
117
- const totalSkillChecks = filteredSkillRecords.length;
118
130
  const falseNegatives = filteredSkillRecords.filter((r) => !r.triggered).length;
119
131
  const falseNegativeRate = totalSkillChecks === 0 ? 0 : falseNegatives / totalSkillChecks;
120
132
 
@@ -126,7 +138,10 @@ export function computeMonitoringSnapshot(
126
138
  negative: { passed: 0, total: 0 },
127
139
  };
128
140
  for (const record of filteredSkillRecords) {
129
- const invType = classifyInvocation(record.query, skillName);
141
+ const invType = classifyInvocation(
142
+ typeof record.query === "string" ? record.query : "",
143
+ skillName,
144
+ );
130
145
  byInvocationType[invType].total++;
131
146
  if (record.triggered) {
132
147
  byInvocationType[invType].passed++;
@@ -139,12 +154,16 @@ export function computeMonitoringSnapshot(
139
154
  const adjustedThreshold =
140
155
  Math.round((baselinePassRate - regressionThreshold) * precision) / precision;
141
156
  const roundedPassRate = Math.round(passRate * precision) / precision;
142
- const regressionDetected = roundedPassRate < adjustedThreshold;
157
+ const hasEnoughSignalForRegression =
158
+ totalSkillChecks >= MIN_MONITORING_SKILL_CHECKS ||
159
+ (totalSkillChecks === 0 && filteredQueryRecords.length >= MIN_MONITORING_SKILL_CHECKS);
160
+ const regressionDetected = hasEnoughSignalForRegression && roundedPassRate < adjustedThreshold;
143
161
 
144
162
  return {
145
163
  timestamp: new Date().toISOString(),
146
164
  skill_name: skillName,
147
165
  window_sessions: windowSessions,
166
+ skill_checks: totalSkillChecks,
148
167
  pass_rate: passRate,
149
168
  false_negative_rate: falseNegativeRate,
150
169
  by_invocation_type: byInvocationType,
@@ -172,11 +191,28 @@ export async function watch(options: WatchOptions): Promise<WatchResult> {
172
191
  _queryLogPath = QUERY_LOG,
173
192
  _auditLogPath,
174
193
  _rollbackFn,
194
+ syncFirst = false,
195
+ syncForce = false,
196
+ _syncFn,
175
197
  } = options;
176
198
 
199
+ let syncResult: SyncResult | undefined;
200
+ if (syncFirst) {
201
+ const { createDefaultSyncOptions, syncSources: realSyncSources } = await import("../sync.js");
202
+ const syncRunner = _syncFn ?? realSyncSources;
203
+ syncResult = syncRunner(
204
+ createDefaultSyncOptions({
205
+ force: syncForce,
206
+ }),
207
+ );
208
+ }
209
+
177
210
  // 1. Read log files
178
211
  const telemetry = readJsonl<SessionTelemetryRecord>(_telemetryLogPath);
179
- const skillRecords = readJsonl<SkillUsageRecord>(_skillLogPath);
212
+ const skillRecords =
213
+ _skillLogPath === SKILL_LOG
214
+ ? readEffectiveSkillUsageRecords()
215
+ : readJsonl<SkillUsageRecord>(_skillLogPath);
180
216
  const queryRecords = readJsonl<QueryLogRecord>(_queryLogPath);
181
217
 
182
218
  // 2. Determine baseline pass rate from last deployed audit entry
@@ -217,6 +253,10 @@ export async function watch(options: WatchOptions): Promise<WatchResult> {
217
253
  recommendation = rolledBack
218
254
  ? `Rolled back "${skillName}" to previous version. Monitor to confirm recovery.`
219
255
  : `Consider running: selftune rollback --skill "${skillName}" --skill-path "${skillPath}"`;
256
+ } else if (snapshot.skill_checks < MIN_MONITORING_SKILL_CHECKS) {
257
+ recommendation =
258
+ `Skill "${skillName}" has only ${snapshot.skill_checks} actionable check(s) in the current window. ` +
259
+ `Need at least ${MIN_MONITORING_SKILL_CHECKS} before calling it stable.`;
220
260
  } else {
221
261
  recommendation = `Skill "${skillName}" is stable. Pass rate ${snapshot.pass_rate.toFixed(2)} is within acceptable range of baseline ${baselinePassRate.toFixed(2)}.`;
222
262
  }
@@ -240,6 +280,7 @@ export async function watch(options: WatchOptions): Promise<WatchResult> {
240
280
  alert,
241
281
  rolledBack,
242
282
  recommendation,
283
+ ...(syncResult ? { sync_result: syncResult } : {}),
243
284
  };
244
285
  }
245
286
 
@@ -283,6 +324,8 @@ export async function cliMain(): Promise<void> {
283
324
  window: { type: "string", default: "20" },
284
325
  threshold: { type: "string", default: "0.1" },
285
326
  "auto-rollback": { type: "boolean", default: false },
327
+ "sync-first": { type: "boolean", default: false },
328
+ "sync-force": { type: "boolean", default: false },
286
329
  help: { type: "boolean", default: false },
287
330
  },
288
331
  strict: true,
@@ -300,6 +343,8 @@ Options:
300
343
  --window Number of recent sessions to consider (default: 20)
301
344
  --threshold Regression threshold below baseline (default: 0.1)
302
345
  --auto-rollback Automatically rollback on regression detection
346
+ --sync-first Refresh source-truth telemetry before reading watch inputs
347
+ --sync-force Force a full rescan during --sync-first
303
348
  --help Show this help message`);
304
349
  process.exit(0);
305
350
  }
@@ -308,6 +353,10 @@ Options:
308
353
  console.error("[ERROR] --skill and --skill-path are required");
309
354
  process.exit(1);
310
355
  }
356
+ if ((values["sync-force"] ?? false) && !(values["sync-first"] ?? false)) {
357
+ console.error("[ERROR] --sync-force requires --sync-first");
358
+ process.exit(1);
359
+ }
311
360
 
312
361
  const rawWindow = values.window ?? "20";
313
362
  if (!/^\d+$/.test(rawWindow)) {
@@ -337,6 +386,8 @@ Options:
337
386
  windowSessions,
338
387
  regressionThreshold,
339
388
  autoRollback: values["auto-rollback"] ?? false,
389
+ syncFirst: values["sync-first"] ?? false,
390
+ syncForce: values["sync-force"] ?? false,
340
391
  });
341
392
 
342
393
  console.log(JSON.stringify(result, null, 2));