selftune 0.2.29 → 0.2.30

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. package/apps/local-dashboard/dist/assets/index-BcXquWFB.css +1 -0
  2. package/apps/local-dashboard/dist/assets/index-Coq42hE4.js +15 -0
  3. package/apps/local-dashboard/dist/assets/vendor-ui-B0H8s1mP.js +1 -0
  4. package/apps/local-dashboard/dist/index.html +3 -3
  5. package/cli/selftune/auto-update.ts +40 -8
  6. package/cli/selftune/command-surface.ts +1 -1
  7. package/cli/selftune/constants.ts +5 -0
  8. package/cli/selftune/dashboard-action-events.ts +117 -0
  9. package/cli/selftune/dashboard-action-instrumentation.ts +103 -0
  10. package/cli/selftune/dashboard-action-result.ts +90 -0
  11. package/cli/selftune/dashboard-action-stream.ts +252 -0
  12. package/cli/selftune/dashboard-contract.ts +81 -1
  13. package/cli/selftune/dashboard-server.ts +133 -16
  14. package/cli/selftune/eval/hooks-to-evals.ts +157 -0
  15. package/cli/selftune/eval/synthetic-evals.ts +33 -2
  16. package/cli/selftune/eval/unit-test-cli.ts +53 -5
  17. package/cli/selftune/evolution/validate-host-replay.ts +191 -14
  18. package/cli/selftune/index.ts +4 -0
  19. package/cli/selftune/ingestors/opencode-ingest.ts +117 -8
  20. package/cli/selftune/localdb/schema.ts +34 -0
  21. package/cli/selftune/routes/actions.ts +273 -42
  22. package/cli/selftune/testing-readiness.ts +203 -10
  23. package/cli/selftune/utils/llm-call.ts +90 -1
  24. package/package.json +1 -1
  25. package/packages/ui/src/components/EvolutionTimeline.tsx +1 -1
  26. package/skill/SKILL.md +1 -1
  27. package/skill/workflows/Dashboard.md +50 -23
  28. package/apps/local-dashboard/dist/assets/index-BcvtYmmL.js +0 -15
  29. package/apps/local-dashboard/dist/assets/index-BpRIxnpS.css +0 -1
  30. package/apps/local-dashboard/dist/assets/vendor-ui-DqH_uxum.js +0 -1
@@ -17,6 +17,23 @@ const logger = createLogger("llm-call");
17
17
  export const LLM_BACKED_AGENT_CANDIDATES = ["claude", "codex", "opencode", "pi"] as const;
18
18
  export type LlmBackedAgent = (typeof LLM_BACKED_AGENT_CANDIDATES)[number];
19
19
 
20
+ export interface LlmInvocationIdentity {
21
+ platform: string;
22
+ model: string | null;
23
+ }
24
+
25
+ export interface LlmCallLifecycleEvent extends LlmInvocationIdentity {
26
+ agent: string;
27
+ durationMs: number | null;
28
+ success: boolean | null;
29
+ error: string | null;
30
+ }
31
+
32
+ export interface LlmCallObserver {
33
+ onStart?: (event: LlmCallLifecycleEvent) => void;
34
+ onFinish?: (event: LlmCallLifecycleEvent) => void;
35
+ }
36
+
20
37
  // ---------------------------------------------------------------------------
21
38
  // Model alias resolution
22
39
  // ---------------------------------------------------------------------------
@@ -61,6 +78,41 @@ function resolvePiThinking(effort: EffortLevel): string {
61
78
  return PI_THINKING_MAP[effort];
62
79
  }
63
80
 
81
+ export function describeLlmInvocation(agent: string, modelFlag?: string): LlmInvocationIdentity {
82
+ if (agent === "claude") {
83
+ return {
84
+ platform: "claude_code",
85
+ model: modelFlag ? resolveModelFlag(modelFlag) : null,
86
+ };
87
+ }
88
+
89
+ if (agent === "opencode") {
90
+ return {
91
+ platform: "opencode",
92
+ model: modelFlag ? resolveOpenCodeModel(modelFlag) : null,
93
+ };
94
+ }
95
+
96
+ if (agent === "codex") {
97
+ return {
98
+ platform: "codex",
99
+ model: modelFlag ?? null,
100
+ };
101
+ }
102
+
103
+ if (agent === "pi") {
104
+ return {
105
+ platform: "pi",
106
+ model: modelFlag ?? null,
107
+ };
108
+ }
109
+
110
+ return {
111
+ platform: agent,
112
+ model: modelFlag ?? null,
113
+ };
114
+ }
115
+
64
116
  // ---------------------------------------------------------------------------
65
117
  // Bundled agent file loading (for codex inline prompt injection)
66
118
  // ---------------------------------------------------------------------------
@@ -208,6 +260,7 @@ export async function callViaAgent(
208
260
  modelFlag?: string,
209
261
  retryOpts?: RetryOptions,
210
262
  effort?: EffortLevel,
263
+ observer?: LlmCallObserver,
211
264
  ): Promise<string> {
212
265
  // Write prompt to temp file to avoid shell quoting issues
213
266
  const promptFile = join(tmpdir(), `selftune-llm-${Date.now()}.txt`);
@@ -216,6 +269,7 @@ export async function callViaAgent(
216
269
  try {
217
270
  const promptContent = readFileSync(promptFile, "utf-8");
218
271
  let cmd: string[];
272
+ const identity = describeLlmInvocation(agent, modelFlag);
219
273
 
220
274
  if (agent === "claude") {
221
275
  cmd = ["claude", "-p", promptContent];
@@ -264,6 +318,18 @@ export async function callViaAgent(
264
318
  const maxRetries = retryOpts?.maxRetries ?? DEFAULT_MAX_RETRIES;
265
319
  const initialBackoffMs = retryOpts?.initialBackoffMs ?? DEFAULT_INITIAL_BACKOFF_MS;
266
320
  let lastError: Error | undefined;
321
+ const startedAt = Date.now();
322
+ try {
323
+ observer?.onStart?.({
324
+ agent,
325
+ ...identity,
326
+ durationMs: null,
327
+ success: null,
328
+ error: null,
329
+ });
330
+ } catch {
331
+ // fail-open: instrumentation must never block the real LLM call
332
+ }
267
333
  for (let attempt = 0; attempt <= maxRetries; attempt++) {
268
334
  if (attempt > 0) {
269
335
  const backoffMs = initialBackoffMs * 2 ** (attempt - 1);
@@ -296,10 +362,32 @@ export async function callViaAgent(
296
362
  }
297
363
 
298
364
  const raw = await new Response(proc.stdout).text();
365
+ try {
366
+ observer?.onFinish?.({
367
+ agent,
368
+ ...identity,
369
+ durationMs: Date.now() - startedAt,
370
+ success: true,
371
+ error: null,
372
+ });
373
+ } catch {
374
+ // fail-open: instrumentation must never block the real LLM call
375
+ }
299
376
  return raw;
300
377
  } catch (err) {
301
378
  lastError = err instanceof Error ? err : new Error(String(err));
302
379
  if (!isTransientError(lastError) || attempt === maxRetries) {
380
+ try {
381
+ observer?.onFinish?.({
382
+ agent,
383
+ ...identity,
384
+ durationMs: Date.now() - startedAt,
385
+ success: false,
386
+ error: lastError.message,
387
+ });
388
+ } catch {
389
+ // fail-open: instrumentation must never block the real LLM call
390
+ }
303
391
  throw lastError;
304
392
  }
305
393
  logger.warn(`Transient failure on attempt ${attempt + 1}: ${lastError.message}`);
@@ -533,9 +621,10 @@ export async function callLlm(
533
621
  agent: string,
534
622
  modelFlag?: string,
535
623
  effort?: EffortLevel,
624
+ observer?: LlmCallObserver,
536
625
  ): Promise<string> {
537
626
  if (!agent) {
538
627
  throw new Error("Agent must be specified for callLlm");
539
628
  }
540
- return callViaAgent(systemPrompt, userPrompt, agent, modelFlag, undefined, effort);
629
+ return callViaAgent(systemPrompt, userPrompt, agent, modelFlag, undefined, effort, observer);
541
630
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "selftune",
3
- "version": "0.2.29",
3
+ "version": "0.2.30",
4
4
  "description": "Skill-level observability and self-improvement for AI agents — monitors skill routing, detects missed triggers, and evolves descriptions automatically",
5
5
  "keywords": [
6
6
  "agent",
@@ -139,7 +139,7 @@ export function EvolutionTimeline({ entries, selectedProposalId, onSelect }: Pro
139
139
 
140
140
  return (
141
141
  <div className="flex flex-col gap-0">
142
- <h2 className="sticky top-0 z-10 bg-background px-2 pb-2 text-xs font-semibold uppercase tracking-wider text-muted-foreground">
142
+ <h2 className="sticky top-0 z-10 px-2 pb-2 text-xs font-semibold uppercase tracking-wider text-muted-foreground">
143
143
  Evolution
144
144
  </h2>
145
145
  <LifecycleLegend />
package/skill/SKILL.md CHANGED
@@ -13,7 +13,7 @@ description: >
13
13
  even if they don't say "selftune" explicitly.
14
14
  metadata:
15
15
  author: selftune-dev
16
- version: 0.2.29
16
+ version: 0.2.30
17
17
  category: developer-tools
18
18
  ---
19
19
 
@@ -19,12 +19,12 @@ generate JSONL from SQLite for debugging or offline analysis.
19
19
 
20
20
  ## Options
21
21
 
22
- | Flag | Description | Default |
23
- | --------------- | ----------------------------------------- | ------- |
24
- | `--port <port>` | Custom port for the server | 3141 |
25
- | `--restart` | Force-restart an existing dashboard on the target port | Off |
26
- | `--no-open` | Start server without opening browser | Off |
27
- | `--serve` | _(Deprecated)_ Alias for default behavior | — |
22
+ | Flag | Description | Default |
23
+ | --------------- | ------------------------------------------------------ | ------- |
24
+ | `--port <port>` | Custom port for the server | 3141 |
25
+ | `--restart` | Force-restart an existing dashboard on the target port | Off |
26
+ | `--no-open` | Start server without opening browser | Off |
27
+ | `--serve` | _(Deprecated)_ Alias for default behavior | — |
28
28
 
29
29
  Note: `--export` and `--out` were removed. The CLI will error if used,
30
30
  suggesting `selftune dashboard` instead.
@@ -48,26 +48,52 @@ staying stale.
48
48
 
49
49
  ### Endpoints
50
50
 
51
- | Method | Path | Description |
52
- | ------ | -------------------------- | ---------------------------------------------------------- |
53
- | `GET` | `/` | Serve dashboard SPA shell |
54
- | `GET` | `/api/v2/overview` | SQLite-backed overview payload |
55
- | `GET` | `/api/v2/skills/:name` | SQLite-backed per-skill report |
56
- | `GET` | `/api/v2/orchestrate-runs` | Recent orchestrate run reports |
57
- | `GET` | `/api/v2/doctor` | System health diagnostics (config, logs, hooks, evolution) |
58
- | `GET` | `/api/v2/events` | SSE stream for live dashboard updates |
59
- | `GET` | `/api/health` | Dashboard server health probe |
60
- | `POST` | `/api/actions/watch` | Trigger `selftune watch` for a skill |
61
- | `POST` | `/api/actions/evolve` | Trigger `selftune evolve` for a skill |
62
- | `POST` | `/api/actions/rollback` | Trigger `selftune evolve rollback` for a skill |
63
- | `POST` | `/api/actions/watchlist` | Persist creator watchlist preferences |
51
+ | Method | Path | Description |
52
+ | ------ | ---------------------------------- | ------------------------------------------------------------ |
53
+ | `GET` | `/` | Serve dashboard SPA shell |
54
+ | `GET` | `/api/v2/overview` | SQLite-backed overview payload |
55
+ | `GET` | `/api/v2/skills/:name` | SQLite-backed per-skill report |
56
+ | `GET` | `/api/v2/orchestrate-runs` | Recent orchestrate run reports |
57
+ | `GET` | `/api/v2/doctor` | System health diagnostics (config, logs, hooks, evolution) |
58
+ | `GET` | `/api/v2/events` | SSE stream for live dashboard updates |
59
+ | `GET` | `/api/health` | Dashboard server health probe |
60
+ | `POST` | `/api/actions/generate-evals` | Trigger `selftune eval generate` for a skill |
61
+ | `POST` | `/api/actions/generate-unit-tests` | Trigger `selftune eval unit-test --generate` |
62
+ | `POST` | `/api/actions/replay-dry-run` | Trigger `selftune evolve --dry-run --validation-mode replay` |
63
+ | `POST` | `/api/actions/measure-baseline` | Trigger `selftune grade baseline` for a skill |
64
+ | `POST` | `/api/actions/deploy-candidate` | Trigger `selftune evolve` for a skill |
65
+ | `POST` | `/api/actions/watch` | Trigger `selftune watch` for a skill |
66
+ | `POST` | `/api/actions/evolve` | Trigger `selftune evolve` for a skill |
67
+ | `POST` | `/api/actions/rollback` | Trigger `selftune evolve rollback` for a skill |
68
+ | `POST` | `/api/actions/watchlist` | Persist creator watchlist preferences |
64
69
 
65
70
  ### Live Updates (SSE)
66
71
 
67
72
  The dashboard connects to `/api/v2/events` via Server-Sent Events.
68
73
  The server watches the SQLite WAL file for changes and broadcasts an
69
- `update` event when new data is written. The SPA invalidates all cached
70
- queries, triggering immediate refetches (~1s latency).
74
+ `update` event when new data is written. The dashboard also broadcasts
75
+ `action` events while creator-loop commands are running so the UI can
76
+ show live stdout/stderr and terminal success/failure. This works for
77
+ both dashboard-triggered actions and supported `selftune` commands run
78
+ directly in another terminal, because the CLI writes a shared action
79
+ stream under `~/.selftune/dashboard-action-events.jsonl`. The SPA
80
+ invalidates cached queries on updates and terminal action events (~1s
81
+ latency for DB-backed updates).
82
+
83
+ For demo or operator workflows, the skill report can open a dedicated
84
+ live-run screen. That screen follows one active creator-loop run at a
85
+ time, keeps a larger terminal log visible, and shows parsed dry-run
86
+ summary fields plus historical model/platform/token aggregates from the
87
+ skill report. Replay dry-runs also attach live `metrics` events when the
88
+ underlying runtime exposes structured output (for example Claude Code's
89
+ `--output-format stream-json`), so the screen can show per-run platform,
90
+ model, token, cost, and duration updates before the action finishes.
91
+ Replay validation now also emits structured per-eval `progress` events,
92
+ so the live-run screen can show `eval n/N`, the current query snippet,
93
+ and pass/fail evidence as each replayed eval completes. New browser tabs
94
+ receive recent action-event backfill on connect, which means opening the
95
+ live-run screen mid-run can still reconstruct the current action instead
96
+ of only showing the final JSON after completion.
71
97
 
72
98
  TanStack Query polling (60s) acts as a fallback safety net in case the
73
99
  SSE connection drops. Data also refreshes on window focus.
@@ -79,7 +105,7 @@ See [docs/design-docs/live-dashboard-sse.md](../../docs/design-docs/live-dashboa
79
105
  Action buttons in the dashboard trigger selftune commands via POST
80
106
  requests. Each endpoint spawns a `bun run` subprocess.
81
107
 
82
- **Watch and Evolve** request body:
108
+ **Creator-loop and watch/deploy actions** request body:
83
109
 
84
110
  ```json
85
111
  {
@@ -104,7 +130,8 @@ All action endpoints return:
104
130
  {
105
131
  "success": true,
106
132
  "output": "command stdout",
107
- "error": null
133
+ "error": null,
134
+ "exitCode": 0
108
135
  }
109
136
  ```
110
137