selftune 0.2.29 → 0.2.30
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/apps/local-dashboard/dist/assets/index-BcXquWFB.css +1 -0
- package/apps/local-dashboard/dist/assets/index-Coq42hE4.js +15 -0
- package/apps/local-dashboard/dist/assets/vendor-ui-B0H8s1mP.js +1 -0
- package/apps/local-dashboard/dist/index.html +3 -3
- package/cli/selftune/auto-update.ts +40 -8
- package/cli/selftune/command-surface.ts +1 -1
- package/cli/selftune/constants.ts +5 -0
- package/cli/selftune/dashboard-action-events.ts +117 -0
- package/cli/selftune/dashboard-action-instrumentation.ts +103 -0
- package/cli/selftune/dashboard-action-result.ts +90 -0
- package/cli/selftune/dashboard-action-stream.ts +252 -0
- package/cli/selftune/dashboard-contract.ts +81 -1
- package/cli/selftune/dashboard-server.ts +133 -16
- package/cli/selftune/eval/hooks-to-evals.ts +157 -0
- package/cli/selftune/eval/synthetic-evals.ts +33 -2
- package/cli/selftune/eval/unit-test-cli.ts +53 -5
- package/cli/selftune/evolution/validate-host-replay.ts +191 -14
- package/cli/selftune/index.ts +4 -0
- package/cli/selftune/ingestors/opencode-ingest.ts +117 -8
- package/cli/selftune/localdb/schema.ts +34 -0
- package/cli/selftune/routes/actions.ts +273 -42
- package/cli/selftune/testing-readiness.ts +203 -10
- package/cli/selftune/utils/llm-call.ts +90 -1
- package/package.json +1 -1
- package/packages/ui/src/components/EvolutionTimeline.tsx +1 -1
- package/skill/SKILL.md +1 -1
- package/skill/workflows/Dashboard.md +50 -23
- package/apps/local-dashboard/dist/assets/index-BcvtYmmL.js +0 -15
- package/apps/local-dashboard/dist/assets/index-BpRIxnpS.css +0 -1
- package/apps/local-dashboard/dist/assets/vendor-ui-DqH_uxum.js +0 -1
package/cli/selftune/utils/llm-call.ts
CHANGED
|
@@ -17,6 +17,23 @@ const logger = createLogger("llm-call");
|
|
|
17
17
|
export const LLM_BACKED_AGENT_CANDIDATES = ["claude", "codex", "opencode", "pi"] as const;
|
|
18
18
|
export type LlmBackedAgent = (typeof LLM_BACKED_AGENT_CANDIDATES)[number];
|
|
19
19
|
|
|
20
|
+
export interface LlmInvocationIdentity {
|
|
21
|
+
platform: string;
|
|
22
|
+
model: string | null;
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
export interface LlmCallLifecycleEvent extends LlmInvocationIdentity {
|
|
26
|
+
agent: string;
|
|
27
|
+
durationMs: number | null;
|
|
28
|
+
success: boolean | null;
|
|
29
|
+
error: string | null;
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
export interface LlmCallObserver {
|
|
33
|
+
onStart?: (event: LlmCallLifecycleEvent) => void;
|
|
34
|
+
onFinish?: (event: LlmCallLifecycleEvent) => void;
|
|
35
|
+
}
|
|
36
|
+
|
|
20
37
|
// ---------------------------------------------------------------------------
|
|
21
38
|
// Model alias resolution
|
|
22
39
|
// ---------------------------------------------------------------------------
|
|
@@ -61,6 +78,41 @@ function resolvePiThinking(effort: EffortLevel): string {
|
|
|
61
78
|
return PI_THINKING_MAP[effort];
|
|
62
79
|
}
|
|
63
80
|
|
|
81
|
+
export function describeLlmInvocation(agent: string, modelFlag?: string): LlmInvocationIdentity {
|
|
82
|
+
if (agent === "claude") {
|
|
83
|
+
return {
|
|
84
|
+
platform: "claude_code",
|
|
85
|
+
model: modelFlag ? resolveModelFlag(modelFlag) : null,
|
|
86
|
+
};
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
if (agent === "opencode") {
|
|
90
|
+
return {
|
|
91
|
+
platform: "opencode",
|
|
92
|
+
model: modelFlag ? resolveOpenCodeModel(modelFlag) : null,
|
|
93
|
+
};
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
if (agent === "codex") {
|
|
97
|
+
return {
|
|
98
|
+
platform: "codex",
|
|
99
|
+
model: modelFlag ?? null,
|
|
100
|
+
};
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
if (agent === "pi") {
|
|
104
|
+
return {
|
|
105
|
+
platform: "pi",
|
|
106
|
+
model: modelFlag ?? null,
|
|
107
|
+
};
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
return {
|
|
111
|
+
platform: agent,
|
|
112
|
+
model: modelFlag ?? null,
|
|
113
|
+
};
|
|
114
|
+
}
|
|
115
|
+
|
|
64
116
|
// ---------------------------------------------------------------------------
|
|
65
117
|
// Bundled agent file loading (for codex inline prompt injection)
|
|
66
118
|
// ---------------------------------------------------------------------------
|
|
@@ -208,6 +260,7 @@ export async function callViaAgent(
|
|
|
208
260
|
modelFlag?: string,
|
|
209
261
|
retryOpts?: RetryOptions,
|
|
210
262
|
effort?: EffortLevel,
|
|
263
|
+
observer?: LlmCallObserver,
|
|
211
264
|
): Promise<string> {
|
|
212
265
|
// Write prompt to temp file to avoid shell quoting issues
|
|
213
266
|
const promptFile = join(tmpdir(), `selftune-llm-${Date.now()}.txt`);
|
|
@@ -216,6 +269,7 @@ export async function callViaAgent(
|
|
|
216
269
|
try {
|
|
217
270
|
const promptContent = readFileSync(promptFile, "utf-8");
|
|
218
271
|
let cmd: string[];
|
|
272
|
+
const identity = describeLlmInvocation(agent, modelFlag);
|
|
219
273
|
|
|
220
274
|
if (agent === "claude") {
|
|
221
275
|
cmd = ["claude", "-p", promptContent];
|
|
@@ -264,6 +318,18 @@ export async function callViaAgent(
|
|
|
264
318
|
const maxRetries = retryOpts?.maxRetries ?? DEFAULT_MAX_RETRIES;
|
|
265
319
|
const initialBackoffMs = retryOpts?.initialBackoffMs ?? DEFAULT_INITIAL_BACKOFF_MS;
|
|
266
320
|
let lastError: Error | undefined;
|
|
321
|
+
const startedAt = Date.now();
|
|
322
|
+
try {
|
|
323
|
+
observer?.onStart?.({
|
|
324
|
+
agent,
|
|
325
|
+
...identity,
|
|
326
|
+
durationMs: null,
|
|
327
|
+
success: null,
|
|
328
|
+
error: null,
|
|
329
|
+
});
|
|
330
|
+
} catch {
|
|
331
|
+
// fail-open: instrumentation must never block the real LLM call
|
|
332
|
+
}
|
|
267
333
|
for (let attempt = 0; attempt <= maxRetries; attempt++) {
|
|
268
334
|
if (attempt > 0) {
|
|
269
335
|
const backoffMs = initialBackoffMs * 2 ** (attempt - 1);
|
|
@@ -296,10 +362,32 @@ export async function callViaAgent(
|
|
|
296
362
|
}
|
|
297
363
|
|
|
298
364
|
const raw = await new Response(proc.stdout).text();
|
|
365
|
+
try {
|
|
366
|
+
observer?.onFinish?.({
|
|
367
|
+
agent,
|
|
368
|
+
...identity,
|
|
369
|
+
durationMs: Date.now() - startedAt,
|
|
370
|
+
success: true,
|
|
371
|
+
error: null,
|
|
372
|
+
});
|
|
373
|
+
} catch {
|
|
374
|
+
// fail-open: instrumentation must never block the real LLM call
|
|
375
|
+
}
|
|
299
376
|
return raw;
|
|
300
377
|
} catch (err) {
|
|
301
378
|
lastError = err instanceof Error ? err : new Error(String(err));
|
|
302
379
|
if (!isTransientError(lastError) || attempt === maxRetries) {
|
|
380
|
+
try {
|
|
381
|
+
observer?.onFinish?.({
|
|
382
|
+
agent,
|
|
383
|
+
...identity,
|
|
384
|
+
durationMs: Date.now() - startedAt,
|
|
385
|
+
success: false,
|
|
386
|
+
error: lastError.message,
|
|
387
|
+
});
|
|
388
|
+
} catch {
|
|
389
|
+
// fail-open: instrumentation must never block the real LLM call
|
|
390
|
+
}
|
|
303
391
|
throw lastError;
|
|
304
392
|
}
|
|
305
393
|
logger.warn(`Transient failure on attempt ${attempt + 1}: ${lastError.message}`);
|
|
@@ -533,9 +621,10 @@ export async function callLlm(
|
|
|
533
621
|
agent: string,
|
|
534
622
|
modelFlag?: string,
|
|
535
623
|
effort?: EffortLevel,
|
|
624
|
+
observer?: LlmCallObserver,
|
|
536
625
|
): Promise<string> {
|
|
537
626
|
if (!agent) {
|
|
538
627
|
throw new Error("Agent must be specified for callLlm");
|
|
539
628
|
}
|
|
540
|
-
return callViaAgent(systemPrompt, userPrompt, agent, modelFlag, undefined, effort);
|
|
629
|
+
return callViaAgent(systemPrompt, userPrompt, agent, modelFlag, undefined, effort, observer);
|
|
541
630
|
}
|
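package/package.json
CHANGED
(hunk not captured in this extract)
|
package/packages/ui/src/components/EvolutionTimeline.tsx
CHANGED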
|
@@ -139,7 +139,7 @@ export function EvolutionTimeline({ entries, selectedProposalId, onSelect }: Pro
|
|
|
139
139
|
|
|
140
140
|
return (
|
|
141
141
|
<div className="flex flex-col gap-0">
|
|
142
|
-
<h2 className="sticky top-0 z-10
|
|
142
|
+
<h2 className="sticky top-0 z-10 px-2 pb-2 text-xs font-semibold uppercase tracking-wider text-muted-foreground">
|
|
143
143
|
Evolution
|
|
144
144
|
</h2>
|
|
145
145
|
<LifecycleLegend />
|
package/skill/SKILL.md
CHANGED
(hunk not captured in this extract)
|
package/skill/workflows/Dashboard.md
CHANGED
|
@@ -19,12 +19,12 @@ generate JSONL from SQLite for debugging or offline analysis.
|
|
|
19
19
|
|
|
20
20
|
## Options
|
|
21
21
|
|
|
22
|
-
| Flag | Description
|
|
23
|
-
| --------------- |
|
|
24
|
-
| `--port <port>` | Custom port for the server
|
|
25
|
-
| `--restart` | Force-restart an existing dashboard on the target port | Off
|
|
26
|
-
| `--no-open` | Start server without opening browser
|
|
27
|
-
| `--serve` | _(Deprecated)_ Alias for default behavior
|
|
22
|
+
| Flag | Description | Default |
|
|
23
|
+
| --------------- | ------------------------------------------------------ | ------- |
|
|
24
|
+
| `--port <port>` | Custom port for the server | 3141 |
|
|
25
|
+
| `--restart` | Force-restart an existing dashboard on the target port | Off |
|
|
26
|
+
| `--no-open` | Start server without opening browser | Off |
|
|
27
|
+
| `--serve` | _(Deprecated)_ Alias for default behavior | — |
|
|
28
28
|
|
|
29
29
|
Note: `--export` and `--out` were removed. The CLI will error if used,
|
|
30
30
|
suggesting `selftune dashboard` instead.
|
|
@@ -48,26 +48,52 @@ staying stale.
|
|
|
48
48
|
|
|
49
49
|
### Endpoints
|
|
50
50
|
|
|
51
|
-
| Method | Path
|
|
52
|
-
| ------ |
|
|
53
|
-
| `GET` | `/`
|
|
54
|
-
| `GET` | `/api/v2/overview`
|
|
55
|
-
| `GET` | `/api/v2/skills/:name`
|
|
56
|
-
| `GET` | `/api/v2/orchestrate-runs`
|
|
57
|
-
| `GET` | `/api/v2/doctor`
|
|
58
|
-
| `GET` | `/api/v2/events`
|
|
59
|
-
| `GET` | `/api/health`
|
|
60
|
-
| `POST` | `/api/actions/
|
|
61
|
-
| `POST` | `/api/actions/
|
|
62
|
-
| `POST` | `/api/actions/
|
|
63
|
-
| `POST` | `/api/actions/
|
|
51
|
+
| Method | Path | Description |
|
|
52
|
+
| ------ | ---------------------------------- | ------------------------------------------------------------ |
|
|
53
|
+
| `GET` | `/` | Serve dashboard SPA shell |
|
|
54
|
+
| `GET` | `/api/v2/overview` | SQLite-backed overview payload |
|
|
55
|
+
| `GET` | `/api/v2/skills/:name` | SQLite-backed per-skill report |
|
|
56
|
+
| `GET` | `/api/v2/orchestrate-runs` | Recent orchestrate run reports |
|
|
57
|
+
| `GET` | `/api/v2/doctor` | System health diagnostics (config, logs, hooks, evolution) |
|
|
58
|
+
| `GET` | `/api/v2/events` | SSE stream for live dashboard updates |
|
|
59
|
+
| `GET` | `/api/health` | Dashboard server health probe |
|
|
60
|
+
| `POST` | `/api/actions/generate-evals` | Trigger `selftune eval generate` for a skill |
|
|
61
|
+
| `POST` | `/api/actions/generate-unit-tests` | Trigger `selftune eval unit-test --generate` |
|
|
62
|
+
| `POST` | `/api/actions/replay-dry-run` | Trigger `selftune evolve --dry-run --validation-mode replay` |
|
|
63
|
+
| `POST` | `/api/actions/measure-baseline` | Trigger `selftune grade baseline` for a skill |
|
|
64
|
+
| `POST` | `/api/actions/deploy-candidate` | Trigger `selftune evolve` for a skill |
|
|
65
|
+
| `POST` | `/api/actions/watch` | Trigger `selftune watch` for a skill |
|
|
66
|
+
| `POST` | `/api/actions/evolve` | Trigger `selftune evolve` for a skill |
|
|
67
|
+
| `POST` | `/api/actions/rollback` | Trigger `selftune evolve rollback` for a skill |
|
|
68
|
+
| `POST` | `/api/actions/watchlist` | Persist creator watchlist preferences |
|
|
64
69
|
|
|
65
70
|
### Live Updates (SSE)
|
|
66
71
|
|
|
67
72
|
The dashboard connects to `/api/v2/events` via Server-Sent Events.
|
|
68
73
|
The server watches the SQLite WAL file for changes and broadcasts an
|
|
69
|
-
`update` event when new data is written. The
|
|
70
|
-
|
|
74
|
+
`update` event when new data is written. The dashboard also broadcasts
|
|
75
|
+
`action` events while creator-loop commands are running so the UI can
|
|
76
|
+
show live stdout/stderr and terminal success/failure. This works for
|
|
77
|
+
both dashboard-triggered actions and supported `selftune` commands run
|
|
78
|
+
directly in another terminal, because the CLI writes a shared action
|
|
79
|
+
stream under `~/.selftune/dashboard-action-events.jsonl`. The SPA
|
|
80
|
+
invalidates cached queries on updates and terminal action events (~1s
|
|
81
|
+
latency for DB-backed updates).
|
|
82
|
+
|
|
83
|
+
For demo or operator workflows, the skill report can open a dedicated
|
|
84
|
+
live-run screen. That screen follows one active creator-loop run at a
|
|
85
|
+
time, keeps a larger terminal log visible, and shows parsed dry-run
|
|
86
|
+
summary fields plus historical model/platform/token aggregates from the
|
|
87
|
+
skill report. Replay dry-runs also attach live `metrics` events when the
|
|
88
|
+
underlying runtime exposes structured output (for example Claude Code's
|
|
89
|
+
`--output-format stream-json`), so the screen can show per-run platform,
|
|
90
|
+
model, token, cost, and duration updates before the action finishes.
|
|
91
|
+
Replay validation now also emits structured per-eval `progress` events,
|
|
92
|
+
so the live-run screen can show `eval n/N`, the current query snippet,
|
|
93
|
+
and pass/fail evidence as each replayed eval completes. New browser tabs
|
|
94
|
+
receive recent action-event backfill on connect, which means opening the
|
|
95
|
+
live-run screen mid-run can still reconstruct the current action instead
|
|
96
|
+
of only showing the final JSON after completion.
|
|
71
97
|
|
|
72
98
|
TanStack Query polling (60s) acts as a fallback safety net in case the
|
|
73
99
|
SSE connection drops. Data also refreshes on window focus.
|
|
@@ -79,7 +105,7 @@ See [docs/design-docs/live-dashboard-sse.md](../../docs/design-docs/live-dashboa
|
|
|
79
105
|
Action buttons in the dashboard trigger selftune commands via POST
|
|
80
106
|
requests. Each endpoint spawns a `bun run` subprocess.
|
|
81
107
|
|
|
82
|
-
**
|
|
108
|
+
**Creator-loop and watch/deploy actions** request body:
|
|
83
109
|
|
|
84
110
|
```json
|
|
85
111
|
{
|
|
@@ -104,7 +130,8 @@ All action endpoints return:
|
|
|
104
130
|
{
|
|
105
131
|
"success": true,
|
|
106
132
|
"output": "command stdout",
|
|
107
|
-
"error": null
|
|
133
|
+
"error": null,
|
|
134
|
+
"exitCode": 0
|
|
108
135
|
}
|
|
109
136
|
```
|
|
110
137
|
|