npm - selftune - Versions diffs - 0.2.29 → 0.2.30 - Mend

selftune 0.2.29 → 0.2.30

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30)

package/apps/local-dashboard/dist/assets/index-BcXquWFB.css +1 -0
package/apps/local-dashboard/dist/assets/index-Coq42hE4.js +15 -0
package/apps/local-dashboard/dist/assets/vendor-ui-B0H8s1mP.js +1 -0
package/apps/local-dashboard/dist/index.html +3 -3
package/cli/selftune/auto-update.ts +40 -8
package/cli/selftune/command-surface.ts +1 -1
package/cli/selftune/constants.ts +5 -0
package/cli/selftune/dashboard-action-events.ts +117 -0
package/cli/selftune/dashboard-action-instrumentation.ts +103 -0
package/cli/selftune/dashboard-action-result.ts +90 -0
package/cli/selftune/dashboard-action-stream.ts +252 -0
package/cli/selftune/dashboard-contract.ts +81 -1
package/cli/selftune/dashboard-server.ts +133 -16
package/cli/selftune/eval/hooks-to-evals.ts +157 -0
package/cli/selftune/eval/synthetic-evals.ts +33 -2
package/cli/selftune/eval/unit-test-cli.ts +53 -5
package/cli/selftune/evolution/validate-host-replay.ts +191 -14
package/cli/selftune/index.ts +4 -0
package/cli/selftune/ingestors/opencode-ingest.ts +117 -8
package/cli/selftune/localdb/schema.ts +34 -0
package/cli/selftune/routes/actions.ts +273 -42
package/cli/selftune/testing-readiness.ts +203 -10
package/cli/selftune/utils/llm-call.ts +90 -1
package/package.json +1 -1
package/packages/ui/src/components/EvolutionTimeline.tsx +1 -1
package/skill/SKILL.md +1 -1
package/skill/workflows/Dashboard.md +50 -23
package/apps/local-dashboard/dist/assets/index-BcvtYmmL.js +0 -15
package/apps/local-dashboard/dist/assets/index-BpRIxnpS.css +0 -1
package/apps/local-dashboard/dist/assets/vendor-ui-DqH_uxum.js +0 -1

package/cli/selftune/utils/llm-call.ts CHANGED

@@ -17,6 +17,23 @@ const logger = createLogger("llm-call");
 export const LLM_BACKED_AGENT_CANDIDATES = ["claude", "codex", "opencode", "pi"] as const;
 export type LlmBackedAgent = (typeof LLM_BACKED_AGENT_CANDIDATES)[number];
+export interface LlmInvocationIdentity {
+  platform: string;
+  model: string | null;
+}
+export interface LlmCallLifecycleEvent extends LlmInvocationIdentity {
+  agent: string;
+  durationMs: number | null;
+  success: boolean | null;
+  error: string | null;
+}
+export interface LlmCallObserver {
+  onStart?: (event: LlmCallLifecycleEvent) => void;
+  onFinish?: (event: LlmCallLifecycleEvent) => void;
+}
 // ---------------------------------------------------------------------------
 // Model alias resolution
 // ---------------------------------------------------------------------------
@@ -61,6 +78,41 @@ function resolvePiThinking(effort: EffortLevel): string {
   return PI_THINKING_MAP[effort];
 }
+export function describeLlmInvocation(agent: string, modelFlag?: string): LlmInvocationIdentity {
+  if (agent === "claude") {
+    return {
+      platform: "claude_code",
+      model: modelFlag ? resolveModelFlag(modelFlag) : null,
+    };
+  }
+  if (agent === "opencode") {
+    return {
+      platform: "opencode",
+      model: modelFlag ? resolveOpenCodeModel(modelFlag) : null,
+    };
+  }
+  if (agent === "codex") {
+    return {
+      platform: "codex",
+      model: modelFlag ?? null,
+    };
+  }
+  if (agent === "pi") {
+    return {
+      platform: "pi",
+      model: modelFlag ?? null,
+    };
+  }
+  return {
+    platform: agent,
+    model: modelFlag ?? null,
+  };
+}
 // ---------------------------------------------------------------------------
 // Bundled agent file loading (for codex inline prompt injection)
 // ---------------------------------------------------------------------------
@@ -208,6 +260,7 @@ export async function callViaAgent(
   modelFlag?: string,
   retryOpts?: RetryOptions,
   effort?: EffortLevel,
+  observer?: LlmCallObserver,
 ): Promise<string> {
   // Write prompt to temp file to avoid shell quoting issues
   const promptFile = join(tmpdir(), `selftune-llm-${Date.now()}.txt`);
@@ -216,6 +269,7 @@ export async function callViaAgent(
   try {
     const promptContent = readFileSync(promptFile, "utf-8");
     let cmd: string[];
+    const identity = describeLlmInvocation(agent, modelFlag);
     if (agent === "claude") {
       cmd = ["claude", "-p", promptContent];
@@ -264,6 +318,18 @@ export async function callViaAgent(
     const maxRetries = retryOpts?.maxRetries ?? DEFAULT_MAX_RETRIES;
     const initialBackoffMs = retryOpts?.initialBackoffMs ?? DEFAULT_INITIAL_BACKOFF_MS;
     let lastError: Error | undefined;
+    const startedAt = Date.now();
+    try {
+      observer?.onStart?.({
+        agent,
+        ...identity,
+        durationMs: null,
+        success: null,
+        error: null,
+      });
+    } catch {
+      // fail-open: instrumentation must never block the real LLM call
+    }
     for (let attempt = 0; attempt <= maxRetries; attempt++) {
       if (attempt > 0) {
         const backoffMs = initialBackoffMs * 2 ** (attempt - 1);
@@ -296,10 +362,32 @@ export async function callViaAgent(
         }
         const raw = await new Response(proc.stdout).text();
+        try {
+          observer?.onFinish?.({
+            agent,
+            ...identity,
+            durationMs: Date.now() - startedAt,
+            success: true,
+            error: null,
+          });
+        } catch {
+          // fail-open: instrumentation must never block the real LLM call
+        }
         return raw;
       } catch (err) {
         lastError = err instanceof Error ? err : new Error(String(err));
         if (!isTransientError(lastError) || attempt === maxRetries) {
+          try {
+            observer?.onFinish?.({
+              agent,
+              ...identity,
+              durationMs: Date.now() - startedAt,
+              success: false,
+              error: lastError.message,
+            });
+          } catch {
+            // fail-open: instrumentation must never block the real LLM call
+          }
           throw lastError;
         }
         logger.warn(`Transient failure on attempt ${attempt + 1}: ${lastError.message}`);
@@ -533,9 +621,10 @@ export async function callLlm(
   agent: string,
   modelFlag?: string,
   effort?: EffortLevel,
+  observer?: LlmCallObserver,
 ): Promise<string> {
   if (!agent) {
     throw new Error("Agent must be specified for callLlm");
   }
-  return callViaAgent(systemPrompt, userPrompt, agent, modelFlag, undefined, effort);
+  return callViaAgent(systemPrompt, userPrompt, agent, modelFlag, undefined, effort, observer);
 }

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "selftune",
-  "version": "0.2.29",
+  "version": "0.2.30",
   "description": "Skill-level observability and self-improvement for AI agents — monitors skill routing, detects missed triggers, and evolves descriptions automatically",
   "keywords": [
     "agent",

package/packages/ui/src/components/EvolutionTimeline.tsx CHANGED

@@ -139,7 +139,7 @@ export function EvolutionTimeline({ entries, selectedProposalId, onSelect }: Pro
   return (
     <div className="flex flex-col gap-0">
-      <h2 className="sticky top-0 z-10 bg-background px-2 pb-2 text-xs font-semibold uppercase tracking-wider text-muted-foreground">
+      <h2 className="sticky top-0 z-10 px-2 pb-2 text-xs font-semibold uppercase tracking-wider text-muted-foreground">
         Evolution
       </h2>
       <LifecycleLegend />

package/skill/SKILL.md CHANGED

@@ -13,7 +13,7 @@ description: >
   even if they don't say "selftune" explicitly.
 metadata:
   author: selftune-dev
-  version: 0.2.29
+  version: 0.2.30
   category: developer-tools
 ---

package/skill/workflows/Dashboard.md CHANGED

@@ -19,12 +19,12 @@ generate JSONL from SQLite for debugging or offline analysis.
 ## Options
-| Flag            | Description                               | Default |
-| --------------- | ----------------------------------------- | ------- |
-| `--port <port>` | Custom port for the server                | 3141    |
-| `--restart`     | Force-restart an existing dashboard on the target port | Off |
-| `--no-open`     | Start server without opening browser      | Off     |
-| `--serve`       | _(Deprecated)_ Alias for default behavior | —       |
+| Flag            | Description                                            | Default |
+| --------------- | ------------------------------------------------------ | ------- |
+| `--port <port>` | Custom port for the server                             | 3141    |
+| `--restart`     | Force-restart an existing dashboard on the target port | Off     |
+| `--no-open`     | Start server without opening browser                   | Off     |
+| `--serve`       | _(Deprecated)_ Alias for default behavior              | —       |
 Note: `--export` and `--out` were removed. The CLI will error if used,
 suggesting `selftune dashboard` instead.
@@ -48,26 +48,52 @@ staying stale.
 ### Endpoints
-| Method | Path                       | Description                                                |
-| ------ | -------------------------- | ---------------------------------------------------------- |
-| `GET`  | `/`                        | Serve dashboard SPA shell                                  |
-| `GET`  | `/api/v2/overview`         | SQLite-backed overview payload                             |
-| `GET`  | `/api/v2/skills/:name`     | SQLite-backed per-skill report                             |
-| `GET`  | `/api/v2/orchestrate-runs` | Recent orchestrate run reports                             |
-| `GET`  | `/api/v2/doctor`           | System health diagnostics (config, logs, hooks, evolution) |
-| `GET`  | `/api/v2/events`           | SSE stream for live dashboard updates                      |
-| `GET`  | `/api/health`              | Dashboard server health probe                              |
-| `POST` | `/api/actions/watch`       | Trigger `selftune watch` for a skill                       |
-| `POST` | `/api/actions/evolve`      | Trigger `selftune evolve` for a skill                      |
-| `POST` | `/api/actions/rollback`    | Trigger `selftune evolve rollback` for a skill             |
-| `POST` | `/api/actions/watchlist`   | Persist creator watchlist preferences                      |
+| Method | Path                               | Description                                                  |
+| ------ | ---------------------------------- | ------------------------------------------------------------ |
+| `GET`  | `/`                                | Serve dashboard SPA shell                                    |
+| `GET`  | `/api/v2/overview`                 | SQLite-backed overview payload                               |
+| `GET`  | `/api/v2/skills/:name`             | SQLite-backed per-skill report                               |
+| `GET`  | `/api/v2/orchestrate-runs`         | Recent orchestrate run reports                               |
+| `GET`  | `/api/v2/doctor`                   | System health diagnostics (config, logs, hooks, evolution)   |
+| `GET`  | `/api/v2/events`                   | SSE stream for live dashboard updates                        |
+| `GET`  | `/api/health`                      | Dashboard server health probe                                |
+| `POST` | `/api/actions/generate-evals`      | Trigger `selftune eval generate` for a skill                 |
+| `POST` | `/api/actions/generate-unit-tests` | Trigger `selftune eval unit-test --generate`                 |
+| `POST` | `/api/actions/replay-dry-run`      | Trigger `selftune evolve --dry-run --validation-mode replay` |
+| `POST` | `/api/actions/measure-baseline`    | Trigger `selftune grade baseline` for a skill                |
+| `POST` | `/api/actions/deploy-candidate`    | Trigger `selftune evolve` for a skill                        |
+| `POST` | `/api/actions/watch`               | Trigger `selftune watch` for a skill                         |
+| `POST` | `/api/actions/evolve`              | Trigger `selftune evolve` for a skill                        |
+| `POST` | `/api/actions/rollback`            | Trigger `selftune evolve rollback` for a skill               |
+| `POST` | `/api/actions/watchlist`           | Persist creator watchlist preferences                        |
 ### Live Updates (SSE)
 The dashboard connects to `/api/v2/events` via Server-Sent Events.
 The server watches the SQLite WAL file for changes and broadcasts an
-`update` event when new data is written. The SPA invalidates all cached
-queries, triggering immediate refetches (~1s latency).
+`update` event when new data is written. The dashboard also broadcasts
+`action` events while creator-loop commands are running so the UI can
+show live stdout/stderr and terminal success/failure. This works for
+both dashboard-triggered actions and supported `selftune` commands run
+directly in another terminal, because the CLI writes a shared action
+stream under `~/.selftune/dashboard-action-events.jsonl`. The SPA
+invalidates cached queries on updates and terminal action events (~1s
+latency for DB-backed updates).
+For demo or operator workflows, the skill report can open a dedicated
+live-run screen. That screen follows one active creator-loop run at a
+time, keeps a larger terminal log visible, and shows parsed dry-run
+summary fields plus historical model/platform/token aggregates from the
+skill report. Replay dry-runs also attach live `metrics` events when the
+underlying runtime exposes structured output (for example Claude Code's
+`--output-format stream-json`), so the screen can show per-run platform,
+model, token, cost, and duration updates before the action finishes.
+Replay validation now also emits structured per-eval `progress` events,
+so the live-run screen can show `eval n/N`, the current query snippet,
+and pass/fail evidence as each replayed eval completes. New browser tabs
+receive recent action-event backfill on connect, which means opening the
+live-run screen mid-run can still reconstruct the current action instead
+of only showing the final JSON after completion.
 TanStack Query polling (60s) acts as a fallback safety net in case the
 SSE connection drops. Data also refreshes on window focus.
@@ -79,7 +105,7 @@ See [docs/design-docs/live-dashboard-sse.md](../../docs/design-docs/live-dashboa
 Action buttons in the dashboard trigger selftune commands via POST
 requests. Each endpoint spawns a `bun run` subprocess.
-**Watch and Evolve** request body:
+**Creator-loop and watch/deploy actions** request body:
 ```json
 {
@@ -104,7 +130,8 @@ All action endpoints return:
 {
   "success": true,
   "output": "command stdout",
-  "error": null
+  "error": null,
+  "exitCode": 0
 }
 ```