npm - selftune - Versions diffs - 0.2.29 → 0.2.31 - Mend

selftune 0.2.29 → 0.2.31

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37)

package/apps/local-dashboard/dist/assets/index-B7v_o1WC.js +15 -0
package/apps/local-dashboard/dist/assets/index-CrO77SVi.css +1 -0
package/apps/local-dashboard/dist/assets/vendor-ui-B0H8s1mP.js +1 -0
package/apps/local-dashboard/dist/index.html +3 -3
package/cli/selftune/auto-update.ts +40 -8
package/cli/selftune/command-surface.ts +1 -1
package/cli/selftune/constants.ts +5 -0
package/cli/selftune/dashboard-action-events.ts +117 -0
package/cli/selftune/dashboard-action-instrumentation.ts +103 -0
package/cli/selftune/dashboard-action-result.ts +90 -0
package/cli/selftune/dashboard-action-stream.ts +252 -0
package/cli/selftune/dashboard-contract.ts +81 -1
package/cli/selftune/dashboard-server.ts +133 -16
package/cli/selftune/eval/hooks-to-evals.ts +157 -0
package/cli/selftune/eval/synthetic-evals.ts +33 -2
package/cli/selftune/eval/unit-test-cli.ts +53 -5
package/cli/selftune/evolution/validate-host-replay.ts +191 -14
package/cli/selftune/index.ts +4 -0
package/cli/selftune/ingestors/opencode-ingest.ts +117 -8
package/cli/selftune/localdb/schema.ts +34 -0
package/cli/selftune/registry/github-install.ts +256 -0
package/cli/selftune/registry/index.ts +1 -1
package/cli/selftune/registry/install.ts +58 -7
package/cli/selftune/routes/actions.ts +273 -42
package/cli/selftune/testing-readiness.ts +203 -10
package/cli/selftune/utils/llm-call.ts +90 -1
package/package.json +1 -1
package/packages/dashboard-core/src/routes/manifest.ts +2 -2
package/packages/ui/src/components/EvolutionTimeline.tsx +1 -1
package/packages/ui/src/components/SkillReportPanels.tsx +7 -7
package/packages/ui/src/primitives/button.tsx +5 -0
package/skill/SKILL.md +1 -1
package/skill/workflows/Dashboard.md +50 -23
package/skill/workflows/Registry.md +19 -13
package/apps/local-dashboard/dist/assets/index-BcvtYmmL.js +0 -15
package/apps/local-dashboard/dist/assets/index-BpRIxnpS.css +0 -1
package/apps/local-dashboard/dist/assets/vendor-ui-DqH_uxum.js +0 -1

package/cli/selftune/utils/llm-call.ts CHANGED

@@ -17,6 +17,23 @@ const logger = createLogger("llm-call");
 export const LLM_BACKED_AGENT_CANDIDATES = ["claude", "codex", "opencode", "pi"] as const;
 export type LlmBackedAgent = (typeof LLM_BACKED_AGENT_CANDIDATES)[number];
+export interface LlmInvocationIdentity {
+  platform: string;
+  model: string | null;
+}
+export interface LlmCallLifecycleEvent extends LlmInvocationIdentity {
+  agent: string;
+  durationMs: number | null;
+  success: boolean | null;
+  error: string | null;
+}
+export interface LlmCallObserver {
+  onStart?: (event: LlmCallLifecycleEvent) => void;
+  onFinish?: (event: LlmCallLifecycleEvent) => void;
+}
 // ---------------------------------------------------------------------------
 // Model alias resolution
 // ---------------------------------------------------------------------------
@@ -61,6 +78,41 @@ function resolvePiThinking(effort: EffortLevel): string {
   return PI_THINKING_MAP[effort];
 }
+export function describeLlmInvocation(agent: string, modelFlag?: string): LlmInvocationIdentity {
+  if (agent === "claude") {
+    return {
+      platform: "claude_code",
+      model: modelFlag ? resolveModelFlag(modelFlag) : null,
+    };
+  }
+  if (agent === "opencode") {
+    return {
+      platform: "opencode",
+      model: modelFlag ? resolveOpenCodeModel(modelFlag) : null,
+    };
+  }
+  if (agent === "codex") {
+    return {
+      platform: "codex",
+      model: modelFlag ?? null,
+    };
+  }
+  if (agent === "pi") {
+    return {
+      platform: "pi",
+      model: modelFlag ?? null,
+    };
+  }
+  return {
+    platform: agent,
+    model: modelFlag ?? null,
+  };
+}
 // ---------------------------------------------------------------------------
 // Bundled agent file loading (for codex inline prompt injection)
 // ---------------------------------------------------------------------------
@@ -208,6 +260,7 @@ export async function callViaAgent(
   modelFlag?: string,
   retryOpts?: RetryOptions,
   effort?: EffortLevel,
+  observer?: LlmCallObserver,
 ): Promise<string> {
   // Write prompt to temp file to avoid shell quoting issues
   const promptFile = join(tmpdir(), `selftune-llm-${Date.now()}.txt`);
@@ -216,6 +269,7 @@ export async function callViaAgent(
   try {
     const promptContent = readFileSync(promptFile, "utf-8");
     let cmd: string[];
+    const identity = describeLlmInvocation(agent, modelFlag);
     if (agent === "claude") {
       cmd = ["claude", "-p", promptContent];
@@ -264,6 +318,18 @@ export async function callViaAgent(
     const maxRetries = retryOpts?.maxRetries ?? DEFAULT_MAX_RETRIES;
     const initialBackoffMs = retryOpts?.initialBackoffMs ?? DEFAULT_INITIAL_BACKOFF_MS;
     let lastError: Error | undefined;
+    const startedAt = Date.now();
+    try {
+      observer?.onStart?.({
+        agent,
+        ...identity,
+        durationMs: null,
+        success: null,
+        error: null,
+      });
+    } catch {
+      // fail-open: instrumentation must never block the real LLM call
+    }
     for (let attempt = 0; attempt <= maxRetries; attempt++) {
       if (attempt > 0) {
         const backoffMs = initialBackoffMs * 2 ** (attempt - 1);
@@ -296,10 +362,32 @@ export async function callViaAgent(
         }
         const raw = await new Response(proc.stdout).text();
+        try {
+          observer?.onFinish?.({
+            agent,
+            ...identity,
+            durationMs: Date.now() - startedAt,
+            success: true,
+            error: null,
+          });
+        } catch {
+          // fail-open: instrumentation must never block the real LLM call
+        }
         return raw;
       } catch (err) {
         lastError = err instanceof Error ? err : new Error(String(err));
         if (!isTransientError(lastError) || attempt === maxRetries) {
+          try {
+            observer?.onFinish?.({
+              agent,
+              ...identity,
+              durationMs: Date.now() - startedAt,
+              success: false,
+              error: lastError.message,
+            });
+          } catch {
+            // fail-open: instrumentation must never block the real LLM call
+          }
           throw lastError;
         }
         logger.warn(`Transient failure on attempt ${attempt + 1}: ${lastError.message}`);
@@ -533,9 +621,10 @@ export async function callLlm(
   agent: string,
   modelFlag?: string,
   effort?: EffortLevel,
+  observer?: LlmCallObserver,
 ): Promise<string> {
   if (!agent) {
     throw new Error("Agent must be specified for callLlm");
   }
-  return callViaAgent(systemPrompt, userPrompt, agent, modelFlag, undefined, effort);
+  return callViaAgent(systemPrompt, userPrompt, agent, modelFlag, undefined, effort, observer);
 }

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "selftune",
-  "version": "0.2.29",
+  "version": "0.2.31",
   "description": "Skill-level observability and self-improvement for AI agents — monitors skill routing, detects missed triggers, and evolves descriptions automatically",
   "keywords": [
     "agent",

package/packages/dashboard-core/src/routes/manifest.ts CHANGED

@@ -217,7 +217,7 @@ export const DASHBOARD_ROUTE_MANIFEST: readonly DashboardRouteManifestEntry[] =
     icon: PackageIcon,
     feature: "registry",
     discoverableFeature: "registry",
-    lockedTitle: "Cloud Registry lives in Selftune Cloud",
+    lockedTitle: "Cloud Registry lives in selftune Cloud",
     lockedBody:
       "Publish versioned skills, watch installations across projects, and roll back bad versions from a single cloud workspace.",
     lockedHighlights: [
@@ -255,7 +255,7 @@ export const DASHBOARD_ROUTE_MANIFEST: readonly DashboardRouteManifestEntry[] =
     icon: UsersIcon,
     feature: "signals",
     discoverableFeature: "signals",
-    lockedTitle: "Contributor signals run through Selftune Cloud",
+    lockedTitle: "Contributor signals run through selftune Cloud",
     lockedBody:
       "See anonymized contributor signals, compare bundle submissions, and turn real-world usage into proposals without leaving the shared dashboard.",
     lockedHighlights: [

package/packages/ui/src/components/EvolutionTimeline.tsx CHANGED

@@ -139,7 +139,7 @@ export function EvolutionTimeline({ entries, selectedProposalId, onSelect }: Pro
   return (
     <div className="flex flex-col gap-0">
-      <h2 className="sticky top-0 z-10 bg-background px-2 pb-2 text-xs font-semibold uppercase tracking-wider text-muted-foreground">
+      <h2 className="sticky top-0 z-10 px-2 pb-2 text-xs font-semibold uppercase tracking-wider text-muted-foreground">
         Evolution
       </h2>
       <LifecycleLegend />

package/packages/ui/src/components/SkillReportPanels.tsx CHANGED

@@ -379,7 +379,7 @@ function narrativeObservedText({
     promptLinkRate != null
       ? ` It could link ${formatRate(promptLinkRate)} of those checks back to prompts.`
       : "";
-  return `Selftune watched ${checks} skill checks across ${sessions} sessions.${promptClause}`;
+  return `selftune watched ${checks} skill checks across ${sessions} sessions.${promptClause}`;
 }
 function narrativeDiagnosisText({
@@ -411,17 +411,17 @@ function narrativeDecisionText({
 }) {
   switch (trustState) {
     case "validated":
-      return `Selftune found a candidate that looks promising, but it has not been deployed yet. ${nextActionText}`;
+      return `selftune found a candidate that looks promising, but it has not been deployed yet. ${nextActionText}`;
     case "deployed":
-      return `A change has already been deployed for this skill. Selftune is now watching for regressions in real use.`;
+      return `A change has already been deployed for this skill. selftune is now watching for regressions in real use.`;
     case "rolled_back":
       return `A previous change was rolled back, so the live skill is back on the safer version while selftune keeps observing.`;
     case "watch":
-      return `Selftune sees enough signal to keep a close eye on this skill, but not enough to blindly change it. ${nextActionText}`;
+      return `selftune sees enough signal to keep a close eye on this skill, but not enough to blindly change it. ${nextActionText}`;
     case "observed":
-      return `Selftune is still learning how people use this skill before making stronger recommendations.`;
+      return `selftune is still learning how people use this skill before making stronger recommendations.`;
     case "low_sample":
-      return `There is not enough evidence yet to trust a big change here. Selftune is still collecting examples.`;
+      return `There is not enough evidence yet to trust a big change here. selftune is still collecting examples.`;
     default:
       return latestAction
         ? `The latest automated decision for this skill was ${latestAction}. ${nextActionText}`
@@ -552,7 +552,7 @@ export function SkillTrustNarrativePanel({
           />
         </div>
         <div className="rounded-xl border border-border/10 bg-muted/15 px-4 py-3 text-sm text-muted-foreground">
-          If a proposal is rejected or still pending, your live skill has not changed yet. Selftune
+          If a proposal is rejected or still pending, your live skill has not changed yet. selftune
           only earns trust by testing changes before deployment.
         </div>
       </CardContent>

package/packages/ui/src/primitives/button.tsx CHANGED

@@ -18,6 +18,10 @@ const buttonVariants = cva(
         destructive:
           "bg-destructive/10 text-destructive hover:bg-destructive/20 focus-visible:border-destructive/40 focus-visible:ring-destructive/20 dark:bg-destructive/20 dark:hover:bg-destructive/30 dark:focus-visible:ring-destructive/40",
         link: "text-primary underline-offset-4 hover:underline",
+        "glass-primary":
+          "border-cyan-300 bg-background/75 text-foreground shadow-[0_10px_28px_rgba(34,211,238,0.14),inset_0_1px_0_rgba(255,255,255,0.09)] hover:border-cyan-200 hover:bg-background/85 hover:shadow-[0_14px_34px_rgba(34,211,238,0.18),inset_0_1px_0_rgba(255,255,255,0.12)]",
+        "glass-secondary":
+          "border-border/70 bg-background/60 text-foreground shadow-[inset_0_1px_0_rgba(255,255,255,0.06)] hover:border-cyan-400/20 hover:bg-background/75 hover:shadow-[0_10px_24px_rgba(34,211,238,0.08),inset_0_1px_0_rgba(255,255,255,0.1)]",
       },
       size: {
         default:
@@ -31,6 +35,7 @@ const buttonVariants = cva(
         "icon-sm":
           "size-7 rounded-[min(var(--radius-md),12px)] in-data-[slot=button-group]:rounded-lg",
         "icon-lg": "size-9",
+        glass: "gap-2 px-4 py-2 backdrop-blur-sm",
       },
     },
     defaultVariants: {

package/skill/SKILL.md CHANGED

@@ -13,7 +13,7 @@ description: >
   even if they don't say "selftune" explicitly.
 metadata:
   author: selftune-dev
-  version: 0.2.29
+  version: 0.2.31
   category: developer-tools
 ---

package/skill/workflows/Dashboard.md CHANGED

@@ -19,12 +19,12 @@ generate JSONL from SQLite for debugging or offline analysis.
 ## Options
-| Flag            | Description                               | Default |
-| --------------- | ----------------------------------------- | ------- |
-| `--port <port>` | Custom port for the server                | 3141    |
-| `--restart`     | Force-restart an existing dashboard on the target port | Off |
-| `--no-open`     | Start server without opening browser      | Off     |
-| `--serve`       | _(Deprecated)_ Alias for default behavior | —       |
+| Flag            | Description                                            | Default |
+| --------------- | ------------------------------------------------------ | ------- |
+| `--port <port>` | Custom port for the server                             | 3141    |
+| `--restart`     | Force-restart an existing dashboard on the target port | Off     |
+| `--no-open`     | Start server without opening browser                   | Off     |
+| `--serve`       | _(Deprecated)_ Alias for default behavior              | —       |
 Note: `--export` and `--out` were removed. The CLI will error if used,
 suggesting `selftune dashboard` instead.
@@ -48,26 +48,52 @@ staying stale.
 ### Endpoints
-| Method | Path                       | Description                                                |
-| ------ | -------------------------- | ---------------------------------------------------------- |
-| `GET`  | `/`                        | Serve dashboard SPA shell                                  |
-| `GET`  | `/api/v2/overview`         | SQLite-backed overview payload                             |
-| `GET`  | `/api/v2/skills/:name`     | SQLite-backed per-skill report                             |
-| `GET`  | `/api/v2/orchestrate-runs` | Recent orchestrate run reports                             |
-| `GET`  | `/api/v2/doctor`           | System health diagnostics (config, logs, hooks, evolution) |
-| `GET`  | `/api/v2/events`           | SSE stream for live dashboard updates                      |
-| `GET`  | `/api/health`              | Dashboard server health probe                              |
-| `POST` | `/api/actions/watch`       | Trigger `selftune watch` for a skill                       |
-| `POST` | `/api/actions/evolve`      | Trigger `selftune evolve` for a skill                      |
-| `POST` | `/api/actions/rollback`    | Trigger `selftune evolve rollback` for a skill             |
-| `POST` | `/api/actions/watchlist`   | Persist creator watchlist preferences                      |
+| Method | Path                               | Description                                                  |
+| ------ | ---------------------------------- | ------------------------------------------------------------ |
+| `GET`  | `/`                                | Serve dashboard SPA shell                                    |
+| `GET`  | `/api/v2/overview`                 | SQLite-backed overview payload                               |
+| `GET`  | `/api/v2/skills/:name`             | SQLite-backed per-skill report                               |
+| `GET`  | `/api/v2/orchestrate-runs`         | Recent orchestrate run reports                               |
+| `GET`  | `/api/v2/doctor`                   | System health diagnostics (config, logs, hooks, evolution)   |
+| `GET`  | `/api/v2/events`                   | SSE stream for live dashboard updates                        |
+| `GET`  | `/api/health`                      | Dashboard server health probe                                |
+| `POST` | `/api/actions/generate-evals`      | Trigger `selftune eval generate` for a skill                 |
+| `POST` | `/api/actions/generate-unit-tests` | Trigger `selftune eval unit-test --generate`                 |
+| `POST` | `/api/actions/replay-dry-run`      | Trigger `selftune evolve --dry-run --validation-mode replay` |
+| `POST` | `/api/actions/measure-baseline`    | Trigger `selftune grade baseline` for a skill                |
+| `POST` | `/api/actions/deploy-candidate`    | Trigger `selftune evolve` for a skill                        |
+| `POST` | `/api/actions/watch`               | Trigger `selftune watch` for a skill                         |
+| `POST` | `/api/actions/evolve`              | Trigger `selftune evolve` for a skill                        |
+| `POST` | `/api/actions/rollback`            | Trigger `selftune evolve rollback` for a skill               |
+| `POST` | `/api/actions/watchlist`           | Persist creator watchlist preferences                        |
 ### Live Updates (SSE)
 The dashboard connects to `/api/v2/events` via Server-Sent Events.
 The server watches the SQLite WAL file for changes and broadcasts an
-`update` event when new data is written. The SPA invalidates all cached
-queries, triggering immediate refetches (~1s latency).
+`update` event when new data is written. The dashboard also broadcasts
+`action` events while creator-loop commands are running so the UI can
+show live stdout/stderr and terminal success/failure. This works for
+both dashboard-triggered actions and supported `selftune` commands run
+directly in another terminal, because the CLI writes a shared action
+stream under `~/.selftune/dashboard-action-events.jsonl`. The SPA
+invalidates cached queries on updates and terminal action events (~1s
+latency for DB-backed updates).
+For demo or operator workflows, the skill report can open a dedicated
+live-run screen. That screen follows one active creator-loop run at a
+time, keeps a larger terminal log visible, and shows parsed dry-run
+summary fields plus historical model/platform/token aggregates from the
+skill report. Replay dry-runs also attach live `metrics` events when the
+underlying runtime exposes structured output (for example Claude Code's
+`--output-format stream-json`), so the screen can show per-run platform,
+model, token, cost, and duration updates before the action finishes.
+Replay validation now also emits structured per-eval `progress` events,
+so the live-run screen can show `eval n/N`, the current query snippet,
+and pass/fail evidence as each replayed eval completes. New browser tabs
+receive recent action-event backfill on connect, which means opening the
+live-run screen mid-run can still reconstruct the current action instead
+of only showing the final JSON after completion.
 TanStack Query polling (60s) acts as a fallback safety net in case the
 SSE connection drops. Data also refreshes on window focus.
@@ -79,7 +105,7 @@ See [docs/design-docs/live-dashboard-sse.md](../../docs/design-docs/live-dashboa
 Action buttons in the dashboard trigger selftune commands via POST
 requests. Each endpoint spawns a `bun run` subprocess.
-**Watch and Evolve** request body:
+**Creator-loop and watch/deploy actions** request body:
 ```json
 {
@@ -104,7 +130,8 @@ All action endpoints return:
 {
   "success": true,
   "output": "command stdout",
-  "error": null
+  "error": null,
+  "exitCode": 0
 }
 ```

package/skill/workflows/Registry.md CHANGED

@@ -4,20 +4,21 @@ Manage versioned skill distribution across your team. Push skill folders to the
 ## Commands
-| Command | Flags | What It Does |
-|---------|-------|-------------|
-| `selftune registry push [name]` | `--version=<semver>` `--summary=<text>` | Archive current skill folder and push as a new version |
-| `selftune registry install <name>` | `--global` | Download and extract a skill from the registry |
-| `selftune registry sync` | | Check all installed entries for updates, pull latest |
-| `selftune registry status` | | Show installed entries with version drift |
-| `selftune registry rollback <name>` | `--to=<version>` `--reason=<text>` | Rollback a skill to a previous version |
-| `selftune registry history <name>` | | Show version timeline with quality data |
-| `selftune registry list` | | Show all published entries in the org |
+| Command                                                             | Flags                                   | What It Does                                                     |
+| ------------------------------------------------------------------- | --------------------------------------- | ---------------------------------------------------------------- |
+| `selftune registry push [name]`                                     | `--version=<semver>` `--summary=<text>` | Archive current skill folder and push as a new version           |
+| `selftune registry install <name\|github:owner/repo[@ref][//path]>` | `--global`                              | Download from the registry or clone/install directly from GitHub |
+| `selftune registry sync`                                            |                                         | Check all installed entries for updates, pull latest             |
+| `selftune registry status`                                          |                                         | Show installed entries with version drift                        |
+| `selftune registry rollback <name>`                                 | `--to=<version>` `--reason=<text>`      | Rollback a skill to a previous version                           |
+| `selftune registry history <name>`                                  |                                         | Show version timeline with quality data                          |
+| `selftune registry list`                                            |                                         | Show all published entries in the org                            |
 ## When to Use
 - User says "push this skill to the team" → `selftune registry push`
 - User says "install the deploy skill" → `selftune registry install deploy`
+- User says "install this GitHub skill repo" → `selftune registry install github:owner/repo`
 - User says "update my skills" or "sync registry" → `selftune registry sync`
 - User says "check for updates" → `selftune registry status`
 - User says "rollback the deploy skill" → `selftune registry rollback deploy`
@@ -34,10 +35,13 @@ Manage versioned skill distribution across your team. Push skill folders to the
 ## Install Workflow
-1. Run `selftune registry install <name>` to pull from the registry
+1. Run `selftune registry install <name>` to pull from the registry, or
+   `selftune registry install github:owner/repo[@ref][//path]` to clone and
+   install directly from GitHub using local git credentials
 2. By default, installs to `.claude/skills/<name>/` in the current project
 3. Use `--global` to install to `~/.claude/skills/<name>/` (available everywhere)
-4. Installation is tracked — `selftune registry status` shows what's installed
+4. Registry installs are tracked by `selftune registry status`; direct GitHub
+   installs are local-only and do not participate in `registry sync`
 ## Sync Workflow
@@ -82,8 +86,10 @@ All commands output JSON for agent consumption:
 **User wants to install a shared skill**
-> Run `selftune registry install <name>`. Use `--global` if they want it
-> available across all projects.
+> Run `selftune registry install <name>` for a cloud-published skill, or
+> `selftune registry install github:owner/repo[@ref][//path]` if they want to
+> install directly from GitHub. Use `--global` if they want it available across
+> all projects.
 **User wants to check what's outdated**