npm - selftune - Versions diffs - 0.1.0 - Mend

selftune 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45) hide show

package/CHANGELOG.md +23 -0
package/README.md +259 -0
package/bin/selftune.cjs +29 -0
package/cli/selftune/constants.ts +71 -0
package/cli/selftune/eval/hooks-to-evals.ts +422 -0
package/cli/selftune/evolution/audit.ts +44 -0
package/cli/selftune/evolution/deploy-proposal.ts +244 -0
package/cli/selftune/evolution/evolve.ts +406 -0
package/cli/selftune/evolution/extract-patterns.ts +145 -0
package/cli/selftune/evolution/propose-description.ts +146 -0
package/cli/selftune/evolution/rollback.ts +242 -0
package/cli/selftune/evolution/stopping-criteria.ts +69 -0
package/cli/selftune/evolution/validate-proposal.ts +137 -0
package/cli/selftune/grading/grade-session.ts +459 -0
package/cli/selftune/hooks/prompt-log.ts +52 -0
package/cli/selftune/hooks/session-stop.ts +54 -0
package/cli/selftune/hooks/skill-eval.ts +73 -0
package/cli/selftune/index.ts +104 -0
package/cli/selftune/ingestors/codex-rollout.ts +416 -0
package/cli/selftune/ingestors/codex-wrapper.ts +332 -0
package/cli/selftune/ingestors/opencode-ingest.ts +565 -0
package/cli/selftune/init.ts +297 -0
package/cli/selftune/monitoring/watch.ts +328 -0
package/cli/selftune/observability.ts +255 -0
package/cli/selftune/types.ts +255 -0
package/cli/selftune/utils/jsonl.ts +75 -0
package/cli/selftune/utils/llm-call.ts +192 -0
package/cli/selftune/utils/logging.ts +40 -0
package/cli/selftune/utils/schema-validator.ts +47 -0
package/cli/selftune/utils/seeded-random.ts +31 -0
package/cli/selftune/utils/transcript.ts +260 -0
package/package.json +29 -0
package/skill/SKILL.md +120 -0
package/skill/Workflows/Doctor.md +145 -0
package/skill/Workflows/Evals.md +193 -0
package/skill/Workflows/Evolve.md +159 -0
package/skill/Workflows/Grade.md +157 -0
package/skill/Workflows/Ingest.md +159 -0
package/skill/Workflows/Initialize.md +125 -0
package/skill/Workflows/Rollback.md +131 -0
package/skill/Workflows/Watch.md +128 -0
package/skill/references/grading-methodology.md +176 -0
package/skill/references/invocation-taxonomy.md +144 -0
package/skill/references/logs.md +168 -0
package/skill/settings_snippet.json +41 -0

package/cli/selftune/evolution/extract-patterns.ts ADDED Viewed

@@ -0,0 +1,145 @@
+/**
+ * extract-patterns.ts
+ *
+ * Identifies failure patterns by cross-referencing eval entries with actual
+ * skill usage records. Groups missed queries by invocation type and clusters
+ * similar queries together using Jaccard similarity.
+ */
+import type { EvalEntry, FailurePattern, InvocationType, SkillUsageRecord } from "../types.js";
+// ---------------------------------------------------------------------------
+// Jaccard similarity
+// ---------------------------------------------------------------------------
+/**
+ * Break a string into its distinct whitespace-separated words, lowercased.
+ * Empty fragments (produced by leading/trailing whitespace) are discarded.
+ */
+function tokenize(s: string): Set<string> {
+  const words = s
+    .split(/\s+/)
+    .map((piece) => piece.toLowerCase())
+    .filter((piece) => piece.length > 0);
+  return new Set<string>(words);
+}
+/**
+ * Jaccard similarity between the word sets of two strings.
+ *
+ * @param a - First string.
+ * @param b - Second string.
+ * @returns Size of the shared word set divided by the size of the combined
+ *   word set, in the range 0.0-1.0; 0 when neither string contains a word.
+ */
+export function computeQuerySimilarity(a: string, b: string): number {
+  const wordsA = tokenize(a);
+  const wordsB = tokenize(b);
+  // Words of A that also occur in B.
+  const shared = [...wordsA].filter((word) => wordsB.has(word)).length;
+  const combined = wordsA.size + wordsB.size - shared;
+  // combined is 0 only when both word sets are empty.
+  return combined === 0 ? 0 : shared / combined;
+}
+// ---------------------------------------------------------------------------
+// Single-linkage clustering
+// ---------------------------------------------------------------------------
+/**
+ * Single-linkage clustering of query strings.
+ *
+ * Queries are processed in input order. A query joins every existing cluster
+ * that has at least one member whose similarity to it is >= threshold; when
+ * several clusters match they are fused into the earliest one.
+ *
+ * @param queries - Queries to cluster.
+ * @param threshold - Minimum similarity for a link (default 0.3).
+ * @returns The clusters, each a list of queries; empty for empty input.
+ */
+export function clusterQueries(queries: string[], threshold = 0.3): string[][] {
+  const groups: string[][] = [];
+  for (const candidate of queries) {
+    // Positions of every group the candidate links to, in ascending order.
+    const hits: number[] = [];
+    groups.forEach((group, position) => {
+      const linked = group.some(
+        (member) => computeQuerySimilarity(candidate, member) >= threshold,
+      );
+      if (linked) hits.push(position);
+    });
+    const [first, ...others] = hits;
+    if (first === undefined) {
+      groups.push([candidate]);
+      continue;
+    }
+    const home = groups[first];
+    // Fold later groups into the first, highest position first, so that
+    // removing one never shifts a position still waiting to be handled.
+    for (const position of others.reverse()) {
+      home.push(...groups[position]);
+      groups.splice(position, 1);
+    }
+    home.push(candidate);
+  }
+  return groups;
+}
+// ---------------------------------------------------------------------------
+// Failure pattern extraction
+// ---------------------------------------------------------------------------
+/**
+ * Derive failure patterns for one skill from eval entries and usage records.
+ *
+ * An eval entry counts as missed when it should trigger but its query is not
+ * among the queries recorded as triggered for `skillName`. Missed queries are
+ * bucketed by invocation type ("implicit" when unset), clustered per bucket,
+ * and each cluster becomes one pattern.
+ *
+ * @param evalEntries - Eval entries to check.
+ * @param skillUsage - Recorded skill usage.
+ * @param skillName - Skill whose misses are being analysed.
+ * @returns Patterns ordered by frequency, highest first.
+ */
+export function extractFailurePatterns(
+  evalEntries: EvalEntry[],
+  skillUsage: SkillUsageRecord[],
+  skillName: string,
+): FailurePattern[] {
+  // Queries for which this skill was recorded as triggered.
+  const fired = new Set<string>(
+    skillUsage
+      .filter((usage) => usage.skill_name === skillName && usage.triggered)
+      .map((usage) => usage.query),
+  );
+  // Missed queries per invocation type, in first-seen order.
+  const buckets = new Map<InvocationType, string[]>();
+  for (const { should_trigger, query, invocation_type } of evalEntries) {
+    if (!should_trigger || fired.has(query)) continue;
+    const kind = invocation_type ?? "implicit";
+    const bucket = buckets.get(kind);
+    if (bucket) {
+      bucket.push(query);
+    } else {
+      buckets.set(kind, [query]);
+    }
+  }
+  const extractedAt = new Date().toISOString();
+  const patterns: FailurePattern[] = [];
+  for (const [kind, missed] of buckets) {
+    for (const cluster of clusterQueries(missed)) {
+      patterns.push({
+        // Ids are numbered in creation order, before sorting.
+        pattern_id: `fp-${skillName}-${patterns.length}`,
+        skill_name: skillName,
+        invocation_type: kind,
+        missed_queries: cluster,
+        frequency: cluster.length,
+        sample_sessions: [],
+        extracted_at: extractedAt,
+      });
+    }
+  }
+  return patterns.sort((left, right) => right.frequency - left.frequency);
+}

package/cli/selftune/evolution/propose-description.ts ADDED Viewed

@@ -0,0 +1,146 @@
+/**
+ * propose-description.ts
+ *
+ * Generates improved skill description proposals using LLM analysis of failure
+ * patterns. Takes the current description, identified failure patterns, and
+ * missed queries, then produces a structured EvolutionProposal with an
+ * improved description, rationale, and confidence score.
+ */
+import type { EvolutionProposal, FailurePattern } from "../types.js";
+import { callLlm, stripMarkdownFences } from "../utils/llm-call.js";
+// ---------------------------------------------------------------------------
+// System prompt
+// ---------------------------------------------------------------------------
+/**
+ * System prompt for the proposal generator LLM.
+ *
+ * generateProposal passes this as the system prompt to callLlm. It asks for a
+ * JSON-only reply with "proposed_description", "rationale" and "confidence",
+ * which are the three fields parseProposalResponse validates.
+ */
+export const PROPOSER_SYSTEM = `You are a skill description optimizer for an AI agent routing system.
+Your task is to analyze the current skill description and its failure patterns,
+then propose an improved description that would catch the missed queries while
+preserving correct routing for existing queries.
+Rules:
+- The description must be concise and specific.
+- It must cover the semantic space of the missed queries without being too broad.
+- Maintain the original intent and scope of the skill.
+- Output ONLY valid JSON with exactly these fields:
+  - "proposed_description" (string): the improved skill description
+  - "rationale" (string): explanation of what changed and why
+  - "confidence" (number): 0.0-1.0 how confident you are this improves routing
+Do NOT include any text outside the JSON object.`;
+// ---------------------------------------------------------------------------
+// Prompt builder
+// ---------------------------------------------------------------------------
+/**
+ * Assemble the user prompt sent to the proposer LLM.
+ *
+ * @param currentDescription - The skill description as it stands today.
+ * @param failurePatterns - Patterns to list, each with its id, frequency,
+ *   invocation type and missed queries.
+ * @param missedQueries - Flat list of missed queries, printed after the patterns.
+ * @param skillName - Skill name, shown in the header and the closing instruction.
+ * @returns The prompt text, with sections separated by single newlines and
+ *   pattern blocks separated by a blank line.
+ */
+export function buildProposalPrompt(
+  currentDescription: string,
+  failurePatterns: FailurePattern[],
+  missedQueries: string[],
+  skillName: string,
+): string {
+  // Render each item as an indented, double-quoted bullet line.
+  const bulletList = (items: string[], indent: string): string =>
+    items.map((item) => `${indent}- "${item}"`).join("\n");
+  const patternBlocks: string[] = [];
+  for (const pattern of failurePatterns) {
+    const heading = `  Pattern ${pattern.pattern_id} (frequency: ${pattern.frequency}, type: ${pattern.invocation_type}):`;
+    patternBlocks.push(`${heading}\n${bulletList(pattern.missed_queries, "    ")}`);
+  }
+  const sections = [
+    `Skill Name: ${skillName}`,
+    "Current Description:",
+    currentDescription,
+    "Failure Patterns:",
+    patternBlocks.join("\n\n"),
+    "All Missed Queries:",
+    bulletList(missedQueries, "  "),
+    `Propose an improved description for the "${skillName}" skill that would correctly route the missed queries listed above. Output ONLY a JSON object with "proposed_description", "rationale", and "confidence" fields.`,
+  ];
+  return sections.join("\n");
+}
+// ---------------------------------------------------------------------------
+// Response parser
+// ---------------------------------------------------------------------------
+/**
+ * Turn the raw LLM reply into validated proposal fields.
+ *
+ * The reply is passed through stripMarkdownFences, parsed as JSON, and checked
+ * for a string `proposed_description`, a string `rationale` and a numeric
+ * `confidence`. The confidence is clamped into 0.0-1.0.
+ *
+ * @param raw - Reply text from the LLM.
+ * @returns The three validated fields.
+ * @throws Error when the text is not JSON, is not an object, or a field is
+ *   missing or of the wrong type.
+ */
+export function parseProposalResponse(raw: string): {
+  proposed_description: string;
+  rationale: string;
+  confidence: number;
+} {
+  const text = stripMarkdownFences(raw);
+  let payload: unknown;
+  try {
+    payload = JSON.parse(text);
+  } catch {
+    // Include the start of the offending text to make the failure diagnosable.
+    throw new Error(`Failed to parse LLM response as JSON: ${text.slice(0, 200)}`);
+  }
+  if (payload === null || typeof payload !== "object") {
+    throw new Error("LLM response is not a JSON object");
+  }
+  const { proposed_description, rationale, confidence } = payload as Record<string, unknown>;
+  if (typeof proposed_description !== "string") {
+    throw new Error("Missing or invalid 'proposed_description' field in LLM response");
+  }
+  if (typeof rationale !== "string") {
+    throw new Error("Missing or invalid 'rationale' field in LLM response");
+  }
+  if (typeof confidence !== "number") {
+    throw new Error("Missing or invalid 'confidence' field in LLM response");
+  }
+  const clamped = Math.max(0.0, Math.min(1.0, confidence));
+  return { proposed_description, rationale, confidence: clamped };
+}
+// ---------------------------------------------------------------------------
+// Proposal generator
+// ---------------------------------------------------------------------------
+/**
+ * Ask the LLM for an improved description and wrap it as an EvolutionProposal.
+ *
+ * Builds the user prompt, calls the LLM with PROPOSER_SYSTEM, parses the reply,
+ * and returns a proposal with status "pending". The before/after eval results
+ * are filled with zeros here.
+ *
+ * @param currentDescription - Description currently in use; stored as the original.
+ * @param failurePatterns - Patterns shown to the LLM; their ids are recorded.
+ * @param missedQueries - Missed queries shown to the LLM.
+ * @param skillName - Skill name; also part of the proposal id.
+ * @param skillPath - Stored on the proposal as skill_path.
+ * @param mode - Forwarded to callLlm.
+ * @param agent - Forwarded to callLlm.
+ * @returns The new proposal.
+ */
+export async function generateProposal(
+  currentDescription: string,
+  failurePatterns: FailurePattern[],
+  missedQueries: string[],
+  skillName: string,
+  skillPath: string,
+  mode: "agent" | "api",
+  agent?: string,
+): Promise<EvolutionProposal> {
+  const userPrompt = buildProposalPrompt(
+    currentDescription,
+    failurePatterns,
+    missedQueries,
+    skillName,
+  );
+  const reply = await callLlm(PROPOSER_SYSTEM, userPrompt, mode, agent);
+  const suggestion = parseProposalResponse(reply);
+  // Fresh zeroed stats object for each of before/after.
+  const zeroStats = () => ({ total: 0, passed: 0, failed: 0, pass_rate: 0 });
+  const proposal: EvolutionProposal = {
+    proposal_id: `evo-${skillName}-${Date.now()}`,
+    skill_name: skillName,
+    skill_path: skillPath,
+    original_description: currentDescription,
+    proposed_description: suggestion.proposed_description,
+    rationale: suggestion.rationale,
+    failure_patterns: failurePatterns.map((pattern) => pattern.pattern_id),
+    eval_results: { before: zeroStats(), after: zeroStats() },
+    confidence: suggestion.confidence,
+    created_at: new Date().toISOString(),
+    status: "pending",
+  };
+  return proposal;
+}

package/cli/selftune/evolution/rollback.ts ADDED Viewed

@@ -0,0 +1,242 @@
+/**
+ * Evolution rollback mechanism (TASK-15).
+ *
+ * Restores a skill's SKILL.md to its pre-evolution state by:
+ * 1. Checking for a .bak backup file at the skill path
+ * 2. Falling back to the audit trail's "created" entry for original_description
+ * 3. Recording a "rolled_back" entry in the audit trail
+ */
+import { existsSync, readFileSync, readdirSync, unlinkSync, writeFileSync } from "node:fs";
+import { basename, dirname, join } from "node:path";
+import { parseArgs } from "node:util";
+import type { EvolutionAuditEntry } from "../types.js";
+import { appendAuditEntry, getLastDeployedProposal, readAuditTrail } from "./audit.js";
+import { replaceDescription } from "./deploy-proposal.js";
+// ---------------------------------------------------------------------------
+// Types
+// ---------------------------------------------------------------------------
+/** Inputs accepted by rollback(). */
+export interface RollbackOptions {
+  /** Skill name; used to look up audit entries and in result/audit messages. */
+  skillName: string;
+  /** Path of the SKILL.md file to restore; rollback() refuses if it does not exist. */
+  skillPath: string;
+  /** When set, rollback() requires a matching "deployed" audit entry and skips the .bak strategy. */
+  proposalId?: string; // rollback specific proposal, or last deployed
+  /** Forwarded to the audit-trail helpers. */
+  logPath?: string; // optional override for audit log path (testing)
+}
+/** Outcome reported by rollback(). */
+export interface RollbackResult {
+  /** True when SKILL.md was rewritten. */
+  rolledBack: boolean;
+  /**
+   * What was restored: the backup file's full content when restored from a
+   * .bak file, the original description when restored from the audit trail,
+   * and "" when nothing was rolled back.
+   */
+  restoredDescription: string;
+  /** Human-readable explanation of what happened or why nothing did. */
+  reason: string;
+}
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+/** Marker that may prefix the `details` of a "created" audit entry; the text after it is taken as the original description. */
+const ORIGINAL_DESC_PREFIX = "original_description:";
+/**
+ * Locate the newest backup of the file at `skillPath`.
+ *
+ * Candidates live in the same directory and are named either `<name>.bak`
+ * (legacy, treated as oldest) or `<name>.<timestamp>.bak`. The candidate whose
+ * timestamp text compares highest wins.
+ *
+ * @param skillPath - Path of the file whose backups are wanted.
+ * @returns Full path of the newest backup, or null when the directory is
+ *   missing or holds no backup.
+ */
+function findLatestBackup(skillPath: string): string | null {
+  const folder = dirname(skillPath);
+  if (!existsSync(folder)) return null;
+  const fileName = basename(skillPath);
+  const legacyName = `${fileName}.bak`;
+  // Text between "<name>." and ".bak"; the legacy name has none.
+  const stampOf = (name: string): string =>
+    name === legacyName ? "" : name.slice(fileName.length + 1, -4);
+  const candidates = readdirSync(folder).filter(
+    (name) => name === legacyName || (name.startsWith(`${fileName}.`) && name.endsWith(".bak")),
+  );
+  // Highest timestamp first.
+  candidates.sort((left, right) => stampOf(right).localeCompare(stampOf(left)));
+  const [newest] = candidates;
+  return newest === undefined ? null : join(folder, newest);
+}
+/**
+ * Recover the original description recorded when a proposal was created.
+ *
+ * Looks up the first "created" audit entry for `proposalId`. If its details
+ * start with the original-description prefix, the remainder is returned;
+ * otherwise any non-empty details text is returned as-is.
+ *
+ * @param proposalId - Proposal whose "created" entry is wanted.
+ * @param logPath - Forwarded to readAuditTrail.
+ * @returns The recorded description, or null when there is no such entry or
+ *   its details are empty.
+ */
+function findOriginalFromAudit(proposalId: string, logPath?: string): string | null {
+  const created = readAuditTrail(undefined, logPath).find(
+    (entry) => entry.action === "created" && entry.proposal_id === proposalId,
+  );
+  if (!created) return null;
+  const text = created.details;
+  if (text.startsWith(ORIGINAL_DESC_PREFIX)) {
+    return text.slice(ORIGINAL_DESC_PREFIX.length);
+  }
+  // Unprefixed details are accepted verbatim as long as they are non-empty.
+  return text.length > 0 ? text : null;
+}
+/**
+ * Look up the first "deployed" audit entry for a proposal.
+ *
+ * @param proposalId - Proposal to search for.
+ * @param skillName - Forwarded to readAuditTrail.
+ * @param logPath - Forwarded to readAuditTrail.
+ * @returns The matching entry, or null when none exists.
+ */
+function findDeployedEntry(
+  proposalId: string,
+  skillName: string,
+  logPath?: string,
+): EvolutionAuditEntry | null {
+  for (const entry of readAuditTrail(skillName, logPath)) {
+    if (entry.action === "deployed" && entry.proposal_id === proposalId) {
+      return entry;
+    }
+  }
+  return null;
+}
+// ---------------------------------------------------------------------------
+// Main rollback function
+// ---------------------------------------------------------------------------
+/**
+ * Restore a skill's SKILL.md to its pre-evolution state.
+ *
+ * The target proposal is `options.proposalId` when given (it must have a
+ * "deployed" audit entry), otherwise the last deployed proposal for the skill.
+ * Two sources are tried in order:
+ *   1. The newest .bak file next to SKILL.md, only when no proposalId was
+ *      given. The whole file is restored and the backup is deleted.
+ *   2. The original description from the proposal's "created" audit entry.
+ *      Only the description inside SKILL.md is replaced.
+ * A successful restore appends a "rolled_back" audit entry.
+ *
+ * @param options - Skill name/path, optional proposal id and audit log path.
+ * @returns Whether a rollback happened, what was restored, and why.
+ */
+export async function rollback(options: RollbackOptions): Promise<RollbackResult> {
+  const { skillName, skillPath, proposalId, logPath } = options;
+  const refuse = (reason: string): RollbackResult => ({
+    rolledBack: false,
+    restoredDescription: "",
+    reason,
+  });
+  // Record the rollback in the audit trail, then build the success result.
+  const succeed = (
+    targetId: string,
+    restored: string,
+    source: "backup file" | "audit trail",
+  ): RollbackResult => {
+    const entry: EvolutionAuditEntry = {
+      timestamp: new Date().toISOString(),
+      proposal_id: targetId,
+      action: "rolled_back",
+      details: `Rolled back ${skillName} from ${source}`,
+    };
+    appendAuditEntry(entry, logPath);
+    return {
+      rolledBack: true,
+      restoredDescription: restored,
+      reason: `Restored from ${source}`,
+    };
+  };
+  if (!existsSync(skillPath)) {
+    return refuse(`SKILL.md not found at ${skillPath}`);
+  }
+  // Work out which proposal is being undone.
+  let targetProposalId: string;
+  if (proposalId) {
+    if (!findDeployedEntry(proposalId, skillName, logPath)) {
+      return refuse(`Proposal ${proposalId} not found as deployed entry in audit trail`);
+    }
+    targetProposalId = proposalId;
+  } else {
+    const lastDeployed = getLastDeployedProposal(skillName, logPath);
+    if (!lastDeployed) {
+      return refuse(`No deployed proposal found for skill "${skillName}"`);
+    }
+    targetProposalId = lastDeployed.proposal_id;
+  }
+  // Source 1: backup file. Skipped when a specific proposal was requested.
+  const backupPath = proposalId ? null : findLatestBackup(skillPath);
+  if (backupPath) {
+    const backupContent = readFileSync(backupPath, "utf-8");
+    writeFileSync(skillPath, backupContent, "utf-8");
+    unlinkSync(backupPath);
+    return succeed(targetProposalId, backupContent, "backup file");
+  }
+  // Source 2: description stored in the audit trail.
+  const auditDescription = findOriginalFromAudit(targetProposalId, logPath);
+  if (!auditDescription) {
+    return refuse(
+      `No restoration source found for proposal ${targetProposalId} (no .bak file and no original_description in audit trail)`,
+    );
+  }
+  const patched = replaceDescription(readFileSync(skillPath, "utf-8"), auditDescription);
+  writeFileSync(skillPath, patched, "utf-8");
+  return succeed(targetProposalId, auditDescription, "audit trail");
+}
+// ---------------------------------------------------------------------------
+// CLI entry point
+// ---------------------------------------------------------------------------
+/**
+ * Command-line entry point for `selftune rollback`.
+ *
+ * Parses --skill, --skill-path, --proposal-id and --help, runs rollback(),
+ * prints the result as pretty-printed JSON, and exits with 0 when a rollback
+ * happened or 1 otherwise (including when a required flag is missing).
+ */
+export async function cliMain(): Promise<void> {
+  const usage = `selftune rollback — Rollback a skill to its pre-evolution state
+Usage:
+  selftune rollback --skill <name> --skill-path <path> [options]
+Options:
+  --skill             Skill name (required)
+  --skill-path        Path to SKILL.md (required)
+  --proposal-id       Specific proposal ID to rollback (optional, uses latest if omitted)
+  --help              Show this help message`;
+  const { values: flags } = parseArgs({
+    strict: true,
+    options: {
+      skill: { type: "string" },
+      "skill-path": { type: "string" },
+      "proposal-id": { type: "string" },
+      help: { type: "boolean", default: false },
+    },
+  });
+  if (flags.help) {
+    console.log(usage);
+    process.exit(0);
+  }
+  const skillName = flags.skill;
+  const skillPath = flags["skill-path"];
+  if (!skillName || !skillPath) {
+    console.error("[ERROR] --skill and --skill-path are required");
+    process.exit(1);
+  }
+  const outcome = await rollback({
+    skillName,
+    skillPath,
+    proposalId: flags["proposal-id"],
+  });
+  console.log(JSON.stringify(outcome, null, 2));
+  process.exit(outcome.rolledBack ? 0 : 1);
+}
+// Run the CLI only when import.meta.main is truthy.
+// NOTE(review): presumably the runtime sets this when the module is the program
+// entry point (e.g. Bun) — confirm for the runtimes this package supports.
+if (import.meta.main) {
+  cliMain().catch((err) => {
+    // Any rejection from cliMain is reported and turned into exit code 1.
+    console.error(`[FATAL] ${err}`);
+    process.exit(1);
+  });
+}

package/cli/selftune/evolution/stopping-criteria.ts ADDED Viewed

@@ -0,0 +1,69 @@
+/**
+ * stopping-criteria.ts
+ *
+ * Evaluates whether the evolution loop should stop based on convergence,
+ * iteration limits, confidence thresholds, and plateau detection.
+ * Pure function module with no external dependencies.
+ */
+// ---------------------------------------------------------------------------
+// Types
+// ---------------------------------------------------------------------------
+/** Result of one evaluateStoppingCriteria call. */
+export interface StoppingDecision {
+  /** True when the evolution loop should end. */
+  shouldStop: boolean;
+  /** Human-readable explanation of the decision. */
+  reason: string;
+}
+// ---------------------------------------------------------------------------
+// Stopping criteria evaluator
+// ---------------------------------------------------------------------------
+/**
+ * Decide whether the evolution loop should end.
+ *
+ * The first matching rule wins:
+ *   1. Converged: current pass rate is at least 0.95.
+ *   2. Max iterations: iterationCount has reached maxIterations.
+ *   3. Low confidence: proposalConfidence is below confidenceThreshold.
+ *   4. Plateau: the last two previous pass rates plus the current one span
+ *      less than 0.01 (needs at least two previous rates).
+ *   5. Otherwise, continue.
+ *
+ * @param currentPassRate - Pass rate of the current iteration (0.0-1.0).
+ * @param previousPassRates - Pass rates of earlier iterations, oldest first.
+ * @param iterationCount - Iterations completed so far.
+ * @param maxIterations - Iteration limit.
+ * @param confidenceThreshold - Minimum acceptable proposal confidence.
+ * @param proposalConfidence - Confidence of the current proposal.
+ * @returns The decision together with its reason.
+ */
+export function evaluateStoppingCriteria(
+  currentPassRate: number,
+  previousPassRates: number[],
+  iterationCount: number,
+  maxIterations: number,
+  confidenceThreshold: number,
+  proposalConfidence: number,
+): StoppingDecision {
+  const stop = (reason: string): StoppingDecision => ({ shouldStop: true, reason });
+  if (currentPassRate >= 0.95) return stop("Converged: pass rate \u2265 95%");
+  if (iterationCount >= maxIterations) return stop("Max iterations reached");
+  if (proposalConfidence < confidenceThreshold) return stop("Confidence below threshold");
+  if (previousPassRates.length >= 2) {
+    // Three data points: the two most recent earlier rates and the current one.
+    const recent = previousPassRates.slice(-2).concat(currentPassRate);
+    const spread = Math.max(...recent) - Math.min(...recent);
+    if (spread < 0.01) return stop("Plateau: no improvement in last 3 iterations");
+  }
+  return { shouldStop: false, reason: "Continuing: improvement possible" };
+}