npm - selftune - Versions diffs - 0.2.31 → 0.2.32 - Mend

selftune 0.2.31 → 0.2.32

This diff shows the changes between two publicly available versions of a package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Files changed (95)

package/README.md +83 -56
package/apps/local-dashboard/dist/assets/index-B-ut4w0B.js +15 -0
package/apps/local-dashboard/dist/assets/index-BFGfCVrL.css +1 -0
package/apps/local-dashboard/dist/assets/vendor-ui-DfowE3Hu.js +1 -0
package/apps/local-dashboard/dist/index.html +3 -3
package/cli/selftune/command-surface.ts +613 -2
package/cli/selftune/create/baseline.ts +429 -0
package/cli/selftune/create/check.ts +35 -0
package/cli/selftune/create/init.ts +115 -0
package/cli/selftune/create/package-candidate-state.ts +771 -0
package/cli/selftune/create/package-evaluator.ts +710 -0
package/cli/selftune/create/package-fingerprint.ts +142 -0
package/cli/selftune/create/package-search.ts +377 -0
package/cli/selftune/create/publish.ts +431 -0
package/cli/selftune/create/readiness.ts +495 -0
package/cli/selftune/create/replay.ts +330 -0
package/cli/selftune/create/report.ts +74 -0
package/cli/selftune/create/scaffold.ts +121 -0
package/cli/selftune/create/skills-ref-adapter.ts +177 -0
package/cli/selftune/create/status.ts +33 -0
package/cli/selftune/create/templates.ts +249 -0
package/cli/selftune/cron/setup.ts +1 -1
package/cli/selftune/dashboard-action-events.ts +4 -1
package/cli/selftune/dashboard-action-result.ts +789 -24
package/cli/selftune/dashboard-action-stream.ts +80 -0
package/cli/selftune/dashboard-contract.ts +146 -3
package/cli/selftune/dashboard-server.ts +5 -4
package/cli/selftune/eval/hooks-to-evals.ts +58 -35
package/cli/selftune/eval/synthetic-evals.ts +145 -17
package/cli/selftune/evolution/bounded-mutations.ts +1045 -0
package/cli/selftune/evolution/evolve-body.ts +9 -36
package/cli/selftune/evolution/evolve.ts +8 -72
package/cli/selftune/evolution/stopping-criteria.ts +5 -13
package/cli/selftune/evolution/unblock-suggestions.ts +0 -16
package/cli/selftune/evolution/validate-host-replay.ts +115 -15
package/cli/selftune/improve.ts +206 -0
package/cli/selftune/index.ts +123 -6
package/cli/selftune/init.ts +1 -1
package/cli/selftune/localdb/queries/dashboard.ts +30 -0
package/cli/selftune/localdb/schema.ts +52 -0
package/cli/selftune/monitoring/watch.ts +257 -23
package/cli/selftune/orchestrate/execute.ts +300 -1
package/cli/selftune/orchestrate/finalize.ts +14 -0
package/cli/selftune/orchestrate/plan.ts +22 -5
package/cli/selftune/orchestrate/prepare.ts +59 -4
package/cli/selftune/orchestrate/report.ts +1 -1
package/cli/selftune/orchestrate.ts +34 -1
package/cli/selftune/publish.ts +35 -0
package/cli/selftune/routes/actions.ts +81 -15
package/cli/selftune/routes/overview.ts +1 -1
package/cli/selftune/routes/skill-report.ts +147 -2
package/cli/selftune/run.ts +18 -0
package/cli/selftune/schedule.ts +3 -3
package/cli/selftune/search-run.ts +703 -0
package/cli/selftune/status.ts +35 -11
package/cli/selftune/testing-readiness.ts +431 -40
package/cli/selftune/types.ts +316 -0
package/cli/selftune/utils/eval-readiness.ts +1 -0
package/cli/selftune/utils/json-output.ts +11 -0
package/cli/selftune/utils/lifecycle-surface.ts +48 -0
package/cli/selftune/utils/query-filter.ts +82 -1
package/cli/selftune/utils/tui.ts +85 -2
package/cli/selftune/verify.ts +205 -0
package/cli/selftune/workflows/proposals.ts +1 -1
package/cli/selftune/workflows/skill-scaffold.ts +141 -63
package/cli/selftune/workflows/workflows.ts +4 -4
package/package.json +1 -1
package/skill/SKILL.md +148 -85
package/skill/references/cli-quick-reference.md +16 -1
package/skill/references/creator-playbook.md +31 -10
package/skill/workflows/Baseline.md +8 -9
package/skill/workflows/Contributions.md +4 -4
package/skill/workflows/Create.md +173 -0
package/skill/workflows/CreateTestDeploy.md +34 -30
package/skill/workflows/Cron.md +2 -2
package/skill/workflows/Dashboard.md +3 -3
package/skill/workflows/Evals.md +13 -7
package/skill/workflows/Evolve.md +75 -32
package/skill/workflows/EvolveBody.md +22 -15
package/skill/workflows/Hook.md +1 -1
package/skill/workflows/Improve.md +168 -0
package/skill/workflows/Initialize.md +3 -3
package/skill/workflows/Orchestrate.md +49 -12
package/skill/workflows/Publish.md +100 -0
package/skill/workflows/Run.md +72 -0
package/skill/workflows/Schedule.md +2 -2
package/skill/workflows/SearchRun.md +89 -0
package/skill/workflows/SignalsDashboard.md +2 -2
package/skill/workflows/UnitTest.md +13 -4
package/skill/workflows/Verify.md +136 -0
package/skill/workflows/Watch.md +114 -47
package/skill/workflows/Workflows.md +13 -8
package/apps/local-dashboard/dist/assets/index-B7v_o1WC.js +0 -15
package/apps/local-dashboard/dist/assets/index-CrO77SVi.css +0 -1
package/apps/local-dashboard/dist/assets/vendor-ui-B0H8s1mP.js +0 -1

package/cli/selftune/eval/synthetic-evals.ts CHANGED

@@ -8,9 +8,10 @@
 import { readFileSync } from "node:fs";
-import type { EvalEntry, InvocationType, SkillUsageRecord } from "../types.js";
+import type { EvalEntry, InvocationType, QueryLogRecord, SkillUsageRecord } from "../types.js";
 import { callLlm, stripMarkdownFences } from "../utils/llm-call.js";
 import type { LlmCallObserver } from "../utils/llm-call.js";
+import { extractActionableQueryText, extractPositiveEvalQueryText } from "../utils/query-filter.js";
 import { findInstalledSkillNames } from "../utils/skill-discovery.js";
 import { classifyInvocation } from "./invocation-classifier.js";
@@ -50,6 +51,25 @@ interface PromptFamilyTargets {
   unrelatedNegativeCount: number;
 }
+const MAX_REAL_EXAMPLE_LENGTH = 220;
+const MAX_SYNTHETIC_SKILL_CONTENT_CHARS = 6000;
+const MAX_SYNTHETIC_SECTION_CHARS = 1200;
+const MAX_SYNTHETIC_PREAMBLE_CHARS = 800;
+const PRIORITY_SYNTHETIC_SECTION_PATTERNS = [
+  /when this skill activates/i,
+  /when to invoke/i,
+  /when to use/i,
+  /\buse when\b/i,
+  /workflow routing/i,
+  /\busage\b/i,
+  /\bexamples?\b/i,
+  /\bformat\b/i,
+  /publish workflow/i,
+  /input/i,
+  /output/i,
+  /activation/i,
+] as const;
 function getSyntheticSkillSearchDirs(): string[] {
   const cwd = process.cwd();
   const homeDir = process.env.HOME ?? "";
@@ -122,6 +142,115 @@ function normalizeEvalQuery(query: string): string {
   return query.trim().toLowerCase().replace(/\s+/g, " ");
 }
+function truncatePromptExample(query: string): string {
+  const trimmed = query.trim();
+  if (trimmed.length <= MAX_REAL_EXAMPLE_LENGTH) return trimmed;
+  return `${trimmed.slice(0, MAX_REAL_EXAMPLE_LENGTH - 1).trimEnd()}…`;
+}
+function truncateSyntheticSection(text: string, limit: number): string {
+  const trimmed = text.trim();
+  if (trimmed.length <= limit) return trimmed;
+  return `${trimmed.slice(0, limit - 1).trimEnd()}…`;
+}
+export function summarizeSkillContentForSyntheticPrompt(skillContent: string): string {
+  const trimmed = skillContent.trim();
+  if (trimmed.length <= MAX_SYNTHETIC_SKILL_CONTENT_CHARS) return trimmed;
+  const frontmatterMatch = trimmed.match(/^---\n[\s\S]*?\n---\n*/);
+  const frontmatter = frontmatterMatch?.[0]?.trim() ?? "";
+  const body = frontmatterMatch ? trimmed.slice(frontmatterMatch[0].length).trim() : trimmed;
+  const sectionRegex = /^#{1,6}\s+.+$/gm;
+  const headingMatches = [...body.matchAll(sectionRegex)];
+  if (headingMatches.length === 0) {
+    return truncateSyntheticSection(trimmed, MAX_SYNTHETIC_SKILL_CONTENT_CHARS);
+  }
+  const summaryParts: string[] = [];
+  let usedLength = 0;
+  const appendPart = (part: string): boolean => {
+    const normalized = part.trim();
+    if (!normalized) return false;
+    const nextLength = usedLength + normalized.length + (summaryParts.length > 0 ? 2 : 0);
+    if (nextLength > MAX_SYNTHETIC_SKILL_CONTENT_CHARS) return false;
+    summaryParts.push(normalized);
+    usedLength = nextLength;
+    return true;
+  };
+  if (frontmatter) {
+    appendPart(frontmatter);
+  }
+  const preamble = body.slice(0, headingMatches[0]?.index ?? 0).trim();
+  if (preamble) {
+    appendPart(truncateSyntheticSection(preamble, MAX_SYNTHETIC_PREAMBLE_CHARS));
+  }
+  const sections = headingMatches.map((match, index) => {
+    const start = match.index ?? 0;
+    const end = headingMatches[index + 1]?.index ?? body.length;
+    const content = body.slice(start, end).trim();
+    const heading = match[0].replace(/^#{1,6}\s+/, "").trim();
+    return { heading, content, index };
+  });
+  const selectedIndices = new Set<number>();
+  if (sections.length > 0) selectedIndices.add(0);
+  for (const section of sections) {
+    if (PRIORITY_SYNTHETIC_SECTION_PATTERNS.some((pattern) => pattern.test(section.heading))) {
+      selectedIndices.add(section.index);
+    }
+  }
+  for (const section of sections) {
+    if (!selectedIndices.has(section.index)) continue;
+    appendPart(truncateSyntheticSection(section.content, MAX_SYNTHETIC_SECTION_CHARS));
+  }
+  appendPart("[skill content summarized for synthetic eval generation]");
+  return summaryParts.join("\n\n");
+}
+export function buildSyntheticPromptRealExamples(
+  positiveCandidates: string[],
+  negativeCandidates: string[],
+  skillName: string,
+  limit = 5,
+): SyntheticPromptRealExamples | undefined {
+  const cleanedPositives: string[] = [];
+  const seenPositives = new Set<string>();
+  for (const candidate of positiveCandidates) {
+    const cleaned = extractPositiveEvalQueryText(candidate, skillName);
+    if (!cleaned) continue;
+    const normalized = normalizeEvalQuery(cleaned);
+    if (seenPositives.has(normalized)) continue;
+    seenPositives.add(normalized);
+    cleanedPositives.push(truncatePromptExample(cleaned));
+    if (cleanedPositives.length >= limit) break;
+  }
+  if (cleanedPositives.length === 0) return undefined;
+  const positiveSet = new Set(cleanedPositives.map((query) => normalizeEvalQuery(query)));
+  const cleanedNegatives: string[] = [];
+  const seenNegatives = new Set<string>();
+  for (const candidate of negativeCandidates) {
+    const cleaned = extractActionableQueryText(candidate);
+    if (!cleaned) continue;
+    const truncated = truncatePromptExample(cleaned);
+    const normalized = normalizeEvalQuery(truncated);
+    if (positiveSet.has(normalized) || seenNegatives.has(normalized)) continue;
+    seenNegatives.add(normalized);
+    cleanedNegatives.push(truncated);
+    if (cleanedNegatives.length >= limit) break;
+  }
+  return { positive: cleanedPositives, negative: cleanedNegatives };
+}
 function dedupeEvalEntries(entries: EvalEntry[]): EvalEntry[] {
   const seen = new Set<string>();
   const deduped: EvalEntry[] = [];
@@ -223,6 +352,7 @@ export function buildSyntheticPrompt(
   realExamples?: SyntheticPromptRealExamples,
   siblingSkills: string[] = [],
 ): { system: string; user: string } {
+  const summarizedSkillContent = summarizeSkillContentForSyntheticPrompt(skillContent);
   const {
     explicitCount,
     implicitCount,
@@ -259,7 +389,7 @@ Output as JSON array with no surrounding text:
   let user = `Skill name: ${skillName}
 Skill content:
-${skillContent}
+${summarizedSkillContent}
 Generate exactly ${maxPositives} positive queries (should_trigger: true) and ${maxNegatives} negative queries (should_trigger: false).
@@ -308,6 +438,7 @@ export function buildSyntheticRefinementPrompt(
   maxNegatives: number,
   siblingSkills: string[] = [],
 ): { system: string; user: string } {
+  const summarizedSkillContent = summarizeSkillContentForSyntheticPrompt(skillContent);
   const targets = buildPromptFamilyTargets(maxPositives, maxNegatives, siblingSkills.length > 0);
   const system = `You are refining a cold-start eval benchmark for a coding agent skill.
@@ -325,7 +456,7 @@ Return ONLY a JSON array with the final benchmark.`;
   const user = `Skill name: ${skillName}
 Skill content:
-${skillContent}
+${summarizedSkillContent}
 Target final benchmark:
 - ${maxPositives} positives
@@ -459,25 +590,22 @@ export async function generateSyntheticEvals(
     // Positives: high-confidence triggered records for this skill
     const skillRecords = querySkillUsageRecords(db) as SkillUsageRecord[];
-    const positive = skillRecords
+    const positiveCandidates = skillRecords
       .filter((r) => isHighConfidencePositiveSkillRecord(r, skillName))
       .map((r) => r.query)
-      .filter((q): q is string => typeof q === "string" && q.length > 0)
-      .slice(0, 5);
+      .filter((q): q is string => typeof q === "string" && q.length > 0);
-    // Negatives: from all_queries, excluding known positives
-    const posSet = new Set(positive.map((q: string) => q.toLowerCase()));
-    const allQueries = queryQueryLog(db);
-    const negative = allQueries
+    // Negatives: from all_queries, excluding cleaned positives later.
+    const allQueries = queryQueryLog(db) as QueryLogRecord[];
+    const negativeCandidates = allQueries
       .map((r) => r.query)
-      .filter(
-        (q): q is string => typeof q === "string" && q.length > 0 && !posSet.has(q.toLowerCase()),
-      )
-      .slice(0, 5);
+      .filter((q): q is string => typeof q === "string" && q.length > 0);
-    if (positive.length > 0) {
-      realExamples = { positive, negative };
-    }
+    realExamples = buildSyntheticPromptRealExamples(
+      positiveCandidates,
+      negativeCandidates,
+      skillName,
+    );
   } catch {
     // fail-open: synthetic gen works without real examples
   }