selftune 0.2.30 → 0.2.32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. package/README.md +83 -56
  2. package/apps/local-dashboard/dist/assets/index-B-ut4w0B.js +15 -0
  3. package/apps/local-dashboard/dist/assets/index-BFGfCVrL.css +1 -0
  4. package/apps/local-dashboard/dist/assets/vendor-ui-DfowE3Hu.js +1 -0
  5. package/apps/local-dashboard/dist/index.html +3 -3
  6. package/cli/selftune/command-surface.ts +613 -2
  7. package/cli/selftune/create/baseline.ts +429 -0
  8. package/cli/selftune/create/check.ts +35 -0
  9. package/cli/selftune/create/init.ts +115 -0
  10. package/cli/selftune/create/package-candidate-state.ts +771 -0
  11. package/cli/selftune/create/package-evaluator.ts +710 -0
  12. package/cli/selftune/create/package-fingerprint.ts +142 -0
  13. package/cli/selftune/create/package-search.ts +377 -0
  14. package/cli/selftune/create/publish.ts +431 -0
  15. package/cli/selftune/create/readiness.ts +495 -0
  16. package/cli/selftune/create/replay.ts +330 -0
  17. package/cli/selftune/create/report.ts +74 -0
  18. package/cli/selftune/create/scaffold.ts +121 -0
  19. package/cli/selftune/create/skills-ref-adapter.ts +177 -0
  20. package/cli/selftune/create/status.ts +33 -0
  21. package/cli/selftune/create/templates.ts +249 -0
  22. package/cli/selftune/cron/setup.ts +1 -1
  23. package/cli/selftune/dashboard-action-events.ts +4 -1
  24. package/cli/selftune/dashboard-action-result.ts +789 -24
  25. package/cli/selftune/dashboard-action-stream.ts +80 -0
  26. package/cli/selftune/dashboard-contract.ts +146 -3
  27. package/cli/selftune/dashboard-server.ts +5 -4
  28. package/cli/selftune/eval/hooks-to-evals.ts +58 -35
  29. package/cli/selftune/eval/synthetic-evals.ts +145 -17
  30. package/cli/selftune/evolution/bounded-mutations.ts +1045 -0
  31. package/cli/selftune/evolution/evolve-body.ts +9 -36
  32. package/cli/selftune/evolution/evolve.ts +8 -72
  33. package/cli/selftune/evolution/stopping-criteria.ts +5 -13
  34. package/cli/selftune/evolution/unblock-suggestions.ts +0 -16
  35. package/cli/selftune/evolution/validate-host-replay.ts +115 -15
  36. package/cli/selftune/improve.ts +206 -0
  37. package/cli/selftune/index.ts +123 -6
  38. package/cli/selftune/init.ts +1 -1
  39. package/cli/selftune/localdb/queries/dashboard.ts +30 -0
  40. package/cli/selftune/localdb/schema.ts +52 -0
  41. package/cli/selftune/monitoring/watch.ts +257 -23
  42. package/cli/selftune/orchestrate/execute.ts +300 -1
  43. package/cli/selftune/orchestrate/finalize.ts +14 -0
  44. package/cli/selftune/orchestrate/plan.ts +22 -5
  45. package/cli/selftune/orchestrate/prepare.ts +59 -4
  46. package/cli/selftune/orchestrate/report.ts +1 -1
  47. package/cli/selftune/orchestrate.ts +34 -1
  48. package/cli/selftune/publish.ts +35 -0
  49. package/cli/selftune/registry/github-install.ts +256 -0
  50. package/cli/selftune/registry/index.ts +1 -1
  51. package/cli/selftune/registry/install.ts +58 -7
  52. package/cli/selftune/routes/actions.ts +81 -15
  53. package/cli/selftune/routes/overview.ts +1 -1
  54. package/cli/selftune/routes/skill-report.ts +147 -2
  55. package/cli/selftune/run.ts +18 -0
  56. package/cli/selftune/schedule.ts +3 -3
  57. package/cli/selftune/search-run.ts +703 -0
  58. package/cli/selftune/status.ts +35 -11
  59. package/cli/selftune/testing-readiness.ts +431 -40
  60. package/cli/selftune/types.ts +316 -0
  61. package/cli/selftune/utils/eval-readiness.ts +1 -0
  62. package/cli/selftune/utils/json-output.ts +11 -0
  63. package/cli/selftune/utils/lifecycle-surface.ts +48 -0
  64. package/cli/selftune/utils/query-filter.ts +82 -1
  65. package/cli/selftune/utils/tui.ts +85 -2
  66. package/cli/selftune/verify.ts +205 -0
  67. package/cli/selftune/workflows/proposals.ts +1 -1
  68. package/cli/selftune/workflows/skill-scaffold.ts +141 -63
  69. package/cli/selftune/workflows/workflows.ts +4 -4
  70. package/package.json +1 -1
  71. package/packages/dashboard-core/src/routes/manifest.ts +2 -2
  72. package/packages/ui/src/components/SkillReportPanels.tsx +7 -7
  73. package/packages/ui/src/primitives/button.tsx +5 -0
  74. package/skill/SKILL.md +148 -85
  75. package/skill/references/cli-quick-reference.md +16 -1
  76. package/skill/references/creator-playbook.md +31 -10
  77. package/skill/workflows/Baseline.md +8 -9
  78. package/skill/workflows/Contributions.md +4 -4
  79. package/skill/workflows/Create.md +173 -0
  80. package/skill/workflows/CreateTestDeploy.md +34 -30
  81. package/skill/workflows/Cron.md +2 -2
  82. package/skill/workflows/Dashboard.md +3 -3
  83. package/skill/workflows/Evals.md +13 -7
  84. package/skill/workflows/Evolve.md +75 -32
  85. package/skill/workflows/EvolveBody.md +22 -15
  86. package/skill/workflows/Hook.md +1 -1
  87. package/skill/workflows/Improve.md +168 -0
  88. package/skill/workflows/Initialize.md +3 -3
  89. package/skill/workflows/Orchestrate.md +49 -12
  90. package/skill/workflows/Publish.md +100 -0
  91. package/skill/workflows/Registry.md +19 -13
  92. package/skill/workflows/Run.md +72 -0
  93. package/skill/workflows/Schedule.md +2 -2
  94. package/skill/workflows/SearchRun.md +89 -0
  95. package/skill/workflows/SignalsDashboard.md +2 -2
  96. package/skill/workflows/UnitTest.md +13 -4
  97. package/skill/workflows/Verify.md +136 -0
  98. package/skill/workflows/Watch.md +114 -47
  99. package/skill/workflows/Workflows.md +13 -8
  100. package/apps/local-dashboard/dist/assets/index-BcXquWFB.css +0 -1
  101. package/apps/local-dashboard/dist/assets/index-Coq42hE4.js +0 -15
  102. package/apps/local-dashboard/dist/assets/vendor-ui-B0H8s1mP.js +0 -1
package/cli/selftune/eval/synthetic-evals.ts +145 -17
@@ -8,9 +8,10 @@
8
8
 
9
9
  import { readFileSync } from "node:fs";
10
10
 
11
- import type { EvalEntry, InvocationType, SkillUsageRecord } from "../types.js";
11
+ import type { EvalEntry, InvocationType, QueryLogRecord, SkillUsageRecord } from "../types.js";
12
12
  import { callLlm, stripMarkdownFences } from "../utils/llm-call.js";
13
13
  import type { LlmCallObserver } from "../utils/llm-call.js";
14
+ import { extractActionableQueryText, extractPositiveEvalQueryText } from "../utils/query-filter.js";
14
15
  import { findInstalledSkillNames } from "../utils/skill-discovery.js";
15
16
  import { classifyInvocation } from "./invocation-classifier.js";
16
17
 
@@ -50,6 +51,25 @@ interface PromptFamilyTargets {
50
51
  unrelatedNegativeCount: number;
51
52
  }
52
53
 
54
+ const MAX_REAL_EXAMPLE_LENGTH = 220;
55
+ const MAX_SYNTHETIC_SKILL_CONTENT_CHARS = 6000;
56
+ const MAX_SYNTHETIC_SECTION_CHARS = 1200;
57
+ const MAX_SYNTHETIC_PREAMBLE_CHARS = 800;
58
+ const PRIORITY_SYNTHETIC_SECTION_PATTERNS = [
59
+ /when this skill activates/i,
60
+ /when to invoke/i,
61
+ /when to use/i,
62
+ /\buse when\b/i,
63
+ /workflow routing/i,
64
+ /\busage\b/i,
65
+ /\bexamples?\b/i,
66
+ /\bformat\b/i,
67
+ /publish workflow/i,
68
+ /input/i,
69
+ /output/i,
70
+ /activation/i,
71
+ ] as const;
72
+
53
73
  function getSyntheticSkillSearchDirs(): string[] {
54
74
  const cwd = process.cwd();
55
75
  const homeDir = process.env.HOME ?? "";
@@ -122,6 +142,115 @@ function normalizeEvalQuery(query: string): string {
122
142
  return query.trim().toLowerCase().replace(/\s+/g, " ");
123
143
  }
124
144
 
145
+ function truncatePromptExample(query: string): string {
146
+ const trimmed = query.trim();
147
+ if (trimmed.length <= MAX_REAL_EXAMPLE_LENGTH) return trimmed;
148
+ return `${trimmed.slice(0, MAX_REAL_EXAMPLE_LENGTH - 1).trimEnd()}…`;
149
+ }
150
+
151
+ function truncateSyntheticSection(text: string, limit: number): string {
152
+ const trimmed = text.trim();
153
+ if (trimmed.length <= limit) return trimmed;
154
+ return `${trimmed.slice(0, limit - 1).trimEnd()}…`;
155
+ }
156
+
157
+ export function summarizeSkillContentForSyntheticPrompt(skillContent: string): string {
158
+ const trimmed = skillContent.trim();
159
+ if (trimmed.length <= MAX_SYNTHETIC_SKILL_CONTENT_CHARS) return trimmed;
160
+
161
+ const frontmatterMatch = trimmed.match(/^---\n[\s\S]*?\n---\n*/);
162
+ const frontmatter = frontmatterMatch?.[0]?.trim() ?? "";
163
+ const body = frontmatterMatch ? trimmed.slice(frontmatterMatch[0].length).trim() : trimmed;
164
+ const sectionRegex = /^#{1,6}\s+.+$/gm;
165
+ const headingMatches = [...body.matchAll(sectionRegex)];
166
+
167
+ if (headingMatches.length === 0) {
168
+ return truncateSyntheticSection(trimmed, MAX_SYNTHETIC_SKILL_CONTENT_CHARS);
169
+ }
170
+
171
+ const summaryParts: string[] = [];
172
+ let usedLength = 0;
173
+ const appendPart = (part: string): boolean => {
174
+ const normalized = part.trim();
175
+ if (!normalized) return false;
176
+ const nextLength = usedLength + normalized.length + (summaryParts.length > 0 ? 2 : 0);
177
+ if (nextLength > MAX_SYNTHETIC_SKILL_CONTENT_CHARS) return false;
178
+ summaryParts.push(normalized);
179
+ usedLength = nextLength;
180
+ return true;
181
+ };
182
+
183
+ if (frontmatter) {
184
+ appendPart(frontmatter);
185
+ }
186
+
187
+ const preamble = body.slice(0, headingMatches[0]?.index ?? 0).trim();
188
+ if (preamble) {
189
+ appendPart(truncateSyntheticSection(preamble, MAX_SYNTHETIC_PREAMBLE_CHARS));
190
+ }
191
+
192
+ const sections = headingMatches.map((match, index) => {
193
+ const start = match.index ?? 0;
194
+ const end = headingMatches[index + 1]?.index ?? body.length;
195
+ const content = body.slice(start, end).trim();
196
+ const heading = match[0].replace(/^#{1,6}\s+/, "").trim();
197
+ return { heading, content, index };
198
+ });
199
+
200
+ const selectedIndices = new Set<number>();
201
+ if (sections.length > 0) selectedIndices.add(0);
202
+ for (const section of sections) {
203
+ if (PRIORITY_SYNTHETIC_SECTION_PATTERNS.some((pattern) => pattern.test(section.heading))) {
204
+ selectedIndices.add(section.index);
205
+ }
206
+ }
207
+
208
+ for (const section of sections) {
209
+ if (!selectedIndices.has(section.index)) continue;
210
+ appendPart(truncateSyntheticSection(section.content, MAX_SYNTHETIC_SECTION_CHARS));
211
+ }
212
+
213
+ appendPart("[skill content summarized for synthetic eval generation]");
214
+ return summaryParts.join("\n\n");
215
+ }
216
+
217
+ export function buildSyntheticPromptRealExamples(
218
+ positiveCandidates: string[],
219
+ negativeCandidates: string[],
220
+ skillName: string,
221
+ limit = 5,
222
+ ): SyntheticPromptRealExamples | undefined {
223
+ const cleanedPositives: string[] = [];
224
+ const seenPositives = new Set<string>();
225
+ for (const candidate of positiveCandidates) {
226
+ const cleaned = extractPositiveEvalQueryText(candidate, skillName);
227
+ if (!cleaned) continue;
228
+ const normalized = normalizeEvalQuery(cleaned);
229
+ if (seenPositives.has(normalized)) continue;
230
+ seenPositives.add(normalized);
231
+ cleanedPositives.push(truncatePromptExample(cleaned));
232
+ if (cleanedPositives.length >= limit) break;
233
+ }
234
+
235
+ if (cleanedPositives.length === 0) return undefined;
236
+
237
+ const positiveSet = new Set(cleanedPositives.map((query) => normalizeEvalQuery(query)));
238
+ const cleanedNegatives: string[] = [];
239
+ const seenNegatives = new Set<string>();
240
+ for (const candidate of negativeCandidates) {
241
+ const cleaned = extractActionableQueryText(candidate);
242
+ if (!cleaned) continue;
243
+ const truncated = truncatePromptExample(cleaned);
244
+ const normalized = normalizeEvalQuery(truncated);
245
+ if (positiveSet.has(normalized) || seenNegatives.has(normalized)) continue;
246
+ seenNegatives.add(normalized);
247
+ cleanedNegatives.push(truncated);
248
+ if (cleanedNegatives.length >= limit) break;
249
+ }
250
+
251
+ return { positive: cleanedPositives, negative: cleanedNegatives };
252
+ }
253
+
125
254
  function dedupeEvalEntries(entries: EvalEntry[]): EvalEntry[] {
126
255
  const seen = new Set<string>();
127
256
  const deduped: EvalEntry[] = [];
@@ -223,6 +352,7 @@ export function buildSyntheticPrompt(
223
352
  realExamples?: SyntheticPromptRealExamples,
224
353
  siblingSkills: string[] = [],
225
354
  ): { system: string; user: string } {
355
+ const summarizedSkillContent = summarizeSkillContentForSyntheticPrompt(skillContent);
226
356
  const {
227
357
  explicitCount,
228
358
  implicitCount,
@@ -259,7 +389,7 @@ Output as JSON array with no surrounding text:
259
389
  let user = `Skill name: ${skillName}
260
390
 
261
391
  Skill content:
262
- ${skillContent}
392
+ ${summarizedSkillContent}
263
393
 
264
394
  Generate exactly ${maxPositives} positive queries (should_trigger: true) and ${maxNegatives} negative queries (should_trigger: false).
265
395
 
@@ -308,6 +438,7 @@ export function buildSyntheticRefinementPrompt(
308
438
  maxNegatives: number,
309
439
  siblingSkills: string[] = [],
310
440
  ): { system: string; user: string } {
441
+ const summarizedSkillContent = summarizeSkillContentForSyntheticPrompt(skillContent);
311
442
  const targets = buildPromptFamilyTargets(maxPositives, maxNegatives, siblingSkills.length > 0);
312
443
  const system = `You are refining a cold-start eval benchmark for a coding agent skill.
313
444
 
@@ -325,7 +456,7 @@ Return ONLY a JSON array with the final benchmark.`;
325
456
  const user = `Skill name: ${skillName}
326
457
 
327
458
  Skill content:
328
- ${skillContent}
459
+ ${summarizedSkillContent}
329
460
 
330
461
  Target final benchmark:
331
462
  - ${maxPositives} positives
@@ -459,25 +590,22 @@ export async function generateSyntheticEvals(
459
590
 
460
591
  // Positives: high-confidence triggered records for this skill
461
592
  const skillRecords = querySkillUsageRecords(db) as SkillUsageRecord[];
462
- const positive = skillRecords
593
+ const positiveCandidates = skillRecords
463
594
  .filter((r) => isHighConfidencePositiveSkillRecord(r, skillName))
464
595
  .map((r) => r.query)
465
- .filter((q): q is string => typeof q === "string" && q.length > 0)
466
- .slice(0, 5);
596
+ .filter((q): q is string => typeof q === "string" && q.length > 0);
467
597
 
468
- // Negatives: from all_queries, excluding known positives
469
- const posSet = new Set(positive.map((q: string) => q.toLowerCase()));
470
- const allQueries = queryQueryLog(db);
471
- const negative = allQueries
598
+ // Negatives: from all_queries, excluding cleaned positives later.
599
+ const allQueries = queryQueryLog(db) as QueryLogRecord[];
600
+ const negativeCandidates = allQueries
472
601
  .map((r) => r.query)
473
- .filter(
474
- (q): q is string => typeof q === "string" && q.length > 0 && !posSet.has(q.toLowerCase()),
475
- )
476
- .slice(0, 5);
602
+ .filter((q): q is string => typeof q === "string" && q.length > 0);
477
603
 
478
- if (positive.length > 0) {
479
- realExamples = { positive, negative };
480
- }
604
+ realExamples = buildSyntheticPromptRealExamples(
605
+ positiveCandidates,
606
+ negativeCandidates,
607
+ skillName,
608
+ );
481
609
  } catch {
482
610
  // fail-open: synthetic gen works without real examples
483
611
  }