selftune 0.2.30 → 0.2.32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. package/README.md +83 -56
  2. package/apps/local-dashboard/dist/assets/index-B-ut4w0B.js +15 -0
  3. package/apps/local-dashboard/dist/assets/index-BFGfCVrL.css +1 -0
  4. package/apps/local-dashboard/dist/assets/vendor-ui-DfowE3Hu.js +1 -0
  5. package/apps/local-dashboard/dist/index.html +3 -3
  6. package/cli/selftune/command-surface.ts +613 -2
  7. package/cli/selftune/create/baseline.ts +429 -0
  8. package/cli/selftune/create/check.ts +35 -0
  9. package/cli/selftune/create/init.ts +115 -0
  10. package/cli/selftune/create/package-candidate-state.ts +771 -0
  11. package/cli/selftune/create/package-evaluator.ts +710 -0
  12. package/cli/selftune/create/package-fingerprint.ts +142 -0
  13. package/cli/selftune/create/package-search.ts +377 -0
  14. package/cli/selftune/create/publish.ts +431 -0
  15. package/cli/selftune/create/readiness.ts +495 -0
  16. package/cli/selftune/create/replay.ts +330 -0
  17. package/cli/selftune/create/report.ts +74 -0
  18. package/cli/selftune/create/scaffold.ts +121 -0
  19. package/cli/selftune/create/skills-ref-adapter.ts +177 -0
  20. package/cli/selftune/create/status.ts +33 -0
  21. package/cli/selftune/create/templates.ts +249 -0
  22. package/cli/selftune/cron/setup.ts +1 -1
  23. package/cli/selftune/dashboard-action-events.ts +4 -1
  24. package/cli/selftune/dashboard-action-result.ts +789 -24
  25. package/cli/selftune/dashboard-action-stream.ts +80 -0
  26. package/cli/selftune/dashboard-contract.ts +146 -3
  27. package/cli/selftune/dashboard-server.ts +5 -4
  28. package/cli/selftune/eval/hooks-to-evals.ts +58 -35
  29. package/cli/selftune/eval/synthetic-evals.ts +145 -17
  30. package/cli/selftune/evolution/bounded-mutations.ts +1045 -0
  31. package/cli/selftune/evolution/evolve-body.ts +9 -36
  32. package/cli/selftune/evolution/evolve.ts +8 -72
  33. package/cli/selftune/evolution/stopping-criteria.ts +5 -13
  34. package/cli/selftune/evolution/unblock-suggestions.ts +0 -16
  35. package/cli/selftune/evolution/validate-host-replay.ts +115 -15
  36. package/cli/selftune/improve.ts +206 -0
  37. package/cli/selftune/index.ts +123 -6
  38. package/cli/selftune/init.ts +1 -1
  39. package/cli/selftune/localdb/queries/dashboard.ts +30 -0
  40. package/cli/selftune/localdb/schema.ts +52 -0
  41. package/cli/selftune/monitoring/watch.ts +257 -23
  42. package/cli/selftune/orchestrate/execute.ts +300 -1
  43. package/cli/selftune/orchestrate/finalize.ts +14 -0
  44. package/cli/selftune/orchestrate/plan.ts +22 -5
  45. package/cli/selftune/orchestrate/prepare.ts +59 -4
  46. package/cli/selftune/orchestrate/report.ts +1 -1
  47. package/cli/selftune/orchestrate.ts +34 -1
  48. package/cli/selftune/publish.ts +35 -0
  49. package/cli/selftune/registry/github-install.ts +256 -0
  50. package/cli/selftune/registry/index.ts +1 -1
  51. package/cli/selftune/registry/install.ts +58 -7
  52. package/cli/selftune/routes/actions.ts +81 -15
  53. package/cli/selftune/routes/overview.ts +1 -1
  54. package/cli/selftune/routes/skill-report.ts +147 -2
  55. package/cli/selftune/run.ts +18 -0
  56. package/cli/selftune/schedule.ts +3 -3
  57. package/cli/selftune/search-run.ts +703 -0
  58. package/cli/selftune/status.ts +35 -11
  59. package/cli/selftune/testing-readiness.ts +431 -40
  60. package/cli/selftune/types.ts +316 -0
  61. package/cli/selftune/utils/eval-readiness.ts +1 -0
  62. package/cli/selftune/utils/json-output.ts +11 -0
  63. package/cli/selftune/utils/lifecycle-surface.ts +48 -0
  64. package/cli/selftune/utils/query-filter.ts +82 -1
  65. package/cli/selftune/utils/tui.ts +85 -2
  66. package/cli/selftune/verify.ts +205 -0
  67. package/cli/selftune/workflows/proposals.ts +1 -1
  68. package/cli/selftune/workflows/skill-scaffold.ts +141 -63
  69. package/cli/selftune/workflows/workflows.ts +4 -4
  70. package/package.json +1 -1
  71. package/packages/dashboard-core/src/routes/manifest.ts +2 -2
  72. package/packages/ui/src/components/SkillReportPanels.tsx +7 -7
  73. package/packages/ui/src/primitives/button.tsx +5 -0
  74. package/skill/SKILL.md +148 -85
  75. package/skill/references/cli-quick-reference.md +16 -1
  76. package/skill/references/creator-playbook.md +31 -10
  77. package/skill/workflows/Baseline.md +8 -9
  78. package/skill/workflows/Contributions.md +4 -4
  79. package/skill/workflows/Create.md +173 -0
  80. package/skill/workflows/CreateTestDeploy.md +34 -30
  81. package/skill/workflows/Cron.md +2 -2
  82. package/skill/workflows/Dashboard.md +3 -3
  83. package/skill/workflows/Evals.md +13 -7
  84. package/skill/workflows/Evolve.md +75 -32
  85. package/skill/workflows/EvolveBody.md +22 -15
  86. package/skill/workflows/Hook.md +1 -1
  87. package/skill/workflows/Improve.md +168 -0
  88. package/skill/workflows/Initialize.md +3 -3
  89. package/skill/workflows/Orchestrate.md +49 -12
  90. package/skill/workflows/Publish.md +100 -0
  91. package/skill/workflows/Registry.md +19 -13
  92. package/skill/workflows/Run.md +72 -0
  93. package/skill/workflows/Schedule.md +2 -2
  94. package/skill/workflows/SearchRun.md +89 -0
  95. package/skill/workflows/SignalsDashboard.md +2 -2
  96. package/skill/workflows/UnitTest.md +13 -4
  97. package/skill/workflows/Verify.md +136 -0
  98. package/skill/workflows/Watch.md +114 -47
  99. package/skill/workflows/Workflows.md +13 -8
  100. package/apps/local-dashboard/dist/assets/index-BcXquWFB.css +0 -1
  101. package/apps/local-dashboard/dist/assets/index-Coq42hE4.js +0 -15
  102. package/apps/local-dashboard/dist/assets/vendor-ui-B0H8s1mP.js +0 -1
@@ -4,17 +4,25 @@ import { existsSync, mkdirSync, readFileSync, readdirSync, statSync, writeFileSy
4
4
  import { dirname, join } from "node:path";
5
5
 
6
6
  import { SELFTUNE_CONFIG_DIR } from "./constants.js";
7
+ import type { CreatePackageEvaluationResult } from "./create/package-evaluator.js";
7
8
  import type {
9
+ CreatePackageEvaluationSummary,
10
+ CreatePackageEvaluationStatus,
11
+ CreatorOverviewStep,
8
12
  CreatorLoopNextStep,
9
13
  CreatorTestingOverview,
10
14
  DeploymentReadiness,
11
15
  SkillEvalReadiness,
16
+ SkillSummary,
12
17
  SkillTestingReadiness,
13
18
  } from "./dashboard-contract.js";
14
19
  import { getDb } from "./localdb/db.js";
15
20
  import type { EvalEntry, SkillUnitTest, UnitTestSuiteResult } from "./types.js";
21
+ import { computeCreatePackageFingerprint } from "./create/package-fingerprint.js";
16
22
  import { queryEvolutionEvidence } from "./localdb/queries/evolution.js";
17
23
  import { queryTrustedSkillObservationRows } from "./localdb/queries/trust.js";
24
+ import { MIN_LOG_READY_POSITIVES } from "./utils/eval-readiness.js";
25
+ import { extractPositiveEvalQueryText } from "./utils/query-filter.js";
18
26
  import {
19
27
  findInstalledSkillNames,
20
28
  findInstalledSkillPath,
@@ -25,6 +33,7 @@ import {
25
33
  interface TrustedSkillObservationSummary {
26
34
  session_id: string;
27
35
  triggered: number;
36
+ query_text: string;
28
37
  }
29
38
 
30
39
  interface TestingReadinessContext {
@@ -39,6 +48,10 @@ interface TestingReadinessContext {
39
48
  string,
40
49
  { sample_size: number; pass_rate: number | null; measured_at: string | null }
41
50
  >;
51
+ packageEvaluationBySkill: Map<
52
+ string,
53
+ { summary: CreatePackageEvaluationSummary; storedAt: string | null }
54
+ >;
42
55
  latestEvolutionBySkill: Map<string, { action: string | null; timestamp: string | null }>;
43
56
  }
44
57
 
@@ -54,6 +67,10 @@ function getUnitTestDir(): string {
54
67
  return join(getConfigDir(), "unit-tests");
55
68
  }
56
69
 
70
/**
 * Resolve the `package-evaluations` directory located inside whatever
 * directory `getConfigDir()` reports.
 *
 * @returns Path of the package evaluation directory.
 */
function getPackageEvaluationDir(): string {
  const configDir = getConfigDir();
  return join(configDir, "package-evaluations");
}
73
+
57
74
  export function getCanonicalEvalSetPath(skillName: string): string {
58
75
  return join(getEvalSetDir(), `${skillName}.json`);
59
76
  }
@@ -66,6 +83,14 @@ export function getUnitTestResultPath(skillName: string): string {
66
83
  return join(getUnitTestDir(), `${skillName}.last-run.json`);
67
84
  }
68
85
 
86
/**
 * Build the path of the stored package evaluation summary for a skill:
 * `<package evaluation dir>/<skillName>.json`.
 *
 * @param skillName - Skill whose summary file is being located.
 * @returns Path of the summary JSON file.
 */
export function getCanonicalPackageEvaluationPath(skillName: string): string {
  const fileName = `${skillName}.json`;
  return join(getPackageEvaluationDir(), fileName);
}
89
+
90
/**
 * Build the path of the full package evaluation artifact for a skill:
 * `<package evaluation dir>/<skillName>.artifact.json`.
 *
 * @param skillName - Skill whose artifact file is being located.
 * @returns Path of the artifact JSON file.
 */
export function getCanonicalPackageEvaluationArtifactPath(skillName: string): string {
  const fileName = `${skillName}.artifact.json`;
  return join(getPackageEvaluationDir(), fileName);
}
93
+
69
94
  function getOptionalDb(): Database | null {
70
95
  try {
71
96
  return getDb();
@@ -84,6 +109,16 @@ function parseJsonArray(value: string | null | undefined): unknown[] {
84
109
  }
85
110
  }
86
111
 
112
/**
 * Parse JSON text and return the result only when it decodes to a JSON object.
 *
 * @param value - Raw JSON text. `null`, `undefined`, and the empty string
 *   yield `null` without attempting to parse.
 * @returns The decoded object, or `null` when the text is missing, is not
 *   valid JSON, or decodes to anything other than an object (a primitive,
 *   `null`, or an array).
 */
function parseJsonObject(value: string | null | undefined): Record<string, unknown> | null {
  if (!value) return null;
  try {
    const parsed: unknown = JSON.parse(value);
    // `typeof [] === "object"`, so arrays have to be rejected explicitly;
    // otherwise a JSON array would be handed back typed as a keyed record.
    if (parsed === null || typeof parsed !== "object" || Array.isArray(parsed)) {
      return null;
    }
    return parsed as Record<string, unknown>;
  } catch {
    // Malformed JSON is treated the same as "no object available".
    return null;
  }
}
121
+
87
122
  function upsertCanonicalEvalSet(db: Database, skillName: string, evalSet: EvalEntry[]): void {
88
123
  db.run(
89
124
  `INSERT INTO canonical_eval_sets (skill_name, stored_at, eval_set_json)
@@ -134,6 +169,21 @@ function upsertUnitTestRunResult(
134
169
  );
135
170
  }
136
171
 
172
/**
 * Insert or replace the `package_evaluation_reports` row for a skill.
 *
 * The row is keyed by `skill_name`; on conflict the existing row's
 * `stored_at` and `summary_json` are overwritten. `stored_at` is the current
 * time as an ISO-8601 string and `summary_json` is the JSON-serialized summary.
 *
 * @param db - Database handle used to run the statement.
 * @param skillName - Skill the summary belongs to.
 * @param summary - Evaluation summary to store.
 */
function upsertPackageEvaluationReport(
  db: Database,
  skillName: string,
  summary: CreatePackageEvaluationSummary,
): void {
  const storedAt = new Date().toISOString();
  const summaryJson = JSON.stringify(summary);
  const statement = `INSERT INTO package_evaluation_reports (skill_name, stored_at, summary_json)
     VALUES (?, ?, ?)
     ON CONFLICT(skill_name) DO UPDATE SET
       stored_at = excluded.stored_at,
       summary_json = excluded.summary_json`;
  db.run(statement, [skillName, storedAt, summaryJson]);
}
186
+
137
187
  function readCanonicalEvalSetFromDb(
138
188
  db: Database,
139
189
  skillName: string,
@@ -205,6 +255,35 @@ function readUnitTestRunResultFromDb(db: Database, skillName: string): UnitTestS
205
255
  }
206
256
  }
207
257
 
258
/**
 * Load the stored package evaluation summary for a skill from
 * `package_evaluation_reports`.
 *
 * @param db - Database handle to query.
 * @param skillName - Skill whose report row is looked up.
 * @returns The parsed summary plus its `stored_at` value, or `null` when no
 *   row exists, the row has no `summary_json`, the JSON does not parse to an
 *   object, or the object lacks a string `skill_name`, a string `status`, or
 *   a boolean `evaluation_passed`.
 */
function readPackageEvaluationFromDb(
  db: Database,
  skillName: string,
): { summary: CreatePackageEvaluationSummary; storedAt: string | null } | null {
  const statement = db.query(
    `SELECT summary_json, stored_at
     FROM package_evaluation_reports
     WHERE skill_name = ?`,
  );
  const record = statement.get(skillName) as { summary_json: string; stored_at: string } | null;
  if (!record?.summary_json) return null;

  const candidate = parseJsonObject(record.summary_json);
  if (candidate == null) return null;

  // Only the three fields checked here are validated; the rest of the summary
  // shape is trusted as stored.
  const hasRequiredFields =
    typeof candidate["skill_name"] === "string" &&
    typeof candidate["status"] === "string" &&
    typeof candidate["evaluation_passed"] === "boolean";
  if (!hasRequiredFields) return null;

  return {
    summary: candidate as unknown as CreatePackageEvaluationSummary,
    storedAt: record.stored_at ?? null,
  };
}
286
+
208
287
  function listStoredSkillNames(db: Database, tableName: string): Set<string> {
209
288
  const rows = db.query(`SELECT skill_name FROM ${tableName}`).all() as Array<{
210
289
  skill_name: string;
@@ -254,6 +333,75 @@ export function writeUnitTestRunResult(skillName: string, suite: UnitTestSuiteRe
254
333
  return path;
255
334
  }
256
335
 
336
/**
 * Persist a package evaluation summary for a skill.
 *
 * When `getOptionalDb()` yields a database, the summary is upserted there
 * first. The summary is then always written as pretty-printed JSON (2-space
 * indent, UTF-8) to the skill's canonical summary file, creating the
 * package evaluation directory if needed.
 *
 * @param skillName - Skill the summary belongs to.
 * @param summary - Evaluation summary to store.
 * @returns Path of the JSON file that was written.
 */
export function writeCanonicalPackageEvaluation(
  skillName: string,
  summary: CreatePackageEvaluationSummary,
): string {
  const targetPath = getCanonicalPackageEvaluationPath(skillName);

  const database = getOptionalDb();
  if (database != null) {
    upsertPackageEvaluationReport(database, skillName, summary);
  }

  const serialized = JSON.stringify(summary, null, 2);
  mkdirSync(getPackageEvaluationDir(), { recursive: true });
  writeFileSync(targetPath, serialized, "utf-8");
  return targetPath;
}
349
+
350
/**
 * Write the full package evaluation result for a skill to its canonical
 * artifact file as pretty-printed JSON (2-space indent, UTF-8), creating the
 * package evaluation directory if needed.
 *
 * @param skillName - Skill the artifact belongs to.
 * @param result - Full evaluation result to serialize.
 * @returns Path of the artifact file that was written.
 */
export function writeCanonicalPackageEvaluationArtifact(
  skillName: string,
  result: CreatePackageEvaluationResult,
): string {
  const artifactPath = getCanonicalPackageEvaluationArtifactPath(skillName);
  const serialized = JSON.stringify(result, null, 2);
  mkdirSync(getPackageEvaluationDir(), { recursive: true });
  writeFileSync(artifactPath, serialized, "utf-8");
  return artifactPath;
}
359
+
360
/**
 * Read the most recent unit-test run result for a skill.
 *
 * The database copy is preferred when a database is supplied and holds a
 * result; otherwise the skill's unit-test result file is read.
 *
 * @param skillName - Skill whose last run is wanted.
 * @param db - Database to consult first; defaults to `getOptionalDb()`.
 *   Pass `null` to go straight to the file.
 * @returns The stored suite result, or `null` when neither source has one.
 */
export function readCanonicalUnitTestRunResult(
  skillName: string,
  db: Database | null = getOptionalDb(),
): UnitTestSuiteResult | null {
  if (db) {
    const fromDb = readUnitTestRunResultFromDb(db, skillName);
    if (fromDb) return fromDb;
  }
  const resultPath = getUnitTestResultPath(skillName);
  return readUnitTestResult(resultPath);
}
368
+
369
/**
 * Read and minimally validate the stored package evaluation artifact for a
 * skill.
 *
 * @param skillName - Skill whose artifact file is read.
 * @returns The parsed artifact, or `null` when the file is missing, cannot be
 *   read or parsed, or fails validation. Validation requires `summary`,
 *   `replay`, and `baseline` to be non-null objects, `summary.skill_name`,
 *   `summary.status`, `replay.skill`, and `baseline.skill_name` to be
 *   strings, and `summary.evaluation_passed` to be a boolean.
 */
export function readCanonicalPackageEvaluationArtifact(
  skillName: string,
): CreatePackageEvaluationResult | null {
  const isPresentObject = <T>(candidate: T): candidate is NonNullable<T> =>
    typeof candidate === "object" && candidate != null;

  try {
    const artifactPath = getCanonicalPackageEvaluationArtifactPath(skillName);
    if (!existsSync(artifactPath)) return null;

    const rawText = readFileSync(artifactPath, "utf-8");
    const artifact = JSON.parse(rawText) as Partial<CreatePackageEvaluationResult>;
    if (!isPresentObject(artifact)) return null;

    const { summary, replay, baseline } = artifact;
    if (!isPresentObject(summary) || !isPresentObject(replay) || !isPresentObject(baseline)) {
      return null;
    }

    // Only these identifying fields are checked; everything else in the
    // artifact is accepted as stored.
    const identifiersAreTyped =
      typeof summary.skill_name === "string" &&
      typeof summary.status === "string" &&
      typeof summary.evaluation_passed === "boolean" &&
      typeof replay.skill === "string" &&
      typeof baseline.skill_name === "string";

    return identifiersAreTyped ? (artifact as CreatePackageEvaluationResult) : null;
  } catch {
    // Any read or parse failure is reported as "no usable artifact".
    return null;
  }
}
404
+
257
405
  function readJsonArrayFile(path: string): unknown[] {
258
406
  try {
259
407
  if (!existsSync(path)) return [];
@@ -319,7 +467,7 @@ function deriveEvalReadiness(
319
467
  skillPath: string | null,
320
468
  trustedTriggerCount: number,
321
469
  ): SkillEvalReadiness {
322
- if (trustedTriggerCount > 0) return "log_ready";
470
+ if (trustedTriggerCount >= MIN_LOG_READY_POSITIVES) return "log_ready";
323
471
  if (skillPath) return "cold_start_ready";
324
472
  return "telemetry_only";
325
473
  }
@@ -328,12 +476,18 @@ function formatSkillPathArg(skillPath: string | null, skillName: string): string
328
476
  return skillPath ?? `/path/to/skills/${skillName}/SKILL.md`;
329
477
  }
330
478
 
479
/**
 * Report whether a skill path belongs to a draft package, i.e. whether a
 * `selftune.create.json` file exists in the same directory as the skill file.
 *
 * @param skillPath - Path of the skill file; `null` or empty yields `false`.
 * @returns `true` when the sibling `selftune.create.json` exists.
 */
function isDraftSkillPath(skillPath: string | null): boolean {
  const manifestPath = skillPath ? join(dirname(skillPath), "selftune.create.json") : null;
  return manifestPath !== null && existsSync(manifestPath);
}
483
+
331
484
  function recommendCommand(
332
485
  skillName: string,
333
486
  skillPath: string | null,
334
487
  nextStep: CreatorLoopNextStep,
335
488
  ): string {
336
489
  const pathArg = formatSkillPathArg(skillPath, skillName);
490
+ const draftPackage = isDraftSkillPath(skillPath);
337
491
  switch (nextStep) {
338
492
  case "generate_evals":
339
493
  return skillPath
@@ -342,25 +496,41 @@ function recommendCommand(
342
496
  case "run_unit_tests":
343
497
  return `selftune eval unit-test --skill ${skillName} --generate --skill-path ${pathArg}`;
344
498
  case "run_replay_dry_run":
345
- return `selftune evolve --skill ${skillName} --skill-path ${pathArg} --dry-run --validation-mode replay`;
499
+ return draftPackage
500
+ ? `selftune create replay --skill-path ${pathArg} --mode package`
501
+ : `selftune evolve --skill ${skillName} --skill-path ${pathArg} --dry-run --validation-mode replay`;
346
502
  case "measure_baseline":
347
- return `selftune grade baseline --skill ${skillName} --skill-path ${pathArg}`;
503
+ return draftPackage
504
+ ? `selftune create baseline --skill-path ${pathArg} --mode package`
505
+ : `selftune grade baseline --skill ${skillName} --skill-path ${pathArg}`;
348
506
  case "deploy_candidate":
349
- return `selftune evolve --skill ${skillName} --skill-path ${pathArg} --with-baseline`;
507
+ return draftPackage
508
+ ? `selftune create publish --skill-path ${pathArg}`
509
+ : `selftune evolve --skill ${skillName} --skill-path ${pathArg} --with-baseline`;
350
510
  case "watch_deployment":
351
- return `selftune watch --skill ${skillName}`;
511
+ return draftPackage
512
+ ? `selftune watch --skill ${skillName} --skill-path ${pathArg}`
513
+ : `selftune watch --skill ${skillName}`;
352
514
  }
353
515
  }
354
516
 
355
517
  function summarizeReadiness(
356
518
  nextStep: CreatorLoopNextStep,
519
+ draftPackage: boolean,
357
520
  evalReadiness: SkillEvalReadiness,
358
521
  evalSetEntries: number,
359
522
  unitTestCases: number,
360
523
  replayCheckCount: number,
361
524
  baselineSampleSize: number,
362
525
  unitTestPassRate: number | null,
526
+ packageEvaluationStatus: CreatePackageEvaluationStatus | null,
527
+ latestPackageEvaluationAt: string | null,
363
528
  ): string {
529
+ const latestPackageEvaluationText =
530
+ latestPackageEvaluationAt && packageEvaluationStatus
531
+ ? ` Latest measured package evaluation: ${packageEvaluationStatus} at ${latestPackageEvaluationAt}.`
532
+ : "";
533
+
364
534
  switch (nextStep) {
365
535
  case "generate_evals":
366
536
  if (evalReadiness === "log_ready") {
@@ -371,20 +541,36 @@ function summarizeReadiness(
371
541
  }
372
542
  return "Telemetry exists, but selftune cannot resolve a local SKILL.md yet. Point it at the skill and generate evals.";
373
543
  case "run_unit_tests":
374
- return `Eval coverage is present (${evalSetEntries} entries), but no unit tests are stored yet.`;
544
+ return unitTestCases > 0 && unitTestPassRate != null && unitTestPassRate < 1
545
+ ? `Deterministic unit tests exist (${unitTestCases} cases), but the latest run only passed ${Math.round(unitTestPassRate * 100)}%. Fix the failing tests and rerun them before moving on.`
546
+ : `Eval coverage is present (${evalSetEntries} entries), but no unit tests are stored yet.`;
375
547
  case "run_replay_dry_run": {
376
548
  const passRateText =
377
549
  unitTestPassRate != null
378
550
  ? ` Last unit-test run passed ${Math.round(unitTestPassRate * 100)}%.`
379
551
  : "";
380
- return `Unit tests are present (${unitTestCases} cases), but replay-backed dry-run validation has not been recorded yet.${passRateText}`;
552
+ if (draftPackage && packageEvaluationStatus === "replay_failed") {
553
+ return `A measured package evaluation already failed replay, so the draft is not publishable yet. Re-run package replay before publishing.${latestPackageEvaluationText}`;
554
+ }
555
+ return draftPackage
556
+ ? `Unit tests are present (${unitTestCases} cases), but package replay validation has not been recorded yet.${passRateText}`
557
+ : `Unit tests are present (${unitTestCases} cases), but replay-backed dry-run validation has not been recorded yet.${passRateText}`;
381
558
  }
382
559
  case "measure_baseline":
383
- return `Replay-backed validation exists (${replayCheckCount} recorded checks), but no stored no-skill baseline exists yet.`;
560
+ if (draftPackage && packageEvaluationStatus === "baseline_failed") {
561
+ return `A measured package evaluation already failed the package baseline gate, so the draft is not publishable yet. Re-run the package baseline after improving the draft.${latestPackageEvaluationText}`;
562
+ }
563
+ return draftPackage
564
+ ? `Package replay validation exists (${replayCheckCount} recorded checks), but no measured package baseline exists yet.`
565
+ : `Replay-backed validation exists (${replayCheckCount} recorded checks), but no stored no-skill baseline exists yet.`;
384
566
  case "deploy_candidate":
385
- return `Evals, unit tests, replay validation, and a baseline are all present. Ready to run a live evolve and deploy a watched candidate.${baselineSampleSize > 0 ? ` Latest baseline used ${baselineSampleSize} samples.` : ""}`;
567
+ return draftPackage
568
+ ? `Evals, unit tests, package replay, and a package baseline are all present. Ready to run create publish and hand the draft into watch.${baselineSampleSize > 0 ? ` Latest baseline used ${baselineSampleSize} samples.` : ""}`
569
+ : `Evals, unit tests, replay validation, and a baseline are all present. Ready to run a live evolve and deploy a watched candidate.${baselineSampleSize > 0 ? ` Latest baseline used ${baselineSampleSize} samples.` : ""}`;
386
570
  case "watch_deployment":
387
- return `A candidate has already been deployed for this skill. Keep watching live traffic and baseline lift before making another mutation.${baselineSampleSize > 0 ? ` Latest baseline used ${baselineSampleSize} samples.` : ""}`;
571
+ return draftPackage
572
+ ? `This draft package has already been published. Keep watching live traffic and measured package lift before making another mutation.${baselineSampleSize > 0 ? ` Latest baseline used ${baselineSampleSize} samples.` : ""}`
573
+ : `A candidate has already been deployed for this skill. Keep watching live traffic and baseline lift before making another mutation.${baselineSampleSize > 0 ? ` Latest baseline used ${baselineSampleSize} samples.` : ""}`;
388
574
  }
389
575
  }
390
576
 
@@ -427,6 +613,7 @@ function summarizeDeploymentReadiness(
427
613
  skillPath: string | null,
428
614
  ): { summary: string; command: string | null } {
429
615
  const pathArg = formatSkillPathArg(skillPath, skillName);
616
+ const draftPackage = isDraftSkillPath(skillPath);
430
617
  switch (deploymentReadiness) {
431
618
  case "blocked":
432
619
  return {
@@ -435,21 +622,30 @@ function summarizeDeploymentReadiness(
435
622
  };
436
623
  case "ready_to_deploy":
437
624
  return {
438
- summary:
439
- "Tests and baseline are in place. Run a live evolve so selftune can validate and deploy the strongest candidate.",
440
- command: `selftune evolve --skill ${skillName} --skill-path ${pathArg} --with-baseline`,
625
+ summary: draftPackage
626
+ ? "Tests and measured package checks are in place. Run create publish so selftune can re-run package replay and baseline before handing the draft into watch."
627
+ : "Tests and baseline are in place. Run a live evolve so selftune can validate and deploy the strongest candidate.",
628
+ command: draftPackage
629
+ ? `selftune create publish --skill-path ${pathArg}`
630
+ : `selftune evolve --skill ${skillName} --skill-path ${pathArg} --with-baseline`,
441
631
  };
442
632
  case "watching":
443
633
  return {
444
- summary:
445
- "A candidate is already deployed. Keep watching live trigger behavior and baseline lift before making another mutation.",
446
- command: `selftune watch --skill ${skillName}`,
634
+ summary: draftPackage
635
+ ? "This draft package is already published. Keep watching live trigger behavior and measured package lift before making another mutation."
636
+ : "A candidate is already deployed. Keep watching live trigger behavior and baseline lift before making another mutation.",
637
+ command: draftPackage
638
+ ? `selftune watch --skill ${skillName} --skill-path ${pathArg}`
639
+ : `selftune watch --skill ${skillName}`,
447
640
  };
448
641
  case "rolled_back":
449
642
  return {
450
- summary:
451
- "The last deployment rolled back. Review the failure evidence, rerun a replay dry-run if needed, then redeploy once the candidate is trustworthy again.",
452
- command: `selftune evolve --skill ${skillName} --skill-path ${pathArg} --dry-run --validation-mode replay`,
643
+ summary: draftPackage
644
+ ? "The last published draft rolled back. Review the failure evidence, rerun package replay and baseline if needed, then publish again once the package is trustworthy."
645
+ : "The last deployment rolled back. Review the failure evidence, rerun a replay dry-run if needed, then redeploy once the candidate is trustworthy again.",
646
+ command: draftPackage
647
+ ? `selftune create replay --skill-path ${pathArg} --mode package`
648
+ : `selftune evolve --skill ${skillName} --skill-path ${pathArg} --dry-run --validation-mode replay`,
453
649
  };
454
650
  }
455
651
  }
@@ -461,10 +657,10 @@ export function listSkillTestingReadiness(
461
657
  const context = buildTestingReadinessContext(db, searchDirs);
462
658
 
463
659
  return [...context.knownSkills]
464
- .sort((a, b) => a.localeCompare(b))
660
+ .toSorted((a, b) => a.localeCompare(b))
465
661
  .map((skillName) => buildSkillTestingReadinessRow(skillName, context))
466
662
  .filter((row): row is SkillTestingReadiness => row != null)
467
- .sort((a, b) => {
663
+ .toSorted((a, b) => {
468
664
  const priorityDiff = nextStepPriority(a.next_step) - nextStepPriority(b.next_step);
469
665
  if (priorityDiff !== 0) return priorityDiff;
470
666
  const trustedDiff = b.trusted_session_count - a.trusted_session_count;
@@ -487,7 +683,11 @@ function buildTestingReadinessContext(db: Database, searchDirs: string[]): Testi
487
683
 
488
684
  for (const row of trustedRows) {
489
685
  const existing = trustedRowsBySkill.get(row.skill_name);
490
- const compact = { session_id: row.session_id, triggered: row.triggered };
686
+ const compact = {
687
+ session_id: row.session_id,
688
+ triggered: row.triggered,
689
+ query_text: row.query_text,
690
+ };
491
691
  if (existing) existing.push(compact);
492
692
  else trustedRowsBySkill.set(row.skill_name, [compact]);
493
693
  }
@@ -495,6 +695,7 @@ function buildTestingReadinessContext(db: Database, searchDirs: string[]): Testi
495
695
  const installedNames = findInstalledSkillNames(searchDirs);
496
696
  const unitTestDir = getUnitTestDir();
497
697
  const evalSetDir = getEvalSetDir();
698
+ const packageEvaluationDir = getPackageEvaluationDir();
498
699
  const unitTestNames = scanSkillNamesFromDir(unitTestDir, (entry) => {
499
700
  if (!entry.endsWith(".json") || entry.endsWith(".last-run.json")) return null;
500
701
  return entry.slice(0, -".json".length);
@@ -507,9 +708,14 @@ function buildTestingReadinessContext(db: Database, searchDirs: string[]): Testi
507
708
  if (!entry.endsWith(".json")) return null;
508
709
  return entry.slice(0, -".json".length);
509
710
  });
711
+ const packageEvaluationNames = scanSkillNamesFromDir(packageEvaluationDir, (entry) => {
712
+ if (!entry.endsWith(".json")) return null;
713
+ return entry.slice(0, -".json".length);
714
+ });
510
715
  const storedEvalNames = listStoredSkillNames(db, "canonical_eval_sets");
511
716
  const storedUnitTestNames = listStoredSkillNames(db, "unit_test_files");
512
717
  const storedUnitTestRunNames = listStoredSkillNames(db, "unit_test_run_results");
718
+ const storedPackageEvaluationNames = listStoredSkillNames(db, "package_evaluation_reports");
513
719
 
514
720
  const evidenceRows = queryEvolutionEvidence(db);
515
721
  const evalEvidenceBySkill = new Map<string, { count: number; latestAt: string | null }>();
@@ -580,6 +786,38 @@ function buildTestingReadinessContext(db: Database, searchDirs: string[]): Testi
580
786
  });
581
787
  }
582
788
 
789
+ const packageEvaluationRows = db
790
+ .query(
791
+ `SELECT skill_name, stored_at, summary_json
792
+ FROM package_evaluation_reports
793
+ ORDER BY stored_at DESC`,
794
+ )
795
+ .all() as Array<{
796
+ skill_name: string;
797
+ stored_at: string;
798
+ summary_json: string;
799
+ }>;
800
+ const packageEvaluationBySkill = new Map<
801
+ string,
802
+ { summary: CreatePackageEvaluationSummary; storedAt: string | null }
803
+ >();
804
+ for (const row of packageEvaluationRows) {
805
+ if (packageEvaluationBySkill.has(row.skill_name)) continue;
806
+ const parsed = parseJsonObject(row.summary_json);
807
+ if (
808
+ !parsed ||
809
+ typeof parsed["skill_name"] !== "string" ||
810
+ typeof parsed["status"] !== "string" ||
811
+ typeof parsed["evaluation_passed"] !== "boolean"
812
+ ) {
813
+ continue;
814
+ }
815
+ packageEvaluationBySkill.set(row.skill_name, {
816
+ summary: parsed as unknown as CreatePackageEvaluationSummary,
817
+ storedAt: row.stored_at ?? null,
818
+ });
819
+ }
820
+
583
821
  const latestEvolutionRows = db
584
822
  .query(
585
823
  `SELECT skill_name, action, timestamp
@@ -624,9 +862,11 @@ function buildTestingReadinessContext(db: Database, searchDirs: string[]): Testi
624
862
  ...unitTestNames,
625
863
  ...unitTestResultNames,
626
864
  ...canonicalEvalNames,
865
+ ...packageEvaluationNames,
627
866
  ...storedEvalNames,
628
867
  ...storedUnitTestNames,
629
868
  ...storedUnitTestRunNames,
869
+ ...storedPackageEvaluationNames,
630
870
  ...evalEvidenceBySkill.keys(),
631
871
  ...replayBySkill.keys(),
632
872
  ...baselineBySkill.keys(),
@@ -642,6 +882,7 @@ function buildTestingReadinessContext(db: Database, searchDirs: string[]): Testi
642
882
  fallbackSkillPathBySkill,
643
883
  replayBySkill,
644
884
  baselineBySkill,
885
+ packageEvaluationBySkill,
645
886
  latestEvolutionBySkill,
646
887
  };
647
888
  }
@@ -651,8 +892,11 @@ function buildSkillTestingReadinessRow(
651
892
  context: TestingReadinessContext,
652
893
  ): SkillTestingReadiness | null {
653
894
  const trustRows = context.trustedRowsBySkill.get(skillName) ?? [];
654
- const trustedTriggerCount = trustRows.filter((row) => row.triggered === 1).length;
655
- const trustedSessionCount = new Set(trustRows.map((row) => row.session_id)).size;
895
+ const trustedPositiveRows = trustRows.filter(
896
+ (row) => row.triggered === 1 && extractPositiveEvalQueryText(row.query_text, skillName) != null,
897
+ );
898
+ const trustedTriggerCount = trustedPositiveRows.length;
899
+ const trustedSessionCount = new Set(trustedPositiveRows.map((row) => row.session_id)).size;
656
900
 
657
901
  const installedSkillPath = findInstalledSkillPath(skillName, context.searchDirs) ?? null;
658
902
  if (!context.knownSkills.has(skillName) && installedSkillPath == null) {
@@ -660,6 +904,7 @@ function buildSkillTestingReadinessRow(
660
904
  }
661
905
 
662
906
  const skillPath = installedSkillPath ?? context.fallbackSkillPathBySkill.get(skillName) ?? null;
907
+ const draftPackage = isDraftSkillPath(skillPath);
663
908
  const evalReadiness = deriveEvalReadiness(skillPath, trustedTriggerCount);
664
909
 
665
910
  const canonicalEvalPath = getCanonicalEvalSetPath(skillName);
@@ -683,6 +928,43 @@ function buildSkillTestingReadinessRow(
683
928
  const unitTestResult =
684
929
  readUnitTestRunResultFromDb(context.db, skillName) ??
685
930
  readUnitTestResult(getUnitTestResultPath(skillName));
931
+ const storedPackageEvaluation =
932
+ context.packageEvaluationBySkill.get(skillName) ??
933
+ readPackageEvaluationFromDb(context.db, skillName);
934
+ const filePackageEvaluation =
935
+ storedPackageEvaluation == null && existsSync(getCanonicalPackageEvaluationPath(skillName))
936
+ ? (() => {
937
+ const parsed = parseJsonObject(
938
+ readFileSync(getCanonicalPackageEvaluationPath(skillName), "utf-8"),
939
+ );
940
+ if (
941
+ !parsed ||
942
+ typeof parsed["skill_name"] !== "string" ||
943
+ typeof parsed["status"] !== "string" ||
944
+ typeof parsed["evaluation_passed"] !== "boolean"
945
+ ) {
946
+ return null;
947
+ }
948
+ const stat = statSync(getCanonicalPackageEvaluationPath(skillName));
949
+ return {
950
+ summary: parsed as unknown as CreatePackageEvaluationSummary,
951
+ storedAt: stat.mtime.toISOString?.() ?? null,
952
+ };
953
+ })()
954
+ : null;
955
+ const packageEvaluation = storedPackageEvaluation ?? filePackageEvaluation;
956
+ const currentPackageFingerprint =
957
+ draftPackage && skillPath ? computeCreatePackageFingerprint(skillPath) : null;
958
+ const packageEvaluationMatchesCurrentPackage =
959
+ packageEvaluation?.summary.package_fingerprint != null &&
960
+ currentPackageFingerprint != null &&
961
+ packageEvaluation.summary.package_fingerprint === currentPackageFingerprint;
962
+ const effectivePackageEvaluation = packageEvaluationMatchesCurrentPackage
963
+ ? packageEvaluation
964
+ : null;
965
+ const packageEvaluationStatus = effectivePackageEvaluation?.summary.status ?? null;
966
+ const packageEvaluationPassed = effectivePackageEvaluation?.summary.evaluation_passed ?? null;
967
+ const latestPackageEvaluationAt = effectivePackageEvaluation?.storedAt ?? null;
686
968
 
687
969
  const replay = context.replayBySkill.get(skillName) ?? {
688
970
  check_count: 0,
@@ -703,10 +985,16 @@ function buildSkillTestingReadinessRow(
703
985
  nextStep = "generate_evals";
704
986
  } else if (unitTestCases === 0) {
705
987
  nextStep = "run_unit_tests";
988
+ } else if (unitTestResult != null && unitTestResult.pass_rate < 1) {
989
+ nextStep = "run_unit_tests";
706
990
  } else if (replay.check_count === 0) {
707
991
  nextStep = "run_replay_dry_run";
708
992
  } else if (baseline.sample_size === 0) {
709
993
  nextStep = "measure_baseline";
994
+ } else if (draftPackage && packageEvaluationStatus === "replay_failed") {
995
+ nextStep = "run_replay_dry_run";
996
+ } else if (draftPackage && packageEvaluationStatus === "baseline_failed") {
997
+ nextStep = "measure_baseline";
710
998
  } else if (latestEvolution.action === "deployed") {
711
999
  nextStep = "watch_deployment";
712
1000
  } else {
@@ -718,12 +1006,15 @@ function buildSkillTestingReadinessRow(
718
1006
  const recommended_command = recommendCommand(skillName, skillPath, nextStep);
719
1007
  const summary = summarizeReadiness(
720
1008
  nextStep,
1009
+ draftPackage,
721
1010
  evalReadiness,
722
1011
  evalSetEntries,
723
1012
  unitTestCases,
724
1013
  replay.check_count,
725
1014
  baseline.sample_size,
726
1015
  unitTestResult?.pass_rate ?? null,
1016
+ packageEvaluationStatus,
1017
+ latestPackageEvaluationAt,
727
1018
  );
728
1019
 
729
1020
  return {
@@ -750,6 +1041,9 @@ function buildSkillTestingReadinessRow(
750
1041
  baseline_sample_size: baseline.sample_size,
751
1042
  baseline_pass_rate: baseline.pass_rate,
752
1043
  latest_baseline_at: baseline.measured_at,
1044
+ package_evaluation_status: packageEvaluationStatus,
1045
+ package_evaluation_passed: packageEvaluationPassed,
1046
+ latest_package_evaluation_at: latestPackageEvaluationAt,
753
1047
  deployment_readiness: deploymentReadiness,
754
1048
  deployment_summary: deployment.summary,
755
1049
  deployment_command: deployment.command,
@@ -758,10 +1052,102 @@ function buildSkillTestingReadinessRow(
758
1052
  } satisfies SkillTestingReadiness;
759
1053
  }
760
1054
 
761
- export function buildCreatorTestingOverview(
762
- readinessRows: SkillTestingReadiness[],
763
- ): CreatorTestingOverview {
1055
/**
 * Translates a creator-loop next step into the overview step of the same name.
 *
 * Each of the six loop steps maps to the identically named overview step; the
 * function exists to move the value from the `CreatorLoopNextStep` type into
 * the `CreatorOverviewStep` type.
 *
 * @param step - Next step reported by a skill's testing readiness.
 * @returns The overview step with the same name as `step`.
 */
function mapCreatorLoopNextStep(step: CreatorLoopNextStep): CreatorOverviewStep {
  // The case labels narrow `step` to the literals both types share, so the
  // narrowed value itself can be returned.
  switch (step) {
    case "generate_evals":
    case "run_unit_tests":
    case "run_replay_dry_run":
    case "measure_baseline":
    case "deploy_candidate":
    case "watch_deployment":
      return step;
  }
}
1071
+
1072
/**
 * Picks the overview step for a skill that has create-readiness information.
 *
 * A skill whose testing readiness already points at `watch_deployment`, or
 * whose latest evolution action is `deployed`, is always reported as
 * `watch_deployment`, whatever its create state says. Otherwise the step is
 * derived from `createReadiness.state`.
 *
 * @param createReadiness - Create-readiness record of the skill (non-null).
 * @param testingReadiness - Testing-readiness row of the skill, if it has one.
 * @returns The overview step the skill should be counted and listed under.
 */
function mapCreateStateToCreatorStep(
  createReadiness: NonNullable<SkillSummary["create_readiness"]>,
  testingReadiness: SkillTestingReadiness | undefined,
): CreatorOverviewStep {
  const alreadyDeployed =
    testingReadiness?.next_step === "watch_deployment" ||
    testingReadiness?.latest_evolution_action === "deployed";
  if (alreadyDeployed) {
    return "watch_deployment";
  }

  const { state } = createReadiness;
  switch (state) {
    // Both spec-validation states lead to the same create check step.
    case "blocked_spec_validation":
    case "needs_spec_validation":
      return "run_create_check";
    case "needs_package_resources":
      return "finish_package";
    case "needs_evals":
      return "generate_evals";
    case "needs_unit_tests":
      return "run_unit_tests";
    case "needs_routing_replay":
      return "run_replay_dry_run";
    case "needs_baseline":
      return "measure_baseline";
    case "ready_to_publish":
      return "deploy_candidate";
  }
}
1101
+
1102
/**
 * Builds the overview priority entry for a single skill.
 *
 * - Without create readiness, the entry comes entirely from the testing
 *   readiness row, or is `null` when that row is missing as well.
 * - With create readiness, the step comes from `mapCreateStateToCreatorStep`.
 *   When that step is `watch_deployment` and a testing readiness row exists,
 *   the summary and command are taken from the testing row; otherwise they
 *   come from the create-readiness record.
 *
 * @param skill - The skill's name plus its testing and create readiness.
 * @returns The priority entry for the skill, or `null` if it has neither
 *   kind of readiness information.
 */
function deriveCreatorPriority(
  skill: Pick<SkillSummary, "skill_name" | "testing_readiness" | "create_readiness">,
): CreatorTestingOverview["priorities"][number] | null {
  const { skill_name, testing_readiness: testing, create_readiness: create } = skill;

  if (!create) {
    if (!testing) return null;
    return {
      skill_name,
      step: mapCreatorLoopNextStep(testing.next_step),
      summary: testing.summary,
      recommended_command: testing.recommended_command,
    };
  }

  const step = mapCreateStateToCreatorStep(create, testing);

  if (step === "watch_deployment" && testing) {
    return {
      skill_name,
      step,
      summary: testing.summary,
      recommended_command: testing.recommended_command,
    };
  }

  // Command fallback chain: the create flow's own next command, then the
  // testing row's recommendation, then a create check on the skill path.
  const recommended_command =
    create.next_command ??
    testing?.recommended_command ??
    `selftune create check --skill-path ${create.skill_path}`;

  return {
    skill_name,
    step,
    summary: create.summary,
    recommended_command,
  };
}
1135
+
1136
/**
 * Sort rank of every overview step; a lower number sorts earlier.
 *
 * `buildCreatorTestingOverview` subtracts these ranks to order the visible
 * priorities before it truncates the list.
 */
const CREATOR_OVERVIEW_STEP_ORDER: Record<CreatorOverviewStep, number> = {
  "run_create_check": 0,
  "finish_package": 1,
  "generate_evals": 2,
  "run_unit_tests": 3,
  "run_replay_dry_run": 4,
  "measure_baseline": 5,
  "deploy_candidate": 6,
  "watch_deployment": 7,
};
1146
+
1147
+ export function buildCreatorTestingOverview(skills: SkillSummary[]): CreatorTestingOverview {
764
1148
  const counts = {
1149
+ run_create_check: 0,
1150
+ finish_package: 0,
765
1151
  generate_evals: 0,
766
1152
  run_unit_tests: 0,
767
1153
  run_replay_dry_run: 0,
@@ -770,21 +1156,26 @@ export function buildCreatorTestingOverview(
770
1156
  watch_deployment: 0,
771
1157
  } satisfies CreatorTestingOverview["counts"];
772
1158
 
773
- for (const row of readinessRows) {
774
- counts[row.next_step]++;
1159
+ const priorities = skills
1160
+ .map((skill) => deriveCreatorPriority(skill))
1161
+ .filter(
1162
+ (priority): priority is CreatorTestingOverview["priorities"][number] => priority != null,
1163
+ );
1164
+
1165
+ for (const priority of priorities) {
1166
+ counts[priority.step]++;
775
1167
  }
776
1168
 
777
- const priorities = readinessRows
778
- .filter((row) => row.next_step !== "watch_deployment")
779
- .slice(0, 5)
780
- .map((row) => ({
781
- skill_name: row.skill_name,
782
- next_step: row.next_step,
783
- summary: row.summary,
784
- recommended_command: row.recommended_command,
785
- }));
1169
+ const visiblePriorities = priorities
1170
+ .filter((priority) => priority.step !== "watch_deployment")
1171
+ .toSorted(
1172
+ (a, b) =>
1173
+ CREATOR_OVERVIEW_STEP_ORDER[a.step] - CREATOR_OVERVIEW_STEP_ORDER[b.step] ||
1174
+ a.skill_name.localeCompare(b.skill_name),
1175
+ )
1176
+ .slice(0, 5);
786
1177
 
787
- const summary = `${counts.deploy_candidate} ready to deploy, ${counts.watch_deployment} already shipped and under watch, ${counts.generate_evals} still need evals, ${counts.run_unit_tests} need unit tests, ${counts.run_replay_dry_run} need replay dry-runs, ${counts.measure_baseline} need baselines.`;
1178
+ const summary = `${counts.deploy_candidate} ready to deploy, ${counts.watch_deployment} already shipped and under watch, ${counts.run_create_check} need create check, ${counts.finish_package} need package work, ${counts.generate_evals} still need evals, ${counts.run_unit_tests} need unit tests, ${counts.run_replay_dry_run} need replay dry-runs, ${counts.measure_baseline} need baselines.`;
788
1179
 
789
- return { summary, counts, priorities };
1180
+ return { summary, counts, priorities: visiblePriorities };
790
1181
  }