selftune 0.2.30 → 0.2.32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. package/README.md +83 -56
  2. package/apps/local-dashboard/dist/assets/index-B-ut4w0B.js +15 -0
  3. package/apps/local-dashboard/dist/assets/index-BFGfCVrL.css +1 -0
  4. package/apps/local-dashboard/dist/assets/vendor-ui-DfowE3Hu.js +1 -0
  5. package/apps/local-dashboard/dist/index.html +3 -3
  6. package/cli/selftune/command-surface.ts +613 -2
  7. package/cli/selftune/create/baseline.ts +429 -0
  8. package/cli/selftune/create/check.ts +35 -0
  9. package/cli/selftune/create/init.ts +115 -0
  10. package/cli/selftune/create/package-candidate-state.ts +771 -0
  11. package/cli/selftune/create/package-evaluator.ts +710 -0
  12. package/cli/selftune/create/package-fingerprint.ts +142 -0
  13. package/cli/selftune/create/package-search.ts +377 -0
  14. package/cli/selftune/create/publish.ts +431 -0
  15. package/cli/selftune/create/readiness.ts +495 -0
  16. package/cli/selftune/create/replay.ts +330 -0
  17. package/cli/selftune/create/report.ts +74 -0
  18. package/cli/selftune/create/scaffold.ts +121 -0
  19. package/cli/selftune/create/skills-ref-adapter.ts +177 -0
  20. package/cli/selftune/create/status.ts +33 -0
  21. package/cli/selftune/create/templates.ts +249 -0
  22. package/cli/selftune/cron/setup.ts +1 -1
  23. package/cli/selftune/dashboard-action-events.ts +4 -1
  24. package/cli/selftune/dashboard-action-result.ts +789 -24
  25. package/cli/selftune/dashboard-action-stream.ts +80 -0
  26. package/cli/selftune/dashboard-contract.ts +146 -3
  27. package/cli/selftune/dashboard-server.ts +5 -4
  28. package/cli/selftune/eval/hooks-to-evals.ts +58 -35
  29. package/cli/selftune/eval/synthetic-evals.ts +145 -17
  30. package/cli/selftune/evolution/bounded-mutations.ts +1045 -0
  31. package/cli/selftune/evolution/evolve-body.ts +9 -36
  32. package/cli/selftune/evolution/evolve.ts +8 -72
  33. package/cli/selftune/evolution/stopping-criteria.ts +5 -13
  34. package/cli/selftune/evolution/unblock-suggestions.ts +0 -16
  35. package/cli/selftune/evolution/validate-host-replay.ts +115 -15
  36. package/cli/selftune/improve.ts +206 -0
  37. package/cli/selftune/index.ts +123 -6
  38. package/cli/selftune/init.ts +1 -1
  39. package/cli/selftune/localdb/queries/dashboard.ts +30 -0
  40. package/cli/selftune/localdb/schema.ts +52 -0
  41. package/cli/selftune/monitoring/watch.ts +257 -23
  42. package/cli/selftune/orchestrate/execute.ts +300 -1
  43. package/cli/selftune/orchestrate/finalize.ts +14 -0
  44. package/cli/selftune/orchestrate/plan.ts +22 -5
  45. package/cli/selftune/orchestrate/prepare.ts +59 -4
  46. package/cli/selftune/orchestrate/report.ts +1 -1
  47. package/cli/selftune/orchestrate.ts +34 -1
  48. package/cli/selftune/publish.ts +35 -0
  49. package/cli/selftune/registry/github-install.ts +256 -0
  50. package/cli/selftune/registry/index.ts +1 -1
  51. package/cli/selftune/registry/install.ts +58 -7
  52. package/cli/selftune/routes/actions.ts +81 -15
  53. package/cli/selftune/routes/overview.ts +1 -1
  54. package/cli/selftune/routes/skill-report.ts +147 -2
  55. package/cli/selftune/run.ts +18 -0
  56. package/cli/selftune/schedule.ts +3 -3
  57. package/cli/selftune/search-run.ts +703 -0
  58. package/cli/selftune/status.ts +35 -11
  59. package/cli/selftune/testing-readiness.ts +431 -40
  60. package/cli/selftune/types.ts +316 -0
  61. package/cli/selftune/utils/eval-readiness.ts +1 -0
  62. package/cli/selftune/utils/json-output.ts +11 -0
  63. package/cli/selftune/utils/lifecycle-surface.ts +48 -0
  64. package/cli/selftune/utils/query-filter.ts +82 -1
  65. package/cli/selftune/utils/tui.ts +85 -2
  66. package/cli/selftune/verify.ts +205 -0
  67. package/cli/selftune/workflows/proposals.ts +1 -1
  68. package/cli/selftune/workflows/skill-scaffold.ts +141 -63
  69. package/cli/selftune/workflows/workflows.ts +4 -4
  70. package/package.json +1 -1
  71. package/packages/dashboard-core/src/routes/manifest.ts +2 -2
  72. package/packages/ui/src/components/SkillReportPanels.tsx +7 -7
  73. package/packages/ui/src/primitives/button.tsx +5 -0
  74. package/skill/SKILL.md +148 -85
  75. package/skill/references/cli-quick-reference.md +16 -1
  76. package/skill/references/creator-playbook.md +31 -10
  77. package/skill/workflows/Baseline.md +8 -9
  78. package/skill/workflows/Contributions.md +4 -4
  79. package/skill/workflows/Create.md +173 -0
  80. package/skill/workflows/CreateTestDeploy.md +34 -30
  81. package/skill/workflows/Cron.md +2 -2
  82. package/skill/workflows/Dashboard.md +3 -3
  83. package/skill/workflows/Evals.md +13 -7
  84. package/skill/workflows/Evolve.md +75 -32
  85. package/skill/workflows/EvolveBody.md +22 -15
  86. package/skill/workflows/Hook.md +1 -1
  87. package/skill/workflows/Improve.md +168 -0
  88. package/skill/workflows/Initialize.md +3 -3
  89. package/skill/workflows/Orchestrate.md +49 -12
  90. package/skill/workflows/Publish.md +100 -0
  91. package/skill/workflows/Registry.md +19 -13
  92. package/skill/workflows/Run.md +72 -0
  93. package/skill/workflows/Schedule.md +2 -2
  94. package/skill/workflows/SearchRun.md +89 -0
  95. package/skill/workflows/SignalsDashboard.md +2 -2
  96. package/skill/workflows/UnitTest.md +13 -4
  97. package/skill/workflows/Verify.md +136 -0
  98. package/skill/workflows/Watch.md +114 -47
  99. package/skill/workflows/Workflows.md +13 -8
  100. package/apps/local-dashboard/dist/assets/index-BcXquWFB.css +0 -1
  101. package/apps/local-dashboard/dist/assets/index-Coq42hE4.js +0 -15
  102. package/apps/local-dashboard/dist/assets/vendor-ui-B0H8s1mP.js +0 -1
@@ -63,6 +63,70 @@ function detectDashboardAction(argv: string[]): {
63
63
  };
64
64
  }
65
65
 
66
+ if (command === "create" && subcommand === "replay") {
67
+ return {
68
+ action: "replay-dry-run",
69
+ skillName: null,
70
+ skillPath: readFlagValue(argv, "--skill-path"),
71
+ };
72
+ }
73
+
74
+ if (command === "create" && subcommand === "check") {
75
+ return {
76
+ action: "create-check",
77
+ skillName: null,
78
+ skillPath: readFlagValue(argv, "--skill-path"),
79
+ };
80
+ }
81
+
82
+ if (command === "create" && subcommand === "baseline") {
83
+ return {
84
+ action: "measure-baseline",
85
+ skillName: null,
86
+ skillPath: readFlagValue(argv, "--skill-path"),
87
+ };
88
+ }
89
+
90
+ if (command === "create" && subcommand === "report") {
91
+ return {
92
+ action: "report-package",
93
+ skillName: null,
94
+ skillPath: readFlagValue(argv, "--skill-path"),
95
+ };
96
+ }
97
+
98
+ if (command === "create" && subcommand === "publish") {
99
+ return {
100
+ action: hasFlag(argv, "--watch") ? "watch" : "deploy-candidate",
101
+ skillName: null,
102
+ skillPath: readFlagValue(argv, "--skill-path"),
103
+ };
104
+ }
105
+
106
+ if (command === "verify") {
107
+ return {
108
+ action: "report-package",
109
+ skillName: null,
110
+ skillPath: readFlagValue(argv, "--skill-path"),
111
+ };
112
+ }
113
+
114
+ if (command === "publish") {
115
+ return {
116
+ action: hasFlag(argv, "--no-watch") ? "deploy-candidate" : "watch",
117
+ skillName: null,
118
+ skillPath: readFlagValue(argv, "--skill-path"),
119
+ };
120
+ }
121
+
122
+ if (command === "search-run") {
123
+ return {
124
+ action: "search-run",
125
+ skillName: readFlagValue(argv, "--skill"),
126
+ skillPath: readFlagValue(argv, "--skill-path"),
127
+ };
128
+ }
129
+
66
130
  if (command === "orchestrate") {
67
131
  return {
68
132
  action: "orchestrate",
@@ -71,6 +135,14 @@ function detectDashboardAction(argv: string[]): {
71
135
  };
72
136
  }
73
137
 
138
+ if (command === "run") {
139
+ return {
140
+ action: "orchestrate",
141
+ skillName: null,
142
+ skillPath: null,
143
+ };
144
+ }
145
+
74
146
  if (command === "evolve" && subcommand === "rollback") {
75
147
  return {
76
148
  action: "rollback",
@@ -87,6 +159,14 @@ function detectDashboardAction(argv: string[]): {
87
159
  };
88
160
  }
89
161
 
162
+ if (command === "improve") {
163
+ return {
164
+ action: hasFlag(argv, "--dry-run") ? "replay-dry-run" : "deploy-candidate",
165
+ skillName: readFlagValue(argv, "--skill"),
166
+ skillPath: readFlagValue(argv, "--skill-path"),
167
+ };
168
+ }
169
+
90
170
  return null;
91
171
  }
92
172
 
@@ -1,3 +1,17 @@
1
+ import type {
2
+ CreatePackageBodySummary,
3
+ CreatePackageCandidateAcceptanceDecision,
4
+ CreateCheckReadiness,
5
+ CreatePackageEvaluationEfficiencySummary,
6
+ CreatePackageEvaluationEvidenceSummary,
7
+ CreatePackageEvaluationGradingSummary,
8
+ CreatePackageEvaluationSource,
9
+ CreatePackageReplaySummary,
10
+ CreatePackageEvaluationStatus,
11
+ CreatePackageEvaluationUnitTestSummary,
12
+ CreatePackageEvaluationWatchSummary,
13
+ } from "./types.js";
14
+
1
15
  // -- Cursor-based pagination types -------------------------------------------
2
16
 
3
17
  export interface PaginationCursor {
@@ -151,6 +165,7 @@ export interface SkillSummary {
151
165
  routing_confidence: number | null;
152
166
  confidence_coverage: number;
153
167
  testing_readiness?: SkillTestingReadiness;
168
+ create_readiness?: CreateCheckReadiness;
154
169
  }
155
170
 
156
171
  // -- Autonomy-first overview types -------------------------------------------
@@ -346,6 +361,9 @@ export interface SkillTestingReadiness {
346
361
  baseline_sample_size: number;
347
362
  baseline_pass_rate: number | null;
348
363
  latest_baseline_at: string | null;
364
+ package_evaluation_status?: CreatePackageEvaluationStatus | null;
365
+ package_evaluation_passed?: boolean | null;
366
+ latest_package_evaluation_at?: string | null;
349
367
  deployment_readiness: DeploymentReadiness;
350
368
  deployment_summary: string;
351
369
  deployment_command: string | null;
@@ -354,6 +372,8 @@ export interface SkillTestingReadiness {
354
372
  }
355
373
 
356
374
  export type DashboardActionName =
375
+ | "create-check"
376
+ | "report-package"
357
377
  | "generate-evals"
358
378
  | "generate-unit-tests"
359
379
  | "replay-dry-run"
@@ -361,7 +381,8 @@ export type DashboardActionName =
361
381
  | "deploy-candidate"
362
382
  | "watch"
363
383
  | "orchestrate"
364
- | "rollback";
384
+ | "rollback"
385
+ | "search-run";
365
386
 
366
387
  export type DashboardActionEventStage =
367
388
  | "started"
@@ -376,9 +397,49 @@ export interface DashboardActionResultSummary {
376
397
  improved: boolean | null;
377
398
  deployed: boolean | null;
378
399
  before_pass_rate: number | null;
400
+ before_label?: string | null;
379
401
  after_pass_rate: number | null;
402
+ after_label?: string | null;
380
403
  net_change: number | null;
404
+ net_change_label?: string | null;
381
405
  validation_mode: string | null;
406
+ validation_label?: string | null;
407
+ recommended_command?: string | null;
408
+ package_evaluation_source?: CreatePackageEvaluationSource | null;
409
+ package_candidate_id?: string | null;
410
+ package_parent_candidate_id?: string | null;
411
+ package_candidate_generation?: number | null;
412
+ package_candidate_acceptance_decision?: CreatePackageCandidateAcceptanceDecision | null;
413
+ package_candidate_acceptance_rationale?: string | null;
414
+ package_evidence?: CreatePackageEvaluationEvidenceSummary | null;
415
+ package_efficiency?: CreatePackageEvaluationEfficiencySummary | null;
416
+ package_routing?: CreatePackageReplaySummary | null;
417
+ package_body?: CreatePackageBodySummary | null;
418
+ package_grading?: CreatePackageEvaluationGradingSummary | null;
419
+ package_unit_tests?: CreatePackageEvaluationUnitTestSummary | null;
420
+ package_watch?: CreatePackageEvaluationWatchSummary | null;
421
+ /** Search run provenance — populated only for search-run actions. */
422
+ search_run?: DashboardSearchRunSummary | null;
423
+ /** Whether the watch gate passed for publish actions (null for non-publish actions). */
424
+ watch_gate_passed?: boolean | null;
425
+ }
426
+
/**
 * Condensed search run outcome carried on an action result summary.
 *
 * All identifiers are plain strings; the parent and winner references (and the
 * winner rationale) are nullable.
 */
export interface DashboardSearchRunSummary {
  search_id: string;
  parent_candidate_id: string | null;
  winner_candidate_id: string | null;
  winner_rationale: string | null;
  candidates_evaluated: number;
  frontier_size: number;
  parent_selection_method: string;
  /** Optional: may be omitted entirely or set to `null`. */
  surface_plan?: null | {
    routing_count: number;
    body_count: number;
    weakness_source: string;
    routing_weakness: number | null;
    body_weakness: number | null;
  };
}
383
444
 
384
445
  export interface DashboardActionMetrics {
@@ -424,9 +485,21 @@ export interface DashboardActionEvent {
424
485
  progress?: DashboardActionProgress | null;
425
486
  }
426
487
 
/**
 * Step identifiers accepted by the `step` field of
 * `CreatorTestingOverview.priorities` entries.
 */
export type CreatorOverviewStep =
  | "run_create_check"
  | "finish_package"
  | "generate_evals"
  | "run_unit_tests"
  | "run_replay_dry_run"
  | "measure_baseline"
  | "deploy_candidate"
  | "watch_deployment";
497
+
427
498
  export interface CreatorTestingOverview {
428
499
  summary: string;
429
500
  counts: {
501
+ run_create_check: number;
502
+ finish_package: number;
430
503
  generate_evals: number;
431
504
  run_unit_tests: number;
432
505
  run_replay_dry_run: number;
@@ -436,7 +509,7 @@ export interface CreatorTestingOverview {
436
509
  };
437
510
  priorities: Array<{
438
511
  skill_name: string;
439
- next_step: CreatorLoopNextStep;
512
+ step: CreatorOverviewStep;
440
513
  summary: string;
441
514
  recommended_command: string;
442
515
  }>;
@@ -446,7 +519,7 @@ export interface CreatorTestingOverview {
446
519
 
447
520
  export interface OrchestrateRunSkillAction {
448
521
  skill: string;
449
- action: "evolve" | "watch" | "skip";
522
+ action: "evolve" | "package-search" | "watch" | "skip";
450
523
  reason: string;
451
524
  deployed?: boolean;
452
525
  rolledBack?: boolean;
@@ -468,6 +541,8 @@ export interface OrchestrateRunReport {
468
541
  watched: number;
469
542
  skipped: number;
470
543
  auto_graded?: number;
544
+ package_searched?: number;
545
+ package_improved?: number;
471
546
  skill_actions: OrchestrateRunSkillAction[];
472
547
  }
473
548
 
@@ -558,6 +633,69 @@ export interface ReplayEntryResult {
558
633
  evidence: string | null;
559
634
  }
560
635
 
636
+ // -- Package search / frontier types (bounded package evolution) ---------------
637
+
/**
 * Dashboard-side projection of a package search run result.
 *
 * Mirrors `PackageSearchRunResult` from types.ts: it only exposes what the
 * search runner reports and does not redefine search semantics.
 */
export interface DashboardSearchRunView {
  search_id: string;
  skill_name: string;
  parent_candidate_id: string | null;
  candidates_evaluated: number;
  winner_candidate_id: string | null;
  winner_rationale: string | null;
  // Timestamps are typed as plain strings; the exact format is not fixed here.
  started_at: string;
  completed_at: string;
  provenance: DashboardSearchProvenance;
}
654
+
/** Provenance details the dashboard shows for a single search run. */
export interface DashboardSearchProvenance {
  frontier_size: number;
  parent_selection_method: string;
  candidate_fingerprints: Array<string>;
  /** Optional: may be omitted entirely or set to `null`. */
  surface_plan?: null | {
    routing_count: number;
    body_count: number;
    weakness_source: string;
    routing_weakness: number | null;
    body_weakness: number | null;
  };
  /** One entry per summarized candidate evaluation. */
  evaluation_summaries: {
    candidate_id: string;
    decision: string;
    rationale: string;
  }[];
}
673
+
/** One frontier member, as listed in the skill report's frontier state panel. */
export interface DashboardFrontierMember {
  candidate_id: string;
  skill_name: string;
  fingerprint: string;
  decision: "accepted" | "rejected" | "pending";
  measured_delta: number | null;
  created_at: string;
  parent_candidate_id: string | null;
  /** Set to true when watch-fed evidence demoted this candidate. */
  watch_demoted?: boolean;
  /** Position by evidence inside the accepted frontier; 1 is the best rank. */
  evidence_rank?: number | null;
}
688
+
/**
 * Summary of a skill's frontier as exposed in the skill report: per-decision
 * counts, the member list, and the latest search run (or `null`).
 */
export interface DashboardFrontierState {
  skill_name: string;
  accepted_count: number;
  rejected_count: number;
  pending_count: number;
  members: Array<DashboardFrontierMember>;
  latest_search_run: null | DashboardSearchRunView;
}
698
+
561
699
  // -- Doctor / health check types ----------------------------------------------
562
700
  export type { DoctorResult, HealthCheck, HealthStatus } from "./types.js";
563
701
 
@@ -693,6 +831,8 @@ export interface TrustFields {
693
831
  }
694
832
 
695
833
  export interface SkillReportResponse extends SkillReportPayload, TrustFields {
834
+ /** Watch trust score (0-1) from the most recent watch cycle, null if never watched. */
835
+ watch_trust_score: number | null;
696
836
  evolution: EvolutionEntry[];
697
837
  pending_proposals: PendingProposal[];
698
838
  token_usage: {
@@ -727,4 +867,7 @@ export interface SkillReportResponse extends SkillReportPayload, TrustFields {
727
867
  };
728
868
  } | null;
729
869
  testing_readiness?: SkillTestingReadiness;
870
+ create_readiness?: CreateCheckReadiness;
871
+ /** Package frontier state — populated when bounded package evolution data exists. */
872
+ frontier_state?: DashboardFrontierState | null;
730
873
  }
@@ -10,9 +10,10 @@
10
10
  * GET /api/v2/overview — SQLite-backed overview payload
11
11
  * GET /api/v2/analytics — Performance analytics (trends, rankings, heatmap)
12
12
  * GET /api/v2/skills/:name — SQLite-backed per-skill report
13
- * POST /api/actions/watch — Trigger `selftune watch` for a skill
14
- * POST /api/actions/evolve — Trigger `selftune evolve` for a skill
15
- * POST /api/actions/rollback — Trigger `selftune rollback` for a skill
13
+ * POST /api/actions/create-check — Trigger `selftune create check` for a draft package
14
+ * POST /api/actions/watch — Trigger `selftune watch` for a skill
15
+ * POST /api/actions/evolve — Trigger `selftune evolve` for a skill
16
+ * POST /api/actions/rollback — Trigger `selftune rollback` for a skill
16
17
  * POST /api/actions/watchlist — Persist creator watchlist preferences
17
18
  * GET /badge/:name — Skill health badge
18
19
  * GET /report/:name — Skill health report HTML
@@ -676,7 +677,7 @@ export async function startDashboardServer(options?: DashboardServerOptions): Pr
676
677
  return serveSpaShell(spaDir);
677
678
  }
678
679
 
679
- // ---- POST /api/actions/{watch,evolve,rollback,watchlist} ----
680
+ // ---- POST /api/actions/{create-check,watch,evolve,rollback,watchlist} ----
680
681
  if (url.pathname.startsWith("/api/actions/") && req.method === "POST") {
681
682
  const trustedActionOrigins = allowedDashboardOrigins(hostname, boundPort);
682
683
  const origin = req.headers.get("origin");
@@ -43,8 +43,10 @@ import type {
43
43
  SkillUsageRecord,
44
44
  } from "../types.js";
45
45
  import { CLIError, handleCLIError } from "../utils/cli-error.js";
46
- import { detectLlmAgent } from "../utils/llm-call.js";
46
+ import { MIN_LOG_READY_POSITIVES } from "../utils/eval-readiness.js";
47
+ import { detectLlmAgent, isLlmBackedAgent } from "../utils/llm-call.js";
47
48
  import {
49
+ extractPositiveEvalQueryText,
48
50
  filterActionableQueryRecords,
49
51
  filterActionableSkillUsageRecords,
50
52
  } from "../utils/query-filter.js";
@@ -63,6 +65,36 @@ import { writeCanonicalEvalSet } from "../testing-readiness.js";
63
65
 
64
66
  export { classifyInvocation } from "./invocation-classifier.js";
65
67
 
/**
 * Chooses the agent CLI used for eval generation.
 *
 * When `requestedAgent` is provided (non-empty) it must pass
 * `isLlmBackedAgent` and resolve through `Bun.which`; it is then returned
 * unchanged. Otherwise the result of `detectLlmAgent()` is used.
 *
 * @param requestedAgent - Explicitly requested agent name, if any.
 * @returns The agent name to use.
 * @throws CLIError with code `INVALID_FLAG` when the requested agent is not
 *   LLM-backed, or `AGENT_NOT_FOUND` when the requested CLI cannot be located
 *   or nothing could be auto-detected.
 */
function resolveEvalGenerateAgent(requestedAgent?: string | null): string {
  // No explicit request: rely on auto-detection.
  if (!requestedAgent) {
    const autoDetected = detectLlmAgent();
    if (autoDetected) return autoDetected;
    throw new CLIError(
      "No agent CLI found (claude/codex/opencode/pi)",
      "AGENT_NOT_FOUND",
      "Install one of the supported agent CLIs",
    );
  }

  // Explicit request: reject unsupported names before checking the CLI exists.
  if (!isLlmBackedAgent(requestedAgent)) {
    throw new CLIError(
      `Unsupported --agent value "${requestedAgent}".`,
      "INVALID_FLAG",
      "Use claude, codex, opencode, or pi.",
    );
  }

  const executable = Bun.which(requestedAgent);
  if (!executable) {
    throw new CLIError(
      `Agent CLI '${requestedAgent}' not found in PATH`,
      "AGENT_NOT_FOUND",
      "Install it or omit --agent to use auto-detection",
    );
  }

  return requestedAgent;
}
97
+
66
98
  // ---------------------------------------------------------------------------
67
99
  // Query truncation
68
100
  // ---------------------------------------------------------------------------
@@ -97,8 +129,8 @@ export function buildEvalSet(
97
129
  for (const r of actionableSkillRecords) {
98
130
  if (!r || typeof r.skill_name !== "string" || typeof r.query !== "string") continue;
99
131
  if (isHighConfidencePositiveSkillRecord(r, skillName)) {
100
- const q = (r.query ?? "").trim();
101
- if (q && q !== "(query not found)") {
132
+ const q = extractPositiveEvalQueryText(r.query, skillName);
133
+ if (q) {
102
134
  positiveQueries.add(q);
103
135
  }
104
136
  }
@@ -110,8 +142,8 @@ export function buildEvalSet(
110
142
  for (const r of actionableSkillRecords) {
111
143
  if (!r || typeof r.skill_name !== "string" || typeof r.query !== "string") continue;
112
144
  if (!isHighConfidencePositiveSkillRecord(r, skillName)) continue;
113
- const q = (r.query ?? "").trim();
114
- if (!q || q === "(query not found)" || seen.has(q)) continue;
145
+ const q = extractPositiveEvalQueryText(r.query, skillName);
146
+ if (!q || seen.has(q)) continue;
115
147
  seen.add(q);
116
148
  const entry: EvalEntry = {
117
149
  query: truncateQuery(q),
@@ -331,6 +363,7 @@ export function listEvalSkillReadiness(
331
363
  if (r.session_id) rawSessionCounts.get(name)?.add(r.session_id);
332
364
 
333
365
  if (!isHighConfidencePositiveSkillRecord(r, name)) continue;
366
+ if (!extractPositiveEvalQueryText(r.query ?? "", name)) continue;
334
367
  trustedTriggerCounts.set(name, (trustedTriggerCounts.get(name) ?? 0) + 1);
335
368
  if (!trustedSessionCounts.has(name)) trustedSessionCounts.set(name, new Set<string>());
336
369
  if (r.session_id) trustedSessionCounts.get(name)?.add(r.session_id);
@@ -354,7 +387,11 @@ export function listEvalSkillReadiness(
354
387
  installed,
355
388
  skill_path: installed ? findInstalledSkillPath(name, searchDirs) : undefined,
356
389
  readiness:
357
- trustedTriggerCount > 0 ? "log_ready" : installed ? "cold_start_ready" : "telemetry_only",
390
+ trustedTriggerCount >= MIN_LOG_READY_POSITIVES
391
+ ? "log_ready"
392
+ : installed
393
+ ? "cold_start_ready"
394
+ : "telemetry_only",
358
395
  } satisfies EvalSkillReadiness;
359
396
  });
360
397
  }
@@ -392,9 +429,9 @@ export function listSkills(
392
429
  }
393
430
  console.log("");
394
431
  console.log("Legend:");
395
- console.log(" log-ready real triggers exist; run eval generate normally");
432
+ console.log(" log-ready enough clean real triggers exist; run eval generate normally");
396
433
  console.log(
397
- " cold-start installed locally but no trusted triggers yet; use --auto-synthetic",
434
+ " cold-start installed locally but not enough clean trusted triggers yet; use --auto-synthetic",
398
435
  );
399
436
  console.log(" telemetry-only trigger data exists but local SKILL.md was not found");
400
437
  } else {
@@ -566,6 +603,7 @@ export async function cliMain(): Promise<void> {
566
603
  skill: { type: "string" },
567
604
  output: { type: "string" },
568
605
  out: { type: "string" },
606
+ agent: { type: "string" },
569
607
  max: { type: "string", default: "50" },
570
608
  seed: { type: "string", default: "42" },
571
609
  "list-skills": { type: "boolean", default: false },
@@ -607,14 +645,7 @@ export async function cliMain(): Promise<void> {
607
645
  );
608
646
  }
609
647
 
610
- const agent = detectLlmAgent();
611
- if (!agent) {
612
- throw new CLIError(
613
- "No agent CLI found (claude/codex/opencode/pi)",
614
- "AGENT_NOT_FOUND",
615
- "Install one of the supported agent CLIs",
616
- );
617
- }
648
+ const agent = resolveEvalGenerateAgent(values.agent);
618
649
 
619
650
  const maxPerSide = Number.parseInt(values.max ?? "50", 10);
620
651
  const effectiveMax = Number.isNaN(maxPerSide) || maxPerSide <= 0 ? 50 : maxPerSide;
@@ -781,24 +812,17 @@ export async function cliMain(): Promise<void> {
781
812
  });
782
813
 
783
814
  const positiveCount = evalSet.filter((entry) => entry.should_trigger).length;
784
- if (positiveCount === 0 && values["auto-synthetic"]) {
815
+ if (positiveCount < MIN_LOG_READY_POSITIVES && values["auto-synthetic"]) {
785
816
  const skillPath = values["skill-path"] ?? detectedSkillPath;
786
817
  if (!skillPath) {
787
818
  throw new CLIError(
788
- `No trusted triggers found for '${values.skill}', and no SKILL.md path could be resolved for synthetic fallback.`,
819
+ `Not enough clean trusted triggers found for '${values.skill}', and no SKILL.md path could be resolved for synthetic fallback.`,
789
820
  "FILE_NOT_FOUND",
790
821
  `Run 'selftune eval generate --list-skills' or rerun with --skill-path /path/to/SKILL.md`,
791
822
  );
792
823
  }
793
824
 
794
- const agent = detectLlmAgent();
795
- if (!agent) {
796
- throw new CLIError(
797
- "No agent CLI found (claude/codex/opencode/pi)",
798
- "AGENT_NOT_FOUND",
799
- "Install one of the supported agent CLIs",
800
- );
801
- }
825
+ const agent = resolveEvalGenerateAgent(values.agent);
802
826
 
803
827
  emitDashboardStepProgress({
804
828
  current: 1,
@@ -808,7 +832,7 @@ export async function cliMain(): Promise<void> {
808
832
  label: "Load skill content",
809
833
  });
810
834
  console.log(
811
- `No trusted triggers found for '${values.skill}'. Falling back to synthetic cold-start eval generation...`,
835
+ `Only ${positiveCount} clean trusted positive eval candidate(s) found for '${values.skill}'. Falling back to synthetic cold-start eval generation...`,
812
836
  );
813
837
  const effectiveMax = Number.isNaN(maxPerSide) || maxPerSide <= 0 ? 50 : maxPerSide;
814
838
  const syntheticEvalSet = await generateSyntheticEvals(skillPath, values.skill, agent, {
@@ -860,6 +884,12 @@ export async function cliMain(): Promise<void> {
860
884
  return;
861
885
  }
862
886
 
887
+ if (positiveCount > 0 && positiveCount < MIN_LOG_READY_POSITIVES) {
888
+ console.warn(
889
+ `[WARN] Only ${positiveCount} clean positive eval candidate(s) were found for '${values.skill}'. The log-derived eval set may be low-confidence. Consider rerunning with --auto-synthetic or --blend.`,
890
+ );
891
+ }
892
+
863
893
  // --- Blend mode: merge log-based evals with synthetic gap-fillers ---
864
894
  let finalEvalSet = evalSet;
865
895
  if (values.blend) {
@@ -872,14 +902,7 @@ export async function cliMain(): Promise<void> {
872
902
  );
873
903
  }
874
904
 
875
- const agent = detectLlmAgent();
876
- if (!agent) {
877
- throw new CLIError(
878
- "No agent CLI found (claude/codex/opencode/pi)",
879
- "AGENT_NOT_FOUND",
880
- "Install one of the supported agent CLIs",
881
- );
882
- }
905
+ const agent = resolveEvalGenerateAgent(values.agent);
883
906
 
884
907
  // Fail fast before expensive LLM calls — blending with zero logs always produces []
885
908
  if (evalSet.length === 0) {