selftune 0.2.30 → 0.2.32
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +83 -56
- package/apps/local-dashboard/dist/assets/index-B-ut4w0B.js +15 -0
- package/apps/local-dashboard/dist/assets/index-BFGfCVrL.css +1 -0
- package/apps/local-dashboard/dist/assets/vendor-ui-DfowE3Hu.js +1 -0
- package/apps/local-dashboard/dist/index.html +3 -3
- package/cli/selftune/command-surface.ts +613 -2
- package/cli/selftune/create/baseline.ts +429 -0
- package/cli/selftune/create/check.ts +35 -0
- package/cli/selftune/create/init.ts +115 -0
- package/cli/selftune/create/package-candidate-state.ts +771 -0
- package/cli/selftune/create/package-evaluator.ts +710 -0
- package/cli/selftune/create/package-fingerprint.ts +142 -0
- package/cli/selftune/create/package-search.ts +377 -0
- package/cli/selftune/create/publish.ts +431 -0
- package/cli/selftune/create/readiness.ts +495 -0
- package/cli/selftune/create/replay.ts +330 -0
- package/cli/selftune/create/report.ts +74 -0
- package/cli/selftune/create/scaffold.ts +121 -0
- package/cli/selftune/create/skills-ref-adapter.ts +177 -0
- package/cli/selftune/create/status.ts +33 -0
- package/cli/selftune/create/templates.ts +249 -0
- package/cli/selftune/cron/setup.ts +1 -1
- package/cli/selftune/dashboard-action-events.ts +4 -1
- package/cli/selftune/dashboard-action-result.ts +789 -24
- package/cli/selftune/dashboard-action-stream.ts +80 -0
- package/cli/selftune/dashboard-contract.ts +146 -3
- package/cli/selftune/dashboard-server.ts +5 -4
- package/cli/selftune/eval/hooks-to-evals.ts +58 -35
- package/cli/selftune/eval/synthetic-evals.ts +145 -17
- package/cli/selftune/evolution/bounded-mutations.ts +1045 -0
- package/cli/selftune/evolution/evolve-body.ts +9 -36
- package/cli/selftune/evolution/evolve.ts +8 -72
- package/cli/selftune/evolution/stopping-criteria.ts +5 -13
- package/cli/selftune/evolution/unblock-suggestions.ts +0 -16
- package/cli/selftune/evolution/validate-host-replay.ts +115 -15
- package/cli/selftune/improve.ts +206 -0
- package/cli/selftune/index.ts +123 -6
- package/cli/selftune/init.ts +1 -1
- package/cli/selftune/localdb/queries/dashboard.ts +30 -0
- package/cli/selftune/localdb/schema.ts +52 -0
- package/cli/selftune/monitoring/watch.ts +257 -23
- package/cli/selftune/orchestrate/execute.ts +300 -1
- package/cli/selftune/orchestrate/finalize.ts +14 -0
- package/cli/selftune/orchestrate/plan.ts +22 -5
- package/cli/selftune/orchestrate/prepare.ts +59 -4
- package/cli/selftune/orchestrate/report.ts +1 -1
- package/cli/selftune/orchestrate.ts +34 -1
- package/cli/selftune/publish.ts +35 -0
- package/cli/selftune/registry/github-install.ts +256 -0
- package/cli/selftune/registry/index.ts +1 -1
- package/cli/selftune/registry/install.ts +58 -7
- package/cli/selftune/routes/actions.ts +81 -15
- package/cli/selftune/routes/overview.ts +1 -1
- package/cli/selftune/routes/skill-report.ts +147 -2
- package/cli/selftune/run.ts +18 -0
- package/cli/selftune/schedule.ts +3 -3
- package/cli/selftune/search-run.ts +703 -0
- package/cli/selftune/status.ts +35 -11
- package/cli/selftune/testing-readiness.ts +431 -40
- package/cli/selftune/types.ts +316 -0
- package/cli/selftune/utils/eval-readiness.ts +1 -0
- package/cli/selftune/utils/json-output.ts +11 -0
- package/cli/selftune/utils/lifecycle-surface.ts +48 -0
- package/cli/selftune/utils/query-filter.ts +82 -1
- package/cli/selftune/utils/tui.ts +85 -2
- package/cli/selftune/verify.ts +205 -0
- package/cli/selftune/workflows/proposals.ts +1 -1
- package/cli/selftune/workflows/skill-scaffold.ts +141 -63
- package/cli/selftune/workflows/workflows.ts +4 -4
- package/package.json +1 -1
- package/packages/dashboard-core/src/routes/manifest.ts +2 -2
- package/packages/ui/src/components/SkillReportPanels.tsx +7 -7
- package/packages/ui/src/primitives/button.tsx +5 -0
- package/skill/SKILL.md +148 -85
- package/skill/references/cli-quick-reference.md +16 -1
- package/skill/references/creator-playbook.md +31 -10
- package/skill/workflows/Baseline.md +8 -9
- package/skill/workflows/Contributions.md +4 -4
- package/skill/workflows/Create.md +173 -0
- package/skill/workflows/CreateTestDeploy.md +34 -30
- package/skill/workflows/Cron.md +2 -2
- package/skill/workflows/Dashboard.md +3 -3
- package/skill/workflows/Evals.md +13 -7
- package/skill/workflows/Evolve.md +75 -32
- package/skill/workflows/EvolveBody.md +22 -15
- package/skill/workflows/Hook.md +1 -1
- package/skill/workflows/Improve.md +168 -0
- package/skill/workflows/Initialize.md +3 -3
- package/skill/workflows/Orchestrate.md +49 -12
- package/skill/workflows/Publish.md +100 -0
- package/skill/workflows/Registry.md +19 -13
- package/skill/workflows/Run.md +72 -0
- package/skill/workflows/Schedule.md +2 -2
- package/skill/workflows/SearchRun.md +89 -0
- package/skill/workflows/SignalsDashboard.md +2 -2
- package/skill/workflows/UnitTest.md +13 -4
- package/skill/workflows/Verify.md +136 -0
- package/skill/workflows/Watch.md +114 -47
- package/skill/workflows/Workflows.md +13 -8
- package/apps/local-dashboard/dist/assets/index-BcXquWFB.css +0 -1
- package/apps/local-dashboard/dist/assets/index-Coq42hE4.js +0 -15
- package/apps/local-dashboard/dist/assets/vendor-ui-B0H8s1mP.js +0 -1
|
@@ -63,6 +63,70 @@ function detectDashboardAction(argv: string[]): {
|
|
|
63
63
|
};
|
|
64
64
|
}
|
|
65
65
|
|
|
66
|
+
if (command === "create" && subcommand === "replay") {
|
|
67
|
+
return {
|
|
68
|
+
action: "replay-dry-run",
|
|
69
|
+
skillName: null,
|
|
70
|
+
skillPath: readFlagValue(argv, "--skill-path"),
|
|
71
|
+
};
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
if (command === "create" && subcommand === "check") {
|
|
75
|
+
return {
|
|
76
|
+
action: "create-check",
|
|
77
|
+
skillName: null,
|
|
78
|
+
skillPath: readFlagValue(argv, "--skill-path"),
|
|
79
|
+
};
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
if (command === "create" && subcommand === "baseline") {
|
|
83
|
+
return {
|
|
84
|
+
action: "measure-baseline",
|
|
85
|
+
skillName: null,
|
|
86
|
+
skillPath: readFlagValue(argv, "--skill-path"),
|
|
87
|
+
};
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
if (command === "create" && subcommand === "report") {
|
|
91
|
+
return {
|
|
92
|
+
action: "report-package",
|
|
93
|
+
skillName: null,
|
|
94
|
+
skillPath: readFlagValue(argv, "--skill-path"),
|
|
95
|
+
};
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
if (command === "create" && subcommand === "publish") {
|
|
99
|
+
return {
|
|
100
|
+
action: hasFlag(argv, "--watch") ? "watch" : "deploy-candidate",
|
|
101
|
+
skillName: null,
|
|
102
|
+
skillPath: readFlagValue(argv, "--skill-path"),
|
|
103
|
+
};
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
if (command === "verify") {
|
|
107
|
+
return {
|
|
108
|
+
action: "report-package",
|
|
109
|
+
skillName: null,
|
|
110
|
+
skillPath: readFlagValue(argv, "--skill-path"),
|
|
111
|
+
};
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
if (command === "publish") {
|
|
115
|
+
return {
|
|
116
|
+
action: hasFlag(argv, "--no-watch") ? "deploy-candidate" : "watch",
|
|
117
|
+
skillName: null,
|
|
118
|
+
skillPath: readFlagValue(argv, "--skill-path"),
|
|
119
|
+
};
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
if (command === "search-run") {
|
|
123
|
+
return {
|
|
124
|
+
action: "search-run",
|
|
125
|
+
skillName: readFlagValue(argv, "--skill"),
|
|
126
|
+
skillPath: readFlagValue(argv, "--skill-path"),
|
|
127
|
+
};
|
|
128
|
+
}
|
|
129
|
+
|
|
66
130
|
if (command === "orchestrate") {
|
|
67
131
|
return {
|
|
68
132
|
action: "orchestrate",
|
|
@@ -71,6 +135,14 @@ function detectDashboardAction(argv: string[]): {
|
|
|
71
135
|
};
|
|
72
136
|
}
|
|
73
137
|
|
|
138
|
+
if (command === "run") {
|
|
139
|
+
return {
|
|
140
|
+
action: "orchestrate",
|
|
141
|
+
skillName: null,
|
|
142
|
+
skillPath: null,
|
|
143
|
+
};
|
|
144
|
+
}
|
|
145
|
+
|
|
74
146
|
if (command === "evolve" && subcommand === "rollback") {
|
|
75
147
|
return {
|
|
76
148
|
action: "rollback",
|
|
@@ -87,6 +159,14 @@ function detectDashboardAction(argv: string[]): {
|
|
|
87
159
|
};
|
|
88
160
|
}
|
|
89
161
|
|
|
162
|
+
if (command === "improve") {
|
|
163
|
+
return {
|
|
164
|
+
action: hasFlag(argv, "--dry-run") ? "replay-dry-run" : "deploy-candidate",
|
|
165
|
+
skillName: readFlagValue(argv, "--skill"),
|
|
166
|
+
skillPath: readFlagValue(argv, "--skill-path"),
|
|
167
|
+
};
|
|
168
|
+
}
|
|
169
|
+
|
|
90
170
|
return null;
|
|
91
171
|
}
|
|
92
172
|
|
|
@@ -1,3 +1,17 @@
|
|
|
1
|
+
import type {
|
|
2
|
+
CreatePackageBodySummary,
|
|
3
|
+
CreatePackageCandidateAcceptanceDecision,
|
|
4
|
+
CreateCheckReadiness,
|
|
5
|
+
CreatePackageEvaluationEfficiencySummary,
|
|
6
|
+
CreatePackageEvaluationEvidenceSummary,
|
|
7
|
+
CreatePackageEvaluationGradingSummary,
|
|
8
|
+
CreatePackageEvaluationSource,
|
|
9
|
+
CreatePackageReplaySummary,
|
|
10
|
+
CreatePackageEvaluationStatus,
|
|
11
|
+
CreatePackageEvaluationUnitTestSummary,
|
|
12
|
+
CreatePackageEvaluationWatchSummary,
|
|
13
|
+
} from "./types.js";
|
|
14
|
+
|
|
1
15
|
// -- Cursor-based pagination types -------------------------------------------
|
|
2
16
|
|
|
3
17
|
export interface PaginationCursor {
|
|
@@ -151,6 +165,7 @@ export interface SkillSummary {
|
|
|
151
165
|
routing_confidence: number | null;
|
|
152
166
|
confidence_coverage: number;
|
|
153
167
|
testing_readiness?: SkillTestingReadiness;
|
|
168
|
+
create_readiness?: CreateCheckReadiness;
|
|
154
169
|
}
|
|
155
170
|
|
|
156
171
|
// -- Autonomy-first overview types -------------------------------------------
|
|
@@ -346,6 +361,9 @@ export interface SkillTestingReadiness {
|
|
|
346
361
|
baseline_sample_size: number;
|
|
347
362
|
baseline_pass_rate: number | null;
|
|
348
363
|
latest_baseline_at: string | null;
|
|
364
|
+
package_evaluation_status?: CreatePackageEvaluationStatus | null;
|
|
365
|
+
package_evaluation_passed?: boolean | null;
|
|
366
|
+
latest_package_evaluation_at?: string | null;
|
|
349
367
|
deployment_readiness: DeploymentReadiness;
|
|
350
368
|
deployment_summary: string;
|
|
351
369
|
deployment_command: string | null;
|
|
@@ -354,6 +372,8 @@ export interface SkillTestingReadiness {
|
|
|
354
372
|
}
|
|
355
373
|
|
|
356
374
|
export type DashboardActionName =
|
|
375
|
+
| "create-check"
|
|
376
|
+
| "report-package"
|
|
357
377
|
| "generate-evals"
|
|
358
378
|
| "generate-unit-tests"
|
|
359
379
|
| "replay-dry-run"
|
|
@@ -361,7 +381,8 @@ export type DashboardActionName =
|
|
|
361
381
|
| "deploy-candidate"
|
|
362
382
|
| "watch"
|
|
363
383
|
| "orchestrate"
|
|
364
|
-
| "rollback"
|
|
384
|
+
| "rollback"
|
|
385
|
+
| "search-run";
|
|
365
386
|
|
|
366
387
|
export type DashboardActionEventStage =
|
|
367
388
|
| "started"
|
|
@@ -376,9 +397,49 @@ export interface DashboardActionResultSummary {
|
|
|
376
397
|
improved: boolean | null;
|
|
377
398
|
deployed: boolean | null;
|
|
378
399
|
before_pass_rate: number | null;
|
|
400
|
+
before_label?: string | null;
|
|
379
401
|
after_pass_rate: number | null;
|
|
402
|
+
after_label?: string | null;
|
|
380
403
|
net_change: number | null;
|
|
404
|
+
net_change_label?: string | null;
|
|
381
405
|
validation_mode: string | null;
|
|
406
|
+
validation_label?: string | null;
|
|
407
|
+
recommended_command?: string | null;
|
|
408
|
+
package_evaluation_source?: CreatePackageEvaluationSource | null;
|
|
409
|
+
package_candidate_id?: string | null;
|
|
410
|
+
package_parent_candidate_id?: string | null;
|
|
411
|
+
package_candidate_generation?: number | null;
|
|
412
|
+
package_candidate_acceptance_decision?: CreatePackageCandidateAcceptanceDecision | null;
|
|
413
|
+
package_candidate_acceptance_rationale?: string | null;
|
|
414
|
+
package_evidence?: CreatePackageEvaluationEvidenceSummary | null;
|
|
415
|
+
package_efficiency?: CreatePackageEvaluationEfficiencySummary | null;
|
|
416
|
+
package_routing?: CreatePackageReplaySummary | null;
|
|
417
|
+
package_body?: CreatePackageBodySummary | null;
|
|
418
|
+
package_grading?: CreatePackageEvaluationGradingSummary | null;
|
|
419
|
+
package_unit_tests?: CreatePackageEvaluationUnitTestSummary | null;
|
|
420
|
+
package_watch?: CreatePackageEvaluationWatchSummary | null;
|
|
421
|
+
/** Search run provenance — populated only for search-run actions. */
|
|
422
|
+
search_run?: DashboardSearchRunSummary | null;
|
|
423
|
+
/** Whether the watch gate passed for publish actions (null for non-publish actions). */
|
|
424
|
+
watch_gate_passed?: boolean | null;
|
|
425
|
+
}
|
|
426
|
+
|
|
427
|
+
/** Compact search run result surfaced in the action result summary. */
|
|
428
|
+
export interface DashboardSearchRunSummary {
|
|
429
|
+
search_id: string;
|
|
430
|
+
parent_candidate_id: string | null;
|
|
431
|
+
winner_candidate_id: string | null;
|
|
432
|
+
winner_rationale: string | null;
|
|
433
|
+
candidates_evaluated: number;
|
|
434
|
+
frontier_size: number;
|
|
435
|
+
parent_selection_method: string;
|
|
436
|
+
surface_plan?: {
|
|
437
|
+
routing_count: number;
|
|
438
|
+
body_count: number;
|
|
439
|
+
weakness_source: string;
|
|
440
|
+
routing_weakness: number | null;
|
|
441
|
+
body_weakness: number | null;
|
|
442
|
+
} | null;
|
|
382
443
|
}
|
|
383
444
|
|
|
384
445
|
export interface DashboardActionMetrics {
|
|
@@ -424,9 +485,21 @@ export interface DashboardActionEvent {
|
|
|
424
485
|
progress?: DashboardActionProgress | null;
|
|
425
486
|
}
|
|
426
487
|
|
|
488
|
+
export type CreatorOverviewStep =
|
|
489
|
+
| "run_create_check"
|
|
490
|
+
| "finish_package"
|
|
491
|
+
| "generate_evals"
|
|
492
|
+
| "run_unit_tests"
|
|
493
|
+
| "run_replay_dry_run"
|
|
494
|
+
| "measure_baseline"
|
|
495
|
+
| "deploy_candidate"
|
|
496
|
+
| "watch_deployment";
|
|
497
|
+
|
|
427
498
|
export interface CreatorTestingOverview {
|
|
428
499
|
summary: string;
|
|
429
500
|
counts: {
|
|
501
|
+
run_create_check: number;
|
|
502
|
+
finish_package: number;
|
|
430
503
|
generate_evals: number;
|
|
431
504
|
run_unit_tests: number;
|
|
432
505
|
run_replay_dry_run: number;
|
|
@@ -436,7 +509,7 @@ export interface CreatorTestingOverview {
|
|
|
436
509
|
};
|
|
437
510
|
priorities: Array<{
|
|
438
511
|
skill_name: string;
|
|
439
|
-
|
|
512
|
+
step: CreatorOverviewStep;
|
|
440
513
|
summary: string;
|
|
441
514
|
recommended_command: string;
|
|
442
515
|
}>;
|
|
@@ -446,7 +519,7 @@ export interface CreatorTestingOverview {
|
|
|
446
519
|
|
|
447
520
|
export interface OrchestrateRunSkillAction {
|
|
448
521
|
skill: string;
|
|
449
|
-
action: "evolve" | "watch" | "skip";
|
|
522
|
+
action: "evolve" | "package-search" | "watch" | "skip";
|
|
450
523
|
reason: string;
|
|
451
524
|
deployed?: boolean;
|
|
452
525
|
rolledBack?: boolean;
|
|
@@ -468,6 +541,8 @@ export interface OrchestrateRunReport {
|
|
|
468
541
|
watched: number;
|
|
469
542
|
skipped: number;
|
|
470
543
|
auto_graded?: number;
|
|
544
|
+
package_searched?: number;
|
|
545
|
+
package_improved?: number;
|
|
471
546
|
skill_actions: OrchestrateRunSkillAction[];
|
|
472
547
|
}
|
|
473
548
|
|
|
@@ -558,6 +633,69 @@ export interface ReplayEntryResult {
|
|
|
558
633
|
evidence: string | null;
|
|
559
634
|
}
|
|
560
635
|
|
|
636
|
+
// -- Package search / frontier types (bounded package evolution) ---------------
|
|
637
|
+
|
|
638
|
+
/**
|
|
639
|
+
* Dashboard-facing view of a package search run result.
|
|
640
|
+
* References `PackageSearchRunResult` from types.ts — does not redefine search
|
|
641
|
+
* semantics, only surfaces what the search runner provides.
|
|
642
|
+
*/
|
|
643
|
+
export interface DashboardSearchRunView {
|
|
644
|
+
search_id: string;
|
|
645
|
+
skill_name: string;
|
|
646
|
+
parent_candidate_id: string | null;
|
|
647
|
+
candidates_evaluated: number;
|
|
648
|
+
winner_candidate_id: string | null;
|
|
649
|
+
winner_rationale: string | null;
|
|
650
|
+
started_at: string;
|
|
651
|
+
completed_at: string;
|
|
652
|
+
provenance: DashboardSearchProvenance;
|
|
653
|
+
}
|
|
654
|
+
|
|
655
|
+
/** Provenance detail surfaced in the dashboard for a search run. */
|
|
656
|
+
export interface DashboardSearchProvenance {
|
|
657
|
+
frontier_size: number;
|
|
658
|
+
parent_selection_method: string;
|
|
659
|
+
candidate_fingerprints: string[];
|
|
660
|
+
surface_plan?: {
|
|
661
|
+
routing_count: number;
|
|
662
|
+
body_count: number;
|
|
663
|
+
weakness_source: string;
|
|
664
|
+
routing_weakness: number | null;
|
|
665
|
+
body_weakness: number | null;
|
|
666
|
+
} | null;
|
|
667
|
+
evaluation_summaries: Array<{
|
|
668
|
+
candidate_id: string;
|
|
669
|
+
decision: string;
|
|
670
|
+
rationale: string;
|
|
671
|
+
}>;
|
|
672
|
+
}
|
|
673
|
+
|
|
674
|
+
/** A frontier member shown in the skill report's frontier state panel. */
|
|
675
|
+
export interface DashboardFrontierMember {
|
|
676
|
+
candidate_id: string;
|
|
677
|
+
skill_name: string;
|
|
678
|
+
fingerprint: string;
|
|
679
|
+
decision: "accepted" | "rejected" | "pending";
|
|
680
|
+
measured_delta: number | null;
|
|
681
|
+
created_at: string;
|
|
682
|
+
parent_candidate_id: string | null;
|
|
683
|
+
/** True when this candidate was demoted by watch-fed evidence. */
|
|
684
|
+
watch_demoted?: boolean;
|
|
685
|
+
/** Evidence rank within the accepted frontier (1 = best). */
|
|
686
|
+
evidence_rank?: number | null;
|
|
687
|
+
}
|
|
688
|
+
|
|
689
|
+
/** Frontier state summary surfaced in the skill report. */
|
|
690
|
+
export interface DashboardFrontierState {
|
|
691
|
+
skill_name: string;
|
|
692
|
+
accepted_count: number;
|
|
693
|
+
rejected_count: number;
|
|
694
|
+
pending_count: number;
|
|
695
|
+
members: DashboardFrontierMember[];
|
|
696
|
+
latest_search_run: DashboardSearchRunView | null;
|
|
697
|
+
}
|
|
698
|
+
|
|
561
699
|
// -- Doctor / health check types ----------------------------------------------
|
|
562
700
|
export type { DoctorResult, HealthCheck, HealthStatus } from "./types.js";
|
|
563
701
|
|
|
@@ -693,6 +831,8 @@ export interface TrustFields {
|
|
|
693
831
|
}
|
|
694
832
|
|
|
695
833
|
export interface SkillReportResponse extends SkillReportPayload, TrustFields {
|
|
834
|
+
/** Watch trust score (0-1) from the most recent watch cycle, null if never watched. */
|
|
835
|
+
watch_trust_score: number | null;
|
|
696
836
|
evolution: EvolutionEntry[];
|
|
697
837
|
pending_proposals: PendingProposal[];
|
|
698
838
|
token_usage: {
|
|
@@ -727,4 +867,7 @@ export interface SkillReportResponse extends SkillReportPayload, TrustFields {
|
|
|
727
867
|
};
|
|
728
868
|
} | null;
|
|
729
869
|
testing_readiness?: SkillTestingReadiness;
|
|
870
|
+
create_readiness?: CreateCheckReadiness;
|
|
871
|
+
/** Package frontier state — populated when bounded package evolution data exists. */
|
|
872
|
+
frontier_state?: DashboardFrontierState | null;
|
|
730
873
|
}
|
|
@@ -10,9 +10,10 @@
|
|
|
10
10
|
* GET /api/v2/overview — SQLite-backed overview payload
|
|
11
11
|
* GET /api/v2/analytics — Performance analytics (trends, rankings, heatmap)
|
|
12
12
|
* GET /api/v2/skills/:name — SQLite-backed per-skill report
|
|
13
|
-
* POST /api/actions/
|
|
14
|
-
* POST /api/actions/
|
|
15
|
-
* POST /api/actions/
|
|
13
|
+
* POST /api/actions/create-check — Trigger `selftune create check` for a draft package
|
|
14
|
+
* POST /api/actions/watch — Trigger `selftune watch` for a skill
|
|
15
|
+
* POST /api/actions/evolve — Trigger `selftune evolve` for a skill
|
|
16
|
+
* POST /api/actions/rollback — Trigger `selftune rollback` for a skill
|
|
16
17
|
* POST /api/actions/watchlist — Persist creator watchlist preferences
|
|
17
18
|
* GET /badge/:name — Skill health badge
|
|
18
19
|
* GET /report/:name — Skill health report HTML
|
|
@@ -676,7 +677,7 @@ export async function startDashboardServer(options?: DashboardServerOptions): Pr
|
|
|
676
677
|
return serveSpaShell(spaDir);
|
|
677
678
|
}
|
|
678
679
|
|
|
679
|
-
// ---- POST /api/actions/{watch,evolve,rollback,watchlist} ----
|
|
680
|
+
// ---- POST /api/actions/{create-check,watch,evolve,rollback,watchlist} ----
|
|
680
681
|
if (url.pathname.startsWith("/api/actions/") && req.method === "POST") {
|
|
681
682
|
const trustedActionOrigins = allowedDashboardOrigins(hostname, boundPort);
|
|
682
683
|
const origin = req.headers.get("origin");
|
|
@@ -43,8 +43,10 @@ import type {
|
|
|
43
43
|
SkillUsageRecord,
|
|
44
44
|
} from "../types.js";
|
|
45
45
|
import { CLIError, handleCLIError } from "../utils/cli-error.js";
|
|
46
|
-
import {
|
|
46
|
+
import { MIN_LOG_READY_POSITIVES } from "../utils/eval-readiness.js";
|
|
47
|
+
import { detectLlmAgent, isLlmBackedAgent } from "../utils/llm-call.js";
|
|
47
48
|
import {
|
|
49
|
+
extractPositiveEvalQueryText,
|
|
48
50
|
filterActionableQueryRecords,
|
|
49
51
|
filterActionableSkillUsageRecords,
|
|
50
52
|
} from "../utils/query-filter.js";
|
|
@@ -63,6 +65,36 @@ import { writeCanonicalEvalSet } from "../testing-readiness.js";
|
|
|
63
65
|
|
|
64
66
|
export { classifyInvocation } from "./invocation-classifier.js";
|
|
65
67
|
|
|
68
|
+
function resolveEvalGenerateAgent(requestedAgent?: string | null): string {
|
|
69
|
+
if (requestedAgent) {
|
|
70
|
+
if (!isLlmBackedAgent(requestedAgent)) {
|
|
71
|
+
throw new CLIError(
|
|
72
|
+
`Unsupported --agent value "${requestedAgent}".`,
|
|
73
|
+
"INVALID_FLAG",
|
|
74
|
+
"Use claude, codex, opencode, or pi.",
|
|
75
|
+
);
|
|
76
|
+
}
|
|
77
|
+
if (!Bun.which(requestedAgent)) {
|
|
78
|
+
throw new CLIError(
|
|
79
|
+
`Agent CLI '${requestedAgent}' not found in PATH`,
|
|
80
|
+
"AGENT_NOT_FOUND",
|
|
81
|
+
"Install it or omit --agent to use auto-detection",
|
|
82
|
+
);
|
|
83
|
+
}
|
|
84
|
+
return requestedAgent;
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
const detected = detectLlmAgent();
|
|
88
|
+
if (!detected) {
|
|
89
|
+
throw new CLIError(
|
|
90
|
+
"No agent CLI found (claude/codex/opencode/pi)",
|
|
91
|
+
"AGENT_NOT_FOUND",
|
|
92
|
+
"Install one of the supported agent CLIs",
|
|
93
|
+
);
|
|
94
|
+
}
|
|
95
|
+
return detected;
|
|
96
|
+
}
|
|
97
|
+
|
|
66
98
|
// ---------------------------------------------------------------------------
|
|
67
99
|
// Query truncation
|
|
68
100
|
// ---------------------------------------------------------------------------
|
|
@@ -97,8 +129,8 @@ export function buildEvalSet(
|
|
|
97
129
|
for (const r of actionableSkillRecords) {
|
|
98
130
|
if (!r || typeof r.skill_name !== "string" || typeof r.query !== "string") continue;
|
|
99
131
|
if (isHighConfidencePositiveSkillRecord(r, skillName)) {
|
|
100
|
-
const q = (r.query
|
|
101
|
-
if (q
|
|
132
|
+
const q = extractPositiveEvalQueryText(r.query, skillName);
|
|
133
|
+
if (q) {
|
|
102
134
|
positiveQueries.add(q);
|
|
103
135
|
}
|
|
104
136
|
}
|
|
@@ -110,8 +142,8 @@ export function buildEvalSet(
|
|
|
110
142
|
for (const r of actionableSkillRecords) {
|
|
111
143
|
if (!r || typeof r.skill_name !== "string" || typeof r.query !== "string") continue;
|
|
112
144
|
if (!isHighConfidencePositiveSkillRecord(r, skillName)) continue;
|
|
113
|
-
const q = (r.query
|
|
114
|
-
if (!q ||
|
|
145
|
+
const q = extractPositiveEvalQueryText(r.query, skillName);
|
|
146
|
+
if (!q || seen.has(q)) continue;
|
|
115
147
|
seen.add(q);
|
|
116
148
|
const entry: EvalEntry = {
|
|
117
149
|
query: truncateQuery(q),
|
|
@@ -331,6 +363,7 @@ export function listEvalSkillReadiness(
|
|
|
331
363
|
if (r.session_id) rawSessionCounts.get(name)?.add(r.session_id);
|
|
332
364
|
|
|
333
365
|
if (!isHighConfidencePositiveSkillRecord(r, name)) continue;
|
|
366
|
+
if (!extractPositiveEvalQueryText(r.query ?? "", name)) continue;
|
|
334
367
|
trustedTriggerCounts.set(name, (trustedTriggerCounts.get(name) ?? 0) + 1);
|
|
335
368
|
if (!trustedSessionCounts.has(name)) trustedSessionCounts.set(name, new Set<string>());
|
|
336
369
|
if (r.session_id) trustedSessionCounts.get(name)?.add(r.session_id);
|
|
@@ -354,7 +387,11 @@ export function listEvalSkillReadiness(
|
|
|
354
387
|
installed,
|
|
355
388
|
skill_path: installed ? findInstalledSkillPath(name, searchDirs) : undefined,
|
|
356
389
|
readiness:
|
|
357
|
-
trustedTriggerCount
|
|
390
|
+
trustedTriggerCount >= MIN_LOG_READY_POSITIVES
|
|
391
|
+
? "log_ready"
|
|
392
|
+
: installed
|
|
393
|
+
? "cold_start_ready"
|
|
394
|
+
: "telemetry_only",
|
|
358
395
|
} satisfies EvalSkillReadiness;
|
|
359
396
|
});
|
|
360
397
|
}
|
|
@@ -392,9 +429,9 @@ export function listSkills(
|
|
|
392
429
|
}
|
|
393
430
|
console.log("");
|
|
394
431
|
console.log("Legend:");
|
|
395
|
-
console.log(" log-ready real triggers exist; run eval generate normally");
|
|
432
|
+
console.log(" log-ready enough clean real triggers exist; run eval generate normally");
|
|
396
433
|
console.log(
|
|
397
|
-
" cold-start installed locally but
|
|
434
|
+
" cold-start installed locally but not enough clean trusted triggers yet; use --auto-synthetic",
|
|
398
435
|
);
|
|
399
436
|
console.log(" telemetry-only trigger data exists but local SKILL.md was not found");
|
|
400
437
|
} else {
|
|
@@ -566,6 +603,7 @@ export async function cliMain(): Promise<void> {
|
|
|
566
603
|
skill: { type: "string" },
|
|
567
604
|
output: { type: "string" },
|
|
568
605
|
out: { type: "string" },
|
|
606
|
+
agent: { type: "string" },
|
|
569
607
|
max: { type: "string", default: "50" },
|
|
570
608
|
seed: { type: "string", default: "42" },
|
|
571
609
|
"list-skills": { type: "boolean", default: false },
|
|
@@ -607,14 +645,7 @@ export async function cliMain(): Promise<void> {
|
|
|
607
645
|
);
|
|
608
646
|
}
|
|
609
647
|
|
|
610
|
-
const agent =
|
|
611
|
-
if (!agent) {
|
|
612
|
-
throw new CLIError(
|
|
613
|
-
"No agent CLI found (claude/codex/opencode/pi)",
|
|
614
|
-
"AGENT_NOT_FOUND",
|
|
615
|
-
"Install one of the supported agent CLIs",
|
|
616
|
-
);
|
|
617
|
-
}
|
|
648
|
+
const agent = resolveEvalGenerateAgent(values.agent);
|
|
618
649
|
|
|
619
650
|
const maxPerSide = Number.parseInt(values.max ?? "50", 10);
|
|
620
651
|
const effectiveMax = Number.isNaN(maxPerSide) || maxPerSide <= 0 ? 50 : maxPerSide;
|
|
@@ -781,24 +812,17 @@ export async function cliMain(): Promise<void> {
|
|
|
781
812
|
});
|
|
782
813
|
|
|
783
814
|
const positiveCount = evalSet.filter((entry) => entry.should_trigger).length;
|
|
784
|
-
if (positiveCount
|
|
815
|
+
if (positiveCount < MIN_LOG_READY_POSITIVES && values["auto-synthetic"]) {
|
|
785
816
|
const skillPath = values["skill-path"] ?? detectedSkillPath;
|
|
786
817
|
if (!skillPath) {
|
|
787
818
|
throw new CLIError(
|
|
788
|
-
`
|
|
819
|
+
`Not enough clean trusted triggers found for '${values.skill}', and no SKILL.md path could be resolved for synthetic fallback.`,
|
|
789
820
|
"FILE_NOT_FOUND",
|
|
790
821
|
`Run 'selftune eval generate --list-skills' or rerun with --skill-path /path/to/SKILL.md`,
|
|
791
822
|
);
|
|
792
823
|
}
|
|
793
824
|
|
|
794
|
-
const agent =
|
|
795
|
-
if (!agent) {
|
|
796
|
-
throw new CLIError(
|
|
797
|
-
"No agent CLI found (claude/codex/opencode/pi)",
|
|
798
|
-
"AGENT_NOT_FOUND",
|
|
799
|
-
"Install one of the supported agent CLIs",
|
|
800
|
-
);
|
|
801
|
-
}
|
|
825
|
+
const agent = resolveEvalGenerateAgent(values.agent);
|
|
802
826
|
|
|
803
827
|
emitDashboardStepProgress({
|
|
804
828
|
current: 1,
|
|
@@ -808,7 +832,7 @@ export async function cliMain(): Promise<void> {
|
|
|
808
832
|
label: "Load skill content",
|
|
809
833
|
});
|
|
810
834
|
console.log(
|
|
811
|
-
`
|
|
835
|
+
`Only ${positiveCount} clean trusted positive eval candidate(s) found for '${values.skill}'. Falling back to synthetic cold-start eval generation...`,
|
|
812
836
|
);
|
|
813
837
|
const effectiveMax = Number.isNaN(maxPerSide) || maxPerSide <= 0 ? 50 : maxPerSide;
|
|
814
838
|
const syntheticEvalSet = await generateSyntheticEvals(skillPath, values.skill, agent, {
|
|
@@ -860,6 +884,12 @@ export async function cliMain(): Promise<void> {
|
|
|
860
884
|
return;
|
|
861
885
|
}
|
|
862
886
|
|
|
887
|
+
if (positiveCount > 0 && positiveCount < MIN_LOG_READY_POSITIVES) {
|
|
888
|
+
console.warn(
|
|
889
|
+
`[WARN] Only ${positiveCount} clean positive eval candidate(s) were found for '${values.skill}'. The log-derived eval set may be low-confidence. Consider rerunning with --auto-synthetic or --blend.`,
|
|
890
|
+
);
|
|
891
|
+
}
|
|
892
|
+
|
|
863
893
|
// --- Blend mode: merge log-based evals with synthetic gap-fillers ---
|
|
864
894
|
let finalEvalSet = evalSet;
|
|
865
895
|
if (values.blend) {
|
|
@@ -872,14 +902,7 @@ export async function cliMain(): Promise<void> {
|
|
|
872
902
|
);
|
|
873
903
|
}
|
|
874
904
|
|
|
875
|
-
const agent =
|
|
876
|
-
if (!agent) {
|
|
877
|
-
throw new CLIError(
|
|
878
|
-
"No agent CLI found (claude/codex/opencode/pi)",
|
|
879
|
-
"AGENT_NOT_FOUND",
|
|
880
|
-
"Install one of the supported agent CLIs",
|
|
881
|
-
);
|
|
882
|
-
}
|
|
905
|
+
const agent = resolveEvalGenerateAgent(values.agent);
|
|
883
906
|
|
|
884
907
|
// Fail fast before expensive LLM calls — blending with zero logs always produces []
|
|
885
908
|
if (evalSet.length === 0) {
|