selftune 0.2.23 → 0.2.25
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +6 -0
- package/README.md +93 -15
- package/apps/local-dashboard/dist/assets/index-DgY2KGP-.css +1 -0
- package/apps/local-dashboard/dist/assets/index-Dhgv5BQO.js +15 -0
- package/apps/local-dashboard/dist/assets/vendor-react-C5oyHiV1.js +11 -0
- package/apps/local-dashboard/dist/assets/{vendor-table-BIiI3YhS.js → vendor-table-Bc_bbKd8.js} +1 -1
- package/apps/local-dashboard/dist/assets/vendor-ui-B3BPIYy7.js +1 -0
- package/apps/local-dashboard/dist/index.html +5 -5
- package/cli/selftune/adapters/codex/install.ts +310 -78
- package/cli/selftune/adapters/opencode/install.ts +3 -4
- package/cli/selftune/alpha-upload/build-payloads.ts +3 -3
- package/cli/selftune/alpha-upload/stage-canonical.ts +17 -11
- package/cli/selftune/auto-update.ts +200 -8
- package/cli/selftune/canonical-export.ts +55 -25
- package/cli/selftune/command-surface.ts +397 -0
- package/cli/selftune/contribute/contribute.ts +64 -13
- package/cli/selftune/contribution-config.ts +57 -3
- package/cli/selftune/contribution-preferences.ts +117 -0
- package/cli/selftune/contribution-signals.ts +8 -4
- package/cli/selftune/contribution-staging.ts +13 -2
- package/cli/selftune/contributions.ts +55 -121
- package/cli/selftune/creator-contributions.ts +29 -10
- package/cli/selftune/cron/setup.ts +7 -3
- package/cli/selftune/dashboard-contract.ts +73 -0
- package/cli/selftune/dashboard-server.ts +168 -17
- package/cli/selftune/dashboard.ts +350 -17
- package/cli/selftune/eval/baseline.ts +21 -5
- package/cli/selftune/eval/execution-eval.ts +170 -0
- package/cli/selftune/eval/family-overlap.ts +2 -2
- package/cli/selftune/eval/hooks-to-evals.ts +228 -82
- package/cli/selftune/eval/import-skillsbench.ts +2 -2
- package/cli/selftune/eval/invocation-classifier.ts +56 -0
- package/cli/selftune/eval/synthetic-evals.ts +5 -3
- package/cli/selftune/eval/unit-test-cli.ts +7 -4
- package/cli/selftune/evolution/apply-proposal.ts +295 -0
- package/cli/selftune/evolution/engines/replay-engine.ts +79 -57
- package/cli/selftune/evolution/evolve-body.ts +100 -39
- package/cli/selftune/evolution/evolve.ts +244 -52
- package/cli/selftune/evolution/rollback.ts +0 -1
- package/cli/selftune/evolution/validate-body.ts +68 -42
- package/cli/selftune/evolution/validate-host-replay.ts +510 -60
- package/cli/selftune/evolution/validate-proposal.ts +11 -150
- package/cli/selftune/evolution/validate-routing.ts +43 -41
- package/cli/selftune/evolution/validation-contract.ts +91 -0
- package/cli/selftune/grading/auto-grade.ts +11 -7
- package/cli/selftune/grading/grade-session.ts +10 -16
- package/cli/selftune/index.ts +35 -10
- package/cli/selftune/ingestors/claude-replay.ts +15 -10
- package/cli/selftune/ingestors/codex-wrapper.ts +3 -3
- package/cli/selftune/ingestors/opencode-ingest.ts +2 -2
- package/cli/selftune/ingestors/pi-ingest.ts +3 -2
- package/cli/selftune/init.ts +27 -3
- package/cli/selftune/localdb/direct-write.ts +35 -1
- package/cli/selftune/localdb/queries/cron.ts +34 -0
- package/cli/selftune/localdb/queries/dashboard.ts +834 -0
- package/cli/selftune/localdb/queries/evolution.ts +158 -0
- package/cli/selftune/localdb/queries/execution.ts +133 -0
- package/cli/selftune/localdb/queries/json.ts +18 -0
- package/cli/selftune/localdb/queries/monitoring.ts +263 -0
- package/cli/selftune/localdb/queries/raw.ts +95 -0
- package/cli/selftune/localdb/queries/staging.ts +270 -0
- package/cli/selftune/localdb/queries/trust.ts +392 -0
- package/cli/selftune/localdb/queries.ts +60 -2288
- package/cli/selftune/localdb/schema.ts +21 -0
- package/cli/selftune/monitoring/watch.ts +96 -29
- package/cli/selftune/normalization.ts +3 -0
- package/cli/selftune/observability.ts +4 -2
- package/cli/selftune/orchestrate/cli.ts +161 -0
- package/cli/selftune/orchestrate/execute.ts +295 -0
- package/cli/selftune/orchestrate/finalize.ts +157 -0
- package/cli/selftune/orchestrate/locks.ts +40 -0
- package/cli/selftune/orchestrate/plan.ts +131 -0
- package/cli/selftune/orchestrate/post-run.ts +59 -0
- package/cli/selftune/orchestrate/prepare.ts +334 -0
- package/cli/selftune/orchestrate/report.ts +182 -0
- package/cli/selftune/orchestrate/runtime.ts +120 -0
- package/cli/selftune/orchestrate/signals.ts +48 -0
- package/cli/selftune/orchestrate.ts +150 -1173
- package/cli/selftune/repair/skill-usage.ts +5 -2
- package/cli/selftune/routes/overview.ts +5 -2
- package/cli/selftune/routes/skill-report.ts +15 -2
- package/cli/selftune/schedule.ts +5 -5
- package/cli/selftune/status.ts +39 -2
- package/cli/selftune/testing-readiness.ts +597 -0
- package/cli/selftune/types.ts +44 -4
- package/cli/selftune/uninstall.ts +2 -1
- package/cli/selftune/utils/canonical-log.ts +1 -9
- package/cli/selftune/utils/cli-error.ts +9 -0
- package/cli/selftune/utils/llm-call.ts +126 -6
- package/cli/selftune/utils/skill-discovery.ts +2 -0
- package/cli/selftune/workflows/proposals.ts +184 -0
- package/cli/selftune/workflows/skill-scaffold.ts +241 -0
- package/cli/selftune/workflows/workflows.ts +100 -26
- package/node_modules/@selftune/telemetry-contract/fixtures/complete-push.ts +1 -1
- package/node_modules/@selftune/telemetry-contract/fixtures/evidence-only-push.ts +1 -1
- package/node_modules/@selftune/telemetry-contract/fixtures/partial-push-no-sessions.ts +1 -1
- package/node_modules/@selftune/telemetry-contract/fixtures/partial-push-unresolved-parents.ts +1 -1
- package/node_modules/@selftune/telemetry-contract/src/schemas.ts +41 -1
- package/node_modules/@selftune/telemetry-contract/src/types.ts +103 -2
- package/package.json +25 -9
- package/packages/dashboard-core/AGENTS.md +18 -0
- package/packages/dashboard-core/README.md +30 -0
- package/packages/dashboard-core/index.ts +3 -0
- package/packages/dashboard-core/package.json +39 -0
- package/packages/dashboard-core/src/chrome/DashboardChrome.tsx +74 -0
- package/packages/dashboard-core/src/chrome/DashboardHeader.tsx +200 -0
- package/packages/dashboard-core/src/chrome/DashboardSidebar.tsx +219 -0
- package/packages/dashboard-core/src/chrome/RuntimeBadge.tsx +46 -0
- package/packages/dashboard-core/src/chrome/index.ts +14 -0
- package/packages/dashboard-core/src/chrome/types.ts +81 -0
- package/packages/dashboard-core/src/chrome/utils.ts +23 -0
- package/packages/dashboard-core/src/gates/FeatureGate.tsx +11 -0
- package/packages/dashboard-core/src/gates/LockedRoute.tsx +29 -0
- package/packages/dashboard-core/src/gates/UpgradeCard.tsx +89 -0
- package/packages/dashboard-core/src/gates/index.ts +3 -0
- package/packages/dashboard-core/src/host/DashboardHostProvider.tsx +62 -0
- package/packages/dashboard-core/src/host/adapter.ts +47 -0
- package/packages/dashboard-core/src/host/capabilities.ts +55 -0
- package/packages/dashboard-core/src/host/index.ts +3 -0
- package/packages/dashboard-core/src/models/analytics.ts +39 -0
- package/packages/dashboard-core/src/models/index.ts +4 -0
- package/packages/dashboard-core/src/models/overview.ts +98 -0
- package/packages/dashboard-core/src/models/runtime.ts +7 -0
- package/packages/dashboard-core/src/models/skills.ts +34 -0
- package/packages/dashboard-core/src/routes/index.ts +2 -0
- package/packages/dashboard-core/src/routes/manifest.test.ts +70 -0
- package/packages/dashboard-core/src/routes/manifest.ts +451 -0
- package/packages/dashboard-core/src/routes/types.ts +39 -0
- package/packages/dashboard-core/src/screens/analytics/AnalyticsScreen.tsx +278 -0
- package/packages/dashboard-core/src/screens/analytics/index.ts +1 -0
- package/packages/dashboard-core/src/screens/index.ts +37 -0
- package/packages/dashboard-core/src/screens/overview/OverviewComparisonSurface.test.ts +101 -0
- package/packages/dashboard-core/src/screens/overview/OverviewComparisonSurface.tsx +393 -0
- package/packages/dashboard-core/src/screens/overview/OverviewCompositionSurface.test.tsx +113 -0
- package/packages/dashboard-core/src/screens/overview/OverviewCompositionSurface.tsx +72 -0
- package/packages/dashboard-core/src/screens/overview/OverviewCoreSurface.tsx +71 -0
- package/packages/dashboard-core/src/screens/overview/OverviewOnboardingBanner.tsx +90 -0
- package/packages/dashboard-core/src/screens/overview/OverviewRunSummary.tsx +40 -0
- package/packages/dashboard-core/src/screens/overview/index.ts +16 -0
- package/packages/dashboard-core/src/screens/overview/types.ts +13 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportDailyBreakdownSection.tsx +99 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportDataQualityTabContent.tsx +35 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceRail.tsx +71 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceSection.tsx +63 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceTabContent.tsx +25 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportInvocationsSection.tsx +24 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportMissedQueriesSection.tsx +79 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportScaffold.tsx +150 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportSections.test.tsx +224 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportTabs.test.tsx +76 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportTabs.tsx +88 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportTrendSection.tsx +33 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportTrustBadge.tsx +67 -0
- package/packages/dashboard-core/src/screens/skill-report/index.ts +45 -0
- package/packages/dashboard-core/src/screens/skills/SkillsLibraryScreen.tsx +162 -0
- package/packages/dashboard-core/src/screens/skills/index.ts +6 -0
- package/packages/telemetry-contract/fixtures/complete-push.ts +1 -1
- package/packages/telemetry-contract/fixtures/evidence-only-push.ts +1 -1
- package/packages/telemetry-contract/fixtures/partial-push-no-sessions.ts +1 -1
- package/packages/telemetry-contract/fixtures/partial-push-unresolved-parents.ts +1 -1
- package/packages/telemetry-contract/src/schemas.ts +41 -1
- package/packages/telemetry-contract/src/types.ts +103 -2
- package/packages/ui/src/components/EvidenceViewer.tsx +80 -25
- package/packages/ui/src/components/OverviewPanels.tsx +67 -26
- package/packages/ui/src/primitives/tabs.tsx +7 -6
- package/packages/ui/src/types.ts +10 -0
- package/skill/SKILL.md +130 -332
- package/skill/agents/diagnosis-analyst.md +3 -3
- package/skill/agents/evolution-reviewer.md +3 -3
- package/skill/agents/integration-guide.md +3 -3
- package/skill/agents/pattern-analyst.md +2 -2
- package/skill/references/cli-quick-reference.md +89 -0
- package/skill/references/creator-playbook.md +131 -0
- package/skill/references/examples.md +48 -0
- package/skill/references/troubleshooting.md +47 -0
- package/skill/references/version-history.md +1 -1
- package/skill/selftune.contribute.json +11 -0
- package/skill/{Workflows → workflows}/Baseline.md +20 -1
- package/skill/{Workflows → workflows}/Contribute.md +23 -10
- package/skill/{Workflows → workflows}/Contributions.md +13 -5
- package/skill/workflows/CreateTestDeploy.md +170 -0
- package/skill/{Workflows → workflows}/CreatorContributions.md +18 -6
- package/skill/{Workflows → workflows}/Cron.md +1 -1
- package/skill/{Workflows → workflows}/Dashboard.md +20 -0
- package/skill/{Workflows → workflows}/Doctor.md +1 -1
- package/skill/{Workflows → workflows}/Evals.md +67 -2
- package/skill/{Workflows → workflows}/Evolve.md +119 -30
- package/skill/{Workflows → workflows}/EvolveBody.md +41 -1
- package/skill/{Workflows → workflows}/Grade.md +1 -1
- package/skill/{Workflows → workflows}/Initialize.md +8 -4
- package/skill/{Workflows → workflows}/Orchestrate.md +13 -3
- package/skill/{Workflows → workflows}/Schedule.md +3 -3
- package/skill/workflows/SignalsDashboard.md +87 -0
- package/skill/{Workflows → workflows}/UnitTest.md +19 -0
- package/skill/{Workflows → workflows}/Watch.md +42 -2
- package/skill/{Workflows → workflows}/Workflows.md +39 -2
- package/apps/local-dashboard/dist/assets/index-CwOtTrUS.css +0 -1
- package/apps/local-dashboard/dist/assets/index-f1HQpbeH.js +0 -59
- package/apps/local-dashboard/dist/assets/vendor-react-CKkiCskZ.js +0 -11
- package/apps/local-dashboard/dist/assets/vendor-ui-jVSaIZey.js +0 -12
- /package/skill/{Workflows → workflows}/AlphaUpload.md +0 -0
- /package/skill/{Workflows → workflows}/AutoActivation.md +0 -0
- /package/skill/{Workflows → workflows}/Badge.md +0 -0
- /package/skill/{Workflows → workflows}/Composability.md +0 -0
- /package/skill/{Workflows → workflows}/EvolutionMemory.md +0 -0
- /package/skill/{Workflows → workflows}/ExportCanonical.md +0 -0
- /package/skill/{Workflows → workflows}/Hook.md +0 -0
- /package/skill/{Workflows → workflows}/ImportSkillsBench.md +0 -0
- /package/skill/{Workflows → workflows}/Ingest.md +0 -0
- /package/skill/{Workflows → workflows}/PlatformHooks.md +0 -0
- /package/skill/{Workflows → workflows}/Quickstart.md +0 -0
- /package/skill/{Workflows → workflows}/Recover.md +0 -0
- /package/skill/{Workflows → workflows}/Registry.md +0 -0
- /package/skill/{Workflows → workflows}/RepairSkillUsage.md +0 -0
- /package/skill/{Workflows → workflows}/Replay.md +0 -0
- /package/skill/{Workflows → workflows}/Rollback.md +0 -0
- /package/skill/{Workflows → workflows}/Sync.md +0 -0
- /package/skill/{Workflows → workflows}/Telemetry.md +0 -0
- /package/skill/{Workflows → workflows}/Uninstall.md +0 -0
|
@@ -6,17 +6,14 @@
|
|
|
6
6
|
* to determine whether the proposal is an improvement.
|
|
7
7
|
*/
|
|
8
8
|
|
|
9
|
-
import type {
|
|
9
|
+
import type {
|
|
10
|
+
EvalEntry,
|
|
11
|
+
EvolutionProposal,
|
|
12
|
+
InvocationTypeScores,
|
|
13
|
+
ValidationMode,
|
|
14
|
+
} from "../types.js";
|
|
10
15
|
import { callLlm, type EffortLevel } from "../utils/llm-call.js";
|
|
11
|
-
import {
|
|
12
|
-
buildBatchTriggerCheckPrompt,
|
|
13
|
-
buildTriggerCheckPrompt,
|
|
14
|
-
parseBatchTriggerResponse,
|
|
15
|
-
parseTriggerResponse,
|
|
16
|
-
} from "../utils/trigger-check.js";
|
|
17
|
-
|
|
18
|
-
// Re-export so existing consumers don't break
|
|
19
|
-
export { buildTriggerCheckPrompt, parseTriggerResponse };
|
|
16
|
+
import { buildBatchTriggerCheckPrompt, parseBatchTriggerResponse } from "../utils/trigger-check.js";
|
|
20
17
|
|
|
21
18
|
/** Number of eval queries to batch into a single LLM call.
|
|
22
19
|
* Higher = fewer claude -p spawns = much faster (each spawn has ~30-60s overhead).
|
|
@@ -40,147 +37,11 @@ export interface ValidationResult {
|
|
|
40
37
|
net_change: number; // after - before pass rate
|
|
41
38
|
by_invocation_type?: InvocationTypeScores;
|
|
42
39
|
per_entry_results?: Array<{ entry: EvalEntry; before_pass: boolean; after_pass: boolean }>;
|
|
43
|
-
validation_mode?:
|
|
40
|
+
validation_mode?: ValidationMode;
|
|
44
41
|
validation_agent?: string;
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
// Proposal validation
|
|
49
|
-
// ---------------------------------------------------------------------------
|
|
50
|
-
|
|
51
|
-
/** Validate a proposal sequentially (one LLM call per query). Kept for backward compat. */
|
|
52
|
-
export async function validateProposalSequential(
|
|
53
|
-
proposal: EvolutionProposal,
|
|
54
|
-
evalSet: EvalEntry[],
|
|
55
|
-
agent: string,
|
|
56
|
-
modelFlag?: string,
|
|
57
|
-
effort?: EffortLevel,
|
|
58
|
-
): Promise<ValidationResult> {
|
|
59
|
-
if (evalSet.length === 0) {
|
|
60
|
-
return {
|
|
61
|
-
proposal_id: proposal.proposal_id,
|
|
62
|
-
before_pass_rate: 0,
|
|
63
|
-
after_pass_rate: 0,
|
|
64
|
-
improved: false,
|
|
65
|
-
regressions: [],
|
|
66
|
-
new_passes: [],
|
|
67
|
-
net_change: 0,
|
|
68
|
-
validation_mode: "llm_judge",
|
|
69
|
-
validation_agent: agent,
|
|
70
|
-
};
|
|
71
|
-
}
|
|
72
|
-
|
|
73
|
-
const systemPrompt = "You are an evaluation assistant. Answer only YES or NO.";
|
|
74
|
-
const regressions: EvalEntry[] = [];
|
|
75
|
-
const newPasses: EvalEntry[] = [];
|
|
76
|
-
const perEntryResults: Array<{ entry: EvalEntry; before_pass: boolean; after_pass: boolean }> =
|
|
77
|
-
[];
|
|
78
|
-
let beforePassed = 0;
|
|
79
|
-
let afterPassed = 0;
|
|
80
|
-
|
|
81
|
-
for (const entry of evalSet) {
|
|
82
|
-
// Check with original description
|
|
83
|
-
const beforePrompt = buildTriggerCheckPrompt(proposal.original_description, entry.query);
|
|
84
|
-
const beforeRaw = await callLlm(systemPrompt, beforePrompt, agent, modelFlag, effort);
|
|
85
|
-
const beforeTriggered = parseTriggerResponse(beforeRaw);
|
|
86
|
-
const beforePass =
|
|
87
|
-
(entry.should_trigger && beforeTriggered) || (!entry.should_trigger && !beforeTriggered);
|
|
88
|
-
|
|
89
|
-
// Check with proposed description
|
|
90
|
-
const afterPrompt = buildTriggerCheckPrompt(proposal.proposed_description, entry.query);
|
|
91
|
-
const afterRaw = await callLlm(systemPrompt, afterPrompt, agent, modelFlag, effort);
|
|
92
|
-
const afterTriggered = parseTriggerResponse(afterRaw);
|
|
93
|
-
const afterPass =
|
|
94
|
-
(entry.should_trigger && afterTriggered) || (!entry.should_trigger && !afterTriggered);
|
|
95
|
-
|
|
96
|
-
if (beforePass) beforePassed++;
|
|
97
|
-
if (afterPass) afterPassed++;
|
|
98
|
-
|
|
99
|
-
// Regression: passed before, fails after
|
|
100
|
-
if (beforePass && !afterPass) {
|
|
101
|
-
regressions.push(entry);
|
|
102
|
-
}
|
|
103
|
-
|
|
104
|
-
// New pass: failed before, passes after
|
|
105
|
-
if (!beforePass && afterPass) {
|
|
106
|
-
newPasses.push(entry);
|
|
107
|
-
}
|
|
108
|
-
|
|
109
|
-
perEntryResults.push({ entry, before_pass: beforePass, after_pass: afterPass });
|
|
110
|
-
}
|
|
111
|
-
|
|
112
|
-
const total = evalSet.length;
|
|
113
|
-
const beforePassRate = beforePassed / total;
|
|
114
|
-
const afterPassRate = afterPassed / total;
|
|
115
|
-
const netChange = afterPassRate - beforePassRate;
|
|
116
|
-
|
|
117
|
-
// A proposal is improved when ALL of:
|
|
118
|
-
// - after_pass_rate > before_pass_rate
|
|
119
|
-
// - regressions count < 5% of total eval entries
|
|
120
|
-
// - Either net improvement >= 0.10 OR new_passes.length >= 2
|
|
121
|
-
const improved =
|
|
122
|
-
afterPassRate > beforePassRate &&
|
|
123
|
-
regressions.length < total * 0.05 &&
|
|
124
|
-
(netChange >= 0.1 || newPasses.length >= 2);
|
|
125
|
-
|
|
126
|
-
// Compute per-invocation-type scores (initialize all required keys)
|
|
127
|
-
const byInvocationType: Record<string, { passed: number; total: number }> = {
|
|
128
|
-
explicit: { passed: 0, total: 0 },
|
|
129
|
-
implicit: { passed: 0, total: 0 },
|
|
130
|
-
contextual: { passed: 0, total: 0 },
|
|
131
|
-
negative: { passed: 0, total: 0 },
|
|
132
|
-
};
|
|
133
|
-
for (const r of perEntryResults) {
|
|
134
|
-
const type = r.entry.invocation_type ?? "implicit";
|
|
135
|
-
if (!byInvocationType[type]) byInvocationType[type] = { passed: 0, total: 0 };
|
|
136
|
-
byInvocationType[type].total++;
|
|
137
|
-
if (r.after_pass) byInvocationType[type].passed++;
|
|
138
|
-
}
|
|
139
|
-
|
|
140
|
-
const invocationScores: InvocationTypeScores = {
|
|
141
|
-
explicit: {
|
|
142
|
-
...byInvocationType.explicit,
|
|
143
|
-
pass_rate:
|
|
144
|
-
byInvocationType.explicit.total > 0
|
|
145
|
-
? byInvocationType.explicit.passed / byInvocationType.explicit.total
|
|
146
|
-
: 0,
|
|
147
|
-
},
|
|
148
|
-
implicit: {
|
|
149
|
-
...byInvocationType.implicit,
|
|
150
|
-
pass_rate:
|
|
151
|
-
byInvocationType.implicit.total > 0
|
|
152
|
-
? byInvocationType.implicit.passed / byInvocationType.implicit.total
|
|
153
|
-
: 0,
|
|
154
|
-
},
|
|
155
|
-
contextual: {
|
|
156
|
-
...byInvocationType.contextual,
|
|
157
|
-
pass_rate:
|
|
158
|
-
byInvocationType.contextual.total > 0
|
|
159
|
-
? byInvocationType.contextual.passed / byInvocationType.contextual.total
|
|
160
|
-
: 0,
|
|
161
|
-
},
|
|
162
|
-
negative: {
|
|
163
|
-
...byInvocationType.negative,
|
|
164
|
-
pass_rate:
|
|
165
|
-
byInvocationType.negative.total > 0
|
|
166
|
-
? byInvocationType.negative.passed / byInvocationType.negative.total
|
|
167
|
-
: 0,
|
|
168
|
-
},
|
|
169
|
-
};
|
|
170
|
-
|
|
171
|
-
return {
|
|
172
|
-
proposal_id: proposal.proposal_id,
|
|
173
|
-
before_pass_rate: beforePassRate,
|
|
174
|
-
after_pass_rate: afterPassRate,
|
|
175
|
-
improved,
|
|
176
|
-
regressions,
|
|
177
|
-
new_passes: newPasses,
|
|
178
|
-
net_change: netChange,
|
|
179
|
-
by_invocation_type: invocationScores,
|
|
180
|
-
per_entry_results: perEntryResults,
|
|
181
|
-
validation_mode: "llm_judge",
|
|
182
|
-
validation_agent: agent,
|
|
183
|
-
};
|
|
42
|
+
validation_fixture_id?: string;
|
|
43
|
+
validation_fallback_reason?: string;
|
|
44
|
+
before_entry_results?: Array<{ entry: EvalEntry; before_pass: boolean; after_pass: boolean }>;
|
|
184
45
|
}
|
|
185
46
|
|
|
186
47
|
// ---------------------------------------------------------------------------
|
|
@@ -13,20 +13,12 @@ import type {
|
|
|
13
13
|
BodyValidationResult,
|
|
14
14
|
EvalEntry,
|
|
15
15
|
RoutingReplayEntryResult,
|
|
16
|
+
ValidationGate,
|
|
16
17
|
ValidationMode,
|
|
17
18
|
} from "../types.js";
|
|
18
19
|
import { runJudgeValidation } from "./engines/judge-engine.js";
|
|
19
|
-
import {
|
|
20
|
-
|
|
21
|
-
type ReplayRunner,
|
|
22
|
-
type ReplayRunnerInput,
|
|
23
|
-
type ReplayValidationOptions,
|
|
24
|
-
} from "./engines/replay-engine.js";
|
|
25
|
-
|
|
26
|
-
// Re-export engine types for backward compatibility
|
|
27
|
-
export type { ReplayRunnerInput as RoutingReplayRunnerInput };
|
|
28
|
-
export type { ReplayRunner as RoutingReplayRunner };
|
|
29
|
-
export type { ReplayValidationOptions as RoutingValidationOptions };
|
|
20
|
+
import { type ReplayValidationOptions } from "./engines/replay-engine.js";
|
|
21
|
+
import { runValidationContract, type ValidationStrategy } from "./validation-contract.js";
|
|
30
22
|
|
|
31
23
|
export interface RoutingTriggerAccuracyResult {
|
|
32
24
|
before_pass_rate: number;
|
|
@@ -35,10 +27,16 @@ export interface RoutingTriggerAccuracyResult {
|
|
|
35
27
|
validation_mode: ValidationMode;
|
|
36
28
|
validation_agent: string;
|
|
37
29
|
validation_fixture_id?: string;
|
|
30
|
+
validation_fallback_reason?: string;
|
|
38
31
|
per_entry_results?: RoutingReplayEntryResult[];
|
|
39
32
|
before_entry_results?: RoutingReplayEntryResult[];
|
|
40
33
|
}
|
|
41
34
|
|
|
35
|
+
export interface RoutingValidationOptions extends ReplayValidationOptions {
|
|
36
|
+
mode?: ValidationStrategy;
|
|
37
|
+
onReplayFallback?: (reason?: string) => void;
|
|
38
|
+
}
|
|
39
|
+
|
|
42
40
|
// ---------------------------------------------------------------------------
|
|
43
41
|
// Structural validation
|
|
44
42
|
// ---------------------------------------------------------------------------
|
|
@@ -101,7 +99,7 @@ export function validateRoutingStructure(routing: string): { valid: boolean; rea
|
|
|
101
99
|
* Run before/after trigger checks on the eval set using the routing content.
|
|
102
100
|
* Returns pass rates for comparison.
|
|
103
101
|
*
|
|
104
|
-
* Prefers replay
|
|
102
|
+
* Prefers host/runtime replay when a runtime runner is available,
|
|
105
103
|
* falls back to LLM judge otherwise.
|
|
106
104
|
*/
|
|
107
105
|
export async function validateRoutingTriggerAccuracy(
|
|
@@ -110,7 +108,7 @@ export async function validateRoutingTriggerAccuracy(
|
|
|
110
108
|
evalSet: EvalEntry[],
|
|
111
109
|
agent: string,
|
|
112
110
|
modelFlag?: string,
|
|
113
|
-
options:
|
|
111
|
+
options: RoutingValidationOptions = {},
|
|
114
112
|
): Promise<RoutingTriggerAccuracyResult> {
|
|
115
113
|
if (evalSet.length === 0) {
|
|
116
114
|
return {
|
|
@@ -122,35 +120,38 @@ export async function validateRoutingTriggerAccuracy(
|
|
|
122
120
|
};
|
|
123
121
|
}
|
|
124
122
|
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
originalRouting,
|
|
128
|
-
proposedRouting,
|
|
129
|
-
evalSet,
|
|
130
|
-
agent,
|
|
131
|
-
options,
|
|
132
|
-
);
|
|
133
|
-
|
|
134
|
-
if (replayResult) {
|
|
135
|
-
return replayResult;
|
|
136
|
-
}
|
|
137
|
-
|
|
138
|
-
// Fall back to LLM judge
|
|
139
|
-
const judgeResult = await runJudgeValidation(
|
|
140
|
-
originalRouting,
|
|
141
|
-
proposedRouting,
|
|
123
|
+
const { result, fallbackReason } = await runValidationContract<RoutingTriggerAccuracyResult>({
|
|
124
|
+
mode: options.mode ?? "auto",
|
|
125
|
+
originalContent: originalRouting,
|
|
126
|
+
proposedContent: proposedRouting,
|
|
142
127
|
evalSet,
|
|
143
128
|
agent,
|
|
144
|
-
|
|
145
|
-
|
|
129
|
+
replayOptions: options,
|
|
130
|
+
runJudge: async () => {
|
|
131
|
+
const judgeResult = await runJudgeValidation(
|
|
132
|
+
originalRouting,
|
|
133
|
+
proposedRouting,
|
|
134
|
+
evalSet,
|
|
135
|
+
agent,
|
|
136
|
+
modelFlag,
|
|
137
|
+
);
|
|
138
|
+
|
|
139
|
+
return {
|
|
140
|
+
result: {
|
|
141
|
+
before_pass_rate: judgeResult.before_pass_rate,
|
|
142
|
+
after_pass_rate: judgeResult.after_pass_rate,
|
|
143
|
+
improved: judgeResult.improved,
|
|
144
|
+
validation_mode: judgeResult.validation_mode,
|
|
145
|
+
validation_agent: judgeResult.validation_agent,
|
|
146
|
+
},
|
|
147
|
+
modeUsed: judgeResult.validation_mode,
|
|
148
|
+
};
|
|
149
|
+
},
|
|
150
|
+
onReplayFallback: options.onReplayFallback,
|
|
151
|
+
adaptReplayResult: (replayResult) => replayResult,
|
|
152
|
+
});
|
|
146
153
|
|
|
147
|
-
return {
|
|
148
|
-
before_pass_rate: judgeResult.before_pass_rate,
|
|
149
|
-
after_pass_rate: judgeResult.after_pass_rate,
|
|
150
|
-
improved: judgeResult.improved,
|
|
151
|
-
validation_mode: judgeResult.validation_mode,
|
|
152
|
-
validation_agent: judgeResult.validation_agent,
|
|
153
|
-
};
|
|
154
|
+
return fallbackReason ? { ...result, validation_fallback_reason: fallbackReason } : result;
|
|
154
155
|
}
|
|
155
156
|
|
|
156
157
|
// ---------------------------------------------------------------------------
|
|
@@ -163,9 +164,9 @@ export async function validateRoutingProposal(
|
|
|
163
164
|
evalSet: EvalEntry[],
|
|
164
165
|
agent: string,
|
|
165
166
|
modelFlag?: string,
|
|
166
|
-
options:
|
|
167
|
+
options: RoutingValidationOptions = {},
|
|
167
168
|
): Promise<BodyValidationResult> {
|
|
168
|
-
const gateResults: Array<{ gate:
|
|
169
|
+
const gateResults: Array<{ gate: ValidationGate; passed: boolean; reason: string }> = [];
|
|
169
170
|
|
|
170
171
|
// Gate 1: Structural validation
|
|
171
172
|
const structural = validateRoutingStructure(proposal.proposed_body);
|
|
@@ -217,6 +218,7 @@ export async function validateRoutingProposal(
|
|
|
217
218
|
validation_mode: accuracy.validation_mode,
|
|
218
219
|
validation_agent: accuracy.validation_agent,
|
|
219
220
|
validation_fixture_id: accuracy.validation_fixture_id,
|
|
221
|
+
validation_fallback_reason: accuracy.validation_fallback_reason,
|
|
220
222
|
before_pass_rate: accuracy.before_pass_rate,
|
|
221
223
|
after_pass_rate: accuracy.after_pass_rate,
|
|
222
224
|
per_entry_results: accuracy.per_entry_results,
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
import type { EvalEntry, ValidationMode } from "../types.js";
|
|
2
|
+
import { CLIError } from "../utils/cli-error.js";
|
|
3
|
+
import {
|
|
4
|
+
runReplayValidation,
|
|
5
|
+
type ReplayValidationOptions,
|
|
6
|
+
type ReplayValidationResult,
|
|
7
|
+
} from "./engines/replay-engine.js";
|
|
8
|
+
|
|
9
|
+
export type ValidationStrategy = "auto" | "replay" | "judge";
|
|
10
|
+
|
|
11
|
+
export const DEFAULT_VALIDATION_STRATEGY: ValidationStrategy = "auto";
|
|
12
|
+
|
|
13
|
+
export interface ValidationExecutionResult<TResult> {
|
|
14
|
+
result: TResult;
|
|
15
|
+
modeUsed: ValidationMode;
|
|
16
|
+
fallbackReason?: string;
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
export interface ValidationContractOptions<TResult> {
|
|
20
|
+
mode?: ValidationStrategy;
|
|
21
|
+
originalContent: string;
|
|
22
|
+
proposedContent: string;
|
|
23
|
+
evalSet: EvalEntry[];
|
|
24
|
+
agent: string;
|
|
25
|
+
replayOptions?: ReplayValidationOptions;
|
|
26
|
+
runJudge: () => Promise<ValidationExecutionResult<TResult>>;
|
|
27
|
+
adaptReplayResult: (replayResult: ReplayValidationResult) => TResult;
|
|
28
|
+
onReplayFallback?: (reason?: string) => void;
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
export function hasReplayValidationPath(
|
|
32
|
+
replayOptions?: ReplayValidationOptions,
|
|
33
|
+
): replayOptions is ReplayValidationOptions {
|
|
34
|
+
return Boolean(replayOptions?.replayFixture || replayOptions?.replayRunner);
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
export function createReplayUnavailableError(reason?: string): CLIError {
|
|
38
|
+
const message = reason
|
|
39
|
+
? `Replay validation requested but real host/runtime replay is unavailable: ${reason}`
|
|
40
|
+
: "Replay validation requested but real host/runtime replay is unavailable.";
|
|
41
|
+
return new CLIError(
|
|
42
|
+
message,
|
|
43
|
+
"REPLAY_UNAVAILABLE",
|
|
44
|
+
"Use --validation-mode auto to allow LLM judge fallback, or run selftune on a host/agent with runtime replay support for this skill.",
|
|
45
|
+
);
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
export async function runValidationContract<TResult>(
|
|
49
|
+
options: ValidationContractOptions<TResult>,
|
|
50
|
+
): Promise<ValidationExecutionResult<TResult>> {
|
|
51
|
+
const mode = options.mode ?? DEFAULT_VALIDATION_STRATEGY;
|
|
52
|
+
|
|
53
|
+
if (mode === "judge") {
|
|
54
|
+
return options.runJudge();
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
if (hasReplayValidationPath(options.replayOptions)) {
|
|
58
|
+
const replayAttempt = await runReplayValidation(
|
|
59
|
+
options.originalContent,
|
|
60
|
+
options.proposedContent,
|
|
61
|
+
options.evalSet,
|
|
62
|
+
options.agent,
|
|
63
|
+
options.replayOptions,
|
|
64
|
+
);
|
|
65
|
+
|
|
66
|
+
if (replayAttempt.result) {
|
|
67
|
+
return {
|
|
68
|
+
result: options.adaptReplayResult(replayAttempt.result),
|
|
69
|
+
modeUsed: replayAttempt.result.validation_mode,
|
|
70
|
+
};
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
if (mode === "replay") {
|
|
74
|
+
throw createReplayUnavailableError(replayAttempt.fallbackReason);
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
options.onReplayFallback?.(replayAttempt.fallbackReason);
|
|
78
|
+
const judgeResult = await options.runJudge();
|
|
79
|
+
return {
|
|
80
|
+
...judgeResult,
|
|
81
|
+
fallbackReason: replayAttempt.fallbackReason,
|
|
82
|
+
};
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
if (mode === "replay") {
|
|
86
|
+
throw createReplayUnavailableError();
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
options.onReplayFallback?.();
|
|
90
|
+
return options.runJudge();
|
|
91
|
+
}
|
|
@@ -13,12 +13,16 @@ import { mkdirSync, writeFileSync } from "node:fs";
|
|
|
13
13
|
import { dirname } from "node:path";
|
|
14
14
|
import { parseArgs } from "node:util";
|
|
15
15
|
|
|
16
|
-
import {
|
|
16
|
+
import { TELEMETRY_LOG } from "../constants.js";
|
|
17
17
|
import { getDb } from "../localdb/db.js";
|
|
18
18
|
import { querySessionTelemetry, querySkillUsageRecords } from "../localdb/queries.js";
|
|
19
19
|
import type { GradingResult, SessionTelemetryRecord, SkillUsageRecord } from "../types.js";
|
|
20
20
|
import { CLIError, handleCLIError } from "../utils/cli-error.js";
|
|
21
|
-
import {
|
|
21
|
+
import {
|
|
22
|
+
detectLlmAgent as _detectAgent,
|
|
23
|
+
isLlmBackedAgent,
|
|
24
|
+
LLM_BACKED_AGENT_CANDIDATES,
|
|
25
|
+
} from "../utils/llm-call.js";
|
|
22
26
|
import { readExcerpt } from "../utils/transcript.js";
|
|
23
27
|
import {
|
|
24
28
|
buildDefaultGradingOutputPath,
|
|
@@ -55,7 +59,7 @@ Options:
|
|
|
55
59
|
--session-id Grade a specific session (auto-detects most recent if omitted)
|
|
56
60
|
--telemetry-log Path to telemetry log (default: ~/.claude/session_telemetry_log.jsonl)
|
|
57
61
|
--output Output path for grading JSON (default: ~/.selftune/grading/result-<session>.json)
|
|
58
|
-
--agent Agent CLI to use (${
|
|
62
|
+
--agent Agent CLI to use (${LLM_BACKED_AGENT_CANDIDATES.join(", ")})
|
|
59
63
|
--show-transcript Print transcript excerpt before grading
|
|
60
64
|
-h, --help Show this help message`);
|
|
61
65
|
process.exit(0);
|
|
@@ -68,9 +72,9 @@ Options:
|
|
|
68
72
|
|
|
69
73
|
// --- Determine agent ---
|
|
70
74
|
let agent: string | null = null;
|
|
71
|
-
const validAgents = [...
|
|
75
|
+
const validAgents = [...LLM_BACKED_AGENT_CANDIDATES];
|
|
72
76
|
if (values.agent) {
|
|
73
|
-
if (!
|
|
77
|
+
if (!isLlmBackedAgent(values.agent)) {
|
|
74
78
|
throw new CLIError(
|
|
75
79
|
`Invalid --agent '${values.agent}'. Expected one of: ${validAgents.join(", ")}`,
|
|
76
80
|
"INVALID_FLAG",
|
|
@@ -84,9 +88,9 @@ Options:
|
|
|
84
88
|
|
|
85
89
|
if (!agent) {
|
|
86
90
|
throw new CLIError(
|
|
87
|
-
`No supported agent CLI (${
|
|
91
|
+
`No supported agent CLI (${LLM_BACKED_AGENT_CANDIDATES.join("/")}) found in PATH`,
|
|
88
92
|
"AGENT_NOT_FOUND",
|
|
89
|
-
"Install
|
|
93
|
+
"Install Claude Code, Codex, OpenCode, or Pi",
|
|
90
94
|
);
|
|
91
95
|
}
|
|
92
96
|
|
|
@@ -5,19 +5,14 @@
|
|
|
5
5
|
* Rubric-based grader for Claude Code skill sessions.
|
|
6
6
|
* Migrated from grade_session.py.
|
|
7
7
|
*
|
|
8
|
-
* Grades via an installed agent CLI selected from
|
|
8
|
+
* Grades via an installed agent CLI selected from the LLM-backed agent set.
|
|
9
9
|
*/
|
|
10
10
|
|
|
11
11
|
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
|
|
12
12
|
import { basename, dirname, join } from "node:path";
|
|
13
13
|
import { parseArgs } from "node:util";
|
|
14
14
|
|
|
15
|
-
import {
|
|
16
|
-
AGENT_CANDIDATES,
|
|
17
|
-
CLAUDE_CODE_PROJECTS_DIR,
|
|
18
|
-
SELFTUNE_CONFIG_DIR,
|
|
19
|
-
TELEMETRY_LOG,
|
|
20
|
-
} from "../constants.js";
|
|
15
|
+
import { CLAUDE_CODE_PROJECTS_DIR, SELFTUNE_CONFIG_DIR, TELEMETRY_LOG } from "../constants.js";
|
|
21
16
|
import { getDb } from "../localdb/db.js";
|
|
22
17
|
import { querySessionTelemetry, querySkillUsageRecords } from "../localdb/queries.js";
|
|
23
18
|
import type {
|
|
@@ -31,7 +26,9 @@ import type {
|
|
|
31
26
|
} from "../types.js";
|
|
32
27
|
import { CLIError, handleCLIError } from "../utils/cli-error.js";
|
|
33
28
|
import {
|
|
34
|
-
|
|
29
|
+
detectLlmAgent as _detectAgent,
|
|
30
|
+
isLlmBackedAgent,
|
|
31
|
+
LLM_BACKED_AGENT_CANDIDATES,
|
|
35
32
|
stripMarkdownFences as _stripMarkdownFences,
|
|
36
33
|
callViaAgent,
|
|
37
34
|
} from "../utils/llm-call.js";
|
|
@@ -42,9 +39,6 @@ import {
|
|
|
42
39
|
} from "../utils/transcript.js";
|
|
43
40
|
import { type PreGateContext, runPreGates } from "./pre-gates.js";
|
|
44
41
|
|
|
45
|
-
// Re-export for backward compatibility
|
|
46
|
-
export { detectAgent, stripMarkdownFences } from "../utils/llm-call.js";
|
|
47
|
-
|
|
48
42
|
// ---------------------------------------------------------------------------
|
|
49
43
|
// Constants
|
|
50
44
|
// ---------------------------------------------------------------------------
|
|
@@ -756,7 +750,7 @@ Options:
|
|
|
756
750
|
--transcript Path to transcript file
|
|
757
751
|
--telemetry-log Path to telemetry log (default: ~/.claude/session_telemetry_log.jsonl)
|
|
758
752
|
--output Output path for grading JSON (default: ~/.selftune/grading/result-<session>.json)
|
|
759
|
-
--agent Agent CLI to use (${
|
|
753
|
+
--agent Agent CLI to use (${LLM_BACKED_AGENT_CANDIDATES.join(", ")})
|
|
760
754
|
--show-transcript Print transcript excerpt before grading
|
|
761
755
|
-h, --help Show this help message`);
|
|
762
756
|
process.exit(0);
|
|
@@ -769,9 +763,9 @@ Options:
|
|
|
769
763
|
|
|
770
764
|
// --- Determine agent ---
|
|
771
765
|
let agent: string | null = null;
|
|
772
|
-
const validAgents = [...
|
|
766
|
+
const validAgents = [...LLM_BACKED_AGENT_CANDIDATES];
|
|
773
767
|
if (values.agent) {
|
|
774
|
-
if (!
|
|
768
|
+
if (!isLlmBackedAgent(values.agent)) {
|
|
775
769
|
throw new CLIError(
|
|
776
770
|
`Invalid --agent '${values.agent}'. Expected one of: ${validAgents.join(", ")}`,
|
|
777
771
|
"INVALID_FLAG",
|
|
@@ -785,9 +779,9 @@ Options:
|
|
|
785
779
|
|
|
786
780
|
if (!agent) {
|
|
787
781
|
throw new CLIError(
|
|
788
|
-
`No supported agent CLI (${
|
|
782
|
+
`No supported agent CLI (${LLM_BACKED_AGENT_CANDIDATES.join("/")}) found in PATH`,
|
|
789
783
|
"AGENT_NOT_FOUND",
|
|
790
|
-
"Install
|
|
784
|
+
"Install Claude Code, Codex, OpenCode, or Pi, then retry",
|
|
791
785
|
);
|
|
792
786
|
}
|
|
793
787
|
|
package/cli/selftune/index.ts
CHANGED
|
@@ -21,7 +21,7 @@
|
|
|
21
21
|
* selftune contribute — Export anonymized skill data for community
|
|
22
22
|
* selftune contributions — Manage creator-directed sharing preferences
|
|
23
23
|
* selftune creator-contributions — Manage creator-side contribution configs
|
|
24
|
-
* selftune workflows — Discover and
|
|
24
|
+
* selftune workflows — Discover workflows and scaffold workflow skills
|
|
25
25
|
* selftune quickstart — Guided onboarding: init, ingest, status, and suggestions
|
|
26
26
|
* selftune repair-skill-usage — Rebuild trustworthy skill usage from transcripts
|
|
27
27
|
* selftune export — Export SQLite data to JSONL snapshots
|
|
@@ -38,6 +38,7 @@
|
|
|
38
38
|
*/
|
|
39
39
|
|
|
40
40
|
import { CLIError, handleCLIError } from "./utils/cli-error.js";
|
|
41
|
+
import { PUBLIC_COMMAND_SURFACES, renderCommandHelp } from "./command-surface.js";
|
|
41
42
|
|
|
42
43
|
process.on("uncaughtException", handleCLIError);
|
|
43
44
|
process.on("unhandledRejection", handleCLIError);
|
|
@@ -69,7 +70,7 @@ Commands:
|
|
|
69
70
|
contribute Export anonymized skill data for community
|
|
70
71
|
contributions Manage creator-directed sharing preferences
|
|
71
72
|
creator-contributions Manage creator-side contribution configs
|
|
72
|
-
workflows Discover and
|
|
73
|
+
workflows Discover workflows and scaffold workflow skills
|
|
73
74
|
quickstart Guided onboarding: init, ingest, status, and suggestions
|
|
74
75
|
repair-skill-usage Rebuild trustworthy skill usage from transcripts
|
|
75
76
|
export Export SQLite data to JSONL snapshots
|
|
@@ -227,12 +228,12 @@ Run 'selftune grade <subcommand> --help' for subcommand-specific options.`);
|
|
|
227
228
|
case "evolve": {
|
|
228
229
|
const sub = process.argv[2];
|
|
229
230
|
if (sub === "--help" || sub === "-h") {
|
|
230
|
-
console.log(
|
|
231
|
+
console.log(`${renderCommandHelp(PUBLIC_COMMAND_SURFACES.evolve)}
|
|
231
232
|
|
|
232
|
-
|
|
233
|
-
selftune evolve [options]
|
|
234
|
-
selftune evolve
|
|
235
|
-
selftune evolve
|
|
233
|
+
Subcommands:
|
|
234
|
+
selftune evolve body [options] Evolve full body or routing table
|
|
235
|
+
selftune evolve rollback [options] Rollback a previous evolution
|
|
236
|
+
selftune evolve apply-proposal [options] Apply an approved contributor proposal
|
|
236
237
|
|
|
237
238
|
Run 'selftune evolve <subcommand> --help' for subcommand-specific options.`);
|
|
238
239
|
process.exit(0);
|
|
@@ -255,6 +256,11 @@ Run 'selftune evolve <subcommand> --help' for subcommand-specific options.`);
|
|
|
255
256
|
await cliMain();
|
|
256
257
|
break;
|
|
257
258
|
}
|
|
259
|
+
case "apply-proposal": {
|
|
260
|
+
const { cliMain } = await import("./evolution/apply-proposal.js");
|
|
261
|
+
await cliMain();
|
|
262
|
+
break;
|
|
263
|
+
}
|
|
258
264
|
default:
|
|
259
265
|
throw new CLIError(
|
|
260
266
|
`Unknown evolve target: ${sub}`,
|
|
@@ -281,6 +287,12 @@ Actions:
|
|
|
281
287
|
composability Analyze skill co-occurrence conflicts
|
|
282
288
|
family-overlap Detect sibling-skill overlap and consolidation pressure
|
|
283
289
|
|
|
290
|
+
Recommended creator loop:
|
|
291
|
+
1. selftune eval generate --skill <name>
|
|
292
|
+
2. selftune eval unit-test --skill <name> --generate --skill-path <path>
|
|
293
|
+
3. selftune evolve --skill <name> --skill-path <path> --dry-run --validation-mode replay
|
|
294
|
+
4. selftune grade baseline --skill <name> --skill-path <path>
|
|
295
|
+
|
|
284
296
|
Run 'selftune eval <action> --help' for action-specific options.`);
|
|
285
297
|
process.exit(0);
|
|
286
298
|
}
|
|
@@ -337,7 +349,8 @@ Run 'selftune eval <action> --help' for action-specific options.`);
|
|
|
337
349
|
"selftune eval composability --skill <name>",
|
|
338
350
|
);
|
|
339
351
|
}
|
|
340
|
-
const logPath =
|
|
352
|
+
const logPath =
|
|
353
|
+
typeof values["telemetry-log"] === "string" ? values["telemetry-log"] : TELEMETRY_LOG;
|
|
341
354
|
let telemetry: unknown[];
|
|
342
355
|
if (logPath === TELEMETRY_LOG) {
|
|
343
356
|
try {
|
|
@@ -363,7 +376,19 @@ Run 'selftune eval <action> --help' for action-specific options.`);
|
|
|
363
376
|
);
|
|
364
377
|
}
|
|
365
378
|
const windowSize = rawWindow === undefined ? undefined : Number(rawWindow);
|
|
366
|
-
const
|
|
379
|
+
const skillName = typeof values.skill === "string" ? values.skill : undefined;
|
|
380
|
+
if (!skillName) {
|
|
381
|
+
throw new CLIError(
|
|
382
|
+
"--skill <name> is required.",
|
|
383
|
+
"MISSING_FLAG",
|
|
384
|
+
"selftune eval composability --skill <name>",
|
|
385
|
+
);
|
|
386
|
+
}
|
|
387
|
+
const report = analyzeComposability(
|
|
388
|
+
skillName,
|
|
389
|
+
telemetry as import("./types.js").SessionTelemetryRecord[],
|
|
390
|
+
windowSize,
|
|
391
|
+
);
|
|
367
392
|
console.log(JSON.stringify(report, null, 2));
|
|
368
393
|
break;
|
|
369
394
|
}
|
|
@@ -720,7 +745,7 @@ Output:
|
|
|
720
745
|
userId: identity.user_id,
|
|
721
746
|
agentType: readConfiguredAgentType(SELFTUNE_CONFIG_PATH, "unknown"),
|
|
722
747
|
selftuneVersion: getSelftuneVersion(),
|
|
723
|
-
dryRun: values["dry-run"]
|
|
748
|
+
dryRun: values["dry-run"] === true,
|
|
724
749
|
apiKey: identity.api_key,
|
|
725
750
|
});
|
|
726
751
|
|