selftune 0.2.31 → 0.2.32
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +83 -56
- package/apps/local-dashboard/dist/assets/index-B-ut4w0B.js +15 -0
- package/apps/local-dashboard/dist/assets/index-BFGfCVrL.css +1 -0
- package/apps/local-dashboard/dist/assets/vendor-ui-DfowE3Hu.js +1 -0
- package/apps/local-dashboard/dist/index.html +3 -3
- package/cli/selftune/command-surface.ts +613 -2
- package/cli/selftune/create/baseline.ts +429 -0
- package/cli/selftune/create/check.ts +35 -0
- package/cli/selftune/create/init.ts +115 -0
- package/cli/selftune/create/package-candidate-state.ts +771 -0
- package/cli/selftune/create/package-evaluator.ts +710 -0
- package/cli/selftune/create/package-fingerprint.ts +142 -0
- package/cli/selftune/create/package-search.ts +377 -0
- package/cli/selftune/create/publish.ts +431 -0
- package/cli/selftune/create/readiness.ts +495 -0
- package/cli/selftune/create/replay.ts +330 -0
- package/cli/selftune/create/report.ts +74 -0
- package/cli/selftune/create/scaffold.ts +121 -0
- package/cli/selftune/create/skills-ref-adapter.ts +177 -0
- package/cli/selftune/create/status.ts +33 -0
- package/cli/selftune/create/templates.ts +249 -0
- package/cli/selftune/cron/setup.ts +1 -1
- package/cli/selftune/dashboard-action-events.ts +4 -1
- package/cli/selftune/dashboard-action-result.ts +789 -24
- package/cli/selftune/dashboard-action-stream.ts +80 -0
- package/cli/selftune/dashboard-contract.ts +146 -3
- package/cli/selftune/dashboard-server.ts +5 -4
- package/cli/selftune/eval/hooks-to-evals.ts +58 -35
- package/cli/selftune/eval/synthetic-evals.ts +145 -17
- package/cli/selftune/evolution/bounded-mutations.ts +1045 -0
- package/cli/selftune/evolution/evolve-body.ts +9 -36
- package/cli/selftune/evolution/evolve.ts +8 -72
- package/cli/selftune/evolution/stopping-criteria.ts +5 -13
- package/cli/selftune/evolution/unblock-suggestions.ts +0 -16
- package/cli/selftune/evolution/validate-host-replay.ts +115 -15
- package/cli/selftune/improve.ts +206 -0
- package/cli/selftune/index.ts +123 -6
- package/cli/selftune/init.ts +1 -1
- package/cli/selftune/localdb/queries/dashboard.ts +30 -0
- package/cli/selftune/localdb/schema.ts +52 -0
- package/cli/selftune/monitoring/watch.ts +257 -23
- package/cli/selftune/orchestrate/execute.ts +300 -1
- package/cli/selftune/orchestrate/finalize.ts +14 -0
- package/cli/selftune/orchestrate/plan.ts +22 -5
- package/cli/selftune/orchestrate/prepare.ts +59 -4
- package/cli/selftune/orchestrate/report.ts +1 -1
- package/cli/selftune/orchestrate.ts +34 -1
- package/cli/selftune/publish.ts +35 -0
- package/cli/selftune/routes/actions.ts +81 -15
- package/cli/selftune/routes/overview.ts +1 -1
- package/cli/selftune/routes/skill-report.ts +147 -2
- package/cli/selftune/run.ts +18 -0
- package/cli/selftune/schedule.ts +3 -3
- package/cli/selftune/search-run.ts +703 -0
- package/cli/selftune/status.ts +35 -11
- package/cli/selftune/testing-readiness.ts +431 -40
- package/cli/selftune/types.ts +316 -0
- package/cli/selftune/utils/eval-readiness.ts +1 -0
- package/cli/selftune/utils/json-output.ts +11 -0
- package/cli/selftune/utils/lifecycle-surface.ts +48 -0
- package/cli/selftune/utils/query-filter.ts +82 -1
- package/cli/selftune/utils/tui.ts +85 -2
- package/cli/selftune/verify.ts +205 -0
- package/cli/selftune/workflows/proposals.ts +1 -1
- package/cli/selftune/workflows/skill-scaffold.ts +141 -63
- package/cli/selftune/workflows/workflows.ts +4 -4
- package/package.json +1 -1
- package/skill/SKILL.md +148 -85
- package/skill/references/cli-quick-reference.md +16 -1
- package/skill/references/creator-playbook.md +31 -10
- package/skill/workflows/Baseline.md +8 -9
- package/skill/workflows/Contributions.md +4 -4
- package/skill/workflows/Create.md +173 -0
- package/skill/workflows/CreateTestDeploy.md +34 -30
- package/skill/workflows/Cron.md +2 -2
- package/skill/workflows/Dashboard.md +3 -3
- package/skill/workflows/Evals.md +13 -7
- package/skill/workflows/Evolve.md +75 -32
- package/skill/workflows/EvolveBody.md +22 -15
- package/skill/workflows/Hook.md +1 -1
- package/skill/workflows/Improve.md +168 -0
- package/skill/workflows/Initialize.md +3 -3
- package/skill/workflows/Orchestrate.md +49 -12
- package/skill/workflows/Publish.md +100 -0
- package/skill/workflows/Run.md +72 -0
- package/skill/workflows/Schedule.md +2 -2
- package/skill/workflows/SearchRun.md +89 -0
- package/skill/workflows/SignalsDashboard.md +2 -2
- package/skill/workflows/UnitTest.md +13 -4
- package/skill/workflows/Verify.md +136 -0
- package/skill/workflows/Watch.md +114 -47
- package/skill/workflows/Workflows.md +13 -8
- package/apps/local-dashboard/dist/assets/index-B7v_o1WC.js +0 -15
- package/apps/local-dashboard/dist/assets/index-CrO77SVi.css +0 -1
- package/apps/local-dashboard/dist/assets/vendor-ui-B0H8s1mP.js +0 -1
|
@@ -0,0 +1,710 @@
|
|
|
1
|
+
import type { Database } from "bun:sqlite";
|
|
2
|
+
|
|
3
|
+
import { readFileSync } from "node:fs";
|
|
4
|
+
import { basename, dirname } from "node:path";
|
|
5
|
+
|
|
6
|
+
import {
|
|
7
|
+
persistPackageCandidateEvaluation,
|
|
8
|
+
readPackageCandidateArtifactByFingerprint,
|
|
9
|
+
} from "./package-candidate-state.js";
|
|
10
|
+
import { parseSkillSections } from "../evolution/deploy-proposal.js";
|
|
11
|
+
import { getLastDeployedProposal } from "../evolution/audit.js";
|
|
12
|
+
import { assessBodyQuality, validateBodyStructure } from "../evolution/validate-body.js";
|
|
13
|
+
import { getDb } from "../localdb/db.js";
|
|
14
|
+
import { queryGradingBaseline, queryRecentGradingResults } from "../localdb/queries.js";
|
|
15
|
+
import type { WatchResult } from "../monitoring/watch.js";
|
|
16
|
+
import {
|
|
17
|
+
readCanonicalPackageEvaluationArtifact,
|
|
18
|
+
readCanonicalUnitTestRunResult,
|
|
19
|
+
writeCanonicalPackageEvaluationArtifact,
|
|
20
|
+
writeCanonicalPackageEvaluation,
|
|
21
|
+
} from "../testing-readiness.js";
|
|
22
|
+
import type {
|
|
23
|
+
CreatePackageBodySummary,
|
|
24
|
+
CreatePackageEvaluationSummary,
|
|
25
|
+
CreatePackageEvaluationGradingSummary,
|
|
26
|
+
CreatePackageEvaluationUnitTestSummary,
|
|
27
|
+
CreatePackageEvaluationWatchSummary,
|
|
28
|
+
} from "../types.js";
|
|
29
|
+
import { computeCreatePackageFingerprint } from "./package-fingerprint.js";
|
|
30
|
+
import {
|
|
31
|
+
runCreateBaseline,
|
|
32
|
+
type CreateBaselineDeps,
|
|
33
|
+
type CreateBaselineResult,
|
|
34
|
+
type RunCreateBaselineOptions,
|
|
35
|
+
} from "./baseline.js";
|
|
36
|
+
import {
|
|
37
|
+
runCreateReplay,
|
|
38
|
+
type CreateReplayMode,
|
|
39
|
+
type CreateReplayResult,
|
|
40
|
+
type RunCreateReplayOptions,
|
|
41
|
+
} from "./replay.js";
|
|
42
|
+
|
|
43
|
+
/**
 * Caller-supplied options for `runCreatePackageEvaluation`.
 */
export interface RunCreatePackageEvaluationOptions {
  /** Path of the skill file; it is fingerprinted, replayed, baselined and read for body validation. */
  skillPath: string;
  /**
   * Explicit skill name. It is trimmed before use; when omitted or blank the
   * name is inferred from the directory that contains `skillPath`.
   */
  skillName?: string;
  /** Replay mode; only `"package"` is accepted and it is the default when omitted. */
  mode?: Extract<CreateReplayMode, "package">;
  /** Agent forwarded to the replay and baseline runs. */
  agent?: string;
  /**
   * Custom eval set forwarded to the replay and baseline runs.
   * When set, cached evaluations are never reused.
   */
  evalSetPath?: string;
}
|
|
50
|
+
|
|
51
|
+
/**
 * Outcome of a package evaluation: the condensed summary plus the raw replay
 * and baseline results it was derived from.
 */
export interface CreatePackageEvaluationResult {
  /** Condensed verdict and metrics; this is what the benchmark report header is rendered from. */
  summary: CreatePackageEvaluationSummary;
  /** Raw package-mode replay result; its per-query results feed the failure analysis. */
  replay: CreateReplayResult;
  /** Raw baseline result; its `per_entry` list feeds the with/without-skill diff. */
  baseline: CreateBaselineResult;
}
|
|
56
|
+
|
|
57
|
+
/**
 * Optional dependency overrides for the package evaluator.
 *
 * Every member is optional; in the visible call sites a missing member falls
 * back to the same-named module import (`deps.x ?? x`). The whole object is
 * also forwarded to `runCreateBaseline` as its `CreateBaselineDeps`.
 */
export interface CreatePackageEvaluationDeps extends CreateBaselineDeps {
  /**
   * Database accessor. Used directly for grading lookups; the unit-test and
   * candidate-artifact readers only receive a database when this is supplied.
   */
  getDb?: () => Database;
  /** Override for looking up the last deployed proposal of a skill. */
  getLastDeployedProposal?: typeof getLastDeployedProposal;
  /** Override for the grading-baseline query. */
  queryGradingBaseline?: typeof queryGradingBaseline;
  /** Override for the recent-grading-results query (called with a limit of 10). */
  queryRecentGradingResults?: typeof queryRecentGradingResults;
  /** Override for computing the package fingerprint of `skillPath`. */
  computeCreatePackageFingerprint?: typeof computeCreatePackageFingerprint;
  /** Override for reading the canonical cached evaluation artifact by skill name. */
  readCanonicalPackageEvaluationArtifact?: typeof readCanonicalPackageEvaluationArtifact;
  /** Override for reading an accepted candidate artifact by fingerprint. */
  readPackageCandidateArtifactByFingerprint?: typeof readPackageCandidateArtifactByFingerprint;
  /** Override for reading the latest canonical unit-test run of a skill. */
  readCanonicalUnitTestRunResult?: typeof readCanonicalUnitTestRunResult;
  /** Override for the body-quality assessment. */
  assessBodyQuality?: typeof assessBodyQuality;
  /** Override for loading the skill file; defaults to a synchronous UTF-8 file read. */
  readSkillContent?: (skillPath: string) => string;
  // NOTE(review): the three persistence hooks below are not called in the
  // visible portion of this file; presumably they are used when a fresh
  // evaluation is stored — confirm against the rest of the module.
  persistPackageCandidateEvaluation?: typeof persistPackageCandidateEvaluation;
  writeCanonicalPackageEvaluationArtifact?: typeof writeCanonicalPackageEvaluationArtifact;
  writeCanonicalPackageEvaluation?: typeof writeCanonicalPackageEvaluation;
  /** Override for the replay runner; used for both the package and the routing replay. */
  runCreateReplay?: (
    options: RunCreateReplayOptions,
  ) => Promise<Awaited<ReturnType<typeof runCreateReplay>>>;
  /** Override for the baseline runner. */
  runCreateBaseline?: (
    options: RunCreateBaselineOptions,
    deps?: CreateBaselineDeps,
  ) => Promise<CreateBaselineResult>;
}
|
|
79
|
+
|
|
80
|
+
/** A single element of a baseline result's `per_entry` list. */
type BaselineResultLike = CreateBaselineResult["per_entry"][number];
// NOTE(review): not referenced in the visible portion of this file; presumably
// the minimum body-quality score required for a body to count as valid — confirm.
const BODY_QUALITY_THRESHOLD = 0.6;
|
|
82
|
+
|
|
83
|
+
/**
 * Derives a skill name from the location of its skill file.
 *
 * The name is the last segment of the directory that contains `skillPath`,
 * e.g. `/skills/my-skill/SKILL.md` yields `my-skill`.
 *
 * @param skillPath Path of the skill file.
 * @returns The name of the containing directory.
 */
function inferSkillNameFromSkillPath(skillPath: string): string {
  const containingDirectory = dirname(skillPath);
  return basename(containingDirectory);
}
|
|
86
|
+
|
|
87
|
+
/**
 * Creates a fresh placeholder runtime-metrics record.
 *
 * Run counts and durations are zero; token, cost and turn totals are `null`.
 * A new object is returned on every call.
 */
function emptyRuntimeMetrics() {
  const zeroedCounters = {
    eval_runs: 0,
    usage_observations: 0,
    total_duration_ms: 0,
    avg_duration_ms: 0,
  };
  const unmeasuredTotals = {
    total_input_tokens: null,
    total_output_tokens: null,
    total_cache_creation_input_tokens: null,
    total_cache_read_input_tokens: null,
    total_cost_usd: null,
    total_turns: null,
  };
  // Counters first, totals second: keeps the serialized key order stable.
  return { ...zeroedCounters, ...unmeasuredTotals };
}
|
|
101
|
+
|
|
102
|
+
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param values Numbers to average.
 * @returns The mean, or `null` when `values` is empty.
 */
function average(values: number[]): number | null {
  const count = values.length;
  if (count === 0) {
    return null;
  }

  let total = 0;
  values.forEach((value) => {
    total += value;
  });
  return total / count;
}
|
|
106
|
+
|
|
107
|
+
/**
 * Collects counts and a few representative samples for the evaluation summary.
 *
 * - Replay failures are the replay results whose `passed` flag is falsy.
 * - A baseline "win" is a query that passes with the skill and fails without it;
 *   a "regression" is the opposite. Queries missing either side are ignored.
 *
 * Each sample list is capped at three entries; the counts cover all matches.
 *
 * @param replay Replay result whose `results` are inspected.
 * @param baseline Baseline result whose `per_entry` rows are paired by query.
 */
function collectEvidenceSamples(replay: CreateReplayResult, baseline: CreateBaselineResult) {
  const SAMPLE_LIMIT = 3;

  const failedReplays = replay.results.filter((result) => !result.passed);
  const replayFailureSamples = failedReplays.slice(0, SAMPLE_LIMIT).map((failure) => ({
    query: failure.query,
    evidence: failure.evidence ?? null,
  }));

  // Pair up the with-skill and without-skill rows of each query. When a query
  // has several rows for the same side, the last one wins.
  const pairsByQuery = new Map<
    string,
    { with_skill?: BaselineResultLike; without_skill?: BaselineResultLike }
  >();
  for (const entry of baseline.per_entry) {
    let pair = pairsByQuery.get(entry.query);
    if (!pair) {
      pair = {};
      pairsByQuery.set(entry.query, pair);
    }
    pair[entry.with_skill ? "with_skill" : "without_skill"] = entry;
  }

  const wins: Array<{ query: string; evidence: string | null }> = [];
  const regressions: Array<{ query: string; evidence: string | null }> = [];
  for (const [query, { with_skill: withSkill, without_skill: withoutSkill }] of pairsByQuery) {
    if (!withSkill || !withoutSkill) continue;

    // Prefer the with-skill evidence; fall back to the without-skill one.
    const sample = { query, evidence: withSkill.evidence ?? withoutSkill.evidence ?? null };
    if (withSkill.pass === true && withoutSkill.pass === false) {
      wins.push(sample);
    } else if (withSkill.pass === false && withoutSkill.pass === true) {
      regressions.push(sample);
    }
  }

  return {
    replay_failures: failedReplays.length,
    baseline_wins: wins.length,
    baseline_regressions: regressions.length,
    replay_failure_samples: replayFailureSamples,
    baseline_win_samples: wins.slice(0, SAMPLE_LIMIT),
    baseline_regression_samples: regressions.slice(0, SAMPLE_LIMIT),
  };
}
|
|
159
|
+
|
|
160
|
+
/**
 * Builds the optional grading context for a skill: the stored grading baseline
 * (looked up for the last deployed proposal, if any) compared with the average
 * of up to ten recent grading results.
 *
 * @param skillName Skill whose grading history is queried.
 * @param deps Optional overrides for the database and query helpers.
 * @returns The grading summary, or `undefined` when there is neither a baseline
 *   row nor any recent result, or when any lookup throws.
 */
function buildGradingSummary(
  skillName: string,
  deps: CreatePackageEvaluationDeps,
): CreatePackageEvaluationGradingSummary | undefined {
  try {
    const openDb = deps.getDb ?? getDb;
    const findLastDeployed = deps.getLastDeployedProposal ?? getLastDeployedProposal;
    const fetchBaseline = deps.queryGradingBaseline ?? queryGradingBaseline;
    const fetchRecent = deps.queryRecentGradingResults ?? queryRecentGradingResults;

    const db = openDb();
    const lastDeployed = findLastDeployed(skillName);
    const baselineRow = fetchBaseline(db, skillName, lastDeployed?.proposal_id);
    const recentRows = fetchRecent(db, skillName, 10);

    if (!baselineRow && recentRows.length === 0) {
      return undefined;
    }

    // Rows without a pass rate / mean score are left out of the averages.
    const passRates: number[] = [];
    const meanScores: number[] = [];
    for (const row of recentRows) {
      if (row.pass_rate != null) passRates.push(row.pass_rate);
      if (row.mean_score != null) meanScores.push(row.mean_score);
    }

    const recent =
      recentRows.length > 0
        ? {
            sample_size: recentRows.length,
            average_pass_rate: average(passRates),
            average_mean_score: average(meanScores),
            // The first row is reported as newest and the last as oldest.
            newest_graded_at: recentRows[0]?.graded_at ?? null,
            oldest_graded_at: recentRows[recentRows.length - 1]?.graded_at ?? null,
          }
        : null;

    const baseline = baselineRow
      ? {
          proposal_id: baselineRow.proposal_id,
          measured_at: baselineRow.measured_at,
          pass_rate: baselineRow.pass_rate,
          mean_score: baselineRow.mean_score,
          sample_size: baselineRow.sample_size,
        }
      : null;

    // Deltas are "recent minus baseline"; null when either side is unavailable.
    const passRateDelta =
      baseline && recent?.average_pass_rate != null
        ? recent.average_pass_rate - baseline.pass_rate
        : null;
    const meanScoreDelta =
      baseline?.mean_score != null && recent?.average_mean_score != null
        ? recent.average_mean_score - baseline.mean_score
        : null;

    return {
      baseline,
      recent,
      pass_rate_delta: passRateDelta,
      mean_score_delta: meanScoreDelta,
      regressed: passRateDelta != null ? passRateDelta < 0 : null,
    };
  } catch {
    // Fail-open: grading context should enrich the evaluator, never block it.
    return undefined;
  }
}
|
|
228
|
+
|
|
229
|
+
/**
 * Summarizes the latest canonical unit-test run of a skill.
 *
 * The reader receives a database only when `deps.getDb` is supplied; otherwise
 * it is called with the skill name alone.
 *
 * @param skillName Skill whose unit-test run is read.
 * @param deps Optional overrides for the reader and database accessor.
 * @returns Totals plus up to three failing tests (each with its failed
 *   assertions rendered as `type: value`), or `undefined` when no run exists or
 *   reading it throws.
 */
function buildUnitTestSummary(
  skillName: string,
  deps: CreatePackageEvaluationDeps,
): CreatePackageEvaluationUnitTestSummary | undefined {
  try {
    const readSuite = deps.readCanonicalUnitTestRunResult ?? readCanonicalUnitTestRunResult;
    const suite = deps.getDb ? readSuite(skillName, deps.getDb()) : readSuite(skillName);
    if (!suite) {
      return undefined;
    }

    const firstFailures = suite.results.filter((result) => !result.passed).slice(0, 3);
    const failingTests = firstFailures.map((failure) => {
      const brokenAssertions = failure.assertion_results.filter((check) => !check.passed);
      return {
        test_id: failure.test_id,
        error: failure.error ?? null,
        failed_assertions: brokenAssertions.map(
          ({ assertion }) => `${assertion.type}: ${assertion.value}`,
        ),
      };
    });

    return {
      total: suite.total,
      passed: suite.passed,
      failed: suite.failed,
      pass_rate: suite.pass_rate,
      run_at: suite.run_at,
      failing_tests: failingTests,
    };
  } catch {
    // Best-effort: a missing or unreadable run simply omits the section.
    return undefined;
  }
}
|
|
263
|
+
|
|
264
|
+
/**
 * Rebuilds the body text of a skill file from its parsed description and sections.
 *
 * The trimmed description (when non-empty) comes first, followed by every
 * section rendered as a `## <name>` heading, a blank line, the trimmed section
 * content and another blank line. The joined text is trimmed before returning.
 *
 * @param skillContent Raw content of the skill file.
 * @returns The reassembled body text.
 */
function extractSkillBody(skillContent: string): string {
  const { description, sections } = parseSkillSections(skillContent);

  const intro = description.trim();
  const introLines = intro ? [intro, ""] : [];
  const sectionLines = Object.entries(sections).flatMap(([heading, content]) => [
    `## ${heading}`,
    "",
    content.trim(),
    "",
  ]);

  return [...introLines, ...sectionLines].join("\n").trim();
}
|
|
282
|
+
|
|
283
|
+
/**
 * Decides whether a previously stored evaluation can be returned instead of
 * running a fresh replay and baseline.
 *
 * Reuse is refused when there is no cached result or fingerprint, when a custom
 * eval set is requested, or when the cached result does not match the request
 * (mode, skill path, skill name, fingerprint, agent) or is missing data that a
 * complete evaluation carries (host-replay validation for replay and routing,
 * candidate id/generation/acceptance, body summary, consistent skill names).
 *
 * The requested skill name is trimmed before comparison, mirroring how the
 * evaluator normalizes `options.skillName`; a blank name is treated as "not
 * specified" so that padded or whitespace-only names do not defeat the cache.
 *
 * @param cached Cached evaluation, or `null` when none was found.
 * @param options Options of the current evaluation request.
 * @param packageFingerprint Fingerprint of the current package, if available.
 * @returns `true` (narrowing `cached`) when the cached evaluation is reusable.
 */
function canReuseCachedPackageEvaluation(
  cached: CreatePackageEvaluationResult | null,
  options: RunCreatePackageEvaluationOptions,
  packageFingerprint: string | null,
): cached is CreatePackageEvaluationResult {
  if (!cached || !packageFingerprint || options.evalSetPath) return false;

  const { summary } = cached;
  const requestedSkillName = options.skillName?.trim();

  // The cached result must answer the same request.
  if (summary.mode !== "package") return false;
  if (summary.skill_path !== options.skillPath) return false;
  if (requestedSkillName && summary.skill_name !== requestedSkillName) return false;
  if (summary.package_fingerprint !== packageFingerprint) return false;
  if (options.agent && summary.replay.agent !== options.agent) return false;

  // The cached result must be a complete host-replay evaluation.
  if (summary.replay.validation_mode !== "host_replay") return false;
  if (summary.routing?.validation_mode !== "host_replay") return false;
  if (typeof summary.candidate_id !== "string") return false;
  if (typeof summary.candidate_generation !== "number") return false;
  if (!summary.candidate_acceptance) return false;
  if (!summary.body) return false;

  // The raw results must belong to the same skill as the summary.
  if (cached.replay.skill !== summary.skill_name) return false;
  if (cached.baseline.skill_name !== summary.skill_name) return false;
  return true;
}
|
|
304
|
+
|
|
305
|
+
/**
 * Assembles the summary of a freshly run package evaluation.
 *
 * Status is `replay_failed` when the replay has failures, otherwise
 * `baseline_failed` when the baseline reports no added value, otherwise
 * `passed`. A failing status carries the command to re-run the failing step.
 * Optional sections (fingerprint, routing, efficiency, grading, body, unit
 * tests) are only present when their input is available.
 *
 * @param skillName Name recorded in the summary.
 * @param skillPath Skill file path recorded in the summary and in `next_command`.
 * @param replay Package-mode replay result.
 * @param routing Routing-mode replay result, if it could be produced.
 * @param baseline Baseline result.
 * @param grading Optional grading context.
 * @param body Optional body-validation summary.
 * @param unitTests Optional unit-test summary.
 * @param packageFingerprint Optional package fingerprint.
 */
function buildSummary(
  skillName: string,
  skillPath: string,
  replay: CreateReplayResult,
  routing: CreateReplayResult | undefined,
  baseline: CreateBaselineResult,
  grading?: CreatePackageEvaluationGradingSummary,
  body?: CreatePackageBodySummary,
  unitTests?: CreatePackageEvaluationUnitTestSummary,
  packageFingerprint?: string | null,
): CreatePackageEvaluationSummary {
  // Shared shape of the `replay` and `routing` sections.
  const describeReplayRun = (run: CreateReplayResult) => ({
    mode: run.mode,
    validation_mode: "host_replay" as const,
    agent: run.agent,
    proposal_id: run.proposal_id,
    fixture_id: run.fixture_id,
    total: run.total,
    passed: run.passed,
    failed: run.failed,
    pass_rate: run.pass_rate,
    runtime_metrics: run.runtime_metrics,
  });

  const replayFailed = replay.failed > 0;
  const status = replayFailed
    ? "replay_failed"
    : baseline.adds_value
      ? "passed"
      : "baseline_failed";

  let nextCommand: string | null = null;
  if (status === "replay_failed") {
    nextCommand = `selftune create replay --skill-path ${skillPath} --mode package`;
  } else if (status === "baseline_failed") {
    nextCommand = `selftune create baseline --skill-path ${skillPath} --mode package`;
  }

  // The baseline sample size counts only the with-skill entries.
  let withSkillEntryCount = 0;
  for (const entry of baseline.per_entry) {
    if (entry.with_skill) withSkillEntryCount += 1;
  }

  const fingerprintPart = packageFingerprint ? { package_fingerprint: packageFingerprint } : {};
  const routingPart = routing ? { routing: describeReplayRun(routing) } : {};
  // Efficiency is reported only when the baseline measured runtime metrics;
  // a missing side falls back to an empty metrics record.
  const efficiencyPart = baseline.runtime_metrics
    ? {
        efficiency: {
          with_skill: replay.runtime_metrics ?? emptyRuntimeMetrics(),
          without_skill: baseline.runtime_metrics.without_skill ?? emptyRuntimeMetrics(),
        },
      }
    : {};

  return {
    skill_name: skillName,
    skill_path: skillPath,
    mode: "package",
    ...fingerprintPart,
    evaluation_source: "fresh",
    status,
    evaluation_passed: status === "passed",
    next_command: nextCommand,
    replay: describeReplayRun(replay),
    ...routingPart,
    baseline: {
      mode: baseline.mode,
      baseline_pass_rate: baseline.baseline_pass_rate,
      with_skill_pass_rate: baseline.with_skill_pass_rate,
      lift: baseline.lift,
      adds_value: baseline.adds_value,
      measured_at: baseline.measured_at,
      sample_size: withSkillEntryCount,
      ...(baseline.runtime_metrics ? { runtime_metrics: baseline.runtime_metrics } : {}),
    },
    evidence: collectEvidenceSamples(replay, baseline),
    ...efficiencyPart,
    ...(grading ? { grading } : {}),
    ...(body ? { body } : {}),
    ...(unitTests ? { unit_tests: unitTests } : {}),
  };
}
|
|
388
|
+
|
|
389
|
+
/**
 * Renders a ratio as a percentage with one decimal place, e.g. `0.5` -> `50.0%`.
 *
 * @param value Ratio to render (1 corresponds to 100%).
 */
function formatPercent(value: number): string {
  const percentage = value * 100;
  return percentage.toFixed(1) + "%";
}
|
|
392
|
+
|
|
393
|
+
/**
 * Maps an evaluation source to its report label.
 *
 * @param source Where the evaluation came from.
 * @returns `"cached artifact"`, `"accepted candidate cache"`, or `"fresh"` for
 *   any other value.
 */
function formatEvaluationSource(
  source: CreatePackageEvaluationSummary["evaluation_source"],
): string {
  switch (source) {
    case "artifact_cache":
      return "cached artifact";
    case "candidate_cache":
      return "accepted candidate cache";
    default:
      return "fresh";
  }
}
|
|
400
|
+
|
|
401
|
+
/**
 * Renders the candidate-acceptance decision of a summary for the report.
 *
 * @param summary Evaluation summary to read `candidate_acceptance` from.
 * @returns `"<decision> vs <compared candidate or "root"> | <rationale>"`, or
 *   `null` when the summary has no acceptance record.
 */
function formatCandidateAcceptance(summary: CreatePackageEvaluationSummary): string | null {
  const { candidate_acceptance: acceptance } = summary;
  if (acceptance) {
    // A missing comparison target is reported as "root".
    const comparisonTarget = acceptance.compared_to_candidate_id ?? "root";
    return `${acceptance.decision} vs ${comparisonTarget} | ${acceptance.rationale}`;
  }
  return null;
}
|
|
407
|
+
|
|
408
|
+
/**
 * Produces one report line per failed replay result.
 *
 * Each line states the query, whether the skill was expected to trigger, what
 * actually happened, and the trimmed evidence when it is non-blank.
 *
 * @param replay Replay result whose `results` are inspected.
 */
function summarizeReplayFailures(replay: CreateReplayResult): string[] {
  const failureLines: string[] = [];
  for (const outcome of replay.results) {
    if (outcome.passed) continue;

    const expected = outcome.should_trigger ? "trigger" : "skip";
    const actual = outcome.triggered ? "triggered" : "skipped";
    const trimmedEvidence = outcome.evidence?.trim();
    const evidenceSuffix = trimmedEvidence ? ` | evidence: ${trimmedEvidence}` : "";
    failureLines.push(
      `- query: ${outcome.query} | expected: ${expected} | actual: ${actual}${evidenceSuffix}`,
    );
  }
  return failureLines;
}
|
|
418
|
+
|
|
419
|
+
/**
 * Produces one report line per query whose with-skill and without-skill
 * baseline outcomes differ.
 *
 * A side that has no entry for the query is shown as `n/a` (and therefore
 * counts as different from a recorded outcome).
 *
 * @param baseline Baseline result whose `per_entry` rows are grouped by query.
 */
function summarizeBaselineDiffs(baseline: CreateBaselineResult): string[] {
  const outcomesByQuery = new Map<
    string,
    {
      withSkill?: boolean;
      withoutSkill?: boolean;
    }
  >();

  for (const entry of baseline.per_entry) {
    const outcome = outcomesByQuery.get(entry.query) ?? {};
    if (entry.with_skill) {
      outcome.withSkill = entry.pass;
    } else {
      outcome.withoutSkill = entry.pass;
    }
    outcomesByQuery.set(entry.query, outcome);
  }

  const describe = (passed: boolean | undefined): string => {
    if (passed == null) return "n/a";
    return passed ? "pass" : "fail";
  };

  const diffLines: string[] = [];
  for (const [query, { withSkill, withoutSkill }] of outcomesByQuery) {
    if (withSkill === withoutSkill) continue;
    diffLines.push(
      `- query: ${query} | without skill: ${describe(withoutSkill)} | with skill: ${describe(withSkill)}`,
    );
  }
  return diffLines;
}
|
|
447
|
+
|
|
448
|
+
/**
 * Produces report lines for up to three failing unit tests.
 *
 * Each line names the test, then the trimmed error (when non-blank), then the
 * comma-separated failed assertions (when there are any).
 *
 * @param unitTests Unit-test summary, if one is available.
 * @returns The report lines; empty when there is no summary or nothing failed.
 */
function summarizeFailedUnitTests(
  unitTests: CreatePackageEvaluationUnitTestSummary | undefined,
): string[] {
  if (!unitTests || unitTests.failed === 0) {
    return [];
  }

  const reportLines: string[] = [];
  for (const { test_id, error, failed_assertions } of unitTests.failing_tests.slice(0, 3)) {
    const trimmedError = error?.trim();
    const errorPart = trimmedError ? ` | error: ${trimmedError}` : "";
    const assertionPart =
      failed_assertions.length > 0 ? ` | failed assertions: ${failed_assertions.join(", ")}` : "";
    reportLines.push(`- unit test: ${test_id}${errorPart}${assertionPart}`);
  }
  return reportLines;
}
|
|
461
|
+
|
|
462
|
+
/**
 * Renders a package evaluation as a plain-text benchmark report.
 *
 * The report lists the package header, evaluation source, optional candidate
 * and acceptance lines, replay results, optional routing / body / unit-test /
 * grading sections, skill impact, a failure analysis (replay failures, baseline
 * diffs and failing unit tests, or `- none`), the publish recommendation and,
 * when present, the next command to run.
 *
 * @param evaluation Evaluation result to render.
 * @returns The report as newline-joined text.
 */
export function formatCreatePackageBenchmarkReport(
  evaluation: CreatePackageEvaluationResult,
): string {
  const { summary } = evaluation;
  const {
    routing,
    body,
    grading,
    unit_tests: unitTests,
    replay: replaySummary,
    baseline: baselineSummary,
  } = summary;
  const candidateAcceptance = formatCandidateAcceptance(summary);

  const report: string[] = [];

  // Header.
  report.push(`CREATE PACKAGE BENCHMARK REPORT: ${summary.skill_name}`, "");
  report.push(
    `PACKAGE: skill=${summary.skill_name} | mode=${summary.mode} | status=${summary.status}`,
  );
  report.push(`SOURCE: ${formatEvaluationSource(summary.evaluation_source)}`);
  if (summary.candidate_id) {
    report.push(
      `CANDIDATE: id=${summary.candidate_id} | generation=${summary.candidate_generation ?? 0} | parent=${summary.parent_candidate_id ?? "root"}`,
    );
  }
  if (candidateAcceptance) {
    report.push(`ACCEPTANCE: ${candidateAcceptance}`);
  }

  // Replay and routing validation.
  report.push(
    `REPLAY: agent=${replaySummary.agent} | pass_rate=${formatPercent(replaySummary.pass_rate)} | passed=${replaySummary.passed}/${replaySummary.total} | fixture=${replaySummary.fixture_id}`,
  );
  if (routing) {
    report.push(
      `ROUTING VALIDATION: pass_rate=${formatPercent(routing.pass_rate)} | passed=${routing.passed}/${routing.total} | fixture=${routing.fixture_id}`,
      "",
    );
  }

  // Body validation.
  if (body) {
    const structural = body.structural_valid ? "pass" : "fail";
    const quality = body.quality_score == null ? "n/a" : body.quality_score.toFixed(2);
    const threshold = body.quality_threshold.toFixed(2);
    const valid = body.valid ? "yes" : "no";
    report.push(
      `BODY VALIDATION: structural=${structural} | quality=${quality} | threshold=${threshold} | valid=${valid}`,
      "",
    );
  }

  // Skill impact and unit tests.
  report.push(
    `SKILLS IMPACT: without_skill=${formatPercent(baselineSummary.baseline_pass_rate)} | with_skill=${formatPercent(baselineSummary.with_skill_pass_rate)} | lift=${baselineSummary.lift.toFixed(3)} | adds_value=${baselineSummary.adds_value ? "yes" : "no"}`,
  );
  if (unitTests) {
    report.push(
      `UNIT TESTS: passed=${unitTests.passed}/${unitTests.total} | pass_rate=${formatPercent(unitTests.pass_rate)} | latest_run=${unitTests.run_at}`,
      "",
    );
  }
  report.push("");

  // Grading context.
  if (grading) {
    const baselineRate = grading.baseline ? formatPercent(grading.baseline.pass_rate) : "n/a";
    const recentRate =
      grading.recent?.average_pass_rate != null
        ? formatPercent(grading.recent.average_pass_rate)
        : "n/a";
    const delta =
      grading.pass_rate_delta == null
        ? "n/a"
        : `${grading.pass_rate_delta >= 0 ? "+" : ""}${(grading.pass_rate_delta * 100).toFixed(1)}%`;
    const regressed = grading.regressed == null ? "unknown" : grading.regressed ? "yes" : "no";
    report.push(
      `GRADING CONTEXT: baseline=${baselineRate} | recent_avg=${recentRate} | delta=${delta} | regressed=${regressed}`,
      "",
    );
  }

  // Failure analysis: replay failures, then baseline diffs, then unit tests.
  report.push("FAILURE ANALYSIS:");
  const failureLines = [
    ...summarizeReplayFailures(evaluation.replay),
    ...summarizeBaselineDiffs(evaluation.baseline),
    ...summarizeFailedUnitTests(unitTests),
  ];
  if (failureLines.length === 0) {
    report.push("- none");
  } else {
    report.push(...failureLines);
  }

  // Verdict.
  report.push(
    "",
    `RECOMMENDATION: ${summary.evaluation_passed ? "APPROVE FOR PUBLISH" : "DO NOT PUBLISH"}`,
  );
  if (summary.next_command) {
    report.push(`NEXT: ${summary.next_command}`);
  }

  return report.join("\n");
}
|
|
540
|
+
|
|
541
|
+
/**
 * Converts a watch result into the watch section of a package evaluation summary.
 *
 * Optional fields that are missing on the watch result are stored as `null`.
 * The two efficiency fields are only included when the watch result carries an
 * efficiency alert or an efficiency regression.
 *
 * @param watchResult Result of a watch run.
 */
export function buildCreatePackageWatchSummary(
  watchResult: WatchResult,
): CreatePackageEvaluationWatchSummary {
  const { efficiencyAlert, efficiencyRegression } = watchResult;

  const core = {
    snapshot: watchResult.snapshot,
    alert: watchResult.alert,
    rolled_back: watchResult.rolledBack,
    recommendation: watchResult.recommendation,
    recommended_command: watchResult.recommended_command ?? null,
    grade_alert: watchResult.gradeAlert ?? null,
    grade_regression: watchResult.gradeRegression ?? null,
  };

  if (efficiencyAlert || efficiencyRegression) {
    return {
      ...core,
      efficiency_alert: efficiencyAlert ?? null,
      efficiency_regression: efficiencyRegression ?? null,
    };
  }
  return { ...core };
}
|
|
560
|
+
|
|
561
|
+
/**
 * Returns a copy of an evaluation summary with its `watch` section set from a
 * watch result. The input summary is not modified.
 *
 * @param summary Summary to copy.
 * @param watchResult Watch result to convert and attach.
 */
export function attachCreatePackageWatchSummary(
  summary: CreatePackageEvaluationSummary,
  watchResult: WatchResult,
): CreatePackageEvaluationSummary {
  const watch = buildCreatePackageWatchSummary(watchResult);
  return { ...summary, watch };
}
|
|
570
|
+
|
|
571
|
+
/**
 * Evaluates a skill package, reusing a previously stored evaluation when one
 * is accepted by `canReuseCachedPackageEvaluation`.
 *
 * Flow:
 *  1. Fingerprint the package at `options.skillPath` and resolve the skill
 *     name (the trimmed `options.skillName` when non-empty, otherwise the name
 *     inferred from the skill path).
 *  2. If the canonical evaluation artifact is reusable, return it with
 *     `summary.evaluation_source` set to `"artifact_cache"`.
 *  3. If a fingerprint is available and an accepted candidate artifact for it
 *     is reusable, return it with `summary.evaluation_source` set to
 *     `"candidate_cache"`.
 *  4. Otherwise measure: package replay, routing replay (best-effort),
 *     baseline, grading summary, body validation (best-effort), unit-test
 *     summary, then assemble the result via `buildSummary`.
 *  5. Persist the candidate evaluation and write the canonical artifacts; both
 *     steps are best-effort and never prevent the result from being returned.
 *
 * Failures of the primary replay or the baseline run are not caught here and
 * reject the returned promise.
 *
 * @param options - Evaluation request (skill path, optional skill name, mode,
 *   agent and eval-set path).
 * @param deps - Optional collaborator overrides; every entry falls back to the
 *   module-level implementation of the same name when absent.
 * @returns The cached or freshly measured evaluation result.
 */
export async function runCreatePackageEvaluation(
  options: RunCreatePackageEvaluationOptions,
  deps: CreatePackageEvaluationDeps = {},
): Promise<CreatePackageEvaluationResult> {
  const fingerprintPackage =
    deps.computeCreatePackageFingerprint ?? computeCreatePackageFingerprint;
  const packageFingerprint = fingerprintPackage(options.skillPath);

  // A blank or whitespace-only explicit name falls back to the inferred one.
  const explicitSkillName = options.skillName?.trim();
  const skillName = explicitSkillName
    ? explicitSkillName
    : inferSkillNameFromSkillPath(options.skillPath);

  // Cache tier 1: the canonical artifact stored for this skill.
  const readCanonicalArtifact =
    deps.readCanonicalPackageEvaluationArtifact ?? readCanonicalPackageEvaluationArtifact;
  const canonicalArtifact = readCanonicalArtifact(skillName);
  if (canReuseCachedPackageEvaluation(canonicalArtifact, options, packageFingerprint)) {
    return {
      ...canonicalArtifact,
      summary: { ...canonicalArtifact.summary, evaluation_source: "artifact_cache" },
    };
  }

  // Cache tier 2: an accepted candidate recorded under the same fingerprint.
  if (packageFingerprint) {
    const readCandidateArtifact =
      deps.readPackageCandidateArtifactByFingerprint ?? readPackageCandidateArtifactByFingerprint;
    const candidateArtifact = readCandidateArtifact(skillName, packageFingerprint, {
      acceptedOnly: true,
      db: deps.getDb?.(),
    });
    if (canReuseCachedPackageEvaluation(candidateArtifact, options, packageFingerprint)) {
      return {
        ...candidateArtifact,
        summary: { ...candidateArtifact.summary, evaluation_source: "candidate_cache" },
      };
    }
  }

  // No reusable evaluation: measure from scratch.
  const replayRunner = deps.runCreateReplay ?? runCreateReplay;
  const measuredReplay = await replayRunner({
    skillPath: options.skillPath,
    mode: options.mode ?? "package",
    agent: options.agent,
    evalSetPath: options.evalSetPath,
  });
  // Report under the resolved skill name even if the replay labelled it differently.
  const replay =
    measuredReplay.skill === skillName ? measuredReplay : { ...measuredReplay, skill: skillName };

  let routing: CreateReplayResult | undefined;
  try {
    routing = await replayRunner({
      skillPath: options.skillPath,
      mode: "routing",
      agent: replay.agent,
      evalSetPath: options.evalSetPath,
    });
    if (routing.skill !== skillName) {
      routing = { ...routing, skill: skillName };
    }
  } catch {
    // Fail-open: routing validation should enrich package reports when available.
  }

  const baselineRunner = deps.runCreateBaseline ?? runCreateBaseline;
  const measuredBaseline = await baselineRunner(
    {
      skillPath: options.skillPath,
      mode: "package",
      agent: options.agent,
      evalSetPath: options.evalSetPath,
      withSkillReplayResult: replay,
    },
    deps,
  );
  const baseline =
    measuredBaseline.skill_name === skillName
      ? measuredBaseline
      : { ...measuredBaseline, skill_name: skillName };

  const grading = buildGradingSummary(skillName, deps);

  let body: CreatePackageBodySummary | undefined;
  try {
    const loadSkillContent =
      deps.readSkillContent ?? ((path) => readFileSync(path, "utf-8"));
    const bodyContent = extractSkillBody(loadSkillContent(options.skillPath));
    const structural = validateBodyStructure(bodyContent);
    const assessQuality = deps.assessBodyQuality ?? assessBodyQuality;
    const quality = await assessQuality(bodyContent, replay.skill, replay.agent);
    const qualityPassed = quality.score >= BODY_QUALITY_THRESHOLD;
    body = {
      structural_valid: structural.valid,
      structural_reason: structural.reason,
      quality_score: quality.score,
      quality_reason: quality.reason,
      quality_threshold: BODY_QUALITY_THRESHOLD,
      quality_passed: qualityPassed,
      valid: structural.valid && qualityPassed,
    };
  } catch {
    // Fail-open: body validation should enrich package reports when available.
  }

  const unitTests = buildUnitTestSummary(skillName, deps);

  let evaluationResult: CreatePackageEvaluationResult = {
    summary: buildSummary(
      skillName,
      options.skillPath,
      replay,
      routing,
      baseline,
      grading,
      body,
      unitTests,
      packageFingerprint,
    ),
    replay,
    baseline,
  };

  try {
    const persistCandidate =
      deps.persistPackageCandidateEvaluation ?? persistPackageCandidateEvaluation;
    // Persistence may return an updated result; keep the unpersisted one if it throws.
    evaluationResult = persistCandidate(evaluationResult, deps.getDb?.());
  } catch {
    // Fail-open: candidate persistence should not block measurement.
  }

  try {
    const writeSummary = deps.writeCanonicalPackageEvaluation ?? writeCanonicalPackageEvaluation;
    writeSummary(skillName, evaluationResult.summary);
    const writeArtifact =
      deps.writeCanonicalPackageEvaluationArtifact ?? writeCanonicalPackageEvaluationArtifact;
    writeArtifact(skillName, evaluationResult);
  } catch {
    // Fail-open: evaluation artifacts should improve reuse, never block scoring.
  }

  return evaluationResult;
}
|