selftune 0.2.30 → 0.2.32
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +83 -56
- package/apps/local-dashboard/dist/assets/index-B-ut4w0B.js +15 -0
- package/apps/local-dashboard/dist/assets/index-BFGfCVrL.css +1 -0
- package/apps/local-dashboard/dist/assets/vendor-ui-DfowE3Hu.js +1 -0
- package/apps/local-dashboard/dist/index.html +3 -3
- package/cli/selftune/command-surface.ts +613 -2
- package/cli/selftune/create/baseline.ts +429 -0
- package/cli/selftune/create/check.ts +35 -0
- package/cli/selftune/create/init.ts +115 -0
- package/cli/selftune/create/package-candidate-state.ts +771 -0
- package/cli/selftune/create/package-evaluator.ts +710 -0
- package/cli/selftune/create/package-fingerprint.ts +142 -0
- package/cli/selftune/create/package-search.ts +377 -0
- package/cli/selftune/create/publish.ts +431 -0
- package/cli/selftune/create/readiness.ts +495 -0
- package/cli/selftune/create/replay.ts +330 -0
- package/cli/selftune/create/report.ts +74 -0
- package/cli/selftune/create/scaffold.ts +121 -0
- package/cli/selftune/create/skills-ref-adapter.ts +177 -0
- package/cli/selftune/create/status.ts +33 -0
- package/cli/selftune/create/templates.ts +249 -0
- package/cli/selftune/cron/setup.ts +1 -1
- package/cli/selftune/dashboard-action-events.ts +4 -1
- package/cli/selftune/dashboard-action-result.ts +789 -24
- package/cli/selftune/dashboard-action-stream.ts +80 -0
- package/cli/selftune/dashboard-contract.ts +146 -3
- package/cli/selftune/dashboard-server.ts +5 -4
- package/cli/selftune/eval/hooks-to-evals.ts +58 -35
- package/cli/selftune/eval/synthetic-evals.ts +145 -17
- package/cli/selftune/evolution/bounded-mutations.ts +1045 -0
- package/cli/selftune/evolution/evolve-body.ts +9 -36
- package/cli/selftune/evolution/evolve.ts +8 -72
- package/cli/selftune/evolution/stopping-criteria.ts +5 -13
- package/cli/selftune/evolution/unblock-suggestions.ts +0 -16
- package/cli/selftune/evolution/validate-host-replay.ts +115 -15
- package/cli/selftune/improve.ts +206 -0
- package/cli/selftune/index.ts +123 -6
- package/cli/selftune/init.ts +1 -1
- package/cli/selftune/localdb/queries/dashboard.ts +30 -0
- package/cli/selftune/localdb/schema.ts +52 -0
- package/cli/selftune/monitoring/watch.ts +257 -23
- package/cli/selftune/orchestrate/execute.ts +300 -1
- package/cli/selftune/orchestrate/finalize.ts +14 -0
- package/cli/selftune/orchestrate/plan.ts +22 -5
- package/cli/selftune/orchestrate/prepare.ts +59 -4
- package/cli/selftune/orchestrate/report.ts +1 -1
- package/cli/selftune/orchestrate.ts +34 -1
- package/cli/selftune/publish.ts +35 -0
- package/cli/selftune/registry/github-install.ts +256 -0
- package/cli/selftune/registry/index.ts +1 -1
- package/cli/selftune/registry/install.ts +58 -7
- package/cli/selftune/routes/actions.ts +81 -15
- package/cli/selftune/routes/overview.ts +1 -1
- package/cli/selftune/routes/skill-report.ts +147 -2
- package/cli/selftune/run.ts +18 -0
- package/cli/selftune/schedule.ts +3 -3
- package/cli/selftune/search-run.ts +703 -0
- package/cli/selftune/status.ts +35 -11
- package/cli/selftune/testing-readiness.ts +431 -40
- package/cli/selftune/types.ts +316 -0
- package/cli/selftune/utils/eval-readiness.ts +1 -0
- package/cli/selftune/utils/json-output.ts +11 -0
- package/cli/selftune/utils/lifecycle-surface.ts +48 -0
- package/cli/selftune/utils/query-filter.ts +82 -1
- package/cli/selftune/utils/tui.ts +85 -2
- package/cli/selftune/verify.ts +205 -0
- package/cli/selftune/workflows/proposals.ts +1 -1
- package/cli/selftune/workflows/skill-scaffold.ts +141 -63
- package/cli/selftune/workflows/workflows.ts +4 -4
- package/package.json +1 -1
- package/packages/dashboard-core/src/routes/manifest.ts +2 -2
- package/packages/ui/src/components/SkillReportPanels.tsx +7 -7
- package/packages/ui/src/primitives/button.tsx +5 -0
- package/skill/SKILL.md +148 -85
- package/skill/references/cli-quick-reference.md +16 -1
- package/skill/references/creator-playbook.md +31 -10
- package/skill/workflows/Baseline.md +8 -9
- package/skill/workflows/Contributions.md +4 -4
- package/skill/workflows/Create.md +173 -0
- package/skill/workflows/CreateTestDeploy.md +34 -30
- package/skill/workflows/Cron.md +2 -2
- package/skill/workflows/Dashboard.md +3 -3
- package/skill/workflows/Evals.md +13 -7
- package/skill/workflows/Evolve.md +75 -32
- package/skill/workflows/EvolveBody.md +22 -15
- package/skill/workflows/Hook.md +1 -1
- package/skill/workflows/Improve.md +168 -0
- package/skill/workflows/Initialize.md +3 -3
- package/skill/workflows/Orchestrate.md +49 -12
- package/skill/workflows/Publish.md +100 -0
- package/skill/workflows/Registry.md +19 -13
- package/skill/workflows/Run.md +72 -0
- package/skill/workflows/Schedule.md +2 -2
- package/skill/workflows/SearchRun.md +89 -0
- package/skill/workflows/SignalsDashboard.md +2 -2
- package/skill/workflows/UnitTest.md +13 -4
- package/skill/workflows/Verify.md +136 -0
- package/skill/workflows/Watch.md +114 -47
- package/skill/workflows/Workflows.md +13 -8
- package/apps/local-dashboard/dist/assets/index-BcXquWFB.css +0 -1
- package/apps/local-dashboard/dist/assets/index-Coq42hE4.js +0 -15
- package/apps/local-dashboard/dist/assets/vendor-ui-B0H8s1mP.js +0 -1
|
@@ -0,0 +1,771 @@
|
|
|
1
|
+
import type { Database } from "bun:sqlite";
|
|
2
|
+
|
|
3
|
+
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
|
|
4
|
+
import { join } from "node:path";
|
|
5
|
+
|
|
6
|
+
import { SELFTUNE_CONFIG_DIR } from "../constants.js";
|
|
7
|
+
import { getDb } from "../localdb/db.js";
|
|
8
|
+
import type {
|
|
9
|
+
CreatePackageBodySummary,
|
|
10
|
+
CreatePackageCandidateAcceptanceDecision,
|
|
11
|
+
CreatePackageCandidateAcceptanceSummary,
|
|
12
|
+
CreatePackageCandidateRecord,
|
|
13
|
+
CreatePackageEvaluationSummary,
|
|
14
|
+
} from "../types.js";
|
|
15
|
+
import type { CreatePackageEvaluationResult } from "./package-evaluator.js";
|
|
16
|
+
|
|
17
|
+
const PACKAGE_CANDIDATE_DIRNAME = "package-candidates";
|
|
18
|
+
const METRIC_EPSILON = 1e-9;
|
|
19
|
+
|
|
20
|
+
/**
 * Turns a skill name into a token that is safe to use as a single path
 * segment and as part of a candidate id.
 *
 * Every run of characters outside `[a-zA-Z0-9._-]` collapses into one `-`.
 * Because `.` is allowed, the results `""`, `"."` and `".."` would otherwise
 * survive and, once passed to `join`, resolve to the candidate root itself or
 * its parent directory. Those degenerate results are rewritten so the segment
 * always names a real child directory.
 *
 * @param skillName - Raw skill name.
 * @returns A non-empty segment that is never `.` or `..`.
 */
function sanitizeSkillName(skillName: string): string {
  const collapsed = skillName.replace(/[^a-zA-Z0-9._-]+/g, "-");
  if (collapsed === "" || collapsed === "." || collapsed === "..") {
    // "" -> "-", "." -> "-", ".." -> "--"
    return collapsed.replace(/\./g, "-") || "-";
  }
  return collapsed;
}
|
|
23
|
+
|
|
24
|
+
/**
 * Resolves the directory under which all package-candidate artifacts live.
 *
 * @param configDir - Config directory; defaults to the `SELFTUNE_CONFIG_DIR`
 *   environment variable when it is set and non-empty, otherwise to the
 *   `SELFTUNE_CONFIG_DIR` constant.
 * @returns `<configDir>/package-candidates`.
 */
function getPackageCandidateRoot(
  configDir: string = process.env.SELFTUNE_CONFIG_DIR || SELFTUNE_CONFIG_DIR,
) {
  const candidateRoot = join(configDir, PACKAGE_CANDIDATE_DIRNAME);
  return candidateRoot;
}
|
|
29
|
+
|
|
30
|
+
/**
 * Builds the on-disk location of a candidate's JSON artifact.
 *
 * @param skillName - Skill name; sanitized before being used as a directory.
 * @param candidateId - Candidate identifier; used verbatim as the file stem.
 * @param configDir - Config directory; defaults to the `SELFTUNE_CONFIG_DIR`
 *   environment variable when non-empty, otherwise to the constant.
 * @returns `<candidate root>/<sanitized skill>/<candidateId>.json`.
 */
export function getPackageCandidateArtifactPath(
  skillName: string,
  candidateId: string,
  configDir: string = process.env.SELFTUNE_CONFIG_DIR || SELFTUNE_CONFIG_DIR,
): string {
  const segments = [
    getPackageCandidateRoot(configDir),
    sanitizeSkillName(skillName),
    `${candidateId}.json`,
  ];
  return join(...segments);
}
|
|
41
|
+
|
|
42
|
+
/**
 * Best-effort accessor for the local database.
 *
 * @returns The handle from `getDb()`, or `null` when `getDb()` throws.
 */
function getOptionalDb(): Database | null {
  let handle: Database | null = null;
  try {
    handle = getDb();
  } catch {
    // Deliberately swallowed: callers treat a missing database as "no data".
  }
  return handle;
}
|
|
49
|
+
|
|
50
|
+
/**
 * Derives a candidate id from the skill name and package fingerprint.
 *
 * The fingerprint loses a leading `pkg_sha256_` prefix (if present), is cut to
 * its first 16 characters and right-padded with `0` to exactly 16 characters.
 *
 * @param skillName - Skill name; sanitized before being embedded in the id.
 * @param packageFingerprint - Package fingerprint string.
 * @returns `pkgcand_<sanitized skill>_<16-char fingerprint suffix>`.
 */
function buildCandidateId(skillName: string, packageFingerprint: string): string {
  const withoutPrefix = packageFingerprint.replace(/^pkg_sha256_/, "");
  const suffix = withoutPrefix.slice(0, 16).padEnd(16, "0");
  return ["pkgcand", sanitizeSkillName(skillName), suffix].join("_");
}
|
|
57
|
+
|
|
58
|
+
/**
 * Loads a candidate evaluation artifact from disk and shallowly validates it.
 *
 * The file must parse as a JSON object whose `summary`, `replay` and
 * `baseline` members are objects, with `summary.skill_name` and
 * `summary.status` strings, `summary.evaluation_passed` a boolean, and
 * `replay.skill` and `baseline.skill_name` strings. Nothing deeper is checked.
 *
 * @param path - Artifact file path.
 * @returns The parsed artifact, or `null` when the file is missing, cannot be
 *   read or parsed, or fails the shape check.
 */
function readCandidateArtifact(path: string): CreatePackageEvaluationResult | null {
  const isObjectLike = (value: unknown): value is Record<string, unknown> =>
    value !== null && typeof value === "object";

  try {
    if (!existsSync(path)) return null;

    const decoded: unknown = JSON.parse(readFileSync(path, "utf-8"));
    if (!isObjectLike(decoded)) return null;

    const { summary, replay, baseline } = decoded;
    if (!isObjectLike(summary) || !isObjectLike(replay) || !isObjectLike(baseline)) {
      return null;
    }

    const hasRequiredFields =
      typeof summary.skill_name === "string" &&
      typeof summary.status === "string" &&
      typeof summary.evaluation_passed === "boolean" &&
      typeof replay.skill === "string" &&
      typeof baseline.skill_name === "string";

    return hasRequiredFields ? (decoded as CreatePackageEvaluationResult) : null;
  } catch {
    // Read or parse failures are reported the same way as a missing file.
    return null;
  }
}
|
|
90
|
+
|
|
91
|
+
/**
 * Loads the artifact that belongs to a candidate record.
 *
 * The path stored on the record is tried first. If it is absent or does not
 * yield a valid artifact, the conventional path computed from the record's
 * skill name and candidate id is tried instead.
 *
 * @param record - Candidate record whose artifact should be read.
 * @returns The artifact, or `null` when neither location holds a valid one.
 */
function readCandidateArtifactForRecord(
  record: CreatePackageCandidateRecord,
): CreatePackageEvaluationResult | null {
  const stored = record.artifact_path ? readCandidateArtifact(record.artifact_path) : null;
  if (stored) return stored;

  const conventionalPath = getPackageCandidateArtifactPath(
    record.skill_name,
    record.candidate_id,
  );
  return readCandidateArtifact(conventionalPath);
}
|
|
102
|
+
|
|
103
|
+
/**
 * Shape of one row selected from the `package_candidates` table.
 *
 * Every column maps 1:1 onto a `CreatePackageCandidateRecord` field, except
 * `summary_json`, which holds the JSON-serialized evaluation summary and is
 * parsed by `hydrateCandidateRow`.
 */
interface CandidateRow {
  candidate_id: string;
  skill_name: string;
  skill_path: string;
  package_fingerprint: string;
  parent_candidate_id: string | null;
  candidate_generation: number;
  evaluation_count: number;
  first_evaluated_at: string;
  last_evaluated_at: string;
  latest_status: CreatePackageEvaluationSummary["status"];
  latest_evaluation_source: CreatePackageEvaluationSummary["evaluation_source"] | null;
  latest_acceptance_decision: CreatePackageCandidateAcceptanceDecision | null;
  artifact_path: string | null;
  /** JSON text of the evaluation summary. */
  summary_json: string;
}
|
|
119
|
+
|
|
120
|
+
/** Outcome of comparing one metric of a candidate against a reference. */
interface MetricComparison {
  /** Candidate value minus reference value; `null` when either is missing. */
  delta: number | null;
  /** True when the candidate is measurably better than the reference. */
  improved: boolean;
  /** True when the candidate is measurably worse than the reference. */
  regressed: boolean;
}
|
|
125
|
+
|
|
126
|
+
/**
 * Converts a raw `package_candidates` row into a candidate record by parsing
 * its `summary_json` column.
 *
 * The parsed summary must be a plain JSON object. Valid JSON that is not an
 * object (`null`, a number, a string, an array) is rejected, because the
 * frontier comparators dereference `record.summary` unconditionally and would
 * throw on such a value.
 *
 * @param row - Row as selected from the database.
 * @returns The hydrated record, or `null` when `summary_json` is not valid
 *   JSON or does not decode to an object.
 */
function hydrateCandidateRow(row: CandidateRow): CreatePackageCandidateRecord | null {
  let parsedSummary: unknown;
  try {
    parsedSummary = JSON.parse(row.summary_json);
  } catch {
    // Corrupt rows are skipped rather than failing the whole listing.
    return null;
  }

  if (
    typeof parsedSummary !== "object" ||
    parsedSummary === null ||
    Array.isArray(parsedSummary)
  ) {
    return null;
  }

  return {
    candidate_id: row.candidate_id,
    skill_name: row.skill_name,
    skill_path: row.skill_path,
    package_fingerprint: row.package_fingerprint,
    parent_candidate_id: row.parent_candidate_id,
    candidate_generation: row.candidate_generation,
    evaluation_count: row.evaluation_count,
    first_evaluated_at: row.first_evaluated_at,
    last_evaluated_at: row.last_evaluated_at,
    latest_status: row.latest_status,
    latest_evaluation_source: row.latest_evaluation_source,
    latest_acceptance_decision: row.latest_acceptance_decision,
    artifact_path: row.artifact_path,
    summary: parsedSummary as CreatePackageEvaluationSummary,
  };
}
|
|
149
|
+
|
|
150
|
+
/**
 * Compares a numeric metric against its reference value.
 *
 * A difference only counts as an improvement or a regression when its
 * magnitude exceeds `METRIC_EPSILON`.
 *
 * @param current - Candidate value.
 * @param baseline - Reference value.
 * @returns `delta = current - baseline` with improved/regressed flags, or a
 *   neutral result (`delta: null`, both flags false) when either input is
 *   `null` or `undefined`.
 */
function compareMetric(current: number | null | undefined, baseline: number | null | undefined) {
  if (current != null && baseline != null) {
    const delta = current - baseline;
    return {
      delta,
      improved: delta > METRIC_EPSILON,
      regressed: delta < -METRIC_EPSILON,
    } satisfies MetricComparison;
  }

  return {
    delta: null,
    improved: false,
    regressed: false,
  } satisfies MetricComparison;
}
|
|
165
|
+
|
|
166
|
+
/**
 * Compares body summaries of a candidate and its reference.
 *
 * A change in validity outweighs the quality score: becoming valid is an
 * improvement with delta `1`, becoming invalid is a regression with delta
 * `-1`. When validity is unchanged, the quality scores are compared with
 * `compareMetric`.
 *
 * @param current - Candidate body summary, if any.
 * @param baseline - Reference body summary, if any.
 * @returns The comparison; neutral (`delta: null`) when either side is absent.
 */
function compareBodyMetric(
  current: CreatePackageBodySummary | undefined,
  baseline: CreatePackageBodySummary | undefined,
): MetricComparison {
  if (!current || !baseline) {
    return { delta: null, improved: false, regressed: false };
  }

  if (current.valid === baseline.valid) {
    return compareMetric(current.quality_score, baseline.quality_score);
  }

  // Validity flipped between the two summaries.
  return current.valid
    ? { delta: 1, improved: true, regressed: false }
    : { delta: -1, improved: false, regressed: true };
}
|
|
187
|
+
|
|
188
|
+
/**
 * Renders a signed delta for rationale text.
 *
 * @param delta - Difference to render.
 * @param percent - When true (default) the delta is scaled by 100 and printed
 *   with one decimal and a `%` suffix; otherwise it is printed with three
 *   decimals.
 * @returns The formatted delta; non-negative values get a leading `+`.
 */
function formatDelta(delta: number, percent: boolean = true): string {
  const sign = delta >= 0 ? "+" : "";
  const magnitude = percent ? `${(delta * 100).toFixed(1)}%` : delta.toFixed(3);
  return `${sign}${magnitude}`;
}
|
|
192
|
+
|
|
193
|
+
/**
 * Sort comparator that orders larger numbers first and missing values last.
 *
 * @param left - First value; `null`/`undefined` counts as missing.
 * @param right - Second value; `null`/`undefined` counts as missing.
 * @returns `0` when both are missing, `1` when only `left` is missing, `-1`
 *   when only `right` is missing, otherwise `right - left`.
 */
function compareOptionalNumbersDesc(
  left: number | null | undefined,
  right: number | null | undefined,
): number {
  if (left == null) {
    return right == null ? 0 : 1;
  }
  if (right == null) {
    return -1;
  }
  return right - left;
}
|
|
202
|
+
|
|
203
|
+
/**
 * Sort comparator that orders smaller numbers first and missing values last.
 *
 * @param left - First value; `null`/`undefined` counts as missing.
 * @param right - Second value; `null`/`undefined` counts as missing.
 * @returns `0` when both are missing, `1` when only `left` is missing, `-1`
 *   when only `right` is missing, otherwise `left - right`.
 */
function compareOptionalNumbersAsc(
  left: number | null | undefined,
  right: number | null | undefined,
): number {
  if (left == null) {
    return right == null ? 0 : 1;
  }
  if (right == null) {
    return -1;
  }
  return left - right;
}
|
|
212
|
+
|
|
213
|
+
/**
 * Ranks a summary by its watch outcome (higher is better).
 *
 * @param summary - Evaluation summary to rank.
 * @returns `2` when watch data exists and shows no problem, `1` when there is
 *   no watch data, and `0` when the watch was rolled back or carries an alert,
 *   a grade regression or an efficiency regression.
 */
function acceptedFrontierWatchRank(summary: CreatePackageEvaluationSummary): number {
  const watch = summary.watch;
  if (!watch) return 1;

  const flagged =
    watch.rolled_back ||
    watch.alert != null ||
    watch.grade_regression != null ||
    watch.efficiency_regression != null;

  return flagged ? 0 : 2;
}
|
|
226
|
+
|
|
227
|
+
/**
 * Ranks a summary by its grading outcome (higher is better).
 *
 * @param summary - Evaluation summary to rank.
 * @returns `2` when grading explicitly reports no regression, `0` when it
 *   reports a regression, and `1` when grading is absent or undecided.
 */
function acceptedFrontierGradingRank(summary: CreatePackageEvaluationSummary): number {
  const regressed = summary.grading?.regressed;
  if (regressed === true) return 0;
  if (regressed === false) return 2;
  return 1;
}
|
|
234
|
+
|
|
235
|
+
/**
 * Ranks a summary by body validity.
 *
 * @param summary - Evaluation summary to rank.
 * @returns `1` when a body summary exists and is valid, otherwise `0`.
 */
function acceptedFrontierBodyRank(summary: CreatePackageEvaluationSummary): number {
  if (summary.body?.valid) {
    return 1;
  }
  return 0;
}
|
|
238
|
+
|
|
239
|
+
/**
 * Sort comparator that puts the strongest accepted candidate first.
 *
 * Criteria are applied in priority order and the first non-zero result wins:
 * watch rank, grading rank, grading pass-rate delta, recent average grading
 * pass rate, replay pass rate, routing pass rate, baseline lift, unit-test
 * pass rate, body validity, body quality score (all higher-is-better), then
 * with-skill average duration, total cost and total turns (lower-is-better),
 * then higher generation, and finally the more recent `last_evaluated_at`.
 *
 * @param left - First candidate.
 * @param right - Second candidate.
 * @returns Negative when `left` sorts first, positive when `right` does, `0`
 *   when every criterion ties.
 */
function compareAcceptedFrontierCandidates(
  left: CreatePackageCandidateRecord,
  right: CreatePackageCandidateRecord,
): number {
  const l = left.summary;
  const r = right.summary;

  // All criteria are evaluated up front, in priority order.
  const verdicts: number[] = [
    compareOptionalNumbersDesc(acceptedFrontierWatchRank(l), acceptedFrontierWatchRank(r)),
    compareOptionalNumbersDesc(acceptedFrontierGradingRank(l), acceptedFrontierGradingRank(r)),
    compareOptionalNumbersDesc(l.grading?.pass_rate_delta, r.grading?.pass_rate_delta),
    compareOptionalNumbersDesc(
      l.grading?.recent?.average_pass_rate,
      r.grading?.recent?.average_pass_rate,
    ),
    compareOptionalNumbersDesc(l.replay.pass_rate, r.replay.pass_rate),
    compareOptionalNumbersDesc(l.routing?.pass_rate, r.routing?.pass_rate),
    compareOptionalNumbersDesc(l.baseline.lift, r.baseline.lift),
    compareOptionalNumbersDesc(l.unit_tests?.pass_rate, r.unit_tests?.pass_rate),
    compareOptionalNumbersDesc(acceptedFrontierBodyRank(l), acceptedFrontierBodyRank(r)),
    compareOptionalNumbersDesc(l.body?.quality_score, r.body?.quality_score),
    compareOptionalNumbersAsc(
      l.efficiency?.with_skill.avg_duration_ms,
      r.efficiency?.with_skill.avg_duration_ms,
    ),
    compareOptionalNumbersAsc(
      l.efficiency?.with_skill.total_cost_usd,
      r.efficiency?.with_skill.total_cost_usd,
    ),
    compareOptionalNumbersAsc(
      l.efficiency?.with_skill.total_turns,
      r.efficiency?.with_skill.total_turns,
    ),
    compareOptionalNumbersDesc(left.candidate_generation, right.candidate_generation),
    right.last_evaluated_at.localeCompare(left.last_evaluated_at),
  ];

  for (const verdict of verdicts) {
    if (verdict !== 0) return verdict;
  }
  return 0;
}
|
|
290
|
+
|
|
291
|
+
/**
 * Decides whether a freshly evaluated candidate is accepted relative to a
 * comparison candidate, and records why.
 *
 * - No comparison id: the candidate is the `"root"`.
 * - Comparison id given but its record is unavailable: `"rejected"`.
 * - Otherwise replay, routing, baseline lift, body quality and unit-test
 *   metrics are compared. Any regression rejects the candidate; with no
 *   regression, at least one improvement accepts it; with neither it is
 *   rejected.
 *
 * @param summary - Summary of the candidate being judged.
 * @param parentCandidateId - Id of the candidate to compare against, if any.
 * @param parent - Loaded record for `parentCandidateId`, or `null`.
 * @param decidedAt - Timestamp recorded on the decision.
 * @returns The acceptance summary, including per-metric deltas.
 */
function buildCandidateAcceptanceSummary(
  summary: CreatePackageEvaluationSummary,
  parentCandidateId: string | null,
  parent: CreatePackageCandidateRecord | null,
  decidedAt: string,
): CreatePackageCandidateAcceptanceSummary {
  // Shared shape for the outcomes that are decided without measuring anything.
  const withoutMeasurements = (
    decision: CreatePackageCandidateAcceptanceDecision,
    comparedTo: string | null,
    rationale: string,
  ): CreatePackageCandidateAcceptanceSummary => ({
    decision,
    compared_to_candidate_id: comparedTo,
    decided_at: decidedAt,
    rationale,
    replay_pass_rate_delta: null,
    routing_pass_rate_delta: null,
    baseline_lift_delta: null,
    body_quality_delta: null,
    unit_test_pass_rate_delta: null,
  });

  if (!parentCandidateId) {
    return withoutMeasurements(
      "root",
      null,
      "Initial measured package candidate for this skill.",
    );
  }

  if (!parent) {
    return withoutMeasurements(
      "rejected",
      parentCandidateId,
      "Parent candidate evidence is unavailable, so measured acceptance could not be determined.",
    );
  }

  const reference = parent.summary;
  const replay = compareMetric(summary.replay.pass_rate, reference.replay.pass_rate);
  const routing = compareMetric(summary.routing?.pass_rate, reference.routing?.pass_rate);
  const baselineLift = compareMetric(summary.baseline.lift, reference.baseline.lift);
  const bodyQuality = compareBodyMetric(summary.body, reference.body);
  const unitTests = compareMetric(
    summary.unit_tests?.pass_rate,
    reference.unit_tests?.pass_rate,
  );

  // [label, comparison, render as percent] in the order they appear in the rationale.
  const tracked: Array<[string, MetricComparison, boolean]> = [
    ["replay", replay, true],
    ["routing", routing, true],
    ["baseline lift", baselineLift, false],
    ["body quality", bodyQuality, false],
    ["unit tests", unitTests, true],
  ];

  const regressions: string[] = [];
  const improvements: string[] = [];
  for (const [label, comparison, percent] of tracked) {
    if (comparison.delta == null) continue;
    // A regression takes precedence over an improvement for the same metric.
    const bucket = comparison.regressed
      ? regressions
      : comparison.improved
        ? improvements
        : null;
    bucket?.push(`${label} ${formatDelta(comparison.delta, percent)}`);
  }

  const hasRegression = regressions.length > 0;
  const hasImprovement = improvements.length > 0;
  const decision: CreatePackageCandidateAcceptanceDecision =
    !hasRegression && hasImprovement ? "accepted" : "rejected";
  const rationale = hasRegression
    ? `Measured regressions vs parent: ${regressions.join(", ")}.`
    : hasImprovement
      ? `Measured improvement vs parent: ${improvements.join(", ")}.`
      : "No measured improvement over the parent candidate.";

  return {
    decision,
    compared_to_candidate_id: parent.candidate_id,
    decided_at: decidedAt,
    rationale,
    replay_pass_rate_delta: replay.delta,
    routing_pass_rate_delta: routing.delta,
    baseline_lift_delta: baselineLift.delta,
    body_quality_delta: bodyQuality.delta,
    unit_test_pass_rate_delta: unitTests.delta,
  };
}
|
|
380
|
+
|
|
381
|
+
/**
 * Inserts a candidate record, or updates the existing row that has the same
 * `candidate_id`.
 *
 * On conflict, `skill_name`, `package_fingerprint` and `first_evaluated_at`
 * keep their stored values; every other column is overwritten.
 *
 * @param db - Open database handle.
 * @param record - Record to persist; `record.summary` is stored as JSON text.
 */
function upsertCandidateRecord(db: Database, record: CreatePackageCandidateRecord): void {
  const upsertSql = `INSERT INTO package_candidates (
        candidate_id,
        skill_name,
        skill_path,
        package_fingerprint,
        parent_candidate_id,
        candidate_generation,
        evaluation_count,
        first_evaluated_at,
        last_evaluated_at,
        latest_status,
        latest_evaluation_source,
        latest_acceptance_decision,
        artifact_path,
        summary_json
      ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
      ON CONFLICT(candidate_id) DO UPDATE SET
        skill_path = excluded.skill_path,
        parent_candidate_id = excluded.parent_candidate_id,
        candidate_generation = excluded.candidate_generation,
        evaluation_count = excluded.evaluation_count,
        last_evaluated_at = excluded.last_evaluated_at,
        latest_status = excluded.latest_status,
        latest_evaluation_source = excluded.latest_evaluation_source,
        latest_acceptance_decision = excluded.latest_acceptance_decision,
        artifact_path = excluded.artifact_path,
        summary_json = excluded.summary_json`;

  // Positional bindings, in the same order as the column list above.
  const serializedSummary = JSON.stringify(record.summary);
  db.run(upsertSql, [
    record.candidate_id,
    record.skill_name,
    record.skill_path,
    record.package_fingerprint,
    record.parent_candidate_id,
    record.candidate_generation,
    record.evaluation_count,
    record.first_evaluated_at,
    record.last_evaluated_at,
    record.latest_status,
    record.latest_evaluation_source,
    record.latest_acceptance_decision,
    record.artifact_path,
    serializedSummary,
  ]);
}
|
|
428
|
+
|
|
429
|
+
/**
 * Looks up the candidate stored for a skill and package fingerprint.
 *
 * @param db - Open database handle.
 * @param skillName - Skill name to match exactly.
 * @param packageFingerprint - Package fingerprint to match exactly.
 * @returns The hydrated record, or `null` when no row matches or the row's
 *   summary cannot be hydrated.
 */
function readExistingCandidate(
  db: Database,
  skillName: string,
  packageFingerprint: string,
): CreatePackageCandidateRecord | null {
  const lookupSql = `SELECT
          candidate_id,
          skill_name,
          skill_path,
          package_fingerprint,
          parent_candidate_id,
          candidate_generation,
          evaluation_count,
          first_evaluated_at,
          last_evaluated_at,
          latest_status,
          latest_evaluation_source,
          latest_acceptance_decision,
          artifact_path,
          summary_json
        FROM package_candidates
        WHERE skill_name = ? AND package_fingerprint = ?`;

  const match = db.query(lookupSql).get(skillName, packageFingerprint) as CandidateRow | null;
  if (!match) return null;
  return hydrateCandidateRow(match);
}
|
|
458
|
+
|
|
459
|
+
/**
 * Looks up a candidate by its id.
 *
 * @param db - Open database handle.
 * @param candidateId - Candidate id to match exactly.
 * @returns The hydrated record, or `null` when no row matches or the row's
 *   summary cannot be hydrated.
 */
function readCandidateById(db: Database, candidateId: string): CreatePackageCandidateRecord | null {
  const lookupSql = `SELECT
          candidate_id,
          skill_name,
          skill_path,
          package_fingerprint,
          parent_candidate_id,
          candidate_generation,
          evaluation_count,
          first_evaluated_at,
          last_evaluated_at,
          latest_status,
          latest_evaluation_source,
          latest_acceptance_decision,
          artifact_path,
          summary_json
        FROM package_candidates
        WHERE candidate_id = ?`;

  const match = db.query(lookupSql).get(candidateId) as CandidateRow | null;
  if (!match) return null;
  return hydrateCandidateRow(match);
}
|
|
483
|
+
|
|
484
|
+
/**
 * Fetches the most recently evaluated candidate of a skill.
 *
 * Rows are ordered by `last_evaluated_at` descending, with ties broken by the
 * higher `candidate_generation`; only the first row is read.
 *
 * @param db - Open database handle.
 * @param skillName - Skill name to match exactly.
 * @returns The hydrated record, or `null` when the skill has no candidates or
 *   the selected row's summary cannot be hydrated.
 */
function readLatestCandidateForSkill(
  db: Database,
  skillName: string,
): CreatePackageCandidateRecord | null {
  const latestSql = `SELECT
          candidate_id,
          skill_name,
          skill_path,
          package_fingerprint,
          parent_candidate_id,
          candidate_generation,
          evaluation_count,
          first_evaluated_at,
          last_evaluated_at,
          latest_status,
          latest_evaluation_source,
          latest_acceptance_decision,
          artifact_path,
          summary_json
        FROM package_candidates
        WHERE skill_name = ?
        ORDER BY last_evaluated_at DESC, candidate_generation DESC
        LIMIT 1`;

  const newest = db.query(latestSql).get(skillName) as CandidateRow | null;
  if (!newest) return null;
  return hydrateCandidateRow(newest);
}
|
|
513
|
+
|
|
514
|
+
/**
 * Loads a skill's accepted candidates (decision `root` or `accepted`) ranked
 * best-first by `compareAcceptedFrontierCandidates`.
 *
 * Rows whose summary cannot be hydrated are dropped. The query returns rows
 * by generation and then first evaluation time, which is the order kept
 * between candidates that the comparator considers equal.
 *
 * @param db - Open database handle.
 * @param skillName - Skill name to match exactly.
 * @param excludeCandidateId - Candidate id to leave out; `null` excludes none.
 * @returns Ranked candidate records; empty when none qualify.
 */
function readAcceptedPackageFrontierCandidatesForSkill(
  db: Database,
  skillName: string,
  excludeCandidateId: string | null = null,
): CreatePackageCandidateRecord[] {
  const frontierSql = `SELECT
          candidate_id,
          skill_name,
          skill_path,
          package_fingerprint,
          parent_candidate_id,
          candidate_generation,
          evaluation_count,
          first_evaluated_at,
          last_evaluated_at,
          latest_status,
          latest_evaluation_source,
          latest_acceptance_decision,
          artifact_path,
          summary_json
        FROM package_candidates
        WHERE skill_name = ?
          AND latest_acceptance_decision IN ('root', 'accepted')
          AND (? IS NULL OR candidate_id != ?)
        ORDER BY candidate_generation ASC, first_evaluated_at ASC`;

  // The exclusion id is bound twice: once for the NULL test, once for the comparison.
  const fetched = db
    .query(frontierSql)
    .all(skillName, excludeCandidateId, excludeCandidateId) as CandidateRow[];

  const usable: CreatePackageCandidateRecord[] = [];
  for (const row of fetched) {
    const record = hydrateCandidateRow(row);
    if (record) usable.push(record);
  }
  return usable.toSorted(compareAcceptedFrontierCandidates);
}
|
|
550
|
+
|
|
551
|
+
/**
 * Lists every stored candidate of a skill, oldest generation first.
 *
 * Rows are ordered by `candidate_generation`, then `first_evaluated_at`, both
 * ascending. Rows whose summary cannot be hydrated are dropped.
 *
 * @param skillName - Skill name to match exactly.
 * @param db - Database handle; defaults to `getOptionalDb()`. Passing `null`
 *   yields an empty list.
 * @returns Candidate records in storage order; empty when no database is
 *   available.
 */
export function listPackageCandidates(
  skillName: string,
  db: Database | null = getOptionalDb(),
): CreatePackageCandidateRecord[] {
  if (!db) return [];

  const listSql = `SELECT
          candidate_id,
          skill_name,
          skill_path,
          package_fingerprint,
          parent_candidate_id,
          candidate_generation,
          evaluation_count,
          first_evaluated_at,
          last_evaluated_at,
          latest_status,
          latest_evaluation_source,
          latest_acceptance_decision,
          artifact_path,
          summary_json
        FROM package_candidates
        WHERE skill_name = ?
        ORDER BY candidate_generation ASC, first_evaluated_at ASC`;

  const fetched = db.query(listSql).all(skillName) as CandidateRow[];

  const candidates: CreatePackageCandidateRecord[] = [];
  for (const row of fetched) {
    const record = hydrateCandidateRow(row);
    if (record) candidates.push(record);
  }
  return candidates;
}
|
|
584
|
+
|
|
585
|
+
/**
 * Lists a skill's candidates whose latest acceptance decision is `root` or
 * `accepted`, in the order returned by `listPackageCandidates`.
 *
 * @param skillName - Skill name to match exactly.
 * @param db - Database handle; defaults to `getOptionalDb()`.
 * @returns The accepted candidate records.
 */
export function listAcceptedPackageCandidates(
  skillName: string,
  db: Database | null = getOptionalDb(),
): CreatePackageCandidateRecord[] {
  const isAccepted = (candidate: CreatePackageCandidateRecord): boolean => {
    const decision = candidate.latest_acceptance_decision;
    return decision === "root" || decision === "accepted";
  };

  const everyCandidate = listPackageCandidates(skillName, db);
  return everyCandidate.filter(isAccepted);
}
|
|
595
|
+
|
|
596
|
+
/**
 * Lists a skill's accepted candidates ranked best-first.
 *
 * @param skillName - Skill name to match exactly.
 * @param db - Database handle; defaults to `getOptionalDb()`.
 * @returns Ranked candidate records; empty when no database is available.
 */
export function listAcceptedPackageFrontierCandidates(
  skillName: string,
  db: Database | null = getOptionalDb(),
): CreatePackageCandidateRecord[] {
  return db ? readAcceptedPackageFrontierCandidatesForSkill(db, skillName) : [];
}
|
|
603
|
+
|
|
604
|
+
/**
 * Picks the top-ranked accepted candidate of a skill.
 *
 * @param skillName - Skill name to match exactly.
 * @param options - `excludeCandidateId` leaves one candidate out of the
 *   ranking. `db` supplies the database handle; when it is `null` or omitted,
 *   `getOptionalDb()` is used instead.
 * @returns The best candidate, or `null` when no database is available or no
 *   accepted candidate remains.
 */
export function selectAcceptedPackageFrontierCandidate(
  skillName: string,
  options: {
    excludeCandidateId?: string | null;
    db?: Database | null;
  } = {},
): CreatePackageCandidateRecord | null {
  const database = options.db ?? getOptionalDb();
  if (!database) return null;

  const excluded = options.excludeCandidateId ?? null;
  const ranked = readAcceptedPackageFrontierCandidatesForSkill(database, skillName, excluded);
  return ranked[0] ?? null;
}
|
|
621
|
+
|
|
622
|
+
/**
 * Reads the stored evaluation artifact of the candidate that matches a skill
 * and package fingerprint.
 *
 * @param skillName - Skill name to match exactly.
 * @param packageFingerprint - Package fingerprint to match exactly.
 * @param options - With `acceptedOnly`, candidates whose latest decision is
 *   neither `root` nor `accepted` are ignored. `db` supplies the database
 *   handle; when it is `null` or omitted, `getOptionalDb()` is used instead.
 * @returns The artifact, or `null` when no database is available, no candidate
 *   matches, the candidate is filtered out, or no valid artifact is on disk.
 */
export function readPackageCandidateArtifactByFingerprint(
  skillName: string,
  packageFingerprint: string,
  options: {
    acceptedOnly?: boolean;
    db?: Database | null;
  } = {},
): CreatePackageEvaluationResult | null {
  const database = options.db ?? getOptionalDb();
  if (!database) return null;

  const record = readExistingCandidate(database, skillName, packageFingerprint);
  if (!record) return null;

  if (options.acceptedOnly) {
    const decision = record.latest_acceptance_decision;
    const accepted = decision === "root" || decision === "accepted";
    if (!accepted) return null;
  }

  return readCandidateArtifactForRecord(record);
}
|
|
645
|
+
|
|
646
|
+
/**
 * Persists one evaluation of a package candidate and returns the evaluation
 * enriched with candidate lineage and acceptance information.
 *
 * The evaluation is returned untouched when its summary carries no
 * `package_fingerprint`. Otherwise the enriched evaluation is written as
 * pretty-printed JSON to the candidate artifact path and, when a database is
 * available, a candidate record is upserted as well.
 *
 * Lineage rules:
 * - A candidate already stored for this skill + fingerprint keeps its
 *   `candidate_id`, `parent_candidate_id` and `candidate_generation`.
 * - A new candidate is parented on the latest candidate recorded for the skill
 *   (or `null`) and gets that candidate's generation + 1 (0 when there is none).
 *
 * @param evaluation - Evaluation result to persist.
 * @param db - Database handle, or `null` to write only the artifact file.
 * @returns The enriched evaluation, or the input when there is no fingerprint.
 */
export function persistPackageCandidateEvaluation(
  evaluation: CreatePackageEvaluationResult,
  db: Database | null = getOptionalDb(),
): CreatePackageEvaluationResult {
  const incoming = evaluation.summary;
  const packageFingerprint = incoming.package_fingerprint;
  if (!packageFingerprint) {
    return evaluation;
  }

  const now = new Date().toISOString();
  const skillName = incoming.skill_name;

  // Both lookups run whenever a database is present, even if `existing` is found.
  const existing = db ? readExistingCandidate(db, skillName, packageFingerprint) : null;
  const latestForSkill = db ? readLatestCandidateForSkill(db, skillName) : null;

  // An existing record keeps its stored parent, even when that parent is null.
  const parentCandidateId = existing
    ? existing.parent_candidate_id
    : (latestForSkill?.candidate_id ?? null);
  const nextGeneration = (latestForSkill?.candidate_generation ?? -1) + 1;
  const candidateGeneration = existing?.candidate_generation ?? nextGeneration;
  const candidateId = existing?.candidate_id ?? buildCandidateId(skillName, packageFingerprint);

  // Fallback used only when the existing record has no remembered comparison
  // target: the accepted frontier candidate (excluding this one), else the parent.
  const pickComparisonCandidateId = () => {
    if (!db) return parentCandidateId;
    const frontier = selectAcceptedPackageFrontierCandidate(skillName, {
      excludeCandidateId: existing?.candidate_id ?? null,
      db,
    });
    return frontier?.candidate_id ?? parentCandidateId;
  };
  const comparisonCandidateId =
    existing?.summary.candidate_acceptance?.compared_to_candidate_id ??
    pickComparisonCandidateId();
  const comparisonCandidate =
    db && comparisonCandidateId ? readCandidateById(db, comparisonCandidateId) : null;

  const summaryWithCandidate: CreatePackageEvaluationSummary = {
    ...incoming,
    candidate_id: candidateId,
    parent_candidate_id: parentCandidateId,
    candidate_generation: candidateGeneration,
  };
  const summary: CreatePackageEvaluationSummary = {
    ...summaryWithCandidate,
    candidate_acceptance: buildCandidateAcceptanceSummary(
      summaryWithCandidate,
      comparisonCandidateId,
      comparisonCandidate,
      now,
    ),
  };
  const enrichedEvaluation: CreatePackageEvaluationResult = { ...evaluation, summary };

  // The artifact file is written regardless of whether a database is present.
  const artifactPath = getPackageCandidateArtifactPath(skillName, candidateId);
  const skillDirectory = join(getPackageCandidateRoot(), sanitizeSkillName(skillName));
  mkdirSync(skillDirectory, { recursive: true });
  writeFileSync(artifactPath, JSON.stringify(enrichedEvaluation, null, 2), "utf-8");

  if (!db) {
    return enrichedEvaluation;
  }

  const record: CreatePackageCandidateRecord = {
    candidate_id: candidateId,
    skill_name: skillName,
    skill_path: incoming.skill_path,
    package_fingerprint: packageFingerprint,
    parent_candidate_id: parentCandidateId,
    candidate_generation: candidateGeneration,
    evaluation_count: (existing?.evaluation_count ?? 0) + 1,
    first_evaluated_at: existing?.first_evaluated_at ?? now,
    last_evaluated_at: now,
    latest_status: summary.status,
    latest_evaluation_source: summary.evaluation_source ?? null,
    latest_acceptance_decision: summary.candidate_acceptance?.decision ?? null,
    artifact_path: artifactPath,
    summary,
  };
  upsertCandidateRecord(db, record);

  return enrichedEvaluation;
}
|
|
724
|
+
|
|
725
|
+
/**
 * Overwrites the stored artifact and database record of an already-known
 * candidate with the given evaluation, without touching its lineage or
 * evaluation counters.
 *
 * Nothing is written (and the input is returned as-is) when there is no
 * database, when the summary has neither `candidate_id` nor
 * `package_fingerprint`, or when no matching candidate record exists.
 *
 * The record is looked up by `candidate_id` when the summary has one, and by
 * skill name + fingerprint only when it does not. A miss on the id lookup does
 * not fall back to the fingerprint.
 *
 * @param evaluation - Evaluation whose observation should replace the stored one.
 * @param db - Database handle, or `null` to make this a no-op.
 * @returns The same `evaluation` object that was passed in.
 */
export function refreshPackageCandidateEvaluationObservation(
  evaluation: CreatePackageEvaluationResult,
  db: Database | null = getOptionalDb(),
): CreatePackageEvaluationResult {
  const observed = evaluation.summary;
  const candidateId = observed.candidate_id;
  const packageFingerprint = observed.package_fingerprint;

  // Without a database or any identifying key there is no record to refresh.
  if (!db || (!candidateId && !packageFingerprint)) {
    return evaluation;
  }

  const existing = candidateId
    ? readCandidateById(db, candidateId)
    : packageFingerprint
      ? readExistingCandidate(db, observed.skill_name, packageFingerprint)
      : null;
  if (!existing) {
    return evaluation;
  }

  // Prefer the path already stored on the record; derive it only when absent.
  const artifactPath =
    existing.artifact_path ??
    getPackageCandidateArtifactPath(existing.skill_name, existing.candidate_id);
  const skillDirectory = join(getPackageCandidateRoot(), sanitizeSkillName(existing.skill_name));
  mkdirSync(skillDirectory, { recursive: true });
  writeFileSync(artifactPath, JSON.stringify(evaluation, null, 2), "utf-8");

  // Identity, lineage and counters come from the stored record; only the
  // latest-observation fields are replaced, keeping stored values as fallback.
  const record: CreatePackageCandidateRecord = {
    ...existing,
    latest_status: observed.status,
    latest_evaluation_source: observed.evaluation_source ?? existing.latest_evaluation_source,
    latest_acceptance_decision:
      observed.candidate_acceptance?.decision ?? existing.latest_acceptance_decision,
    artifact_path: artifactPath,
    summary: observed,
  };
  upsertCandidateRecord(db, record);

  return evaluation;
}
|
|
765
|
+
|
|
766
|
+
/**
 * Loads the evaluation artifact stored for a candidate of the given skill.
 *
 * @param skillName - Skill the candidate belongs to.
 * @param candidateId - Identifier of the candidate whose artifact is read.
 * @returns Whatever `readCandidateArtifact` yields for the resolved artifact
 *   path: the evaluation result, or `null`.
 */
export function readPackageCandidateArtifact(
  skillName: string,
  candidateId: string,
): CreatePackageEvaluationResult | null {
  const artifactPath = getPackageCandidateArtifactPath(skillName, candidateId);
  return readCandidateArtifact(artifactPath);
}
|