selftune 0.2.31 → 0.2.32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95)
  1. package/README.md +83 -56
  2. package/apps/local-dashboard/dist/assets/index-B-ut4w0B.js +15 -0
  3. package/apps/local-dashboard/dist/assets/index-BFGfCVrL.css +1 -0
  4. package/apps/local-dashboard/dist/assets/vendor-ui-DfowE3Hu.js +1 -0
  5. package/apps/local-dashboard/dist/index.html +3 -3
  6. package/cli/selftune/command-surface.ts +613 -2
  7. package/cli/selftune/create/baseline.ts +429 -0
  8. package/cli/selftune/create/check.ts +35 -0
  9. package/cli/selftune/create/init.ts +115 -0
  10. package/cli/selftune/create/package-candidate-state.ts +771 -0
  11. package/cli/selftune/create/package-evaluator.ts +710 -0
  12. package/cli/selftune/create/package-fingerprint.ts +142 -0
  13. package/cli/selftune/create/package-search.ts +377 -0
  14. package/cli/selftune/create/publish.ts +431 -0
  15. package/cli/selftune/create/readiness.ts +495 -0
  16. package/cli/selftune/create/replay.ts +330 -0
  17. package/cli/selftune/create/report.ts +74 -0
  18. package/cli/selftune/create/scaffold.ts +121 -0
  19. package/cli/selftune/create/skills-ref-adapter.ts +177 -0
  20. package/cli/selftune/create/status.ts +33 -0
  21. package/cli/selftune/create/templates.ts +249 -0
  22. package/cli/selftune/cron/setup.ts +1 -1
  23. package/cli/selftune/dashboard-action-events.ts +4 -1
  24. package/cli/selftune/dashboard-action-result.ts +789 -24
  25. package/cli/selftune/dashboard-action-stream.ts +80 -0
  26. package/cli/selftune/dashboard-contract.ts +146 -3
  27. package/cli/selftune/dashboard-server.ts +5 -4
  28. package/cli/selftune/eval/hooks-to-evals.ts +58 -35
  29. package/cli/selftune/eval/synthetic-evals.ts +145 -17
  30. package/cli/selftune/evolution/bounded-mutations.ts +1045 -0
  31. package/cli/selftune/evolution/evolve-body.ts +9 -36
  32. package/cli/selftune/evolution/evolve.ts +8 -72
  33. package/cli/selftune/evolution/stopping-criteria.ts +5 -13
  34. package/cli/selftune/evolution/unblock-suggestions.ts +0 -16
  35. package/cli/selftune/evolution/validate-host-replay.ts +115 -15
  36. package/cli/selftune/improve.ts +206 -0
  37. package/cli/selftune/index.ts +123 -6
  38. package/cli/selftune/init.ts +1 -1
  39. package/cli/selftune/localdb/queries/dashboard.ts +30 -0
  40. package/cli/selftune/localdb/schema.ts +52 -0
  41. package/cli/selftune/monitoring/watch.ts +257 -23
  42. package/cli/selftune/orchestrate/execute.ts +300 -1
  43. package/cli/selftune/orchestrate/finalize.ts +14 -0
  44. package/cli/selftune/orchestrate/plan.ts +22 -5
  45. package/cli/selftune/orchestrate/prepare.ts +59 -4
  46. package/cli/selftune/orchestrate/report.ts +1 -1
  47. package/cli/selftune/orchestrate.ts +34 -1
  48. package/cli/selftune/publish.ts +35 -0
  49. package/cli/selftune/routes/actions.ts +81 -15
  50. package/cli/selftune/routes/overview.ts +1 -1
  51. package/cli/selftune/routes/skill-report.ts +147 -2
  52. package/cli/selftune/run.ts +18 -0
  53. package/cli/selftune/schedule.ts +3 -3
  54. package/cli/selftune/search-run.ts +703 -0
  55. package/cli/selftune/status.ts +35 -11
  56. package/cli/selftune/testing-readiness.ts +431 -40
  57. package/cli/selftune/types.ts +316 -0
  58. package/cli/selftune/utils/eval-readiness.ts +1 -0
  59. package/cli/selftune/utils/json-output.ts +11 -0
  60. package/cli/selftune/utils/lifecycle-surface.ts +48 -0
  61. package/cli/selftune/utils/query-filter.ts +82 -1
  62. package/cli/selftune/utils/tui.ts +85 -2
  63. package/cli/selftune/verify.ts +205 -0
  64. package/cli/selftune/workflows/proposals.ts +1 -1
  65. package/cli/selftune/workflows/skill-scaffold.ts +141 -63
  66. package/cli/selftune/workflows/workflows.ts +4 -4
  67. package/package.json +1 -1
  68. package/skill/SKILL.md +148 -85
  69. package/skill/references/cli-quick-reference.md +16 -1
  70. package/skill/references/creator-playbook.md +31 -10
  71. package/skill/workflows/Baseline.md +8 -9
  72. package/skill/workflows/Contributions.md +4 -4
  73. package/skill/workflows/Create.md +173 -0
  74. package/skill/workflows/CreateTestDeploy.md +34 -30
  75. package/skill/workflows/Cron.md +2 -2
  76. package/skill/workflows/Dashboard.md +3 -3
  77. package/skill/workflows/Evals.md +13 -7
  78. package/skill/workflows/Evolve.md +75 -32
  79. package/skill/workflows/EvolveBody.md +22 -15
  80. package/skill/workflows/Hook.md +1 -1
  81. package/skill/workflows/Improve.md +168 -0
  82. package/skill/workflows/Initialize.md +3 -3
  83. package/skill/workflows/Orchestrate.md +49 -12
  84. package/skill/workflows/Publish.md +100 -0
  85. package/skill/workflows/Run.md +72 -0
  86. package/skill/workflows/Schedule.md +2 -2
  87. package/skill/workflows/SearchRun.md +89 -0
  88. package/skill/workflows/SignalsDashboard.md +2 -2
  89. package/skill/workflows/UnitTest.md +13 -4
  90. package/skill/workflows/Verify.md +136 -0
  91. package/skill/workflows/Watch.md +114 -47
  92. package/skill/workflows/Workflows.md +13 -8
  93. package/apps/local-dashboard/dist/assets/index-B7v_o1WC.js +0 -15
  94. package/apps/local-dashboard/dist/assets/index-CrO77SVi.css +0 -1
  95. package/apps/local-dashboard/dist/assets/vendor-ui-B0H8s1mP.js +0 -1
@@ -0,0 +1,710 @@
1
+ import type { Database } from "bun:sqlite";
2
+
3
+ import { readFileSync } from "node:fs";
4
+ import { basename, dirname } from "node:path";
5
+
6
+ import {
7
+ persistPackageCandidateEvaluation,
8
+ readPackageCandidateArtifactByFingerprint,
9
+ } from "./package-candidate-state.js";
10
+ import { parseSkillSections } from "../evolution/deploy-proposal.js";
11
+ import { getLastDeployedProposal } from "../evolution/audit.js";
12
+ import { assessBodyQuality, validateBodyStructure } from "../evolution/validate-body.js";
13
+ import { getDb } from "../localdb/db.js";
14
+ import { queryGradingBaseline, queryRecentGradingResults } from "../localdb/queries.js";
15
+ import type { WatchResult } from "../monitoring/watch.js";
16
+ import {
17
+ readCanonicalPackageEvaluationArtifact,
18
+ readCanonicalUnitTestRunResult,
19
+ writeCanonicalPackageEvaluationArtifact,
20
+ writeCanonicalPackageEvaluation,
21
+ } from "../testing-readiness.js";
22
+ import type {
23
+ CreatePackageBodySummary,
24
+ CreatePackageEvaluationSummary,
25
+ CreatePackageEvaluationGradingSummary,
26
+ CreatePackageEvaluationUnitTestSummary,
27
+ CreatePackageEvaluationWatchSummary,
28
+ } from "../types.js";
29
+ import { computeCreatePackageFingerprint } from "./package-fingerprint.js";
30
+ import {
31
+ runCreateBaseline,
32
+ type CreateBaselineDeps,
33
+ type CreateBaselineResult,
34
+ type RunCreateBaselineOptions,
35
+ } from "./baseline.js";
36
+ import {
37
+ runCreateReplay,
38
+ type CreateReplayMode,
39
+ type CreateReplayResult,
40
+ type RunCreateReplayOptions,
41
+ } from "./replay.js";
42
+
43
/**
 * Caller-supplied options for `runCreatePackageEvaluation`.
 */
export interface RunCreatePackageEvaluationOptions {
  /**
   * Path of the skill file to evaluate. It is read as UTF-8 text for body
   * validation, and its parent directory name is the fallback skill name.
   */
  skillPath: string;
  /** Explicit skill name; when omitted or blank the name is inferred from `skillPath`. */
  skillName?: string;
  /** Replay mode for the primary replay run; defaults to `"package"`. */
  mode?: Extract<CreateReplayMode, "package">;
  /** Agent identifier forwarded to the replay and baseline runs. */
  agent?: string;
  /** Custom eval-set path; when set, cached evaluations are never reused. */
  evalSetPath?: string;
}
50
+
51
/**
 * Full outcome of a package evaluation: the condensed summary plus the raw
 * replay and baseline results it was derived from.
 */
export interface CreatePackageEvaluationResult {
  /** Condensed, report-ready view of the evaluation. */
  summary: CreatePackageEvaluationSummary;
  /** Raw result of the package replay run. */
  replay: CreateReplayResult;
  /** Raw result of the with/without-skill baseline comparison. */
  baseline: CreateBaselineResult;
}
56
+
57
/**
 * Injectable collaborators for `runCreatePackageEvaluation`.
 *
 * Every member is optional; when one is omitted the evaluator falls back to
 * the implementation imported by this module. The whole object is also
 * forwarded to the baseline runner as its own dependency bag.
 */
export interface CreatePackageEvaluationDeps extends CreateBaselineDeps {
  // --- Local database access and grading history ---------------------------
  /** Supplies the database handle handed to the queries and persistence calls. */
  getDb?: () => Database;
  getLastDeployedProposal?: typeof getLastDeployedProposal;
  queryGradingBaseline?: typeof queryGradingBaseline;
  queryRecentGradingResults?: typeof queryRecentGradingResults;

  // --- Fingerprinting and cached-evaluation lookup -------------------------
  computeCreatePackageFingerprint?: typeof computeCreatePackageFingerprint;
  readCanonicalPackageEvaluationArtifact?: typeof readCanonicalPackageEvaluationArtifact;
  readPackageCandidateArtifactByFingerprint?: typeof readPackageCandidateArtifactByFingerprint;

  // --- Unit-test and body-quality context -----------------------------------
  readCanonicalUnitTestRunResult?: typeof readCanonicalUnitTestRunResult;
  assessBodyQuality?: typeof assessBodyQuality;
  /** Reads the skill file content; the default reads `skillPath` as UTF-8. */
  readSkillContent?: (skillPath: string) => string;

  // --- Persistence of the finished evaluation -------------------------------
  persistPackageCandidateEvaluation?: typeof persistPackageCandidateEvaluation;
  writeCanonicalPackageEvaluationArtifact?: typeof writeCanonicalPackageEvaluationArtifact;
  writeCanonicalPackageEvaluation?: typeof writeCanonicalPackageEvaluation;

  // --- Measurement runners ----------------------------------------------------
  runCreateReplay?: (
    options: RunCreateReplayOptions,
  ) => Promise<Awaited<ReturnType<typeof runCreateReplay>>>;
  runCreateBaseline?: (
    options: RunCreateBaselineOptions,
    deps?: CreateBaselineDeps,
  ) => Promise<CreateBaselineResult>;
}
79
+
80
/** A single row of a baseline run's `per_entry` list. */
type BaselineResultLike = CreateBaselineResult["per_entry"][number];

/** Minimum body quality score a skill body must reach to count as passing. */
const BODY_QUALITY_THRESHOLD = 0.6;
82
+
83
/**
 * Derives a skill name from the name of the directory that contains the
 * skill file.
 *
 * @param skillPath Path to the skill file.
 * @returns The last segment of the file's parent directory.
 */
function inferSkillNameFromSkillPath(skillPath: string): string {
  const containingDirectory = dirname(skillPath);
  const inferredName = basename(containingDirectory);
  return inferredName;
}
86
+
87
/**
 * Builds a fresh placeholder runtime-metrics record: run counters and
 * durations are zero, while token, cost, and turn totals are `null`.
 */
function emptyRuntimeMetrics() {
  const zeroedCounters = {
    eval_runs: 0,
    usage_observations: 0,
    total_duration_ms: 0,
    avg_duration_ms: 0,
  };
  const unmeasuredTotals = {
    total_input_tokens: null,
    total_output_tokens: null,
    total_cache_creation_input_tokens: null,
    total_cache_read_input_tokens: null,
    total_cost_usd: null,
    total_turns: null,
  };
  return { ...zeroedCounters, ...unmeasuredTotals };
}
101
+
102
/**
 * Computes the arithmetic mean of a list of numbers.
 *
 * The parameter is typed `readonly` so frozen or read-only arrays can be
 * passed without a copy; the input is never mutated.
 *
 * @param values Numbers to average.
 * @returns The mean, or `null` when `values` is empty (no meaningful average).
 */
function average(values: readonly number[]): number | null {
  if (values.length === 0) return null;
  let total = 0;
  for (const value of values) {
    total += value;
  }
  return total / values.length;
}
106
+
107
/**
 * Condenses replay failures and baseline wins/regressions into counts plus a
 * small set of example queries.
 *
 * A "win" is a query that passes with the skill and fails without it; a
 * "regression" is the opposite. Queries missing either side are ignored.
 * At most three samples are kept per category.
 */
function collectEvidenceSamples(replay: CreateReplayResult, baseline: CreateBaselineResult) {
  type EvidenceSample = { query: string; evidence: string | null };
  type QueryPair = { with_skill?: BaselineResultLike; without_skill?: BaselineResultLike };

  const failedReplays = replay.results.filter((result) => !result.passed);

  // Pair up the with-skill and without-skill rows of each query; a later row
  // for the same query and side replaces the earlier one.
  const pairsByQuery = new Map<string, QueryPair>();
  for (const entry of baseline.per_entry) {
    const pair = pairsByQuery.get(entry.query) ?? {};
    if (entry.with_skill) {
      pair.with_skill = entry;
    } else {
      pair.without_skill = entry;
    }
    pairsByQuery.set(entry.query, pair);
  }

  const wins: EvidenceSample[] = [];
  const regressions: EvidenceSample[] = [];
  pairsByQuery.forEach((pair, query) => {
    const withSkill = pair.with_skill;
    const withoutSkill = pair.without_skill;
    if (!withSkill || !withoutSkill) return;

    // Evidence from the with-skill run is preferred when both sides have some.
    const evidence = withSkill.evidence ?? withoutSkill.evidence ?? null;
    if (withSkill.pass === true && withoutSkill.pass === false) {
      wins.push({ query, evidence });
    }
    if (withSkill.pass === false && withoutSkill.pass === true) {
      regressions.push({ query, evidence });
    }
  });

  return {
    replay_failures: failedReplays.length,
    baseline_wins: wins.length,
    baseline_regressions: regressions.length,
    replay_failure_samples: failedReplays.slice(0, 3).map((result) => ({
      query: result.query,
      evidence: result.evidence ?? null,
    })),
    baseline_win_samples: wins.slice(0, 3),
    baseline_regression_samples: regressions.slice(0, 3),
  };
}
159
+
160
/**
 * Collects grading context for a skill: the grading baseline tied to the last
 * deployed proposal, an aggregate of the ten most recent grading results, and
 * the deltas between the two.
 *
 * @returns The grading summary, or `undefined` when there is neither a
 *   baseline nor any recent result, or when any lookup throws.
 */
function buildGradingSummary(
  skillName: string,
  deps: CreatePackageEvaluationDeps,
): CreatePackageEvaluationGradingSummary | undefined {
  try {
    const openDb = deps.getDb ?? getDb;
    const findLastDeployed = deps.getLastDeployedProposal ?? getLastDeployedProposal;
    const loadBaseline = deps.queryGradingBaseline ?? queryGradingBaseline;
    const loadRecent = deps.queryRecentGradingResults ?? queryRecentGradingResults;

    const db = openDb();
    const lastDeployed = findLastDeployed(skillName);
    const baselineRow = loadBaseline(db, skillName, lastDeployed?.proposal_id);
    const recentRows = loadRecent(db, skillName, 10);

    const hasRecentRows = recentRows.length > 0;
    if (!baselineRow && !hasRecentRows) {
      return undefined;
    }

    // Rows without a value are left out of the averages instead of counting as zero.
    const passRates: number[] = [];
    const meanScores: number[] = [];
    for (const row of recentRows) {
      if (row.pass_rate != null) passRates.push(row.pass_rate);
      if (row.mean_score != null) meanScores.push(row.mean_score);
    }

    const recent = hasRecentRows
      ? {
          sample_size: recentRows.length,
          average_pass_rate: average(passRates),
          average_mean_score: average(meanScores),
          // NOTE(review): assumes rows are ordered newest-first - confirm against the query.
          newest_graded_at: recentRows[0]?.graded_at ?? null,
          oldest_graded_at: recentRows.at(-1)?.graded_at ?? null,
        }
      : null;

    const baseline = baselineRow
      ? {
          proposal_id: baselineRow.proposal_id,
          measured_at: baselineRow.measured_at,
          pass_rate: baselineRow.pass_rate,
          mean_score: baselineRow.mean_score,
          sample_size: baselineRow.sample_size,
        }
      : null;

    let passRateDelta: number | null = null;
    if (baseline && recent?.average_pass_rate != null) {
      passRateDelta = recent.average_pass_rate - baseline.pass_rate;
    }

    let meanScoreDelta: number | null = null;
    if (baseline?.mean_score != null && recent?.average_mean_score != null) {
      meanScoreDelta = recent.average_mean_score - baseline.mean_score;
    }

    return {
      baseline,
      recent,
      pass_rate_delta: passRateDelta,
      mean_score_delta: meanScoreDelta,
      regressed: passRateDelta == null ? null : passRateDelta < 0,
    };
  } catch {
    // Fail-open: grading context should enrich the evaluator, never block it.
    return undefined;
  }
}
228
+
229
/**
 * Summarizes the canonical unit-test run for a skill: the totals plus up to
 * three failing tests with their failed assertions.
 *
 * The database handle is passed to the reader only when `deps.getDb` was
 * supplied; otherwise the reader is called with the skill name alone.
 *
 * @returns The summary, or `undefined` when no run exists or reading throws.
 */
function buildUnitTestSummary(
  skillName: string,
  deps: CreatePackageEvaluationDeps,
): CreatePackageEvaluationUnitTestSummary | undefined {
  try {
    const readSuite = deps.readCanonicalUnitTestRunResult ?? readCanonicalUnitTestRunResult;
    const suite = deps.getDb ? readSuite(skillName, deps.getDb()) : readSuite(skillName);
    if (!suite) return undefined;

    const firstFailures = suite.results.filter((result) => !result.passed).slice(0, 3);
    const failingTests = firstFailures.map((result) => {
      const failedAssertions = result.assertion_results.filter((assertion) => !assertion.passed);
      return {
        test_id: result.test_id,
        error: result.error ?? null,
        failed_assertions: failedAssertions.map(
          (assertion) => `${assertion.assertion.type}: ${assertion.assertion.value}`,
        ),
      };
    });

    return {
      total: suite.total,
      passed: suite.passed,
      failed: suite.failed,
      pass_rate: suite.pass_rate,
      run_at: suite.run_at,
      failing_tests: failingTests,
    };
  } catch {
    // Fail-open: a missing or unreadable unit-test run leaves the summary out.
    return undefined;
  }
}
263
+
264
/**
 * Rebuilds the markdown body of a skill from its parsed description and
 * sections: the trimmed description first (when non-empty), then each section
 * as a `## <name>` heading followed by its trimmed content.
 *
 * @param skillContent Raw content of the skill file.
 * @returns The reassembled body with surrounding whitespace removed.
 */
function extractSkillBody(skillContent: string): string {
  const { description, sections } = parseSkillSections(skillContent);
  const lines: string[] = [];

  const intro = description.trim();
  if (intro) {
    lines.push(intro, "");
  }

  Object.entries(sections).forEach(([heading, content]) => {
    lines.push(`## ${heading}`, "", content.trim(), "");
  });

  return lines.join("\n").trim();
}
282
+
283
/**
 * Decides whether a previously stored evaluation can be returned instead of
 * running a fresh one.
 *
 * Reuse is refused when there is no cached result, no package fingerprint, or
 * a custom eval set was requested. The cached result must also match the
 * request (skill path, skill name, fingerprint, agent), must come from a
 * host-replay package evaluation with routing validation, must carry
 * candidate and body metadata, and must be internally consistent about the
 * skill name.
 *
 * The requested skill name is trimmed before comparison, and a blank name is
 * treated as "not specified", because `runCreatePackageEvaluation` resolves
 * the name the same way. Comparing the raw string would make padded or
 * whitespace-only names miss the cache on every call.
 *
 * @param cached Stored evaluation, or `null` when none was found.
 * @param options Options of the current evaluation request.
 * @param packageFingerprint Fingerprint of the current package, if available.
 * @returns `true` (narrowing `cached`) when the stored evaluation is reusable.
 */
function canReuseCachedPackageEvaluation(
  cached: CreatePackageEvaluationResult | null,
  options: RunCreatePackageEvaluationOptions,
  packageFingerprint: string | null,
): cached is CreatePackageEvaluationResult {
  if (!cached || !packageFingerprint || options.evalSetPath) return false;

  const { summary } = cached;
  const requestedSkillName = options.skillName?.trim();

  // The cached evaluation must answer the same request.
  if (summary.mode !== "package") return false;
  if (summary.skill_path !== options.skillPath) return false;
  if (requestedSkillName && summary.skill_name !== requestedSkillName) return false;
  if (summary.package_fingerprint !== packageFingerprint) return false;
  if (options.agent && summary.replay.agent !== options.agent) return false;

  // Only complete host-replay evaluations with candidate metadata are reusable.
  if (summary.replay.validation_mode !== "host_replay") return false;
  if (summary.routing?.validation_mode !== "host_replay") return false;
  if (typeof summary.candidate_id !== "string") return false;
  if (typeof summary.candidate_generation !== "number") return false;
  if (!summary.candidate_acceptance) return false;
  if (!summary.body) return false;

  // The raw results must belong to the skill named in the summary.
  if (cached.replay.skill !== summary.skill_name) return false;
  if (cached.baseline.skill_name !== summary.skill_name) return false;
  return true;
}
304
+
305
/**
 * Assembles the summary of a freshly run package evaluation.
 *
 * Status is `"replay_failed"` when any replay entry failed, otherwise
 * `"baseline_failed"` when the baseline shows no added value, otherwise
 * `"passed"`. A non-passing status carries the command to rerun the failing
 * stage. Routing, efficiency, grading, body, and unit-test sections are only
 * included when their inputs are available.
 */
function buildSummary(
  skillName: string,
  skillPath: string,
  replay: CreateReplayResult,
  routing: CreateReplayResult | undefined,
  baseline: CreateBaselineResult,
  grading?: CreatePackageEvaluationGradingSummary,
  body?: CreatePackageBodySummary,
  unitTests?: CreatePackageEvaluationUnitTestSummary,
  packageFingerprint?: string | null,
): CreatePackageEvaluationSummary {
  // Replay and routing runs are summarized with the same set of fields.
  const summarizeReplayRun = (run: CreateReplayResult) => ({
    mode: run.mode,
    validation_mode: "host_replay" as const,
    agent: run.agent,
    proposal_id: run.proposal_id,
    fixture_id: run.fixture_id,
    total: run.total,
    passed: run.passed,
    failed: run.failed,
    pass_rate: run.pass_rate,
    runtime_metrics: run.runtime_metrics,
  });

  let status: "replay_failed" | "baseline_failed" | "passed";
  let nextCommand: string | null;
  if (replay.failed > 0) {
    status = "replay_failed";
    nextCommand = `selftune create replay --skill-path ${skillPath} --mode package`;
  } else if (!baseline.adds_value) {
    status = "baseline_failed";
    nextCommand = `selftune create baseline --skill-path ${skillPath} --mode package`;
  } else {
    status = "passed";
    nextCommand = null;
  }

  const baselineMetrics = baseline.runtime_metrics;
  const fingerprintSection = packageFingerprint ? { package_fingerprint: packageFingerprint } : {};
  const routingSection = routing ? { routing: summarizeReplayRun(routing) } : {};
  // Efficiency is reported only when the baseline measured runtime metrics;
  // a missing side falls back to an empty metrics record.
  const efficiencySection = baselineMetrics
    ? {
        efficiency: {
          with_skill: replay.runtime_metrics ?? emptyRuntimeMetrics(),
          without_skill: baselineMetrics.without_skill ?? emptyRuntimeMetrics(),
        },
      }
    : {};

  return {
    skill_name: skillName,
    skill_path: skillPath,
    mode: "package",
    ...fingerprintSection,
    evaluation_source: "fresh",
    status,
    evaluation_passed: status === "passed",
    next_command: nextCommand,
    replay: summarizeReplayRun(replay),
    ...routingSection,
    baseline: {
      mode: baseline.mode,
      baseline_pass_rate: baseline.baseline_pass_rate,
      with_skill_pass_rate: baseline.with_skill_pass_rate,
      lift: baseline.lift,
      adds_value: baseline.adds_value,
      measured_at: baseline.measured_at,
      sample_size: baseline.per_entry.filter((entry) => entry.with_skill).length,
      ...(baselineMetrics ? { runtime_metrics: baselineMetrics } : {}),
    },
    evidence: collectEvidenceSamples(replay, baseline),
    ...efficiencySection,
    ...(grading ? { grading } : {}),
    ...(body ? { body } : {}),
    ...(unitTests ? { unit_tests: unitTests } : {}),
  };
}
388
+
389
/**
 * Formats a fraction (for example `0.5`) as a percentage with one decimal
 * place (for example `"50.0%"`).
 */
function formatPercent(value: number): string {
  const percentage = value * 100;
  return percentage.toFixed(1) + "%";
}
392
+
393
/**
 * Maps an evaluation source to the label used in the benchmark report. Any
 * source other than the two cache kinds is labeled `"fresh"`.
 */
function formatEvaluationSource(
  source: CreatePackageEvaluationSummary["evaluation_source"],
): string {
  switch (source) {
    case "artifact_cache":
      return "cached artifact";
    case "candidate_cache":
      return "accepted candidate cache";
    default:
      return "fresh";
  }
}
400
+
401
/**
 * Renders the candidate acceptance decision as
 * `<decision> vs <compared candidate> | <rationale>`, using `"root"` when no
 * comparison candidate is recorded.
 *
 * @returns The rendered line, or `null` when the summary has no acceptance record.
 */
function formatCandidateAcceptance(summary: CreatePackageEvaluationSummary): string | null {
  const { candidate_acceptance: acceptance } = summary;
  if (acceptance) {
    const comparisonTarget = acceptance.compared_to_candidate_id ?? "root";
    return `${acceptance.decision} vs ${comparisonTarget} | ${acceptance.rationale}`;
  }
  return null;
}
407
+
408
/**
 * Produces one report line per failed replay entry, showing the query, the
 * expected and actual trigger behavior, and any non-blank evidence.
 */
function summarizeReplayFailures(replay: CreateReplayResult): string[] {
  const lines: string[] = [];
  for (const result of replay.results) {
    if (result.passed) continue;

    const expected = result.should_trigger ? "trigger" : "skip";
    const actual = result.triggered ? "triggered" : "skipped";
    const trimmedEvidence = result.evidence?.trim();
    const evidence = trimmedEvidence ? ` | evidence: ${trimmedEvidence}` : "";
    lines.push(`- query: ${result.query} | expected: ${expected} | actual: ${actual}${evidence}`);
  }
  return lines;
}
418
+
419
/**
 * Produces one report line for every query whose outcome differs between the
 * with-skill and without-skill baseline runs. A side with no recorded entry
 * is shown as `n/a`.
 */
function summarizeBaselineDiffs(baseline: CreateBaselineResult): string[] {
  type Outcome = { withSkill?: boolean; withoutSkill?: boolean };
  const label = (passed: boolean | undefined): string =>
    passed == null ? "n/a" : passed ? "pass" : "fail";

  // Collect both outcomes per query, keeping first-seen query order.
  const outcomes = new Map<string, Outcome>();
  for (const entry of baseline.per_entry) {
    const outcome = outcomes.get(entry.query) ?? {};
    if (entry.with_skill) {
      outcome.withSkill = entry.pass;
    } else {
      outcome.withoutSkill = entry.pass;
    }
    outcomes.set(entry.query, outcome);
  }

  const lines: string[] = [];
  outcomes.forEach((outcome, query) => {
    if (outcome.withSkill === outcome.withoutSkill) return;
    lines.push(
      `- query: ${query} | without skill: ${label(outcome.withoutSkill)} | with skill: ${label(outcome.withSkill)}`,
    );
  });
  return lines;
}
447
+
448
/**
 * Produces report lines for up to three failing unit tests, each with its
 * trimmed error message and failed assertions when present.
 *
 * @returns An empty list when there is no unit-test summary or nothing failed.
 */
function summarizeFailedUnitTests(
  unitTests: CreatePackageEvaluationUnitTestSummary | undefined,
): string[] {
  if (!unitTests) return [];
  if (unitTests.failed === 0) return [];

  const lines: string[] = [];
  for (const failure of unitTests.failing_tests.slice(0, 3)) {
    const trimmedError = failure.error?.trim();
    const error = trimmedError ? ` | error: ${trimmedError}` : "";
    const hasFailedAssertions = failure.failed_assertions.length > 0;
    const failureDetails = hasFailedAssertions
      ? ` | failed assertions: ${failure.failed_assertions.join(", ")}`
      : "";
    lines.push(`- unit test: ${failure.test_id}${error}${failureDetails}`);
  }
  return lines;
}
461
+
462
/**
 * Renders a package evaluation as a plain-text benchmark report.
 *
 * The report lists package identity, evaluation source, optional candidate
 * and acceptance lines, replay results, optional routing and body validation,
 * the with/without-skill impact, optional unit-test and grading context, a
 * failure analysis, and a publish recommendation. When the evaluation did not
 * pass, the suggested next command is appended.
 *
 * Exactly one blank line separates the impact/unit-test section from what
 * follows, whether or not unit-test results are present. Previously the
 * unit-test line added its own blank line on top of the unconditional one,
 * producing a double gap only when unit tests existed.
 *
 * @param evaluation Evaluation result to render.
 * @returns The report as newline-joined text.
 */
export function formatCreatePackageBenchmarkReport(
  evaluation: CreatePackageEvaluationResult,
): string {
  const { summary } = evaluation;
  const { routing, body, grading, unit_tests: unitTests } = summary;
  const candidateAcceptance = formatCandidateAcceptance(summary);

  const lines: string[] = [
    `CREATE PACKAGE BENCHMARK REPORT: ${summary.skill_name}`,
    "",
    `PACKAGE: skill=${summary.skill_name} | mode=${summary.mode} | status=${summary.status}`,
    `SOURCE: ${formatEvaluationSource(summary.evaluation_source)}`,
  ];

  if (summary.candidate_id) {
    lines.push(
      `CANDIDATE: id=${summary.candidate_id} | generation=${summary.candidate_generation ?? 0} | parent=${summary.parent_candidate_id ?? "root"}`,
    );
  }
  if (candidateAcceptance) {
    lines.push(`ACCEPTANCE: ${candidateAcceptance}`);
  }

  lines.push(
    `REPLAY: agent=${summary.replay.agent} | pass_rate=${formatPercent(summary.replay.pass_rate)} | passed=${summary.replay.passed}/${summary.replay.total} | fixture=${summary.replay.fixture_id}`,
  );
  if (routing) {
    lines.push(
      `ROUTING VALIDATION: pass_rate=${formatPercent(routing.pass_rate)} | passed=${routing.passed}/${routing.total} | fixture=${routing.fixture_id}`,
      "",
    );
  }
  if (body) {
    lines.push(
      `BODY VALIDATION: structural=${body.structural_valid ? "pass" : "fail"} | quality=${body.quality_score == null ? "n/a" : body.quality_score.toFixed(2)} | threshold=${body.quality_threshold.toFixed(2)} | valid=${body.valid ? "yes" : "no"}`,
      "",
    );
  }

  lines.push(
    `SKILLS IMPACT: without_skill=${formatPercent(summary.baseline.baseline_pass_rate)} | with_skill=${formatPercent(summary.baseline.with_skill_pass_rate)} | lift=${summary.baseline.lift.toFixed(3)} | adds_value=${summary.baseline.adds_value ? "yes" : "no"}`,
  );
  if (unitTests) {
    // No trailing blank here: the separator below already closes this section.
    lines.push(
      `UNIT TESTS: passed=${unitTests.passed}/${unitTests.total} | pass_rate=${formatPercent(unitTests.pass_rate)} | latest_run=${unitTests.run_at}`,
    );
  }
  lines.push("");

  if (grading) {
    lines.push(
      `GRADING CONTEXT: baseline=${grading.baseline ? formatPercent(grading.baseline.pass_rate) : "n/a"} | recent_avg=${grading.recent?.average_pass_rate != null ? formatPercent(grading.recent.average_pass_rate) : "n/a"} | delta=${grading.pass_rate_delta == null ? "n/a" : `${grading.pass_rate_delta >= 0 ? "+" : ""}${(grading.pass_rate_delta * 100).toFixed(1)}%`} | regressed=${grading.regressed == null ? "unknown" : grading.regressed ? "yes" : "no"}`,
      "",
    );
  }

  lines.push("FAILURE ANALYSIS:");
  const failureLines = [
    ...summarizeReplayFailures(evaluation.replay),
    ...summarizeBaselineDiffs(evaluation.baseline),
    ...summarizeFailedUnitTests(unitTests),
  ];
  if (failureLines.length === 0) {
    lines.push("- none");
  } else {
    lines.push(...failureLines);
  }

  lines.push("");
  lines.push(
    `RECOMMENDATION: ${summary.evaluation_passed ? "APPROVE FOR PUBLISH" : "DO NOT PUBLISH"}`,
  );

  if (summary.next_command) {
    lines.push(`NEXT: ${summary.next_command}`);
  }

  return lines.join("\n");
}
540
+
541
/**
 * Converts a watch result into the snake_case watch summary stored on a
 * package evaluation.
 *
 * Missing optional fields become `null`. The two efficiency fields are added
 * only when the watch result carries an efficiency alert or regression.
 */
export function buildCreatePackageWatchSummary(
  watchResult: WatchResult,
): CreatePackageEvaluationWatchSummary {
  const coreSummary = {
    snapshot: watchResult.snapshot,
    alert: watchResult.alert,
    rolled_back: watchResult.rolledBack,
    recommendation: watchResult.recommendation,
    recommended_command: watchResult.recommended_command ?? null,
    grade_alert: watchResult.gradeAlert ?? null,
    grade_regression: watchResult.gradeRegression ?? null,
  };

  const hasEfficiencySignal = watchResult.efficiencyAlert || watchResult.efficiencyRegression;
  if (!hasEfficiencySignal) {
    return coreSummary;
  }

  return {
    ...coreSummary,
    efficiency_alert: watchResult.efficiencyAlert ?? null,
    efficiency_regression: watchResult.efficiencyRegression ?? null,
  };
}
560
+
561
/**
 * Returns a copy of the evaluation summary with its `watch` section set from
 * the given watch result. The input summary is not modified.
 */
export function attachCreatePackageWatchSummary(
  summary: CreatePackageEvaluationSummary,
  watchResult: WatchResult,
): CreatePackageEvaluationSummary {
  const watch = buildCreatePackageWatchSummary(watchResult);
  return { ...summary, watch };
}
570
+
571
/**
 * Looks for a stored evaluation that can stand in for a fresh run: first the
 * canonical evaluation artifact, then an accepted candidate with the same
 * package fingerprint.
 *
 * Cache reads are fail-open: a throwing reader is treated as a cache miss so
 * that a broken artifact can never prevent a fresh evaluation.
 *
 * @returns The reusable evaluation with `evaluation_source` set to the cache
 *   it came from, or `null` when nothing can be reused.
 */
function findReusablePackageEvaluation(
  skillName: string,
  options: RunCreatePackageEvaluationOptions,
  packageFingerprint: string | null,
  deps: CreatePackageEvaluationDeps,
): CreatePackageEvaluationResult | null {
  let artifactEvaluation: CreatePackageEvaluationResult | null = null;
  try {
    artifactEvaluation = (
      deps.readCanonicalPackageEvaluationArtifact ?? readCanonicalPackageEvaluationArtifact
    )(skillName);
  } catch {
    // Fail-open: an unreadable artifact is treated as a cache miss.
  }
  if (canReuseCachedPackageEvaluation(artifactEvaluation, options, packageFingerprint)) {
    return {
      ...artifactEvaluation,
      summary: {
        ...artifactEvaluation.summary,
        evaluation_source: "artifact_cache",
      },
    };
  }

  // Candidate lookup is keyed by fingerprint, so it needs one.
  if (!packageFingerprint) return null;

  let candidateEvaluation: CreatePackageEvaluationResult | null = null;
  try {
    candidateEvaluation = (
      deps.readPackageCandidateArtifactByFingerprint ?? readPackageCandidateArtifactByFingerprint
    )(skillName, packageFingerprint, {
      acceptedOnly: true,
      db: deps.getDb ? deps.getDb() : undefined,
    });
  } catch {
    // Fail-open: an unreadable candidate store is treated as a cache miss.
  }
  if (canReuseCachedPackageEvaluation(candidateEvaluation, options, packageFingerprint)) {
    return {
      ...candidateEvaluation,
      summary: {
        ...candidateEvaluation.summary,
        evaluation_source: "candidate_cache",
      },
    };
  }
  return null;
}

/**
 * Validates the skill body structurally and by quality score.
 *
 * @returns The body summary, or `undefined` when reading or assessing the
 *   skill throws (body validation only enriches the report).
 */
async function evaluatePackageBody(
  skillPath: string,
  replay: CreateReplayResult,
  deps: CreatePackageEvaluationDeps,
): Promise<CreatePackageBodySummary | undefined> {
  try {
    const readSkill = deps.readSkillContent ?? ((path: string) => readFileSync(path, "utf-8"));
    const bodyContent = extractSkillBody(readSkill(skillPath));
    const structural = validateBodyStructure(bodyContent);
    const quality = await (deps.assessBodyQuality ?? assessBodyQuality)(
      bodyContent,
      replay.skill,
      replay.agent,
    );
    const qualityPassed = quality.score >= BODY_QUALITY_THRESHOLD;
    return {
      structural_valid: structural.valid,
      structural_reason: structural.reason,
      quality_score: quality.score,
      quality_reason: quality.reason,
      quality_threshold: BODY_QUALITY_THRESHOLD,
      quality_passed: qualityPassed,
      valid: structural.valid && qualityPassed,
    };
  } catch {
    // Fail-open: body validation should enrich package reports when available.
    return undefined;
  }
}

/**
 * Evaluates a skill package end to end and returns the summary together with
 * the raw replay and baseline results.
 *
 * Steps:
 * 1. Fingerprint the package and resolve the skill name (explicit, trimmed
 *    name first; otherwise the skill file's parent directory name).
 * 2. Return a cached evaluation when one is reusable.
 * 3. Otherwise run the package replay, a routing replay (optional), and the
 *    baseline comparison, then gather grading, body, and unit-test context.
 * 4. Persist the candidate evaluation and write the canonical artifacts.
 *
 * Errors from the package replay and the baseline run propagate to the
 * caller. Cache reads, routing validation, grading, body validation,
 * unit-test lookup, and all persistence are fail-open and only affect how
 * much detail the result carries.
 *
 * @param options What to evaluate and how.
 * @param deps Optional overrides for the collaborators used along the way.
 * @returns The evaluation result, fresh or served from a cache.
 */
export async function runCreatePackageEvaluation(
  options: RunCreatePackageEvaluationOptions,
  deps: CreatePackageEvaluationDeps = {},
): Promise<CreatePackageEvaluationResult> {
  const packageFingerprint = (
    deps.computeCreatePackageFingerprint ?? computeCreatePackageFingerprint
  )(options.skillPath);
  const skillName = options.skillName?.trim() || inferSkillNameFromSkillPath(options.skillPath);

  const reusableEvaluation = findReusablePackageEvaluation(
    skillName,
    options,
    packageFingerprint,
    deps,
  );
  if (reusableEvaluation) {
    return reusableEvaluation;
  }

  const replayRunner = deps.runCreateReplay ?? runCreateReplay;

  let replay = await replayRunner({
    skillPath: options.skillPath,
    mode: options.mode ?? "package",
    agent: options.agent,
    evalSetPath: options.evalSetPath,
  });
  // Report under the resolved skill name even if the runner inferred another.
  if (replay.skill !== skillName) {
    replay = { ...replay, skill: skillName };
  }

  let routing: CreateReplayResult | undefined;
  try {
    routing = await replayRunner({
      skillPath: options.skillPath,
      mode: "routing",
      agent: replay.agent,
      evalSetPath: options.evalSetPath,
    });
    if (routing.skill !== skillName) {
      routing = { ...routing, skill: skillName };
    }
  } catch {
    // Fail-open: routing validation should enrich package reports when available.
  }

  let baseline = await (deps.runCreateBaseline ?? runCreateBaseline)(
    {
      skillPath: options.skillPath,
      mode: "package",
      agent: options.agent,
      evalSetPath: options.evalSetPath,
      withSkillReplayResult: replay,
    },
    deps,
  );
  if (baseline.skill_name !== skillName) {
    baseline = { ...baseline, skill_name: skillName };
  }

  const grading = buildGradingSummary(skillName, deps);
  const body = await evaluatePackageBody(options.skillPath, replay, deps);
  const unitTests = buildUnitTestSummary(skillName, deps);

  let evaluationResult: CreatePackageEvaluationResult = {
    summary: buildSummary(
      skillName,
      options.skillPath,
      replay,
      routing,
      baseline,
      grading,
      body,
      unitTests,
      packageFingerprint,
    ),
    replay,
    baseline,
  };

  try {
    evaluationResult = (
      deps.persistPackageCandidateEvaluation ?? persistPackageCandidateEvaluation
    )(evaluationResult, deps.getDb ? deps.getDb() : undefined);
  } catch {
    // Fail-open: candidate persistence should not block measurement.
  }

  try {
    (deps.writeCanonicalPackageEvaluation ?? writeCanonicalPackageEvaluation)(
      skillName,
      evaluationResult.summary,
    );
    (deps.writeCanonicalPackageEvaluationArtifact ?? writeCanonicalPackageEvaluationArtifact)(
      skillName,
      evaluationResult,
    );
  } catch {
    // Fail-open: evaluation artifacts should improve reuse, never block scoring.
  }

  return evaluationResult;
}