selftune 0.2.31 → 0.2.32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95) hide show
  1. package/README.md +83 -56
  2. package/apps/local-dashboard/dist/assets/index-B-ut4w0B.js +15 -0
  3. package/apps/local-dashboard/dist/assets/index-BFGfCVrL.css +1 -0
  4. package/apps/local-dashboard/dist/assets/vendor-ui-DfowE3Hu.js +1 -0
  5. package/apps/local-dashboard/dist/index.html +3 -3
  6. package/cli/selftune/command-surface.ts +613 -2
  7. package/cli/selftune/create/baseline.ts +429 -0
  8. package/cli/selftune/create/check.ts +35 -0
  9. package/cli/selftune/create/init.ts +115 -0
  10. package/cli/selftune/create/package-candidate-state.ts +771 -0
  11. package/cli/selftune/create/package-evaluator.ts +710 -0
  12. package/cli/selftune/create/package-fingerprint.ts +142 -0
  13. package/cli/selftune/create/package-search.ts +377 -0
  14. package/cli/selftune/create/publish.ts +431 -0
  15. package/cli/selftune/create/readiness.ts +495 -0
  16. package/cli/selftune/create/replay.ts +330 -0
  17. package/cli/selftune/create/report.ts +74 -0
  18. package/cli/selftune/create/scaffold.ts +121 -0
  19. package/cli/selftune/create/skills-ref-adapter.ts +177 -0
  20. package/cli/selftune/create/status.ts +33 -0
  21. package/cli/selftune/create/templates.ts +249 -0
  22. package/cli/selftune/cron/setup.ts +1 -1
  23. package/cli/selftune/dashboard-action-events.ts +4 -1
  24. package/cli/selftune/dashboard-action-result.ts +789 -24
  25. package/cli/selftune/dashboard-action-stream.ts +80 -0
  26. package/cli/selftune/dashboard-contract.ts +146 -3
  27. package/cli/selftune/dashboard-server.ts +5 -4
  28. package/cli/selftune/eval/hooks-to-evals.ts +58 -35
  29. package/cli/selftune/eval/synthetic-evals.ts +145 -17
  30. package/cli/selftune/evolution/bounded-mutations.ts +1045 -0
  31. package/cli/selftune/evolution/evolve-body.ts +9 -36
  32. package/cli/selftune/evolution/evolve.ts +8 -72
  33. package/cli/selftune/evolution/stopping-criteria.ts +5 -13
  34. package/cli/selftune/evolution/unblock-suggestions.ts +0 -16
  35. package/cli/selftune/evolution/validate-host-replay.ts +115 -15
  36. package/cli/selftune/improve.ts +206 -0
  37. package/cli/selftune/index.ts +123 -6
  38. package/cli/selftune/init.ts +1 -1
  39. package/cli/selftune/localdb/queries/dashboard.ts +30 -0
  40. package/cli/selftune/localdb/schema.ts +52 -0
  41. package/cli/selftune/monitoring/watch.ts +257 -23
  42. package/cli/selftune/orchestrate/execute.ts +300 -1
  43. package/cli/selftune/orchestrate/finalize.ts +14 -0
  44. package/cli/selftune/orchestrate/plan.ts +22 -5
  45. package/cli/selftune/orchestrate/prepare.ts +59 -4
  46. package/cli/selftune/orchestrate/report.ts +1 -1
  47. package/cli/selftune/orchestrate.ts +34 -1
  48. package/cli/selftune/publish.ts +35 -0
  49. package/cli/selftune/routes/actions.ts +81 -15
  50. package/cli/selftune/routes/overview.ts +1 -1
  51. package/cli/selftune/routes/skill-report.ts +147 -2
  52. package/cli/selftune/run.ts +18 -0
  53. package/cli/selftune/schedule.ts +3 -3
  54. package/cli/selftune/search-run.ts +703 -0
  55. package/cli/selftune/status.ts +35 -11
  56. package/cli/selftune/testing-readiness.ts +431 -40
  57. package/cli/selftune/types.ts +316 -0
  58. package/cli/selftune/utils/eval-readiness.ts +1 -0
  59. package/cli/selftune/utils/json-output.ts +11 -0
  60. package/cli/selftune/utils/lifecycle-surface.ts +48 -0
  61. package/cli/selftune/utils/query-filter.ts +82 -1
  62. package/cli/selftune/utils/tui.ts +85 -2
  63. package/cli/selftune/verify.ts +205 -0
  64. package/cli/selftune/workflows/proposals.ts +1 -1
  65. package/cli/selftune/workflows/skill-scaffold.ts +141 -63
  66. package/cli/selftune/workflows/workflows.ts +4 -4
  67. package/package.json +1 -1
  68. package/skill/SKILL.md +148 -85
  69. package/skill/references/cli-quick-reference.md +16 -1
  70. package/skill/references/creator-playbook.md +31 -10
  71. package/skill/workflows/Baseline.md +8 -9
  72. package/skill/workflows/Contributions.md +4 -4
  73. package/skill/workflows/Create.md +173 -0
  74. package/skill/workflows/CreateTestDeploy.md +34 -30
  75. package/skill/workflows/Cron.md +2 -2
  76. package/skill/workflows/Dashboard.md +3 -3
  77. package/skill/workflows/Evals.md +13 -7
  78. package/skill/workflows/Evolve.md +75 -32
  79. package/skill/workflows/EvolveBody.md +22 -15
  80. package/skill/workflows/Hook.md +1 -1
  81. package/skill/workflows/Improve.md +168 -0
  82. package/skill/workflows/Initialize.md +3 -3
  83. package/skill/workflows/Orchestrate.md +49 -12
  84. package/skill/workflows/Publish.md +100 -0
  85. package/skill/workflows/Run.md +72 -0
  86. package/skill/workflows/Schedule.md +2 -2
  87. package/skill/workflows/SearchRun.md +89 -0
  88. package/skill/workflows/SignalsDashboard.md +2 -2
  89. package/skill/workflows/UnitTest.md +13 -4
  90. package/skill/workflows/Verify.md +136 -0
  91. package/skill/workflows/Watch.md +114 -47
  92. package/skill/workflows/Workflows.md +13 -8
  93. package/apps/local-dashboard/dist/assets/index-B7v_o1WC.js +0 -15
  94. package/apps/local-dashboard/dist/assets/index-CrO77SVi.css +0 -1
  95. package/apps/local-dashboard/dist/assets/vendor-ui-B0H8s1mP.js +0 -1
@@ -1,4 +1,22 @@
1
- import type { DashboardActionName, DashboardActionResultSummary } from "./dashboard-contract.js";
1
+ import type {
2
+ DashboardActionName,
3
+ DashboardActionResultSummary,
4
+ DashboardSearchRunSummary,
5
+ } from "./dashboard-contract.js";
6
+ import type {
7
+ CreatePackageBodySummary,
8
+ CreatePackageEvaluationEfficiencySummary,
9
+ CreatePackageEvaluationEvidenceSample,
10
+ CreatePackageEvaluationEvidenceSummary,
11
+ CreatePackageEvaluationGradingSummary,
12
+ CreatePackageEvaluationSource,
13
+ CreatePackageReplaySummary,
14
+ CreatePackageEvaluationUnitTestSummary,
15
+ CreatePackageEvaluationWatchSummary,
16
+ MonitoringSnapshot,
17
+ RuntimeReplayAggregateMetrics,
18
+ } from "./types.js";
19
+ import { extractJsonObject } from "./utils/json-output.js";
2
20
 
3
21
  export interface DashboardActionOutcomeInput {
4
22
  action: DashboardActionName;
@@ -13,18 +31,6 @@ export interface DashboardActionOutcome {
13
31
  summary: DashboardActionResultSummary | null;
14
32
  }
15
33
 
16
- function extractJsonObject(stdout: string): Record<string, unknown> | null {
17
- const trimmed = stdout.trim();
18
- if (!trimmed.startsWith("{") || !trimmed.endsWith("}")) return null;
19
-
20
- try {
21
- const parsed = JSON.parse(trimmed) as unknown;
22
- return parsed && typeof parsed === "object" ? (parsed as Record<string, unknown>) : null;
23
- } catch {
24
- return null;
25
- }
26
- }
27
-
28
34
  function readBoolean(value: unknown): boolean | null {
29
35
  return typeof value === "boolean" ? value : null;
30
36
  }
@@ -37,24 +43,775 @@ function readString(value: unknown): string | null {
37
43
  return typeof value === "string" && value.trim().length > 0 ? value : null;
38
44
  }
39
45
 
46
+ function readObject(value: unknown): Record<string, unknown> | null {
47
+ return value && typeof value === "object" ? (value as Record<string, unknown>) : null;
48
+ }
49
+
50
+ function readEvidenceSample(value: unknown): CreatePackageEvaluationEvidenceSample | null {
51
+ const sample = readObject(value);
52
+ const query = readString(sample?.["query"]);
53
+ if (!query) return null;
54
+
55
+ return {
56
+ query,
57
+ evidence: readString(sample?.["evidence"]),
58
+ };
59
+ }
60
+
61
+ function readEvidenceSamples(value: unknown): CreatePackageEvaluationEvidenceSample[] {
62
+ if (!Array.isArray(value)) return [];
63
+ return value
64
+ .map((sample) => readEvidenceSample(sample))
65
+ .filter((sample): sample is CreatePackageEvaluationEvidenceSample => sample != null);
66
+ }
67
+
68
+ function readRuntimeReplayAggregateMetrics(value: unknown): RuntimeReplayAggregateMetrics | null {
69
+ const metrics = readObject(value);
70
+ if (!metrics) return null;
71
+
72
+ const evalRuns = readNumber(metrics["eval_runs"]);
73
+ const usageObservations = readNumber(metrics["usage_observations"]);
74
+ const totalDurationMs = readNumber(metrics["total_duration_ms"]);
75
+ const avgDurationMs = readNumber(metrics["avg_duration_ms"]);
76
+ if (
77
+ evalRuns == null ||
78
+ usageObservations == null ||
79
+ totalDurationMs == null ||
80
+ avgDurationMs == null
81
+ ) {
82
+ return null;
83
+ }
84
+
85
+ return {
86
+ eval_runs: evalRuns,
87
+ usage_observations: usageObservations,
88
+ total_duration_ms: totalDurationMs,
89
+ avg_duration_ms: avgDurationMs,
90
+ total_input_tokens: readNumber(metrics["total_input_tokens"]),
91
+ total_output_tokens: readNumber(metrics["total_output_tokens"]),
92
+ total_cache_creation_input_tokens: readNumber(metrics["total_cache_creation_input_tokens"]),
93
+ total_cache_read_input_tokens: readNumber(metrics["total_cache_read_input_tokens"]),
94
+ total_cost_usd: readNumber(metrics["total_cost_usd"]),
95
+ total_turns: readNumber(metrics["total_turns"]),
96
+ };
97
+ }
98
+
99
+ function readPackageEvidenceSummary(value: unknown): CreatePackageEvaluationEvidenceSummary | null {
100
+ const summary = readObject(value);
101
+ if (!summary) return null;
102
+
103
+ const replayFailures = readNumber(summary["replay_failures"]);
104
+ const baselineWins = readNumber(summary["baseline_wins"]);
105
+ const baselineRegressions = readNumber(summary["baseline_regressions"]);
106
+ const replayFailureSamples = readEvidenceSamples(summary["replay_failure_samples"]);
107
+ const baselineWinSamples = readEvidenceSamples(summary["baseline_win_samples"]);
108
+ const baselineRegressionSamples = readEvidenceSamples(summary["baseline_regression_samples"]);
109
+
110
+ if (
111
+ replayFailures == null &&
112
+ baselineWins == null &&
113
+ baselineRegressions == null &&
114
+ replayFailureSamples.length === 0 &&
115
+ baselineWinSamples.length === 0 &&
116
+ baselineRegressionSamples.length === 0
117
+ ) {
118
+ return null;
119
+ }
120
+
121
+ return {
122
+ replay_failures: replayFailures ?? replayFailureSamples.length,
123
+ baseline_wins: baselineWins ?? baselineWinSamples.length,
124
+ baseline_regressions: baselineRegressions ?? baselineRegressionSamples.length,
125
+ replay_failure_samples: replayFailureSamples,
126
+ baseline_win_samples: baselineWinSamples,
127
+ baseline_regression_samples: baselineRegressionSamples,
128
+ };
129
+ }
130
+
131
+ function readPackageEfficiencySummary(
132
+ value: unknown,
133
+ ): CreatePackageEvaluationEfficiencySummary | null {
134
+ const summary = readObject(value);
135
+ if (!summary) return null;
136
+
137
+ const withSkill = readRuntimeReplayAggregateMetrics(summary["with_skill"]);
138
+ const withoutSkill = readRuntimeReplayAggregateMetrics(summary["without_skill"]);
139
+ if (!withSkill || !withoutSkill) return null;
140
+
141
+ return {
142
+ with_skill: withSkill,
143
+ without_skill: withoutSkill,
144
+ };
145
+ }
146
+
147
+ function readPackageEvaluationSource(value: unknown): CreatePackageEvaluationSource | null {
148
+ const source = readString(value);
149
+ if (source !== "fresh" && source !== "artifact_cache" && source !== "candidate_cache") {
150
+ return null;
151
+ }
152
+ return source;
153
+ }
154
+
155
+ function readPackageReplaySummary(value: unknown): CreatePackageReplaySummary | null {
156
+ const summary = readObject(value);
157
+ if (!summary) return null;
158
+
159
+ const mode = readString(summary["mode"]);
160
+ const validationMode = readString(summary["validation_mode"]);
161
+ const agent = readString(summary["agent"]);
162
+ const proposalId = readString(summary["proposal_id"]);
163
+ const fixtureId = readString(summary["fixture_id"]);
164
+ const total = readNumber(summary["total"]);
165
+ const passed = readNumber(summary["passed"]);
166
+ const failed = readNumber(summary["failed"]);
167
+ const passRate = readNumber(summary["pass_rate"]);
168
+ if (
169
+ (mode !== "routing" && mode !== "package") ||
170
+ validationMode !== "host_replay" ||
171
+ agent == null ||
172
+ proposalId == null ||
173
+ fixtureId == null ||
174
+ total == null ||
175
+ passed == null ||
176
+ failed == null ||
177
+ passRate == null
178
+ ) {
179
+ return null;
180
+ }
181
+
182
+ return {
183
+ mode,
184
+ validation_mode: validationMode,
185
+ agent,
186
+ proposal_id: proposalId,
187
+ fixture_id: fixtureId,
188
+ total,
189
+ passed,
190
+ failed,
191
+ pass_rate: passRate,
192
+ ...(readRuntimeReplayAggregateMetrics(summary["runtime_metrics"])
193
+ ? { runtime_metrics: readRuntimeReplayAggregateMetrics(summary["runtime_metrics"]) }
194
+ : {}),
195
+ };
196
+ }
197
+
198
+ function readPackageBodySummary(value: unknown): CreatePackageBodySummary | null {
199
+ const summary = readObject(value);
200
+ if (!summary) return null;
201
+
202
+ const structuralValid = readBoolean(summary["structural_valid"]);
203
+ const structuralReason = readString(summary["structural_reason"]);
204
+ const qualityThreshold = readNumber(summary["quality_threshold"]);
205
+ const valid = readBoolean(summary["valid"]);
206
+ if (
207
+ structuralValid == null ||
208
+ structuralReason == null ||
209
+ qualityThreshold == null ||
210
+ valid == null
211
+ ) {
212
+ return null;
213
+ }
214
+
215
+ return {
216
+ structural_valid: structuralValid,
217
+ structural_reason: structuralReason,
218
+ quality_score: readNumber(summary["quality_score"]),
219
+ quality_reason: readString(summary["quality_reason"]),
220
+ quality_threshold: qualityThreshold,
221
+ quality_passed: readBoolean(summary["quality_passed"]),
222
+ valid,
223
+ };
224
+ }
225
+
226
+ function readPackageGradingSummary(value: unknown): CreatePackageEvaluationGradingSummary | null {
227
+ const summary = readObject(value);
228
+ if (!summary) return null;
229
+
230
+ const baseline = readObject(summary["baseline"]);
231
+ const recent = readObject(summary["recent"]);
232
+ const baselinePassRate = readNumber(baseline?.["pass_rate"]);
233
+ const baselineMeasuredAt = readString(baseline?.["measured_at"]);
234
+ const baselineSampleSize = readNumber(baseline?.["sample_size"]);
235
+ const recentSampleSize = readNumber(recent?.["sample_size"]);
236
+
237
+ const parsedBaseline =
238
+ baselinePassRate != null && baselineMeasuredAt != null && baselineSampleSize != null
239
+ ? {
240
+ proposal_id: readString(baseline?.["proposal_id"]),
241
+ measured_at: baselineMeasuredAt,
242
+ pass_rate: baselinePassRate,
243
+ mean_score: readNumber(baseline?.["mean_score"]),
244
+ sample_size: baselineSampleSize,
245
+ }
246
+ : null;
247
+ const parsedRecent =
248
+ recentSampleSize != null
249
+ ? {
250
+ sample_size: recentSampleSize,
251
+ average_pass_rate: readNumber(recent?.["average_pass_rate"]),
252
+ average_mean_score: readNumber(recent?.["average_mean_score"]),
253
+ newest_graded_at: readString(recent?.["newest_graded_at"]),
254
+ oldest_graded_at: readString(recent?.["oldest_graded_at"]),
255
+ }
256
+ : null;
257
+
258
+ if (!parsedBaseline && !parsedRecent) return null;
259
+
260
+ return {
261
+ baseline: parsedBaseline,
262
+ recent: parsedRecent,
263
+ pass_rate_delta: readNumber(summary["pass_rate_delta"]),
264
+ mean_score_delta: readNumber(summary["mean_score_delta"]),
265
+ regressed: readBoolean(summary["regressed"]),
266
+ };
267
+ }
268
+
269
+ function readPackageUnitTestSummary(value: unknown): CreatePackageEvaluationUnitTestSummary | null {
270
+ const summary = readObject(value);
271
+ if (!summary) return null;
272
+
273
+ const total = readNumber(summary["total"]);
274
+ const passed = readNumber(summary["passed"]);
275
+ const failed = readNumber(summary["failed"]);
276
+ const passRate = readNumber(summary["pass_rate"]);
277
+ const runAt = readString(summary["run_at"]);
278
+ if (total == null || passed == null || failed == null || passRate == null || runAt == null) {
279
+ return null;
280
+ }
281
+
282
+ const failingTests = Array.isArray(summary["failing_tests"])
283
+ ? summary["failing_tests"]
284
+ .map((entry) => {
285
+ const failure = readObject(entry);
286
+ const testId = readString(failure?.["test_id"]);
287
+ if (!testId) return null;
288
+
289
+ const failedAssertions = Array.isArray(failure?.["failed_assertions"])
290
+ ? failure["failed_assertions"].filter(
291
+ (assertion): assertion is string =>
292
+ typeof assertion === "string" && assertion.trim().length > 0,
293
+ )
294
+ : [];
295
+
296
+ return {
297
+ test_id: testId,
298
+ error: readString(failure?.["error"]),
299
+ failed_assertions: failedAssertions,
300
+ };
301
+ })
302
+ .filter(
303
+ (failure): failure is CreatePackageEvaluationUnitTestSummary["failing_tests"][number] =>
304
+ failure != null,
305
+ )
306
+ : [];
307
+
308
+ return {
309
+ total,
310
+ passed,
311
+ failed,
312
+ pass_rate: passRate,
313
+ run_at: runAt,
314
+ failing_tests: failingTests,
315
+ };
316
+ }
317
+
318
+ function readInvocationTotals(value: unknown): { passed: number; total: number } | null {
319
+ const entry = readObject(value);
320
+ const passed = readNumber(entry?.["passed"]);
321
+ const total = readNumber(entry?.["total"]);
322
+ if (passed == null || total == null) return null;
323
+
324
+ return { passed, total };
325
+ }
326
+
327
+ function readMonitoringSnapshot(value: unknown): MonitoringSnapshot | null {
328
+ const snapshot = readObject(value);
329
+ if (!snapshot) return null;
330
+
331
+ const timestamp = readString(snapshot["timestamp"]);
332
+ const skillName = readString(snapshot["skill_name"]);
333
+ const windowSessions = readNumber(snapshot["window_sessions"]);
334
+ const skillChecks = readNumber(snapshot["skill_checks"]);
335
+ const passRate = readNumber(snapshot["pass_rate"]);
336
+ const falseNegativeRate = readNumber(snapshot["false_negative_rate"]);
337
+ const regressionDetected = readBoolean(snapshot["regression_detected"]);
338
+ const baselinePassRate = readNumber(snapshot["baseline_pass_rate"]);
339
+ const byInvocationType = readObject(snapshot["by_invocation_type"]);
340
+
341
+ const explicit = readInvocationTotals(byInvocationType?.["explicit"]);
342
+ const implicit = readInvocationTotals(byInvocationType?.["implicit"]);
343
+ const contextual = readInvocationTotals(byInvocationType?.["contextual"]);
344
+ const negative = readInvocationTotals(byInvocationType?.["negative"]);
345
+
346
+ if (
347
+ timestamp == null ||
348
+ skillName == null ||
349
+ windowSessions == null ||
350
+ skillChecks == null ||
351
+ passRate == null ||
352
+ falseNegativeRate == null ||
353
+ regressionDetected == null ||
354
+ baselinePassRate == null ||
355
+ explicit == null ||
356
+ implicit == null ||
357
+ contextual == null ||
358
+ negative == null
359
+ ) {
360
+ return null;
361
+ }
362
+
363
+ return {
364
+ timestamp,
365
+ skill_name: skillName,
366
+ window_sessions: windowSessions,
367
+ skill_checks: skillChecks,
368
+ pass_rate: passRate,
369
+ false_negative_rate: falseNegativeRate,
370
+ by_invocation_type: {
371
+ explicit,
372
+ implicit,
373
+ contextual,
374
+ negative,
375
+ },
376
+ regression_detected: regressionDetected,
377
+ baseline_pass_rate: baselinePassRate,
378
+ };
379
+ }
380
+
381
+ function readGradeRegression(
382
+ value: unknown,
383
+ ): CreatePackageEvaluationWatchSummary["grade_regression"] {
384
+ const regression = readObject(value);
385
+ if (!regression) return null;
386
+
387
+ const before = readNumber(regression["before"]);
388
+ const after = readNumber(regression["after"]);
389
+ const delta = readNumber(regression["delta"]);
390
+ if (before == null || after == null || delta == null) return null;
391
+
392
+ return { before, after, delta };
393
+ }
394
+
395
+ function readEfficiencyRegression(
396
+ value: unknown,
397
+ ): CreatePackageEvaluationWatchSummary["efficiency_regression"] {
398
+ const regression = readObject(value);
399
+ if (!regression) return null;
400
+
401
+ const sampleSize = readNumber(regression["sample_size"]);
402
+ if (sampleSize == null) return null;
403
+
404
+ return {
405
+ sample_size: sampleSize,
406
+ baseline_avg_duration_ms: readNumber(regression["baseline_avg_duration_ms"]),
407
+ observed_avg_duration_ms: readNumber(regression["observed_avg_duration_ms"]),
408
+ duration_delta_ratio: readNumber(regression["duration_delta_ratio"]),
409
+ baseline_avg_input_tokens: readNumber(regression["baseline_avg_input_tokens"]),
410
+ observed_avg_input_tokens: readNumber(regression["observed_avg_input_tokens"]),
411
+ input_tokens_delta_ratio: readNumber(regression["input_tokens_delta_ratio"]),
412
+ baseline_avg_output_tokens: readNumber(regression["baseline_avg_output_tokens"]),
413
+ observed_avg_output_tokens: readNumber(regression["observed_avg_output_tokens"]),
414
+ output_tokens_delta_ratio: readNumber(regression["output_tokens_delta_ratio"]),
415
+ baseline_avg_turns: readNumber(regression["baseline_avg_turns"]),
416
+ observed_avg_turns: readNumber(regression["observed_avg_turns"]),
417
+ turns_delta_ratio: readNumber(regression["turns_delta_ratio"]),
418
+ };
419
+ }
420
+
421
+ function readPackageWatchSummary(value: unknown): CreatePackageEvaluationWatchSummary | null {
422
+ const summary = readObject(value);
423
+ if (!summary) return null;
424
+
425
+ const snapshot = readMonitoringSnapshot(summary["snapshot"]);
426
+ const rolledBack = readBoolean(summary["rolled_back"]) ?? readBoolean(summary["rolledBack"]);
427
+ const recommendation = readString(summary["recommendation"]);
428
+
429
+ if (!snapshot || rolledBack == null || recommendation == null) return null;
430
+
431
+ return {
432
+ snapshot,
433
+ alert: readString(summary["alert"]),
434
+ rolled_back: rolledBack,
435
+ recommendation,
436
+ recommended_command: readString(summary["recommended_command"]),
437
+ grade_alert: readString(summary["grade_alert"]) ?? readString(summary["gradeAlert"]),
438
+ grade_regression:
439
+ readGradeRegression(summary["grade_regression"]) ??
440
+ readGradeRegression(summary["gradeRegression"]),
441
+ ...((readString(summary["efficiency_alert"]) ?? readString(summary["efficiencyAlert"]))
442
+ ? {
443
+ efficiency_alert:
444
+ readString(summary["efficiency_alert"]) ?? readString(summary["efficiencyAlert"]),
445
+ }
446
+ : {}),
447
+ ...((readEfficiencyRegression(summary["efficiency_regression"]) ??
448
+ readEfficiencyRegression(summary["efficiencyRegression"]))
449
+ ? {
450
+ efficiency_regression:
451
+ readEfficiencyRegression(summary["efficiency_regression"]) ??
452
+ readEfficiencyRegression(summary["efficiencyRegression"]),
453
+ }
454
+ : {}),
455
+ };
456
+ }
457
+
458
+ function subtractRates(current: number | null, baseline: number | null): number | null {
459
+ if (current == null || baseline == null) return null;
460
+ return Number.parseFloat((current - baseline).toFixed(4));
461
+ }
462
+
463
+ function buildWatchSummary(
464
+ watchResult: Record<string, unknown>,
465
+ fallbackReason: string | null = null,
466
+ packageEvaluation: Record<string, unknown> | null = null,
467
+ ): DashboardActionResultSummary | null {
468
+ const packageWatch =
469
+ readPackageWatchSummary(watchResult) ?? readPackageWatchSummary(packageEvaluation?.["watch"]);
470
+ const snapshot = packageWatch?.snapshot ?? readMonitoringSnapshot(watchResult["snapshot"]);
471
+ if (!snapshot) return null;
472
+
473
+ const baselinePassRate = snapshot.baseline_pass_rate;
474
+ const currentPassRate = snapshot.pass_rate;
475
+ const regressionDetected = snapshot.regression_detected;
476
+ const gradeAlert = packageWatch?.grade_alert ?? readString(watchResult["gradeAlert"]);
477
+ const alert = packageWatch?.alert ?? readString(watchResult["alert"]);
478
+ const recommendation =
479
+ packageWatch?.recommendation ?? readString(watchResult["recommendation"]) ?? fallbackReason;
480
+ const recommendedCommand =
481
+ packageWatch?.recommended_command ?? readString(watchResult["recommended_command"]);
482
+ const packageEvaluationSource = readPackageEvaluationSource(
483
+ packageEvaluation?.["evaluation_source"],
484
+ );
485
+ const packageCandidateId = readString(packageEvaluation?.["candidate_id"]);
486
+ const packageParentCandidateId = readString(packageEvaluation?.["parent_candidate_id"]);
487
+ const packageCandidateGeneration = readNumber(packageEvaluation?.["candidate_generation"]);
488
+ const packageCandidateAcceptance = readObject(packageEvaluation?.["candidate_acceptance"]);
489
+ const packageCandidateAcceptanceDecision = readString(packageCandidateAcceptance?.["decision"]);
490
+ const packageCandidateAcceptanceRationale = readString(packageCandidateAcceptance?.["rationale"]);
491
+ const packageEvidence = readPackageEvidenceSummary(packageEvaluation?.["evidence"]);
492
+ const packageEfficiency = readPackageEfficiencySummary(packageEvaluation?.["efficiency"]);
493
+ const packageRouting = readPackageReplaySummary(packageEvaluation?.["routing"]);
494
+ const packageBody = readPackageBodySummary(packageEvaluation?.["body"]);
495
+ const packageGrading = readPackageGradingSummary(packageEvaluation?.["grading"]);
496
+ const packageUnitTests = readPackageUnitTestSummary(packageEvaluation?.["unit_tests"]);
497
+
498
+ return {
499
+ reason: alert ?? recommendation,
500
+ improved: alert == null,
501
+ deployed: true,
502
+ before_pass_rate: baselinePassRate,
503
+ before_label: "Baseline",
504
+ after_pass_rate: currentPassRate,
505
+ after_label: "Observed",
506
+ net_change: subtractRates(currentPassRate, baselinePassRate),
507
+ net_change_label: "Delta",
508
+ validation_mode:
509
+ gradeAlert != null && regressionDetected
510
+ ? "trigger+grade_watch"
511
+ : gradeAlert != null
512
+ ? "grade_watch"
513
+ : regressionDetected
514
+ ? "trigger_watch"
515
+ : "live_watch",
516
+ validation_label: "Signal",
517
+ ...((recommendedCommand ?? readString(packageEvaluation?.["next_command"]))
518
+ ? {
519
+ recommended_command:
520
+ recommendedCommand ?? readString(packageEvaluation?.["next_command"]),
521
+ }
522
+ : {}),
523
+ ...(packageEvaluationSource ? { package_evaluation_source: packageEvaluationSource } : {}),
524
+ ...(packageCandidateId ? { package_candidate_id: packageCandidateId } : {}),
525
+ ...(packageParentCandidateId ? { package_parent_candidate_id: packageParentCandidateId } : {}),
526
+ ...(packageCandidateGeneration != null
527
+ ? { package_candidate_generation: packageCandidateGeneration }
528
+ : {}),
529
+ ...(packageCandidateAcceptanceDecision
530
+ ? {
531
+ package_candidate_acceptance_decision:
532
+ packageCandidateAcceptanceDecision as DashboardActionResultSummary["package_candidate_acceptance_decision"],
533
+ }
534
+ : {}),
535
+ ...(packageCandidateAcceptanceRationale
536
+ ? { package_candidate_acceptance_rationale: packageCandidateAcceptanceRationale }
537
+ : {}),
538
+ ...(packageEvidence ? { package_evidence: packageEvidence } : {}),
539
+ ...(packageEfficiency ? { package_efficiency: packageEfficiency } : {}),
540
+ ...(packageRouting ? { package_routing: packageRouting } : {}),
541
+ ...(packageBody ? { package_body: packageBody } : {}),
542
+ ...(packageGrading ? { package_grading: packageGrading } : {}),
543
+ ...(packageUnitTests ? { package_unit_tests: packageUnitTests } : {}),
544
+ ...(packageWatch ? { package_watch: packageWatch } : {}),
545
+ };
546
+ }
547
+
548
+ function buildPackageEvaluationSummary(
549
+ packageEvaluation: Record<string, unknown> | null,
550
+ options: {
551
+ deployed: boolean | null;
552
+ reason: string | null;
553
+ },
554
+ ): DashboardActionResultSummary | null {
555
+ if (!packageEvaluation) return null;
556
+
557
+ const replay = readObject(packageEvaluation["replay"]);
558
+ const baseline = readObject(packageEvaluation["baseline"]);
559
+ const recommendedCommand = readString(packageEvaluation["next_command"]);
560
+ const packageEvaluationSource = readPackageEvaluationSource(
561
+ packageEvaluation["evaluation_source"],
562
+ );
563
+ const packageCandidateId = readString(packageEvaluation["candidate_id"]);
564
+ const packageParentCandidateId = readString(packageEvaluation["parent_candidate_id"]);
565
+ const packageCandidateGeneration = readNumber(packageEvaluation["candidate_generation"]);
566
+ const packageCandidateAcceptance = readObject(packageEvaluation["candidate_acceptance"]);
567
+ const packageCandidateAcceptanceDecision = readString(packageCandidateAcceptance?.["decision"]);
568
+ const packageCandidateAcceptanceRationale = readString(packageCandidateAcceptance?.["rationale"]);
569
+ const packageEvidence = readPackageEvidenceSummary(packageEvaluation["evidence"]);
570
+ const packageEfficiency = readPackageEfficiencySummary(packageEvaluation["efficiency"]);
571
+ const packageRouting = readPackageReplaySummary(packageEvaluation["routing"]);
572
+ const packageBody = readPackageBodySummary(packageEvaluation["body"]);
573
+ const packageGrading = readPackageGradingSummary(packageEvaluation["grading"]);
574
+ const packageUnitTests = readPackageUnitTestSummary(packageEvaluation["unit_tests"]);
575
+ const packageWatch = readPackageWatchSummary(packageEvaluation["watch"]);
576
+
577
+ return {
578
+ reason: options.reason,
579
+ improved: readBoolean(packageEvaluation["evaluation_passed"]),
580
+ deployed: options.deployed,
581
+ before_pass_rate: readNumber(baseline?.["baseline_pass_rate"]),
582
+ after_pass_rate: readNumber(baseline?.["with_skill_pass_rate"]),
583
+ net_change: readNumber(baseline?.["lift"]),
584
+ validation_mode: readString(replay?.["validation_mode"]),
585
+ ...(recommendedCommand ? { recommended_command: recommendedCommand } : {}),
586
+ ...(packageEvaluationSource ? { package_evaluation_source: packageEvaluationSource } : {}),
587
+ ...(packageCandidateId ? { package_candidate_id: packageCandidateId } : {}),
588
+ ...(packageParentCandidateId ? { package_parent_candidate_id: packageParentCandidateId } : {}),
589
+ ...(packageCandidateGeneration != null
590
+ ? { package_candidate_generation: packageCandidateGeneration }
591
+ : {}),
592
+ ...(packageCandidateAcceptanceDecision
593
+ ? {
594
+ package_candidate_acceptance_decision:
595
+ packageCandidateAcceptanceDecision as DashboardActionResultSummary["package_candidate_acceptance_decision"],
596
+ }
597
+ : {}),
598
+ ...(packageCandidateAcceptanceRationale
599
+ ? { package_candidate_acceptance_rationale: packageCandidateAcceptanceRationale }
600
+ : {}),
601
+ ...(packageEvidence ? { package_evidence: packageEvidence } : {}),
602
+ ...(packageEfficiency ? { package_efficiency: packageEfficiency } : {}),
603
+ ...(packageRouting ? { package_routing: packageRouting } : {}),
604
+ ...(packageBody ? { package_body: packageBody } : {}),
605
+ ...(packageGrading ? { package_grading: packageGrading } : {}),
606
+ ...(packageUnitTests ? { package_unit_tests: packageUnitTests } : {}),
607
+ ...(packageWatch ? { package_watch: packageWatch } : {}),
608
+ };
609
+ }
610
+
611
/**
 * Builds the search-run section of a dashboard summary from a parsed payload.
 *
 * Returns `null` when `readString` yields no truthy `search_id`. Numeric counts
 * that cannot be read fall back to 0, and an unreadable
 * `parent_selection_method` or `weakness_source` falls back to "unknown".
 * `surface_plan` is attached only when `provenance.surface_plan` is a non-null
 * object; its `routing_weakness` / `body_weakness` values are passed through
 * from `readNumber` without a default.
 */
function extractSearchRunSummary(
  parsed: Record<string, unknown>,
): DashboardSearchRunSummary | null {
  const id = readString(parsed["search_id"]);
  if (!id) {
    return null;
  }

  // Only treat `provenance` as a field bag when it is a non-null object.
  const rawProvenance = parsed["provenance"];
  let provenanceFields: Record<string, unknown> | null = null;
  if (rawProvenance && typeof rawProvenance === "object") {
    provenanceFields = rawProvenance as Record<string, unknown>;
  }

  // `typeof null === "object"`, so `plan` may still be null here; the
  // truthiness check in the returned literal filters that case out.
  let plan: Record<string, unknown> | null = null;
  if (provenanceFields && typeof provenanceFields["surface_plan"] === "object") {
    plan = provenanceFields["surface_plan"] as Record<string, unknown>;
  }

  let frontierSize = 0;
  let selectionMethod = "unknown";
  if (provenanceFields) {
    frontierSize = readNumber(provenanceFields["frontier_size"]) ?? 0;
    selectionMethod = readString(provenanceFields["parent_selection_method"]) ?? "unknown";
  }

  return {
    search_id: id,
    parent_candidate_id: readString(parsed["parent_candidate_id"]),
    winner_candidate_id: readString(parsed["winner_candidate_id"]),
    winner_rationale: readString(parsed["winner_rationale"]),
    candidates_evaluated: readNumber(parsed["candidates_evaluated"]) ?? 0,
    frontier_size: frontierSize,
    parent_selection_method: selectionMethod,
    ...(plan
      ? {
          surface_plan: {
            routing_count: readNumber(plan["routing_count"]) ?? 0,
            body_count: readNumber(plan["body_count"]) ?? 0,
            weakness_source: readString(plan["weakness_source"]) ?? "unknown",
            routing_weakness: readNumber(plan["routing_weakness"]),
            body_weakness: readNumber(plan["body_weakness"]),
          },
        }
      : {}),
  };
}
648
+
40
649
  export function extractDashboardActionSummary(
41
650
  action: DashboardActionName,
42
651
  stdout: string,
43
652
  ): DashboardActionResultSummary | null {
44
- if (action !== "replay-dry-run") return null;
45
-
46
653
  const parsed = extractJsonObject(stdout);
47
654
  if (!parsed) return null;
48
655
 
49
- return {
50
- reason: readString(parsed["reason"]),
51
- improved: readBoolean(parsed["improved"]),
52
- deployed: readBoolean(parsed["deployed"]),
53
- before_pass_rate: readNumber(parsed["before_pass_rate"]) ?? readNumber(parsed["before"]),
54
- after_pass_rate: readNumber(parsed["after_pass_rate"]) ?? readNumber(parsed["after"]),
55
- net_change: readNumber(parsed["net_change"]),
56
- validation_mode: readString(parsed["validation_mode"]),
57
- };
656
+ if (action === "create-check") {
657
+ const readiness = readObject(parsed["readiness"]);
658
+ const specValidation = readObject(parsed["spec_validation"]);
659
+ const ok = readBoolean(parsed["ok"]);
660
+ const state = readString(parsed["state"]);
661
+ const recommendedCommand = readString(readiness?.["recommended_command"]);
662
+
663
+ return {
664
+ reason:
665
+ readString(readiness?.["summary"]) ??
666
+ (ok === true
667
+ ? "Draft package passed create check"
668
+ : state
669
+ ? `Draft package is in ${state.replaceAll("_", " ")} state`
670
+ : null),
671
+ improved: ok,
672
+ deployed: null,
673
+ before_pass_rate: null,
674
+ after_pass_rate: null,
675
+ net_change: null,
676
+ validation_mode: readString(specValidation?.["validator"]),
677
+ ...(recommendedCommand ? { recommended_command: recommendedCommand } : {}),
678
+ };
679
+ }
680
+
681
+ if (action === "replay-dry-run") {
682
+ return {
683
+ reason: readString(parsed["reason"]),
684
+ improved: readBoolean(parsed["improved"]),
685
+ deployed: readBoolean(parsed["deployed"]),
686
+ before_pass_rate: readNumber(parsed["before_pass_rate"]) ?? readNumber(parsed["before"]),
687
+ after_pass_rate: readNumber(parsed["after_pass_rate"]) ?? readNumber(parsed["after"]),
688
+ net_change: readNumber(parsed["net_change"]),
689
+ validation_mode: readString(parsed["validation_mode"]),
690
+ };
691
+ }
692
+
693
+ if (action === "search-run") {
694
+ const searchRun = extractSearchRunSummary(parsed);
695
+ const packageSummary = buildPackageEvaluationSummary(readObject(parsed["package_evaluation"]), {
696
+ deployed: false,
697
+ reason: readString(parsed["winner_rationale"]),
698
+ });
699
+ return {
700
+ ...(packageSummary ?? {
701
+ reason: readString(parsed["winner_rationale"]),
702
+ improved: readBoolean(parsed["improved"]) ?? searchRun?.winner_candidate_id != null,
703
+ deployed: null,
704
+ before_pass_rate: null,
705
+ after_pass_rate: null,
706
+ net_change: null,
707
+ validation_mode: null,
708
+ ...(readString(parsed["next_command"])
709
+ ? { recommended_command: readString(parsed["next_command"]) }
710
+ : {}),
711
+ }),
712
+ search_run: searchRun,
713
+ };
714
+ }
715
+
716
+ if (action === "measure-baseline") {
717
+ const packageEfficiency = readPackageEfficiencySummary(parsed["runtime_metrics"]);
718
+ return {
719
+ reason:
720
+ readBoolean(parsed["adds_value"]) === false ? "Baseline gate failed" : "Baseline measured",
721
+ improved: readBoolean(parsed["adds_value"]),
722
+ deployed: null,
723
+ before_pass_rate: readNumber(parsed["baseline_pass_rate"]),
724
+ after_pass_rate: readNumber(parsed["with_skill_pass_rate"]),
725
+ net_change: readNumber(parsed["lift"]),
726
+ validation_mode: readString(parsed["mode"]) === "package" ? "host_replay" : null,
727
+ ...(packageEfficiency ? { package_efficiency: packageEfficiency } : {}),
728
+ };
729
+ }
730
+
731
+ if (action === "report-package") {
732
+ const report = readObject(parsed["report"]);
733
+ const summary = readObject(parsed["summary"]) ?? readObject(report?.["summary"]);
734
+ const status = readString(summary?.["status"]);
735
+ const packageSummary = buildPackageEvaluationSummary(summary, {
736
+ deployed: null,
737
+ reason:
738
+ status === "replay_failed"
739
+ ? "Package report detected replay failures"
740
+ : status === "baseline_failed"
741
+ ? "Package report detected a baseline regression"
742
+ : "Package report ready",
743
+ });
744
+ if (packageSummary) {
745
+ return packageSummary;
746
+ }
747
+
748
+ const readiness = readObject(parsed["readiness"]);
749
+ const verified = readBoolean(parsed["verified"]);
750
+ const readinessState =
751
+ readString(parsed["readiness_state"]) ?? readString(readiness?.["state"]);
752
+ const recommendedCommand =
753
+ readString(parsed["next_command"]) ?? readString(readiness?.["next_command"]);
754
+
755
+ return {
756
+ reason:
757
+ readString(readiness?.["summary"]) ??
758
+ (readinessState
759
+ ? `Draft package is in ${readinessState.replaceAll("_", " ")} state`
760
+ : null),
761
+ improved: verified ?? readBoolean(readiness?.["ok"]),
762
+ deployed: null,
763
+ before_pass_rate: null,
764
+ after_pass_rate: null,
765
+ net_change: null,
766
+ validation_mode: null,
767
+ ...(recommendedCommand ? { recommended_command: recommendedCommand } : {}),
768
+ };
769
+ }
770
+
771
+ if (action === "deploy-candidate" || action === "watch") {
772
+ const packageEvaluation = readObject(parsed["package_evaluation"]);
773
+
774
+ if (action === "watch") {
775
+ const directWatchSummary = buildWatchSummary(parsed);
776
+ if (directWatchSummary) return directWatchSummary;
777
+
778
+ const nestedWatchResult = readObject(parsed["watch_result"]);
779
+ const nestedWatchSummary = nestedWatchResult
780
+ ? buildWatchSummary(
781
+ nestedWatchResult,
782
+ "Package evaluation passed and watch started",
783
+ packageEvaluation,
784
+ )
785
+ : null;
786
+ if (nestedWatchSummary) return nestedWatchSummary;
787
+ }
788
+
789
+ const status = readString(packageEvaluation?.["status"]);
790
+ const published = readBoolean(parsed["published"]);
791
+ const watchGatePassed =
792
+ action === "watch"
793
+ ? readString(parsed["alert"]) == null
794
+ : (readBoolean(parsed["watch_gate_passed"]) ?? null);
795
+ const baseSummary = buildPackageEvaluationSummary(packageEvaluation, {
796
+ deployed: published,
797
+ reason:
798
+ status === "replay_failed"
799
+ ? "Package replay failed"
800
+ : status === "baseline_failed"
801
+ ? "Package baseline failed"
802
+ : action === "watch" && readBoolean(parsed["watch_started"])
803
+ ? "Package evaluation passed and watch started"
804
+ : published
805
+ ? "Package evaluation passed"
806
+ : null,
807
+ });
808
+ if (baseSummary) {
809
+ return { ...baseSummary, watch_gate_passed: watchGatePassed };
810
+ }
811
+ return baseSummary;
812
+ }
813
+
814
+ return null;
58
815
  }
59
816
 
60
817
  function isSuccessfulReplayDryRun(summary: DashboardActionResultSummary | null): boolean {
@@ -72,6 +829,14 @@ export function resolveDashboardActionOutcome(
72
829
  ): DashboardActionOutcome {
73
830
  const summary = extractDashboardActionSummary(input.action, input.stdout);
74
831
 
832
+ if (input.action === "watch" && summary?.improved === false) {
833
+ return {
834
+ success: false,
835
+ summary,
836
+ error: summary.reason ?? input.stderr ?? "Watch detected a regression",
837
+ };
838
+ }
839
+
75
840
  if (input.exitCode === 0) {
76
841
  return { success: true, error: null, summary };
77
842
  }