selftune 0.2.31 → 0.2.32
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +83 -56
- package/apps/local-dashboard/dist/assets/index-B-ut4w0B.js +15 -0
- package/apps/local-dashboard/dist/assets/index-BFGfCVrL.css +1 -0
- package/apps/local-dashboard/dist/assets/vendor-ui-DfowE3Hu.js +1 -0
- package/apps/local-dashboard/dist/index.html +3 -3
- package/cli/selftune/command-surface.ts +613 -2
- package/cli/selftune/create/baseline.ts +429 -0
- package/cli/selftune/create/check.ts +35 -0
- package/cli/selftune/create/init.ts +115 -0
- package/cli/selftune/create/package-candidate-state.ts +771 -0
- package/cli/selftune/create/package-evaluator.ts +710 -0
- package/cli/selftune/create/package-fingerprint.ts +142 -0
- package/cli/selftune/create/package-search.ts +377 -0
- package/cli/selftune/create/publish.ts +431 -0
- package/cli/selftune/create/readiness.ts +495 -0
- package/cli/selftune/create/replay.ts +330 -0
- package/cli/selftune/create/report.ts +74 -0
- package/cli/selftune/create/scaffold.ts +121 -0
- package/cli/selftune/create/skills-ref-adapter.ts +177 -0
- package/cli/selftune/create/status.ts +33 -0
- package/cli/selftune/create/templates.ts +249 -0
- package/cli/selftune/cron/setup.ts +1 -1
- package/cli/selftune/dashboard-action-events.ts +4 -1
- package/cli/selftune/dashboard-action-result.ts +789 -24
- package/cli/selftune/dashboard-action-stream.ts +80 -0
- package/cli/selftune/dashboard-contract.ts +146 -3
- package/cli/selftune/dashboard-server.ts +5 -4
- package/cli/selftune/eval/hooks-to-evals.ts +58 -35
- package/cli/selftune/eval/synthetic-evals.ts +145 -17
- package/cli/selftune/evolution/bounded-mutations.ts +1045 -0
- package/cli/selftune/evolution/evolve-body.ts +9 -36
- package/cli/selftune/evolution/evolve.ts +8 -72
- package/cli/selftune/evolution/stopping-criteria.ts +5 -13
- package/cli/selftune/evolution/unblock-suggestions.ts +0 -16
- package/cli/selftune/evolution/validate-host-replay.ts +115 -15
- package/cli/selftune/improve.ts +206 -0
- package/cli/selftune/index.ts +123 -6
- package/cli/selftune/init.ts +1 -1
- package/cli/selftune/localdb/queries/dashboard.ts +30 -0
- package/cli/selftune/localdb/schema.ts +52 -0
- package/cli/selftune/monitoring/watch.ts +257 -23
- package/cli/selftune/orchestrate/execute.ts +300 -1
- package/cli/selftune/orchestrate/finalize.ts +14 -0
- package/cli/selftune/orchestrate/plan.ts +22 -5
- package/cli/selftune/orchestrate/prepare.ts +59 -4
- package/cli/selftune/orchestrate/report.ts +1 -1
- package/cli/selftune/orchestrate.ts +34 -1
- package/cli/selftune/publish.ts +35 -0
- package/cli/selftune/routes/actions.ts +81 -15
- package/cli/selftune/routes/overview.ts +1 -1
- package/cli/selftune/routes/skill-report.ts +147 -2
- package/cli/selftune/run.ts +18 -0
- package/cli/selftune/schedule.ts +3 -3
- package/cli/selftune/search-run.ts +703 -0
- package/cli/selftune/status.ts +35 -11
- package/cli/selftune/testing-readiness.ts +431 -40
- package/cli/selftune/types.ts +316 -0
- package/cli/selftune/utils/eval-readiness.ts +1 -0
- package/cli/selftune/utils/json-output.ts +11 -0
- package/cli/selftune/utils/lifecycle-surface.ts +48 -0
- package/cli/selftune/utils/query-filter.ts +82 -1
- package/cli/selftune/utils/tui.ts +85 -2
- package/cli/selftune/verify.ts +205 -0
- package/cli/selftune/workflows/proposals.ts +1 -1
- package/cli/selftune/workflows/skill-scaffold.ts +141 -63
- package/cli/selftune/workflows/workflows.ts +4 -4
- package/package.json +1 -1
- package/skill/SKILL.md +148 -85
- package/skill/references/cli-quick-reference.md +16 -1
- package/skill/references/creator-playbook.md +31 -10
- package/skill/workflows/Baseline.md +8 -9
- package/skill/workflows/Contributions.md +4 -4
- package/skill/workflows/Create.md +173 -0
- package/skill/workflows/CreateTestDeploy.md +34 -30
- package/skill/workflows/Cron.md +2 -2
- package/skill/workflows/Dashboard.md +3 -3
- package/skill/workflows/Evals.md +13 -7
- package/skill/workflows/Evolve.md +75 -32
- package/skill/workflows/EvolveBody.md +22 -15
- package/skill/workflows/Hook.md +1 -1
- package/skill/workflows/Improve.md +168 -0
- package/skill/workflows/Initialize.md +3 -3
- package/skill/workflows/Orchestrate.md +49 -12
- package/skill/workflows/Publish.md +100 -0
- package/skill/workflows/Run.md +72 -0
- package/skill/workflows/Schedule.md +2 -2
- package/skill/workflows/SearchRun.md +89 -0
- package/skill/workflows/SignalsDashboard.md +2 -2
- package/skill/workflows/UnitTest.md +13 -4
- package/skill/workflows/Verify.md +136 -0
- package/skill/workflows/Watch.md +114 -47
- package/skill/workflows/Workflows.md +13 -8
- package/apps/local-dashboard/dist/assets/index-B7v_o1WC.js +0 -15
- package/apps/local-dashboard/dist/assets/index-CrO77SVi.css +0 -1
- package/apps/local-dashboard/dist/assets/vendor-ui-B0H8s1mP.js +0 -1
|
@@ -1,4 +1,22 @@
|
|
|
1
|
-
import type {
|
|
1
|
+
import type {
|
|
2
|
+
DashboardActionName,
|
|
3
|
+
DashboardActionResultSummary,
|
|
4
|
+
DashboardSearchRunSummary,
|
|
5
|
+
} from "./dashboard-contract.js";
|
|
6
|
+
import type {
|
|
7
|
+
CreatePackageBodySummary,
|
|
8
|
+
CreatePackageEvaluationEfficiencySummary,
|
|
9
|
+
CreatePackageEvaluationEvidenceSample,
|
|
10
|
+
CreatePackageEvaluationEvidenceSummary,
|
|
11
|
+
CreatePackageEvaluationGradingSummary,
|
|
12
|
+
CreatePackageEvaluationSource,
|
|
13
|
+
CreatePackageReplaySummary,
|
|
14
|
+
CreatePackageEvaluationUnitTestSummary,
|
|
15
|
+
CreatePackageEvaluationWatchSummary,
|
|
16
|
+
MonitoringSnapshot,
|
|
17
|
+
RuntimeReplayAggregateMetrics,
|
|
18
|
+
} from "./types.js";
|
|
19
|
+
import { extractJsonObject } from "./utils/json-output.js";
|
|
2
20
|
|
|
3
21
|
export interface DashboardActionOutcomeInput {
|
|
4
22
|
action: DashboardActionName;
|
|
@@ -13,18 +31,6 @@ export interface DashboardActionOutcome {
|
|
|
13
31
|
summary: DashboardActionResultSummary | null;
|
|
14
32
|
}
|
|
15
33
|
|
|
16
|
-
function extractJsonObject(stdout: string): Record<string, unknown> | null {
|
|
17
|
-
const trimmed = stdout.trim();
|
|
18
|
-
if (!trimmed.startsWith("{") || !trimmed.endsWith("}")) return null;
|
|
19
|
-
|
|
20
|
-
try {
|
|
21
|
-
const parsed = JSON.parse(trimmed) as unknown;
|
|
22
|
-
return parsed && typeof parsed === "object" ? (parsed as Record<string, unknown>) : null;
|
|
23
|
-
} catch {
|
|
24
|
-
return null;
|
|
25
|
-
}
|
|
26
|
-
}
|
|
27
|
-
|
|
28
34
|
/** Returns `value` unchanged when it is a boolean primitive; otherwise `null`. */
function readBoolean(value: unknown): boolean | null {
  if (typeof value !== "boolean") {
    return null;
  }
  return value;
}
|
|
@@ -37,24 +43,775 @@ function readString(value: unknown): string | null {
|
|
|
37
43
|
return typeof value === "string" && value.trim().length > 0 ? value : null;
|
|
38
44
|
}
|
|
39
45
|
|
|
46
|
+
/**
 * Narrows an unknown value to a keyed record.
 *
 * Returns `null` for `null`/`undefined`, primitives, and arrays. Arrays are
 * rejected explicitly because `typeof [] === "object"`, yet an array is not a
 * sound `Record<string, unknown>` for the keyed lookups done on the result.
 *
 * @param value - Any value, typically a fragment of parsed JSON.
 * @returns The same reference typed as a record, or `null`.
 */
function readObject(value: unknown): Record<string, unknown> | null {
  if (value == null || typeof value !== "object" || Array.isArray(value)) {
    return null;
  }
  return value as Record<string, unknown>;
}
|
|
49
|
+
|
|
50
|
+
/**
 * Parses one evidence sample.
 *
 * A sample is only accepted when it carries a usable `query` string; the
 * `evidence` field is optional and is passed through as `readString` yields it.
 */
function readEvidenceSample(value: unknown): CreatePackageEvaluationEvidenceSample | null {
  const record = readObject(value);
  const query = readString(record?.["query"]);
  if (query == null) {
    return null;
  }

  const evidence = readString(record?.["evidence"]);
  return { query, evidence };
}
|
|
60
|
+
|
|
61
|
+
/**
 * Parses a list of evidence samples, dropping entries that do not parse.
 *
 * Non-array input produces an empty list.
 */
function readEvidenceSamples(value: unknown): CreatePackageEvaluationEvidenceSample[] {
  const samples: CreatePackageEvaluationEvidenceSample[] = [];
  if (!Array.isArray(value)) {
    return samples;
  }

  for (const entry of value) {
    const parsed = readEvidenceSample(entry);
    if (parsed != null) {
      samples.push(parsed);
    }
  }
  return samples;
}
|
|
67
|
+
|
|
68
|
+
/**
 * Parses an aggregate runtime-replay metrics record.
 *
 * `eval_runs`, `usage_observations`, `total_duration_ms` and `avg_duration_ms`
 * must all be readable via `readNumber`; if any is missing the whole record is
 * rejected with `null`. The token, cost and turn totals are optional and are
 * forwarded exactly as `readNumber` returns them.
 */
function readRuntimeReplayAggregateMetrics(value: unknown): RuntimeReplayAggregateMetrics | null {
  const record = readObject(value);
  if (record == null) return null;

  const evalRuns = readNumber(record["eval_runs"]);
  const usageObservations = readNumber(record["usage_observations"]);
  const totalDurationMs = readNumber(record["total_duration_ms"]);
  const avgDurationMs = readNumber(record["avg_duration_ms"]);

  // Required counters: bail out as soon as one is absent.
  if (evalRuns == null) return null;
  if (usageObservations == null) return null;
  if (totalDurationMs == null) return null;
  if (avgDurationMs == null) return null;

  const totalInputTokens = readNumber(record["total_input_tokens"]);
  const totalOutputTokens = readNumber(record["total_output_tokens"]);
  const totalCacheCreationInputTokens = readNumber(record["total_cache_creation_input_tokens"]);
  const totalCacheReadInputTokens = readNumber(record["total_cache_read_input_tokens"]);
  const totalCostUsd = readNumber(record["total_cost_usd"]);
  const totalTurns = readNumber(record["total_turns"]);

  return {
    eval_runs: evalRuns,
    usage_observations: usageObservations,
    total_duration_ms: totalDurationMs,
    avg_duration_ms: avgDurationMs,
    total_input_tokens: totalInputTokens,
    total_output_tokens: totalOutputTokens,
    total_cache_creation_input_tokens: totalCacheCreationInputTokens,
    total_cache_read_input_tokens: totalCacheReadInputTokens,
    total_cost_usd: totalCostUsd,
    total_turns: totalTurns,
  };
}
|
|
98
|
+
|
|
99
|
+
/**
 * Parses the evidence section of a package evaluation.
 *
 * Returns `null` when the input is not an object, or when it carries neither
 * any of the three counters nor any parsable sample. A missing counter falls
 * back to the length of its corresponding sample list.
 */
function readPackageEvidenceSummary(value: unknown): CreatePackageEvaluationEvidenceSummary | null {
  const record = readObject(value);
  if (record == null) return null;

  const failureCount = readNumber(record["replay_failures"]);
  const winCount = readNumber(record["baseline_wins"]);
  const regressionCount = readNumber(record["baseline_regressions"]);
  const failureSamples = readEvidenceSamples(record["replay_failure_samples"]);
  const winSamples = readEvidenceSamples(record["baseline_win_samples"]);
  const regressionSamples = readEvidenceSamples(record["baseline_regression_samples"]);

  const hasAnyCount = failureCount != null || winCount != null || regressionCount != null;
  const hasAnySample =
    failureSamples.length > 0 || winSamples.length > 0 || regressionSamples.length > 0;
  if (!hasAnyCount && !hasAnySample) {
    return null;
  }

  return {
    replay_failures: failureCount ?? failureSamples.length,
    baseline_wins: winCount ?? winSamples.length,
    baseline_regressions: regressionCount ?? regressionSamples.length,
    replay_failure_samples: failureSamples,
    baseline_win_samples: winSamples,
    baseline_regression_samples: regressionSamples,
  };
}
|
|
130
|
+
|
|
131
|
+
/**
 * Parses the efficiency section of a package evaluation.
 *
 * Both the `with_skill` and `without_skill` metric records must parse;
 * otherwise the summary is rejected with `null`.
 */
function readPackageEfficiencySummary(
  value: unknown,
): CreatePackageEvaluationEfficiencySummary | null {
  const record = readObject(value);
  if (record == null) return null;

  const with_skill = readRuntimeReplayAggregateMetrics(record["with_skill"]);
  const without_skill = readRuntimeReplayAggregateMetrics(record["without_skill"]);
  if (with_skill == null || without_skill == null) {
    return null;
  }

  return { with_skill, without_skill };
}
|
|
146
|
+
|
|
147
|
+
/**
 * Parses the evaluation source tag.
 *
 * Only the literals `"fresh"`, `"artifact_cache"` and `"candidate_cache"` are
 * accepted; anything else yields `null`.
 */
function readPackageEvaluationSource(value: unknown): CreatePackageEvaluationSource | null {
  const source = readString(value);
  switch (source) {
    case "fresh":
    case "artifact_cache":
    case "candidate_cache":
      return source;
    default:
      return null;
  }
}
|
|
154
|
+
|
|
155
|
+
/**
 * Parses a replay summary (routing or package replay) from a package evaluation.
 *
 * Every identifying field and counter is required: `mode` must be `"routing"`
 * or `"package"`, `validation_mode` must be `"host_replay"`, and `agent`,
 * `proposal_id`, `fixture_id`, `total`, `passed`, `failed` and `pass_rate`
 * must all be readable. If any check fails the result is `null`.
 *
 * `runtime_metrics` is optional and only included when it parses.
 *
 * @param value - Unknown fragment expected to hold the replay summary.
 * @returns The validated summary, or `null` when the input is malformed.
 */
function readPackageReplaySummary(value: unknown): CreatePackageReplaySummary | null {
  const summary = readObject(value);
  if (!summary) return null;

  const mode = readString(summary["mode"]);
  const validationMode = readString(summary["validation_mode"]);
  const agent = readString(summary["agent"]);
  const proposalId = readString(summary["proposal_id"]);
  const fixtureId = readString(summary["fixture_id"]);
  const total = readNumber(summary["total"]);
  const passed = readNumber(summary["passed"]);
  const failed = readNumber(summary["failed"]);
  const passRate = readNumber(summary["pass_rate"]);
  if (
    (mode !== "routing" && mode !== "package") ||
    validationMode !== "host_replay" ||
    agent == null ||
    proposalId == null ||
    fixtureId == null ||
    total == null ||
    passed == null ||
    failed == null ||
    passRate == null
  ) {
    return null;
  }

  // Parse the optional metrics once and reuse the result for both the
  // presence check and the emitted value.
  const runtimeMetrics = readRuntimeReplayAggregateMetrics(summary["runtime_metrics"]);

  return {
    mode,
    validation_mode: validationMode,
    agent,
    proposal_id: proposalId,
    fixture_id: fixtureId,
    total,
    passed,
    failed,
    pass_rate: passRate,
    ...(runtimeMetrics ? { runtime_metrics: runtimeMetrics } : {}),
  };
}
|
|
197
|
+
|
|
198
|
+
/**
 * Parses the body-validation section of a package evaluation.
 *
 * `structural_valid`, `structural_reason`, `quality_threshold` and `valid`
 * are required; the quality score, reason and pass flag are optional and are
 * forwarded as read.
 */
function readPackageBodySummary(value: unknown): CreatePackageBodySummary | null {
  const record = readObject(value);
  if (record == null) return null;

  const structuralValid = readBoolean(record["structural_valid"]);
  const structuralReason = readString(record["structural_reason"]);
  const qualityThreshold = readNumber(record["quality_threshold"]);
  const valid = readBoolean(record["valid"]);

  if (structuralValid == null || structuralReason == null) return null;
  if (qualityThreshold == null || valid == null) return null;

  const qualityScore = readNumber(record["quality_score"]);
  const qualityReason = readString(record["quality_reason"]);
  const qualityPassed = readBoolean(record["quality_passed"]);

  return {
    structural_valid: structuralValid,
    structural_reason: structuralReason,
    quality_score: qualityScore,
    quality_reason: qualityReason,
    quality_threshold: qualityThreshold,
    quality_passed: qualityPassed,
    valid,
  };
}
|
|
225
|
+
|
|
226
|
+
/**
 * Parses the grading section of a package evaluation.
 *
 * The `baseline` part is kept only when its pass rate, measurement timestamp
 * and sample size are all present; the `recent` part is kept only when its
 * sample size is present. If neither part survives, the result is `null`.
 */
function readPackageGradingSummary(value: unknown): CreatePackageEvaluationGradingSummary | null {
  const record = readObject(value);
  if (record == null) return null;

  const baselineRecord = readObject(record["baseline"]);
  const recentRecord = readObject(record["recent"]);

  const passRate = readNumber(baselineRecord?.["pass_rate"]);
  const measuredAt = readString(baselineRecord?.["measured_at"]);
  const baselineSamples = readNumber(baselineRecord?.["sample_size"]);
  const recentSamples = readNumber(recentRecord?.["sample_size"]);

  let baseline: CreatePackageEvaluationGradingSummary["baseline"] = null;
  if (passRate != null && measuredAt != null && baselineSamples != null) {
    baseline = {
      proposal_id: readString(baselineRecord?.["proposal_id"]),
      measured_at: measuredAt,
      pass_rate: passRate,
      mean_score: readNumber(baselineRecord?.["mean_score"]),
      sample_size: baselineSamples,
    };
  }

  let recent: CreatePackageEvaluationGradingSummary["recent"] = null;
  if (recentSamples != null) {
    recent = {
      sample_size: recentSamples,
      average_pass_rate: readNumber(recentRecord?.["average_pass_rate"]),
      average_mean_score: readNumber(recentRecord?.["average_mean_score"]),
      newest_graded_at: readString(recentRecord?.["newest_graded_at"]),
      oldest_graded_at: readString(recentRecord?.["oldest_graded_at"]),
    };
  }

  // Nothing usable on either side means there is no grading summary to report.
  if (baseline == null && recent == null) return null;

  return {
    baseline,
    recent,
    pass_rate_delta: readNumber(record["pass_rate_delta"]),
    mean_score_delta: readNumber(record["mean_score_delta"]),
    regressed: readBoolean(record["regressed"]),
  };
}
|
|
268
|
+
|
|
269
|
+
/**
 * Parses the unit-test section of a package evaluation.
 *
 * `total`, `passed`, `failed`, `pass_rate` and `run_at` are required. The
 * `failing_tests` list is optional: entries without a usable `test_id` are
 * dropped, and within each entry only non-blank string assertions are kept.
 */
function readPackageUnitTestSummary(value: unknown): CreatePackageEvaluationUnitTestSummary | null {
  const record = readObject(value);
  if (record == null) return null;

  const total = readNumber(record["total"]);
  const passed = readNumber(record["passed"]);
  const failed = readNumber(record["failed"]);
  const passRate = readNumber(record["pass_rate"]);
  const runAt = readString(record["run_at"]);
  if (total == null || passed == null || failed == null) return null;
  if (passRate == null || runAt == null) return null;

  const failingTests: CreatePackageEvaluationUnitTestSummary["failing_tests"] = [];
  const rawFailures = record["failing_tests"];
  if (Array.isArray(rawFailures)) {
    for (const entry of rawFailures) {
      const failure = readObject(entry);
      const testId = readString(failure?.["test_id"]);
      if (testId == null) continue;

      const failedAssertions: string[] = [];
      const rawAssertions = failure?.["failed_assertions"];
      if (Array.isArray(rawAssertions)) {
        for (const assertion of rawAssertions) {
          if (typeof assertion === "string" && assertion.trim().length > 0) {
            failedAssertions.push(assertion);
          }
        }
      }

      failingTests.push({
        test_id: testId,
        error: readString(failure?.["error"]),
        failed_assertions: failedAssertions,
      });
    }
  }

  return {
    total,
    passed,
    failed,
    pass_rate: passRate,
    run_at: runAt,
    failing_tests: failingTests,
  };
}
|
|
317
|
+
|
|
318
|
+
/**
 * Parses a `{ passed, total }` pair for one invocation type.
 *
 * Both counters must be readable; otherwise the result is `null`.
 */
function readInvocationTotals(value: unknown): { passed: number; total: number } | null {
  const record = readObject(value);
  const passed = readNumber(record?.["passed"]);
  const total = readNumber(record?.["total"]);

  return passed != null && total != null ? { passed, total } : null;
}
|
|
326
|
+
|
|
327
|
+
/**
 * Parses a monitoring snapshot.
 *
 * Every field is mandatory, including the four per-invocation-type totals
 * (`explicit`, `implicit`, `contextual`, `negative`) under
 * `by_invocation_type`. A single missing or mistyped field rejects the whole
 * snapshot with `null`.
 */
function readMonitoringSnapshot(value: unknown): MonitoringSnapshot | null {
  const record = readObject(value);
  if (record == null) return null;

  const timestamp = readString(record["timestamp"]);
  const skillName = readString(record["skill_name"]);
  const windowSessions = readNumber(record["window_sessions"]);
  const skillChecks = readNumber(record["skill_checks"]);
  const passRate = readNumber(record["pass_rate"]);
  const falseNegativeRate = readNumber(record["false_negative_rate"]);
  const regressionDetected = readBoolean(record["regression_detected"]);
  const baselinePassRate = readNumber(record["baseline_pass_rate"]);

  const perType = readObject(record["by_invocation_type"]);
  const explicit = readInvocationTotals(perType?.["explicit"]);
  const implicit = readInvocationTotals(perType?.["implicit"]);
  const contextual = readInvocationTotals(perType?.["contextual"]);
  const negative = readInvocationTotals(perType?.["negative"]);

  // Identity fields.
  if (timestamp == null || skillName == null) return null;
  // Volume and rate fields.
  if (windowSessions == null || skillChecks == null) return null;
  if (passRate == null || falseNegativeRate == null) return null;
  // Regression fields.
  if (regressionDetected == null || baselinePassRate == null) return null;
  // Per-invocation-type breakdown.
  if (explicit == null || implicit == null) return null;
  if (contextual == null || negative == null) return null;

  return {
    timestamp,
    skill_name: skillName,
    window_sessions: windowSessions,
    skill_checks: skillChecks,
    pass_rate: passRate,
    false_negative_rate: falseNegativeRate,
    by_invocation_type: { explicit, implicit, contextual, negative },
    regression_detected: regressionDetected,
    baseline_pass_rate: baselinePassRate,
  };
}
|
|
380
|
+
|
|
381
|
+
/**
 * Parses a grade-regression record.
 *
 * `before`, `after` and `delta` must all be readable; otherwise `null`.
 */
function readGradeRegression(
  value: unknown,
): CreatePackageEvaluationWatchSummary["grade_regression"] {
  const record = readObject(value);
  if (record == null) return null;

  const before = readNumber(record["before"]);
  const after = readNumber(record["after"]);
  const delta = readNumber(record["delta"]);

  const complete = before != null && after != null && delta != null;
  return complete ? { before, after, delta } : null;
}
|
|
394
|
+
|
|
395
|
+
/**
 * Parses an efficiency-regression record.
 *
 * Only `sample_size` is required. The baseline/observed/delta-ratio values for
 * duration, input tokens, output tokens and turns are forwarded as read.
 */
function readEfficiencyRegression(
  value: unknown,
): CreatePackageEvaluationWatchSummary["efficiency_regression"] {
  const record = readObject(value);
  if (record == null) return null;

  const sampleSize = readNumber(record["sample_size"]);
  if (sampleSize == null) return null;

  // Small accessor so each metric line only names its key.
  const metric = (key: string) => readNumber(record[key]);

  return {
    sample_size: sampleSize,
    baseline_avg_duration_ms: metric("baseline_avg_duration_ms"),
    observed_avg_duration_ms: metric("observed_avg_duration_ms"),
    duration_delta_ratio: metric("duration_delta_ratio"),
    baseline_avg_input_tokens: metric("baseline_avg_input_tokens"),
    observed_avg_input_tokens: metric("observed_avg_input_tokens"),
    input_tokens_delta_ratio: metric("input_tokens_delta_ratio"),
    baseline_avg_output_tokens: metric("baseline_avg_output_tokens"),
    observed_avg_output_tokens: metric("observed_avg_output_tokens"),
    output_tokens_delta_ratio: metric("output_tokens_delta_ratio"),
    baseline_avg_turns: metric("baseline_avg_turns"),
    observed_avg_turns: metric("observed_avg_turns"),
    turns_delta_ratio: metric("turns_delta_ratio"),
  };
}
|
|
420
|
+
|
|
421
|
+
/**
 * Parses a watch summary from either the snake_case or the camelCase payload.
 *
 * A valid `snapshot`, a `rolled_back`/`rolledBack` boolean and a
 * `recommendation` string are required; otherwise the result is `null`.
 * For `rolled_back`, `grade_alert`, `grade_regression`, `efficiency_alert`
 * and `efficiency_regression` the snake_case key is tried first and the
 * camelCase key is the fallback.
 *
 * `efficiency_alert` and `efficiency_regression` are only added to the result
 * when present, so the keys are absent (not `null`) otherwise.
 *
 * @param value - Unknown fragment expected to hold a watch result.
 * @returns The validated watch summary, or `null` when required data is missing.
 */
function readPackageWatchSummary(value: unknown): CreatePackageEvaluationWatchSummary | null {
  const summary = readObject(value);
  if (!summary) return null;

  const snapshot = readMonitoringSnapshot(summary["snapshot"]);
  const rolledBack = readBoolean(summary["rolled_back"]) ?? readBoolean(summary["rolledBack"]);
  const recommendation = readString(summary["recommendation"]);

  if (!snapshot || rolledBack == null || recommendation == null) return null;

  // Resolve each optional efficiency field once; the result drives both the
  // presence check and the emitted value.
  const efficiencyAlert =
    readString(summary["efficiency_alert"]) ?? readString(summary["efficiencyAlert"]);
  const efficiencyRegression =
    readEfficiencyRegression(summary["efficiency_regression"]) ??
    readEfficiencyRegression(summary["efficiencyRegression"]);

  return {
    snapshot,
    alert: readString(summary["alert"]),
    rolled_back: rolledBack,
    recommendation,
    recommended_command: readString(summary["recommended_command"]),
    grade_alert: readString(summary["grade_alert"]) ?? readString(summary["gradeAlert"]),
    grade_regression:
      readGradeRegression(summary["grade_regression"]) ??
      readGradeRegression(summary["gradeRegression"]),
    ...(efficiencyAlert ? { efficiency_alert: efficiencyAlert } : {}),
    ...(efficiencyRegression ? { efficiency_regression: efficiencyRegression } : {}),
  };
}
|
|
457
|
+
|
|
458
|
+
/**
 * Computes `current - baseline`, rounded to four decimal places.
 *
 * @param current - Observed rate, or `null` when unknown.
 * @param baseline - Reference rate, or `null` when unknown.
 * @returns The rounded difference, or `null` if either input is `null`/`undefined`.
 */
function subtractRates(current: number | null, baseline: number | null): number | null {
  if (current == null || baseline == null) return null;

  const rounded = Number.parseFloat((current - baseline).toFixed(4));
  // A tiny negative difference formats as "-0.0000" and parses to negative
  // zero, which can render as "-0"; collapse it to plain 0.
  return rounded === 0 ? 0 : rounded;
}
|
|
462
|
+
|
|
463
|
+
/**
 * Builds the dashboard summary for a watch action.
 *
 * The watch payload is taken from `watchResult` itself when it parses as a
 * package watch summary, otherwise from `packageEvaluation.watch`. If neither
 * parses, the snapshot alone is read from `watchResult.snapshot`. Without a
 * snapshot there is nothing to report and `null` is returned.
 *
 * @param watchResult - Parsed watch output.
 * @param fallbackReason - Reason used when neither an alert nor a recommendation is available.
 * @param packageEvaluation - Optional package evaluation whose fields are attached to the summary.
 * @returns The summary, or `null` when no monitoring snapshot can be read.
 */
function buildWatchSummary(
  watchResult: Record<string, unknown>,
  fallbackReason: string | null = null,
  packageEvaluation: Record<string, unknown> | null = null,
): DashboardActionResultSummary | null {
  const packageWatch =
    readPackageWatchSummary(watchResult) ?? readPackageWatchSummary(packageEvaluation?.["watch"]);
  const snapshot = packageWatch?.snapshot ?? readMonitoringSnapshot(watchResult["snapshot"]);
  if (!snapshot) return null;

  const baselinePassRate = snapshot.baseline_pass_rate;
  const currentPassRate = snapshot.pass_rate;
  const regressionDetected = snapshot.regression_detected;
  // NOTE(review): this fallback only checks the camelCase `gradeAlert` key,
  // while readPackageWatchSummary accepts both spellings — confirm intended.
  const gradeAlert = packageWatch?.grade_alert ?? readString(watchResult["gradeAlert"]);
  const alert = packageWatch?.alert ?? readString(watchResult["alert"]);
  const recommendation =
    packageWatch?.recommendation ?? readString(watchResult["recommendation"]) ?? fallbackReason;
  // Single resolution chain: watch summary, then raw watch result, then the
  // package evaluation's `next_command`.
  const recommendedCommand =
    packageWatch?.recommended_command ??
    readString(watchResult["recommended_command"]) ??
    readString(packageEvaluation?.["next_command"]);
  const packageEvaluationSource = readPackageEvaluationSource(
    packageEvaluation?.["evaluation_source"],
  );
  const packageCandidateId = readString(packageEvaluation?.["candidate_id"]);
  const packageParentCandidateId = readString(packageEvaluation?.["parent_candidate_id"]);
  const packageCandidateGeneration = readNumber(packageEvaluation?.["candidate_generation"]);
  const packageCandidateAcceptance = readObject(packageEvaluation?.["candidate_acceptance"]);
  const packageCandidateAcceptanceDecision = readString(packageCandidateAcceptance?.["decision"]);
  const packageCandidateAcceptanceRationale = readString(packageCandidateAcceptance?.["rationale"]);
  const packageEvidence = readPackageEvidenceSummary(packageEvaluation?.["evidence"]);
  const packageEfficiency = readPackageEfficiencySummary(packageEvaluation?.["efficiency"]);
  const packageRouting = readPackageReplaySummary(packageEvaluation?.["routing"]);
  const packageBody = readPackageBodySummary(packageEvaluation?.["body"]);
  const packageGrading = readPackageGradingSummary(packageEvaluation?.["grading"]);
  const packageUnitTests = readPackageUnitTestSummary(packageEvaluation?.["unit_tests"]);

  return {
    // An alert, when present, takes precedence over the recommendation text.
    reason: alert ?? recommendation,
    improved: alert == null,
    deployed: true,
    before_pass_rate: baselinePassRate,
    before_label: "Baseline",
    after_pass_rate: currentPassRate,
    after_label: "Observed",
    net_change: subtractRates(currentPassRate, baselinePassRate),
    net_change_label: "Delta",
    // Signal label reflects which of grade alert / trigger regression fired.
    validation_mode:
      gradeAlert != null && regressionDetected
        ? "trigger+grade_watch"
        : gradeAlert != null
          ? "grade_watch"
          : regressionDetected
            ? "trigger_watch"
            : "live_watch",
    validation_label: "Signal",
    ...(recommendedCommand ? { recommended_command: recommendedCommand } : {}),
    ...(packageEvaluationSource ? { package_evaluation_source: packageEvaluationSource } : {}),
    ...(packageCandidateId ? { package_candidate_id: packageCandidateId } : {}),
    ...(packageParentCandidateId ? { package_parent_candidate_id: packageParentCandidateId } : {}),
    ...(packageCandidateGeneration != null
      ? { package_candidate_generation: packageCandidateGeneration }
      : {}),
    ...(packageCandidateAcceptanceDecision
      ? {
          // NOTE(review): the decision string is cast without being checked
          // against the allowed values — verify upstream guarantees it.
          package_candidate_acceptance_decision:
            packageCandidateAcceptanceDecision as DashboardActionResultSummary["package_candidate_acceptance_decision"],
        }
      : {}),
    ...(packageCandidateAcceptanceRationale
      ? { package_candidate_acceptance_rationale: packageCandidateAcceptanceRationale }
      : {}),
    ...(packageEvidence ? { package_evidence: packageEvidence } : {}),
    ...(packageEfficiency ? { package_efficiency: packageEfficiency } : {}),
    ...(packageRouting ? { package_routing: packageRouting } : {}),
    ...(packageBody ? { package_body: packageBody } : {}),
    ...(packageGrading ? { package_grading: packageGrading } : {}),
    ...(packageUnitTests ? { package_unit_tests: packageUnitTests } : {}),
    ...(packageWatch ? { package_watch: packageWatch } : {}),
  };
}
|
|
547
|
+
|
|
548
|
+
/**
 * Projects a parsed package-evaluation payload onto the dashboard summary shape.
 *
 * Pass rates and lift are read from the payload's `baseline` object, and the
 * validation mode from its `replay` object. `recommended_command` and every
 * `package_*` field are attached only when the corresponding reader produced a
 * usable value, so absent sections never show up as keys on the result.
 *
 * @param packageEvaluation - Parsed evaluation payload, or `null` when there is none.
 * @param options - `deployed` and `reason` values copied onto the summary unchanged.
 * @returns The assembled summary, or `null` when no payload was supplied.
 */
function buildPackageEvaluationSummary(
  packageEvaluation: Record<string, unknown> | null,
  options: {
    deployed: boolean | null;
    reason: string | null;
  },
): DashboardActionResultSummary | null {
  if (!packageEvaluation) {
    return null;
  }

  // Nested objects that carry the headline metrics.
  const replaySection = readObject(packageEvaluation["replay"]);
  const baselineSection = readObject(packageEvaluation["baseline"]);

  // Scalar and candidate-lineage fields.
  const nextCommand = readString(packageEvaluation["next_command"]);
  const evaluationSource = readPackageEvaluationSource(packageEvaluation["evaluation_source"]);
  const candidateId = readString(packageEvaluation["candidate_id"]);
  const parentCandidateId = readString(packageEvaluation["parent_candidate_id"]);
  const candidateGeneration = readNumber(packageEvaluation["candidate_generation"]);
  const acceptance = readObject(packageEvaluation["candidate_acceptance"]);
  const acceptanceDecision = readString(acceptance?.["decision"]);
  const acceptanceRationale = readString(acceptance?.["rationale"]);

  // Per-section summaries.
  const evidence = readPackageEvidenceSummary(packageEvaluation["evidence"]);
  const efficiency = readPackageEfficiencySummary(packageEvaluation["efficiency"]);
  const routing = readPackageReplaySummary(packageEvaluation["routing"]);
  const body = readPackageBodySummary(packageEvaluation["body"]);
  const grading = readPackageGradingSummary(packageEvaluation["grading"]);
  const unitTests = readPackageUnitTestSummary(packageEvaluation["unit_tests"]);
  const watch = readPackageWatchSummary(packageEvaluation["watch"]);

  // Candidate identity and acceptance; each key is present only when its value is usable.
  const candidateFields = {
    ...(evaluationSource ? { package_evaluation_source: evaluationSource } : {}),
    ...(candidateId ? { package_candidate_id: candidateId } : {}),
    ...(parentCandidateId ? { package_parent_candidate_id: parentCandidateId } : {}),
    // Generation is compared against null/undefined so that 0 is still emitted.
    ...(candidateGeneration != null ? { package_candidate_generation: candidateGeneration } : {}),
    ...(acceptanceDecision
      ? {
          package_candidate_acceptance_decision:
            acceptanceDecision as DashboardActionResultSummary["package_candidate_acceptance_decision"],
        }
      : {}),
    ...(acceptanceRationale ? { package_candidate_acceptance_rationale: acceptanceRationale } : {}),
  };

  // Evaluation sections; omitted entirely when the reader returned nothing.
  const sectionFields = {
    ...(evidence ? { package_evidence: evidence } : {}),
    ...(efficiency ? { package_efficiency: efficiency } : {}),
    ...(routing ? { package_routing: routing } : {}),
    ...(body ? { package_body: body } : {}),
    ...(grading ? { package_grading: grading } : {}),
    ...(unitTests ? { package_unit_tests: unitTests } : {}),
    ...(watch ? { package_watch: watch } : {}),
  };

  return {
    reason: options.reason,
    improved: readBoolean(packageEvaluation["evaluation_passed"]),
    deployed: options.deployed,
    before_pass_rate: readNumber(baselineSection?.["baseline_pass_rate"]),
    after_pass_rate: readNumber(baselineSection?.["with_skill_pass_rate"]),
    net_change: readNumber(baselineSection?.["lift"]),
    validation_mode: readString(replaySection?.["validation_mode"]),
    ...(nextCommand ? { recommended_command: nextCommand } : {}),
    ...candidateFields,
    ...sectionFields,
  };
}
|
|
610
|
+
|
|
611
|
+
/**
 * Builds the search-run portion of a dashboard summary from parsed command output.
 *
 * A payload without a readable `search_id` is not treated as a search run.
 * Counts fall back to `0` and textual selectors to `"unknown"` when missing;
 * `surface_plan` is attached only when `provenance.surface_plan` holds an object.
 *
 * @param parsed - JSON object extracted from the command's stdout.
 * @returns The search-run summary, or `null` when `search_id` is absent.
 */
function extractSearchRunSummary(
  parsed: Record<string, unknown>,
): DashboardSearchRunSummary | null {
  const id = readString(parsed["search_id"]);
  if (!id) {
    return null;
  }

  // Narrow `provenance` to a record; anything falsy or non-object becomes null.
  const rawProvenance = parsed["provenance"];
  let provenanceRecord: Record<string, unknown> | null = null;
  if (rawProvenance && typeof rawProvenance === "object") {
    provenanceRecord = rawProvenance as Record<string, unknown>;
  }

  // `typeof null` is "object", so a null plan can land here; the truthiness
  // check further down drops it again.
  let plan: Record<string, unknown> | null = null;
  if (provenanceRecord && typeof provenanceRecord["surface_plan"] === "object") {
    plan = provenanceRecord["surface_plan"] as Record<string, unknown>;
  }

  const frontierSize = provenanceRecord ? readNumber(provenanceRecord["frontier_size"]) : null;
  const selectionMethod = provenanceRecord
    ? readString(provenanceRecord["parent_selection_method"])
    : null;

  const planFields = plan
    ? {
        surface_plan: {
          routing_count: readNumber(plan["routing_count"]) ?? 0,
          body_count: readNumber(plan["body_count"]) ?? 0,
          weakness_source: readString(plan["weakness_source"]) ?? "unknown",
          routing_weakness: readNumber(plan["routing_weakness"]),
          body_weakness: readNumber(plan["body_weakness"]),
        },
      }
    : {};

  return {
    search_id: id,
    parent_candidate_id: readString(parsed["parent_candidate_id"]),
    winner_candidate_id: readString(parsed["winner_candidate_id"]),
    winner_rationale: readString(parsed["winner_rationale"]),
    candidates_evaluated: readNumber(parsed["candidates_evaluated"]) ?? 0,
    frontier_size: frontierSize ?? 0,
    parent_selection_method: selectionMethod ?? "unknown",
    ...planFields,
  };
}
|
|
648
|
+
|
|
40
649
|
/**
 * Derives a dashboard result summary from the JSON a dashboard action wrote to stdout.
 *
 * The JSON object is extracted from `stdout` first; each supported action then
 * maps its own payload shape onto the shared summary fields:
 * - `create-check`: readiness summary, `ok` flag and spec validator.
 * - `replay-dry-run`: top-level replay fields, with `before`/`after` as fallbacks.
 * - `search-run`: the package evaluation when present, otherwise a minimal
 *   summary; `search_run` is attached in both cases.
 * - `measure-baseline`: baseline vs. with-skill pass rates and lift.
 * - `report-package`: the report summary when present, otherwise readiness data.
 * - `deploy-candidate` / `watch`: a watch summary when one can be built,
 *   otherwise the package evaluation plus `watch_gate_passed`.
 *
 * @param action - The dashboard action that produced `stdout`.
 * @param stdout - Raw standard output captured from the action.
 * @returns The summary, or `null` when no JSON object is found, the action is
 *   not handled here, or the required payload section is missing.
 */
export function extractDashboardActionSummary(
  action: DashboardActionName,
  stdout: string,
): DashboardActionResultSummary | null {
  const parsed = extractJsonObject(stdout);
  if (!parsed) {
    return null;
  }

  switch (action) {
    case "create-check": {
      const readiness = readObject(parsed["readiness"]);
      const specValidation = readObject(parsed["spec_validation"]);
      const ok = readBoolean(parsed["ok"]);
      const state = readString(parsed["state"]);
      const recommendedCommand = readString(readiness?.["recommended_command"]);

      // Used only when the readiness payload carries no summary text.
      let fallbackReason: string | null = null;
      if (ok === true) {
        fallbackReason = "Draft package passed create check";
      } else if (state) {
        fallbackReason = `Draft package is in ${state.replaceAll("_", " ")} state`;
      }

      return {
        reason: readString(readiness?.["summary"]) ?? fallbackReason,
        improved: ok,
        deployed: null,
        before_pass_rate: null,
        after_pass_rate: null,
        net_change: null,
        validation_mode: readString(specValidation?.["validator"]),
        ...(recommendedCommand ? { recommended_command: recommendedCommand } : {}),
      };
    }

    case "replay-dry-run": {
      // `before` / `after` are accepted as alternate keys for the pass rates.
      const beforeRate = readNumber(parsed["before_pass_rate"]) ?? readNumber(parsed["before"]);
      const afterRate = readNumber(parsed["after_pass_rate"]) ?? readNumber(parsed["after"]);

      return {
        reason: readString(parsed["reason"]),
        improved: readBoolean(parsed["improved"]),
        deployed: readBoolean(parsed["deployed"]),
        before_pass_rate: beforeRate,
        after_pass_rate: afterRate,
        net_change: readNumber(parsed["net_change"]),
        validation_mode: readString(parsed["validation_mode"]),
      };
    }

    case "search-run": {
      const searchRun = extractSearchRunSummary(parsed);
      const evaluationSummary = buildPackageEvaluationSummary(
        readObject(parsed["package_evaluation"]),
        {
          deployed: false,
          reason: readString(parsed["winner_rationale"]),
        },
      );

      if (evaluationSummary) {
        return { ...evaluationSummary, search_run: searchRun };
      }

      // No package evaluation in the payload: report only what the search itself says.
      const nextCommand = readString(parsed["next_command"]);
      return {
        reason: readString(parsed["winner_rationale"]),
        // Without an explicit flag, having a winner candidate counts as improved.
        improved: readBoolean(parsed["improved"]) ?? searchRun?.winner_candidate_id != null,
        deployed: null,
        before_pass_rate: null,
        after_pass_rate: null,
        net_change: null,
        validation_mode: null,
        ...(nextCommand ? { recommended_command: nextCommand } : {}),
        search_run: searchRun,
      };
    }

    case "measure-baseline": {
      const efficiency = readPackageEfficiencySummary(parsed["runtime_metrics"]);
      const addsValue = readBoolean(parsed["adds_value"]);

      return {
        // Only an explicit `false` is reported as a failed gate.
        reason: addsValue === false ? "Baseline gate failed" : "Baseline measured",
        improved: addsValue,
        deployed: null,
        before_pass_rate: readNumber(parsed["baseline_pass_rate"]),
        after_pass_rate: readNumber(parsed["with_skill_pass_rate"]),
        net_change: readNumber(parsed["lift"]),
        validation_mode: readString(parsed["mode"]) === "package" ? "host_replay" : null,
        ...(efficiency ? { package_efficiency: efficiency } : {}),
      };
    }

    case "report-package": {
      const report = readObject(parsed["report"]);
      // The summary may sit at the top level or nested under `report`.
      const reportSummary = readObject(parsed["summary"]) ?? readObject(report?.["summary"]);
      const status = readString(reportSummary?.["status"]);

      let reportReason = "Package report ready";
      if (status === "replay_failed") {
        reportReason = "Package report detected replay failures";
      } else if (status === "baseline_failed") {
        reportReason = "Package report detected a baseline regression";
      }

      const evaluationSummary = buildPackageEvaluationSummary(reportSummary, {
        deployed: null,
        reason: reportReason,
      });
      if (evaluationSummary) {
        return evaluationSummary;
      }

      // No report summary available: fall back to readiness information.
      const readiness = readObject(parsed["readiness"]);
      const verified = readBoolean(parsed["verified"]);
      const readinessState =
        readString(parsed["readiness_state"]) ?? readString(readiness?.["state"]);
      const recommendedCommand =
        readString(parsed["next_command"]) ?? readString(readiness?.["next_command"]);
      const stateReason = readinessState
        ? `Draft package is in ${readinessState.replaceAll("_", " ")} state`
        : null;

      return {
        reason: readString(readiness?.["summary"]) ?? stateReason,
        improved: verified ?? readBoolean(readiness?.["ok"]),
        deployed: null,
        before_pass_rate: null,
        after_pass_rate: null,
        net_change: null,
        validation_mode: null,
        ...(recommendedCommand ? { recommended_command: recommendedCommand } : {}),
      };
    }

    case "deploy-candidate":
    case "watch": {
      const packageEvaluation = readObject(parsed["package_evaluation"]);
      const isWatch = action === "watch";

      if (isWatch) {
        // Prefer a watch summary built from the top-level payload, then from a
        // nested `watch_result`, before falling back to the package evaluation.
        const directSummary = buildWatchSummary(parsed);
        if (directSummary) {
          return directSummary;
        }

        const nestedResult = readObject(parsed["watch_result"]);
        const nestedSummary = nestedResult
          ? buildWatchSummary(
              nestedResult,
              "Package evaluation passed and watch started",
              packageEvaluation,
            )
          : null;
        if (nestedSummary) {
          return nestedSummary;
        }
      }

      const status = readString(packageEvaluation?.["status"]);
      const published = readBoolean(parsed["published"]);
      // For watch, the gate passes when no alert string is present; for deploy,
      // the payload's own flag is used.
      const watchGatePassed = isWatch
        ? readString(parsed["alert"]) == null
        : (readBoolean(parsed["watch_gate_passed"]) ?? null);

      let outcomeReason: string | null = null;
      if (status === "replay_failed") {
        outcomeReason = "Package replay failed";
      } else if (status === "baseline_failed") {
        outcomeReason = "Package baseline failed";
      } else if (isWatch && readBoolean(parsed["watch_started"])) {
        outcomeReason = "Package evaluation passed and watch started";
      } else if (published) {
        outcomeReason = "Package evaluation passed";
      }

      const baseSummary = buildPackageEvaluationSummary(packageEvaluation, {
        deployed: published,
        reason: outcomeReason,
      });
      return baseSummary ? { ...baseSummary, watch_gate_passed: watchGatePassed } : null;
    }

    default:
      return null;
  }
}
|
|
59
816
|
|
|
60
817
|
function isSuccessfulReplayDryRun(summary: DashboardActionResultSummary | null): boolean {
|
|
@@ -72,6 +829,14 @@ export function resolveDashboardActionOutcome(
|
|
|
72
829
|
): DashboardActionOutcome {
|
|
73
830
|
const summary = extractDashboardActionSummary(input.action, input.stdout);
|
|
74
831
|
|
|
832
|
+
if (input.action === "watch" && summary?.improved === false) {
|
|
833
|
+
return {
|
|
834
|
+
success: false,
|
|
835
|
+
summary,
|
|
836
|
+
error: summary.reason ?? input.stderr ?? "Watch detected a regression",
|
|
837
|
+
};
|
|
838
|
+
}
|
|
839
|
+
|
|
75
840
|
if (input.exitCode === 0) {
|
|
76
841
|
return { success: true, error: null, summary };
|
|
77
842
|
}
|