@spacek33z/autoauto 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +197 -0
- package/package.json +51 -0
- package/src/App.tsx +224 -0
- package/src/cli.ts +772 -0
- package/src/components/AgentPanel.tsx +254 -0
- package/src/components/Chat.test.tsx +71 -0
- package/src/components/Chat.tsx +308 -0
- package/src/components/CycleField.tsx +23 -0
- package/src/components/ModelPicker.tsx +97 -0
- package/src/components/PostUpdatePrompt.tsx +46 -0
- package/src/components/ResultsTable.tsx +172 -0
- package/src/components/RunCompletePrompt.tsx +90 -0
- package/src/components/RunSettingsOverlay.tsx +49 -0
- package/src/components/RunsTable.tsx +219 -0
- package/src/components/StatsHeader.tsx +100 -0
- package/src/daemon.ts +264 -0
- package/src/index.tsx +8 -0
- package/src/lib/agent/agent-provider.test.ts +133 -0
- package/src/lib/agent/claude-provider.ts +277 -0
- package/src/lib/agent/codex-provider.ts +413 -0
- package/src/lib/agent/default-providers.ts +10 -0
- package/src/lib/agent/index.ts +32 -0
- package/src/lib/agent/mock-provider.ts +61 -0
- package/src/lib/agent/opencode-provider.ts +424 -0
- package/src/lib/agent/types.ts +73 -0
- package/src/lib/auth.ts +11 -0
- package/src/lib/config.ts +152 -0
- package/src/lib/daemon-callbacks.ts +59 -0
- package/src/lib/daemon-client.ts +16 -0
- package/src/lib/daemon-lifecycle.ts +368 -0
- package/src/lib/daemon-spawn.ts +122 -0
- package/src/lib/daemon-status.ts +189 -0
- package/src/lib/daemon-watcher.ts +192 -0
- package/src/lib/experiment-loop.ts +679 -0
- package/src/lib/experiment.ts +356 -0
- package/src/lib/finalize.test.ts +143 -0
- package/src/lib/finalize.ts +511 -0
- package/src/lib/format.test.ts +32 -0
- package/src/lib/format.ts +44 -0
- package/src/lib/git.ts +176 -0
- package/src/lib/ideas-backlog.test.ts +54 -0
- package/src/lib/ideas-backlog.ts +109 -0
- package/src/lib/measure.ts +472 -0
- package/src/lib/model-options.ts +24 -0
- package/src/lib/programs.ts +247 -0
- package/src/lib/push-stream.ts +48 -0
- package/src/lib/run-context.ts +112 -0
- package/src/lib/run-setup.ts +34 -0
- package/src/lib/run.ts +383 -0
- package/src/lib/syntax-theme.ts +39 -0
- package/src/lib/system-prompts/experiment.ts +77 -0
- package/src/lib/system-prompts/finalize.ts +90 -0
- package/src/lib/system-prompts/index.ts +7 -0
- package/src/lib/system-prompts/setup.ts +516 -0
- package/src/lib/system-prompts/update.ts +188 -0
- package/src/lib/tool-events.ts +99 -0
- package/src/lib/validate-measurement.ts +326 -0
- package/src/lib/worktree.ts +40 -0
- package/src/screens/AuthErrorScreen.tsx +31 -0
- package/src/screens/ExecutionScreen.tsx +851 -0
- package/src/screens/FirstSetupScreen.tsx +168 -0
- package/src/screens/HomeScreen.tsx +406 -0
- package/src/screens/PreRunScreen.tsx +206 -0
- package/src/screens/SettingsScreen.tsx +189 -0
- package/src/screens/SetupScreen.tsx +226 -0
- package/src/tui.tsx +17 -0
- package/tsconfig.json +17 -0
|
@@ -0,0 +1,472 @@
|
|
|
1
|
+
import { spawn, type ChildProcess } from "node:child_process"
|
|
2
|
+
import { join } from "node:path"
|
|
3
|
+
import { unlink } from "node:fs/promises"
|
|
4
|
+
import type { ProgramConfig } from "./programs.ts"
|
|
5
|
+
|
|
6
|
+
// --- Process Helpers ---
|
|
7
|
+
|
|
8
|
+
/**
 * Signals the process group led by a detached child, so anything the child
 * spawned is signalled too. If the group-wide signal throws, the signal is
 * sent to the child alone instead.
 *
 * Does nothing when the child is already flagged as killed or has no pid.
 *
 * @param proc - Child process whose group should be signalled.
 * @param signal - Signal to deliver; defaults to SIGTERM.
 */
function killProcessGroup(proc: ChildProcess, signal: NodeJS.Signals = "SIGTERM"): void {
  const { killed, pid } = proc
  if (killed) return
  if (!pid) return

  try {
    // A negative pid addresses the whole process group rather than one process.
    process.kill(-pid, signal)
  } catch {
    proc.kill(signal)
  }
}
|
|
17
|
+
|
|
18
|
+
// --- Types ---
|
|
19
|
+
|
|
20
|
+
/** A measurement run that exited cleanly and printed a JSON object on stdout. */
interface MeasurementSuccess {
  success: true
  /** The parsed JSON object; field values are not validated at this point. */
  output: Record<string, unknown>
  /** Wall-clock duration of the run, rounded to whole milliseconds. */
  duration_ms: number
  /** Trimmed contents of the diagnostics sidecar file, when one was present and non-empty. */
  diagnostics?: string
}

/** A measurement run that was aborted, timed out, exited non-zero, or printed unusable output. */
interface MeasurementFailure {
  success: false
  /** Human-readable description of what went wrong. */
  error: string
  /** Wall-clock duration of the run, rounded to whole milliseconds. */
  duration_ms: number
}

/** Outcome of a single measurement run, discriminated on `success`. */
export type MeasurementResult = MeasurementSuccess | MeasurementFailure
|
|
23
|
+
|
|
24
|
+
/** Aggregated outcome of a repeated measurement series. */
export interface MeasurementSeriesResult {
  /** True only when every configured repeat ran and produced valid output. */
  success: boolean
  /** Median of the configured metric field across all runs; 0 when the series failed. */
  median_metric: number
  /** Per-field medians of the quality gate fields; empty when the series failed. */
  median_quality_gates: Record<string, number>
  /** Per-field medians of the secondary metric fields; empty when the series failed. */
  median_secondary_metrics: Record<string, number>
  /** Whether the median gate values satisfy every configured threshold; false when the series failed. */
  quality_gates_passed: boolean
  /** One message per violated threshold; empty when the series failed. */
  gate_violations: string[]
  /** Every run that was attempted, in execution order. */
  individual_runs: MeasurementResult[]
  /** Wall-clock duration of the whole series, including the build step, in whole milliseconds. */
  duration_ms: number
  /** Why the series failed (build error, "aborted", or joined run/validation errors). */
  failure_reason?: string
  /** Diagnostics captured by the last run of a successful series, if any. */
  diagnostics?: string
}
|
|
36
|
+
|
|
37
|
+
// --- Statistics Helpers ---
|
|
38
|
+
|
|
39
|
+
/**
 * Returns the median of `values` without mutating the input.
 *
 * For an even number of values the result is the mean of the two middle
 * values. An empty input has no median and yields NaN.
 *
 * @param values - Numbers to take the median of, in any order.
 * @returns The median, or NaN when `values` is empty.
 */
function median(values: readonly number[]): number {
  if (values.length === 0) return Number.NaN

  // One copy is enough: sort the copy in place instead of copying twice.
  const sorted = [...values].sort((a, b) => a - b)
  const mid = Math.floor(sorted.length / 2)
  return sorted.length % 2 === 0 ? (sorted[mid - 1] + sorted[mid]) / 2 : sorted[mid]
}
|
|
44
|
+
|
|
45
|
+
/**
 * Appends the finite numeric values of the named fields to per-field arrays.
 *
 * Fields that are missing from `output`, are not numbers, or are NaN/±Infinity
 * are skipped. A per-field array is created in `target` on first use.
 *
 * @param output - Object to read field values from.
 * @param fields - Names of the fields to collect.
 * @param target - Accumulator mapping field name to collected values; mutated in place.
 */
function collectFiniteValues(
  output: Record<string, unknown>,
  fields: string[],
  target: Record<string, number[]>,
): void {
  fields.forEach((field) => {
    const candidate = output[field]
    if (typeof candidate !== "number" || !Number.isFinite(candidate)) return

    let bucket = target[field]
    if (!bucket) {
      bucket = []
      target[field] = bucket
    }
    bucket.push(candidate)
  })
}
|
|
58
|
+
|
|
59
|
+
/**
 * Reduces each field's collected values to their median.
 *
 * @param fieldValues - Mapping of field name to the values collected for it.
 * @returns A new mapping of field name to median, with the same keys.
 */
function computeMedians(fieldValues: Record<string, number[]>): Record<string, number> {
  const medians: Record<string, number> = {}
  Object.keys(fieldValues).forEach((field) => {
    medians[field] = median(fieldValues[field])
  })
  return medians
}
|
|
66
|
+
|
|
67
|
+
// --- Diagnostics Sidecar ---
|
|
68
|
+
|
|
69
|
+
/** Name of the diagnostics sidecar file looked up in the measurement's working directory. */
const DIAGNOSTICS_FILENAME = ".autoauto-diagnostics"

/**
 * Reads the diagnostics sidecar file from `cwd` and then deletes it.
 *
 * Never rejects: a file that cannot be read resolves to `undefined`, and a
 * failed deletion is ignored.
 * NOTE(review): the sidecar is presumably written by the measurement script; confirm.
 *
 * @param cwd - Directory expected to contain the sidecar file.
 * @returns The trimmed file contents, or `undefined` when the file is missing,
 *   unreadable, or contains only whitespace.
 */
async function readAndCleanDiagnostics(cwd: string): Promise<string | undefined> {
  const sidecarPath = join(cwd, DIAGNOSTICS_FILENAME)

  let raw: string
  try {
    raw = await Bun.file(sidecarPath).text()
  } catch {
    return undefined
  }

  // Best-effort removal; a failed delete must not hide content that was read.
  try {
    await unlink(sidecarPath)
  } catch {
    // ignored on purpose
  }

  const trimmed = raw.trim()
  return trimmed === "" ? undefined : trimmed
}
|
|
81
|
+
|
|
82
|
+
// --- Measurement Execution ---
|
|
83
|
+
|
|
84
|
+
/**
 * Runs measure.sh once with bash and returns its parsed JSON output.
 *
 * The script runs detached in its own process group so that a timeout or an
 * abort can terminate everything it spawned. This function never rejects:
 * every failure is reported as `{ success: false, error }`.
 *
 * @param measureShPath - Path to the measurement script.
 * @param cwd - Working directory for the script; also where the diagnostics sidecar is looked up.
 * @param timeoutMs - Time limit in milliseconds; defaults to 60 000.
 * @param signal - Optional abort signal; aborting terminates the script's process group.
 * @returns On success, the JSON object printed on stdout plus any diagnostics.
 *   On failure, an error describing the abort, timeout, exit status, or bad output.
 */
export async function runMeasurement(
  measureShPath: string,
  cwd: string,
  timeoutMs?: number,
  signal?: AbortSignal,
): Promise<MeasurementResult> {
  if (signal?.aborted) {
    return { success: false, error: "aborted", duration_ms: 0 }
  }

  const start = performance.now()
  const elapsedMs = (): number => Math.round(performance.now() - start)

  return new Promise((resolve) => {
    const proc = spawn("bash", [measureShPath], {
      cwd,
      env: { ...process.env },
      stdio: ["ignore", "pipe", "pipe"],
      detached: true,
    })

    const timeoutLimit = timeoutMs ?? 60_000
    let timedOut = false
    const timeout = setTimeout(() => {
      timedOut = true
      killProcessGroup(proc)
    }, timeoutLimit)

    const onAbort = () => {
      killProcessGroup(proc)
    }
    signal?.addEventListener("abort", onAbort, { once: true })

    const releaseWatchers = (): void => {
      clearTimeout(timeout)
      signal?.removeEventListener("abort", onAbort)
    }

    // Failure paths: clean up the sidecar in the background so it does not
    // leak as an untracked file, but resolve immediately without waiting on I/O.
    const fail = (error: string, duration_ms: number): void => {
      void readAndCleanDiagnostics(cwd)
      resolve({ success: false, error, duration_ms })
    }

    let stdout = ""
    let stderr = ""
    proc.stdout!.setEncoding("utf-8")
    proc.stderr!.setEncoding("utf-8")
    proc.stdout!.on("data", (chunk: string) => {
      stdout += chunk
    })
    proc.stderr!.on("data", (chunk: string) => {
      stderr += chunk
    })

    proc.on("close", (exitCode, termSignal) => {
      releaseWatchers()
      const duration_ms = elapsedMs()

      if (signal?.aborted) {
        fail("aborted", duration_ms)
        return
      }

      if (timedOut) {
        fail(`Measurement timed out after ${timeoutLimit}ms`, duration_ms)
        return
      }

      if (exitCode !== 0) {
        // exitCode is null when the child was terminated by a signal; name the
        // signal instead of reporting "exit code null".
        const status =
          exitCode === null && termSignal ? `killed by signal ${termSignal}` : `exit code ${exitCode}`
        const detail = stderr ? `: ${stderr.trim().slice(0, 200)}` : ""
        fail(`${status}${detail}`, duration_ms)
        return
      }

      const raw = stdout.trim()
      let parsed: unknown
      try {
        parsed = JSON.parse(raw)
      } catch {
        fail(`invalid JSON on stdout: ${raw.slice(0, 200)}`, duration_ms)
        return
      }

      if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
        // typeof null is "object", so null needs its own label.
        const got = parsed === null ? "null" : Array.isArray(parsed) ? "array" : typeof parsed
        fail(`stdout must be a JSON object, got ${got}`, duration_ms)
        return
      }

      // Success path: wait for the diagnostics before resolving so they can be
      // attached to the result.
      const output = parsed as Record<string, unknown>
      readAndCleanDiagnostics(cwd).then(
        (diagnostics) => {
          resolve({ success: true, output, duration_ms, diagnostics })
        },
        () => {
          resolve({ success: true, output, duration_ms })
        },
      )
    })

    proc.on("error", (err) => {
      releaseWatchers()
      resolve({ success: false, error: err.message, duration_ms: elapsedMs() })
    })
  })
}
|
|
199
|
+
|
|
200
|
+
// --- Build Step ---
|
|
201
|
+
|
|
202
|
+
/** Outcome of the optional build step that runs before measuring. */
export interface BuildResult {
  /** True when the build script exited with code 0, or when there was no build script. */
  success: boolean
  /** Wall-clock duration of the build in whole milliseconds; 0 when nothing was run. */
  duration_ms: number
  /** Description of the failure; absent on success. */
  error?: string
}
|
|
207
|
+
|
|
208
|
+
/**
 * Runs build.sh once with bash if it exists.
 *
 * Returns success immediately when the file is missing. The script runs
 * detached in its own process group so a timeout (120 s) or an abort can
 * terminate everything it spawned. This function never rejects because of the
 * build itself: failures are reported as `{ success: false, error }`.
 *
 * @param buildShPath - Path to the build script.
 * @param cwd - Working directory for the script.
 * @param signal - Optional abort signal; aborting terminates the script's process group.
 * @returns The build outcome and its duration.
 */
export async function runBuild(
  buildShPath: string,
  cwd: string,
  signal?: AbortSignal,
): Promise<BuildResult> {
  if (!(await Bun.file(buildShPath).exists())) {
    return { success: true, duration_ms: 0 }
  }

  // An already-aborted signal never fires the "abort" listener, so without
  // this check the build would run to completion before reporting "aborted".
  if (signal?.aborted) {
    return { success: false, error: "aborted", duration_ms: 0 }
  }

  const buildTimeoutMs = 120_000
  const start = performance.now()
  const elapsedMs = (): number => Math.round(performance.now() - start)

  return new Promise((resolve) => {
    const proc = spawn("bash", [buildShPath], {
      cwd,
      env: { ...process.env },
      // stdout is never consumed here. Leaving it as an unread pipe would block
      // a verbose build once the pipe fills, so it is discarded instead.
      stdio: ["ignore", "ignore", "pipe"],
      detached: true,
    })

    let timedOut = false
    const timeout = setTimeout(() => {
      timedOut = true
      killProcessGroup(proc)
    }, buildTimeoutMs)

    const onAbort = () => {
      killProcessGroup(proc)
    }
    signal?.addEventListener("abort", onAbort, { once: true })

    const releaseWatchers = (): void => {
      clearTimeout(timeout)
      signal?.removeEventListener("abort", onAbort)
    }

    let stderr = ""
    proc.stderr!.setEncoding("utf-8")
    proc.stderr!.on("data", (chunk: string) => {
      stderr += chunk
    })

    proc.on("close", (exitCode, termSignal) => {
      releaseWatchers()
      const duration_ms = elapsedMs()

      if (signal?.aborted) {
        resolve({ success: false, error: "aborted", duration_ms })
        return
      }

      if (timedOut) {
        resolve({ success: false, error: `Build timed out after ${buildTimeoutMs}ms`, duration_ms })
        return
      }

      if (exitCode !== 0) {
        // exitCode is null when the child was terminated by a signal; name the
        // signal instead of reporting "exit code null".
        const status =
          exitCode === null && termSignal ? `killed by signal ${termSignal}` : `exit code ${exitCode}`
        const detail = stderr ? `: ${stderr.trim().slice(0, 200)}` : ""
        resolve({ success: false, error: `build.sh ${status}${detail}`, duration_ms })
        return
      }

      resolve({ success: true, duration_ms })
    })

    proc.on("error", (err) => {
      releaseWatchers()
      resolve({ success: false, error: err.message, duration_ms: elapsedMs() })
    })
  })
}
|
|
280
|
+
|
|
281
|
+
// --- Validation ---
|
|
282
|
+
|
|
283
|
+
/**
 * Checks that each named field of `output` is present and is a finite number.
 *
 * @param output - Object to inspect.
 * @param fields - Names of the fields that must be finite numbers.
 * @param label - Prefix for the error messages, describing what kind of field is being checked.
 * @returns One message per offending field, in `fields` order; empty when all are valid.
 */
function validateFiniteFields(output: Record<string, unknown>, fields: string[], label: string): string[] {
  return fields.flatMap((field) => {
    const value = output[field]
    if (value === undefined) {
      return [`${label} "${field}" missing from output`]
    }
    if (typeof value === "number" && Number.isFinite(value)) {
      return []
    }
    return [`${label} "${field}" is not a finite number: ${value}`]
  })
}
|
|
295
|
+
|
|
296
|
+
/**
 * Validates that a measurement output carries every configured field as a
 * finite number: the metric field, each quality gate field, and each
 * secondary metric field (when configured).
 *
 * @param output - Parsed measurement output to validate.
 * @param config - Program configuration naming the required fields.
 * @returns `valid` is true when `errors` is empty; `errors` lists metric
 *   problems first, then quality gate fields, then secondary metric fields.
 */
export function validateMeasurementOutput(
  output: Record<string, unknown>,
  config: ProgramConfig,
): { valid: boolean; errors: string[] } {
  const groups: [string[], string][] = [
    [[config.metric_field], "metric_field"],
    [Object.keys(config.quality_gates), "quality gate field"],
    [Object.keys(config.secondary_metrics ?? {}), "secondary metric field"],
  ]

  const errors = groups.flatMap(([fields, label]) => validateFiniteFields(output, fields, label))
  return { valid: errors.length === 0, errors }
}
|
|
309
|
+
|
|
310
|
+
/**
 * Checks values against the configured quality gate thresholds.
 *
 * This is separate from field-existence validation: a gate whose field is
 * absent from `output` is skipped rather than reported.
 *
 * @param output - Gate field values to check.
 * @param config - Program configuration holding the `min`/`max` thresholds per gate field.
 * @returns `passed` is true when `violations` is empty; for each gate, a `max`
 *   violation is listed before a `min` violation.
 */
export function checkQualityGates(
  output: Record<string, number>,
  config: ProgramConfig,
): { passed: boolean; violations: string[] } {
  const violations = Object.entries(config.quality_gates).flatMap(([field, gate]) => {
    const value = output[field]
    if (value === undefined) return []

    const found: string[] = []
    if (gate.max !== undefined && value > gate.max) {
      found.push(`${field}=${value} exceeds max ${gate.max}`)
    }
    if (gate.min !== undefined && value < gate.min) {
      found.push(`${field}=${value} below min ${gate.min}`)
    }
    return found
  })

  return { passed: violations.length === 0, violations }
}
|
|
330
|
+
|
|
331
|
+
// --- Measurement Series ---
|
|
332
|
+
|
|
333
|
+
/**
 * Runs measure.sh `config.repeats` times, validates every output, and reduces
 * the results to medians.
 *
 * When `buildShPath` is given, the build step runs once first; a failed build
 * fails the series without measuring. Every configured repeat must succeed
 * with valid output: partial failures invalidate the series, and so does a
 * series in which no run was executed at all. Remaining repeats are still run
 * after a failure so that `individual_runs` reports each attempt.
 *
 * @param measureShPath - Path to the measurement script.
 * @param cwd - Working directory for the build and measurement scripts.
 * @param config - Program configuration (repeats, metric field, quality gates, secondary metrics).
 * @param signal - Optional abort signal; an aborted series reports `failure_reason: "aborted"`.
 * @param buildShPath - Optional path to a build script to run once before measuring.
 * @returns Medians and the gate verdict on success; a zeroed result with `failure_reason` otherwise.
 */
export async function runMeasurementSeries(
  measureShPath: string,
  cwd: string,
  config: ProgramConfig,
  signal?: AbortSignal,
  buildShPath?: string,
): Promise<MeasurementSeriesResult> {
  const totalStart = performance.now()
  const elapsedMs = (): number => Math.round(performance.now() - totalStart)

  // Every failure shares the same zeroed shape; only the reason and the runs differ.
  const failed = (failure_reason: string, individual_runs: MeasurementResult[]): MeasurementSeriesResult => ({
    success: false,
    median_metric: 0,
    median_quality_gates: {},
    median_secondary_metrics: {},
    quality_gates_passed: false,
    gate_violations: [],
    individual_runs,
    duration_ms: elapsedMs(),
    failure_reason,
  })

  // Run build step once before measuring
  if (buildShPath) {
    const buildResult = await runBuild(buildShPath, cwd, signal)
    if (!buildResult.success) {
      return failed(buildResult.error ?? "build failed", [])
    }
  }

  const gateFields = Object.keys(config.quality_gates)
  const secondaryFields = Object.keys(config.secondary_metrics ?? {})

  const runs: MeasurementResult[] = []
  const validMetrics: number[] = []
  const validGateValues: Record<string, number[]> = {}
  const validSecondaryValues: Record<string, number[]> = {}
  // Collected while measuring so outputs do not have to be validated a second time.
  const runErrors: string[] = []
  const outputErrors: string[] = []

  for (let i = 0; i < config.repeats; i++) {
    if (signal?.aborted) break
    // eslint-disable-next-line no-await-in-loop -- measurements must run sequentially
    const result = await runMeasurement(measureShPath, cwd, undefined, signal)
    runs.push(result)

    if (!result.success) {
      runErrors.push(result.error)
      continue
    }

    const validation = validateMeasurementOutput(result.output, config)
    if (!validation.valid) {
      outputErrors.push(...validation.errors)
      continue
    }

    validMetrics.push(result.output[config.metric_field] as number)
    collectFiniteValues(result.output, gateFields, validGateValues)
    collectFiniteValues(result.output, secondaryFields, validSecondaryValues)
  }

  if (signal?.aborted) {
    return failed("aborted", runs)
  }

  // A series with no runs (repeats of 0 or less) has nothing to take a median
  // of, so it is treated as incomplete rather than as a success.
  const incomplete =
    runs.length === 0 || runs.length !== config.repeats || validMetrics.length !== config.repeats
  if (incomplete) {
    // Run failures first, then validation errors, each in execution order.
    const reasons = [...runErrors, ...outputErrors]
    return failed(reasons.length > 0 ? reasons.join("; ") : "measurement series incomplete", runs)
  }

  const medianGates = computeMedians(validGateValues)
  const gateCheck = checkQualityGates(medianGates, config)

  // All runs succeeded at this point (partial failures exit above),
  // so the last run's diagnostics are the most recent.
  const lastRun = runs.at(-1)

  return {
    success: true,
    median_metric: median(validMetrics),
    median_quality_gates: medianGates,
    median_secondary_metrics: computeMedians(validSecondaryValues),
    quality_gates_passed: gateCheck.passed,
    gate_violations: gateCheck.violations,
    individual_runs: runs,
    duration_ms: elapsedMs(),
    diagnostics: lastRun?.success ? lastRun.diagnostics : undefined,
  }
}
|
|
451
|
+
|
|
452
|
+
// --- Comparison ---
|
|
453
|
+
|
|
454
|
+
/**
 * Compares a measured metric against a baseline using a noise threshold.
 *
 * The change is expressed relative to the magnitude of the baseline, so the
 * verdict is correct for negative baselines as well as positive ones.
 *
 * With a baseline of 0, any non-zero change is infinitely large relative to it
 * and is classified by its direction alone; an unchanged value is "noise".
 *
 * @param baseline - Reference metric value.
 * @param measured - Newly measured metric value.
 * @param noiseThreshold - Decimal fraction (e.g. 0.02 for 2%) below which a change counts as noise.
 * @param direction - Whether "lower" or "higher" values are better.
 * @returns "keep" for an improvement beyond the threshold, "regressed" for a
 *   worsening beyond it, and "noise" otherwise.
 */
export function compareMetric(
  baseline: number,
  measured: number,
  noiseThreshold: number,
  direction: "lower" | "higher",
): "keep" | "regressed" | "noise" {
  // Positive means improvement in the requested direction.
  const improvement = direction === "lower" ? baseline - measured : measured - baseline

  // Divide by the magnitude: a signed negative baseline would flip the sign
  // and report improvements as regressions.
  const relativeChange = improvement / Math.abs(baseline)

  if (relativeChange > noiseThreshold) return "keep"
  if (relativeChange < -noiseThreshold) return "regressed"
  return "noise" // includes NaN (0 / 0), where neither comparison holds
}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import { getProvider, type AgentModelOption, type AgentProviderID } from "./agent/index.ts"
|
|
2
|
+
import type { ModelSlot } from "./config.ts"
|
|
3
|
+
|
|
4
|
+
/** A provider's model option extended with the value selected when it is picked. */
export interface ModelPickerOption extends AgentModelOption {
  /** Model slot (provider, model, effort) that choosing this option selects. */
  value: ModelSlot
}
|
|
7
|
+
|
|
8
|
+
/** Effort level assigned to every option produced by the model picker. */
const DEFAULT_EFFORT = "high"

/**
 * Lists a provider's models as picker options.
 *
 * Each option keeps the provider's own fields and gains a `value` slot built
 * from its provider, its model, and the default effort. A provider that does
 * not implement `listModels`, or that yields nothing, produces an empty list.
 *
 * @param providerId - Identifier of the provider to query.
 * @param cwd - Working directory passed through to the provider.
 * @param forceRefresh - Passed through to the provider's `listModels`; defaults to false.
 * @returns The provider's models wrapped as picker options.
 */
export async function loadModelPickerOptions(
  providerId: AgentProviderID,
  cwd: string,
  forceRefresh = false,
): Promise<ModelPickerOption[]> {
  const provider = getProvider(providerId)
  const models = (await provider.listModels?.(cwd, forceRefresh)) ?? []

  const pickerOptions: ModelPickerOption[] = []
  for (const model of models) {
    pickerOptions.push({
      ...model,
      value: { provider: model.provider, model: model.model, effort: DEFAULT_EFFORT },
    })
  }
  return pickerOptions
}
|
|
21
|
+
|
|
22
|
+
/**
 * Resolves a provider's default model.
 *
 * @param providerId - Identifier of the provider to query.
 * @param cwd - Working directory passed through to the provider.
 * @returns The provider's default model, or `null` when the provider does not
 *   implement `getDefaultModel` or reports no default.
 */
export async function getDefaultModel(providerId: AgentProviderID, cwd: string): Promise<string | null> {
  const provider = getProvider(providerId)
  const model = await provider.getDefaultModel?.(cwd)
  return model ?? null
}
|