@cat-factory/app 0.27.0 → 0.28.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -181,6 +181,14 @@ const commands = computed<Command[]>(() => {
181
181
  keywords: 'local model runner ollama lm studio llamacpp vllm endpoint',
182
182
  run: () => ui.openLocalModels(),
183
183
  })
184
+ list.push({
185
+ id: 'sandbox',
186
+ label: 'Open Sandbox',
187
+ group: 'Workspace',
188
+ icon: 'i-lucide-flask-conical',
189
+ keywords: 'sandbox prompt model test experiment judge fixture benchmark evaluate',
190
+ run: () => ui.openSandbox(),
191
+ })
184
192
 
185
193
  return list
186
194
  })
@@ -125,6 +125,19 @@ watch(
125
125
  >
126
126
  Integrations
127
127
  </UButton>
128
+ <!-- The Sandbox: try prompt versions/models against graded fixtures, off to the
129
+ side of the board. Opens the on-demand testing window. -->
130
+ <UButton
131
+ block
132
+ color="primary"
133
+ variant="soft"
134
+ size="sm"
135
+ icon="i-lucide-flask-conical"
136
+ class="justify-start"
137
+ @click="ui.openSandbox()"
138
+ >
139
+ Sandbox
140
+ </UButton>
128
141
  </div>
129
142
  </section>
130
143
 
@@ -214,10 +214,24 @@ async function setStatus(item: RequirementReviewItem, itemStatus: ReviewItemStat
214
214
  const recommending = computed(() =>
215
215
  blockId.value ? requirements.isRecommending(blockId.value) : false,
216
216
  )
217
- // Recommendations still awaiting a human decision (the ones to surface for review).
218
- const pendingRecommendations = computed<RequirementRecommendation[]>(() =>
217
+ // Recommendations the Writer has produced that still await a human decision (`ready`).
218
+ const readyRecommendations = computed<RequirementRecommendation[]>(() =>
219
219
  (review.value?.recommendations ?? []).filter((r) => r.status === 'ready'),
220
220
  )
221
+ // Placeholders the Requirement Writer is still producing in the background (`pending`).
222
const generatingRecommendations = computed<RequirementRecommendation[]>(() => {
  // A review that has not loaded yet simply has no placeholders.
  const all = review.value?.recommendations ?? []
  // `pending` marks a recommendation that has been requested but not produced yet.
  return all.filter((rec) => rec.status === 'pending')
})
225
+ // "ready / total" progress for the in-flight batch (null when nothing is generating). Scoped to
226
+ // the current wave via `createdAt` (all placeholders in one request share the timestamp), so
227
+ // stale `ready` recommendations the human hasn't acted on from an earlier batch don't inflate it.
228
const recommendationProgress = computed(() => {
  const inFlight = generatingRecommendations.value
  // Nothing is being generated: there is no progress to report.
  if (!inFlight.length) return null

  // `createdAt` values of the placeholders that are still generating; used to pick out
  // the `ready` recommendations that belong to the same request(s).
  const waveStamps = new Set(inFlight.map((rec) => rec.createdAt))
  const ready = readyRecommendations.value.reduce(
    (count, rec) => (waveStamps.has(rec.createdAt) ? count + 1 : count),
    0,
  )

  return { ready, total: ready + inFlight.length }
})
221
235
  function isMarkedForRecommend(item: RequirementReviewItem): boolean {
222
236
  return markedForRecommend.value.has(item.id)
223
237
  }
@@ -228,18 +242,37 @@ function toggleRecommend(item: RequirementReviewItem) {
228
242
  markedForRecommend.value = next
229
243
  }
230
244
 
231
- // Fire the Writer over the whole marked batch at once (grounded on the project's
232
- // best-practice standards, specs/tech-specs and web search).
245
+ // Fire the Writer over the whole marked batch (grounded on the project's best-practice
246
+ // standards, specs/tech-specs and web search). ASYNCHRONOUS: it returns at once with `pending`
247
+ // placeholders that fill in live; the user can close the window and is notified when the batch
248
+ // is ready. Flush any typed-but-unblurred answers first so nothing the human entered is lost.
233
249
  async function requestRecommendations() {
234
250
  if (!blockId.value || markedForRecommend.value.size === 0) return
235
251
  const ids = [...markedForRecommend.value]
236
252
  try {
237
- await requirements.requestRecommendations(blockId.value, ids)
253
+ await flushDrafts()
254
+ const updated = await requirements.requestRecommendations(blockId.value, ids)
238
255
  markedForRecommend.value = new Set()
239
- toast.add({
240
- title: `Requesting ${ids.length} recommendation${ids.length === 1 ? '' : 's'}…`,
241
- icon: 'i-lucide-sparkles',
242
- })
256
+ const n = ids.length
257
+ const plural = n === 1 ? '' : 's'
258
+ // On a parked run the request returns at once with `pending` placeholders the durable driver
259
+ // fills in the background; off-path (no active pipeline) there is no driver, so the Writer
260
+ // ran inline and the recommendations are already settled. Tell the human which actually
261
+ // happened rather than always promising a background callback.
262
+ const stillGenerating = (updated?.recommendations ?? []).some((r) => r.status === 'pending')
263
+ toast.add(
264
+ stillGenerating
265
+ ? {
266
+ title: `Preparing ${n} recommendation${plural} in the background`,
267
+ description:
268
+ "Your answers are saved — close this if you like; we'll notify you when they're ready.",
269
+ icon: 'i-lucide-sparkles',
270
+ }
271
+ : {
272
+ title: `${n} recommendation${plural} ready`,
273
+ icon: 'i-lucide-sparkles',
274
+ },
275
+ )
243
276
  } catch (e) {
244
277
  notifyError('Could not request recommendations', e)
245
278
  }
@@ -567,18 +600,47 @@ async function resolveExceeded(choice: 'extra-round' | 'proceed' | 'stop-reset')
567
600
  </div>
568
601
  </div>
569
602
 
570
- <!-- Requirement-Writer recommendations awaiting a human decision -->
603
+ <!-- Requirement-Writer recommendations: awaiting a human decision (`ready`) and/or
604
+ still generating in the background (`pending`) -->
571
605
  <section
572
- v-if="pendingRecommendations.length"
606
+ v-if="readyRecommendations.length || generatingRecommendations.length"
573
607
  class="mt-6 border-t border-slate-800 pt-5"
574
608
  >
575
- <div class="mb-3 flex items-center gap-1.5 text-[11px] text-indigo-300">
609
+ <div class="mb-3 flex items-center gap-2 text-[11px] text-indigo-300">
576
610
  <UIcon name="i-lucide-wand-2" class="h-3.5 w-3.5" />
577
611
  <span class="font-semibold uppercase tracking-wide">Recommended answers</span>
612
+ <span
613
+ v-if="recommendationProgress"
614
+ class="ml-auto flex items-center gap-1.5 normal-case text-indigo-300/80"
615
+ >
616
+ <UIcon name="i-lucide-loader-circle" class="h-3.5 w-3.5 animate-spin" />
617
+ {{ recommendationProgress.ready }} / {{ recommendationProgress.total }} ready
618
+ </span>
578
619
  </div>
620
+
621
+ <!-- still-generating placeholders (one per requested finding) -->
622
+ <div v-if="generatingRecommendations.length" class="mb-3 flex flex-col gap-3">
623
+ <div
624
+ v-for="rec in generatingRecommendations"
625
+ :key="rec.id"
626
+ class="flex items-start gap-2 rounded-lg border border-dashed border-indigo-900/50 bg-indigo-950/10 p-3"
627
+ >
628
+ <UIcon
629
+ name="i-lucide-loader-circle"
630
+ class="mt-0.5 h-4 w-4 shrink-0 animate-spin text-indigo-300"
631
+ />
632
+ <div class="min-w-0">
633
+ <span class="text-sm font-medium text-white">{{
634
+ rec.sourceFinding.title
635
+ }}</span>
636
+ <p class="text-xs text-indigo-300/70">Generating a grounded suggestion…</p>
637
+ </div>
638
+ </div>
639
+ </div>
640
+
579
641
  <div class="flex flex-col gap-3">
580
642
  <div
581
- v-for="rec in pendingRecommendations"
643
+ v-for="rec in readyRecommendations"
582
644
  :key="rec.id"
583
645
  class="rounded-lg border border-indigo-900/50 bg-indigo-950/20 p-3"
584
646
  >
@@ -0,0 +1,542 @@
1
+ <script setup lang="ts">
2
+ // The Sandbox surface: a parallel place to test prompts and models against graded
3
+ // fixtures, without touching the board. Three tabs — Experiments (define a matrix of
4
+ // prompt versions × models × fixtures for one agent kind, run it, read the graded grid),
5
+ // Prompts (clone a shipped baseline into an editable candidate lineage and version it),
6
+ // and Fixtures (the graded inputs each run is scored against). Loaded on demand when the
7
+ // window opens; 503 (the deployment hasn't provisioned the Sandbox DB) shows a notice.
8
+ import { computed, ref, watch } from 'vue'
9
+ import type { SandboxGrade, SandboxPromptVersion, SandboxRun } from '~/types/sandbox'
10
+
11
+ const ui = useUiStore()
12
+ const store = useSandboxStore()
13
+ const toast = useToast()
14
+
15
// Writable view over the UI store's Sandbox-open flag, suitable for `v-model:open`.
const open = computed({
  get() {
    return ui.sandboxOpen
  },
  set(next: boolean) {
    // Route writes through the store's actions instead of mutating the flag directly.
    if (next) ui.openSandbox()
    else ui.closeSandbox()
  },
})
19
+
20
+ const tab = ref<'experiments' | 'prompts' | 'fixtures'>('experiments')
21
+
22
// Load the Sandbox data each time the window transitions to open; closing does nothing.
watch(open, (nowOpen) => {
  if (!nowOpen) return
  // Fire-and-forget: the promise is deliberately not awaited here.
  void store.load()
})
25
+
26
+ // ---- experiment builder ----------------------------------------------------
27
+ const agentKind = ref('requirements-review')
28
+ const name = ref('')
29
+ const selectedPromptIds = ref<string[]>([])
30
+ const selectedModelIds = ref<string[]>([])
31
+ const selectedFixtureIds = ref<string[]>([])
32
+ // The judge model. Empty string = the deployment's routing default (resolved server-side);
33
+ // picking one explicitly is the recourse on a deployment that has no default model wired,
34
+ // where leaving it on default makes every run fail at create time.
35
+ const selectedJudgeModel = ref<string>('')
36
+
37
// Options for the judge-model select: the "deployment default" sentinel (empty string)
// first, followed by one entry per selectable model.
const judgeModelItems = computed(() => {
  const explicit = store.selectableModels.map((model) => ({
    label: model.label,
    value: model.id,
  }))
  return [{ label: 'Deployment default', value: '' }, ...explicit]
})

// Prompt versions and fixtures narrowed to the agent kind currently picked in the builder.
const kindPrompts = computed(() => store.promptsForKind(agentKind.value))
const kindFixtures = computed(() => store.fixturesForKind(agentKind.value))
44
+
45
+ // Reset the builder selections to sensible defaults when the agent kind (or loaded data)
46
+ // changes: every baseline prompt + every fixture for the kind, no models yet.
47
watch(
  [agentKind, () => store.prompts, () => store.fixtures],
  () => {
    // Default prompt selection: every baseline prompt of the current kind.
    const baselineIds: string[] = []
    for (const prompt of kindPrompts.value) {
      if (prompt.origin === 'baseline') baselineIds.push(prompt.id)
    }
    selectedPromptIds.value = baselineIds

    // Default fixture selection: every fixture of the current kind.
    selectedFixtureIds.value = kindFixtures.value.map((fixture) => fixture.id)
    // NOTE(review): the model selection is not touched here, so previously ticked models
    // carry over when the kind or the loaded data changes — confirm that is intended.
  },
  // Run once on setup as well, so the builder starts out populated.
  { immediate: true },
)
57
+
58
// Size of the experiment matrix: prompts × models × fixtures.
const cellCount = computed(() => {
  const axes = [selectedPromptIds, selectedModelIds, selectedFixtureIds]
  return axes.reduce((cells, axis) => cells * axis.value.length, 1)
})

// An experiment is runnable when it has at least one cell and stays within the store's cap.
const canRun = computed(() => {
  const cells = cellCount.value
  return cells > 0 && cells <= store.maxCells
})
66
+
67
/**
 * Add `id` to, or remove it from, one of the three builder selections.
 *
 * @param which which selection to change (prompt versions, models or fixtures)
 * @param id    the id being ticked / unticked
 * @param on    true to select, false to deselect
 */
function toggle(which: 'prompt' | 'model' | 'fixture', id: string, on: boolean) {
  const selections = {
    prompt: selectedPromptIds,
    model: selectedModelIds,
    fixture: selectedFixtureIds,
  }
  const selection = selections[which]
  if (on) {
    // Going through a Set keeps the list free of duplicates when an id is ticked twice.
    selection.value = [...new Set([...selection.value, id])]
  } else {
    selection.value = selection.value.filter((existing) => existing !== id)
  }
}
76
+
77
/**
 * Create an experiment from the current builder state and launch it straight away.
 * Progress and failures are reported through toasts; nothing is thrown to the caller.
 */
async function createAndRun() {
  if (!canRun.value) return

  const kind = agentKind.value
  // Fall back to a generated name when the optional field is blank.
  const experimentName = name.value.trim() || `${kind} — sandbox run`
  // Empty string means "deployment default", which is sent as an omitted field.
  const judgeModel = selectedJudgeModel.value || undefined

  try {
    const created = await store.createExperiment({
      name: experimentName,
      agentKind: kind,
      judgeModel,
      matrix: {
        promptVersionIds: selectedPromptIds.value,
        models: selectedModelIds.value,
        fixtureIds: selectedFixtureIds.value,
      },
    })
    // The experiment exists now, so the name field can be cleared for the next one.
    name.value = ''
    toast.add({ title: 'Running experiment…', icon: 'i-lucide-flask-conical', color: 'info' })
    await store.launch(created.id)
    toast.add({ title: 'Experiment complete', icon: 'i-lucide-check', color: 'success' })
  } catch (e) {
    const detail = e instanceof Error ? e.message : String(e)
    toast.add({
      title: 'Could not run the experiment',
      description: detail,
      icon: 'i-lucide-triangle-alert',
      color: 'error',
    })
  }
}
103
+
104
+ // ---- results grid ----------------------------------------------------------
105
// Grade lookup keyed by run id for the experiment currently shown in the results grid.
// When several grades share a run id the last one in the list wins.
const gradeByRun = computed(() => {
  const map = new Map<string, SandboxGrade>()
  for (const g of store.detail?.grades ?? []) map.set(g.runId, g)
  return map
})

// The grid row whose output is shown beneath the table (null = nothing selected).
const selectedRun = ref<SandboxRun | null>(null)

// Keep the selection in step with the experiment on display. Without this, a run picked in
// one experiment stayed selected after another experiment was opened, so its output was
// rendered under the wrong experiment. When the same run is still present (e.g. the detail
// was refreshed) the selection is re-pointed at the fresh copy; otherwise it is cleared.
// NOTE(review): relies on `store.detail` being replaced (not mutated in place) when an
// experiment is opened or refreshed — confirm against the store.
watch(
  () => store.detail,
  (detail) => {
    const current = selectedRun.value
    if (!current) return
    selectedRun.value = detail?.runs.find((run) => run.id === current.id) ?? null
  },
)
111
+
112
/**
 * Map a numeric score to the text-colour class used in the results grid.
 *
 * @param score the score to colour
 * @returns emerald for 4 and above, amber for 3 up to (not including) 4, rose otherwise
 */
function scoreColor(score: number): string {
  // Thresholds are checked from highest to lowest; the first one met decides the colour.
  const bands: Array<[number, string]> = [
    [4, 'text-emerald-400'],
    [3, 'text-amber-400'],
  ]
  for (const [floor, cssClass] of bands) {
    if (score >= floor) return cssClass
  }
  return 'text-rose-400'
}
117
+
118
+ // ---- prompt editor ---------------------------------------------------------
119
+ const editing = ref<SandboxPromptVersion | null>(null)
120
+ const editText = ref('')
121
+ const savingPrompt = ref(false)
122
+
123
/**
 * Open the editor on `prompt`, seeding the text area with its current system text.
 *
 * @param prompt the prompt version to edit
 */
function edit(prompt: SandboxPromptVersion) {
  editText.value = prompt.systemText
  editing.value = prompt
}
127
+
128
/**
 * Save the editor's text as a new version of the prompt being edited.
 *
 * Does nothing when no prompt is open, the text is blank, or a save is already running.
 * Success and failure are reported through toasts; nothing is thrown to the caller.
 */
async function saveVersion() {
  // Pin the prompt being saved: `editing` can change while the request is in flight.
  const target = editing.value
  if (!target || !editText.value.trim()) return
  // Re-entrancy guard so a repeated trigger cannot append the same version twice.
  if (savingPrompt.value) return

  savingPrompt.value = true
  try {
    await store.saveVersion(target.id, editText.value)
    toast.add({ title: 'Saved a new version', icon: 'i-lucide-check', color: 'success' })
    // Close the editor only if it still shows the prompt that was saved. If the user moved
    // on to another prompt during the save, leave that editor (and its text) alone —
    // the same id check `archive` uses before closing the editor.
    if (editing.value?.id === target.id) editing.value = null
  } catch (e) {
    toast.add({
      title: 'Could not save the version',
      description: e instanceof Error ? e.message : String(e),
      icon: 'i-lucide-triangle-alert',
      color: 'error',
    })
  } finally {
    savingPrompt.value = false
  }
}
146
+
147
/**
 * Archive `prompt` through the store, closing the editor if it was showing that prompt.
 * A failure is reported through a toast; nothing is thrown to the caller.
 *
 * @param prompt the prompt version to archive
 */
async function archive(prompt: SandboxPromptVersion) {
  try {
    await store.archivePrompt(prompt.id)
    const openId = editing.value?.id
    if (openId === prompt.id) {
      editing.value = null
    }
  } catch (e) {
    const detail = e instanceof Error ? e.message : String(e)
    toast.add({
      title: 'Could not archive',
      description: detail,
      icon: 'i-lucide-triangle-alert',
      color: 'error',
    })
  }
}
160
+
161
// Display name for a fixture id; falls back to the raw id when the fixture is not loaded.
const fixtureName = (id: string) => {
  const match = store.fixtures.find((fixture) => fixture.id === id)
  return match?.name ?? id
}
162
+ </script>
163
+
164
+ <template>
165
+ <UModal
166
+ v-model:open="open"
167
+ title="Sandbox — prompt & model testing"
168
+ description="Try prompt versions and models against graded fixtures, scored by a judge model."
169
+ :ui="{ content: 'max-w-5xl' }"
170
+ >
171
+ <template #body>
172
+ <div v-if="store.loading" class="flex items-center justify-center py-12">
173
+ <UIcon name="i-lucide-loader-circle" class="h-6 w-6 animate-spin text-slate-400" />
174
+ </div>
175
+
176
+ <div
177
+ v-else-if="!store.available"
178
+ class="rounded-lg border border-slate-700 bg-slate-900/50 p-6 text-sm text-slate-300"
179
+ >
180
+ <p class="font-medium text-slate-200">The Sandbox isn't enabled for this deployment.</p>
181
+ <p class="mt-1 text-slate-400">
182
+ It needs its own database (a dedicated <code>SANDBOX_DB</code> on Cloudflare, or the
183
+ <code>sandbox</code> Postgres schema on Node). Provision it and reload.
184
+ </p>
185
+ </div>
186
+
187
+ <div
188
+ v-else-if="store.error"
189
+ class="rounded-lg border border-rose-800 bg-rose-950/40 p-6 text-sm text-rose-200"
190
+ >
191
+ <p class="font-medium text-rose-100">The Sandbox failed to load.</p>
192
+ <p class="mt-1 text-rose-300">{{ store.error }}</p>
193
+ <UButton class="mt-3" size="xs" color="neutral" variant="subtle" @click="store.load()">
194
+ Retry
195
+ </UButton>
196
+ </div>
197
+
198
+ <div v-else class="space-y-4">
199
+ <UTabs
200
+ v-model="tab"
201
+ :items="[
202
+ { label: 'Experiments', value: 'experiments', icon: 'i-lucide-flask-conical' },
203
+ { label: 'Prompts', value: 'prompts', icon: 'i-lucide-file-text' },
204
+ { label: 'Fixtures', value: 'fixtures', icon: 'i-lucide-clipboard-list' },
205
+ ]"
206
+ />
207
+
208
+ <!-- ============================= EXPERIMENTS ============================= -->
209
+ <div v-if="tab === 'experiments'" class="grid gap-4 lg:grid-cols-2">
210
+ <!-- builder -->
211
+ <div class="space-y-3 rounded-lg border border-slate-700 bg-slate-900/40 p-3">
212
+ <p class="text-[11px] font-semibold uppercase tracking-wide text-slate-400">
213
+ New experiment
214
+ </p>
215
+
216
+ <UFormField label="Agent">
217
+ <USelect
218
+ v-model="agentKind"
219
+ :items="store.agentKinds.map((k) => ({ label: k.label, value: k.agentKind }))"
220
+ value-key="value"
221
+ class="w-full"
222
+ />
223
+ </UFormField>
224
+
225
+ <div>
226
+ <span class="mb-1 block text-[10px] uppercase tracking-wide text-slate-500">
227
+ Prompt versions
228
+ </span>
229
+ <div class="max-h-28 space-y-1 overflow-auto pr-1">
230
+ <label
231
+ v-for="p in kindPrompts"
232
+ :key="p.id"
233
+ class="flex items-center gap-2 text-sm text-slate-300"
234
+ >
235
+ <UCheckbox
236
+ :model-value="selectedPromptIds.includes(p.id)"
237
+ @update:model-value="
238
+ (v: boolean | 'indeterminate') => toggle('prompt', p.id, v === true)
239
+ "
240
+ />
241
+ <span class="truncate">{{ p.name }}</span>
242
+ <UBadge
243
+ :color="p.origin === 'baseline' ? 'neutral' : 'primary'"
244
+ variant="soft"
245
+ size="xs"
246
+ >
247
+ {{ p.origin === 'baseline' ? 'baseline' : `v${p.version}` }}
248
+ </UBadge>
249
+ </label>
250
+ </div>
251
+ </div>
252
+
253
+ <div>
254
+ <span class="mb-1 block text-[10px] uppercase tracking-wide text-slate-500">
255
+ Models
256
+ </span>
257
+ <div class="max-h-28 space-y-1 overflow-auto pr-1">
258
+ <label
259
+ v-for="m in store.selectableModels"
260
+ :key="m.id"
261
+ class="flex items-center gap-2 text-sm text-slate-300"
262
+ >
263
+ <UCheckbox
264
+ :model-value="selectedModelIds.includes(m.id)"
265
+ @update:model-value="
266
+ (v: boolean | 'indeterminate') => toggle('model', m.id, v === true)
267
+ "
268
+ />
269
+ <span class="truncate">{{ m.label }}</span>
270
+ </label>
271
+ <p v-if="!store.selectableModels.length" class="text-xs text-slate-500">
272
+ No selectable models — configure a provider key or enable Cloudflare AI.
273
+ </p>
274
+ </div>
275
+ </div>
276
+
277
+ <div>
278
+ <span class="mb-1 block text-[10px] uppercase tracking-wide text-slate-500">
279
+ Fixtures
280
+ </span>
281
+ <div class="max-h-28 space-y-1 overflow-auto pr-1">
282
+ <label
283
+ v-for="f in kindFixtures"
284
+ :key="f.id"
285
+ class="flex items-center gap-2 text-sm text-slate-300"
286
+ >
287
+ <UCheckbox
288
+ :model-value="selectedFixtureIds.includes(f.id)"
289
+ @update:model-value="
290
+ (v: boolean | 'indeterminate') => toggle('fixture', f.id, v === true)
291
+ "
292
+ />
293
+ <span class="truncate">{{ f.name }}</span>
294
+ </label>
295
+ <p v-if="!kindFixtures.length" class="text-xs text-slate-500">
296
+ No fixtures for this agent.
297
+ </p>
298
+ </div>
299
+ </div>
300
+
301
+ <UFormField label="Judge model" hint="grades every cell">
302
+ <USelect v-model="selectedJudgeModel" :items="judgeModelItems" />
303
+ </UFormField>
304
+
305
+ <UFormField label="Name (optional)">
306
+ <UInput v-model="name" :placeholder="`${agentKind} — sandbox run`" />
307
+ </UFormField>
308
+
309
+ <div class="flex items-center justify-between">
310
+ <span class="text-xs text-slate-500">
311
+ {{ cellCount }} cell{{ cellCount === 1 ? '' : 's' }}
312
+ <span v-if="cellCount > store.maxCells" class="text-rose-400">
313
+ (max {{ store.maxCells }})
314
+ </span>
315
+ </span>
316
+ <UButton
317
+ color="primary"
318
+ icon="i-lucide-play"
319
+ size="sm"
320
+ :loading="store.launching"
321
+ :disabled="!canRun"
322
+ @click="createAndRun()"
323
+ >
324
+ Run
325
+ </UButton>
326
+ </div>
327
+ </div>
328
+
329
+ <!-- history + results -->
330
+ <div class="space-y-3">
331
+ <div v-if="store.detail" class="rounded-lg border border-slate-700 bg-slate-900/40 p-3">
332
+ <div class="mb-2 flex items-center justify-between">
333
+ <p class="text-sm font-medium text-slate-200">
334
+ {{ store.detail.experiment.name }}
335
+ </p>
336
+ <UBadge variant="soft" size="xs">{{ store.detail.experiment.status }}</UBadge>
337
+ </div>
338
+ <div class="overflow-auto">
339
+ <table class="w-full text-left text-xs">
340
+ <thead class="text-slate-500">
341
+ <tr>
342
+ <th class="py-1 pr-2 font-medium">Prompt</th>
343
+ <th class="py-1 pr-2 font-medium">Model</th>
344
+ <th class="py-1 pr-2 font-medium">Fixture</th>
345
+ <th class="py-1 pr-2 font-medium">Score</th>
346
+ <th class="py-1 font-medium">Objective</th>
347
+ </tr>
348
+ </thead>
349
+ <tbody>
350
+ <tr
351
+ v-for="run in store.detail.runs"
352
+ :key="run.id"
353
+ class="cursor-pointer border-t border-slate-800 hover:bg-slate-800/40"
354
+ @click="selectedRun = run"
355
+ >
356
+ <td class="py-1 pr-2 text-slate-300">{{ run.promptLabel }}</td>
357
+ <td class="py-1 pr-2 font-mono text-[11px] text-slate-400">
358
+ {{ run.model }}
359
+ </td>
360
+ <td class="py-1 pr-2 text-slate-400">{{ fixtureName(run.fixtureId) }}</td>
361
+ <td class="py-1 pr-2">
362
+ <span
363
+ v-if="gradeByRun.get(run.id)"
364
+ :class="scoreColor(gradeByRun.get(run.id)!.weightedTotal)"
365
+ class="font-semibold"
366
+ >
367
+ {{ gradeByRun.get(run.id)!.weightedTotal.toFixed(2) }}
368
+ </span>
369
+ <span v-else-if="run.status === 'failed'" class="text-rose-400"
370
+ >failed</span
371
+ >
372
+ <span v-else class="text-slate-600">—</span>
373
+ </td>
374
+ <td class="py-1">
375
+ <span
376
+ v-if="gradeByRun.get(run.id)?.objective"
377
+ :class="
378
+ gradeByRun.get(run.id)!.objective!.pass
379
+ ? 'text-emerald-400'
380
+ : 'text-amber-400'
381
+ "
382
+ >
383
+ {{ gradeByRun.get(run.id)!.objective!.caught }}/{{
384
+ gradeByRun.get(run.id)!.objective!.total
385
+ }}
386
+ </span>
387
+ <span v-else class="text-slate-600">—</span>
388
+ </td>
389
+ </tr>
390
+ </tbody>
391
+ </table>
392
+ </div>
393
+
394
+ <!-- selected cell output -->
395
+ <div v-if="selectedRun" class="mt-3 border-t border-slate-800 pt-2">
396
+ <p class="mb-1 text-[11px] uppercase tracking-wide text-slate-500">
397
+ {{ selectedRun.promptLabel }} · {{ selectedRun.model }}
398
+ </p>
399
+ <p v-if="selectedRun.error" class="text-xs text-rose-400">
400
+ {{ selectedRun.error }}
401
+ </p>
402
+ <pre
403
+ v-if="selectedRun.outputText"
404
+ class="max-h-48 overflow-auto whitespace-pre-wrap rounded bg-slate-950/60 p-2 text-[11px] text-slate-300"
405
+ >{{ selectedRun.outputText }}</pre
406
+ >
407
+ <div v-if="gradeByRun.get(selectedRun.id)" class="mt-2 space-y-0.5">
408
+ <p
409
+ v-for="d in gradeByRun.get(selectedRun.id)!.scores"
410
+ :key="d.key"
411
+ class="text-[11px] text-slate-400"
412
+ >
413
+ <span :class="scoreColor(d.score)" class="font-semibold">{{ d.score }}</span>
414
+ <span class="ml-1 text-slate-300">{{ d.key }}</span>
415
+ <span v-if="d.rationale" class="ml-1 text-slate-500">— {{ d.rationale }}</span>
416
+ </p>
417
+ </div>
418
+ </div>
419
+ </div>
420
+
421
+ <p class="text-[11px] uppercase tracking-wide text-slate-500">Past experiments</p>
422
+ <div class="max-h-56 space-y-1 overflow-auto">
423
+ <button
424
+ v-for="x in store.experiments"
425
+ :key="x.id"
426
+ class="flex w-full items-center justify-between rounded-md border border-slate-800 bg-slate-900/40 px-2 py-1.5 text-left text-sm hover:bg-slate-800/50"
427
+ @click="store.openExperiment(x.id)"
428
+ >
429
+ <span class="truncate text-slate-300">{{ x.name }}</span>
430
+ <UBadge variant="soft" size="xs">{{ x.status }}</UBadge>
431
+ </button>
432
+ <p v-if="!store.experiments.length" class="text-xs text-slate-500">
433
+ No experiments yet.
434
+ </p>
435
+ </div>
436
+ </div>
437
+ </div>
438
+
439
+ <!-- ============================== PROMPTS ============================== -->
440
+ <div v-else-if="tab === 'prompts'" class="grid gap-4 lg:grid-cols-2">
441
+ <div class="max-h-[28rem] space-y-1.5 overflow-auto pr-1">
442
+ <div
443
+ v-for="p in store.prompts"
444
+ :key="p.id"
445
+ class="flex items-center justify-between rounded-md border border-slate-800 bg-slate-900/40 px-2.5 py-1.5 text-sm"
446
+ >
447
+ <div class="min-w-0">
448
+ <div class="flex items-center gap-2">
449
+ <span class="truncate text-slate-200">{{ p.name }}</span>
450
+ <UBadge
451
+ :color="p.origin === 'baseline' ? 'neutral' : 'primary'"
452
+ variant="soft"
453
+ size="xs"
454
+ >
455
+ {{ p.origin === 'baseline' ? 'baseline' : `v${p.version}` }}
456
+ </UBadge>
457
+ </div>
458
+ <span class="text-[11px] text-slate-500">{{ p.agentKind }}</span>
459
+ </div>
460
+ <div class="flex items-center gap-1">
461
+ <UButton
462
+ icon="i-lucide-pencil"
463
+ color="neutral"
464
+ variant="ghost"
465
+ size="xs"
466
+ :title="p.origin === 'baseline' ? 'Fork into a candidate' : 'Edit / version'"
467
+ @click="edit(p)"
468
+ />
469
+ <UButton
470
+ v-if="p.origin === 'candidate'"
471
+ icon="i-lucide-archive"
472
+ color="error"
473
+ variant="ghost"
474
+ size="xs"
475
+ @click="archive(p)"
476
+ />
477
+ </div>
478
+ </div>
479
+ </div>
480
+
481
+ <div
482
+ v-if="editing"
483
+ class="space-y-2 rounded-lg border border-slate-700 bg-slate-900/40 p-3"
484
+ >
485
+ <p class="text-[11px] uppercase tracking-wide text-slate-500">
486
+ {{ editing.origin === 'baseline' ? 'Fork' : 'New version of' }} · {{ editing.name }}
487
+ </p>
488
+ <UTextarea v-model="editText" :rows="16" class="w-full font-mono text-xs" autoresize />
489
+ <div class="flex justify-end gap-2">
490
+ <UButton color="neutral" variant="ghost" size="sm" @click="editing = null">
491
+ Cancel
492
+ </UButton>
493
+ <UButton
494
+ color="primary"
495
+ icon="i-lucide-save"
496
+ size="sm"
497
+ :loading="savingPrompt"
498
+ :disabled="!editText.trim()"
499
+ @click="saveVersion()"
500
+ >
501
+ Save new version
502
+ </UButton>
503
+ </div>
504
+ </div>
505
+ <p v-else class="self-start text-xs text-slate-500">
506
+ Pick a prompt to fork a shipped baseline or version a candidate. Each save appends an
507
+ immutable version you can put under test.
508
+ </p>
509
+ </div>
510
+
511
+ <!-- ============================== FIXTURES ============================== -->
512
+ <div v-else class="max-h-[28rem] space-y-1.5 overflow-auto pr-1">
513
+ <div
514
+ v-for="f in store.fixtures"
515
+ :key="f.id"
516
+ class="rounded-md border border-slate-800 bg-slate-900/40 px-2.5 py-2 text-sm"
517
+ >
518
+ <div class="flex items-center justify-between">
519
+ <span class="text-slate-200">{{ f.name }}</span>
520
+ <div class="flex items-center gap-1.5">
521
+ <UBadge variant="soft" size="xs">{{ f.kind }}</UBadge>
522
+ <UBadge
523
+ :color="f.origin === 'builtin' ? 'neutral' : 'primary'"
524
+ variant="soft"
525
+ size="xs"
526
+ >
527
+ {{ f.origin }}
528
+ </UBadge>
529
+ </div>
530
+ </div>
531
+ <p v-if="f.objective?.kind === 'findings'" class="mt-0.5 text-[11px] text-slate-500">
532
+ {{ f.objective.expectations.length }} graded expectation{{
533
+ f.objective.expectations.length === 1 ? '' : 's'
534
+ }}
535
+ </p>
536
+ </div>
537
+ <p v-if="!store.fixtures.length" class="text-xs text-slate-500">No fixtures.</p>
538
+ </div>
539
+ </div>
540
+ </template>
541
+ </UModal>
542
+ </template>
@@ -83,11 +83,17 @@ export function reviewsApi({ http, ws }: ApiContext) {
83
83
  ),
84
84
 
85
85
  // Ask the Requirement Writer to recommend grounded answers for a batch of findings (by
86
- // item id). Returns the review with `ready` recommendations for the human to act on.
87
- requestRecommendations: (workspaceId: string, blockId: string, itemIds: string[]) =>
86
+ // item id). Returns the review with `pending` placeholder recommendations; they fill in
87
+ // (`ready`) asynchronously via the `requirements` stream as the Writer produces each.
88
+ requestRecommendations: (
89
+ workspaceId: string,
90
+ blockId: string,
91
+ itemIds: string[],
92
+ note?: string,
93
+ ) =>
88
94
  http<RequirementReview | null>(
89
95
  `${ws(workspaceId)}/blocks/${encodeURIComponent(blockId)}/requirement-review/recommend`,
90
- { method: 'POST', body: { itemIds } },
96
+ { method: 'POST', body: { itemIds, ...(note ? { note } : {}) } },
91
97
  ),
92
98
 
93
99
  // Accept a recommendation (becomes the finding's answer), reject it, or re-request it
@@ -0,0 +1,57 @@
1
+ import type {
2
+ CloneSandboxPromptInput,
3
+ CreateSandboxExperimentInput,
4
+ SandboxExperiment,
5
+ SandboxExperimentDetail,
6
+ SandboxFixture,
7
+ SandboxOverview,
8
+ SandboxPromptVersion,
9
+ SaveSandboxVersionInput,
10
+ } from '~/types/sandbox'
11
+ import type { ApiContext } from './context'
12
+
13
+ /**
14
+ * The Sandbox API (the parallel prompt/model testing surface): manage versioned prompt
15
+ * candidates + the fixture library, define experiments (prompt × model × fixture), and
16
+ * launch one to run + grade every cell. Opt-in: every endpoint 503s when the deployment
17
+ * hasn't wired the Sandbox (its dedicated DB / schema).
18
+ */
19
export function sandboxApi({ http, ws }: ApiContext) {
  // URL builders. Ids are always percent-encoded before being placed in a path segment.
  const root = (workspaceId: string) => `${ws(workspaceId)}/sandbox`
  const promptUrl = (workspaceId: string, promptId: string) =>
    `${root(workspaceId)}/prompts/${encodeURIComponent(promptId)}`
  const fixtureUrl = (workspaceId: string, fixtureId: string) =>
    `${root(workspaceId)}/fixtures/${encodeURIComponent(fixtureId)}`
  const experimentUrl = (workspaceId: string, experimentId: string) =>
    `${root(workspaceId)}/experiments/${encodeURIComponent(experimentId)}`

  return {
    // GET the Sandbox overview for a workspace.
    getSandboxOverview: (workspaceId: string) =>
      http<SandboxOverview>(`${root(workspaceId)}/overview`),

    // ---- prompt versions -------------------------------------------------
    cloneSandboxPrompt: (workspaceId: string, body: CloneSandboxPromptInput) =>
      http<SandboxPromptVersion>(`${root(workspaceId)}/prompts/clone`, { method: 'POST', body }),
    saveSandboxVersion: (workspaceId: string, body: SaveSandboxVersionInput) =>
      http<SandboxPromptVersion>(`${root(workspaceId)}/prompts`, { method: 'POST', body }),
    setSandboxPromptLabels: (workspaceId: string, promptId: string, labels: string[]) =>
      http<SandboxPromptVersion>(`${promptUrl(workspaceId, promptId)}/labels`, {
        method: 'PATCH',
        body: { labels },
      }),
    // Archiving is a DELETE on the prompt resource.
    archiveSandboxPrompt: (workspaceId: string, promptId: string) =>
      http(promptUrl(workspaceId, promptId), { method: 'DELETE' }),

    // ---- fixtures --------------------------------------------------------
    createSandboxFixture: (workspaceId: string, body: Partial<SandboxFixture>) =>
      http<SandboxFixture>(`${root(workspaceId)}/fixtures`, { method: 'POST', body }),
    deleteSandboxFixture: (workspaceId: string, fixtureId: string) =>
      http(fixtureUrl(workspaceId, fixtureId), { method: 'DELETE' }),

    // ---- experiments -----------------------------------------------------
    createSandboxExperiment: (workspaceId: string, body: CreateSandboxExperimentInput) =>
      http<SandboxExperiment>(`${root(workspaceId)}/experiments`, { method: 'POST', body }),
    getSandboxExperiment: (workspaceId: string, experimentId: string) =>
      http<SandboxExperimentDetail>(experimentUrl(workspaceId, experimentId)),
    // Launch is a body-less POST that resolves with the experiment detail.
    launchSandboxExperiment: (workspaceId: string, experimentId: string) =>
      http<SandboxExperimentDetail>(`${experimentUrl(workspaceId, experimentId)}/launch`, {
        method: 'POST',
      }),
  }
}
@@ -15,6 +15,7 @@ import { presetsApi } from './api/presets'
15
15
  import { providerConnectionsApi } from './api/providerConnections'
16
16
  import { recurringApi } from './api/recurring'
17
17
  import { releaseHealthApi } from './api/releaseHealth'
18
+ import { sandboxApi } from './api/sandbox'
18
19
  import { reviewsApi } from './api/reviews'
19
20
  import { slackApi } from './api/slack'
20
21
  import { specApi } from './api/spec'
@@ -89,6 +90,7 @@ export function useApi() {
89
90
  ...providerConnectionsApi(ctx),
90
91
  ...releaseHealthApi(ctx),
91
92
  ...recurringApi(ctx),
93
+ ...sandboxApi(ctx),
92
94
  ...githubApi(ctx),
93
95
  ...slackApi(ctx),
94
96
  ...bootstrapApi(ctx),
@@ -33,6 +33,7 @@ import ProviderConnectionPanel from '~/components/settings/ProviderConnectionPan
33
33
  import ProviderConfigBanner from '~/components/layout/ProviderConfigBanner.vue'
34
34
  import ModelConfigurationPanel from '~/components/settings/ModelConfigurationPanel.vue'
35
35
  import LocalModelEndpointsPanel from '~/components/settings/LocalModelEndpointsPanel.vue'
36
+ import SandboxPanel from '~/components/sandbox/SandboxPanel.vue'
36
37
  import UserSecretsSection from '~/components/settings/UserSecretsSection.vue'
37
38
  import OpenRouterCatalogPanel from '~/components/settings/OpenRouterCatalogPanel.vue'
38
39
  import VendorCredentialsModal from '~/components/providers/VendorCredentialsModal.vue'
@@ -189,6 +190,7 @@ watch(
189
190
  <ProviderConnectionPanel />
190
191
  <ModelConfigurationPanel />
191
192
  <LocalModelEndpointsPanel />
193
+ <SandboxPanel />
192
194
  <UserSecretsSection />
193
195
  <OpenRouterCatalogPanel />
194
196
  <VendorCredentialsModal />
@@ -44,14 +44,21 @@ export const useRequirementsStore = defineStore('requirements', () => {
44
44
  function reviewFor(blockId: string): RequirementReview | null {
45
45
  return reviews.value[blockId] ?? null
46
46
  }
47
+ /** Whether the Requirement Writer is still producing recommendations for a block (a `pending`
48
+ * placeholder exists). Server-derived, so the "Recommending…" state survives the window closing
49
+ * and a page reload — the client-local `recommending` set only covers the request round-trip. */
50
+ function hasPendingRecommendations(blockId: string): boolean {
51
+ return (reviews.value[blockId]?.recommendations ?? []).some((r) => r.status === 'pending')
52
+ }
47
53
  /**
48
54
  * The async background stage a block's review is in, or null. While the driver folds the
49
- * answers (`incorporating`) then re-reviews the document (`reviewing`), NO human action is
50
- * needed so the board suppresses the "Approval needed" gate and shows this working state
51
- * instead, with copy that names which of the two stages is running.
55
+ * answers (`incorporating`) then re-reviews the document (`reviewing`), or the Requirement
56
+ * Writer is producing recommendations (`recommending`), NO human action is needed so the
57
+ * board suppresses the "Approval needed" gate and shows this working state instead, with copy
58
+ * that names which stage is running.
52
59
  */
53
60
  function backgroundStage(blockId: string): 'incorporating' | 'reviewing' | 'recommending' | null {
54
- if (recommending.value.has(blockId)) return 'recommending'
61
+ if (recommending.value.has(blockId) || hasPendingRecommendations(blockId)) return 'recommending'
55
62
  const status = reviews.value[blockId]?.status
56
63
  return status === 'incorporating' || status === 'reviewing' ? status : null
57
64
  }
@@ -176,19 +183,26 @@ export const useRequirementsStore = defineStore('requirements', () => {
176
183
  }
177
184
 
178
185
  function isRecommending(blockId: string): boolean {
179
- return recommending.value.has(blockId)
186
+ return recommending.value.has(blockId) || hasPendingRecommendations(blockId)
180
187
  }
181
188
 
182
189
  /**
183
190
  * Ask the Requirement Writer to recommend answers for a batch of findings (by item id).
184
- * Runs the Writer inline (grounded on best-practice fragments spec/tech-spec → web) and
185
- * returns the review with `ready` recommendations to accept/reject. Shows a `recommending`
186
- * background stage on the board while it runs.
191
+ * ASYNCHRONOUS: returns at once with `pending` placeholder recommendations (the Writer runs
192
+ * per finding in the durable driver), which fill in (`ready`) via live `requirements` stream
193
+ * events; a notification calls the user back when the batch is ready. The board shows the
194
+ * `recommending` background stage while any placeholder is pending. Optional `note` steers the
195
+ * whole batch.
187
196
  */
188
- async function requestRecommendations(blockId: string, itemIds: string[]) {
197
+ async function requestRecommendations(blockId: string, itemIds: string[], note?: string) {
189
198
  withFlag(recommending, blockId, true)
190
199
  try {
191
- const updated = await api.requestRecommendations(workspace.requireId(), blockId, itemIds)
200
+ const updated = await api.requestRecommendations(
201
+ workspace.requireId(),
202
+ blockId,
203
+ itemIds,
204
+ note,
205
+ )
192
206
  if (updated) store(updated)
193
207
  return updated
194
208
  } finally {
@@ -0,0 +1,174 @@
1
+ import { defineStore } from 'pinia'
2
+ import { computed, ref } from 'vue'
3
+ import type { ModelOption } from '~/types/domain'
4
+ import type {
5
+ CreateSandboxExperimentInput,
6
+ SandboxAgentKindMeta,
7
+ SandboxExperiment,
8
+ SandboxExperimentDetail,
9
+ SandboxFixture,
10
+ SandboxOverview,
11
+ SandboxPromptVersion,
12
+ } from '~/types/sandbox'
13
+ import { useWorkspaceStore } from '~/stores/workspace'
14
+
15
/**
 * The Sandbox (parallel prompt/model testing surface). Loaded on demand when the panel
 * opens (it's an opt-in, secondary surface, not part of the board snapshot): the testable
 * agent-kind catalog, the shipped baselines + stored candidate prompt versions, the
 * fixture library, and experiment definitions. Running an experiment grades every cell
 * with a judge model; `launch` returns the full result grid.
 */
export const useSandboxStore = defineStore('sandbox', () => {
  const api = useApi()

  /** Set to false when `load` gets a 503 (feature off); set back to true by a successful load. */
  const available = ref(true)
  const loading = ref(false)
  /** Message of the last non-503 `load` failure; cleared at the start of every `load`. */
  const error = ref<string | null>(null)

  const agentKinds = ref<SandboxAgentKindMeta[]>([])
  const prompts = ref<SandboxPromptVersion[]>([])
  const fixtures = ref<SandboxFixture[]>([])
  const experiments = ref<SandboxExperiment[]>([])
  const models = ref<ModelOption[]>([])
  /** The matrix cell cap (from the backend overview, so the builder gates on the same limit). */
  const maxCells = ref(100)

  /** The currently-opened experiment's full detail (result grid), if any. */
  const detail = ref<SandboxExperimentDetail | null>(null)
  const launching = ref(false)

  /** Comparator that orders experiments newest-first by `createdAt`. */
  function newestFirst(a: SandboxExperiment, b: SandboxExperiment): number {
    return b.createdAt - a.createdAt
  }

  /**
   * Best-effort HTTP status of a thrown request error: `statusCode` if present, otherwise
   * `response.status`; `undefined` when the thrown value carries neither.
   */
  function httpStatusOf(e: unknown): number | undefined {
    const err = e as { statusCode?: number; response?: { status?: number } } | null | undefined
    return err?.statusCode ?? err?.response?.status
  }

  /** Replace the catalog state with a freshly fetched overview (experiments newest-first). */
  function hydrate(overview: SandboxOverview) {
    agentKinds.value = overview.agentKinds
    prompts.value = overview.prompts
    fixtures.value = overview.fixtures
    // Sort a copy so the payload array itself is never reordered.
    experiments.value = [...overview.experiments].sort(newestFirst)
    maxCells.value = overview.maxCells
  }

  /** Patch one experiment into the list in place (newest-first), without a full reload. */
  function upsertExperiment(experiment: SandboxExperiment) {
    const next = experiments.value.filter((e) => e.id !== experiment.id)
    next.push(experiment)
    experiments.value = next.sort(newestFirst)
  }

  /**
   * Load the overview + the workspace model catalog (in parallel). Does nothing when no
   * workspace is selected. Never throws: a 503 (feature off) flips `available` to false,
   * any other failure is surfaced as a message in `error`.
   */
  async function load() {
    const ws = useWorkspaceStore()
    if (!ws.workspaceId) return
    loading.value = true
    error.value = null
    try {
      const [overview, modelList] = await Promise.all([
        api.getSandboxOverview(ws.requireId()),
        api.getWorkspaceModels(ws.requireId()),
      ])
      hydrate(overview)
      models.value = modelList
      available.value = true
    } catch (e) {
      if (httpStatusOf(e) === 503) {
        available.value = false
      } else {
        error.value = e instanceof Error ? e.message : String(e)
      }
    } finally {
      loading.value = false
    }
  }

  /** Selectable models for the experiment picker (the backend computed `available`). */
  const selectableModels = computed(() => models.value.filter((m) => m.available !== false))

  /**
   * Prompt versions for one agent kind, baselines first, then candidates. The order is
   * enforced here with a stable partition, so the relative order the backend returned is
   * kept inside each group.
   */
  function promptsForKind(agentKind: string): SandboxPromptVersion[] {
    const ofKind = prompts.value.filter((p) => p.agentKind === agentKind)
    return [
      ...ofKind.filter((p) => p.origin === 'baseline'),
      ...ofKind.filter((p) => p.origin !== 'baseline'),
    ]
  }

  /**
   * Fixtures an agent kind is exercised against, filtered by the catalog's `fixtureKinds`.
   * When the agent kind is not in the catalog, the whole fixture library is returned.
   */
  function fixturesForKind(agentKind: string): SandboxFixture[] {
    const meta = agentKinds.value.find((k) => k.agentKind === agentKind)
    if (!meta) return fixtures.value
    // The backend catalog is the source of truth for the fixture↔kind mapping.
    const wanted = meta.fixtureKinds
    return fixtures.value.filter((f) => wanted.includes(f.kind))
  }

  /** Clone a prompt into a new candidate via the API, then reload the overview. */
  async function clonePrompt(agentKind: string, basePromptId: string | null, name?: string) {
    const ws = useWorkspaceStore()
    const created = await api.cloneSandboxPrompt(ws.requireId(), { agentKind, basePromptId, name })
    await load()
    return created
  }

  /** Save new system text as a version under `parentId`, then reload the overview. */
  async function saveVersion(parentId: string, systemText: string) {
    const ws = useWorkspaceStore()
    const saved = await api.saveSandboxVersion(ws.requireId(), { parentId, systemText })
    await load()
    return saved
  }

  /** Archive a prompt via the API, then reload the overview. */
  async function archivePrompt(promptId: string) {
    const ws = useWorkspaceStore()
    await api.archiveSandboxPrompt(ws.requireId(), promptId)
    await load()
  }

  /**
   * Create an experiment definition and patch it into the list in place. As with `launch`,
   * no follow-up `load()` is made: the response already carries the new experiment, and a
   * transient failure of that reload would set `error` even though the create succeeded.
   */
  async function createExperiment(input: CreateSandboxExperimentInput) {
    const ws = useWorkspaceStore()
    const created = await api.createSandboxExperiment(ws.requireId(), input)
    upsertExperiment(created)
    return created
  }

  /** Fetch one experiment's detail and make it the currently-opened `detail`. */
  async function openExperiment(experimentId: string) {
    const ws = useWorkspaceStore()
    detail.value = await api.getSandboxExperiment(ws.requireId(), experimentId)
    return detail.value
  }

  /**
   * Launch an experiment. `launching` is true for the duration of the request; a failure
   * propagates to the caller (it is not written to `error`).
   */
  async function launch(experimentId: string) {
    const ws = useWorkspaceStore()
    launching.value = true
    try {
      // `launch` returns the full graded grid AND the updated experiment, so patch both in
      // place rather than calling `load()`: a transient failure in that follow-up fetch
      // would otherwise set `error` and hide the freshly-returned result grid behind the
      // error panel (and re-fetch the whole overview + model catalog for nothing).
      const result = await api.launchSandboxExperiment(ws.requireId(), experimentId)
      detail.value = result
      upsertExperiment(result.experiment)
      return result
    } finally {
      launching.value = false
    }
  }

  return {
    available,
    loading,
    error,
    agentKinds,
    prompts,
    fixtures,
    experiments,
    models,
    maxCells,
    selectableModels,
    detail,
    launching,
    load,
    promptsForKind,
    fixturesForKind,
    clonePrompt,
    saveVersion,
    archivePrompt,
    createExperiment,
    openExperiment,
    launch,
  }
})
package/app/stores/ui.ts CHANGED
@@ -104,6 +104,8 @@ export const useUiStore = defineStore('ui', () => {
104
104
  const vendorCredentialsOpen = ref(false)
105
105
  // Per-user settings panel: the signed-in user's own-machine local model runners.
106
106
  const localModelsOpen = ref(false)
107
+ // The Sandbox (parallel prompt/model testing) surface — an opt-in, on-demand window.
108
+ const sandboxOpen = ref(false)
107
109
  const userSecretsOpen = ref(false)
108
110
  // Per-workspace settings panel: the OpenRouter dynamic catalog (browse/enable gateway models).
109
111
  const openRouterOpen = ref(false)
@@ -364,6 +366,12 @@ export const useUiStore = defineStore('ui', () => {
364
366
  function closeLocalModels() {
365
367
  localModelsOpen.value = false
366
368
  }
369
+ function openSandbox() {
370
+ sandboxOpen.value = true
371
+ }
372
+ function closeSandbox() {
373
+ sandboxOpen.value = false
374
+ }
367
375
  function openUserSecrets() {
368
376
  userSecretsOpen.value = true
369
377
  }
@@ -464,6 +472,7 @@ export const useUiStore = defineStore('ui', () => {
464
472
  modelConfigOpen,
465
473
  vendorCredentialsOpen,
466
474
  localModelsOpen,
475
+ sandboxOpen,
467
476
  userSecretsOpen,
468
477
  openRouterOpen,
469
478
  aiProviderSetupOpen,
@@ -528,6 +537,8 @@ export const useUiStore = defineStore('ui', () => {
528
537
  closeVendorCredentials,
529
538
  openLocalModels,
530
539
  closeLocalModels,
540
+ openSandbox,
541
+ closeSandbox,
531
542
  openUserSecrets,
532
543
  closeUserSecrets,
533
544
  openOpenRouter,
@@ -50,8 +50,12 @@ export type RequirementReviewStatus =
50
50
/** The options a human has for resolving a review that hit its iteration cap. */
export type ResolveRequirementsExceededChoice =
  | 'extra-round'
  | 'proceed'
  | 'stop-reset'
52
52
 
53
/**
 * Lifecycle of a Requirement-Writer recommendation.
 *
 * - `pending`: placeholder created the moment the human requests a recommendation; the
 *   Writer is still producing the suggestion in the background (the async story).
 * - `ready`: the placeholder has been filled in via the `requirements` stream.
 * - `accepted` / `rejected`: remaining lifecycle states (presumably the human's decision on
 *   a `ready` recommendation — confirm against the contracts).
 */
export type RecommendationStatus =
  | 'pending'
  | 'ready'
  | 'accepted'
  | 'rejected'
55
59
 
56
60
  /**
57
61
  * A Requirement-Writer suggestion for one finding. First-class on the review (survives the
@@ -60,7 +64,7 @@ export type RecommendationStatus = 'ready' | 'accepted' | 'rejected'
60
64
  */
61
65
  export interface RequirementRecommendation {
62
66
  id: string
63
- sourceFinding: { title: string; detail: string }
67
+ sourceFinding: { title: string; detail: string; itemId?: string }
64
68
  recommendedText: string
65
69
  status: RecommendationStatus
66
70
  note: string | null
@@ -0,0 +1,183 @@
1
+ // Sandbox (the parallel prompt/model testing surface) wire shapes, hand-mirrored from
2
+ // `@cat-factory/contracts` (sandbox.ts) so a backend payload drops straight into the
3
+ // store. Clone a shipped agent prompt into a versioned candidate, run an experiment
4
+ // matrix (prompt versions × models × fixtures) for one agent kind, and grade every cell
5
+ // with a judge model plus (where a fixture supports it) an objective findings score.
6
+
7
/** Where a prompt version comes from: a shipped `baseline`, or a cloned, versioned `candidate`. */
export type SandboxPromptOrigin =
  | 'baseline'
  | 'candidate'
8
+
9
/**
 * One prompt version for an agent kind, as carried in the overview's `prompts` list.
 * Field meanings beyond the names are defined by the backend contracts this mirrors.
 */
export interface SandboxPromptVersion {
  id: string
  // NOTE(review): presumably shared by all versions of the same clone — confirm.
  lineageId: string
  /** Agent kind this prompt belongs to. */
  agentKind: string
  name: string
  origin: SandboxPromptOrigin
  /** The prompt's system text. */
  systemText: string
  basePromptId: string | null
  version: number
  // NOTE(review): presumably the version this one was saved from — confirm.
  parentId: string | null
  labels: string[]
  /** Numeric timestamp (unit not visible here). */
  createdAt: number
  createdBy: string | null
  /** `null` while the prompt is not archived. */
  archivedAt: number | null
}
24
+
25
/**
 * Kind tag of a fixture. The agent-kind catalog (`SandboxAgentKindMeta.fixtureKinds`) lists
 * which of these kinds each agent is exercised against.
 */
export type SandboxFixtureKind =
  | 'requirements'
  | 'clarity'
  | 'architecture'
  | 'code-review'
  | 'repo-feature'
  | 'repo-bug'
32
+
33
/** One entry of a `findings` objective's `expectations` list. */
export interface SandboxExpectation {
  id: string
  summary: string
  detail: string
  // NOTE(review): scale/range of `trickiness` and `impact` is not visible here — confirm.
  trickiness: number
  impact: number
  // NOTE(review): presumably text hints for matching an output to this expectation — confirm.
  matchHints: string[]
}
41
+
42
/**
 * A fixture's objective check, discriminated by `kind`: `tests` carries a test command,
 * `findings` carries the list of expectations.
 */
export type SandboxFixtureObjective =
  | { kind: 'tests'; testCmd: string }
  | { kind: 'findings'; expectations: SandboxExpectation[] }
45
+
46
/** One entry of the fixture library that experiments run against. */
export interface SandboxFixture {
  id: string
  /** Kind tag; the store filters the library by it per agent kind. */
  kind: SandboxFixtureKind
  name: string
  /** Free-form payload; its shape is not fixed by this type. */
  payload: Record<string, unknown> | null
  /** Repository reference, or `null` when the fixture has none. */
  repoRef: { owner: string; name: string; seedRef: string } | null
  /** Objective check, or `null` when the fixture has none. */
  objective: SandboxFixtureObjective | null
  origin: 'builtin' | 'custom'
  /** Numeric timestamp (unit not visible here). */
  createdAt: number
}
56
+
57
/** Status values an experiment can carry. */
export type SandboxExperimentStatus =
  | 'draft'
  | 'running'
  | 'done'
  | 'failed'
58
+
59
/** The experiment matrix: prompt versions × models × fixtures. */
export interface SandboxMatrix {
  promptVersionIds: string[]
  models: string[]
  fixtureIds: string[]
}
64
+
65
/** An experiment definition: a matrix for one agent kind, graded by a judge model. */
export interface SandboxExperiment {
  id: string
  name: string
  agentKind: string
  judgeModel: string
  // NOTE(review): presumably how many times each matrix cell is run — confirm.
  repeats: number
  status: SandboxExperimentStatus
  matrix: SandboxMatrix
  /** Token budget, or `null` when none is set. */
  budgetTokens: number | null
  /** Numeric timestamp; the sandbox store orders experiments newest-first by it. */
  createdAt: number
  createdBy: string | null
}
77
+
78
/** Status values a single run can carry. */
export type SandboxRunStatus =
  | 'queued'
  | 'running'
  | 'done'
  | 'failed'
79
+
80
/** Input and output token counts reported for a run. */
export interface SandboxTokenUsage {
  inputTokens: number
  outputTokens: number
}
84
+
85
/**
 * One run in an experiment's result grid, identified by its prompt version, model, fixture
 * and repeat index. Nullable fields are `null` when the value is absent for that run.
 */
export interface SandboxRun {
  id: string
  experimentId: string
  promptVersionId: string
  model: string
  fixtureId: string
  repeatIndex: number
  status: SandboxRunStatus
  outputText: string | null
  usage: SandboxTokenUsage | null
  latencyMs: number | null
  // NOTE(review): `branch`, `prUrl`, `diff` and `seedSha` presumably apply to repo-backed
  // fixtures only — confirm.
  branch: string | null
  prUrl: string | null
  diff: string | null
  error: string | null
  seedSha: string | null
  promptLabel: string
  startedAt: number | null
  finishedAt: number | null
}
105
+
106
/** One scored dimension of a grade, with its rationale text. */
export interface SandboxGradeDimension {
  key: string
  // NOTE(review): score scale is not visible here — confirm.
  score: number
  rationale: string
}
111
+
112
/** Result of a fixture's objective check, attached to a grade. */
export interface SandboxObjectiveResult {
  /** Same tags as `SandboxFixtureObjective['kind']`. */
  kind: 'tests' | 'findings'
  pass: boolean
  detail: string
  // NOTE(review): the nullable metrics below presumably apply to `findings` results only
  // and are `null` for `tests` — confirm.
  impactRecall: number | null
  wowBonus: number | null
  caught: number | null
  total: number | null
  missedHighImpact: string[] | null
}
122
+
123
/** A judge model's grade for one run, plus the objective result where there is one. */
export interface SandboxGrade {
  id: string
  /** Id of the run this grade refers to. */
  runId: string
  judgeModel: string
  scores: SandboxGradeDimension[]
  weightedTotal: number
  /** `null` when no objective result is attached. */
  objective: SandboxObjectiveResult | null
  createdAt: number
}
132
+
133
/** Catalog entry describing one testable agent kind, delivered with the overview. */
export interface SandboxAgentKindMeta {
  agentKind: string
  label: string
  bucket: 'inline' | 'container'
  rubric: 'requirement-review' | 'code-review' | 'implementation'
  /** Fixture kinds this agent is exercised against (the UI filters the library by these). */
  fixtureKinds: SandboxFixtureKind[]
  basePromptId: string | null
}
143
+
144
/** Composite payload the management surface loads on open (`GET /sandbox/overview`). */
export interface SandboxOverview {
  agentKinds: SandboxAgentKindMeta[]
  prompts: SandboxPromptVersion[]
  fixtures: SandboxFixture[]
  experiments: SandboxExperiment[]
  /** The matrix cell cap (the backend cost guard), so the builder gates on the same limit. */
  maxCells: number
}
153
+
154
/**
 * An experiment together with its result grid: the runs and their grades. Returned by
 * `GET /sandbox/experiments/:id` and by launch.
 */
export interface SandboxExperimentDetail {
  experiment: SandboxExperiment
  runs: SandboxRun[]
  grades: SandboxGrade[]
}
160
+
161
+ // ---- request bodies --------------------------------------------------------
162
+
163
/** Request body for cloning a prompt into a candidate; `name` and `labels` are optional. */
export interface CloneSandboxPromptInput {
  agentKind: string
  basePromptId: string | null
  name?: string
  labels?: string[]
}
169
+
170
/** Request body for saving new system text as a version under `parentId`; `labels` is optional. */
export interface SaveSandboxVersionInput {
  parentId: string
  systemText: string
  labels?: string[]
}
175
+
176
/**
 * Request body for creating an experiment definition. `name`, `agentKind` and `matrix` are
 * required; what the backend uses when the optional fields are omitted is not visible here.
 */
export interface CreateSandboxExperimentInput {
  name: string
  agentKind: string
  matrix: SandboxMatrix
  judgeModel?: string
  repeats?: number
  budgetTokens?: number | null
}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@cat-factory/app",
3
- "version": "0.27.0",
3
+ "version": "0.28.1",
4
4
  "description": "Reusable Nuxt layer for the Agent Architecture Board SPA (components, stores, composables, pages). Consume it from a thin deployment app via `extends: ['@cat-factory/app']` and point it at your backend with NUXT_PUBLIC_API_BASE. See deploy/frontend for an example.",
5
5
  "repository": {
6
6
  "type": "git",