@cat-factory/app 0.27.0 → 0.28.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/app/components/layout/CommandBar.vue +8 -0
- package/app/components/layout/SideBar.vue +13 -0
- package/app/components/requirements/RequirementsReviewWindow.vue +75 -13
- package/app/components/sandbox/SandboxPanel.vue +542 -0
- package/app/composables/api/reviews.ts +9 -3
- package/app/composables/api/sandbox.ts +57 -0
- package/app/composables/useApi.ts +2 -0
- package/app/pages/index.vue +2 -0
- package/app/stores/requirements.ts +24 -10
- package/app/stores/sandbox.ts +174 -0
- package/app/stores/ui.ts +11 -0
- package/app/types/requirements.ts +7 -3
- package/app/types/sandbox.ts +183 -0
- package/package.json +1 -1
|
@@ -181,6 +181,14 @@ const commands = computed<Command[]>(() => {
|
|
|
181
181
|
keywords: 'local model runner ollama lm studio llamacpp vllm endpoint',
|
|
182
182
|
run: () => ui.openLocalModels(),
|
|
183
183
|
})
|
|
184
|
+
list.push({
|
|
185
|
+
id: 'sandbox',
|
|
186
|
+
label: 'Open Sandbox',
|
|
187
|
+
group: 'Workspace',
|
|
188
|
+
icon: 'i-lucide-flask-conical',
|
|
189
|
+
keywords: 'sandbox prompt model test experiment judge fixture benchmark evaluate',
|
|
190
|
+
run: () => ui.openSandbox(),
|
|
191
|
+
})
|
|
184
192
|
|
|
185
193
|
return list
|
|
186
194
|
})
|
|
@@ -125,6 +125,19 @@ watch(
|
|
|
125
125
|
>
|
|
126
126
|
Integrations
|
|
127
127
|
</UButton>
|
|
128
|
+
<!-- The Sandbox: try prompt versions/models against graded fixtures, off to the
|
|
129
|
+
side of the board. Opens the on-demand testing window. -->
|
|
130
|
+
<UButton
|
|
131
|
+
block
|
|
132
|
+
color="primary"
|
|
133
|
+
variant="soft"
|
|
134
|
+
size="sm"
|
|
135
|
+
icon="i-lucide-flask-conical"
|
|
136
|
+
class="justify-start"
|
|
137
|
+
@click="ui.openSandbox()"
|
|
138
|
+
>
|
|
139
|
+
Sandbox
|
|
140
|
+
</UButton>
|
|
128
141
|
</div>
|
|
129
142
|
</section>
|
|
130
143
|
|
|
@@ -214,10 +214,24 @@ async function setStatus(item: RequirementReviewItem, itemStatus: ReviewItemStat
|
|
|
214
214
|
const recommending = computed(() =>
|
|
215
215
|
blockId.value ? requirements.isRecommending(blockId.value) : false,
|
|
216
216
|
)
|
|
217
|
-
// Recommendations still
|
|
218
|
-
const
|
|
217
|
+
// Recommendations the Writer has produced that still await a human decision (`ready`).
|
|
218
|
+
const readyRecommendations = computed<RequirementRecommendation[]>(() =>
|
|
219
219
|
(review.value?.recommendations ?? []).filter((r) => r.status === 'ready'),
|
|
220
220
|
)
|
|
221
|
+
// Placeholders the Requirement Writer is still producing in the background (`pending`).
|
|
222
|
+
const generatingRecommendations = computed<RequirementRecommendation[]>(() =>
|
|
223
|
+
(review.value?.recommendations ?? []).filter((r) => r.status === 'pending'),
|
|
224
|
+
)
|
|
225
|
+
// "ready / total" progress for the in-flight batch (null when nothing is generating). Scoped to
|
|
226
|
+
// the current wave via `createdAt` (all placeholders in one request share the timestamp), so
|
|
227
|
+
// stale `ready` recommendations the human hasn't acted on from an earlier batch don't inflate it.
|
|
228
|
+
const recommendationProgress = computed(() => {
|
|
229
|
+
const generating = generatingRecommendations.value
|
|
230
|
+
if (generating.length === 0) return null
|
|
231
|
+
const batchTimes = new Set(generating.map((r) => r.createdAt))
|
|
232
|
+
const ready = readyRecommendations.value.filter((r) => batchTimes.has(r.createdAt)).length
|
|
233
|
+
return { ready, total: ready + generating.length }
|
|
234
|
+
})
|
|
221
235
|
function isMarkedForRecommend(item: RequirementReviewItem): boolean {
|
|
222
236
|
return markedForRecommend.value.has(item.id)
|
|
223
237
|
}
|
|
@@ -228,18 +242,37 @@ function toggleRecommend(item: RequirementReviewItem) {
|
|
|
228
242
|
markedForRecommend.value = next
|
|
229
243
|
}
|
|
230
244
|
|
|
231
|
-
// Fire the Writer over the whole marked batch
|
|
232
|
-
//
|
|
245
|
+
// Fire the Writer over the whole marked batch (grounded on the project's best-practice
|
|
246
|
+
// standards, specs/tech-specs and web search). ASYNCHRONOUS: it returns at once with `pending`
|
|
247
|
+
// placeholders that fill in live; the user can close the window and is notified when the batch
|
|
248
|
+
// is ready. Flush any typed-but-unblurred answers first so nothing the human entered is lost.
|
|
233
249
|
async function requestRecommendations() {
|
|
234
250
|
if (!blockId.value || markedForRecommend.value.size === 0) return
|
|
235
251
|
const ids = [...markedForRecommend.value]
|
|
236
252
|
try {
|
|
237
|
-
await
|
|
253
|
+
await flushDrafts()
|
|
254
|
+
const updated = await requirements.requestRecommendations(blockId.value, ids)
|
|
238
255
|
markedForRecommend.value = new Set()
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
256
|
+
const n = ids.length
|
|
257
|
+
const plural = n === 1 ? '' : 's'
|
|
258
|
+
// On a parked run the request returns at once with `pending` placeholders the durable driver
|
|
259
|
+
// fills in the background; off-path (no active pipeline) there is no driver, so the Writer
|
|
260
|
+
// ran inline and the recommendations are already settled. Tell the human which actually
|
|
261
|
+
// happened rather than always promising a background callback.
|
|
262
|
+
const stillGenerating = (updated?.recommendations ?? []).some((r) => r.status === 'pending')
|
|
263
|
+
toast.add(
|
|
264
|
+
stillGenerating
|
|
265
|
+
? {
|
|
266
|
+
title: `Preparing ${n} recommendation${plural} in the background`,
|
|
267
|
+
description:
|
|
268
|
+
"Your answers are saved — close this if you like; we'll notify you when they're ready.",
|
|
269
|
+
icon: 'i-lucide-sparkles',
|
|
270
|
+
}
|
|
271
|
+
: {
|
|
272
|
+
title: `${n} recommendation${plural} ready`,
|
|
273
|
+
icon: 'i-lucide-sparkles',
|
|
274
|
+
},
|
|
275
|
+
)
|
|
243
276
|
} catch (e) {
|
|
244
277
|
notifyError('Could not request recommendations', e)
|
|
245
278
|
}
|
|
@@ -567,18 +600,47 @@ async function resolveExceeded(choice: 'extra-round' | 'proceed' | 'stop-reset')
|
|
|
567
600
|
</div>
|
|
568
601
|
</div>
|
|
569
602
|
|
|
570
|
-
<!-- Requirement-Writer recommendations awaiting a human decision
|
|
603
|
+
<!-- Requirement-Writer recommendations: awaiting a human decision (`ready`) and/or
|
|
604
|
+
still generating in the background (`pending`) -->
|
|
571
605
|
<section
|
|
572
|
-
v-if="
|
|
606
|
+
v-if="readyRecommendations.length || generatingRecommendations.length"
|
|
573
607
|
class="mt-6 border-t border-slate-800 pt-5"
|
|
574
608
|
>
|
|
575
|
-
<div class="mb-3 flex items-center gap-
|
|
609
|
+
<div class="mb-3 flex items-center gap-2 text-[11px] text-indigo-300">
|
|
576
610
|
<UIcon name="i-lucide-wand-2" class="h-3.5 w-3.5" />
|
|
577
611
|
<span class="font-semibold uppercase tracking-wide">Recommended answers</span>
|
|
612
|
+
<span
|
|
613
|
+
v-if="recommendationProgress"
|
|
614
|
+
class="ml-auto flex items-center gap-1.5 normal-case text-indigo-300/80"
|
|
615
|
+
>
|
|
616
|
+
<UIcon name="i-lucide-loader-circle" class="h-3.5 w-3.5 animate-spin" />
|
|
617
|
+
{{ recommendationProgress.ready }} / {{ recommendationProgress.total }} ready
|
|
618
|
+
</span>
|
|
578
619
|
</div>
|
|
620
|
+
|
|
621
|
+
<!-- still-generating placeholders (one per requested finding) -->
|
|
622
|
+
<div v-if="generatingRecommendations.length" class="mb-3 flex flex-col gap-3">
|
|
623
|
+
<div
|
|
624
|
+
v-for="rec in generatingRecommendations"
|
|
625
|
+
:key="rec.id"
|
|
626
|
+
class="flex items-start gap-2 rounded-lg border border-dashed border-indigo-900/50 bg-indigo-950/10 p-3"
|
|
627
|
+
>
|
|
628
|
+
<UIcon
|
|
629
|
+
name="i-lucide-loader-circle"
|
|
630
|
+
class="mt-0.5 h-4 w-4 shrink-0 animate-spin text-indigo-300"
|
|
631
|
+
/>
|
|
632
|
+
<div class="min-w-0">
|
|
633
|
+
<span class="text-sm font-medium text-white">{{
|
|
634
|
+
rec.sourceFinding.title
|
|
635
|
+
}}</span>
|
|
636
|
+
<p class="text-xs text-indigo-300/70">Generating a grounded suggestion…</p>
|
|
637
|
+
</div>
|
|
638
|
+
</div>
|
|
639
|
+
</div>
|
|
640
|
+
|
|
579
641
|
<div class="flex flex-col gap-3">
|
|
580
642
|
<div
|
|
581
|
-
v-for="rec in
|
|
643
|
+
v-for="rec in readyRecommendations"
|
|
582
644
|
:key="rec.id"
|
|
583
645
|
class="rounded-lg border border-indigo-900/50 bg-indigo-950/20 p-3"
|
|
584
646
|
>
|
|
@@ -0,0 +1,542 @@
|
|
|
1
|
+
<script setup lang="ts">
|
|
2
|
+
// The Sandbox surface: a parallel place to test prompts and models against graded
|
|
3
|
+
// fixtures, without touching the board. Three tabs — Experiments (define a matrix of
|
|
4
|
+
// prompt versions × models × fixtures for one agent kind, run it, read the graded grid),
|
|
5
|
+
// Prompts (clone a shipped baseline into an editable candidate lineage and version it),
|
|
6
|
+
// and Fixtures (the graded inputs each run is scored against). Loaded on demand when the
|
|
7
|
+
// window opens; 503 (the deployment hasn't provisioned the Sandbox DB) shows a notice.
|
|
8
|
+
import { computed, ref, watch } from 'vue'
|
|
9
|
+
import type { SandboxGrade, SandboxPromptVersion, SandboxRun } from '~/types/sandbox'
|
|
10
|
+
|
|
11
|
+
const ui = useUiStore()
|
|
12
|
+
const store = useSandboxStore()
|
|
13
|
+
const toast = useToast()
|
|
14
|
+
|
|
15
|
+
const open = computed({
|
|
16
|
+
get: () => ui.sandboxOpen,
|
|
17
|
+
set: (v: boolean) => (v ? ui.openSandbox() : ui.closeSandbox()),
|
|
18
|
+
})
|
|
19
|
+
|
|
20
|
+
const tab = ref<'experiments' | 'prompts' | 'fixtures'>('experiments')
|
|
21
|
+
|
|
22
|
+
watch(open, (isOpen) => {
|
|
23
|
+
if (isOpen) void store.load()
|
|
24
|
+
})
|
|
25
|
+
|
|
26
|
+
// ---- experiment builder ----------------------------------------------------
|
|
27
|
+
const agentKind = ref('requirements-review')
|
|
28
|
+
const name = ref('')
|
|
29
|
+
const selectedPromptIds = ref<string[]>([])
|
|
30
|
+
const selectedModelIds = ref<string[]>([])
|
|
31
|
+
const selectedFixtureIds = ref<string[]>([])
|
|
32
|
+
// The judge model. Empty string = the deployment's routing default (resolved server-side);
|
|
33
|
+
// picking one explicitly is the recourse on a deployment that has no default model wired,
|
|
34
|
+
// where leaving it on default makes every run fail at create time.
|
|
35
|
+
const selectedJudgeModel = ref<string>('')
|
|
36
|
+
|
|
37
|
+
const judgeModelItems = computed(() => [
|
|
38
|
+
{ label: 'Deployment default', value: '' },
|
|
39
|
+
...store.selectableModels.map((m) => ({ label: m.label, value: m.id })),
|
|
40
|
+
])
|
|
41
|
+
|
|
42
|
+
const kindPrompts = computed(() => store.promptsForKind(agentKind.value))
|
|
43
|
+
const kindFixtures = computed(() => store.fixturesForKind(agentKind.value))
|
|
44
|
+
|
|
45
|
+
// Reset the builder selections to sensible defaults when the agent kind (or loaded data)
|
|
46
|
+
// changes: every baseline prompt + every fixture for the kind, no models yet.
|
|
47
|
+
watch(
|
|
48
|
+
[agentKind, () => store.prompts, () => store.fixtures],
|
|
49
|
+
() => {
|
|
50
|
+
selectedPromptIds.value = kindPrompts.value
|
|
51
|
+
.filter((p) => p.origin === 'baseline')
|
|
52
|
+
.map((p) => p.id)
|
|
53
|
+
selectedFixtureIds.value = kindFixtures.value.map((f) => f.id)
|
|
54
|
+
},
|
|
55
|
+
{ immediate: true },
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
const cellCount = computed(
|
|
59
|
+
() =>
|
|
60
|
+
selectedPromptIds.value.length *
|
|
61
|
+
selectedModelIds.value.length *
|
|
62
|
+
selectedFixtureIds.value.length,
|
|
63
|
+
)
|
|
64
|
+
|
|
65
|
+
const canRun = computed(() => cellCount.value > 0 && cellCount.value <= store.maxCells)
|
|
66
|
+
|
|
67
|
+
function toggle(which: 'prompt' | 'model' | 'fixture', id: string, on: boolean) {
|
|
68
|
+
const list =
|
|
69
|
+
which === 'prompt'
|
|
70
|
+
? selectedPromptIds
|
|
71
|
+
: which === 'model'
|
|
72
|
+
? selectedModelIds
|
|
73
|
+
: selectedFixtureIds
|
|
74
|
+
list.value = on ? [...new Set([...list.value, id])] : list.value.filter((x) => x !== id)
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
async function createAndRun() {
|
|
78
|
+
if (!canRun.value) return
|
|
79
|
+
try {
|
|
80
|
+
const created = await store.createExperiment({
|
|
81
|
+
name: name.value.trim() || `${agentKind.value} — sandbox run`,
|
|
82
|
+
agentKind: agentKind.value,
|
|
83
|
+
judgeModel: selectedJudgeModel.value || undefined,
|
|
84
|
+
matrix: {
|
|
85
|
+
promptVersionIds: selectedPromptIds.value,
|
|
86
|
+
models: selectedModelIds.value,
|
|
87
|
+
fixtureIds: selectedFixtureIds.value,
|
|
88
|
+
},
|
|
89
|
+
})
|
|
90
|
+
name.value = ''
|
|
91
|
+
toast.add({ title: 'Running experiment…', icon: 'i-lucide-flask-conical', color: 'info' })
|
|
92
|
+
await store.launch(created.id)
|
|
93
|
+
toast.add({ title: 'Experiment complete', icon: 'i-lucide-check', color: 'success' })
|
|
94
|
+
} catch (e) {
|
|
95
|
+
toast.add({
|
|
96
|
+
title: 'Could not run the experiment',
|
|
97
|
+
description: e instanceof Error ? e.message : String(e),
|
|
98
|
+
icon: 'i-lucide-triangle-alert',
|
|
99
|
+
color: 'error',
|
|
100
|
+
})
|
|
101
|
+
}
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
// ---- results grid ----------------------------------------------------------
|
|
105
|
+
const gradeByRun = computed(() => {
|
|
106
|
+
const map = new Map<string, SandboxGrade>()
|
|
107
|
+
for (const g of store.detail?.grades ?? []) map.set(g.runId, g)
|
|
108
|
+
return map
|
|
109
|
+
})
|
|
110
|
+
const selectedRun = ref<SandboxRun | null>(null)
|
|
111
|
+
|
|
112
|
+
function scoreColor(score: number): string {
|
|
113
|
+
if (score >= 4) return 'text-emerald-400'
|
|
114
|
+
if (score >= 3) return 'text-amber-400'
|
|
115
|
+
return 'text-rose-400'
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
// ---- prompt editor ---------------------------------------------------------
|
|
119
|
+
const editing = ref<SandboxPromptVersion | null>(null)
|
|
120
|
+
const editText = ref('')
|
|
121
|
+
const savingPrompt = ref(false)
|
|
122
|
+
|
|
123
|
+
function edit(prompt: SandboxPromptVersion) {
|
|
124
|
+
editing.value = prompt
|
|
125
|
+
editText.value = prompt.systemText
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
async function saveVersion() {
|
|
129
|
+
if (!editing.value || !editText.value.trim()) return
|
|
130
|
+
savingPrompt.value = true
|
|
131
|
+
try {
|
|
132
|
+
await store.saveVersion(editing.value.id, editText.value)
|
|
133
|
+
toast.add({ title: 'Saved a new version', icon: 'i-lucide-check', color: 'success' })
|
|
134
|
+
editing.value = null
|
|
135
|
+
} catch (e) {
|
|
136
|
+
toast.add({
|
|
137
|
+
title: 'Could not save the version',
|
|
138
|
+
description: e instanceof Error ? e.message : String(e),
|
|
139
|
+
icon: 'i-lucide-triangle-alert',
|
|
140
|
+
color: 'error',
|
|
141
|
+
})
|
|
142
|
+
} finally {
|
|
143
|
+
savingPrompt.value = false
|
|
144
|
+
}
|
|
145
|
+
}
|
|
146
|
+
|
|
147
|
+
async function archive(prompt: SandboxPromptVersion) {
|
|
148
|
+
try {
|
|
149
|
+
await store.archivePrompt(prompt.id)
|
|
150
|
+
if (editing.value?.id === prompt.id) editing.value = null
|
|
151
|
+
} catch (e) {
|
|
152
|
+
toast.add({
|
|
153
|
+
title: 'Could not archive',
|
|
154
|
+
description: e instanceof Error ? e.message : String(e),
|
|
155
|
+
icon: 'i-lucide-triangle-alert',
|
|
156
|
+
color: 'error',
|
|
157
|
+
})
|
|
158
|
+
}
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
const fixtureName = (id: string) => store.fixtures.find((f) => f.id === id)?.name ?? id
|
|
162
|
+
</script>
|
|
163
|
+
|
|
164
|
+
<template>
|
|
165
|
+
<UModal
|
|
166
|
+
v-model:open="open"
|
|
167
|
+
title="Sandbox — prompt & model testing"
|
|
168
|
+
description="Try prompt versions and models against graded fixtures, scored by a judge model."
|
|
169
|
+
:ui="{ content: 'max-w-5xl' }"
|
|
170
|
+
>
|
|
171
|
+
<template #body>
|
|
172
|
+
<div v-if="store.loading" class="flex items-center justify-center py-12">
|
|
173
|
+
<UIcon name="i-lucide-loader-circle" class="h-6 w-6 animate-spin text-slate-400" />
|
|
174
|
+
</div>
|
|
175
|
+
|
|
176
|
+
<div
|
|
177
|
+
v-else-if="!store.available"
|
|
178
|
+
class="rounded-lg border border-slate-700 bg-slate-900/50 p-6 text-sm text-slate-300"
|
|
179
|
+
>
|
|
180
|
+
<p class="font-medium text-slate-200">The Sandbox isn't enabled for this deployment.</p>
|
|
181
|
+
<p class="mt-1 text-slate-400">
|
|
182
|
+
It needs its own database (a dedicated <code>SANDBOX_DB</code> on Cloudflare, or the
|
|
183
|
+
<code>sandbox</code> Postgres schema on Node). Provision it and reload.
|
|
184
|
+
</p>
|
|
185
|
+
</div>
|
|
186
|
+
|
|
187
|
+
<div
|
|
188
|
+
v-else-if="store.error"
|
|
189
|
+
class="rounded-lg border border-rose-800 bg-rose-950/40 p-6 text-sm text-rose-200"
|
|
190
|
+
>
|
|
191
|
+
<p class="font-medium text-rose-100">The Sandbox failed to load.</p>
|
|
192
|
+
<p class="mt-1 text-rose-300">{{ store.error }}</p>
|
|
193
|
+
<UButton class="mt-3" size="xs" color="neutral" variant="subtle" @click="store.load()">
|
|
194
|
+
Retry
|
|
195
|
+
</UButton>
|
|
196
|
+
</div>
|
|
197
|
+
|
|
198
|
+
<div v-else class="space-y-4">
|
|
199
|
+
<UTabs
|
|
200
|
+
v-model="tab"
|
|
201
|
+
:items="[
|
|
202
|
+
{ label: 'Experiments', value: 'experiments', icon: 'i-lucide-flask-conical' },
|
|
203
|
+
{ label: 'Prompts', value: 'prompts', icon: 'i-lucide-file-text' },
|
|
204
|
+
{ label: 'Fixtures', value: 'fixtures', icon: 'i-lucide-clipboard-list' },
|
|
205
|
+
]"
|
|
206
|
+
/>
|
|
207
|
+
|
|
208
|
+
<!-- ============================= EXPERIMENTS ============================= -->
|
|
209
|
+
<div v-if="tab === 'experiments'" class="grid gap-4 lg:grid-cols-2">
|
|
210
|
+
<!-- builder -->
|
|
211
|
+
<div class="space-y-3 rounded-lg border border-slate-700 bg-slate-900/40 p-3">
|
|
212
|
+
<p class="text-[11px] font-semibold uppercase tracking-wide text-slate-400">
|
|
213
|
+
New experiment
|
|
214
|
+
</p>
|
|
215
|
+
|
|
216
|
+
<UFormField label="Agent">
|
|
217
|
+
<USelect
|
|
218
|
+
v-model="agentKind"
|
|
219
|
+
:items="store.agentKinds.map((k) => ({ label: k.label, value: k.agentKind }))"
|
|
220
|
+
value-key="value"
|
|
221
|
+
class="w-full"
|
|
222
|
+
/>
|
|
223
|
+
</UFormField>
|
|
224
|
+
|
|
225
|
+
<div>
|
|
226
|
+
<span class="mb-1 block text-[10px] uppercase tracking-wide text-slate-500">
|
|
227
|
+
Prompt versions
|
|
228
|
+
</span>
|
|
229
|
+
<div class="max-h-28 space-y-1 overflow-auto pr-1">
|
|
230
|
+
<label
|
|
231
|
+
v-for="p in kindPrompts"
|
|
232
|
+
:key="p.id"
|
|
233
|
+
class="flex items-center gap-2 text-sm text-slate-300"
|
|
234
|
+
>
|
|
235
|
+
<UCheckbox
|
|
236
|
+
:model-value="selectedPromptIds.includes(p.id)"
|
|
237
|
+
@update:model-value="
|
|
238
|
+
(v: boolean | 'indeterminate') => toggle('prompt', p.id, v === true)
|
|
239
|
+
"
|
|
240
|
+
/>
|
|
241
|
+
<span class="truncate">{{ p.name }}</span>
|
|
242
|
+
<UBadge
|
|
243
|
+
:color="p.origin === 'baseline' ? 'neutral' : 'primary'"
|
|
244
|
+
variant="soft"
|
|
245
|
+
size="xs"
|
|
246
|
+
>
|
|
247
|
+
{{ p.origin === 'baseline' ? 'baseline' : `v${p.version}` }}
|
|
248
|
+
</UBadge>
|
|
249
|
+
</label>
|
|
250
|
+
</div>
|
|
251
|
+
</div>
|
|
252
|
+
|
|
253
|
+
<div>
|
|
254
|
+
<span class="mb-1 block text-[10px] uppercase tracking-wide text-slate-500">
|
|
255
|
+
Models
|
|
256
|
+
</span>
|
|
257
|
+
<div class="max-h-28 space-y-1 overflow-auto pr-1">
|
|
258
|
+
<label
|
|
259
|
+
v-for="m in store.selectableModels"
|
|
260
|
+
:key="m.id"
|
|
261
|
+
class="flex items-center gap-2 text-sm text-slate-300"
|
|
262
|
+
>
|
|
263
|
+
<UCheckbox
|
|
264
|
+
:model-value="selectedModelIds.includes(m.id)"
|
|
265
|
+
@update:model-value="
|
|
266
|
+
(v: boolean | 'indeterminate') => toggle('model', m.id, v === true)
|
|
267
|
+
"
|
|
268
|
+
/>
|
|
269
|
+
<span class="truncate">{{ m.label }}</span>
|
|
270
|
+
</label>
|
|
271
|
+
<p v-if="!store.selectableModels.length" class="text-xs text-slate-500">
|
|
272
|
+
No selectable models — configure a provider key or enable Cloudflare AI.
|
|
273
|
+
</p>
|
|
274
|
+
</div>
|
|
275
|
+
</div>
|
|
276
|
+
|
|
277
|
+
<div>
|
|
278
|
+
<span class="mb-1 block text-[10px] uppercase tracking-wide text-slate-500">
|
|
279
|
+
Fixtures
|
|
280
|
+
</span>
|
|
281
|
+
<div class="max-h-28 space-y-1 overflow-auto pr-1">
|
|
282
|
+
<label
|
|
283
|
+
v-for="f in kindFixtures"
|
|
284
|
+
:key="f.id"
|
|
285
|
+
class="flex items-center gap-2 text-sm text-slate-300"
|
|
286
|
+
>
|
|
287
|
+
<UCheckbox
|
|
288
|
+
:model-value="selectedFixtureIds.includes(f.id)"
|
|
289
|
+
@update:model-value="
|
|
290
|
+
(v: boolean | 'indeterminate') => toggle('fixture', f.id, v === true)
|
|
291
|
+
"
|
|
292
|
+
/>
|
|
293
|
+
<span class="truncate">{{ f.name }}</span>
|
|
294
|
+
</label>
|
|
295
|
+
<p v-if="!kindFixtures.length" class="text-xs text-slate-500">
|
|
296
|
+
No fixtures for this agent.
|
|
297
|
+
</p>
|
|
298
|
+
</div>
|
|
299
|
+
</div>
|
|
300
|
+
|
|
301
|
+
<UFormField label="Judge model" hint="grades every cell">
|
|
302
|
+
<USelect v-model="selectedJudgeModel" :items="judgeModelItems" />
|
|
303
|
+
</UFormField>
|
|
304
|
+
|
|
305
|
+
<UFormField label="Name (optional)">
|
|
306
|
+
<UInput v-model="name" :placeholder="`${agentKind} — sandbox run`" />
|
|
307
|
+
</UFormField>
|
|
308
|
+
|
|
309
|
+
<div class="flex items-center justify-between">
|
|
310
|
+
<span class="text-xs text-slate-500">
|
|
311
|
+
{{ cellCount }} cell{{ cellCount === 1 ? '' : 's' }}
|
|
312
|
+
<span v-if="cellCount > store.maxCells" class="text-rose-400">
|
|
313
|
+
(max {{ store.maxCells }})
|
|
314
|
+
</span>
|
|
315
|
+
</span>
|
|
316
|
+
<UButton
|
|
317
|
+
color="primary"
|
|
318
|
+
icon="i-lucide-play"
|
|
319
|
+
size="sm"
|
|
320
|
+
:loading="store.launching"
|
|
321
|
+
:disabled="!canRun"
|
|
322
|
+
@click="createAndRun()"
|
|
323
|
+
>
|
|
324
|
+
Run
|
|
325
|
+
</UButton>
|
|
326
|
+
</div>
|
|
327
|
+
</div>
|
|
328
|
+
|
|
329
|
+
<!-- history + results -->
|
|
330
|
+
<div class="space-y-3">
|
|
331
|
+
<div v-if="store.detail" class="rounded-lg border border-slate-700 bg-slate-900/40 p-3">
|
|
332
|
+
<div class="mb-2 flex items-center justify-between">
|
|
333
|
+
<p class="text-sm font-medium text-slate-200">
|
|
334
|
+
{{ store.detail.experiment.name }}
|
|
335
|
+
</p>
|
|
336
|
+
<UBadge variant="soft" size="xs">{{ store.detail.experiment.status }}</UBadge>
|
|
337
|
+
</div>
|
|
338
|
+
<div class="overflow-auto">
|
|
339
|
+
<table class="w-full text-left text-xs">
|
|
340
|
+
<thead class="text-slate-500">
|
|
341
|
+
<tr>
|
|
342
|
+
<th class="py-1 pr-2 font-medium">Prompt</th>
|
|
343
|
+
<th class="py-1 pr-2 font-medium">Model</th>
|
|
344
|
+
<th class="py-1 pr-2 font-medium">Fixture</th>
|
|
345
|
+
<th class="py-1 pr-2 font-medium">Score</th>
|
|
346
|
+
<th class="py-1 font-medium">Objective</th>
|
|
347
|
+
</tr>
|
|
348
|
+
</thead>
|
|
349
|
+
<tbody>
|
|
350
|
+
<tr
|
|
351
|
+
v-for="run in store.detail.runs"
|
|
352
|
+
:key="run.id"
|
|
353
|
+
class="cursor-pointer border-t border-slate-800 hover:bg-slate-800/40"
|
|
354
|
+
@click="selectedRun = run"
|
|
355
|
+
>
|
|
356
|
+
<td class="py-1 pr-2 text-slate-300">{{ run.promptLabel }}</td>
|
|
357
|
+
<td class="py-1 pr-2 font-mono text-[11px] text-slate-400">
|
|
358
|
+
{{ run.model }}
|
|
359
|
+
</td>
|
|
360
|
+
<td class="py-1 pr-2 text-slate-400">{{ fixtureName(run.fixtureId) }}</td>
|
|
361
|
+
<td class="py-1 pr-2">
|
|
362
|
+
<span
|
|
363
|
+
v-if="gradeByRun.get(run.id)"
|
|
364
|
+
:class="scoreColor(gradeByRun.get(run.id)!.weightedTotal)"
|
|
365
|
+
class="font-semibold"
|
|
366
|
+
>
|
|
367
|
+
{{ gradeByRun.get(run.id)!.weightedTotal.toFixed(2) }}
|
|
368
|
+
</span>
|
|
369
|
+
<span v-else-if="run.status === 'failed'" class="text-rose-400"
|
|
370
|
+
>failed</span
|
|
371
|
+
>
|
|
372
|
+
<span v-else class="text-slate-600">—</span>
|
|
373
|
+
</td>
|
|
374
|
+
<td class="py-1">
|
|
375
|
+
<span
|
|
376
|
+
v-if="gradeByRun.get(run.id)?.objective"
|
|
377
|
+
:class="
|
|
378
|
+
gradeByRun.get(run.id)!.objective!.pass
|
|
379
|
+
? 'text-emerald-400'
|
|
380
|
+
: 'text-amber-400'
|
|
381
|
+
"
|
|
382
|
+
>
|
|
383
|
+
{{ gradeByRun.get(run.id)!.objective!.caught }}/{{
|
|
384
|
+
gradeByRun.get(run.id)!.objective!.total
|
|
385
|
+
}}
|
|
386
|
+
</span>
|
|
387
|
+
<span v-else class="text-slate-600">—</span>
|
|
388
|
+
</td>
|
|
389
|
+
</tr>
|
|
390
|
+
</tbody>
|
|
391
|
+
</table>
|
|
392
|
+
</div>
|
|
393
|
+
|
|
394
|
+
<!-- selected cell output -->
|
|
395
|
+
<div v-if="selectedRun" class="mt-3 border-t border-slate-800 pt-2">
|
|
396
|
+
<p class="mb-1 text-[11px] uppercase tracking-wide text-slate-500">
|
|
397
|
+
{{ selectedRun.promptLabel }} · {{ selectedRun.model }}
|
|
398
|
+
</p>
|
|
399
|
+
<p v-if="selectedRun.error" class="text-xs text-rose-400">
|
|
400
|
+
{{ selectedRun.error }}
|
|
401
|
+
</p>
|
|
402
|
+
<pre
|
|
403
|
+
v-if="selectedRun.outputText"
|
|
404
|
+
class="max-h-48 overflow-auto whitespace-pre-wrap rounded bg-slate-950/60 p-2 text-[11px] text-slate-300"
|
|
405
|
+
>{{ selectedRun.outputText }}</pre
|
|
406
|
+
>
|
|
407
|
+
<div v-if="gradeByRun.get(selectedRun.id)" class="mt-2 space-y-0.5">
|
|
408
|
+
<p
|
|
409
|
+
v-for="d in gradeByRun.get(selectedRun.id)!.scores"
|
|
410
|
+
:key="d.key"
|
|
411
|
+
class="text-[11px] text-slate-400"
|
|
412
|
+
>
|
|
413
|
+
<span :class="scoreColor(d.score)" class="font-semibold">{{ d.score }}</span>
|
|
414
|
+
<span class="ml-1 text-slate-300">{{ d.key }}</span>
|
|
415
|
+
<span v-if="d.rationale" class="ml-1 text-slate-500">— {{ d.rationale }}</span>
|
|
416
|
+
</p>
|
|
417
|
+
</div>
|
|
418
|
+
</div>
|
|
419
|
+
</div>
|
|
420
|
+
|
|
421
|
+
<p class="text-[11px] uppercase tracking-wide text-slate-500">Past experiments</p>
|
|
422
|
+
<div class="max-h-56 space-y-1 overflow-auto">
|
|
423
|
+
<button
|
|
424
|
+
v-for="x in store.experiments"
|
|
425
|
+
:key="x.id"
|
|
426
|
+
class="flex w-full items-center justify-between rounded-md border border-slate-800 bg-slate-900/40 px-2 py-1.5 text-left text-sm hover:bg-slate-800/50"
|
|
427
|
+
@click="store.openExperiment(x.id)"
|
|
428
|
+
>
|
|
429
|
+
<span class="truncate text-slate-300">{{ x.name }}</span>
|
|
430
|
+
<UBadge variant="soft" size="xs">{{ x.status }}</UBadge>
|
|
431
|
+
</button>
|
|
432
|
+
<p v-if="!store.experiments.length" class="text-xs text-slate-500">
|
|
433
|
+
No experiments yet.
|
|
434
|
+
</p>
|
|
435
|
+
</div>
|
|
436
|
+
</div>
|
|
437
|
+
</div>
|
|
438
|
+
|
|
439
|
+
<!-- ============================== PROMPTS ============================== -->
|
|
440
|
+
<div v-else-if="tab === 'prompts'" class="grid gap-4 lg:grid-cols-2">
|
|
441
|
+
<div class="max-h-[28rem] space-y-1.5 overflow-auto pr-1">
|
|
442
|
+
<div
|
|
443
|
+
v-for="p in store.prompts"
|
|
444
|
+
:key="p.id"
|
|
445
|
+
class="flex items-center justify-between rounded-md border border-slate-800 bg-slate-900/40 px-2.5 py-1.5 text-sm"
|
|
446
|
+
>
|
|
447
|
+
<div class="min-w-0">
|
|
448
|
+
<div class="flex items-center gap-2">
|
|
449
|
+
<span class="truncate text-slate-200">{{ p.name }}</span>
|
|
450
|
+
<UBadge
|
|
451
|
+
:color="p.origin === 'baseline' ? 'neutral' : 'primary'"
|
|
452
|
+
variant="soft"
|
|
453
|
+
size="xs"
|
|
454
|
+
>
|
|
455
|
+
{{ p.origin === 'baseline' ? 'baseline' : `v${p.version}` }}
|
|
456
|
+
</UBadge>
|
|
457
|
+
</div>
|
|
458
|
+
<span class="text-[11px] text-slate-500">{{ p.agentKind }}</span>
|
|
459
|
+
</div>
|
|
460
|
+
<div class="flex items-center gap-1">
|
|
461
|
+
<UButton
|
|
462
|
+
icon="i-lucide-pencil"
|
|
463
|
+
color="neutral"
|
|
464
|
+
variant="ghost"
|
|
465
|
+
size="xs"
|
|
466
|
+
:title="p.origin === 'baseline' ? 'Fork into a candidate' : 'Edit / version'"
|
|
467
|
+
@click="edit(p)"
|
|
468
|
+
/>
|
|
469
|
+
<UButton
|
|
470
|
+
v-if="p.origin === 'candidate'"
|
|
471
|
+
icon="i-lucide-archive"
|
|
472
|
+
color="error"
|
|
473
|
+
variant="ghost"
|
|
474
|
+
size="xs"
|
|
475
|
+
@click="archive(p)"
|
|
476
|
+
/>
|
|
477
|
+
</div>
|
|
478
|
+
</div>
|
|
479
|
+
</div>
|
|
480
|
+
|
|
481
|
+
<div
|
|
482
|
+
v-if="editing"
|
|
483
|
+
class="space-y-2 rounded-lg border border-slate-700 bg-slate-900/40 p-3"
|
|
484
|
+
>
|
|
485
|
+
<p class="text-[11px] uppercase tracking-wide text-slate-500">
|
|
486
|
+
{{ editing.origin === 'baseline' ? 'Fork' : 'New version of' }} · {{ editing.name }}
|
|
487
|
+
</p>
|
|
488
|
+
<UTextarea v-model="editText" :rows="16" class="w-full font-mono text-xs" autoresize />
|
|
489
|
+
<div class="flex justify-end gap-2">
|
|
490
|
+
<UButton color="neutral" variant="ghost" size="sm" @click="editing = null">
|
|
491
|
+
Cancel
|
|
492
|
+
</UButton>
|
|
493
|
+
<UButton
|
|
494
|
+
color="primary"
|
|
495
|
+
icon="i-lucide-save"
|
|
496
|
+
size="sm"
|
|
497
|
+
:loading="savingPrompt"
|
|
498
|
+
:disabled="!editText.trim()"
|
|
499
|
+
@click="saveVersion()"
|
|
500
|
+
>
|
|
501
|
+
Save new version
|
|
502
|
+
</UButton>
|
|
503
|
+
</div>
|
|
504
|
+
</div>
|
|
505
|
+
<p v-else class="self-start text-xs text-slate-500">
|
|
506
|
+
Pick a prompt to fork a shipped baseline or version a candidate. Each save appends an
|
|
507
|
+
immutable version you can put under test.
|
|
508
|
+
</p>
|
|
509
|
+
</div>
|
|
510
|
+
|
|
511
|
+
<!-- ============================== FIXTURES ============================== -->
|
|
512
|
+
<div v-else class="max-h-[28rem] space-y-1.5 overflow-auto pr-1">
|
|
513
|
+
<div
|
|
514
|
+
v-for="f in store.fixtures"
|
|
515
|
+
:key="f.id"
|
|
516
|
+
class="rounded-md border border-slate-800 bg-slate-900/40 px-2.5 py-2 text-sm"
|
|
517
|
+
>
|
|
518
|
+
<div class="flex items-center justify-between">
|
|
519
|
+
<span class="text-slate-200">{{ f.name }}</span>
|
|
520
|
+
<div class="flex items-center gap-1.5">
|
|
521
|
+
<UBadge variant="soft" size="xs">{{ f.kind }}</UBadge>
|
|
522
|
+
<UBadge
|
|
523
|
+
:color="f.origin === 'builtin' ? 'neutral' : 'primary'"
|
|
524
|
+
variant="soft"
|
|
525
|
+
size="xs"
|
|
526
|
+
>
|
|
527
|
+
{{ f.origin }}
|
|
528
|
+
</UBadge>
|
|
529
|
+
</div>
|
|
530
|
+
</div>
|
|
531
|
+
<p v-if="f.objective?.kind === 'findings'" class="mt-0.5 text-[11px] text-slate-500">
|
|
532
|
+
{{ f.objective.expectations.length }} graded expectation{{
|
|
533
|
+
f.objective.expectations.length === 1 ? '' : 's'
|
|
534
|
+
}}
|
|
535
|
+
</p>
|
|
536
|
+
</div>
|
|
537
|
+
<p v-if="!store.fixtures.length" class="text-xs text-slate-500">No fixtures.</p>
|
|
538
|
+
</div>
|
|
539
|
+
</div>
|
|
540
|
+
</template>
|
|
541
|
+
</UModal>
|
|
542
|
+
</template>
|
|
@@ -83,11 +83,17 @@ export function reviewsApi({ http, ws }: ApiContext) {
|
|
|
83
83
|
),
|
|
84
84
|
|
|
85
85
|
// Ask the Requirement Writer to recommend grounded answers for a batch of findings (by
|
|
86
|
-
// item id). Returns the review with `
|
|
87
|
-
|
|
86
|
+
// item id). Returns the review with `pending` placeholder recommendations; they fill in
|
|
87
|
+
// (`ready`) asynchronously via the `requirements` stream as the Writer produces each.
|
|
88
|
+
requestRecommendations: (
|
|
89
|
+
workspaceId: string,
|
|
90
|
+
blockId: string,
|
|
91
|
+
itemIds: string[],
|
|
92
|
+
note?: string,
|
|
93
|
+
) =>
|
|
88
94
|
http<RequirementReview | null>(
|
|
89
95
|
`${ws(workspaceId)}/blocks/${encodeURIComponent(blockId)}/requirement-review/recommend`,
|
|
90
|
-
{ method: 'POST', body: { itemIds } },
|
|
96
|
+
{ method: 'POST', body: { itemIds, ...(note ? { note } : {}) } },
|
|
91
97
|
),
|
|
92
98
|
|
|
93
99
|
// Accept a recommendation (becomes the finding's answer), reject it, or re-request it
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import type {
|
|
2
|
+
CloneSandboxPromptInput,
|
|
3
|
+
CreateSandboxExperimentInput,
|
|
4
|
+
SandboxExperiment,
|
|
5
|
+
SandboxExperimentDetail,
|
|
6
|
+
SandboxFixture,
|
|
7
|
+
SandboxOverview,
|
|
8
|
+
SandboxPromptVersion,
|
|
9
|
+
SaveSandboxVersionInput,
|
|
10
|
+
} from '~/types/sandbox'
|
|
11
|
+
import type { ApiContext } from './context'
|
|
12
|
+
|
|
13
|
+
/**
|
|
14
|
+
* The Sandbox API (the parallel prompt/model testing surface): manage versioned prompt
|
|
15
|
+
* candidates + the fixture library, define experiments (prompt × model × fixture), and
|
|
16
|
+
* launch one to run + grade every cell. Opt-in: every endpoint 503s when the deployment
|
|
17
|
+
* hasn't wired the Sandbox (its dedicated DB / schema).
|
|
18
|
+
*/
|
|
19
|
+
export function sandboxApi({ http, ws }: ApiContext) {
|
|
20
|
+
const base = (workspaceId: string) => `${ws(workspaceId)}/sandbox`
|
|
21
|
+
return {
|
|
22
|
+
getSandboxOverview: (workspaceId: string) =>
|
|
23
|
+
http<SandboxOverview>(`${base(workspaceId)}/overview`),
|
|
24
|
+
|
|
25
|
+
// ---- prompt versions -------------------------------------------------
|
|
26
|
+
cloneSandboxPrompt: (workspaceId: string, body: CloneSandboxPromptInput) =>
|
|
27
|
+
http<SandboxPromptVersion>(`${base(workspaceId)}/prompts/clone`, { method: 'POST', body }),
|
|
28
|
+
saveSandboxVersion: (workspaceId: string, body: SaveSandboxVersionInput) =>
|
|
29
|
+
http<SandboxPromptVersion>(`${base(workspaceId)}/prompts`, { method: 'POST', body }),
|
|
30
|
+
setSandboxPromptLabels: (workspaceId: string, promptId: string, labels: string[]) =>
|
|
31
|
+
http<SandboxPromptVersion>(
|
|
32
|
+
`${base(workspaceId)}/prompts/${encodeURIComponent(promptId)}/labels`,
|
|
33
|
+
{ method: 'PATCH', body: { labels } },
|
|
34
|
+
),
|
|
35
|
+
archiveSandboxPrompt: (workspaceId: string, promptId: string) =>
|
|
36
|
+
http(`${base(workspaceId)}/prompts/${encodeURIComponent(promptId)}`, { method: 'DELETE' }),
|
|
37
|
+
|
|
38
|
+
// ---- fixtures --------------------------------------------------------
|
|
39
|
+
createSandboxFixture: (workspaceId: string, body: Partial<SandboxFixture>) =>
|
|
40
|
+
http<SandboxFixture>(`${base(workspaceId)}/fixtures`, { method: 'POST', body }),
|
|
41
|
+
deleteSandboxFixture: (workspaceId: string, fixtureId: string) =>
|
|
42
|
+
http(`${base(workspaceId)}/fixtures/${encodeURIComponent(fixtureId)}`, { method: 'DELETE' }),
|
|
43
|
+
|
|
44
|
+
// ---- experiments -----------------------------------------------------
|
|
45
|
+
createSandboxExperiment: (workspaceId: string, body: CreateSandboxExperimentInput) =>
|
|
46
|
+
http<SandboxExperiment>(`${base(workspaceId)}/experiments`, { method: 'POST', body }),
|
|
47
|
+
getSandboxExperiment: (workspaceId: string, experimentId: string) =>
|
|
48
|
+
http<SandboxExperimentDetail>(
|
|
49
|
+
`${base(workspaceId)}/experiments/${encodeURIComponent(experimentId)}`,
|
|
50
|
+
),
|
|
51
|
+
launchSandboxExperiment: (workspaceId: string, experimentId: string) =>
|
|
52
|
+
http<SandboxExperimentDetail>(
|
|
53
|
+
`${base(workspaceId)}/experiments/${encodeURIComponent(experimentId)}/launch`,
|
|
54
|
+
{ method: 'POST' },
|
|
55
|
+
),
|
|
56
|
+
}
|
|
57
|
+
}
|
|
@@ -15,6 +15,7 @@ import { presetsApi } from './api/presets'
|
|
|
15
15
|
import { providerConnectionsApi } from './api/providerConnections'
|
|
16
16
|
import { recurringApi } from './api/recurring'
|
|
17
17
|
import { releaseHealthApi } from './api/releaseHealth'
|
|
18
|
+
import { sandboxApi } from './api/sandbox'
|
|
18
19
|
import { reviewsApi } from './api/reviews'
|
|
19
20
|
import { slackApi } from './api/slack'
|
|
20
21
|
import { specApi } from './api/spec'
|
|
@@ -89,6 +90,7 @@ export function useApi() {
|
|
|
89
90
|
...providerConnectionsApi(ctx),
|
|
90
91
|
...releaseHealthApi(ctx),
|
|
91
92
|
...recurringApi(ctx),
|
|
93
|
+
...sandboxApi(ctx),
|
|
92
94
|
...githubApi(ctx),
|
|
93
95
|
...slackApi(ctx),
|
|
94
96
|
...bootstrapApi(ctx),
|
package/app/pages/index.vue
CHANGED
|
@@ -33,6 +33,7 @@ import ProviderConnectionPanel from '~/components/settings/ProviderConnectionPan
|
|
|
33
33
|
import ProviderConfigBanner from '~/components/layout/ProviderConfigBanner.vue'
|
|
34
34
|
import ModelConfigurationPanel from '~/components/settings/ModelConfigurationPanel.vue'
|
|
35
35
|
import LocalModelEndpointsPanel from '~/components/settings/LocalModelEndpointsPanel.vue'
|
|
36
|
+
import SandboxPanel from '~/components/sandbox/SandboxPanel.vue'
|
|
36
37
|
import UserSecretsSection from '~/components/settings/UserSecretsSection.vue'
|
|
37
38
|
import OpenRouterCatalogPanel from '~/components/settings/OpenRouterCatalogPanel.vue'
|
|
38
39
|
import VendorCredentialsModal from '~/components/providers/VendorCredentialsModal.vue'
|
|
@@ -189,6 +190,7 @@ watch(
|
|
|
189
190
|
<ProviderConnectionPanel />
|
|
190
191
|
<ModelConfigurationPanel />
|
|
191
192
|
<LocalModelEndpointsPanel />
|
|
193
|
+
<SandboxPanel />
|
|
192
194
|
<UserSecretsSection />
|
|
193
195
|
<OpenRouterCatalogPanel />
|
|
194
196
|
<VendorCredentialsModal />
|
|
@@ -44,14 +44,21 @@ export const useRequirementsStore = defineStore('requirements', () => {
|
|
|
44
44
|
function reviewFor(blockId: string): RequirementReview | null {
|
|
45
45
|
return reviews.value[blockId] ?? null
|
|
46
46
|
}
|
|
47
|
+
/** Whether the Requirement Writer is still producing recommendations for a block (a `pending`
|
|
48
|
+
* placeholder exists). Server-derived, so the "Recommending…" state survives the window closing
|
|
49
|
+
* and a page reload — the client-local `recommending` set only covers the request round-trip. */
|
|
50
|
+
function hasPendingRecommendations(blockId: string): boolean {
|
|
51
|
+
return (reviews.value[blockId]?.recommendations ?? []).some((r) => r.status === 'pending')
|
|
52
|
+
}
|
|
47
53
|
/**
|
|
48
54
|
* The async background stage a block's review is in, or null. While the driver folds the
|
|
49
|
-
* answers (`incorporating`) then re-reviews the document (`reviewing`),
|
|
50
|
-
*
|
|
51
|
-
*
|
|
55
|
+
* answers (`incorporating`) then re-reviews the document (`reviewing`), or the Requirement
|
|
56
|
+
* Writer is producing recommendations (`recommending`), NO human action is needed — so the
|
|
57
|
+
* board suppresses the "Approval needed" gate and shows this working state instead, with copy
|
|
58
|
+
* that names which stage is running.
|
|
52
59
|
*/
|
|
53
60
|
function backgroundStage(blockId: string): 'incorporating' | 'reviewing' | 'recommending' | null {
|
|
54
|
-
if (recommending.value.has(blockId)) return 'recommending'
|
|
61
|
+
if (recommending.value.has(blockId) || hasPendingRecommendations(blockId)) return 'recommending'
|
|
55
62
|
const status = reviews.value[blockId]?.status
|
|
56
63
|
return status === 'incorporating' || status === 'reviewing' ? status : null
|
|
57
64
|
}
|
|
@@ -176,19 +183,26 @@ export const useRequirementsStore = defineStore('requirements', () => {
|
|
|
176
183
|
}
|
|
177
184
|
|
|
178
185
|
function isRecommending(blockId: string): boolean {
|
|
179
|
-
return recommending.value.has(blockId)
|
|
186
|
+
return recommending.value.has(blockId) || hasPendingRecommendations(blockId)
|
|
180
187
|
}
|
|
181
188
|
|
|
182
189
|
/**
|
|
183
190
|
* Ask the Requirement Writer to recommend answers for a batch of findings (by item id).
|
|
184
|
-
*
|
|
185
|
-
*
|
|
186
|
-
*
|
|
191
|
+
* ASYNCHRONOUS: returns at once with `pending` placeholder recommendations (the Writer runs
|
|
192
|
+
* per finding in the durable driver), which fill in (`ready`) via live `requirements` stream
|
|
193
|
+
* events; a notification calls the user back when the batch is ready. The board shows the
|
|
194
|
+
* `recommending` background stage while any placeholder is pending. Optional `note` steers the
|
|
195
|
+
* whole batch.
|
|
187
196
|
*/
|
|
188
|
-
async function requestRecommendations(blockId: string, itemIds: string[]) {
|
|
197
|
+
async function requestRecommendations(blockId: string, itemIds: string[], note?: string) {
|
|
189
198
|
withFlag(recommending, blockId, true)
|
|
190
199
|
try {
|
|
191
|
-
const updated = await api.requestRecommendations(
|
|
200
|
+
const updated = await api.requestRecommendations(
|
|
201
|
+
workspace.requireId(),
|
|
202
|
+
blockId,
|
|
203
|
+
itemIds,
|
|
204
|
+
note,
|
|
205
|
+
)
|
|
192
206
|
if (updated) store(updated)
|
|
193
207
|
return updated
|
|
194
208
|
} finally {
|
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
import { defineStore } from 'pinia'
|
|
2
|
+
import { computed, ref } from 'vue'
|
|
3
|
+
import type { ModelOption } from '~/types/domain'
|
|
4
|
+
import type {
|
|
5
|
+
CreateSandboxExperimentInput,
|
|
6
|
+
SandboxAgentKindMeta,
|
|
7
|
+
SandboxExperiment,
|
|
8
|
+
SandboxExperimentDetail,
|
|
9
|
+
SandboxFixture,
|
|
10
|
+
SandboxOverview,
|
|
11
|
+
SandboxPromptVersion,
|
|
12
|
+
} from '~/types/sandbox'
|
|
13
|
+
import { useWorkspaceStore } from '~/stores/workspace'
|
|
14
|
+
|
|
15
|
+
/**
|
|
16
|
+
* The Sandbox (parallel prompt/model testing surface). Loaded on demand when the panel
|
|
17
|
+
* opens (it's an opt-in, secondary surface, not part of the board snapshot): the testable
|
|
18
|
+
* agent-kind catalog, the shipped baselines + stored candidate prompt versions, the
|
|
19
|
+
* fixture library, and experiment definitions. Running an experiment grades every cell
|
|
20
|
+
* with a judge model; `launch` returns the full result grid.
|
|
21
|
+
*/
|
|
22
|
+
export const useSandboxStore = defineStore('sandbox', () => {
|
|
23
|
+
const api = useApi()
|
|
24
|
+
|
|
25
|
+
const available = ref(true)
|
|
26
|
+
const loading = ref(false)
|
|
27
|
+
const error = ref<string | null>(null)
|
|
28
|
+
|
|
29
|
+
const agentKinds = ref<SandboxAgentKindMeta[]>([])
|
|
30
|
+
const prompts = ref<SandboxPromptVersion[]>([])
|
|
31
|
+
const fixtures = ref<SandboxFixture[]>([])
|
|
32
|
+
const experiments = ref<SandboxExperiment[]>([])
|
|
33
|
+
const models = ref<ModelOption[]>([])
|
|
34
|
+
/** The matrix cell cap (from the backend overview, so the builder gates on the same limit). */
|
|
35
|
+
const maxCells = ref(100)
|
|
36
|
+
|
|
37
|
+
/** The currently-opened experiment's full detail (result grid), if any. */
|
|
38
|
+
const detail = ref<SandboxExperimentDetail | null>(null)
|
|
39
|
+
const launching = ref(false)
|
|
40
|
+
|
|
41
|
+
function hydrate(overview: SandboxOverview) {
|
|
42
|
+
agentKinds.value = overview.agentKinds
|
|
43
|
+
prompts.value = overview.prompts
|
|
44
|
+
fixtures.value = overview.fixtures
|
|
45
|
+
experiments.value = [...overview.experiments].sort((a, b) => b.createdAt - a.createdAt)
|
|
46
|
+
maxCells.value = overview.maxCells
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
/** Patch one experiment into the list in place (newest-first), without a full reload. */
|
|
50
|
+
function upsertExperiment(experiment: SandboxExperiment) {
|
|
51
|
+
const next = experiments.value.filter((e) => e.id !== experiment.id)
|
|
52
|
+
next.push(experiment)
|
|
53
|
+
experiments.value = next.sort((a, b) => b.createdAt - a.createdAt)
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
/** Load the overview + the workspace model catalog. A 503 (feature off) sets `available` to false; any other failure is reported via `error`. */
|
|
57
|
+
async function load() {
|
|
58
|
+
const ws = useWorkspaceStore()
|
|
59
|
+
if (!ws.workspaceId) return
|
|
60
|
+
loading.value = true
|
|
61
|
+
error.value = null
|
|
62
|
+
try {
|
|
63
|
+
const [overview, modelList] = await Promise.all([
|
|
64
|
+
api.getSandboxOverview(ws.requireId()),
|
|
65
|
+
api.getWorkspaceModels(ws.requireId()),
|
|
66
|
+
])
|
|
67
|
+
hydrate(overview)
|
|
68
|
+
models.value = modelList
|
|
69
|
+
available.value = true
|
|
70
|
+
} catch (e) {
|
|
71
|
+
const status =
|
|
72
|
+
(e as { statusCode?: number; response?: { status?: number } })?.statusCode ??
|
|
73
|
+
(e as { response?: { status?: number } })?.response?.status
|
|
74
|
+
if (status === 503) {
|
|
75
|
+
available.value = false
|
|
76
|
+
} else {
|
|
77
|
+
error.value = e instanceof Error ? e.message : String(e)
|
|
78
|
+
}
|
|
79
|
+
} finally {
|
|
80
|
+
loading.value = false
|
|
81
|
+
}
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
/** Selectable models for the experiment picker (the backend computed `available`). */
|
|
85
|
+
const selectableModels = computed(() => models.value.filter((m) => m.available !== false))
|
|
86
|
+
|
|
87
|
+
/** Prompt versions for one agent kind, in the order the overview delivered them (expected: baselines first, then candidates — this function only filters, it does not sort). */
|
|
88
|
+
function promptsForKind(agentKind: string): SandboxPromptVersion[] {
|
|
89
|
+
return prompts.value.filter((p) => p.agentKind === agentKind)
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
/** Fixtures authored for one agent kind, filtered by the catalog's `fixtureKinds`; when the kind is not in the catalog, the whole fixture library is returned. */
|
|
93
|
+
function fixturesForKind(agentKind: string): SandboxFixture[] {
|
|
94
|
+
const meta = agentKinds.value.find((k) => k.agentKind === agentKind)
|
|
95
|
+
if (!meta) return fixtures.value
|
|
96
|
+
// The backend catalog is the source of truth for the fixture↔kind mapping.
|
|
97
|
+
const wanted = meta.fixtureKinds
|
|
98
|
+
return fixtures.value.filter((f) => wanted.includes(f.kind))
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
async function clonePrompt(agentKind: string, basePromptId: string | null, name?: string) {
|
|
102
|
+
const ws = useWorkspaceStore()
|
|
103
|
+
const created = await api.cloneSandboxPrompt(ws.requireId(), { agentKind, basePromptId, name })
|
|
104
|
+
await load()
|
|
105
|
+
return created
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
async function saveVersion(parentId: string, systemText: string) {
|
|
109
|
+
const ws = useWorkspaceStore()
|
|
110
|
+
const saved = await api.saveSandboxVersion(ws.requireId(), { parentId, systemText })
|
|
111
|
+
await load()
|
|
112
|
+
return saved
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
async function archivePrompt(promptId: string) {
|
|
116
|
+
const ws = useWorkspaceStore()
|
|
117
|
+
await api.archiveSandboxPrompt(ws.requireId(), promptId)
|
|
118
|
+
await load()
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
async function createExperiment(input: CreateSandboxExperimentInput) {
|
|
122
|
+
const ws = useWorkspaceStore()
|
|
123
|
+
const created = await api.createSandboxExperiment(ws.requireId(), input)
|
|
124
|
+
await load()
|
|
125
|
+
return created
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
async function openExperiment(experimentId: string) {
|
|
129
|
+
const ws = useWorkspaceStore()
|
|
130
|
+
detail.value = await api.getSandboxExperiment(ws.requireId(), experimentId)
|
|
131
|
+
return detail.value
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
async function launch(experimentId: string) {
|
|
135
|
+
const ws = useWorkspaceStore()
|
|
136
|
+
launching.value = true
|
|
137
|
+
try {
|
|
138
|
+
// `launch` returns the full graded grid AND the updated experiment, so patch both in
|
|
139
|
+
// place rather than calling `load()`: a transient failure in that follow-up fetch
|
|
140
|
+
// would otherwise set `error` and hide the freshly-returned result grid behind the
|
|
141
|
+
// error panel (and re-fetch the whole overview + model catalog for nothing).
|
|
142
|
+
const result = await api.launchSandboxExperiment(ws.requireId(), experimentId)
|
|
143
|
+
detail.value = result
|
|
144
|
+
upsertExperiment(result.experiment)
|
|
145
|
+
return result
|
|
146
|
+
} finally {
|
|
147
|
+
launching.value = false
|
|
148
|
+
}
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
return {
|
|
152
|
+
available,
|
|
153
|
+
loading,
|
|
154
|
+
error,
|
|
155
|
+
agentKinds,
|
|
156
|
+
prompts,
|
|
157
|
+
fixtures,
|
|
158
|
+
experiments,
|
|
159
|
+
models,
|
|
160
|
+
maxCells,
|
|
161
|
+
selectableModels,
|
|
162
|
+
detail,
|
|
163
|
+
launching,
|
|
164
|
+
load,
|
|
165
|
+
promptsForKind,
|
|
166
|
+
fixturesForKind,
|
|
167
|
+
clonePrompt,
|
|
168
|
+
saveVersion,
|
|
169
|
+
archivePrompt,
|
|
170
|
+
createExperiment,
|
|
171
|
+
openExperiment,
|
|
172
|
+
launch,
|
|
173
|
+
}
|
|
174
|
+
})
|
package/app/stores/ui.ts
CHANGED
|
@@ -104,6 +104,8 @@ export const useUiStore = defineStore('ui', () => {
|
|
|
104
104
|
const vendorCredentialsOpen = ref(false)
|
|
105
105
|
// Per-user settings panel: the signed-in user's own-machine local model runners.
|
|
106
106
|
const localModelsOpen = ref(false)
|
|
107
|
+
// The Sandbox (parallel prompt/model testing) surface — an opt-in, on-demand window.
|
|
108
|
+
const sandboxOpen = ref(false)
|
|
107
109
|
const userSecretsOpen = ref(false)
|
|
108
110
|
// Per-workspace settings panel: the OpenRouter dynamic catalog (browse/enable gateway models).
|
|
109
111
|
const openRouterOpen = ref(false)
|
|
@@ -364,6 +366,12 @@ export const useUiStore = defineStore('ui', () => {
|
|
|
364
366
|
function closeLocalModels() {
|
|
365
367
|
localModelsOpen.value = false
|
|
366
368
|
}
|
|
369
|
+
function openSandbox() {
|
|
370
|
+
sandboxOpen.value = true
|
|
371
|
+
}
|
|
372
|
+
function closeSandbox() {
|
|
373
|
+
sandboxOpen.value = false
|
|
374
|
+
}
|
|
367
375
|
function openUserSecrets() {
|
|
368
376
|
userSecretsOpen.value = true
|
|
369
377
|
}
|
|
@@ -464,6 +472,7 @@ export const useUiStore = defineStore('ui', () => {
|
|
|
464
472
|
modelConfigOpen,
|
|
465
473
|
vendorCredentialsOpen,
|
|
466
474
|
localModelsOpen,
|
|
475
|
+
sandboxOpen,
|
|
467
476
|
userSecretsOpen,
|
|
468
477
|
openRouterOpen,
|
|
469
478
|
aiProviderSetupOpen,
|
|
@@ -528,6 +537,8 @@ export const useUiStore = defineStore('ui', () => {
|
|
|
528
537
|
closeVendorCredentials,
|
|
529
538
|
openLocalModels,
|
|
530
539
|
closeLocalModels,
|
|
540
|
+
openSandbox,
|
|
541
|
+
closeSandbox,
|
|
531
542
|
openUserSecrets,
|
|
532
543
|
closeUserSecrets,
|
|
533
544
|
openOpenRouter,
|
|
@@ -50,8 +50,12 @@ export type RequirementReviewStatus =
|
|
|
50
50
|
/** How a human resolves a review that hit its iteration cap. */
|
|
51
51
|
export type ResolveRequirementsExceededChoice = 'extra-round' | 'proceed' | 'stop-reset'
|
|
52
52
|
|
|
53
|
-
/**
|
|
54
|
-
|
|
53
|
+
/**
|
|
54
|
+
* Lifecycle of a Requirement-Writer recommendation. `pending` is a placeholder created the
|
|
55
|
+
* moment the human requests it — the Writer is still producing the suggestion in the background
|
|
56
|
+
* (the async story); it fills in to `ready` via the `requirements` stream.
|
|
57
|
+
*/
|
|
58
|
+
export type RecommendationStatus = 'pending' | 'ready' | 'accepted' | 'rejected'
|
|
55
59
|
|
|
56
60
|
/**
|
|
57
61
|
* A Requirement-Writer suggestion for one finding. First-class on the review (survives the
|
|
@@ -60,7 +64,7 @@ export type RecommendationStatus = 'ready' | 'accepted' | 'rejected'
|
|
|
60
64
|
*/
|
|
61
65
|
export interface RequirementRecommendation {
|
|
62
66
|
id: string
|
|
63
|
-
sourceFinding: { title: string; detail: string }
|
|
67
|
+
sourceFinding: { title: string; detail: string; itemId?: string }
|
|
64
68
|
recommendedText: string
|
|
65
69
|
status: RecommendationStatus
|
|
66
70
|
note: string | null
|
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
// Sandbox (the parallel prompt/model testing surface) wire shapes, hand-mirrored from
|
|
2
|
+
// `@cat-factory/contracts` (sandbox.ts) so a backend payload drops straight into the
|
|
3
|
+
// store. Clone a shipped agent prompt into a versioned candidate, run an experiment
|
|
4
|
+
// matrix (prompt versions × models × fixtures) for one agent kind, and grade every cell
|
|
5
|
+
// with a judge model plus (where a fixture supports it) an objective findings score.
|
|
6
|
+
|
|
7
|
+
export type SandboxPromptOrigin = 'baseline' | 'candidate'
|
|
8
|
+
|
|
9
|
+
export interface SandboxPromptVersion {
|
|
10
|
+
id: string
|
|
11
|
+
lineageId: string
|
|
12
|
+
agentKind: string
|
|
13
|
+
name: string
|
|
14
|
+
origin: SandboxPromptOrigin
|
|
15
|
+
systemText: string
|
|
16
|
+
basePromptId: string | null
|
|
17
|
+
version: number
|
|
18
|
+
parentId: string | null
|
|
19
|
+
labels: string[]
|
|
20
|
+
createdAt: number
|
|
21
|
+
createdBy: string | null
|
|
22
|
+
archivedAt: number | null
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
export type SandboxFixtureKind =
|
|
26
|
+
| 'requirements'
|
|
27
|
+
| 'clarity'
|
|
28
|
+
| 'architecture'
|
|
29
|
+
| 'code-review'
|
|
30
|
+
| 'repo-feature'
|
|
31
|
+
| 'repo-bug'
|
|
32
|
+
|
|
33
|
+
export interface SandboxExpectation {
|
|
34
|
+
id: string
|
|
35
|
+
summary: string
|
|
36
|
+
detail: string
|
|
37
|
+
trickiness: number
|
|
38
|
+
impact: number
|
|
39
|
+
matchHints: string[]
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
export type SandboxFixtureObjective =
|
|
43
|
+
| { kind: 'tests'; testCmd: string }
|
|
44
|
+
| { kind: 'findings'; expectations: SandboxExpectation[] }
|
|
45
|
+
|
|
46
|
+
export interface SandboxFixture {
|
|
47
|
+
id: string
|
|
48
|
+
kind: SandboxFixtureKind
|
|
49
|
+
name: string
|
|
50
|
+
payload: Record<string, unknown> | null
|
|
51
|
+
repoRef: { owner: string; name: string; seedRef: string } | null
|
|
52
|
+
objective: SandboxFixtureObjective | null
|
|
53
|
+
origin: 'builtin' | 'custom'
|
|
54
|
+
createdAt: number
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
export type SandboxExperimentStatus = 'draft' | 'running' | 'done' | 'failed'
|
|
58
|
+
|
|
59
|
+
export interface SandboxMatrix {
|
|
60
|
+
promptVersionIds: string[]
|
|
61
|
+
models: string[]
|
|
62
|
+
fixtureIds: string[]
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
export interface SandboxExperiment {
|
|
66
|
+
id: string
|
|
67
|
+
name: string
|
|
68
|
+
agentKind: string
|
|
69
|
+
judgeModel: string
|
|
70
|
+
repeats: number
|
|
71
|
+
status: SandboxExperimentStatus
|
|
72
|
+
matrix: SandboxMatrix
|
|
73
|
+
budgetTokens: number | null
|
|
74
|
+
createdAt: number
|
|
75
|
+
createdBy: string | null
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
export type SandboxRunStatus = 'queued' | 'running' | 'done' | 'failed'
|
|
79
|
+
|
|
80
|
+
export interface SandboxTokenUsage {
|
|
81
|
+
inputTokens: number
|
|
82
|
+
outputTokens: number
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
export interface SandboxRun {
|
|
86
|
+
id: string
|
|
87
|
+
experimentId: string
|
|
88
|
+
promptVersionId: string
|
|
89
|
+
model: string
|
|
90
|
+
fixtureId: string
|
|
91
|
+
repeatIndex: number
|
|
92
|
+
status: SandboxRunStatus
|
|
93
|
+
outputText: string | null
|
|
94
|
+
usage: SandboxTokenUsage | null
|
|
95
|
+
latencyMs: number | null
|
|
96
|
+
branch: string | null
|
|
97
|
+
prUrl: string | null
|
|
98
|
+
diff: string | null
|
|
99
|
+
error: string | null
|
|
100
|
+
seedSha: string | null
|
|
101
|
+
promptLabel: string
|
|
102
|
+
startedAt: number | null
|
|
103
|
+
finishedAt: number | null
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
export interface SandboxGradeDimension {
|
|
107
|
+
key: string
|
|
108
|
+
score: number
|
|
109
|
+
rationale: string
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
export interface SandboxObjectiveResult {
|
|
113
|
+
kind: 'tests' | 'findings'
|
|
114
|
+
pass: boolean
|
|
115
|
+
detail: string
|
|
116
|
+
impactRecall: number | null
|
|
117
|
+
wowBonus: number | null
|
|
118
|
+
caught: number | null
|
|
119
|
+
total: number | null
|
|
120
|
+
missedHighImpact: string[] | null
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
export interface SandboxGrade {
|
|
124
|
+
id: string
|
|
125
|
+
runId: string
|
|
126
|
+
judgeModel: string
|
|
127
|
+
scores: SandboxGradeDimension[]
|
|
128
|
+
weightedTotal: number
|
|
129
|
+
objective: SandboxObjectiveResult | null
|
|
130
|
+
createdAt: number
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
/** The Sandbox catalog entry for a testable agent kind (from the overview). */
|
|
134
|
+
export interface SandboxAgentKindMeta {
|
|
135
|
+
agentKind: string
|
|
136
|
+
label: string
|
|
137
|
+
bucket: 'inline' | 'container'
|
|
138
|
+
rubric: 'requirement-review' | 'code-review' | 'implementation'
|
|
139
|
+
/** Fixture kinds this agent is exercised against (the UI filters the library by these). */
|
|
140
|
+
fixtureKinds: SandboxFixtureKind[]
|
|
141
|
+
basePromptId: string | null
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
/** The composite the management surface loads on open (`GET /sandbox/overview`). */
|
|
145
|
+
export interface SandboxOverview {
|
|
146
|
+
agentKinds: SandboxAgentKindMeta[]
|
|
147
|
+
prompts: SandboxPromptVersion[]
|
|
148
|
+
fixtures: SandboxFixture[]
|
|
149
|
+
experiments: SandboxExperiment[]
|
|
150
|
+
/** The matrix cell cap (the backend cost guard), so the builder gates on the same limit. */
|
|
151
|
+
maxCells: number
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
/** An experiment with its result grid (`GET /sandbox/experiments/:id`, also from launch). */
|
|
155
|
+
export interface SandboxExperimentDetail {
|
|
156
|
+
experiment: SandboxExperiment
|
|
157
|
+
runs: SandboxRun[]
|
|
158
|
+
grades: SandboxGrade[]
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
// ---- request bodies --------------------------------------------------------
|
|
162
|
+
|
|
163
|
+
export interface CloneSandboxPromptInput {
|
|
164
|
+
agentKind: string
|
|
165
|
+
basePromptId: string | null
|
|
166
|
+
name?: string
|
|
167
|
+
labels?: string[]
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
export interface SaveSandboxVersionInput {
|
|
171
|
+
parentId: string
|
|
172
|
+
systemText: string
|
|
173
|
+
labels?: string[]
|
|
174
|
+
}
|
|
175
|
+
|
|
176
|
+
export interface CreateSandboxExperimentInput {
|
|
177
|
+
name: string
|
|
178
|
+
agentKind: string
|
|
179
|
+
matrix: SandboxMatrix
|
|
180
|
+
judgeModel?: string
|
|
181
|
+
repeats?: number
|
|
182
|
+
budgetTokens?: number | null
|
|
183
|
+
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@cat-factory/app",
|
|
3
|
-
"version": "0.27.0",
|
|
3
|
+
"version": "0.28.1",
|
|
4
4
|
"description": "Reusable Nuxt layer for the Agent Architecture Board SPA (components, stores, composables, pages). Consume it from a thin deployment app via `extends: ['@cat-factory/app']` and point it at your backend with NUXT_PUBLIC_API_BASE. See deploy/frontend for an example.",
|
|
5
5
|
"repository": {
|
|
6
6
|
"type": "git",
|