@swarmclawai/swarmclaw 1.5.71 → 1.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +10 -0
- package/package.json +2 -2
- package/src/app/home/page.tsx +11 -3
- package/src/app/quality/page.tsx +7 -0
- package/src/components/home/home-launchpad.tsx +32 -0
- package/src/components/layout/sidebar-rail.tsx +6 -0
- package/src/components/quality/quality-workspace.tsx +632 -0
- package/src/components/runs/run-list.tsx +53 -2
- package/src/components/shared/command-palette.tsx +1 -0
- package/src/lib/app/navigation.ts +1 -0
- package/src/lib/app/view-constants.test.ts +8 -1
- package/src/lib/app/view-constants.ts +9 -1
- package/src/lib/quality/quality-summary.test.ts +122 -0
- package/src/lib/quality/quality-summary.ts +150 -0
- package/src/lib/server/missions/mission-templates.test.ts +22 -0
- package/src/lib/server/missions/mission-templates.ts +116 -0
- package/src/types/session.ts +1 -1
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
'use client'
|
|
2
2
|
|
|
3
|
-
import { useEffect, useState, useCallback } from 'react'
|
|
3
|
+
import { useEffect, useMemo, useState, useCallback } from 'react'
|
|
4
4
|
import { api } from '@/lib/app/api-client'
|
|
5
5
|
import { useNow } from '@/hooks/use-now'
|
|
6
6
|
import { useWs } from '@/hooks/use-ws'
|
|
@@ -37,6 +37,8 @@ export function RunList() {
|
|
|
37
37
|
const [loading, setLoading] = useState(true)
|
|
38
38
|
const [autoRefresh, setAutoRefresh] = useState(false)
|
|
39
39
|
const [statusFilter, setStatusFilter] = useState<SessionRunStatus | null>(null)
|
|
40
|
+
const [sourceFilter, setSourceFilter] = useState<string>('all')
|
|
41
|
+
const [query, setQuery] = useState('')
|
|
40
42
|
const [selected, setSelected] = useState<SessionRunRecord | null>(null)
|
|
41
43
|
const [selectedEvents, setSelectedEvents] = useState<RunEventRecord[]>([])
|
|
42
44
|
const [eventsLoading, setEventsLoading] = useState(false)
|
|
@@ -84,7 +86,30 @@ export function RunList() {
|
|
|
84
86
|
setEventsLoading(true)
|
|
85
87
|
}, [])
|
|
86
88
|
|
|
87
|
-
const
|
|
89
|
+
const sources = useMemo(() => {
|
|
90
|
+
return Array.from(new Set(runs.map((run) => run.source).filter(Boolean))).sort((a, b) => a.localeCompare(b))
|
|
91
|
+
}, [runs])
|
|
92
|
+
|
|
93
|
+
const filtered = useMemo(() => {
|
|
94
|
+
const normalizedQuery = query.trim().toLowerCase()
|
|
95
|
+
return runs.filter((run) => {
|
|
96
|
+
if (statusFilter && run.status !== statusFilter) return false
|
|
97
|
+
if (sourceFilter !== 'all' && run.source !== sourceFilter) return false
|
|
98
|
+
if (!normalizedQuery) return true
|
|
99
|
+
const searchable = [
|
|
100
|
+
run.id,
|
|
101
|
+
run.sessionId,
|
|
102
|
+
run.source,
|
|
103
|
+
run.messagePreview,
|
|
104
|
+
run.error,
|
|
105
|
+
run.resultPreview,
|
|
106
|
+
run.kind,
|
|
107
|
+
run.ownerType,
|
|
108
|
+
run.ownerId,
|
|
109
|
+
]
|
|
110
|
+
return searchable.some((value) => String(value || '').toLowerCase().includes(normalizedQuery))
|
|
111
|
+
})
|
|
112
|
+
}, [query, runs, sourceFilter, statusFilter])
|
|
88
113
|
const selectedResultGrounding = selectedEvents
|
|
89
114
|
.slice()
|
|
90
115
|
.reverse()
|
|
@@ -130,6 +155,32 @@ export function RunList() {
|
|
|
130
155
|
{autoRefresh ? 'LIVE' : 'PAUSED'}
|
|
131
156
|
</button>
|
|
132
157
|
</div>
|
|
158
|
+
<div className="flex flex-col gap-2 sm:flex-row sm:items-center">
|
|
159
|
+
<div className="relative flex-1 min-w-[180px]">
|
|
160
|
+
<svg width="13" height="13" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth="2" strokeLinecap="round" className="absolute left-2.5 top-1/2 -translate-y-1/2 text-text-3/50">
|
|
161
|
+
<circle cx="11" cy="11" r="8" /><line x1="21" y1="21" x2="16.65" y2="16.65" />
|
|
162
|
+
</svg>
|
|
163
|
+
<input
|
|
164
|
+
value={query}
|
|
165
|
+
onChange={(event) => setQuery(event.target.value)}
|
|
166
|
+
placeholder="Search run id, source, error, or result"
|
|
167
|
+
className="w-full rounded-[8px] border border-white/[0.06] bg-white/[0.03] py-1.5 pl-8 pr-3 text-[12px] text-text outline-none transition-colors placeholder:text-text-3/45 focus:border-accent-bright/35"
|
|
168
|
+
/>
|
|
169
|
+
</div>
|
|
170
|
+
<label className="flex items-center gap-2 text-[10px] font-700 uppercase tracking-[0.08em] text-text-3/60">
|
|
171
|
+
Source
|
|
172
|
+
<select
|
|
173
|
+
value={sourceFilter}
|
|
174
|
+
onChange={(event) => setSourceFilter(event.target.value)}
|
|
175
|
+
className="rounded-[8px] border border-white/[0.06] bg-white/[0.03] px-2 py-1.5 text-[11px] font-600 normal-case tracking-normal text-text outline-none"
|
|
176
|
+
>
|
|
177
|
+
<option value="all">All sources</option>
|
|
178
|
+
{sources.map((source) => (
|
|
179
|
+
<option key={source} value={source}>{source}</option>
|
|
180
|
+
))}
|
|
181
|
+
</select>
|
|
182
|
+
</label>
|
|
183
|
+
</div>
|
|
133
184
|
</div>
|
|
134
185
|
|
|
135
186
|
{/* Count */}
|
|
@@ -98,6 +98,7 @@ function CommandPaletteInner({ setOpen }: { setOpen: (v: boolean) => void }) {
|
|
|
98
98
|
{ id: 'providers', label: 'Providers', description: 'Model providers and endpoints', keywords: ['openai', 'anthropic', 'ollama', 'endpoint'] },
|
|
99
99
|
{ id: 'secrets', label: 'Secrets', description: 'Credentials and encrypted secrets', keywords: ['api key', 'token', 'credential'] },
|
|
100
100
|
{ id: 'autonomy', label: 'Autonomy', description: 'Estops, incidents, and runtime controls', keywords: ['estop', 'incident', 'runtime', 'safety'] },
|
|
101
|
+
{ id: 'quality', label: 'Quality', description: 'Evals, approvals, run review, and release readiness', keywords: ['eval', 'approval', 'runs', 'release', 'qa'] },
|
|
101
102
|
{ id: 'settings', label: 'Settings', description: 'General app configuration', keywords: ['preferences', 'theme', 'heartbeat'] },
|
|
102
103
|
] as const
|
|
103
104
|
for (const view of views) {
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import assert from 'node:assert/strict'
|
|
2
2
|
import { describe, it } from 'node:test'
|
|
3
3
|
|
|
4
|
-
import { isPanelSidebarView, shouldAutoOpenPanelSidebar } from './view-constants'
|
|
4
|
+
import { FULL_WIDTH_VIEWS, VIEW_DESCRIPTIONS, VIEW_LABELS, isPanelSidebarView, shouldAutoOpenPanelSidebar } from './view-constants'
|
|
5
5
|
|
|
6
6
|
describe('panel sidebar route helpers', () => {
|
|
7
7
|
it('treats knowledge as a panel-backed view', () => {
|
|
@@ -18,4 +18,11 @@ describe('panel sidebar route helpers', () => {
|
|
|
18
18
|
assert.equal(shouldAutoOpenPanelSidebar('settings', true), false)
|
|
19
19
|
assert.equal(shouldAutoOpenPanelSidebar(null, true), false)
|
|
20
20
|
})
|
|
21
|
+
|
|
22
|
+
it('registers quality as a full-width operator workspace', () => {
|
|
23
|
+
assert.equal(VIEW_LABELS.quality, 'Quality')
|
|
24
|
+
assert.match(VIEW_DESCRIPTIONS.quality, /evals/i)
|
|
25
|
+
assert.equal(FULL_WIDTH_VIEWS.has('quality'), true)
|
|
26
|
+
assert.equal(isPanelSidebarView('quality'), false)
|
|
27
|
+
})
|
|
21
28
|
})
|
|
@@ -11,6 +11,7 @@ export const VIEW_LABELS: Record<AppView, string> = {
|
|
|
11
11
|
memory: 'Memory',
|
|
12
12
|
|
|
13
13
|
tasks: 'Tasks',
|
|
14
|
+
quality: 'Quality',
|
|
14
15
|
missions: 'Missions',
|
|
15
16
|
secrets: 'Secrets',
|
|
16
17
|
wallets: 'Wallets',
|
|
@@ -59,6 +60,7 @@ export const VIEW_DESCRIPTIONS: Record<AppView, string> = {
|
|
|
59
60
|
memory: 'Long-term agent memory store',
|
|
60
61
|
|
|
61
62
|
tasks: 'Task board for agent work and queued runs',
|
|
63
|
+
quality: 'Operator quality center for evals, approvals, run review, and release readiness',
|
|
62
64
|
missions: 'Autonomous goal-driven agent runs with budgets and morning reports',
|
|
63
65
|
secrets: 'API keys, tokens, and encrypted credentials',
|
|
64
66
|
wallets: 'Crypto wallets for agent-initiated on-chain transactions',
|
|
@@ -118,6 +120,12 @@ export const VIEW_EMPTY_STATES: Record<Exclude<AppView, 'agents' | 'home'>, { ic
|
|
|
118
120
|
description: 'A kanban board for managing agent work. Create tasks, assign them to agents, and track progress.',
|
|
119
121
|
features: ['Kanban columns: Backlog, Queued, Running, Completed, Failed', 'Assign tasks to specific agents', 'Track retries, results, and logs', 'Review status without leaving the board'],
|
|
120
122
|
},
|
|
123
|
+
quality: {
|
|
124
|
+
icon: 'badge-check',
|
|
125
|
+
title: 'Quality',
|
|
126
|
+
description: 'Operator center for trusting autonomous agents before, during, and after a release.',
|
|
127
|
+
features: ['Review run health, failed work, and pending approvals', 'Run scenario and suite evals against selected agents', 'Approve or reject human-loop, tool, connector, and skill requests', 'Inspect replay evidence from recent agent runs'],
|
|
128
|
+
},
|
|
121
129
|
missions: {
|
|
122
130
|
icon: 'target',
|
|
123
131
|
title: 'Missions',
|
|
@@ -243,7 +251,7 @@ export const VIEW_EMPTY_STATES: Record<Exclude<AppView, 'agents' | 'home'>, { ic
|
|
|
243
251
|
export const FULL_WIDTH_VIEWS = new Set<AppView>([
|
|
244
252
|
'home', 'org_chart', 'inbox', 'chatrooms', 'protocols', 'schedules', 'secrets', 'wallets', 'providers', 'skills',
|
|
245
253
|
'connectors', 'webhooks', 'mcp_servers', 'knowledge', 'extensions',
|
|
246
|
-
'usage', 'runs', 'autonomy', 'logs', 'settings', 'activity', 'projects', 'swarmfeed', 'marketplace', 'missions',
|
|
254
|
+
'usage', 'runs', 'quality', 'autonomy', 'logs', 'settings', 'activity', 'projects', 'swarmfeed', 'marketplace', 'missions',
|
|
247
255
|
])
|
|
248
256
|
|
|
249
257
|
export const PANEL_SIDEBAR_VIEWS = new Set<AppView>([
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
import assert from 'node:assert/strict'
|
|
2
|
+
import { describe, it } from 'node:test'
|
|
3
|
+
|
|
4
|
+
import {
|
|
5
|
+
buildQualityOverviewSummary,
|
|
6
|
+
groupApprovalsByCategory,
|
|
7
|
+
summarizeEvalRuns,
|
|
8
|
+
summarizeRunHealth,
|
|
9
|
+
} from './quality-summary'
|
|
10
|
+
import type { EvalRun } from '@/lib/server/eval/types'
|
|
11
|
+
import type { ApprovalRequest, SessionRunRecord } from '@/types'
|
|
12
|
+
|
|
13
|
+
/**
 * Test fixture: builds a session run record from baseline values, letting
 * any key present in `overrides` replace (or extend) the baseline.
 */
function run(overrides: Partial<SessionRunRecord>): SessionRunRecord {
  const baseline: SessionRunRecord = {
    id: 'run_1',
    sessionId: 'sess_1',
    source: 'chat',
    internal: false,
    mode: 'direct',
    status: 'completed',
    messagePreview: 'hello',
    queuedAt: 1000,
  }
  // Spreading last means a key supplied by the caller always wins.
  return { ...baseline, ...overrides }
}
|
|
26
|
+
|
|
27
|
+
/**
 * Test fixture: builds an eval run. Text fields and `details` fall back when
 * the override is falsy; numeric fields fall back only when it is
 * null/undefined, so an explicit `0` is kept. `endedAt`, `sessionId`, and
 * `error` are passed through untouched.
 */
function evalRun(overrides: Partial<EvalRun>): EvalRun {
  const {
    id,
    scenarioId,
    agentId,
    status,
    startedAt,
    endedAt,
    score,
    maxScore,
    details,
    sessionId,
    error,
  } = overrides

  return {
    id: id || 'eval_1',
    scenarioId: scenarioId || 'coding-prime',
    agentId: agentId || 'agent_1',
    status: status || 'completed',
    startedAt: startedAt ?? 1000,
    endedAt,
    score: score ?? 8,
    maxScore: maxScore ?? 10,
    details: details || [],
    sessionId,
    error,
  }
}
|
|
42
|
+
|
|
43
|
+
/**
 * Test fixture: builds an approval request that is pending and in the
 * `human_loop` category unless overridden. Optional references
 * (`description`, `agentId`, `sessionId`, `taskId`) are passed through as-is.
 */
function approval(overrides: Partial<ApprovalRequest>): ApprovalRequest {
  const {
    id,
    category,
    title,
    description,
    data,
    createdAt,
    updatedAt,
    status,
    agentId,
    sessionId,
    taskId,
  } = overrides

  return {
    id: id || 'approval_1',
    category: category || 'human_loop',
    title: title || 'Review request',
    description,
    data: data || {},
    createdAt: createdAt ?? 1000,
    updatedAt: updatedAt ?? 1000,
    status: status || 'pending',
    agentId,
    sessionId,
    taskId,
  }
}
|
|
58
|
+
|
|
59
|
+
describe('summarizeRunHealth', () => {
  it('counts run statuses and keeps the most recent failed runs', () => {
    // Fixtures are intentionally not in timestamp order so the newest-first
    // ordering of failures is actually exercised.
    const fixtures = [
      run({ id: 'old-failed', status: 'failed', queuedAt: 1000 }),
      run({ id: 'running', status: 'running', queuedAt: 2000 }),
      run({ id: 'new-failed', status: 'failed', queuedAt: 3000 }),
      run({ id: 'completed', status: 'completed', queuedAt: 4000 }),
    ]

    const summary = summarizeRunHealth(fixtures)

    assert.equal(summary.total, 4)
    assert.equal(summary.byStatus.failed, 2)
    assert.equal(summary.byStatus.running, 1)
    assert.equal(summary.activeCount, 1)
    assert.equal(summary.needsAttentionCount, 2)
    assert.deepEqual(
      summary.recentFailures.map((item) => item.id),
      ['new-failed', 'old-failed'],
    )
  })
})
|
|
76
|
+
|
|
77
|
+
describe('summarizeEvalRuns', () => {
  it('summarizes completed evals and ignores failed runs for score averages', () => {
    // Two completed runs (40% and 90%) plus one failed run that must not
    // drag the average down.
    const fixtures = [
      evalRun({ id: 'low', score: 4, maxScore: 10, startedAt: 2000 }),
      evalRun({ id: 'failed', status: 'failed', score: 0, maxScore: 10, startedAt: 3000 }),
      evalRun({ id: 'high', score: 9, maxScore: 10, startedAt: 4000 }),
    ]

    const summary = summarizeEvalRuns(fixtures)

    assert.equal(summary.totalRuns, 3)
    assert.equal(summary.completedRuns, 2)
    assert.equal(summary.failedRuns, 1)
    assert.equal(summary.latestRun?.id, 'high')
    assert.equal(summary.averagePercent, 65)
    assert.equal(summary.latestCompletedPercent, 90)
  })
})
|
|
93
|
+
|
|
94
|
+
describe('groupApprovalsByCategory', () => {
  it('groups pending approvals and sorts oldest first inside each category', () => {
    // The already-approved extension install must be excluded entirely.
    const requests = [
      approval({ id: 'new-human', category: 'human_loop', createdAt: 3000 }),
      approval({ id: 'approved-skill', category: 'extension_install', status: 'approved', createdAt: 1000 }),
      approval({ id: 'old-human', category: 'human_loop', createdAt: 1000 }),
      approval({ id: 'tool', category: 'tool_access', createdAt: 2000 }),
    ]

    const grouped = groupApprovalsByCategory(requests)

    assert.equal(grouped.totalPending, 3)
    assert.deepEqual(
      grouped.categories.map((category) => category.category),
      ['human_loop', 'tool_access'],
    )
    assert.deepEqual(
      grouped.categories[0].approvals.map((item) => item.id),
      ['old-human', 'new-human'],
    )
  })
})
|
|
108
|
+
|
|
109
|
+
describe('buildQualityOverviewSummary', () => {
  it('combines runs, evals, and approvals into operator action counts', () => {
    // One failed run + one pending approval => two items needing attention.
    const input = {
      runs: [run({ status: 'failed' }), run({ status: 'running' })],
      evalRuns: [evalRun({ score: 7, maxScore: 10 })],
      approvals: [approval({}), approval({ status: 'rejected' })],
    }

    const summary = buildQualityOverviewSummary(input)

    assert.equal(summary.needsAttention, 2)
    assert.equal(summary.pendingApprovals, 1)
    assert.equal(summary.activeRuns, 1)
    assert.equal(summary.evalAveragePercent, 70)
  })
})
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
import type { EvalRun } from '@/lib/server/eval/types'
|
|
2
|
+
import type { ApprovalCategory, ApprovalRequest, SessionRunRecord, SessionRunStatus } from '@/types'
|
|
3
|
+
|
|
4
|
+
/** Run statuses that always get a bucket (seeded to zero) in run-health counts. */
const RUN_STATUSES: SessionRunStatus[] = [
  'queued',
  'running',
  'completed',
  'failed',
  'cancelled',
]
|
|
5
|
+
|
|
6
|
+
/** Aggregate health view over a list of session runs. */
export interface RunHealthSummary {
  /** Number of runs that were summarized. */
  total: number
  /** Run count per status. */
  byStatus: Record<SessionRunStatus, number>
  /** Runs that are queued or currently running. */
  activeCount: number
  /** Runs that failed. */
  needsAttentionCount: number
  /** Failed runs, most recent first, capped by the caller's limit. */
  recentFailures: SessionRunRecord[]
}
|
|
13
|
+
|
|
14
|
+
/** Aggregate view over a list of eval runs. */
export interface EvalRunSummary {
  /** Number of eval runs that were summarized, regardless of status. */
  totalRuns: number
  /** Eval runs whose status is `completed`. */
  completedRuns: number
  /** Eval runs whose status is `failed`. */
  failedRuns: number
  /** Mean score percentage across scoreable completed runs, or `null` when there are none. */
  averagePercent: number | null
  /** Score percentage of the most recent completed run, or `null` when unavailable. */
  latestCompletedPercent: number | null
  /** Most recent eval run of any status, or `null` for an empty list. */
  latestRun: EvalRun | null
}
|
|
22
|
+
|
|
23
|
+
/** Pending approvals that share one category. */
export interface ApprovalCategoryGroup {
  /** Category shared by every approval in this group. */
  category: ApprovalCategory
  /** Number of approvals in this group. */
  count: number
  /** The grouped approval requests. */
  approvals: ApprovalRequest[]
}
|
|
28
|
+
|
|
29
|
+
/** Pending approvals bucketed by category. */
export interface ApprovalGroupSummary {
  /** Total number of pending approvals across all categories. */
  totalPending: number
  /** One entry per category that has at least one pending approval. */
  categories: ApprovalCategoryGroup[]
}
|
|
33
|
+
|
|
34
|
+
/** Combined operator overview built from runs, evals, and approvals. */
export interface QualityOverviewSummary {
  /** Health breakdown of session runs. */
  runHealth: RunHealthSummary
  /** Eval run statistics. */
  evals: EvalRunSummary
  /** Pending approvals grouped by category. */
  approvals: ApprovalGroupSummary
  /** Failed runs + failed evals + pending approvals. */
  needsAttention: number
  /** Total pending approvals (mirrors `approvals.totalPending`). */
  pendingApprovals: number
  /** Queued or running runs (mirrors `runHealth.activeCount`). */
  activeRuns: number
  /** Average eval score percentage (mirrors `evals.averagePercent`). */
  evalAveragePercent: number | null
}
|
|
43
|
+
|
|
44
|
+
/**
 * Picks the latest known lifecycle timestamp of a run: when it ended,
 * else when it started, else when it was queued.
 */
function newestRunTimestamp(run: SessionRunRecord): number {
  const { endedAt, startedAt, queuedAt } = run
  // `!= null` skips only null/undefined, so a timestamp of 0 is still honoured.
  if (endedAt != null) return endedAt
  if (startedAt != null) return startedAt
  return queuedAt
}
|
|
47
|
+
|
|
48
|
+
/** Returns when an eval run ended, or when it started if it has no end time. */
function evalTimestamp(run: EvalRun): number {
  // `== null` matches both null and undefined; a 0 end time is kept.
  return run.endedAt == null ? run.startedAt : run.endedAt
}
|
|
51
|
+
|
|
52
|
+
/**
 * Converts a score into a whole-number percentage of `maxScore`.
 *
 * @returns The rounded percentage, or `null` when either input is not a
 *   finite number or `maxScore` is zero or negative.
 */
function percent(score: number, maxScore: number): number | null {
  const scoreable = Number.isFinite(score) && Number.isFinite(maxScore) && maxScore > 0
  if (!scoreable) return null

  const ratio = score / maxScore
  return Math.round(ratio * 100)
}
|
|
56
|
+
|
|
57
|
+
/**
 * Summarizes the health of a list of session runs.
 *
 * @param runs - Runs to summarize; the array is not mutated.
 * @param opts - `recentFailureLimit` caps how many failed runs are returned
 *   (default 5). Negative values are treated as 0.
 * @returns Totals, per-status counts, the active (queued + running) count,
 *   the failed count, and the most recent failed runs, newest first.
 */
export function summarizeRunHealth(runs: SessionRunRecord[], opts: { recentFailureLimit?: number } = {}): RunHealthSummary {
  // Seed every known status with zero so each bucket is always a number.
  const byStatus = RUN_STATUSES.reduce((acc, status) => {
    acc[status] = 0
    return acc
  }, {} as Record<SessionRunStatus, number>)

  for (const run of runs) {
    // `?? 0` also covers a status that is not listed in RUN_STATUSES.
    byStatus[run.status] = (byStatus[run.status] ?? 0) + 1
  }

  // A negative limit would make slice() count from the end and return
  // "all but the last N" failures; clamp it so it means "none".
  const failureLimit = Math.max(0, opts.recentFailureLimit ?? 5)

  // filter() already returns a fresh array, so sorting it in place is safe.
  const recentFailures = runs
    .filter((run) => run.status === 'failed')
    .sort((a, b) => newestRunTimestamp(b) - newestRunTimestamp(a))
    .slice(0, failureLimit)

  return {
    total: runs.length,
    byStatus,
    activeCount: byStatus.queued + byStatus.running,
    needsAttentionCount: byStatus.failed,
    recentFailures,
  }
}
|
|
81
|
+
|
|
82
|
+
/**
 * Summarizes a list of eval runs in a single pass.
 *
 * Only runs with status `completed` contribute to score figures; runs with
 * status `failed` are counted separately. The average is taken over the
 * already-rounded per-run percentages and is itself rounded.
 *
 * @param runs - Eval runs to summarize; the array is not mutated.
 * @returns Run counts, the average and latest completed score percentages
 *   (`null` when unavailable), and the most recent run of any status.
 */
export function summarizeEvalRuns(runs: EvalRun[]): EvalRunSummary {
  let latestRun: EvalRun | null = null
  let latestCompleted: EvalRun | null = null
  let completedRuns = 0
  let failedRuns = 0
  let percentSum = 0
  let percentCount = 0

  for (const run of runs) {
    // Strict ">" keeps the earliest-listed run on timestamp ties, which is
    // what a stable newest-first sort would put in first place.
    if (latestRun === null || evalTimestamp(run) > evalTimestamp(latestRun)) {
      latestRun = run
    }

    if (run.status === 'failed') {
      failedRuns += 1
      continue
    }
    if (run.status !== 'completed') continue

    completedRuns += 1
    if (latestCompleted === null || evalTimestamp(run) > evalTimestamp(latestCompleted)) {
      latestCompleted = run
    }

    // Runs without a usable score/maxScore are counted as completed but
    // left out of the average.
    const runPercent = percent(run.score, run.maxScore)
    if (runPercent !== null) {
      percentSum += runPercent
      percentCount += 1
    }
  }

  return {
    totalRuns: runs.length,
    completedRuns,
    failedRuns,
    averagePercent: percentCount > 0 ? Math.round(percentSum / percentCount) : null,
    latestCompletedPercent: latestCompleted
      ? percent(latestCompleted.score, latestCompleted.maxScore)
      : null,
    latestRun,
  }
}
|
|
110
|
+
|
|
111
|
+
/**
 * Buckets pending approval requests by category.
 *
 * Non-pending requests are ignored. Inside each bucket requests are ordered
 * oldest first by `createdAt`; buckets are ordered by size (largest first),
 * with ties broken by category name.
 *
 * @param approvals - Approval requests of any status; the array is not mutated.
 */
export function groupApprovalsByCategory(approvals: ApprovalRequest[]): ApprovalGroupSummary {
  const pending: ApprovalRequest[] = []
  for (const request of approvals) {
    if (request.status === 'pending') pending.push(request)
  }
  pending.sort((older, newer) => older.createdAt - newer.createdAt)

  // Filling buckets from the sorted list keeps each bucket oldest-first.
  const buckets = new Map<ApprovalCategory, ApprovalRequest[]>()
  for (const request of pending) {
    const bucket = buckets.get(request.category)
    if (bucket) {
      bucket.push(request)
    } else {
      buckets.set(request.category, [request])
    }
  }

  const categories: ApprovalCategoryGroup[] = []
  for (const [category, items] of buckets) {
    categories.push({ category, count: items.length, approvals: items })
  }
  categories.sort((left, right) => {
    const bySize = right.count - left.count
    return bySize !== 0 ? bySize : left.category.localeCompare(right.category)
  })

  return { totalPending: pending.length, categories }
}
|
|
131
|
+
|
|
132
|
+
/**
 * Builds the combined operator overview from runs, eval runs, and approvals.
 *
 * @param params - Raw session runs, eval runs, and approval requests.
 * @returns The three detailed summaries plus headline counts derived from them.
 */
export function buildQualityOverviewSummary(params: {
  runs: SessionRunRecord[]
  evalRuns: EvalRun[]
  approvals: ApprovalRequest[]
}): QualityOverviewSummary {
  const runHealth = summarizeRunHealth(params.runs)
  const evals = summarizeEvalRuns(params.evalRuns)
  const approvals = groupApprovalsByCategory(params.approvals)

  // Everything an operator must act on: failed runs, failed evals, and
  // approvals still waiting for a decision.
  const failedWork = runHealth.needsAttentionCount + evals.failedRuns
  const needsAttention = failedWork + approvals.totalPending

  return {
    runHealth,
    evals,
    approvals,
    needsAttention,
    pendingApprovals: approvals.totalPending,
    activeRuns: runHealth.activeCount,
    evalAveragePercent: evals.averagePercent,
  }
}
|
|
@@ -74,6 +74,28 @@ describe('mission-templates: registry', () => {
|
|
|
74
74
|
assert.ok(template.defaults.successCriteria.some((item) => item.includes('Product Hunt')))
|
|
75
75
|
})
|
|
76
76
|
|
|
77
|
+
it('includes operator quality release templates', () => {
|
|
78
|
+
const expected = [
|
|
79
|
+
'release-candidate-qa',
|
|
80
|
+
'agent-cost-audit',
|
|
81
|
+
'connector-smoke-test',
|
|
82
|
+
'failed-run-triage',
|
|
83
|
+
'weekly-agent-quality-report',
|
|
84
|
+
]
|
|
85
|
+
|
|
86
|
+
for (const id of expected) {
|
|
87
|
+
const template = templates.getMissionTemplate(id)
|
|
88
|
+
assert.ok(template, `expected ${id} template`)
|
|
89
|
+
assert.ok(template.tags.includes('quality') || template.tags.includes('operator-quality'), `${id} should be quality tagged`)
|
|
90
|
+
assert.ok(template.defaults.goal.includes('approval') || template.defaults.goal.includes('evidence'), `${id} should preserve operator guardrails`)
|
|
91
|
+
assert.ok(template.defaults.budget.maxWallclockSec, `${id} should have a wallclock cap`)
|
|
92
|
+
assert.ok(template.defaults.reportSchedule, `${id} should schedule reports`)
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
assert.equal(templates.getMissionTemplate('release-candidate-qa')?.name, 'Release Candidate QA')
|
|
96
|
+
assert.equal(templates.getMissionTemplate('weekly-agent-quality-report')?.category, 'monitoring')
|
|
97
|
+
})
|
|
98
|
+
|
|
77
99
|
it('getMissionTemplate resolves known ids', () => {
|
|
78
100
|
const list = templates.listMissionTemplates()
|
|
79
101
|
const first = list[0]
|
|
@@ -187,6 +187,122 @@ export const BUILT_IN_MISSION_TEMPLATES: MissionTemplate[] = [
|
|
|
187
187
|
reportSchedule: report(12 * HOUR),
|
|
188
188
|
},
|
|
189
189
|
},
|
|
190
|
+
{
|
|
191
|
+
id: 'release-candidate-qa',
|
|
192
|
+
name: 'Release Candidate QA',
|
|
193
|
+
description:
|
|
194
|
+
'Collect release readiness evidence across evals, approvals, failed runs, docs, packaging, and desktop smoke gates.',
|
|
195
|
+
icon: '✅',
|
|
196
|
+
category: 'productivity',
|
|
197
|
+
tags: ['release', 'qa', 'evals', 'operator-quality'],
|
|
198
|
+
setupNote:
|
|
199
|
+
'Set the target version and release branch in the goal. Keep publishing, tagging, and merging behind explicit human approval.',
|
|
200
|
+
defaults: {
|
|
201
|
+
title: 'Release Candidate QA',
|
|
202
|
+
goal:
|
|
203
|
+
'Prepare a release candidate quality report for the target SwarmClaw version. Review recent failed runs, pending approvals, latest eval results, release notes, package metadata, install instructions, CI/build status, and desktop packaging notes. Summarize blockers, risk level, evidence links, and a go/no-go recommendation. Do not merge, tag, publish, deploy, or post publicly without explicit approval.',
|
|
204
|
+
successCriteria: [
|
|
205
|
+
'Failed runs and pending approvals are reviewed with evidence or clear no-findings notes',
|
|
206
|
+
'Eval coverage, score trends, and any failed criteria are summarized',
|
|
207
|
+
'Release notes, package metadata, install pins, and desktop smoke requirements are checked',
|
|
208
|
+
'Final report includes blockers, risks, follow-up tasks, and a go/no-go recommendation',
|
|
209
|
+
],
|
|
210
|
+
budget: budget({ maxUsd: 2, maxTokens: 120_000, maxTurns: 160, maxWallclockSec: DAY }),
|
|
211
|
+
reportSchedule: report(6 * HOUR),
|
|
212
|
+
},
|
|
213
|
+
},
|
|
214
|
+
{
|
|
215
|
+
id: 'agent-cost-audit',
|
|
216
|
+
name: 'Agent Cost Audit',
|
|
217
|
+
description:
|
|
218
|
+
'Inspect agent/provider spend, token usage, and high-cost runs, then recommend budget or routing adjustments.',
|
|
219
|
+
icon: '💸',
|
|
220
|
+
category: 'monitoring',
|
|
221
|
+
tags: ['cost', 'usage', 'budget', 'quality'],
|
|
222
|
+
setupNote:
|
|
223
|
+
'Add any budget targets, providers, or agents that need special attention before starting.',
|
|
224
|
+
defaults: {
|
|
225
|
+
title: 'Agent Cost Audit',
|
|
226
|
+
goal:
|
|
227
|
+
'Audit recent SwarmClaw agent costs and token usage. Identify top-spend agents, expensive runs, provider anomalies, retry loops, and avoidable tool calls. Produce a markdown report with recommended budget caps, model routing changes, and follow-up quality checks. Do not change budgets or provider settings without approval.',
|
|
228
|
+
successCriteria: [
|
|
229
|
+
'Top cost drivers are listed with agent, provider, source, and supporting evidence',
|
|
230
|
+
'At least 3 concrete cost-control recommendations are included',
|
|
231
|
+
'Any suspected runaway, retry, or noisy automation pattern is flagged',
|
|
232
|
+
],
|
|
233
|
+
budget: budget({ maxUsd: 1.5, maxTokens: 80_000, maxTurns: 100, maxWallclockSec: DAY }),
|
|
234
|
+
reportSchedule: report(DAY),
|
|
235
|
+
},
|
|
236
|
+
},
|
|
237
|
+
{
|
|
238
|
+
id: 'connector-smoke-test',
|
|
239
|
+
name: 'Connector Smoke Test',
|
|
240
|
+
description:
|
|
241
|
+
'Verify configured connector health, delivery paths, approval boundaries, and recent connector-linked run evidence.',
|
|
242
|
+
icon: '🔌',
|
|
243
|
+
category: 'monitoring',
|
|
244
|
+
tags: ['connectors', 'smoke-test', 'approval', 'quality'],
|
|
245
|
+
setupNote:
|
|
246
|
+
'Name the connectors and channels to test. Keep outbound messages or public replies approval-gated.',
|
|
247
|
+
defaults: {
|
|
248
|
+
title: 'Connector Smoke Test',
|
|
249
|
+
goal:
|
|
250
|
+
'Smoke test configured SwarmClaw connectors. Check connector status, recent inbound/outbound activity, approval requirements, related failed runs, and any available logs. Draft a concise pass/fail report per connector with evidence and remediation steps. Do not send public replies or change connector settings without approval.',
|
|
251
|
+
successCriteria: [
|
|
252
|
+
'Each targeted connector receives a pass, warn, or fail status',
|
|
253
|
+
'Recent connector-linked failures or delivery issues are summarized with evidence',
|
|
254
|
+
'Approval boundaries for outbound replies or sender permissions are explicitly checked',
|
|
255
|
+
],
|
|
256
|
+
budget: budget({ maxUsd: 1.25, maxTokens: 70_000, maxTurns: 90, maxWallclockSec: 12 * HOUR }),
|
|
257
|
+
reportSchedule: report(6 * HOUR),
|
|
258
|
+
},
|
|
259
|
+
},
|
|
260
|
+
{
|
|
261
|
+
id: 'failed-run-triage',
|
|
262
|
+
name: 'Failed Run Triage',
|
|
263
|
+
description:
|
|
264
|
+
'Review recent failed runs, cluster root causes, and propose fixes with replay evidence.',
|
|
265
|
+
icon: '🧯',
|
|
266
|
+
category: 'support',
|
|
267
|
+
tags: ['runs', 'triage', 'debugging', 'quality'],
|
|
268
|
+
setupNote:
|
|
269
|
+
'Optionally narrow the mission to a source, agent, task, or release window.',
|
|
270
|
+
defaults: {
|
|
271
|
+
title: 'Failed Run Triage',
|
|
272
|
+
goal:
|
|
273
|
+
'Triage recent failed SwarmClaw runs. Inspect run records, replay events, errors, retrieval evidence, source, owner, and timing. Cluster failures by likely root cause and write a prioritized remediation report with reproduction notes where possible. Do not modify code or settings unless explicitly asked.',
|
|
274
|
+
successCriteria: [
|
|
275
|
+
'Recent failed runs are grouped by likely root cause',
|
|
276
|
+
'Each high-priority failure includes evidence from the run record or replay',
|
|
277
|
+
'Remediation recommendations are prioritized by user impact and confidence',
|
|
278
|
+
],
|
|
279
|
+
budget: budget({ maxUsd: 1.5, maxTokens: 90_000, maxTurns: 120, maxWallclockSec: DAY }),
|
|
280
|
+
reportSchedule: report(6 * HOUR),
|
|
281
|
+
},
|
|
282
|
+
},
|
|
283
|
+
{
|
|
284
|
+
id: 'weekly-agent-quality-report',
|
|
285
|
+
name: 'Weekly Agent Quality Report',
|
|
286
|
+
description:
|
|
287
|
+
'Produce a weekly operator report across eval trends, approvals, failed runs, missions, cost, and release risk.',
|
|
288
|
+
icon: '📈',
|
|
289
|
+
category: 'monitoring',
|
|
290
|
+
tags: ['weekly', 'quality', 'report', 'evals'],
|
|
291
|
+
setupNote:
|
|
292
|
+
'Set the week or workspace scope in the goal if you want a narrower report.',
|
|
293
|
+
defaults: {
|
|
294
|
+
title: 'Weekly Agent Quality Report',
|
|
295
|
+
goal:
|
|
296
|
+
'Produce a weekly SwarmClaw agent quality report. Summarize eval trends, failed and recovered runs, pending or high-risk approvals, mission outcomes, cost changes, connector health, and release-readiness risks. Include a short executive summary and a prioritized action list for the next week.',
|
|
297
|
+
successCriteria: [
|
|
298
|
+
'Report includes eval, run, approval, mission, connector, and cost sections',
|
|
299
|
+
'Top quality risks and regressions are clearly ranked',
|
|
300
|
+
'Next-week action items are specific and tied to evidence',
|
|
301
|
+
],
|
|
302
|
+
budget: budget({ maxUsd: 3, maxTokens: 180_000, maxTurns: 180, maxWallclockSec: 7 * DAY }),
|
|
303
|
+
reportSchedule: report(DAY),
|
|
304
|
+
},
|
|
305
|
+
},
|
|
190
306
|
{
|
|
191
307
|
id: 'hello-world-demo',
|
|
192
308
|
name: 'Hello World Demo',
|
package/src/types/session.ts
CHANGED
|
@@ -231,4 +231,4 @@ export type SessionTool =
|
|
|
231
231
|
| 'crawl'
|
|
232
232
|
|
|
233
233
|
export type SessionType = 'human'
|
|
234
|
-
export type AppView = 'home' | 'agents' | 'org_chart' | 'inbox' | 'chatrooms' | 'protocols' | 'schedules' | 'memory' | 'tasks' | 'secrets' | 'wallets' | 'providers' | 'skills' | 'connectors' | 'webhooks' | 'mcp_servers' | 'knowledge' | 'extensions' | 'usage' | 'runs' | 'autonomy' | 'logs' | 'settings' | 'projects' | 'activity' | 'swarmfeed' | 'marketplace' | 'missions'
|
|
234
|
+
export type AppView = 'home' | 'agents' | 'org_chart' | 'inbox' | 'chatrooms' | 'protocols' | 'schedules' | 'memory' | 'tasks' | 'quality' | 'secrets' | 'wallets' | 'providers' | 'skills' | 'connectors' | 'webhooks' | 'mcp_servers' | 'knowledge' | 'extensions' | 'usage' | 'runs' | 'autonomy' | 'logs' | 'settings' | 'projects' | 'activity' | 'swarmfeed' | 'marketplace' | 'missions'
|