@swarmclawai/swarmclaw 1.5.71 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  'use client'
2
2
 
3
- import { useEffect, useState, useCallback } from 'react'
3
+ import { useEffect, useMemo, useState, useCallback } from 'react'
4
4
  import { api } from '@/lib/app/api-client'
5
5
  import { useNow } from '@/hooks/use-now'
6
6
  import { useWs } from '@/hooks/use-ws'
@@ -37,6 +37,8 @@ export function RunList() {
37
37
  const [loading, setLoading] = useState(true)
38
38
  const [autoRefresh, setAutoRefresh] = useState(false)
39
39
  const [statusFilter, setStatusFilter] = useState<SessionRunStatus | null>(null)
40
+ const [sourceFilter, setSourceFilter] = useState<string>('all')
41
+ const [query, setQuery] = useState('')
40
42
  const [selected, setSelected] = useState<SessionRunRecord | null>(null)
41
43
  const [selectedEvents, setSelectedEvents] = useState<RunEventRecord[]>([])
42
44
  const [eventsLoading, setEventsLoading] = useState(false)
@@ -84,7 +86,30 @@ export function RunList() {
84
86
  setEventsLoading(true)
85
87
  }, [])
86
88
 
87
- const filtered = statusFilter ? runs.filter((r) => r.status === statusFilter) : runs
89
+ const sources = useMemo(() => {
90
+ return Array.from(new Set(runs.map((run) => run.source).filter(Boolean))).sort((a, b) => a.localeCompare(b))
91
+ }, [runs])
92
+
93
+ const filtered = useMemo(() => {
94
+ const normalizedQuery = query.trim().toLowerCase()
95
+ return runs.filter((run) => {
96
+ if (statusFilter && run.status !== statusFilter) return false
97
+ if (sourceFilter !== 'all' && run.source !== sourceFilter) return false
98
+ if (!normalizedQuery) return true
99
+ const searchable = [
100
+ run.id,
101
+ run.sessionId,
102
+ run.source,
103
+ run.messagePreview,
104
+ run.error,
105
+ run.resultPreview,
106
+ run.kind,
107
+ run.ownerType,
108
+ run.ownerId,
109
+ ]
110
+ return searchable.some((value) => String(value || '').toLowerCase().includes(normalizedQuery))
111
+ })
112
+ }, [query, runs, sourceFilter, statusFilter])
88
113
  const selectedResultGrounding = selectedEvents
89
114
  .slice()
90
115
  .reverse()
@@ -130,6 +155,32 @@ export function RunList() {
130
155
  {autoRefresh ? 'LIVE' : 'PAUSED'}
131
156
  </button>
132
157
  </div>
158
+ <div className="flex flex-col gap-2 sm:flex-row sm:items-center">
159
+ <div className="relative flex-1 min-w-[180px]">
160
+ <svg width="13" height="13" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth="2" strokeLinecap="round" className="absolute left-2.5 top-1/2 -translate-y-1/2 text-text-3/50">
161
+ <circle cx="11" cy="11" r="8" /><line x1="21" y1="21" x2="16.65" y2="16.65" />
162
+ </svg>
163
+ <input
164
+ value={query}
165
+ onChange={(event) => setQuery(event.target.value)}
166
+ placeholder="Search run id, source, error, or result"
167
+ className="w-full rounded-[8px] border border-white/[0.06] bg-white/[0.03] py-1.5 pl-8 pr-3 text-[12px] text-text outline-none transition-colors placeholder:text-text-3/45 focus:border-accent-bright/35"
168
+ />
169
+ </div>
170
+ <label className="flex items-center gap-2 text-[10px] font-700 uppercase tracking-[0.08em] text-text-3/60">
171
+ Source
172
+ <select
173
+ value={sourceFilter}
174
+ onChange={(event) => setSourceFilter(event.target.value)}
175
+ className="rounded-[8px] border border-white/[0.06] bg-white/[0.03] px-2 py-1.5 text-[11px] font-600 normal-case tracking-normal text-text outline-none"
176
+ >
177
+ <option value="all">All sources</option>
178
+ {sources.map((source) => (
179
+ <option key={source} value={source}>{source}</option>
180
+ ))}
181
+ </select>
182
+ </label>
183
+ </div>
133
184
  </div>
134
185
 
135
186
  {/* Count */}
@@ -98,6 +98,7 @@ function CommandPaletteInner({ setOpen }: { setOpen: (v: boolean) => void }) {
98
98
  { id: 'providers', label: 'Providers', description: 'Model providers and endpoints', keywords: ['openai', 'anthropic', 'ollama', 'endpoint'] },
99
99
  { id: 'secrets', label: 'Secrets', description: 'Credentials and encrypted secrets', keywords: ['api key', 'token', 'credential'] },
100
100
  { id: 'autonomy', label: 'Autonomy', description: 'Estops, incidents, and runtime controls', keywords: ['estop', 'incident', 'runtime', 'safety'] },
101
+ { id: 'quality', label: 'Quality', description: 'Evals, approvals, run review, and release readiness', keywords: ['eval', 'approval', 'runs', 'release', 'qa'] },
101
102
  { id: 'settings', label: 'Settings', description: 'General app configuration', keywords: ['preferences', 'theme', 'heartbeat'] },
102
103
  ] as const
103
104
  for (const view of views) {
@@ -15,6 +15,7 @@ const VIEW_TO_PATH: Record<AppView, string> = {
15
15
  memory: '/memory',
16
16
 
17
17
  tasks: '/tasks',
18
+ quality: '/quality',
18
19
  missions: '/missions',
19
20
  secrets: '/secrets',
20
21
  wallets: '/wallets',
@@ -1,7 +1,7 @@
1
1
  import assert from 'node:assert/strict'
2
2
  import { describe, it } from 'node:test'
3
3
 
4
- import { isPanelSidebarView, shouldAutoOpenPanelSidebar } from './view-constants'
4
+ import { FULL_WIDTH_VIEWS, VIEW_DESCRIPTIONS, VIEW_LABELS, isPanelSidebarView, shouldAutoOpenPanelSidebar } from './view-constants'
5
5
 
6
6
  describe('panel sidebar route helpers', () => {
7
7
  it('treats knowledge as a panel-backed view', () => {
@@ -18,4 +18,11 @@ describe('panel sidebar route helpers', () => {
18
18
  assert.equal(shouldAutoOpenPanelSidebar('settings', true), false)
19
19
  assert.equal(shouldAutoOpenPanelSidebar(null, true), false)
20
20
  })
21
+
22
+ it('registers quality as a full-width operator workspace', () => {
23
+ assert.equal(VIEW_LABELS.quality, 'Quality')
24
+ assert.match(VIEW_DESCRIPTIONS.quality, /evals/i)
25
+ assert.equal(FULL_WIDTH_VIEWS.has('quality'), true)
26
+ assert.equal(isPanelSidebarView('quality'), false)
27
+ })
21
28
  })
@@ -11,6 +11,7 @@ export const VIEW_LABELS: Record<AppView, string> = {
11
11
  memory: 'Memory',
12
12
 
13
13
  tasks: 'Tasks',
14
+ quality: 'Quality',
14
15
  missions: 'Missions',
15
16
  secrets: 'Secrets',
16
17
  wallets: 'Wallets',
@@ -59,6 +60,7 @@ export const VIEW_DESCRIPTIONS: Record<AppView, string> = {
59
60
  memory: 'Long-term agent memory store',
60
61
 
61
62
  tasks: 'Task board for agent work and queued runs',
63
+ quality: 'Operator quality center for evals, approvals, run review, and release readiness',
62
64
  missions: 'Autonomous goal-driven agent runs with budgets and morning reports',
63
65
  secrets: 'API keys, tokens, and encrypted credentials',
64
66
  wallets: 'Crypto wallets for agent-initiated on-chain transactions',
@@ -118,6 +120,12 @@ export const VIEW_EMPTY_STATES: Record<Exclude<AppView, 'agents' | 'home'>, { ic
118
120
  description: 'A kanban board for managing agent work. Create tasks, assign them to agents, and track progress.',
119
121
  features: ['Kanban columns: Backlog, Queued, Running, Completed, Failed', 'Assign tasks to specific agents', 'Track retries, results, and logs', 'Review status without leaving the board'],
120
122
  },
123
+ quality: {
124
+ icon: 'badge-check',
125
+ title: 'Quality',
126
+ description: 'Operator center for trusting autonomous agents before, during, and after a release.',
127
+ features: ['Review run health, failed work, and pending approvals', 'Run scenario and suite evals against selected agents', 'Approve or reject human-loop, tool, connector, and skill requests', 'Inspect replay evidence from recent agent runs'],
128
+ },
121
129
  missions: {
122
130
  icon: 'target',
123
131
  title: 'Missions',
@@ -243,7 +251,7 @@ export const VIEW_EMPTY_STATES: Record<Exclude<AppView, 'agents' | 'home'>, { ic
243
251
  export const FULL_WIDTH_VIEWS = new Set<AppView>([
244
252
  'home', 'org_chart', 'inbox', 'chatrooms', 'protocols', 'schedules', 'secrets', 'wallets', 'providers', 'skills',
245
253
  'connectors', 'webhooks', 'mcp_servers', 'knowledge', 'extensions',
246
- 'usage', 'runs', 'autonomy', 'logs', 'settings', 'activity', 'projects', 'swarmfeed', 'marketplace', 'missions',
254
+ 'usage', 'runs', 'quality', 'autonomy', 'logs', 'settings', 'activity', 'projects', 'swarmfeed', 'marketplace', 'missions',
247
255
  ])
248
256
 
249
257
  export const PANEL_SIDEBAR_VIEWS = new Set<AppView>([
@@ -0,0 +1,122 @@
1
+ import assert from 'node:assert/strict'
2
+ import { describe, it } from 'node:test'
3
+
4
+ import {
5
+ buildQualityOverviewSummary,
6
+ groupApprovalsByCategory,
7
+ summarizeEvalRuns,
8
+ summarizeRunHealth,
9
+ } from './quality-summary'
10
+ import type { EvalRun } from '@/lib/server/eval/types'
11
+ import type { ApprovalRequest, SessionRunRecord } from '@/types'
12
+
13
/**
 * Test fixture: builds a `SessionRunRecord` with sensible defaults.
 *
 * Overrides are spread first and the defaulted fields are written afterwards,
 * so a key that is present but `undefined` falls back to its default instead
 * of leaving a required field undefined. `??` is used so that falsy but valid
 * values (`''`, `0`, `false`) supplied by the caller are preserved.
 *
 * @param overrides - Fields to replace on the default record.
 * @returns A complete run record for use in assertions.
 */
function run(overrides: Partial<SessionRunRecord>): SessionRunRecord {
  return {
    ...overrides,
    id: overrides.id ?? 'run_1',
    sessionId: overrides.sessionId ?? 'sess_1',
    source: overrides.source ?? 'chat',
    internal: overrides.internal ?? false,
    mode: overrides.mode ?? 'direct',
    status: overrides.status ?? 'completed',
    messagePreview: overrides.messagePreview ?? 'hello',
    queuedAt: overrides.queuedAt ?? 1000,
  }
}
26
+
27
/**
 * Test fixture: builds an `EvalRun` with sensible defaults.
 *
 * Defaults use `??` so that only missing (`undefined`/`null`) fields are
 * filled in; falsy but valid values such as a score of `0` are kept.
 * `endedAt`, `sessionId` and `error` are passed through unchanged and stay
 * `undefined` when not supplied.
 *
 * @param overrides - Fields to replace on the default eval run.
 * @returns A complete eval run (8/10, completed, started at 1000 by default).
 */
function evalRun(overrides: Partial<EvalRun>): EvalRun {
  return {
    id: overrides.id ?? 'eval_1',
    scenarioId: overrides.scenarioId ?? 'coding-prime',
    agentId: overrides.agentId ?? 'agent_1',
    status: overrides.status ?? 'completed',
    startedAt: overrides.startedAt ?? 1000,
    endedAt: overrides.endedAt,
    score: overrides.score ?? 8,
    maxScore: overrides.maxScore ?? 10,
    details: overrides.details ?? [],
    sessionId: overrides.sessionId,
    error: overrides.error,
  }
}
42
+
43
/**
 * Test fixture: builds an `ApprovalRequest` with sensible defaults.
 *
 * Defaults use `??` so only missing fields are filled in. Optional fields
 * (`description`, `agentId`, `sessionId`, `taskId`) are passed through and
 * stay `undefined` when not supplied.
 *
 * @param overrides - Fields to replace on the default approval.
 * @returns A pending `human_loop` approval created at 1000 unless overridden.
 */
function approval(overrides: Partial<ApprovalRequest>): ApprovalRequest {
  return {
    id: overrides.id ?? 'approval_1',
    category: overrides.category ?? 'human_loop',
    title: overrides.title ?? 'Review request',
    description: overrides.description,
    data: overrides.data ?? {},
    createdAt: overrides.createdAt ?? 1000,
    updatedAt: overrides.updatedAt ?? 1000,
    status: overrides.status ?? 'pending',
    agentId: overrides.agentId,
    sessionId: overrides.sessionId,
    taskId: overrides.taskId,
  }
}
58
+
59
// Verifies per-status counting and the newest-first ordering of failed runs.
describe('summarizeRunHealth', () => {
  it('counts run statuses and keeps the most recent failed runs', () => {
    const summary = summarizeRunHealth([
      run({ id: 'old-failed', status: 'failed', queuedAt: 1000 }),
      run({ id: 'running', status: 'running', queuedAt: 2000 }),
      run({ id: 'new-failed', status: 'failed', queuedAt: 3000 }),
      run({ id: 'completed', status: 'completed', queuedAt: 4000 }),
    ])

    assert.equal(summary.total, 4)
    assert.equal(summary.byStatus.failed, 2)
    assert.equal(summary.byStatus.running, 1)
    // Only the running run is active; no fixture is queued.
    assert.equal(summary.activeCount, 1)
    // needsAttentionCount mirrors the failed count.
    assert.equal(summary.needsAttentionCount, 2)
    // Fixtures set only queuedAt, so ordering falls back to queue time, newest first.
    assert.deepEqual(summary.recentFailures.map((item) => item.id), ['new-failed', 'old-failed'])
  })
})
76
+
77
// Verifies that only completed evals feed the score aggregates.
describe('summarizeEvalRuns', () => {
  it('summarizes completed evals and ignores failed runs for score averages', () => {
    const summary = summarizeEvalRuns([
      evalRun({ id: 'low', score: 4, maxScore: 10, startedAt: 2000 }),
      evalRun({ id: 'failed', status: 'failed', score: 0, maxScore: 10, startedAt: 3000 }),
      evalRun({ id: 'high', score: 9, maxScore: 10, startedAt: 4000 }),
    ])

    assert.equal(summary.totalRuns, 3)
    assert.equal(summary.completedRuns, 2)
    assert.equal(summary.failedRuns, 1)
    // 'high' has the largest startedAt and no fixture sets endedAt.
    assert.equal(summary.latestRun?.id, 'high')
    // Mean of 40% and 90%; the failed run's 0% is excluded.
    assert.equal(summary.averagePercent, 65)
    assert.equal(summary.latestCompletedPercent, 90)
  })
})
93
+
94
// Verifies pending-only filtering, category ordering, and oldest-first ordering within a category.
describe('groupApprovalsByCategory', () => {
  it('groups pending approvals and sorts oldest first inside each category', () => {
    const grouped = groupApprovalsByCategory([
      approval({ id: 'new-human', category: 'human_loop', createdAt: 3000 }),
      approval({ id: 'approved-skill', category: 'extension_install', status: 'approved', createdAt: 1000 }),
      approval({ id: 'old-human', category: 'human_loop', createdAt: 1000 }),
      approval({ id: 'tool', category: 'tool_access', createdAt: 2000 }),
    ])

    // The already-approved extension_install request is not counted.
    assert.equal(grouped.totalPending, 3)
    // human_loop (2 pending) sorts ahead of tool_access (1 pending).
    assert.deepEqual(grouped.categories.map((category) => category.category), ['human_loop', 'tool_access'])
    assert.deepEqual(grouped.categories[0].approvals.map((item) => item.id), ['old-human', 'new-human'])
  })
})
108
+
109
// Verifies the roll-up counters that combine runs, evals, and approvals.
describe('buildQualityOverviewSummary', () => {
  it('combines runs, evals, and approvals into operator action counts', () => {
    const summary = buildQualityOverviewSummary({
      runs: [run({ status: 'failed' }), run({ status: 'running' })],
      evalRuns: [evalRun({ score: 7, maxScore: 10 })],
      approvals: [approval({}), approval({ status: 'rejected' })],
    })

    // 1 failed run + 0 failed evals + 1 pending approval.
    assert.equal(summary.needsAttention, 2)
    assert.equal(summary.pendingApprovals, 1)
    assert.equal(summary.activeRuns, 1)
    assert.equal(summary.evalAveragePercent, 70)
  })
})
@@ -0,0 +1,150 @@
1
+ import type { EvalRun } from '@/lib/server/eval/types'
2
+ import type { ApprovalCategory, ApprovalRequest, SessionRunRecord, SessionRunStatus } from '@/types'
3
+
4
/**
 * Statuses that `summarizeRunHealth` seeds with a zero count before tallying.
 * Declared `readonly` so no caller can mutate this shared module-level list.
 */
const RUN_STATUSES: readonly SessionRunStatus[] = ['queued', 'running', 'completed', 'failed', 'cancelled']
5
+
6
/** Aggregate health counters for a set of session runs, as produced by `summarizeRunHealth`. */
export interface RunHealthSummary {
  /** Number of runs that were summarized. */
  total: number
  /** Run count per status; every status in `RUN_STATUSES` is present, starting at 0. */
  byStatus: Record<SessionRunStatus, number>
  /** Runs that are `queued` or `running`. */
  activeCount: number
  /** Runs with status `failed`. */
  needsAttentionCount: number
  /** Failed runs, newest first, capped by the `recentFailureLimit` option (default 5). */
  recentFailures: SessionRunRecord[]
}
13
+
14
/** Aggregate eval results, as produced by `summarizeEvalRuns`. */
export interface EvalRunSummary {
  /** Number of eval runs that were summarized, regardless of status. */
  totalRuns: number
  /** Runs with status `completed`. */
  completedRuns: number
  /** Runs with status `failed`. */
  failedRuns: number
  /**
   * Rounded mean of the per-run rounded percentages of completed runs that
   * have a usable score; `null` when there is no such run.
   */
  averagePercent: number | null
  /** Percentage of the newest completed run; `null` when none exists or its score is unusable. */
  latestCompletedPercent: number | null
  /** Newest run of any status (by `endedAt`, else `startedAt`); `null` for an empty input. */
  latestRun: EvalRun | null
}
22
+
23
/** Pending approvals that share one category, as built by `groupApprovalsByCategory`. */
export interface ApprovalCategoryGroup {
  /** Category shared by every approval in this group. */
  category: ApprovalCategory
  /** Number of approvals in this group (equals `approvals.length`). */
  count: number
  /** Pending approvals in this category, oldest `createdAt` first. */
  approvals: ApprovalRequest[]
}
28
+
29
/** Pending approvals grouped by category, as produced by `groupApprovalsByCategory`. */
export interface ApprovalGroupSummary {
  /** Total number of approvals with status `pending`. */
  totalPending: number
  /** One group per category that has pending approvals: largest group first, ties ordered by category name. */
  categories: ApprovalCategoryGroup[]
}
33
+
34
/** Combined quality overview, as produced by `buildQualityOverviewSummary`. */
export interface QualityOverviewSummary {
  /** Run health summary for the supplied runs. */
  runHealth: RunHealthSummary
  /** Eval summary for the supplied eval runs. */
  evals: EvalRunSummary
  /** Pending approvals grouped by category. */
  approvals: ApprovalGroupSummary
  /** Failed runs + failed eval runs + pending approvals. */
  needsAttention: number
  /** Same value as `approvals.totalPending`. */
  pendingApprovals: number
  /** Same value as `runHealth.activeCount`. */
  activeRuns: number
  /** Same value as `evals.averagePercent`. */
  evalAveragePercent: number | null
}
43
+
44
/**
 * Returns the latest known timestamp for a run, used to order runs by recency.
 *
 * Prefers `endedAt`, then `startedAt`, then `queuedAt`. `??` is used rather
 * than `||` so a legitimate timestamp of `0` is not skipped.
 */
function newestRunTimestamp(run: SessionRunRecord): number {
  return run.endedAt ?? run.startedAt ?? run.queuedAt
}
47
+
48
/**
 * Returns the timestamp used to order eval runs by recency:
 * `endedAt` when set, otherwise `startedAt`.
 */
function evalTimestamp(run: EvalRun): number {
  return run.endedAt ?? run.startedAt
}
51
+
52
/**
 * Converts a score into a whole-number percentage of `maxScore`.
 *
 * @param score - Achieved score.
 * @param maxScore - Maximum achievable score.
 * @returns The rounded percentage, or `null` when either input is not finite
 *   or `maxScore` is not positive (which would otherwise yield `NaN`/`Infinity`).
 */
function percent(score: number, maxScore: number): number | null {
  if (!Number.isFinite(score) || !Number.isFinite(maxScore) || maxScore <= 0) return null
  return Math.round((score / maxScore) * 100)
}
56
+
57
/**
 * Aggregates session runs into the health counters shown to operators.
 *
 * @param runs - Runs to summarize. The array is not mutated.
 * @param opts - `recentFailureLimit` caps how many failed runs are returned
 *   (default 5). Negative values are treated as 0.
 * @returns Totals, per-status counts, and the newest failed runs.
 */
export function summarizeRunHealth(runs: SessionRunRecord[], opts: { recentFailureLimit?: number } = {}): RunHealthSummary {
  // Seed every known status with 0 so each key in RUN_STATUSES is always present.
  const byStatus = RUN_STATUSES.reduce((acc, status) => {
    acc[status] = 0
    return acc
  }, {} as Record<SessionRunStatus, number>)

  for (const run of runs) {
    // `?? 0` tolerates a status that is not listed in RUN_STATUSES.
    byStatus[run.status] = (byStatus[run.status] ?? 0) + 1
  }

  // A negative limit would make slice() count from the end and silently drop
  // the oldest failures instead of returning none, so clamp it to 0.
  const limit = Math.max(0, opts.recentFailureLimit ?? 5)

  // filter() already returns a fresh array, so sorting it in place is safe.
  const recentFailures = runs
    .filter((run) => run.status === 'failed')
    .sort((a, b) => newestRunTimestamp(b) - newestRunTimestamp(a))
    .slice(0, limit)

  return {
    total: runs.length,
    byStatus,
    activeCount: byStatus.queued + byStatus.running,
    needsAttentionCount: byStatus.failed,
    recentFailures,
  }
}
81
+
82
/**
 * Summarizes eval runs: counts by outcome, the newest run, and score
 * percentages derived from completed runs only.
 *
 * @param runs - Eval runs to summarize. The array is not mutated.
 * @returns Counts, the newest run, and average / latest completed percentages.
 */
export function summarizeEvalRuns(runs: EvalRun[]): EvalRunSummary {
  // Single-pass "max by timestamp". The strict `>` keeps the earliest-listed
  // run on ties, which is what a stable descending sort would place first.
  const newest = (items: EvalRun[]): EvalRun | null =>
    items.reduce<EvalRun | null>(
      (best, item) => (best === null || evalTimestamp(item) > evalTimestamp(best) ? item : best),
      null,
    )

  const completed = runs.filter((run) => run.status === 'completed')
  const failedRuns = runs.filter((run) => run.status === 'failed').length

  // Runs whose score cannot be expressed as a percentage are left out of the average.
  const completedPercents = completed
    .map((run) => percent(run.score, run.maxScore))
    .filter((value): value is number => value !== null)

  const latestCompleted = newest(completed)
  const latestCompletedPercent = latestCompleted
    ? percent(latestCompleted.score, latestCompleted.maxScore)
    : null

  return {
    totalRuns: runs.length,
    completedRuns: completed.length,
    failedRuns,
    averagePercent: completedPercents.length
      ? Math.round(completedPercents.reduce((sum, value) => sum + value, 0) / completedPercents.length)
      : null,
    latestCompletedPercent,
    latestRun: newest(runs),
  }
}
110
+
111
/**
 * Groups pending approval requests by category.
 *
 * Non-pending approvals are ignored. Within each category approvals are
 * ordered oldest first by `createdAt`; categories are ordered by descending
 * size, with ties broken by category name.
 *
 * @param approvals - Approval requests in any status. The array is not mutated.
 * @returns The pending total and one group per category that has pending items.
 */
export function groupApprovalsByCategory(approvals: ApprovalRequest[]): ApprovalGroupSummary {
  // filter() returns a new array, so the in-place sort never touches the caller's input.
  const pending = approvals
    .filter((approval) => approval.status === 'pending')
    .sort((a, b) => a.createdAt - b.createdAt)

  // Map preserves insertion order; items are appended oldest first, so each
  // bucket keeps the createdAt ordering.
  const groups = new Map<ApprovalCategory, ApprovalRequest[]>()
  for (const approval of pending) {
    const bucket = groups.get(approval.category)
    if (bucket) {
      bucket.push(approval)
    } else {
      groups.set(approval.category, [approval])
    }
  }

  return {
    totalPending: pending.length,
    categories: Array.from(groups.entries())
      .map(([category, items]) => ({ category, count: items.length, approvals: items }))
      .sort((a, b) => b.count - a.count || a.category.localeCompare(b.category)),
  }
}
131
+
132
/**
 * Builds the combined quality overview from runs, eval runs, and approvals.
 *
 * Each input is summarized independently; the top-level counters are
 * convenience roll-ups of those summaries.
 *
 * @param params - `runs`, `evalRuns`, and `approvals` to summarize.
 * @returns The three sub-summaries plus the operator action counters.
 */
export function buildQualityOverviewSummary(params: {
  runs: SessionRunRecord[]
  evalRuns: EvalRun[]
  approvals: ApprovalRequest[]
}): QualityOverviewSummary {
  const runHealth = summarizeRunHealth(params.runs)
  const evals = summarizeEvalRuns(params.evalRuns)
  const approvals = groupApprovalsByCategory(params.approvals)

  return {
    runHealth,
    evals,
    approvals,
    // Everything an operator has to act on: failed runs, failed evals, pending approvals.
    needsAttention: runHealth.needsAttentionCount + evals.failedRuns + approvals.totalPending,
    pendingApprovals: approvals.totalPending,
    activeRuns: runHealth.activeCount,
    evalAveragePercent: evals.averagePercent,
  }
}
@@ -74,6 +74,28 @@ describe('mission-templates: registry', () => {
74
74
  assert.ok(template.defaults.successCriteria.some((item) => item.includes('Product Hunt')))
75
75
  })
76
76
 
77
+ it('includes operator quality release templates', () => {
78
+ const expected = [
79
+ 'release-candidate-qa',
80
+ 'agent-cost-audit',
81
+ 'connector-smoke-test',
82
+ 'failed-run-triage',
83
+ 'weekly-agent-quality-report',
84
+ ]
85
+
86
+ for (const id of expected) {
87
+ const template = templates.getMissionTemplate(id)
88
+ assert.ok(template, `expected ${id} template`)
89
+ assert.ok(template.tags.includes('quality') || template.tags.includes('operator-quality'), `${id} should be quality tagged`)
90
+ assert.ok(template.defaults.goal.includes('approval') || template.defaults.goal.includes('evidence'), `${id} should preserve operator guardrails`)
91
+ assert.ok(template.defaults.budget.maxWallclockSec, `${id} should have a wallclock cap`)
92
+ assert.ok(template.defaults.reportSchedule, `${id} should schedule reports`)
93
+ }
94
+
95
+ assert.equal(templates.getMissionTemplate('release-candidate-qa')?.name, 'Release Candidate QA')
96
+ assert.equal(templates.getMissionTemplate('weekly-agent-quality-report')?.category, 'monitoring')
97
+ })
98
+
77
99
  it('getMissionTemplate resolves known ids', () => {
78
100
  const list = templates.listMissionTemplates()
79
101
  const first = list[0]
@@ -187,6 +187,122 @@ export const BUILT_IN_MISSION_TEMPLATES: MissionTemplate[] = [
187
187
  reportSchedule: report(12 * HOUR),
188
188
  },
189
189
  },
190
+ {
191
+ id: 'release-candidate-qa',
192
+ name: 'Release Candidate QA',
193
+ description:
194
+ 'Collect release readiness evidence across evals, approvals, failed runs, docs, packaging, and desktop smoke gates.',
195
+ icon: '✅',
196
+ category: 'productivity',
197
+ tags: ['release', 'qa', 'evals', 'operator-quality'],
198
+ setupNote:
199
+ 'Set the target version and release branch in the goal. Keep publishing, tagging, and merging behind explicit human approval.',
200
+ defaults: {
201
+ title: 'Release Candidate QA',
202
+ goal:
203
+ 'Prepare a release candidate quality report for the target SwarmClaw version. Review recent failed runs, pending approvals, latest eval results, release notes, package metadata, install instructions, CI/build status, and desktop packaging notes. Summarize blockers, risk level, evidence links, and a go/no-go recommendation. Do not merge, tag, publish, deploy, or post publicly without explicit approval.',
204
+ successCriteria: [
205
+ 'Failed runs and pending approvals are reviewed with evidence or clear no-findings notes',
206
+ 'Eval coverage, score trends, and any failed criteria are summarized',
207
+ 'Release notes, package metadata, install pins, and desktop smoke requirements are checked',
208
+ 'Final report includes blockers, risks, follow-up tasks, and a go/no-go recommendation',
209
+ ],
210
+ budget: budget({ maxUsd: 2, maxTokens: 120_000, maxTurns: 160, maxWallclockSec: DAY }),
211
+ reportSchedule: report(6 * HOUR),
212
+ },
213
+ },
214
+ {
215
+ id: 'agent-cost-audit',
216
+ name: 'Agent Cost Audit',
217
+ description:
218
+ 'Inspect agent/provider spend, token usage, and high-cost runs, then recommend budget or routing adjustments.',
219
+ icon: '💸',
220
+ category: 'monitoring',
221
+ tags: ['cost', 'usage', 'budget', 'quality'],
222
+ setupNote:
223
+ 'Add any budget targets, providers, or agents that need special attention before starting.',
224
+ defaults: {
225
+ title: 'Agent Cost Audit',
226
+ goal:
227
+ 'Audit recent SwarmClaw agent costs and token usage. Identify top-spend agents, expensive runs, provider anomalies, retry loops, and avoidable tool calls. Produce a markdown report with recommended budget caps, model routing changes, and follow-up quality checks. Do not change budgets or provider settings without approval.',
228
+ successCriteria: [
229
+ 'Top cost drivers are listed with agent, provider, source, and supporting evidence',
230
+ 'At least 3 concrete cost-control recommendations are included',
231
+ 'Any suspected runaway, retry, or noisy automation pattern is flagged',
232
+ ],
233
+ budget: budget({ maxUsd: 1.5, maxTokens: 80_000, maxTurns: 100, maxWallclockSec: DAY }),
234
+ reportSchedule: report(DAY),
235
+ },
236
+ },
237
+ {
238
+ id: 'connector-smoke-test',
239
+ name: 'Connector Smoke Test',
240
+ description:
241
+ 'Verify configured connector health, delivery paths, approval boundaries, and recent connector-linked run evidence.',
242
+ icon: '🔌',
243
+ category: 'monitoring',
244
+ tags: ['connectors', 'smoke-test', 'approval', 'quality'],
245
+ setupNote:
246
+ 'Name the connectors and channels to test. Keep outbound messages or public replies approval-gated.',
247
+ defaults: {
248
+ title: 'Connector Smoke Test',
249
+ goal:
250
+ 'Smoke test configured SwarmClaw connectors. Check connector status, recent inbound/outbound activity, approval requirements, related failed runs, and any available logs. Draft a concise pass/fail report per connector with evidence and remediation steps. Do not send public replies or change connector settings without approval.',
251
+ successCriteria: [
252
+ 'Each targeted connector receives a pass, warn, or fail status',
253
+ 'Recent connector-linked failures or delivery issues are summarized with evidence',
254
+ 'Approval boundaries for outbound replies or sender permissions are explicitly checked',
255
+ ],
256
+ budget: budget({ maxUsd: 1.25, maxTokens: 70_000, maxTurns: 90, maxWallclockSec: 12 * HOUR }),
257
+ reportSchedule: report(6 * HOUR),
258
+ },
259
+ },
260
+ {
261
+ id: 'failed-run-triage',
262
+ name: 'Failed Run Triage',
263
+ description:
264
+ 'Review recent failed runs, cluster root causes, and propose fixes with replay evidence.',
265
+ icon: '🧯',
266
+ category: 'support',
267
+ tags: ['runs', 'triage', 'debugging', 'quality'],
268
+ setupNote:
269
+ 'Optionally narrow the mission to a source, agent, task, or release window.',
270
+ defaults: {
271
+ title: 'Failed Run Triage',
272
+ goal:
273
+ 'Triage recent failed SwarmClaw runs. Inspect run records, replay events, errors, retrieval evidence, source, owner, and timing. Cluster failures by likely root cause and write a prioritized remediation report with reproduction notes where possible. Do not modify code or settings unless explicitly asked.',
274
+ successCriteria: [
275
+ 'Recent failed runs are grouped by likely root cause',
276
+ 'Each high-priority failure includes evidence from the run record or replay',
277
+ 'Remediation recommendations are prioritized by user impact and confidence',
278
+ ],
279
+ budget: budget({ maxUsd: 1.5, maxTokens: 90_000, maxTurns: 120, maxWallclockSec: DAY }),
280
+ reportSchedule: report(6 * HOUR),
281
+ },
282
+ },
283
+ {
284
+ id: 'weekly-agent-quality-report',
285
+ name: 'Weekly Agent Quality Report',
286
+ description:
287
+ 'Produce a weekly operator report across eval trends, approvals, failed runs, missions, cost, and release risk.',
288
+ icon: '📈',
289
+ category: 'monitoring',
290
+ tags: ['weekly', 'quality', 'report', 'evals'],
291
+ setupNote:
292
+ 'Set the week or workspace scope in the goal if you want a narrower report.',
293
+ defaults: {
294
+ title: 'Weekly Agent Quality Report',
295
+ goal:
296
+ 'Produce a weekly SwarmClaw agent quality report. Summarize eval trends, failed and recovered runs, pending or high-risk approvals, mission outcomes, cost changes, connector health, and release-readiness risks. Include a short executive summary and a prioritized action list for the next week.',
297
+ successCriteria: [
298
+ 'Report includes eval, run, approval, mission, connector, and cost sections',
299
+ 'Top quality risks and regressions are clearly ranked',
300
+ 'Next-week action items are specific and tied to evidence',
301
+ ],
302
+ budget: budget({ maxUsd: 3, maxTokens: 180_000, maxTurns: 180, maxWallclockSec: 7 * DAY }),
303
+ reportSchedule: report(DAY),
304
+ },
305
+ },
190
306
  {
191
307
  id: 'hello-world-demo',
192
308
  name: 'Hello World Demo',
@@ -231,4 +231,4 @@ export type SessionTool =
231
231
  | 'crawl'
232
232
 
233
233
  export type SessionType = 'human'
234
- export type AppView = 'home' | 'agents' | 'org_chart' | 'inbox' | 'chatrooms' | 'protocols' | 'schedules' | 'memory' | 'tasks' | 'secrets' | 'wallets' | 'providers' | 'skills' | 'connectors' | 'webhooks' | 'mcp_servers' | 'knowledge' | 'extensions' | 'usage' | 'runs' | 'autonomy' | 'logs' | 'settings' | 'projects' | 'activity' | 'swarmfeed' | 'marketplace' | 'missions'
234
+ export type AppView = 'home' | 'agents' | 'org_chart' | 'inbox' | 'chatrooms' | 'protocols' | 'schedules' | 'memory' | 'tasks' | 'quality' | 'secrets' | 'wallets' | 'providers' | 'skills' | 'connectors' | 'webhooks' | 'mcp_servers' | 'knowledge' | 'extensions' | 'usage' | 'runs' | 'autonomy' | 'logs' | 'settings' | 'projects' | 'activity' | 'swarmfeed' | 'marketplace' | 'missions'