@swarmclawai/swarmclaw 0.7.3 → 0.7.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (152) hide show
  1. package/README.md +47 -40
  2. package/bin/package-manager.js +157 -0
  3. package/bin/package-manager.test.js +90 -0
  4. package/bin/server-cmd.js +38 -7
  5. package/bin/swarmclaw.js +54 -4
  6. package/bin/update-cmd.js +48 -10
  7. package/bin/update-cmd.test.js +55 -0
  8. package/package.json +8 -3
  9. package/scripts/postinstall.mjs +26 -0
  10. package/src/app/api/agents/[id]/route.ts +17 -0
  11. package/src/app/api/agents/[id]/thread/route.ts +4 -87
  12. package/src/app/api/agents/route.ts +23 -1
  13. package/src/app/api/auth/route.ts +1 -1
  14. package/src/app/api/chatrooms/[id]/chat/route.ts +16 -5
  15. package/src/app/api/chatrooms/[id]/pins/route.ts +2 -1
  16. package/src/app/api/chatrooms/[id]/reactions/route.ts +2 -1
  17. package/src/app/api/chatrooms/[id]/route.ts +6 -0
  18. package/src/app/api/chats/[id]/route.ts +12 -0
  19. package/src/app/api/chats/heartbeat/route.ts +2 -1
  20. package/src/app/api/chats/route.ts +7 -1
  21. package/src/app/api/external-agents/[id]/heartbeat/route.ts +33 -0
  22. package/src/app/api/external-agents/[id]/route.ts +31 -0
  23. package/src/app/api/external-agents/register/route.ts +3 -0
  24. package/src/app/api/external-agents/route.ts +66 -0
  25. package/src/app/api/gateways/[id]/health/route.ts +28 -0
  26. package/src/app/api/gateways/[id]/route.ts +79 -0
  27. package/src/app/api/gateways/route.ts +57 -0
  28. package/src/app/api/openclaw/gateway/route.ts +10 -7
  29. package/src/app/api/openclaw/skills/route.ts +1 -1
  30. package/src/app/api/providers/[id]/discover-models/route.ts +27 -0
  31. package/src/app/api/schedules/[id]/route.ts +38 -9
  32. package/src/app/api/schedules/route.ts +51 -28
  33. package/src/app/api/settings/route.ts +6 -10
  34. package/src/app/api/setup/doctor/route.ts +6 -4
  35. package/src/app/api/tasks/[id]/route.ts +2 -1
  36. package/src/app/api/tasks/bulk/route.ts +2 -2
  37. package/src/app/page.tsx +126 -15
  38. package/src/cli/binary.test.js +142 -0
  39. package/src/cli/index.js +34 -11
  40. package/src/cli/index.test.js +195 -0
  41. package/src/cli/index.ts +20 -4
  42. package/src/cli/server-cmd.test.js +59 -0
  43. package/src/cli/spec.js +20 -2
  44. package/src/components/agents/agent-sheet.tsx +249 -7
  45. package/src/components/agents/inspector-panel.tsx +3 -2
  46. package/src/components/agents/sandbox-env-panel.tsx +4 -1
  47. package/src/components/auth/setup-wizard.tsx +970 -275
  48. package/src/components/chat/chat-area.tsx +41 -14
  49. package/src/components/chat/chat-card.tsx +2 -1
  50. package/src/components/chat/chat-header.tsx +8 -13
  51. package/src/components/chat/chat-list.tsx +58 -20
  52. package/src/components/chat/message-list.tsx +142 -18
  53. package/src/components/chatrooms/chatroom-input.tsx +96 -33
  54. package/src/components/chatrooms/chatroom-list.tsx +141 -72
  55. package/src/components/chatrooms/chatroom-message.tsx +7 -6
  56. package/src/components/chatrooms/chatroom-sheet.tsx +13 -1
  57. package/src/components/chatrooms/chatroom-tool-request-banner.tsx +5 -2
  58. package/src/components/chatrooms/chatroom-view.tsx +157 -86
  59. package/src/components/chatrooms/reaction-picker.tsx +38 -33
  60. package/src/components/gateways/gateway-sheet.tsx +567 -0
  61. package/src/components/input/chat-input.tsx +135 -86
  62. package/src/components/layout/app-layout.tsx +2 -0
  63. package/src/components/memory/memory-browser.tsx +71 -6
  64. package/src/components/memory/memory-card.tsx +18 -0
  65. package/src/components/memory/memory-detail.tsx +58 -31
  66. package/src/components/memory/memory-sheet.tsx +32 -4
  67. package/src/components/projects/project-detail.tsx +7 -2
  68. package/src/components/providers/provider-list.tsx +158 -2
  69. package/src/components/providers/provider-sheet.tsx +81 -70
  70. package/src/components/shared/bottom-sheet.tsx +31 -15
  71. package/src/components/shared/confirm-dialog.tsx +45 -30
  72. package/src/components/shared/model-combobox.tsx +90 -8
  73. package/src/components/shared/settings/section-heartbeat.tsx +11 -6
  74. package/src/components/shared/settings/section-orchestrator.tsx +3 -0
  75. package/src/components/shared/settings/settings-page.tsx +5 -3
  76. package/src/components/tasks/approvals-panel.tsx +7 -1
  77. package/src/components/ui/dialog.tsx +2 -2
  78. package/src/components/wallets/wallet-approval-dialog.tsx +59 -54
  79. package/src/lib/heartbeat-defaults.ts +48 -0
  80. package/src/lib/memory-presentation.ts +59 -0
  81. package/src/lib/provider-model-discovery-client.ts +29 -0
  82. package/src/lib/providers/index.ts +12 -5
  83. package/src/lib/runtime-loop.ts +105 -3
  84. package/src/lib/safe-storage.ts +6 -1
  85. package/src/lib/server/agent-runtime-config.test.ts +141 -0
  86. package/src/lib/server/agent-runtime-config.ts +277 -0
  87. package/src/lib/server/agent-thread-session.test.ts +85 -0
  88. package/src/lib/server/agent-thread-session.ts +123 -0
  89. package/src/lib/server/approvals-auto-approve.test.ts +59 -0
  90. package/src/lib/server/build-llm.test.ts +13 -5
  91. package/src/lib/server/chat-execution-tool-events.test.ts +87 -2
  92. package/src/lib/server/chat-execution.ts +159 -71
  93. package/src/lib/server/chatroom-helpers.test.ts +7 -0
  94. package/src/lib/server/chatroom-helpers.ts +99 -6
  95. package/src/lib/server/chatroom-session-persistence.test.ts +87 -0
  96. package/src/lib/server/connectors/manager.ts +89 -61
  97. package/src/lib/server/connectors/slack.ts +1 -1
  98. package/src/lib/server/daemon-state.ts +3 -2
  99. package/src/lib/server/data-dir.test.ts +56 -0
  100. package/src/lib/server/data-dir.ts +15 -9
  101. package/src/lib/server/eval/agent-regression.test.ts +47 -0
  102. package/src/lib/server/eval/agent-regression.ts +1742 -0
  103. package/src/lib/server/eval/runner.ts +11 -1
  104. package/src/lib/server/eval/store.ts +2 -1
  105. package/src/lib/server/heartbeat-service.ts +23 -8
  106. package/src/lib/server/heartbeat-wake.ts +6 -2
  107. package/src/lib/server/main-agent-loop.ts +13 -6
  108. package/src/lib/server/openclaw-exec-config.ts +4 -2
  109. package/src/lib/server/openclaw-gateway.ts +123 -36
  110. package/src/lib/server/orchestrator-lg.ts +1 -2
  111. package/src/lib/server/orchestrator.ts +3 -2
  112. package/src/lib/server/plugins.test.ts +9 -1
  113. package/src/lib/server/plugins.ts +12 -2
  114. package/src/lib/server/provider-model-discovery.ts +481 -0
  115. package/src/lib/server/queue.ts +1 -1
  116. package/src/lib/server/runtime-settings.test.ts +119 -0
  117. package/src/lib/server/runtime-settings.ts +12 -92
  118. package/src/lib/server/schedule-normalization.ts +187 -0
  119. package/src/lib/server/session-tools/autonomy-tools.test.ts +23 -0
  120. package/src/lib/server/session-tools/crud.ts +27 -3
  121. package/src/lib/server/session-tools/discovery-approvals.test.ts +170 -0
  122. package/src/lib/server/session-tools/discovery.ts +18 -8
  123. package/src/lib/server/session-tools/file-normalize.test.ts +5 -0
  124. package/src/lib/server/session-tools/file.ts +8 -2
  125. package/src/lib/server/session-tools/http.ts +9 -3
  126. package/src/lib/server/session-tools/index.ts +31 -1
  127. package/src/lib/server/session-tools/manage-schedules.test.ts +137 -0
  128. package/src/lib/server/session-tools/monitor.ts +14 -7
  129. package/src/lib/server/session-tools/openclaw-nodes.test.ts +111 -0
  130. package/src/lib/server/session-tools/openclaw-nodes.ts +86 -20
  131. package/src/lib/server/session-tools/platform.ts +1 -1
  132. package/src/lib/server/session-tools/plugin-creator.ts +9 -2
  133. package/src/lib/server/session-tools/sandbox.ts +51 -92
  134. package/src/lib/server/session-tools/session-info.ts +22 -1
  135. package/src/lib/server/session-tools/session-tools-wiring.test.ts +23 -0
  136. package/src/lib/server/session-tools/shell.ts +2 -2
  137. package/src/lib/server/session-tools/subagent.ts +3 -1
  138. package/src/lib/server/session-tools/web.ts +73 -30
  139. package/src/lib/server/storage.ts +29 -3
  140. package/src/lib/server/stream-agent-chat.test.ts +61 -0
  141. package/src/lib/server/stream-agent-chat.ts +139 -4
  142. package/src/lib/server/structured-extract.ts +1 -1
  143. package/src/lib/server/task-mention.ts +0 -1
  144. package/src/lib/server/tool-aliases.ts +37 -6
  145. package/src/lib/server/tool-capability-policy.ts +1 -1
  146. package/src/lib/setup-defaults.ts +352 -11
  147. package/src/lib/tool-definitions.ts +3 -4
  148. package/src/lib/validation/schemas.ts +55 -1
  149. package/src/stores/use-app-store.ts +43 -1
  150. package/src/stores/use-chatroom-store.ts +153 -26
  151. package/src/types/index.ts +189 -6
  152. package/src/app/api/chats/[id]/main-loop/route.ts +0 -13
@@ -0,0 +1,1742 @@
1
+ import fs from 'node:fs'
2
+ import http, { type IncomingMessage, type Server as HttpServer, type ServerResponse } from 'node:http'
3
+ import net, { type AddressInfo } from 'node:net'
4
+ import { createHash } from 'node:crypto'
5
+ import path from 'node:path'
6
+ import { genId } from '@/lib/id'
7
+ import type { ApprovalRequest, MessageToolEvent, Session } from '@/types'
8
+ import { submitDecision } from '../approvals'
9
+ import { executeSessionChatTurn, type ExecuteChatTurnResult } from '../chat-execution'
10
+ import { WORKSPACE_DIR } from '../data-dir'
11
+ import { getPluginManager } from '../plugins'
12
+ import { sendMailboxEnvelope, listMailbox } from '../session-mailbox'
13
+ import { processDueWatchJobs } from '../watch-jobs'
14
+ import {
15
+ deleteApproval,
16
+ deleteBrowserSession,
17
+ deleteDelegationJob,
18
+ deleteWatchJob,
19
+ decryptKey,
20
+ loadAgents,
21
+ loadApprovals,
22
+ loadDelegationJobs,
23
+ loadSchedules,
24
+ loadSecrets,
25
+ loadSessions,
26
+ loadSettings,
27
+ loadTasks,
28
+ loadWatchJobs,
29
+ saveSchedules,
30
+ saveSecrets,
31
+ saveSessions,
32
+ saveSettings,
33
+ saveTasks,
34
+ } from '../storage'
35
+
36
/** Approval-handling mode a regression scenario is run under. */
export type RegressionApprovalMode = 'manual' | 'auto' | 'off'

/** One named pass/fail check recorded for a scenario. */
export interface RegressionAssertion {
  name: string
  passed: boolean
  /** Optional free-form explanation of the outcome. */
  details?: string
  /** Optional weight; presumably feeds the scenario score (scoring code is not in view). */
  weight?: number
}

/** Outcome and collected evidence for a single scenario run. */
export interface AgentRegressionScenarioResult {
  scenarioId: string
  name: string
  approvalMode: RegressionApprovalMode
  status: 'passed' | 'failed'
  score: number
  maxScore: number
  assertions: RegressionAssertion[]
  sessionId: string
  workspaceDir: string
  toolNames: string[]
  approvalIds: string[]
  approvals: RegressionApprovalEvidence[]
  responseTexts: string[]
  turns: RegressionTurnEvidence[]
  artifacts: RegressionArtifactEvidence[]
  /** Locations of the persisted evidence for this scenario. */
  evidencePaths: {
    transcript: string
    approvals: string
    workspace: string
  }
}

/** Aggregate result of one suite run across scenarios and approval modes. */
export interface AgentRegressionSuiteResult {
  id: string
  agentId: string
  approvalModes: RegressionApprovalMode[]
  startedAt: number
  endedAt: number
  score: number
  maxScore: number
  scenarios: AgentRegressionScenarioResult[]
  resultsPath: string
}

/** Per-scenario state handed to a scenario's `run` function. */
interface ScenarioContext {
  suiteId: string
  agentId: string
  agent: Record<string, unknown>
  approvalMode: RegressionApprovalMode
  sessionId: string
  workspaceDir: string
  responseTexts: string[]
  toolEvents: MessageToolEvent[]
  toolNames: Set<string>
  turns: RegressionTurnEvidence[]
}

/** Static description of a scenario plus the function that executes it. */
interface AgentRegressionScenarioDefinition {
  id: string
  name: string
  plugins: string[]
  run: (ctx: ScenarioContext) => Promise<AgentRegressionScenarioResult>
}

/** Account record kept by the mock mail provider after a signup. */
interface MockMailAccount {
  email: string
  /** Password submitted in the signup form. */
  chosenPassword: string
  /** Provider-generated app password shown on the dashboard. */
  appPassword: string
  /** Invite code the mock social signup checks against. */
  inviteCode: string
}

/** Account record kept by the mock social service after an invite-code signup. */
interface MockSocialAccount {
  email: string
  handle: string
  password: string
  inviteCode: string
}

/** Signup that must be confirmed with a verification code before it counts as complete. */
interface MockVerifiedSignup {
  token: string
  email: string
  handle: string
  password: string
  verificationCode: string
  recoveryToken: string
  /** Becomes true once the correct verification code has been submitted. */
  verified: boolean
}

/** Handle to the running mock signup HTTP server and its in-memory state. */
interface MockSignupHarness {
  baseUrl: string
  close: () => Promise<void>
  state: {
    mailAccounts: Map<string, MockMailAccount>
    socialAccounts: Map<string, MockSocialAccount>
    pendingVerifiedSignups: Map<string, MockVerifiedSignup>
  }
}

/** One message captured by the mock SMTP server. */
interface MockSmtpMessage {
  mailFrom: string
  recipients: string[]
  data: string
}

/** Handle to the running mock SMTP server and the messages it has captured. */
interface MockSmtpHarness {
  port: number
  messages: MockSmtpMessage[]
  close: () => Promise<void>
}

/** Handle to the running mock research/deploy HTTP server. */
interface MockResearchDeployHarness {
  baseUrl: string
  close: () => Promise<void>
  state: {
    /** Recorded deployments; key/value meaning is set by the deploy handler (not fully in view). */
    deployments: Map<string, string>
  }
}

/** Trimmed-down record of a single tool invocation. */
export interface RegressionToolEventEvidence {
  name: string
  input?: string
  output?: string
  error?: boolean | string
}

/** Evidence captured for one prompt/response turn. */
export interface RegressionTurnEvidence {
  prompt: string
  responseText: string
  toolEvents: RegressionToolEventEvidence[]
  approvalIds: string[]
}

/** Snapshot of an expected workspace artifact. */
export interface RegressionArtifactEvidence {
  relativePath: string
  exists: boolean
  size: number
  sha256?: string
  preview?: string
}

/** Summary of an approval request observed during a scenario. */
export interface RegressionApprovalEvidence {
  id: string
  category: string
  status: string
  title: string
  toolId: string | null
}
183
+
184
/**
 * Creates `dir` together with any missing parent directories.
 * Succeeds silently when the directory already exists.
 */
function ensureDir(dir: string): void {
  const options = { recursive: true } as const
  fs.mkdirSync(dir, options)
}
187
+
188
/**
 * Escapes text so it can be interpolated into HTML element content or into a
 * quoted attribute value.
 *
 * `&`, `<`, `>`, `"` and `'` are replaced in a single pass. The apostrophe is
 * included so the result is also safe inside single-quoted attributes.
 *
 * @param value Raw, untrusted text.
 * @returns The text with HTML-significant characters replaced by entities.
 */
function escapeHtml(value: string): string {
  const entities: Record<string, string> = {
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;',
    '"': '&quot;',
    "'": '&#39;',
  }
  // One pass over the input: an ampersand produced by a replacement is never re-escaped.
  return value.replace(/[&<>"']/g, (char) => entities[char] ?? char)
}
195
+
196
/**
 * Derives a short deterministic seed from `input`: the first eight hex
 * characters of its SHA-1 digest.
 */
function signupSeed(input: string): string {
  const hasher = createHash('sha1')
  hasher.update(input)
  const hexDigest = hasher.digest('hex')
  return hexDigest.slice(0, 8)
}
199
+
200
/**
 * Wraps `body` in a complete HTML page with a shared inline stylesheet.
 *
 * @param title Page title; HTML-escaped before insertion.
 * @param body Markup placed verbatim inside `<body>` (the caller is responsible for escaping it).
 * @returns The page as newline-joined text.
 */
function htmlDocument(title: string, body: string): string {
  const styleRules = [
    ' body { font-family: ui-sans-serif, system-ui, sans-serif; margin: 40px; line-height: 1.5; }',
    ' form { display: grid; gap: 12px; max-width: 420px; }',
    ' label { display: grid; gap: 6px; font-weight: 600; }',
    ' input { padding: 10px 12px; border: 1px solid #cbd5e1; border-radius: 8px; }',
    ' button, a.button { display: inline-flex; align-items: center; justify-content: center; padding: 10px 14px; border-radius: 8px; background: #0f172a; color: white; text-decoration: none; border: none; cursor: pointer; }',
    ' .card { max-width: 720px; padding: 24px; border: 1px solid #cbd5e1; border-radius: 16px; background: #fff; }',
    ' .mono { font-family: ui-monospace, SFMono-Regular, Menlo, monospace; }',
    ' .muted { color: #475569; }',
  ]
  const headLines = [
    '<head>',
    ' <meta charset="utf-8">',
    ` <title>${escapeHtml(title)}</title>`,
    ' <meta name="viewport" content="width=device-width, initial-scale=1">',
    ' <style>',
    ...styleRules,
    ' </style>',
    '</head>',
  ]
  const pageLines = [
    '<!doctype html>',
    '<html lang="en">',
    ...headLines,
    '<body>',
    body,
    '</body>',
    '</html>',
  ]
  return pageLines.join('\n')
}
225
+
226
/**
 * Sends a full HTML page as the response.
 *
 * @param res Response to write to; it is ended by this call.
 * @param statusCode HTTP status to send.
 * @param title Page title passed to `htmlDocument`.
 * @param body Markup passed to `htmlDocument` as the page body.
 */
function sendHtml(res: ServerResponse, statusCode: number, title: string, body: string): void {
  const page = htmlDocument(title, body)
  res.statusCode = statusCode
  res.setHeader('content-type', 'text/html; charset=utf-8')
  res.end(page)
}
231
+
232
/**
 * Ends the response with a 302 redirect to `location` and no body.
 */
function redirect(res: ServerResponse, location: string): void {
  const found = 302
  res.setHeader('location', location)
  res.statusCode = found
  res.end()
}
237
+
238
/**
 * Reads the whole request stream and returns it decoded as UTF-8.
 *
 * Chunks are collected as Buffers and decoded once at the end, so a multi-byte
 * character split across two chunks is still decoded correctly.
 */
async function readRequestBody(req: IncomingMessage): Promise<string> {
  const pieces: Buffer[] = []
  for await (const piece of req) {
    if (Buffer.isBuffer(piece)) {
      pieces.push(piece)
    } else {
      pieces.push(Buffer.from(piece))
    }
  }
  const whole = Buffer.concat(pieces)
  return whole.toString('utf8')
}
245
+
246
/**
 * Starts an HTTP server on 127.0.0.1 using an OS-assigned port.
 *
 * Any failure in `handler` (a rejected promise or a synchronous throw) is
 * turned into a plain-text 500 response carrying the error stack or message.
 * If the handler had already started sending the response, a 500 can no
 * longer be sent, so the response is simply ended to keep the client from
 * hanging.
 *
 * @param handler Async request handler.
 * @returns The listening server and its `http://127.0.0.1:<port>` base URL.
 * @throws If listening fails or the server does not report a TCP address.
 */
async function startLocalHttpServer(
  handler: (req: IncomingMessage, res: ServerResponse) => Promise<void>,
): Promise<{ server: HttpServer; baseUrl: string }> {
  const server = http.createServer((req, res) => {
    // The async wrapper converts a synchronous throw from `handler` into a rejection.
    void (async () => handler(req, res))().catch((error: unknown) => {
      if (res.headersSent || res.writableEnded) {
        // Headers are already on the wire: setHeader() would throw ERR_HTTP_HEADERS_SENT.
        if (!res.writableEnded) res.end()
        return
      }
      const message = error instanceof Error ? error.stack || error.message : String(error)
      res.statusCode = 500
      res.setHeader('content-type', 'text/plain; charset=utf-8')
      res.end(message)
    })
  })
  await new Promise<void>((resolve, reject) => {
    server.once('error', reject)
    server.listen(0, '127.0.0.1', () => {
      // Listening succeeded; stop treating later server errors as a startup failure.
      server.off('error', reject)
      resolve()
    })
  })
  const address = server.address()
  if (!address || typeof address === 'string') throw new Error('Mock HTTP server failed to bind to a TCP port.')
  return {
    server,
    baseUrl: `http://127.0.0.1:${address.port}`,
  }
}
271
+
272
/**
 * Stops `server` and resolves once its close callback has fired.
 * An error passed to that callback (for example when the server was never
 * listening) is ignored, so this never rejects.
 */
async function closeServer(server: HttpServer | net.Server): Promise<void> {
  const closed = new Promise<void>((done) => {
    server.close(() => {
      done()
    })
  })
  await closed
}
277
+
278
/**
 * Starts a local HTTP server that imitates three signup flows, keeping all
 * state in memory:
 *
 * - `/mail/signup` -> `/mail/dashboard`: a mail provider that derives an app
 *   password and an invite code from the submitted email address.
 * - `/social/signup` -> `/social/success`: a social service that accepts a signup
 *   only with the invite code issued for that email by the mail flow.
 * - `/verify-social/signup` -> `/verify-social/verify` -> `/verify-social/success`:
 *   a signup gated by the fixed verification code `246810`.
 *
 * Unknown routes get a 404 page.
 *
 * @returns The server's base URL, the live state maps, and a `close` function.
 */
async function startMockSignupHarness(): Promise<MockSignupHarness> {
  const state = {
    mailAccounts: new Map<string, MockMailAccount>(),
    socialAccounts: new Map<string, MockSocialAccount>(),
    pendingVerifiedSignups: new Map<string, MockVerifiedSignup>(),
  }

  // Minimal card used by the single-heading error pages.
  const headingCard = (heading: string): string => `<div class="card"><h1>${heading}</h1></div>`

  const { server, baseUrl } = await startLocalHttpServer(async (req, res) => {
    const url = new URL(req.url || '/', baseUrl)
    const pathname = url.pathname

    // Trimmed query parameter; `fallback` is used when it is missing or empty.
    const queryValue = (name: string, fallback = ''): string =>
      String(url.searchParams.get(name) || fallback).trim()

    // Parses the urlencoded body and returns a getter for trimmed field values.
    const readForm = async (): Promise<(name: string) => string> => {
      const form = new URLSearchParams(await readRequestBody(req))
      return (name) => String(form.get(name) || '').trim()
    }

    switch (`${req.method} ${pathname}`) {
      case 'GET /': {
        sendHtml(res, 200, 'Mock Services', `
          <div class="card">
            <h1>Mock External Services</h1>
            <p class="muted">Use these pages to test browser signup, secrets, and verification flows.</p>
            <p><a class="button" href="/mail/signup">Open MockMail signup</a></p>
            <p><a class="button" href="/verify-social/signup">Open Chirper verification signup</a></p>
          </div>
        `)
        return
      }

      case 'GET /mail/signup': {
        const prefilledEmail = queryValue('email')
        sendHtml(res, 200, 'MockMail Signup', `
          <div class="card">
            <h1>Create a MockMail account</h1>
            <p class="muted">This mock provider generates an app password and a social invite code after signup.</p>
            <form method="post" action="/mail/signup">
              <label>Email address
                <input id="email" name="email" type="email" autocomplete="email" value="${escapeHtml(prefilledEmail)}" required />
              </label>
              <label>Password
                <input id="password" name="password" type="password" autocomplete="new-password" value="TempMockMailPass!23" required />
              </label>
              <button id="submit" type="submit">Create MockMail account</button>
            </form>
          </div>
        `)
        return
      }

      case 'POST /mail/signup': {
        const field = await readForm()
        const email = field('email').toLowerCase()
        const chosenPassword = field('password')
        if (!email || !chosenPassword) {
          sendHtml(res, 400, 'MockMail Signup Error', headingCard('Missing email or password'))
          return
        }
        // Both credentials derive from the email, so a given address always gets the same values.
        const seed = signupSeed(email)
        state.mailAccounts.set(email, {
          email,
          chosenPassword,
          appPassword: `mockmail-app-${seed}`,
          inviteCode: `INV-${seed.slice(0, 6).toUpperCase()}`,
        })
        redirect(res, `/mail/dashboard?email=${encodeURIComponent(email)}`)
        return
      }

      case 'GET /mail/dashboard': {
        const account = state.mailAccounts.get(queryValue('email').toLowerCase())
        if (!account) {
          sendHtml(res, 404, 'MockMail Account Missing', headingCard('Account not found'))
          return
        }
        sendHtml(res, 200, 'MockMail Dashboard', `
          <div class="card">
            <h1>MockMail account ready</h1>
            <p>Email: <span class="mono" id="mail-email">${escapeHtml(account.email)}</span></p>
            <p>App password: <span class="mono" id="app-password">${escapeHtml(account.appPassword)}</span></p>
            <p>Social invite code: <span class="mono" id="invite-code">${escapeHtml(account.inviteCode)}</span></p>
            <p class="muted">Use the invite code to create a Chirper account.</p>
            <p><a class="button" id="social-link" href="/social/signup?email=${encodeURIComponent(account.email)}&inviteCode=${encodeURIComponent(account.inviteCode)}">Create Chirper account</a></p>
          </div>
        `)
        return
      }

      case 'GET /social/signup': {
        const email = queryValue('email')
        const inviteCode = queryValue('inviteCode')
        const handle = queryValue('handle', 'northstar-operator')
        sendHtml(res, 200, 'Chirper Signup', `
          <div class="card">
            <h1>Create a Chirper account</h1>
            <form method="post" action="/social/signup">
              <label>Email address
                <input id="email" name="email" type="email" value="${escapeHtml(email)}" required />
              </label>
              <label>Handle
                <input id="handle" name="handle" type="text" value="${escapeHtml(handle)}" required />
              </label>
              <label>Password
                <input id="password" name="password" type="password" value="TempChirperPass!23" required />
              </label>
              <label>Invite code
                <input id="inviteCode" name="inviteCode" type="text" value="${escapeHtml(inviteCode)}" required />
              </label>
              <button id="submit" type="submit">Create Chirper account</button>
            </form>
          </div>
        `)
        return
      }

      case 'POST /social/signup': {
        const field = await readForm()
        const email = field('email').toLowerCase()
        const handle = field('handle')
        const password = field('password')
        const inviteCode = field('inviteCode')
        const mailAccount = state.mailAccounts.get(email)
        // The invite code must be the one issued to this email by the mail flow.
        const accepted =
          Boolean(mailAccount) &&
          inviteCode === mailAccount?.inviteCode &&
          Boolean(handle) &&
          Boolean(password)
        if (!accepted) {
          sendHtml(res, 400, 'Chirper Signup Error', `
            <div class="card">
              <h1>Signup failed</h1>
              <p class="muted">A valid invite code from the MockMail dashboard is required.</p>
            </div>
          `)
          return
        }
        state.socialAccounts.set(handle, {
          email,
          handle,
          password,
          inviteCode,
        })
        redirect(res, `/social/success?handle=${encodeURIComponent(handle)}`)
        return
      }

      case 'GET /social/success': {
        const account = state.socialAccounts.get(queryValue('handle'))
        if (!account) {
          sendHtml(res, 404, 'Chirper Account Missing', headingCard('Account not found'))
          return
        }
        sendHtml(res, 200, 'Chirper Ready', `
          <div class="card">
            <h1>Chirper account ready</h1>
            <p>Handle: <span class="mono" id="chirper-handle">${escapeHtml(account.handle)}</span></p>
            <p>Email: <span class="mono">${escapeHtml(account.email)}</span></p>
          </div>
        `)
        return
      }

      case 'GET /verify-social/signup': {
        const prefilledEmail = queryValue('email')
        const prefilledHandle = queryValue('handle', 'verified-operator')
        sendHtml(res, 200, 'Chirper Verification Signup', `
          <div class="card">
            <h1>Create a Chirper account with verification</h1>
            <p class="muted">This flow requires a human verification code after the first step.</p>
            <form method="post" action="/verify-social/signup">
              <label>Email address
                <input id="email" name="email" type="email" autocomplete="email" value="${escapeHtml(prefilledEmail)}" required />
              </label>
              <label>Handle
                <input id="handle" name="handle" type="text" value="${escapeHtml(prefilledHandle)}" required />
              </label>
              <label>Password
                <input id="password" name="password" type="password" value="TempVerifiedPass!23" required />
              </label>
              <button id="submit" type="submit">Start verified signup</button>
            </form>
          </div>
        `)
        return
      }

      case 'POST /verify-social/signup': {
        const field = await readForm()
        const email = field('email').toLowerCase()
        const handle = field('handle')
        const password = field('password')
        if (!email || !handle || !password) {
          sendHtml(res, 400, 'Verified Signup Error', headingCard('Missing email, handle, or password'))
          return
        }
        const token = `verify-${signupSeed(`${email}:${handle}`)}`
        state.pendingVerifiedSignups.set(token, {
          token,
          email,
          handle,
          password,
          verificationCode: '246810',
          recoveryToken: `recover-${signupSeed(`${handle}:${email}:recovery`)}`,
          verified: false,
        })
        redirect(res, `/verify-social/verify?token=${encodeURIComponent(token)}`)
        return
      }

      case 'GET /verify-social/verify': {
        const token = queryValue('token')
        if (!state.pendingVerifiedSignups.get(token)) {
          sendHtml(res, 404, 'Verification Missing', headingCard('Verification session not found'))
          return
        }
        sendHtml(res, 200, 'Enter Verification Code', `
          <div class="card">
            <h1>Verification code required</h1>
            <p id="verification-copy">A human verification code was sent out-of-band. Ask the human for the code. Do not guess.</p>
            <form method="post" action="/verify-social/verify">
              <input type="hidden" name="token" value="${escapeHtml(token)}" />
              <label>Verification code
                <input id="code" name="code" type="text" required />
              </label>
              <button id="submit" type="submit">Complete verified signup</button>
            </form>
          </div>
        `)
        return
      }

      case 'POST /verify-social/verify': {
        const field = await readForm()
        const token = field('token')
        const code = field('code')
        const pending = state.pendingVerifiedSignups.get(token)
        if (!pending) {
          sendHtml(res, 404, 'Verification Missing', headingCard('Verification session not found'))
          return
        }
        if (code !== pending.verificationCode) {
          sendHtml(res, 400, 'Verification Failed', `
            <div class="card">
              <h1>Incorrect code</h1>
              <p class="muted">The verification code must come from a human. Do not guess.</p>
            </div>
          `)
          return
        }
        pending.verified = true
        state.pendingVerifiedSignups.set(token, pending)
        redirect(res, `/verify-social/success?token=${encodeURIComponent(token)}`)
        return
      }

      case 'GET /verify-social/success': {
        const pending = state.pendingVerifiedSignups.get(queryValue('token'))
        if (!pending || !pending.verified) {
          sendHtml(res, 404, 'Verified Signup Missing', headingCard('Verified account not found'))
          return
        }
        sendHtml(res, 200, 'Verified Chirper Ready', `
          <div class="card">
            <h1>Verified Chirper account ready</h1>
            <p>Handle: <span class="mono" id="verified-handle">${escapeHtml(pending.handle)}</span></p>
            <p>Recovery token: <span class="mono" id="recovery-token">${escapeHtml(pending.recoveryToken)}</span></p>
          </div>
        `)
        return
      }

      default: {
        sendHtml(res, 404, 'Not Found', headingCard('Route not found'))
      }
    }
  })

  return {
    baseUrl,
    state,
    close: async () => closeServer(server),
  }
}
533
+
534
/**
 * Starts a minimal SMTP sink on 127.0.0.1 using an OS-assigned port.
 *
 * The server understands just enough of the protocol to capture mail:
 * EHLO/HELO, MAIL FROM, RCPT TO, DATA, RSET and QUIT. Every other command is
 * answered with `250 OK`. Each completed DATA section is appended to
 * `messages`; nothing is delivered anywhere.
 *
 * Captured fields:
 * - `mailFrom`: the text after `MAIL FROM:`, trimmed (angle brackets are kept).
 * - `recipients`: each `RCPT TO:` address with enclosing angle brackets removed.
 * - `data`: the message text with dot-stuffing undone and the terminating
 *   `.` line excluded.
 *
 * @returns The bound port, the live `messages` array, and a `close` function.
 * @throws If the server fails to listen or reports no address.
 */
async function startMockSmtpHarness(): Promise<MockSmtpHarness> {
  const messages: MockSmtpMessage[] = []
  const server = net.createServer((socket) => {
    let buffer = ''
    let mailFrom = ''
    let recipients: string[] = []
    let dataMode = false

    // Decode through the socket's streaming decoder so a multi-byte character
    // split across two TCP chunks is not corrupted.
    socket.setEncoding('utf8')
    // A client that drops the connection emits 'error' (e.g. ECONNRESET). Without a
    // listener that would surface as an uncaught exception in the host process.
    socket.on('error', () => {
      socket.destroy()
    })

    socket.write('220 mock-smtp.local ESMTP ready\r\n')

    // Clears the envelope so the next transaction starts clean.
    const resetMessage = () => {
      mailFrom = ''
      recipients = []
    }

    // While in DATA mode `buffer` always begins at the start of the message
    // text. Returns true once a full message was consumed and recorded.
    const pushIfCompleteData = (): boolean => {
      if (!dataMode) return false
      let body: string
      if (buffer.startsWith('.\r\n')) {
        // Empty message: the terminator directly follows the DATA command line.
        body = ''
        buffer = buffer.slice(3)
      } else {
        const endMarker = '\r\n.\r\n'
        const endIndex = buffer.indexOf(endMarker)
        if (endIndex === -1) return false
        body = buffer.slice(0, endIndex)
        buffer = buffer.slice(endIndex + endMarker.length)
      }
      messages.push({
        mailFrom,
        recipients: [...recipients],
        // Undo SMTP dot-stuffing: a leading '.' on a line was added by the sender.
        data: body.replace(/(^|\r\n)\./g, '$1'),
      })
      dataMode = false
      socket.write('250 Message accepted\r\n')
      resetMessage()
      return true
    }

    socket.on('data', (chunk) => {
      buffer += typeof chunk === 'string' ? chunk : chunk.toString('utf8')
      while (buffer.length > 0) {
        if (dataMode) {
          // Wait for more input until the terminator has arrived.
          if (!pushIfCompleteData()) break
          continue
        }
        const lineEnd = buffer.indexOf('\r\n')
        if (lineEnd === -1) break
        const line = buffer.slice(0, lineEnd)
        buffer = buffer.slice(lineEnd + 2)
        const upper = line.toUpperCase()

        if (upper.startsWith('EHLO') || upper.startsWith('HELO')) {
          socket.write('250 mock-smtp.local\r\n')
          continue
        }
        if (upper.startsWith('MAIL FROM:')) {
          mailFrom = line.slice('MAIL FROM:'.length).trim()
          socket.write('250 Sender OK\r\n')
          continue
        }
        if (upper.startsWith('RCPT TO:')) {
          recipients.push(line.slice('RCPT TO:'.length).trim().replace(/^<|>$/g, ''))
          socket.write('250 Recipient OK\r\n')
          continue
        }
        if (upper === 'DATA') {
          dataMode = true
          socket.write('354 End data with <CR><LF>.<CR><LF>\r\n')
          continue
        }
        if (upper === 'RSET') {
          // Abort the current transaction so stale recipients do not leak into the next one.
          resetMessage()
          socket.write('250 OK\r\n')
          continue
        }
        if (upper === 'QUIT') {
          socket.write('221 Bye\r\n')
          socket.end()
          return
        }
        socket.write('250 OK\r\n')
      }
    })
  })

  await new Promise<void>((resolve, reject) => {
    server.once('error', reject)
    server.listen(0, '127.0.0.1', () => {
      // Listening succeeded; stop treating later server errors as a startup failure.
      server.off('error', reject)
      resolve()
    })
  })
  const address = server.address() as AddressInfo | null
  if (!address) throw new Error('Mock SMTP server failed to bind to a port.')

  return {
    port: address.port,
    messages,
    close: async () => closeServer(server),
  }
}
627
+
628
/**
 * Starts a local HTTP server used by the research/deploy scenario.
 *
 * Routes served:
 * - `GET /research/brief`  – HTML product brief.
 * - `GET /docs/deploy-api` – HTML description of the deploy endpoint.
 * - `POST /deploy`         – stores `html` under a sanitized `slug` and replies `{ url }`.
 * - `GET /deployed/<slug>` – returns previously stored HTML, or a plain-text 404.
 * - anything else          – HTML 404 page.
 *
 * @returns The server base URL, the in-memory deployment state, and a `close` callback.
 */
async function startMockResearchDeployHarness(): Promise<MockResearchDeployHarness> {
  const state = {
    deployments: new Map<string, string>(),
  }

  const { server, baseUrl } = await startLocalHttpServer(async (req, res) => {
    // Small response writers bound to this request's `res`.
    const reply = (status: number, contentType: string, body: string): void => {
      res.statusCode = status
      res.setHeader('content-type', contentType)
      res.end(body)
    }
    const replyJson = (status: number, body: Record<string, unknown>): void => {
      reply(status, 'application/json', JSON.stringify(body))
    }

    const { pathname } = new URL(req.url || '/', baseUrl)
    const isGet = req.method === 'GET'
    const isPost = req.method === 'POST'

    if (isGet && pathname === '/research/brief') {
      return sendHtml(res, 200, 'Northstar Notes Brief', `
        <div class="card">
          <h1>Northstar Notes product brief</h1>
          <p><strong>Product:</strong> Northstar Notes, a weekly AI operator briefing for busy startup founders.</p>
          <p><strong>Audience:</strong> Mid-stage founders who need signal, not noise.</p>
          <p><strong>Required headline:</strong> Northstar Notes for AI Operators</p>
          <p><strong>Required subhead:</strong> One sharp Friday briefing on launches, model updates, and GTM moves that matter.</p>
          <p><strong>Required CTA:</strong> Get the Friday briefing</p>
          <p><strong>Required proof points:</strong> concise market signal, product launch summaries, operator action items.</p>
          <p><strong>Design note:</strong> make it feel decisive and editorial, not generic SaaS boilerplate.</p>
        </div>
      `)
    }

    if (isGet && pathname === '/docs/deploy-api') {
      return sendHtml(res, 200, 'Deploy API Docs', `
        <div class="card">
          <h1>Mock deploy API</h1>
          <p>Deploy a static HTML page by POSTing JSON to <span class="mono">/deploy</span>.</p>
          <pre class="mono">{
  "slug": "northstar-notes",
  "html": "&lt;!doctype html&gt;..."
}</pre>
          <p>The response is JSON with a single field: <span class="mono">url</span>.</p>
          <p>After deployment, verify the live page by opening the returned URL and checking that the required headline is visible.</p>
        </div>
      `)
    }

    if (isPost && pathname === '/deploy') {
      const raw = await readRequestBody(req)
      let payload: Record<string, unknown>
      try {
        payload = JSON.parse(raw) as Record<string, unknown>
      } catch {
        replyJson(400, { error: 'invalid json' })
        return
      }

      const html = typeof payload.html === 'string' ? payload.html : ''
      // Slugs are lower-cased and restricted to [a-z0-9-]; a generated one is used when none is given.
      let slug: string
      if (typeof payload.slug === 'string' && payload.slug.trim()) {
        slug = payload.slug.trim().toLowerCase().replace(/[^a-z0-9-]/g, '-')
      } else {
        slug = `site-${genId(4)}`
      }

      if (!html.trim()) {
        replyJson(400, { error: 'html is required' })
        return
      }

      state.deployments.set(slug, html)
      replyJson(200, { url: `${baseUrl}/deployed/${slug}` })
      return
    }

    if (isGet && pathname.startsWith('/deployed/')) {
      const requestedSlug = pathname.slice('/deployed/'.length)
      const deployedHtml = state.deployments.get(requestedSlug)
      if (!deployedHtml) {
        reply(404, 'text/plain; charset=utf-8', 'deployment not found')
        return
      }
      reply(200, 'text/html; charset=utf-8', deployedHtml)
      return
    }

    return sendHtml(res, 404, 'Not Found', '<div class="card"><h1>Route not found</h1></div>')
  })

  return {
    baseUrl,
    state,
    close: async () => closeServer(server),
  }
}
719
+
720
/**
 * Produces a short, single-style preview of `text`.
 *
 * CRLF line endings are normalized to LF and surrounding whitespace is trimmed.
 * When the normalized text is longer than `max`, it is cut and suffixed with `...`
 * so that the result is exactly `max` characters long.
 *
 * @param text - Raw text to preview.
 * @param max - Maximum length of the returned string (default 400).
 * @returns The normalized text, truncated so it never exceeds `max` characters.
 */
function truncatePreview(text: string, max = 400): string {
  const ellipsis = '...'
  const normalized = text.replace(/\r\n/g, '\n').trim()
  if (normalized.length <= max) return normalized
  // With a budget smaller than the ellipsis there is no room for a marker;
  // hard-cut instead so the result can never be longer than `max`.
  if (max < ellipsis.length) return normalized.slice(0, Math.max(0, max))
  return `${normalized.slice(0, max - ellipsis.length)}${ellipsis}`
}
725
+
726
/**
 * Builds one evidence record per workspace-relative path.
 *
 * Missing files are reported as `{ exists: false, size: 0 }`. Existing files get
 * their byte size, a SHA-256 hex digest, and a truncated UTF-8 preview.
 *
 * @param ctx - Scenario context; paths are resolved against its workspace directory.
 * @param relativePaths - Workspace-relative file paths to inspect.
 * @returns Evidence records in the same order as `relativePaths`.
 */
function buildArtifactEvidence(ctx: ScenarioContext, relativePaths: string[]): RegressionArtifactEvidence[] {
  const evidence: RegressionArtifactEvidence[] = []
  for (const relativePath of relativePaths) {
    const location = scenarioFile(ctx, relativePath)
    if (!fs.existsSync(location)) {
      evidence.push({ relativePath, exists: false, size: 0 })
      continue
    }
    const contents = fs.readFileSync(location)
    const sha256 = createHash('sha256').update(contents).digest('hex')
    const preview = truncatePreview(contents.toString('utf8'))
    evidence.push({
      relativePath,
      exists: true,
      size: contents.byteLength,
      sha256,
      preview,
    })
  }
  return evidence
}
746
+
747
/**
 * Lists every regular file below `rootDir`, as paths relative to `rootDir`.
 *
 * Directories are descended into; entries that are neither directories nor
 * regular files are ignored.
 *
 * @param rootDir - Directory to scan.
 * @returns Sorted relative file paths, or an empty array when `rootDir` does not exist.
 */
function collectWorkspaceFiles(rootDir: string): string[] {
  if (!fs.existsSync(rootDir)) return []

  const found: string[] = []
  // Explicit work list of directories still to scan.
  const pending: string[] = [rootDir]
  for (let current = pending.pop(); current !== undefined; current = pending.pop()) {
    for (const entry of fs.readdirSync(current, { withFileTypes: true })) {
      const entryPath = path.join(current, entry.name)
      if (entry.isDirectory()) {
        pending.push(entryPath)
      } else if (entry.isFile()) {
        found.push(path.relative(rootDir, entryPath))
      }
    }
  }

  found.sort()
  return found
}
764
+
765
/**
 * Writes three JSON evidence files into the scenario workspace and returns their paths:
 * the session transcript, the session's approvals, and a listing of workspace files.
 *
 * The workspace listing is taken last, after the other two files have been written.
 *
 * @param ctx - Scenario context providing the session id and workspace directory.
 * @returns Absolute paths of the transcript, approvals, and workspace files.
 */
function writeScenarioEvidenceFiles(ctx: ScenarioContext): AgentRegressionScenarioResult['evidencePaths'] {
  // Serializes `value` as pretty-printed JSON into the workspace and returns the file path.
  const dumpJson = (relativeName: string, value: unknown): string => {
    const target = scenarioFile(ctx, relativeName)
    fs.writeFileSync(target, JSON.stringify(value, null, 2), 'utf8')
    return target
  }

  const session = loadSessions()[ctx.sessionId]
  const transcript = dumpJson('.agent-regression-transcript.json', session?.messages || [])
  const approvals = dumpJson('.agent-regression-approvals.json', listSessionApprovals(ctx.sessionId))
  const workspace = dumpJson('.agent-regression-workspace.json', collectWorkspaceFiles(ctx.workspaceDir))

  return { transcript, approvals, workspace }
}
781
+
782
/**
 * Maps a regression approval mode to the settings patch that enables it.
 *
 * - `'off'`: approvals disabled, nothing auto-approved.
 * - `'auto'`: approvals enabled, `tool_access` requests auto-approved.
 * - any other mode: approvals enabled, nothing auto-approved.
 *
 * @param mode - Approval mode for the regression run.
 * @returns A fresh settings object with `approvalsEnabled` and `approvalAutoApproveCategories`.
 */
export function resolveRegressionApprovalSettings(mode: RegressionApprovalMode): Record<string, unknown> {
  const approvalsEnabled = mode !== 'off'
  const approvalAutoApproveCategories: string[] = mode === 'auto' ? ['tool_access'] : []
  return { approvalsEnabled, approvalAutoApproveCategories }
}
800
+
801
/**
 * Totals weighted assertion results.
 *
 * Each assertion contributes its `weight` (default 1) to `maxScore`, and to `score`
 * only when it passed. The status is `'passed'` exactly when `score === maxScore`,
 * which is also the case for an empty assertion list.
 *
 * @param assertions - Assertion results to score.
 * @returns The achieved score, the maximum achievable score, and the overall status.
 */
export function scoreAssertions(assertions: RegressionAssertion[]): { score: number; maxScore: number; status: 'passed' | 'failed' } {
  const totals = assertions.reduce(
    (acc, { passed, weight }) => {
      const points = weight ?? 1
      return {
        score: acc.score + (passed ? points : 0),
        maxScore: acc.maxScore + points,
      }
    },
    { score: 0, maxScore: 0 },
  )
  const status = totals.score === totals.maxScore ? 'passed' : 'failed'
  return { score: totals.score, maxScore: totals.maxScore, status }
}
815
+
816
/**
 * Returns the stored approval requests that belong to `sessionId`,
 * ordered by ascending `createdAt`.
 *
 * @param sessionId - Session whose approvals should be listed.
 */
function listSessionApprovals(sessionId: string): ApprovalRequest[] {
  const byId = loadApprovals() as Record<string, ApprovalRequest>
  const matching: ApprovalRequest[] = []
  for (const approval of Object.values(byId)) {
    if (approval.sessionId === sessionId) matching.push(approval)
  }
  matching.sort((earlier, later) => earlier.createdAt - later.createdAt)
  return matching
}
821
+
822
/**
 * Summarizes a session's approvals for the regression report.
 *
 * `toolId` is taken from `data.toolId` when it is a string, otherwise from
 * `data.pluginId` when that is a string, otherwise it is `null`.
 *
 * @param sessionId - Session whose approvals should be summarized.
 */
function buildApprovalEvidence(sessionId: string): RegressionApprovalEvidence[] {
  return listSessionApprovals(sessionId).map((approval) => {
    let toolId: string | null = null
    if (typeof approval.data?.toolId === 'string') {
      toolId = approval.data.toolId
    } else if (typeof approval.data?.pluginId === 'string') {
      toolId = approval.data.pluginId
    }
    return {
      id: approval.id,
      category: approval.category,
      status: approval.status,
      title: approval.title,
      toolId,
    }
  })
}
835
+
836
/**
 * Returns the stored secrets whose `createdInSessionId` equals `sessionId`.
 *
 * @param sessionId - Session that created the secrets.
 */
function listSessionSecrets(sessionId: string): Array<Record<string, unknown>> {
  const byId = loadSecrets() as Record<string, Record<string, unknown>>
  const owned: Array<Record<string, unknown>> = []
  for (const secret of Object.values(byId)) {
    if (secret.createdInSessionId === sessionId) owned.push(secret)
  }
  return owned
}
840
+
841
/**
 * Parses `raw` as JSON and returns it only when it is a plain (non-array) object.
 *
 * @param raw - JSON text, possibly missing or blank.
 * @returns The parsed object, or `null` for missing/blank input, invalid JSON,
 *          arrays, primitives, and `null`.
 */
function parseJsonRecord(raw: string | undefined): Record<string, unknown> | null {
  if (!raw) return null
  if (raw.trim().length === 0) return null

  let parsed: unknown
  try {
    parsed = JSON.parse(raw)
  } catch {
    return null
  }

  const isRecord = typeof parsed === 'object' && parsed !== null && !Array.isArray(parsed)
  return isRecord ? (parsed as Record<string, unknown>) : null
}
852
+
853
/**
 * Collects, in turn order, every recorded tool event whose name equals `toolName`.
 *
 * @param ctx - Scenario context holding the recorded turns.
 * @param toolName - Exact tool name to match.
 */
function findToolEvents(ctx: ScenarioContext, toolName: string): RegressionToolEventEvidence[] {
  const matches: RegressionToolEventEvidence[] = []
  for (const turn of ctx.turns) {
    for (const event of turn.toolEvents) {
      if (event.name === toolName) matches.push(event)
    }
  }
  return matches
}
856
+
857
/**
 * Removes persisted state that is tied to the scenario's session.
 *
 * In order: approvals of the session, watch jobs whose `sessionId` matches,
 * delegation jobs where the session is parent or child, then secrets, schedules,
 * and tasks whose `createdInSessionId` matches, and finally the browser session.
 * Secrets, schedules, and tasks are only saved back when something was removed.
 *
 * @param ctx - Scenario context identifying the session to clean up.
 */
function cleanupScenarioState(ctx: ScenarioContext): void {
  type RecordMap = Record<string, Record<string, unknown>>
  const { sessionId } = ctx

  listSessionApprovals(sessionId).forEach((approval) => deleteApproval(approval.id))

  const watchJobs = loadWatchJobs() as RecordMap
  Object.entries(watchJobs).forEach(([watchJobId, watchJob]) => {
    if (watchJob?.sessionId === sessionId) deleteWatchJob(watchJobId)
  })

  const delegationJobs = loadDelegationJobs() as RecordMap
  Object.entries(delegationJobs).forEach(([jobId, job]) => {
    const involvesSession = job?.parentSessionId === sessionId || job?.childSessionId === sessionId
    if (involvesSession) deleteDelegationJob(jobId)
  })

  // Drops every record created in this session from `records`; reports whether any was removed.
  const dropSessionRecords = (records: RecordMap): boolean => {
    let changed = false
    for (const [recordId, record] of Object.entries(records)) {
      if (record?.createdInSessionId !== sessionId) continue
      delete records[recordId]
      changed = true
    }
    return changed
  }

  const secrets = loadSecrets() as RecordMap
  if (dropSessionRecords(secrets)) saveSecrets(secrets)

  const schedules = loadSchedules() as RecordMap
  if (dropSessionRecords(schedules)) saveSchedules(schedules)

  const tasks = loadTasks() as RecordMap
  if (dropSessionRecords(tasks)) saveTasks(tasks)

  deleteBrowserSession(sessionId)
}
903
+
904
/**
 * Builds a fresh session record for a regression run.
 *
 * Provider, model, credential, and endpoint values are copied from the agent record,
 * with defaults of `'openai'`, `''`, `null`, and `null` when they are null or undefined.
 * `plugins` and `tools` each receive their own copy of `params.plugins`.
 *
 * @param params.agent - Agent record the session runs as.
 * @param params.sessionId - Id for the new session.
 * @param params.cwd - Working directory of the session.
 * @param params.plugins - Plugin ids enabled for the session.
 * @returns The new session; `createdAt` and `lastActiveAt` share one timestamp.
 */
function buildRegressionSession(params: {
  agent: Record<string, unknown>
  sessionId: string
  cwd: string
  plugins: string[]
}): Session {
  const { agent, sessionId, cwd, plugins } = params
  const timestamp = Date.now()

  let fallbackCredentialIds: string[] | undefined
  if (Array.isArray(agent.fallbackCredentialIds)) {
    fallbackCredentialIds = agent.fallbackCredentialIds as string[]
  }

  const session: Session = {
    id: sessionId,
    name: `Agent Regression ${sessionId}`,
    cwd,
    user: 'eval-runner',
    provider: (agent.provider as Session['provider']) ?? 'openai',
    model: (agent.model as string) ?? '',
    credentialId: (agent.credentialId as string | null) ?? null,
    fallbackCredentialIds,
    apiEndpoint: (agent.apiEndpoint as string | null) ?? null,
    claudeSessionId: null,
    codexThreadId: null,
    opencodeSessionId: null,
    delegateResumeIds: { claudeCode: null, codex: null, opencode: null, gemini: null },
    messages: [],
    createdAt: timestamp,
    lastActiveAt: timestamp,
    sessionType: 'human',
    agentId: agent.id as string,
    plugins: plugins.slice(),
    tools: plugins.slice(),
  }
  return session
}
936
+
937
/**
 * Sends one message to the scenario session and records the outcome on `ctx`.
 *
 * The response text, every tool event, and every tool name are appended to the
 * context, and a turn record is pushed that also lists the ids of all approvals
 * currently stored for the session.
 *
 * @param ctx - Scenario context that accumulates the run's evidence.
 * @param message - Prompt to send as this turn.
 * @returns The raw result of the chat turn.
 */
async function runTurn(ctx: ScenarioContext, message: string): Promise<ExecuteChatTurnResult> {
  const result = await executeSessionChatTurn({
    sessionId: ctx.sessionId,
    message,
    internal: true,
    source: 'eval',
  })

  ctx.responseTexts.push(result.text)
  ;(result.toolEvents || []).forEach((event) => {
    ctx.toolEvents.push(event)
    ctx.toolNames.add(event.name)
  })

  // Keep only the evidence fields of each tool event in the turn record.
  const toolEvents = (result.toolEvents || []).map(({ name, input, output, error }) => ({
    name,
    input,
    output,
    error,
  }))
  const approvalIds = listSessionApprovals(ctx.sessionId).map((approval) => approval.id)
  ctx.turns.push({
    prompt: message,
    responseText: result.text,
    toolEvents,
    approvalIds,
  })

  return result
}
962
+
963
/**
 * Reads a file as UTF-8 text, returning an empty string when the path does not exist.
 *
 * @param filePath - Path of the file to read.
 */
function readIfExists(filePath: string): string {
  if (!fs.existsSync(filePath)) {
    return ''
  }
  return fs.readFileSync(filePath, 'utf8')
}
966
+
967
/**
 * Resolves a workspace-relative path against the scenario's workspace directory.
 *
 * @param ctx - Scenario context providing `workspaceDir`.
 * @param relativePath - Path relative to the workspace root.
 * @returns `workspaceDir` joined with `relativePath`.
 */
function scenarioFile(ctx: ScenarioContext, relativePath: string): string {
  const { workspaceDir } = ctx
  return path.join(workspaceDir, relativePath)
}
970
+
971
/**
 * Scenario "approval-resume": the agent must get shell access and use the shell
 * to write `approval-mode/proof.txt`.
 *
 * Flow: send the initial prompt; in `manual` approval mode approve every pending
 * shell `tool_access` approval found after the first turn; then, up to two times
 * and only while the proof file is missing and the session has the `shell` plugin,
 * send a resume prompt. Finally score the assertions and write evidence files.
 *
 * @param ctx - Scenario context (session, workspace, approval mode, recorded evidence).
 * @returns The scored scenario result.
 */
async function runApprovalResumeScenario(ctx: ScenarioContext): Promise<AgentRegressionScenarioResult> {
  const proofRelativePath = 'approval-mode/proof.txt'
  const proofPath = scenarioFile(ctx, proofRelativePath)
  const sandboxToken = `sandbox:/workspace/${proofRelativePath}`
  const mode = ctx.approvalMode

  const initialPrompt = [
    'You must use the shell tool for this task.',
    'If shell is not currently available, request access to "shell" using manage_capabilities with action "request_access".',
    `Once shell access is available, run \`pwd\` and create \`${proofRelativePath}\` containing exactly two lines:`,
    `MODE=${mode}`,
    'PWD=<the pwd output>',
    `Your final answer must include the exact literal token \`${sandboxToken}\`.`,
    'Do not use the files tool or delegation to create the proof file.',
  ].join('\n')
  const resumePrompt = [
    'Continue the original shell task now.',
    'Use the shell tool to run `mkdir -p approval-mode && printf "MODE=' + mode + '\\nPWD=$(pwd)\\n" > approval-mode/proof.txt`.',
    `Then reply with the exact literal token \`${sandboxToken}\`.`,
  ].join('\n')

  await runTurn(ctx, initialPrompt)

  // Snapshot of shell access approvals as they stood right after the first turn.
  const shellApprovals = listSessionApprovals(ctx.sessionId).filter((approval) => {
    if (approval.category !== 'tool_access') return false
    const target = String(approval.data?.toolId || approval.data?.pluginId || '').trim()
    return target === 'shell'
  })

  if (mode === 'manual') {
    const pendingApprovals = shellApprovals.filter((approval) => approval.status === 'pending')
    for (const approval of pendingApprovals) {
      await submitDecision(approval.id, true)
    }
  }

  let attempt = 0
  while (attempt < 2 && !fs.existsSync(proofPath)) {
    const session = loadSessions()[ctx.sessionId]
    const shellEnabled = Array.isArray(session?.plugins) && session.plugins.includes('shell')
    if (!shellEnabled) break
    const nudge = attempt === 0 ? resumePrompt : `${resumePrompt}\nKeep going until the proof file exists.`
    await runTurn(ctx, nudge)
    attempt += 1
  }

  const proofText = readIfExists(proofPath)
  const isManual = mode === 'manual'
  const approvalStatuses = shellApprovals.map((approval) => approval.status).join(',') || 'none'
  const assertions: RegressionAssertion[] = [
    {
      name: 'shell approval requested or shell used',
      passed: shellApprovals.length > 0 || ctx.toolNames.has('shell'),
      details: shellApprovals.length ? `approvals=${shellApprovals.length}` : 'no shell approval found',
    },
    {
      name: 'manual mode produced a pending approval before resume',
      passed: !isManual || shellApprovals.some((approval) => approval.status === 'approved' || approval.status === 'pending'),
      details: isManual ? `statuses=${approvalStatuses}` : 'not applicable',
    },
    {
      name: 'shell tool used',
      passed: ctx.toolNames.has('shell'),
    },
    {
      name: 'proof file exists',
      passed: fs.existsSync(proofPath),
      details: proofPath,
      weight: 2,
    },
    {
      name: 'proof file contains approval mode marker',
      passed: proofText.includes(`MODE=${mode}`),
    },
    {
      name: 'final response preserved literal sandbox token',
      passed: ctx.responseTexts.some((text) => text.includes(sandboxToken)),
    },
  ]

  return {
    scenarioId: 'approval-resume',
    name: 'Approval Resume',
    approvalMode: ctx.approvalMode,
    ...scoreAssertions(assertions),
    assertions,
    sessionId: ctx.sessionId,
    workspaceDir: ctx.workspaceDir,
    toolNames: Array.from(ctx.toolNames),
    approvalIds: shellApprovals.map((approval) => approval.id),
    approvals: buildApprovalEvidence(ctx.sessionId),
    responseTexts: [...ctx.responseTexts],
    turns: [...ctx.turns],
    artifacts: buildArtifactEvidence(ctx, [proofRelativePath]),
    evidencePaths: writeScenarioEvidenceFiles(ctx),
  }
}
1058
+
1059
/**
 * Scenario "delegate-literal-artifact": the agent must delegate the creation of
 * `notes/live-verification.md` (two lines, `alpha` and `beta`) and echo the literal
 * `sandbox:/workspace/...` token in a response.
 *
 * If the file is missing after the first turn, one follow-up turn is sent.
 *
 * @param ctx - Scenario context (session, workspace, recorded evidence).
 * @returns The scored scenario result.
 */
async function runDelegateLiteralScenario(ctx: ScenarioContext): Promise<AgentRegressionScenarioResult> {
  const targetRelativePath = 'notes/live-verification.md'
  const targetPath = scenarioFile(ctx, targetRelativePath)
  const prompt = [
    'Use delegation for this task.',
    `Create \`${targetRelativePath}\` with exactly these two lines:`,
    'alpha',
    'beta',
    `Your final answer must include the exact literal token \`sandbox:/workspace/${targetRelativePath}\`.`,
    'Do not replace that token with a served URL.',
  ].join('\n')

  await runTurn(ctx, prompt)
  if (!fs.existsSync(targetPath)) {
    await runTurn(ctx, 'Continue and finish the delegated task exactly as requested.')
  }

  // Accept both LF and CRLF line endings: splitting on '\n' alone would leave a
  // trailing '\r' on each line of a CRLF file and fail the content comparison.
  // Blank lines are ignored.
  const contents = readIfExists(targetPath).trim().split(/\r?\n/).filter(Boolean)
  const assertions: RegressionAssertion[] = [
    {
      name: 'delegate backend used',
      passed: Array.from(ctx.toolNames).some((name) => name === 'delegate' || name.startsWith('delegate_to_')),
      weight: 2,
    },
    {
      name: 'delegated file exists',
      passed: fs.existsSync(targetPath),
      details: targetPath,
      weight: 2,
    },
    {
      name: 'delegated file has exactly two lines',
      passed: contents.length === 2 && contents[0] === 'alpha' && contents[1] === 'beta',
      details: contents.join(' | '),
    },
    {
      name: 'literal sandbox token preserved',
      passed: ctx.responseTexts.some((text) => text.includes(`sandbox:/workspace/${targetRelativePath}`)),
      weight: 2,
    },
  ]
  const scored = scoreAssertions(assertions)
  return {
    scenarioId: 'delegate-literal-artifact',
    name: 'Delegate Literal Artifact',
    approvalMode: ctx.approvalMode,
    ...scored,
    assertions,
    sessionId: ctx.sessionId,
    workspaceDir: ctx.workspaceDir,
    toolNames: Array.from(ctx.toolNames),
    approvalIds: [],
    approvals: buildApprovalEvidence(ctx.sessionId),
    responseTexts: [...ctx.responseTexts],
    turns: [...ctx.turns],
    artifacts: buildArtifactEvidence(ctx, [targetRelativePath]),
    evidencePaths: writeScenarioEvidenceFiles(ctx),
  }
}
1118
+
1119
/**
 * Scenario "schedule-script": the agent must create an interval schedule that runs
 * a pre-seeded script via `manage_schedules`.
 *
 * The script file is written into the workspace first. After one turn, the most
 * recently created schedule of this session (by `createdAt`) is inspected.
 *
 * @param ctx - Scenario context (session, agent, workspace, recorded evidence).
 * @returns The scored scenario result.
 */
async function runScheduleScenario(ctx: ScenarioContext): Promise<AgentRegressionScenarioResult> {
  const scriptRelativePath = 'weather_workspace/weather_fetch.py'
  const scriptPath = scenarioFile(ctx, scriptRelativePath)
  ensureDir(path.dirname(scriptPath))
  fs.writeFileSync(scriptPath, 'print("weather ok")\n', 'utf8')

  const prompt = [
    'Create a schedule with manage_schedules.',
    'Use name "Daily Weather Update".',
    'Use scheduleType "interval" and intervalMs 86400000.',
    'Use action "run_script" and path "weather_workspace/weather_fetch.py".',
    'Do not switch to command mode and do not invent another path.',
    'Confirm the created schedule id.',
  ].join('\n')
  await runTurn(ctx, prompt)

  const storedSchedules = loadSchedules() as Record<string, Record<string, unknown>>
  const sessionSchedules = Object.values(storedSchedules)
    .filter((entry) => entry.createdInSessionId === ctx.sessionId)
  // Newest first, so index 0 is the schedule created most recently.
  sessionSchedules.sort((left, right) => Number(right.createdAt || 0) - Number(left.createdAt || 0))
  const schedule = sessionSchedules[0] || null

  const assignedAgentId = String(schedule?.agentId || '')
  const storedPath = String(schedule?.path || '')
  const storedTaskPrompt = String(schedule?.taskPrompt || '')

  const assertions: RegressionAssertion[] = [
    {
      name: 'manage_schedules tool used',
      passed: ctx.toolNames.has('manage_schedules'),
      weight: 2,
    },
    {
      name: 'schedule created',
      passed: !!schedule,
      weight: 2,
    },
    {
      name: 'schedule assigned to the current agent',
      passed: assignedAgentId === ctx.agentId,
      details: assignedAgentId,
    },
    {
      name: 'schedule kept the exact script path',
      passed: storedPath === scriptRelativePath,
      details: storedPath,
    },
    {
      name: 'schedule taskPrompt is populated from the script path',
      passed: storedTaskPrompt.includes(scriptRelativePath),
      details: storedTaskPrompt,
    },
  ]

  return {
    scenarioId: 'schedule-script',
    name: 'Schedule Script Workflow',
    approvalMode: ctx.approvalMode,
    ...scoreAssertions(assertions),
    assertions,
    sessionId: ctx.sessionId,
    workspaceDir: ctx.workspaceDir,
    toolNames: Array.from(ctx.toolNames),
    approvalIds: [],
    approvals: buildApprovalEvidence(ctx.sessionId),
    responseTexts: [...ctx.responseTexts],
    turns: [...ctx.turns],
    artifacts: buildArtifactEvidence(ctx, [scriptRelativePath]),
    evidencePaths: writeScenarioEvidenceFiles(ctx),
  }
}
1183
+
1184
/**
 * Scenario "open-ended-iteration": the agent must produce four files under
 * `offer-pack/` and then revise its own work in a second turn.
 *
 * File contents are snapshotted between the two turns so the scenario can tell
 * whether any deliverable other than `iteration-notes.md` changed.
 *
 * @param ctx - Scenario context (session, workspace, recorded evidence).
 * @returns The scored scenario result.
 */
async function runOpenEndedIterationScenario(ctx: ScenarioContext): Promise<AgentRegressionScenarioResult> {
  ensureDir(scenarioFile(ctx, 'offer-pack'))
  const fileNames = ['offer-brief.md', 'landing-copy.md', 'outreach-draft.md', 'iteration-notes.md']
  const relativePaths = fileNames.map((name) => `offer-pack/${name}`)

  const firstPrompt = [
    'Create an offer package in offer-pack/.',
    'Write offer-brief.md, landing-copy.md, outreach-draft.md, and iteration-notes.md.',
    'The theme is an AI security consulting offer for mid-market software teams.',
    'Do the work, not just a plan.',
    'iteration-notes.md must include a heading "Iteration 1" with self-critique.',
  ].join('\n')
  await runTurn(ctx, firstPrompt)

  // Snapshot whatever exists after the first turn.
  const deliverablePaths = relativePaths.map((relativePath) => scenarioFile(ctx, relativePath))
  const beforeRevision = new Map<string, string>()
  for (const filePath of deliverablePaths) {
    if (fs.existsSync(filePath)) {
      beforeRevision.set(filePath, fs.readFileSync(filePath, 'utf8'))
    }
  }

  const revisionPrompt = [
    'Continue the same workspace.',
    'Revise at least one of the three deliverables based on your own critique.',
    'Append a second heading "Iteration 2" to offer-pack/iteration-notes.md describing the revision you made.',
  ].join('\n')
  await runTurn(ctx, revisionPrompt)

  const changedDeliverable = deliverablePaths.some((filePath) => {
    if (path.basename(filePath) === 'iteration-notes.md') return false
    if (!beforeRevision.has(filePath)) return false
    return readIfExists(filePath) !== beforeRevision.get(filePath)
  })
  const iterationNotes = readIfExists(scenarioFile(ctx, 'offer-pack/iteration-notes.md'))
  const missingDeliverables = deliverablePaths.filter((filePath) => !fs.existsSync(filePath))

  const assertions: RegressionAssertion[] = [
    {
      name: 'files tool used',
      passed: ctx.toolNames.has('files'),
      weight: 2,
    },
    {
      name: 'all open-ended deliverables exist',
      passed: missingDeliverables.length === 0,
      details: missingDeliverables.join(', ') || 'all present',
      weight: 2,
    },
    {
      name: 'iteration notes include a second pass',
      passed: iterationNotes.includes('Iteration 1') && iterationNotes.includes('Iteration 2'),
    },
    {
      name: 'a deliverable changed on the second turn',
      passed: changedDeliverable,
    },
  ]

  return {
    scenarioId: 'open-ended-iteration',
    name: 'Open-Ended Iteration Pack',
    approvalMode: ctx.approvalMode,
    ...scoreAssertions(assertions),
    assertions,
    sessionId: ctx.sessionId,
    workspaceDir: ctx.workspaceDir,
    toolNames: Array.from(ctx.toolNames),
    approvalIds: [],
    approvals: buildApprovalEvidence(ctx.sessionId),
    responseTexts: [...ctx.responseTexts],
    turns: [...ctx.turns],
    artifacts: buildArtifactEvidence(ctx, relativePaths),
    evidencePaths: writeScenarioEvidenceFiles(ctx),
  }
}
1253
+
1254
/**
 * Scenario "mock-signup-secret-email": against local mock signup and SMTP servers,
 * the agent must sign up for MockMail and Chirper in the browser, store the MockMail
 * app password as an agent-scoped secret, and send a welcome email that does not
 * reveal the password.
 *
 * The email plugin is pointed at the mock SMTP server for the duration of the run.
 * Cleanup is layered so that each acquired resource is released even when a later
 * startup step or an earlier cleanup step throws: the settings snapshot is restored,
 * then the SMTP harness is closed, then the signup harness is closed.
 *
 * @param ctx - Scenario context (session, agent, workspace, recorded evidence).
 * @returns The scored scenario result.
 */
async function runMockSignupSecretEmailScenario(ctx: ScenarioContext): Promise<AgentRegressionScenarioResult> {
  const emailAddress = 'waydemlyle@gmail.com'
  const signupHarness = await startMockSignupHarness()
  try {
    const smtpHarness = await startMockSmtpHarness()
    try {
      const settingsSnapshot = loadSettings()
      try {
        getPluginManager().setPluginSettings('email', {
          host: '127.0.0.1',
          port: smtpHarness.port,
          secure: false,
          username: '',
          password: 'mock-smtp-password',
          fromAddress: 'noreply@mockmail.local',
          fromName: 'MockMail Harness',
        })

        const prompt = [
          'Complete a full mock onboarding flow.',
          `1. Use the browser tool to create a MockMail account at ${signupHarness.baseUrl}/mail/signup?email=${encodeURIComponent(emailAddress)}.`,
          'Use the exact literal URLs from this prompt. Do not substitute placeholders like `MockMailPage_URL` or `[Your URL]`.',
          '2. Read the MockMail dashboard and capture the generated app password plus the social invite code.',
          `3. Use the browser tool to create a Chirper account at ${signupHarness.baseUrl}/social/signup using the invite code from the MockMail dashboard. The social signup page already provides safe default values, so completing it as-is is acceptable.`,
          '4. Store the MockMail app password with manage_secrets as an agent-scoped secret named "MockMail App Password" and service "mockmail".',
          `5. Use the email tool to send a welcome email to "${emailAddress}" with subject "Mock signup complete". The email body must mention the Chirper handle you created but must not reveal the app password.`,
          'Do not echo the raw app password or any secret value in your final answer.',
          'In your final answer, report the Chirper handle and the secret id only.',
        ].join('\n')

        await runTurn(ctx, prompt)

        // Up to two follow-up turns while any of the three outcomes is still missing.
        for (let attempt = 0; attempt < 2; attempt += 1) {
          const secret = listSessionSecrets(ctx.sessionId).find((entry) => entry.service === 'mockmail')
          const social = Array.from(signupHarness.state.socialAccounts.values())[0]
          const sent = smtpHarness.messages.some((message) => message.recipients.includes(emailAddress))
          if (secret && social && sent) break
          await runTurn(ctx, 'Continue until the MockMail secret, Chirper account, and welcome email are all finished.')
        }

        const mailAccount = signupHarness.state.mailAccounts.get(emailAddress) || null
        const socialAccount = Array.from(signupHarness.state.socialAccounts.values())[0] || null
        const createdSecret = listSessionSecrets(ctx.sessionId).find((entry) => entry.service === 'mockmail') || null
        const decryptedSecret = typeof createdSecret?.encryptedValue === 'string'
          ? decryptKey(createdSecret.encryptedValue)
          : ''
        const sentMessage = smtpHarness.messages.find((message) => message.recipients.includes(emailAddress)) || null
        const responseBlob = ctx.responseTexts.join('\n')
        const assertions: RegressionAssertion[] = [
          {
            name: 'browser tool used for signup flow',
            passed: ctx.toolNames.has('browser'),
            weight: 2,
          },
          {
            name: 'manage_secrets used for credential storage',
            passed: ctx.toolNames.has('manage_secrets'),
            weight: 2,
          },
          {
            name: 'email tool used for outbound message',
            passed: ctx.toolNames.has('email'),
            weight: 2,
          },
          {
            name: 'mock mail account created',
            passed: !!mailAccount,
            details: mailAccount?.email || 'not created',
          },
          {
            name: 'social account created',
            passed: !!socialAccount,
            details: socialAccount?.handle || 'not created',
            weight: 2,
          },
          {
            name: 'agent-scoped secret stored with exact app password',
            passed: !!createdSecret
              && createdSecret.scope === 'agent'
              && Array.isArray(createdSecret.agentIds)
              && createdSecret.agentIds.includes(ctx.agentId)
              && decryptedSecret === (mailAccount?.appPassword || ''),
            details: createdSecret ? `${String(createdSecret.id)}:${String(createdSecret.scope)}` : 'no secret',
            weight: 3,
          },
          {
            name: 'welcome email captured by mock smtp',
            passed: !!sentMessage
              && sentMessage.data.includes('Subject: Mock signup complete')
              && !!socialAccount?.handle
              && sentMessage.data.includes(socialAccount.handle),
            details: sentMessage ? truncatePreview(sentMessage.data) : 'no smtp message',
            weight: 3,
          },
          {
            name: 'final response does not leak the app password',
            passed: !mailAccount || !responseBlob.includes(mailAccount.appPassword),
            weight: 2,
          },
        ]
        const scored = scoreAssertions(assertions)
        return {
          scenarioId: 'mock-signup-secret-email',
          name: 'Mock Signup Secret Email',
          approvalMode: ctx.approvalMode,
          ...scored,
          assertions,
          sessionId: ctx.sessionId,
          workspaceDir: ctx.workspaceDir,
          toolNames: Array.from(ctx.toolNames),
          approvalIds: [],
          approvals: buildApprovalEvidence(ctx.sessionId),
          responseTexts: [...ctx.responseTexts],
          turns: [...ctx.turns],
          artifacts: [],
          evidencePaths: writeScenarioEvidenceFiles(ctx),
        }
      } finally {
        // Restore the settings that were in place before the email plugin was repointed.
        saveSettings(settingsSnapshot)
      }
    } finally {
      await smtpHarness.close()
    }
  } finally {
    await signupHarness.close()
  }
}
1375
+
1376
/**
 * Regression scenario: a browser signup that can only finish with a
 * human-supplied verification code.
 *
 * Flow:
 *  1. Start the mock signup harness and prompt the agent to begin the signup.
 *  2. If the first turn left a `human_request` entry in the session mailbox,
 *     reply with the code `246810`, process due watch jobs, and run a
 *     follow-up turn.
 *  3. Nudge the agent for up to two more turns until a verified signup for the
 *     handle exists and a secret with service `chirper` is stored.
 *  4. Score the run with weighted assertions and return the evidence bundle.
 *
 * The harness is always closed, even when a turn throws.
 *
 * @param ctx Per-scenario context (session id, agent id, collected tool events and responses).
 * @returns The scored scenario result, including assertions and evidence paths.
 */
async function runHumanVerifiedSignupScenario(ctx: ScenarioContext): Promise<AgentRegressionScenarioResult> {
  const signupHarness = await startMockSignupHarness()
  const emailAddress = 'waydemlyle@gmail.com'
  const handle = 'wayde-verified'

  // Looks up the harness signup record for this scenario's handle.
  // With `requireVerified` only an already-verified record is returned.
  const findSignup = (requireVerified: boolean) =>
    Array.from(signupHarness.state.pendingVerifiedSignups.values())
      .find((entry) => entry.handle === handle && (!requireVerified || entry.verified))

  // Looks up the secret the agent was asked to store for the "chirper" service.
  const findRecoverySecret = () =>
    listSessionSecrets(ctx.sessionId).find((entry) => entry.service === 'chirper')

  try {
    const prompt = [
      'Complete a verified browser signup flow.',
      `Use the browser tool to start at ${signupHarness.baseUrl}/verify-social/signup?email=${encodeURIComponent(emailAddress)}&handle=${encodeURIComponent(handle)}.`,
      'Use the exact literal URLs from this prompt. Do not invent placeholders like `[Your Verification Page URL]`.',
      'When the site asks for a verification code, use ask_human with action "request_input" to ask for the code instead of guessing.',
      'After requesting input, set up a durable wait if appropriate, then continue once the human reply arrives.',
      'When verification succeeds, store the recovery token shown on the success page with manage_secrets as an agent-scoped secret named "Chirper Recovery Token" and service "chirper".',
      'Do not echo the raw recovery token in your final answer.',
      'In the final answer, report the handle and the secret id only.',
    ].join('\n')

    await runTurn(ctx, prompt)

    // The mailbox is inspected once, right after the first turn; only a
    // request raised during that turn receives the simulated human reply.
    const mailboxRequest = listMailbox(ctx.sessionId, { includeAcked: true })
      .find((entry) => entry.type === 'human_request')

    if (mailboxRequest) {
      sendMailboxEnvelope({
        toSessionId: ctx.sessionId,
        type: 'human_reply',
        correlationId: mailboxRequest.correlationId || null,
        payload: '246810',
        fromSessionId: 'eval-human',
        fromAgentId: 'eval-runner',
      })
      await processDueWatchJobs(Date.now())
      await runTurn(ctx, 'A human reply is now available in your mailbox. Read it and finish the verification flow.')
    }

    // Give the agent up to two extra turns to finish both halves of the task.
    for (let attempt = 0; attempt < 2; attempt += 1) {
      if (findSignup(true) && findRecoverySecret()) break
      await runTurn(ctx, 'Continue until the verified account exists and the recovery token is stored.')
    }

    // Prefer the verified record so that a stale unverified attempt for the
    // same handle cannot shadow the successful one; fall back to any record
    // so the assertion details can still report `verified=false`.
    const verifiedSignup = findSignup(true) || findSignup(false) || null
    // Token issued by the harness; empty when no signup record (or no token) exists.
    const issuedToken = verifiedSignup?.recoveryToken || ''
    const recoverySecret = findRecoverySecret() || null
    const decryptedSecret = typeof recoverySecret?.encryptedValue === 'string'
      ? decryptKey(recoverySecret.encryptedValue)
      : ''
    const responseBlob = ctx.responseTexts.join('\n')
    const askHumanEvents = findToolEvents(ctx, 'ask_human')
    const requestedInput = askHumanEvents
      .map((event) => parseJsonRecord(event.output))
      .find((record) => record?.correlationId || record?.ok === true) || null
    const usedDurableWait = askHumanEvents.some((event) => {
      const input = parseJsonRecord(event.input)
      return input?.action === 'wait_for_reply'
    })
    const assertions: RegressionAssertion[] = [
      {
        name: 'browser tool used for verified signup',
        passed: ctx.toolNames.has('browser'),
        weight: 2,
      },
      {
        name: 'ask_human requested the verification code',
        passed: !!requestedInput && !!mailboxRequest,
        details: mailboxRequest?.payload || 'no human request',
        weight: 3,
      },
      {
        name: 'agent attempted a durable wait after asking the human',
        passed: usedDurableWait,
        details: usedDurableWait ? 'wait_for_reply used' : 'no durable wait detected',
      },
      {
        name: 'verified account completed after the human reply',
        passed: !!verifiedSignup?.verified,
        details: verifiedSignup ? `verified=${String(verifiedSignup.verified)}` : 'no verified signup',
        weight: 3,
      },
      {
        name: 'recovery token stored in an agent-scoped secret',
        passed: !!recoverySecret
          && recoverySecret.scope === 'agent'
          && Array.isArray(recoverySecret.agentIds)
          && recoverySecret.agentIds.includes(ctx.agentId)
          // Require a real token: without this guard an empty decrypted value
          // would "match" a missing token ('' === '') and pass vacuously.
          && issuedToken !== ''
          && decryptedSecret === issuedToken,
        details: recoverySecret ? `${String(recoverySecret.id)}:${String(recoverySecret.scope)}` : 'no secret',
        weight: 3,
      },
      {
        name: 'final response does not leak the recovery token',
        // `includes('')` is always true, so an empty token must not be searched for.
        passed: !issuedToken || !responseBlob.includes(issuedToken),
        weight: 2,
      },
    ]
    const scored = scoreAssertions(assertions)
    return {
      scenarioId: 'human-verified-signup',
      name: 'Human Verified Signup',
      approvalMode: ctx.approvalMode,
      ...scored,
      assertions,
      sessionId: ctx.sessionId,
      workspaceDir: ctx.workspaceDir,
      toolNames: Array.from(ctx.toolNames),
      approvalIds: [],
      approvals: buildApprovalEvidence(ctx.sessionId),
      responseTexts: [...ctx.responseTexts],
      turns: [...ctx.turns],
      artifacts: [],
      evidencePaths: writeScenarioEvidenceFiles(ctx),
    }
  } finally {
    await signupHarness.close()
  }
}
1494
+
1495
/**
 * Regression scenario: research a brief over HTTP, build a landing page file,
 * deploy it through the mock deploy API, and verify it in the browser.
 *
 * Flow:
 *  1. Start the mock research/deploy harness and send the task prompt.
 *  2. Run up to two continuation turns until the output file exists and a
 *     deployed URL has been seen in the `http_request` tool output.
 *  3. If no URL was found in tool output, fall back to the harness's own
 *     deployment records.
 *  4. Fetch the deployed page, score weighted assertions, and return the
 *     evidence bundle.
 *
 * The harness is always closed, even when a turn throws.
 *
 * @param ctx Per-scenario context (session id, workspace, collected tool events and responses).
 * @returns The scored scenario result, including assertions, artifacts and evidence paths.
 */
async function runResearchBuildDeployScenario(ctx: ScenarioContext): Promise<AgentRegressionScenarioResult> {
  const deployHarness = await startMockResearchDeployHarness()

  // Extracts the deployed URL from the agent's `http_request` outputs.
  // Every output whose body mentions "/deployed/" is considered, not just the
  // first one: a non-JSON body that merely mentions the path (for example a
  // docs page) must not hide the actual deploy response.
  // NOTE(review): assumes findToolEvents returns events in call order, so
  // scanning from the end prefers the latest deployment — confirm.
  const findDeployedUrlInToolOutput = (): string => {
    const candidateBodies = findToolEvents(ctx, 'http_request')
      .map((event) => parseJsonRecord(event.output))
      .filter((record): record is Record<string, unknown> => !!record)
      .map((record) => record.body)
      .filter((body): body is string => typeof body === 'string' && body.includes('/deployed/'))
    for (let index = candidateBodies.length - 1; index >= 0; index -= 1) {
      const parsedBody = parseJsonRecord(candidateBodies[index])
      if (parsedBody && typeof parsedBody.url === 'string' && parsedBody.url) return parsedBody.url
    }
    return ''
  }

  try {
    const outputRelativePath = 'launchpad/index.html'
    const outputPath = scenarioFile(ctx, outputRelativePath)
    const prompt = [
      'Complete a research, build, and deploy workflow.',
      `Use http_request to research the product brief at ${deployHarness.baseUrl}/research/brief and the deployment docs at ${deployHarness.baseUrl}/docs/deploy-api.`,
      `Create ${outputRelativePath} as a single-file landing page for the product described in the brief.`,
      'The page must include the exact headline "Northstar Notes for AI Operators", the exact CTA "Get the Friday briefing", and copy about launches, model updates, GTM moves, concise market signal, product launch summaries, and operator action items.',
      'Then deploy the HTML using the mock deploy API from the docs.',
      'Use the browser tool to open the deployed URL and verify the required headline is visible.',
      'Your final answer must include the deployed URL.',
    ].join('\n')

    await runTurn(ctx, prompt)

    // Up to two continuation turns; tool output is re-read after each one so
    // a deployment made during the final turn is still picked up.
    let deployedUrl = findDeployedUrlInToolOutput()
    for (let attempt = 0; attempt < 2; attempt += 1) {
      if (fs.existsSync(outputPath) && deployedUrl) break
      await runTurn(ctx, 'Continue until the landing page exists, the mock deployment succeeds, and the deployed URL is verified in the browser.')
      deployedUrl = findDeployedUrlInToolOutput() || deployedUrl
    }

    // Fallback: take the first harness-recorded deployment that carries the
    // required headline and rebuild its URL from the slug.
    if (!deployedUrl) {
      for (const [slug, html] of deployHarness.state.deployments.entries()) {
        if (html.includes('Northstar Notes for AI Operators')) {
          if (slug) deployedUrl = `${deployHarness.baseUrl}/deployed/${slug}`
          break
        }
      }
    }

    const outputText = readIfExists(outputPath)
    // A failed fetch is deliberately reduced to empty HTML; the reachability
    // assertion below then reports the failure.
    const deployedHtml = deployedUrl ? await fetch(deployedUrl).then((res) => res.text()).catch(() => '') : ''
    const responseBlob = ctx.responseTexts.join('\n')
    const assertions: RegressionAssertion[] = [
      {
        name: 'http_request used for research and deploy',
        passed: ctx.toolNames.has('http_request'),
        weight: 2,
      },
      {
        name: 'files tool used to build the landing page',
        passed: ctx.toolNames.has('files'),
        weight: 2,
      },
      {
        name: 'browser tool used to verify deployed page',
        passed: ctx.toolNames.has('browser'),
        weight: 2,
      },
      {
        name: 'landing page file exists with required editorial copy',
        passed: outputText.includes('Northstar Notes for AI Operators')
          && outputText.includes('Get the Friday briefing')
          && outputText.toLowerCase().includes('operator action items'),
        details: truncatePreview(outputText),
        weight: 3,
      },
      {
        name: 'mock deployment produced a reachable live url',
        passed: !!deployedUrl
          && deployedHtml.includes('Northstar Notes for AI Operators')
          && deployedHtml.includes('Get the Friday briefing'),
        details: deployedUrl || 'no deployed url',
        weight: 3,
      },
      {
        name: 'final response returned the deployed url',
        passed: !!deployedUrl && responseBlob.includes(deployedUrl),
        details: deployedUrl || 'no deployed url',
        weight: 2,
      },
    ]
    const scored = scoreAssertions(assertions)
    return {
      scenarioId: 'research-build-deploy',
      name: 'Research Build Deploy',
      approvalMode: ctx.approvalMode,
      ...scored,
      assertions,
      sessionId: ctx.sessionId,
      workspaceDir: ctx.workspaceDir,
      toolNames: Array.from(ctx.toolNames),
      approvalIds: [],
      approvals: buildApprovalEvidence(ctx.sessionId),
      responseTexts: [...ctx.responseTexts],
      turns: [...ctx.turns],
      artifacts: buildArtifactEvidence(ctx, [outputRelativePath]),
      evidencePaths: writeScenarioEvidenceFiles(ctx),
    }
  } finally {
    await deployHarness.close()
  }
}
1600
+
1601
/**
 * Registry of the regression scenarios, in the order they are run.
 * Each entry gives the scenario id, a display name, the plugins passed to the
 * session built for it, and the runner function that executes and scores it.
 */
export const AGENT_REGRESSION_SCENARIOS: AgentRegressionScenarioDefinition[] = [
  { id: 'approval-resume', name: 'Approval Resume', plugins: ['files'], run: runApprovalResumeScenario },
  { id: 'delegate-literal-artifact', name: 'Delegate Literal Artifact', plugins: ['delegate'], run: runDelegateLiteralScenario },
  { id: 'schedule-script', name: 'Schedule Script Workflow', plugins: ['manage_schedules'], run: runScheduleScenario },
  { id: 'open-ended-iteration', name: 'Open-Ended Iteration Pack', plugins: ['files'], run: runOpenEndedIterationScenario },
  {
    id: 'mock-signup-secret-email',
    name: 'Mock Signup Secret Email',
    plugins: ['browser', 'manage_secrets', 'email'],
    run: runMockSignupSecretEmailScenario,
  },
  {
    id: 'human-verified-signup',
    name: 'Human Verified Signup',
    plugins: ['browser', 'ask_human', 'manage_secrets'],
    run: runHumanVerifiedSignupScenario,
  },
  {
    id: 'research-build-deploy',
    name: 'Research Build Deploy',
    plugins: ['http_request', 'files', 'browser'],
    run: runResearchBuildDeployScenario,
  },
]
1645
+
1646
/**
 * Selects the scenario definitions to run.
 *
 * @param ids Optional scenario ids. When omitted or empty, the full registry
 *   array is returned as-is. Otherwise only definitions whose id is listed are
 *   returned, in registry order; ids that match nothing are ignored.
 * @returns The matching scenario definitions.
 */
function resolveScenarioDefinitions(ids?: string[]): AgentRegressionScenarioDefinition[] {
  if (!ids || ids.length === 0) return AGENT_REGRESSION_SCENARIOS

  const requestedIds = new Set(ids)
  const selected: AgentRegressionScenarioDefinition[] = []
  for (const definition of AGENT_REGRESSION_SCENARIOS) {
    if (requestedIds.has(definition.id)) selected.push(definition)
  }
  return selected
}
1651
+
1652
/**
 * Runs the agent regression suite and writes the aggregated result to
 * `results.json` inside a fresh suite directory under `WORKSPACE_DIR/evals`.
 *
 * For every approval mode the stored settings are overlaid with that mode's
 * regression approval settings, then each selected scenario runs in its own
 * temporary session and working directory. The session entry is removed after
 * each scenario and the original settings are restored once all modes are
 * done, including when a scenario throws (the error then propagates and no
 * results file is written).
 *
 * @param params.agentId Agent to evaluate; defaults to `'default'`.
 * @param params.approvalModes Approval modes to cover; defaults to `manual`, `auto`, `off`.
 * @param params.scenarioIds Optional subset of scenario ids; all scenarios when omitted or empty.
 * @returns The suite result with per-scenario results and summed scores.
 * @throws Error when `agentId` does not match a loaded agent.
 */
export async function runAgentRegressionSuite(params?: {
  agentId?: string
  approvalModes?: RegressionApprovalMode[]
  scenarioIds?: string[]
}): Promise<AgentRegressionSuiteResult> {
  const agentId = params?.agentId || 'default'
  const requestedModes = params?.approvalModes
  const approvalModes: RegressionApprovalMode[] = requestedModes?.length
    ? [...requestedModes]
    : ['manual', 'auto', 'off']
  const agent = (loadAgents() as Record<string, Record<string, unknown>>)[agentId]
  if (!agent) throw new Error(`Unknown agent: ${agentId}`)

  const suiteId = `agent-regression-${genId(8)}`
  const suiteDir = path.join(WORKSPACE_DIR, 'evals', suiteId)
  ensureDir(suiteDir)
  const resultsPath = path.join(suiteDir, 'results.json')
  const startedAt = Date.now()
  const originalSettings = loadSettings()
  const scenarios: AgentRegressionScenarioResult[] = []
  const definitions = resolveScenarioDefinitions(params?.scenarioIds)

  // Runs one scenario under one approval mode in an isolated session and
  // appends its result to `scenarios`.
  const runOne = async (
    approvalMode: RegressionApprovalMode,
    definition: AgentRegressionScenarioDefinition,
  ): Promise<void> => {
    const scenarioDir = path.join(suiteDir, approvalMode, definition.id)
    ensureDir(scenarioDir)
    const sessionId = `${suiteId}-${approvalMode}-${definition.id}`

    // Register the temporary session in the session store.
    const sessions = loadSessions()
    sessions[sessionId] = buildRegressionSession({
      agent,
      sessionId,
      cwd: scenarioDir,
      plugins: definition.plugins,
    })
    saveSessions(sessions)

    const ctx: ScenarioContext = {
      suiteId,
      agentId,
      agent,
      approvalMode,
      sessionId,
      workspaceDir: scenarioDir,
      responseTexts: [],
      toolEvents: [],
      toolNames: new Set<string>(),
      turns: [],
    }

    try {
      scenarios.push(await definition.run(ctx))
    } finally {
      // Reload before deleting so changes made to the store during the
      // scenario are not overwritten with the earlier snapshot.
      cleanupScenarioState(ctx)
      const latestSessions = loadSessions()
      delete latestSessions[sessionId]
      saveSessions(latestSessions)
    }
  }

  try {
    for (const approvalMode of approvalModes) {
      const modeSettings = {
        ...originalSettings,
        ...resolveRegressionApprovalSettings(approvalMode),
      }
      saveSettings(modeSettings)
      for (const definition of definitions) {
        await runOne(approvalMode, definition)
      }
    }
  } finally {
    saveSettings(originalSettings)
  }

  // Sum scores across every scenario/mode combination.
  let score = 0
  let maxScore = 0
  for (const result of scenarios) {
    score += result.score
    maxScore += result.maxScore
  }

  const suiteResult: AgentRegressionSuiteResult = {
    id: suiteId,
    agentId,
    approvalModes,
    startedAt,
    endedAt: Date.now(),
    score,
    maxScore,
    scenarios,
    resultsPath,
  }

  fs.writeFileSync(resultsPath, JSON.stringify(suiteResult, null, 2), 'utf8')
  return suiteResult
}