@swarmclawai/swarmclaw 0.7.3 → 0.7.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +47 -40
- package/bin/package-manager.js +157 -0
- package/bin/package-manager.test.js +90 -0
- package/bin/server-cmd.js +38 -7
- package/bin/swarmclaw.js +54 -4
- package/bin/update-cmd.js +48 -10
- package/bin/update-cmd.test.js +55 -0
- package/package.json +8 -3
- package/scripts/postinstall.mjs +26 -0
- package/src/app/api/agents/[id]/route.ts +17 -0
- package/src/app/api/agents/[id]/thread/route.ts +3 -1
- package/src/app/api/agents/route.ts +23 -1
- package/src/app/api/auth/route.ts +1 -1
- package/src/app/api/chatrooms/[id]/chat/route.ts +16 -5
- package/src/app/api/chatrooms/[id]/pins/route.ts +2 -1
- package/src/app/api/chatrooms/[id]/reactions/route.ts +2 -1
- package/src/app/api/chatrooms/[id]/route.ts +6 -0
- package/src/app/api/chats/[id]/route.ts +12 -0
- package/src/app/api/chats/heartbeat/route.ts +2 -1
- package/src/app/api/chats/route.ts +7 -1
- package/src/app/api/external-agents/[id]/heartbeat/route.ts +33 -0
- package/src/app/api/external-agents/[id]/route.ts +31 -0
- package/src/app/api/external-agents/register/route.ts +3 -0
- package/src/app/api/external-agents/route.ts +66 -0
- package/src/app/api/gateways/[id]/health/route.ts +28 -0
- package/src/app/api/gateways/[id]/route.ts +79 -0
- package/src/app/api/gateways/route.ts +57 -0
- package/src/app/api/openclaw/gateway/route.ts +10 -7
- package/src/app/api/openclaw/skills/route.ts +1 -1
- package/src/app/api/providers/[id]/discover-models/route.ts +27 -0
- package/src/app/api/schedules/[id]/route.ts +38 -9
- package/src/app/api/schedules/route.ts +51 -28
- package/src/app/api/settings/route.ts +6 -10
- package/src/app/api/setup/doctor/route.ts +6 -4
- package/src/app/api/tasks/[id]/route.ts +2 -1
- package/src/app/api/tasks/bulk/route.ts +2 -2
- package/src/app/page.tsx +126 -15
- package/src/cli/binary.test.js +142 -0
- package/src/cli/index.js +34 -11
- package/src/cli/index.test.js +195 -0
- package/src/cli/index.ts +20 -4
- package/src/cli/server-cmd.test.js +59 -0
- package/src/cli/spec.js +20 -2
- package/src/components/agents/agent-sheet.tsx +249 -7
- package/src/components/agents/inspector-panel.tsx +3 -2
- package/src/components/agents/sandbox-env-panel.tsx +4 -1
- package/src/components/auth/setup-wizard.tsx +970 -275
- package/src/components/chat/chat-area.tsx +41 -14
- package/src/components/chat/chat-card.tsx +2 -1
- package/src/components/chat/chat-header.tsx +8 -13
- package/src/components/chat/chat-list.tsx +58 -20
- package/src/components/chat/message-list.tsx +142 -18
- package/src/components/chatrooms/chatroom-input.tsx +96 -33
- package/src/components/chatrooms/chatroom-list.tsx +141 -72
- package/src/components/chatrooms/chatroom-message.tsx +7 -6
- package/src/components/chatrooms/chatroom-sheet.tsx +13 -1
- package/src/components/chatrooms/chatroom-tool-request-banner.tsx +5 -2
- package/src/components/chatrooms/chatroom-view.tsx +157 -86
- package/src/components/chatrooms/reaction-picker.tsx +38 -33
- package/src/components/gateways/gateway-sheet.tsx +567 -0
- package/src/components/input/chat-input.tsx +135 -86
- package/src/components/layout/app-layout.tsx +2 -0
- package/src/components/memory/memory-browser.tsx +71 -6
- package/src/components/memory/memory-card.tsx +18 -0
- package/src/components/memory/memory-detail.tsx +58 -31
- package/src/components/memory/memory-sheet.tsx +32 -4
- package/src/components/projects/project-detail.tsx +7 -2
- package/src/components/providers/provider-list.tsx +158 -2
- package/src/components/providers/provider-sheet.tsx +81 -70
- package/src/components/shared/bottom-sheet.tsx +31 -15
- package/src/components/shared/confirm-dialog.tsx +45 -30
- package/src/components/shared/model-combobox.tsx +90 -8
- package/src/components/shared/settings/section-heartbeat.tsx +11 -6
- package/src/components/shared/settings/section-orchestrator.tsx +3 -0
- package/src/components/shared/settings/settings-page.tsx +5 -3
- package/src/components/tasks/approvals-panel.tsx +7 -1
- package/src/components/ui/dialog.tsx +2 -2
- package/src/components/wallets/wallet-approval-dialog.tsx +59 -54
- package/src/lib/heartbeat-defaults.ts +48 -0
- package/src/lib/memory-presentation.ts +59 -0
- package/src/lib/provider-model-discovery-client.ts +29 -0
- package/src/lib/providers/index.ts +12 -5
- package/src/lib/runtime-loop.ts +105 -3
- package/src/lib/safe-storage.ts +6 -1
- package/src/lib/server/agent-runtime-config.test.ts +141 -0
- package/src/lib/server/agent-runtime-config.ts +277 -0
- package/src/lib/server/approvals-auto-approve.test.ts +59 -0
- package/src/lib/server/build-llm.test.ts +13 -5
- package/src/lib/server/chat-execution-tool-events.test.ts +87 -2
- package/src/lib/server/chat-execution.ts +159 -71
- package/src/lib/server/chatroom-helpers.test.ts +7 -0
- package/src/lib/server/chatroom-helpers.ts +99 -6
- package/src/lib/server/chatroom-session-persistence.test.ts +87 -0
- package/src/lib/server/connectors/manager.ts +89 -61
- package/src/lib/server/connectors/slack.ts +1 -1
- package/src/lib/server/daemon-state.ts +3 -2
- package/src/lib/server/eval/agent-regression.test.ts +47 -0
- package/src/lib/server/eval/agent-regression.ts +1742 -0
- package/src/lib/server/eval/runner.ts +11 -1
- package/src/lib/server/eval/store.ts +2 -1
- package/src/lib/server/heartbeat-service.ts +10 -4
- package/src/lib/server/main-agent-loop.ts +13 -6
- package/src/lib/server/openclaw-exec-config.ts +4 -2
- package/src/lib/server/openclaw-gateway.ts +123 -36
- package/src/lib/server/orchestrator-lg.ts +1 -2
- package/src/lib/server/orchestrator.ts +3 -2
- package/src/lib/server/plugins.test.ts +9 -1
- package/src/lib/server/plugins.ts +12 -2
- package/src/lib/server/provider-model-discovery.ts +481 -0
- package/src/lib/server/queue.ts +1 -1
- package/src/lib/server/runtime-settings.test.ts +119 -0
- package/src/lib/server/runtime-settings.ts +12 -92
- package/src/lib/server/schedule-normalization.ts +187 -0
- package/src/lib/server/session-tools/autonomy-tools.test.ts +23 -0
- package/src/lib/server/session-tools/crud.ts +27 -3
- package/src/lib/server/session-tools/discovery-approvals.test.ts +170 -0
- package/src/lib/server/session-tools/discovery.ts +18 -8
- package/src/lib/server/session-tools/file-normalize.test.ts +5 -0
- package/src/lib/server/session-tools/file.ts +8 -2
- package/src/lib/server/session-tools/http.ts +9 -3
- package/src/lib/server/session-tools/index.ts +31 -1
- package/src/lib/server/session-tools/manage-schedules.test.ts +137 -0
- package/src/lib/server/session-tools/monitor.ts +14 -7
- package/src/lib/server/session-tools/openclaw-nodes.test.ts +111 -0
- package/src/lib/server/session-tools/openclaw-nodes.ts +86 -20
- package/src/lib/server/session-tools/platform.ts +1 -1
- package/src/lib/server/session-tools/plugin-creator.ts +9 -2
- package/src/lib/server/session-tools/sandbox.ts +51 -92
- package/src/lib/server/session-tools/session-info.ts +22 -1
- package/src/lib/server/session-tools/session-tools-wiring.test.ts +23 -0
- package/src/lib/server/session-tools/shell.ts +2 -2
- package/src/lib/server/session-tools/subagent.ts +3 -1
- package/src/lib/server/session-tools/web.ts +73 -30
- package/src/lib/server/storage.ts +29 -3
- package/src/lib/server/stream-agent-chat.test.ts +61 -0
- package/src/lib/server/stream-agent-chat.ts +139 -4
- package/src/lib/server/structured-extract.ts +1 -1
- package/src/lib/server/task-mention.ts +0 -1
- package/src/lib/server/tool-aliases.ts +37 -6
- package/src/lib/server/tool-capability-policy.ts +1 -1
- package/src/lib/setup-defaults.ts +352 -11
- package/src/lib/tool-definitions.ts +3 -4
- package/src/lib/validation/schemas.ts +55 -1
- package/src/stores/use-app-store.ts +43 -1
- package/src/stores/use-chatroom-store.ts +153 -26
- package/src/types/index.ts +189 -6
- package/src/app/api/chats/[id]/main-loop/route.ts +0 -13
|
@@ -0,0 +1,1742 @@
|
|
|
1
|
+
import fs from 'node:fs'
|
|
2
|
+
import http, { type IncomingMessage, type Server as HttpServer, type ServerResponse } from 'node:http'
|
|
3
|
+
import net, { type AddressInfo } from 'node:net'
|
|
4
|
+
import { createHash } from 'node:crypto'
|
|
5
|
+
import path from 'node:path'
|
|
6
|
+
import { genId } from '@/lib/id'
|
|
7
|
+
import type { ApprovalRequest, MessageToolEvent, Session } from '@/types'
|
|
8
|
+
import { submitDecision } from '../approvals'
|
|
9
|
+
import { executeSessionChatTurn, type ExecuteChatTurnResult } from '../chat-execution'
|
|
10
|
+
import { WORKSPACE_DIR } from '../data-dir'
|
|
11
|
+
import { getPluginManager } from '../plugins'
|
|
12
|
+
import { sendMailboxEnvelope, listMailbox } from '../session-mailbox'
|
|
13
|
+
import { processDueWatchJobs } from '../watch-jobs'
|
|
14
|
+
import {
|
|
15
|
+
deleteApproval,
|
|
16
|
+
deleteBrowserSession,
|
|
17
|
+
deleteDelegationJob,
|
|
18
|
+
deleteWatchJob,
|
|
19
|
+
decryptKey,
|
|
20
|
+
loadAgents,
|
|
21
|
+
loadApprovals,
|
|
22
|
+
loadDelegationJobs,
|
|
23
|
+
loadSchedules,
|
|
24
|
+
loadSecrets,
|
|
25
|
+
loadSessions,
|
|
26
|
+
loadSettings,
|
|
27
|
+
loadTasks,
|
|
28
|
+
loadWatchJobs,
|
|
29
|
+
saveSchedules,
|
|
30
|
+
saveSecrets,
|
|
31
|
+
saveSessions,
|
|
32
|
+
saveSettings,
|
|
33
|
+
saveTasks,
|
|
34
|
+
} from '../storage'
|
|
35
|
+
|
|
36
|
+
export type RegressionApprovalMode = 'manual' | 'auto' | 'off'
|
|
37
|
+
|
|
38
|
+
export interface RegressionAssertion {
|
|
39
|
+
name: string
|
|
40
|
+
passed: boolean
|
|
41
|
+
details?: string
|
|
42
|
+
weight?: number
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
export interface AgentRegressionScenarioResult {
|
|
46
|
+
scenarioId: string
|
|
47
|
+
name: string
|
|
48
|
+
approvalMode: RegressionApprovalMode
|
|
49
|
+
status: 'passed' | 'failed'
|
|
50
|
+
score: number
|
|
51
|
+
maxScore: number
|
|
52
|
+
assertions: RegressionAssertion[]
|
|
53
|
+
sessionId: string
|
|
54
|
+
workspaceDir: string
|
|
55
|
+
toolNames: string[]
|
|
56
|
+
approvalIds: string[]
|
|
57
|
+
approvals: RegressionApprovalEvidence[]
|
|
58
|
+
responseTexts: string[]
|
|
59
|
+
turns: RegressionTurnEvidence[]
|
|
60
|
+
artifacts: RegressionArtifactEvidence[]
|
|
61
|
+
evidencePaths: {
|
|
62
|
+
transcript: string
|
|
63
|
+
approvals: string
|
|
64
|
+
workspace: string
|
|
65
|
+
}
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
export interface AgentRegressionSuiteResult {
|
|
69
|
+
id: string
|
|
70
|
+
agentId: string
|
|
71
|
+
approvalModes: RegressionApprovalMode[]
|
|
72
|
+
startedAt: number
|
|
73
|
+
endedAt: number
|
|
74
|
+
score: number
|
|
75
|
+
maxScore: number
|
|
76
|
+
scenarios: AgentRegressionScenarioResult[]
|
|
77
|
+
resultsPath: string
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
interface ScenarioContext {
|
|
81
|
+
suiteId: string
|
|
82
|
+
agentId: string
|
|
83
|
+
agent: Record<string, unknown>
|
|
84
|
+
approvalMode: RegressionApprovalMode
|
|
85
|
+
sessionId: string
|
|
86
|
+
workspaceDir: string
|
|
87
|
+
responseTexts: string[]
|
|
88
|
+
toolEvents: MessageToolEvent[]
|
|
89
|
+
toolNames: Set<string>
|
|
90
|
+
turns: RegressionTurnEvidence[]
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
interface AgentRegressionScenarioDefinition {
|
|
94
|
+
id: string
|
|
95
|
+
name: string
|
|
96
|
+
plugins: string[]
|
|
97
|
+
run: (ctx: ScenarioContext) => Promise<AgentRegressionScenarioResult>
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
interface MockMailAccount {
|
|
101
|
+
email: string
|
|
102
|
+
chosenPassword: string
|
|
103
|
+
appPassword: string
|
|
104
|
+
inviteCode: string
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
interface MockSocialAccount {
|
|
108
|
+
email: string
|
|
109
|
+
handle: string
|
|
110
|
+
password: string
|
|
111
|
+
inviteCode: string
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
interface MockVerifiedSignup {
|
|
115
|
+
token: string
|
|
116
|
+
email: string
|
|
117
|
+
handle: string
|
|
118
|
+
password: string
|
|
119
|
+
verificationCode: string
|
|
120
|
+
recoveryToken: string
|
|
121
|
+
verified: boolean
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
interface MockSignupHarness {
|
|
125
|
+
baseUrl: string
|
|
126
|
+
close: () => Promise<void>
|
|
127
|
+
state: {
|
|
128
|
+
mailAccounts: Map<string, MockMailAccount>
|
|
129
|
+
socialAccounts: Map<string, MockSocialAccount>
|
|
130
|
+
pendingVerifiedSignups: Map<string, MockVerifiedSignup>
|
|
131
|
+
}
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
interface MockSmtpMessage {
|
|
135
|
+
mailFrom: string
|
|
136
|
+
recipients: string[]
|
|
137
|
+
data: string
|
|
138
|
+
}
|
|
139
|
+
|
|
140
|
+
interface MockSmtpHarness {
|
|
141
|
+
port: number
|
|
142
|
+
messages: MockSmtpMessage[]
|
|
143
|
+
close: () => Promise<void>
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
interface MockResearchDeployHarness {
|
|
147
|
+
baseUrl: string
|
|
148
|
+
close: () => Promise<void>
|
|
149
|
+
state: {
|
|
150
|
+
deployments: Map<string, string>
|
|
151
|
+
}
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
export interface RegressionToolEventEvidence {
|
|
155
|
+
name: string
|
|
156
|
+
input?: string
|
|
157
|
+
output?: string
|
|
158
|
+
error?: boolean | string
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
export interface RegressionTurnEvidence {
|
|
162
|
+
prompt: string
|
|
163
|
+
responseText: string
|
|
164
|
+
toolEvents: RegressionToolEventEvidence[]
|
|
165
|
+
approvalIds: string[]
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
export interface RegressionArtifactEvidence {
|
|
169
|
+
relativePath: string
|
|
170
|
+
exists: boolean
|
|
171
|
+
size: number
|
|
172
|
+
sha256?: string
|
|
173
|
+
preview?: string
|
|
174
|
+
}
|
|
175
|
+
|
|
176
|
+
export interface RegressionApprovalEvidence {
|
|
177
|
+
id: string
|
|
178
|
+
category: string
|
|
179
|
+
status: string
|
|
180
|
+
title: string
|
|
181
|
+
toolId: string | null
|
|
182
|
+
}
|
|
183
|
+
|
|
184
|
+
function ensureDir(dir: string): void {
|
|
185
|
+
fs.mkdirSync(dir, { recursive: true })
|
|
186
|
+
}
|
|
187
|
+
|
|
188
|
+
function escapeHtml(value: string): string {
|
|
189
|
+
return value
|
|
190
|
+
.replaceAll('&', '&')
|
|
191
|
+
.replaceAll('<', '<')
|
|
192
|
+
.replaceAll('>', '>')
|
|
193
|
+
.replaceAll('"', '"')
|
|
194
|
+
}
|
|
195
|
+
|
|
196
|
+
function signupSeed(input: string): string {
|
|
197
|
+
return createHash('sha1').update(input).digest('hex').slice(0, 8)
|
|
198
|
+
}
|
|
199
|
+
|
|
200
|
+
function htmlDocument(title: string, body: string): string {
|
|
201
|
+
return [
|
|
202
|
+
'<!doctype html>',
|
|
203
|
+
'<html lang="en">',
|
|
204
|
+
'<head>',
|
|
205
|
+
' <meta charset="utf-8">',
|
|
206
|
+
` <title>${escapeHtml(title)}</title>`,
|
|
207
|
+
' <meta name="viewport" content="width=device-width, initial-scale=1">',
|
|
208
|
+
' <style>',
|
|
209
|
+
' body { font-family: ui-sans-serif, system-ui, sans-serif; margin: 40px; line-height: 1.5; }',
|
|
210
|
+
' form { display: grid; gap: 12px; max-width: 420px; }',
|
|
211
|
+
' label { display: grid; gap: 6px; font-weight: 600; }',
|
|
212
|
+
' input { padding: 10px 12px; border: 1px solid #cbd5e1; border-radius: 8px; }',
|
|
213
|
+
' button, a.button { display: inline-flex; align-items: center; justify-content: center; padding: 10px 14px; border-radius: 8px; background: #0f172a; color: white; text-decoration: none; border: none; cursor: pointer; }',
|
|
214
|
+
' .card { max-width: 720px; padding: 24px; border: 1px solid #cbd5e1; border-radius: 16px; background: #fff; }',
|
|
215
|
+
' .mono { font-family: ui-monospace, SFMono-Regular, Menlo, monospace; }',
|
|
216
|
+
' .muted { color: #475569; }',
|
|
217
|
+
' </style>',
|
|
218
|
+
'</head>',
|
|
219
|
+
'<body>',
|
|
220
|
+
body,
|
|
221
|
+
'</body>',
|
|
222
|
+
'</html>',
|
|
223
|
+
].join('\n')
|
|
224
|
+
}
|
|
225
|
+
|
|
226
|
+
function sendHtml(res: ServerResponse, statusCode: number, title: string, body: string): void {
|
|
227
|
+
res.statusCode = statusCode
|
|
228
|
+
res.setHeader('content-type', 'text/html; charset=utf-8')
|
|
229
|
+
res.end(htmlDocument(title, body))
|
|
230
|
+
}
|
|
231
|
+
|
|
232
|
+
function redirect(res: ServerResponse, location: string): void {
|
|
233
|
+
res.statusCode = 302
|
|
234
|
+
res.setHeader('location', location)
|
|
235
|
+
res.end()
|
|
236
|
+
}
|
|
237
|
+
|
|
238
|
+
async function readRequestBody(req: IncomingMessage): Promise<string> {
|
|
239
|
+
const chunks: Buffer[] = []
|
|
240
|
+
for await (const chunk of req) {
|
|
241
|
+
chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk))
|
|
242
|
+
}
|
|
243
|
+
return Buffer.concat(chunks).toString('utf8')
|
|
244
|
+
}
|
|
245
|
+
|
|
246
|
+
async function startLocalHttpServer(
|
|
247
|
+
handler: (req: IncomingMessage, res: ServerResponse) => Promise<void>,
|
|
248
|
+
): Promise<{ server: HttpServer; baseUrl: string }> {
|
|
249
|
+
const server = http.createServer((req, res) => {
|
|
250
|
+
void handler(req, res).catch((error: unknown) => {
|
|
251
|
+
const message = error instanceof Error ? error.stack || error.message : String(error)
|
|
252
|
+
res.statusCode = 500
|
|
253
|
+
res.setHeader('content-type', 'text/plain; charset=utf-8')
|
|
254
|
+
res.end(message)
|
|
255
|
+
})
|
|
256
|
+
})
|
|
257
|
+
await new Promise<void>((resolve, reject) => {
|
|
258
|
+
server.once('error', reject)
|
|
259
|
+
server.listen(0, '127.0.0.1', () => {
|
|
260
|
+
server.off('error', reject)
|
|
261
|
+
resolve()
|
|
262
|
+
})
|
|
263
|
+
})
|
|
264
|
+
const address = server.address()
|
|
265
|
+
if (!address || typeof address === 'string') throw new Error('Mock HTTP server failed to bind to a TCP port.')
|
|
266
|
+
return {
|
|
267
|
+
server,
|
|
268
|
+
baseUrl: `http://127.0.0.1:${address.port}`,
|
|
269
|
+
}
|
|
270
|
+
}
|
|
271
|
+
|
|
272
|
+
async function closeServer(server: HttpServer | net.Server): Promise<void> {
|
|
273
|
+
await new Promise<void>((resolve) => {
|
|
274
|
+
server.close(() => resolve())
|
|
275
|
+
})
|
|
276
|
+
}
|
|
277
|
+
|
|
278
|
+
async function startMockSignupHarness(): Promise<MockSignupHarness> {
|
|
279
|
+
const state = {
|
|
280
|
+
mailAccounts: new Map<string, MockMailAccount>(),
|
|
281
|
+
socialAccounts: new Map<string, MockSocialAccount>(),
|
|
282
|
+
pendingVerifiedSignups: new Map<string, MockVerifiedSignup>(),
|
|
283
|
+
}
|
|
284
|
+
|
|
285
|
+
const { server, baseUrl } = await startLocalHttpServer(async (req, res) => {
|
|
286
|
+
const url = new URL(req.url || '/', baseUrl)
|
|
287
|
+
const pathname = url.pathname
|
|
288
|
+
|
|
289
|
+
if (req.method === 'GET' && pathname === '/') {
|
|
290
|
+
return sendHtml(res, 200, 'Mock Services', `
|
|
291
|
+
<div class="card">
|
|
292
|
+
<h1>Mock External Services</h1>
|
|
293
|
+
<p class="muted">Use these pages to test browser signup, secrets, and verification flows.</p>
|
|
294
|
+
<p><a class="button" href="/mail/signup">Open MockMail signup</a></p>
|
|
295
|
+
<p><a class="button" href="/verify-social/signup">Open Chirper verification signup</a></p>
|
|
296
|
+
</div>
|
|
297
|
+
`)
|
|
298
|
+
}
|
|
299
|
+
|
|
300
|
+
if (req.method === 'GET' && pathname === '/mail/signup') {
|
|
301
|
+
const prefilledEmail = String(url.searchParams.get('email') || '').trim()
|
|
302
|
+
return sendHtml(res, 200, 'MockMail Signup', `
|
|
303
|
+
<div class="card">
|
|
304
|
+
<h1>Create a MockMail account</h1>
|
|
305
|
+
<p class="muted">This mock provider generates an app password and a social invite code after signup.</p>
|
|
306
|
+
<form method="post" action="/mail/signup">
|
|
307
|
+
<label>Email address
|
|
308
|
+
<input id="email" name="email" type="email" autocomplete="email" value="${escapeHtml(prefilledEmail)}" required />
|
|
309
|
+
</label>
|
|
310
|
+
<label>Password
|
|
311
|
+
<input id="password" name="password" type="password" autocomplete="new-password" value="TempMockMailPass!23" required />
|
|
312
|
+
</label>
|
|
313
|
+
<button id="submit" type="submit">Create MockMail account</button>
|
|
314
|
+
</form>
|
|
315
|
+
</div>
|
|
316
|
+
`)
|
|
317
|
+
}
|
|
318
|
+
|
|
319
|
+
if (req.method === 'POST' && pathname === '/mail/signup') {
|
|
320
|
+
const body = await readRequestBody(req)
|
|
321
|
+
const form = new URLSearchParams(body)
|
|
322
|
+
const email = String(form.get('email') || '').trim().toLowerCase()
|
|
323
|
+
const chosenPassword = String(form.get('password') || '').trim()
|
|
324
|
+
if (!email || !chosenPassword) {
|
|
325
|
+
return sendHtml(res, 400, 'MockMail Signup Error', '<div class="card"><h1>Missing email or password</h1></div>')
|
|
326
|
+
}
|
|
327
|
+
const seed = signupSeed(email)
|
|
328
|
+
state.mailAccounts.set(email, {
|
|
329
|
+
email,
|
|
330
|
+
chosenPassword,
|
|
331
|
+
appPassword: `mockmail-app-${seed}`,
|
|
332
|
+
inviteCode: `INV-${seed.slice(0, 6).toUpperCase()}`,
|
|
333
|
+
})
|
|
334
|
+
return redirect(res, `/mail/dashboard?email=${encodeURIComponent(email)}`)
|
|
335
|
+
}
|
|
336
|
+
|
|
337
|
+
if (req.method === 'GET' && pathname === '/mail/dashboard') {
|
|
338
|
+
const email = String(url.searchParams.get('email') || '').trim().toLowerCase()
|
|
339
|
+
const account = state.mailAccounts.get(email)
|
|
340
|
+
if (!account) {
|
|
341
|
+
return sendHtml(res, 404, 'MockMail Account Missing', '<div class="card"><h1>Account not found</h1></div>')
|
|
342
|
+
}
|
|
343
|
+
return sendHtml(res, 200, 'MockMail Dashboard', `
|
|
344
|
+
<div class="card">
|
|
345
|
+
<h1>MockMail account ready</h1>
|
|
346
|
+
<p>Email: <span class="mono" id="mail-email">${escapeHtml(account.email)}</span></p>
|
|
347
|
+
<p>App password: <span class="mono" id="app-password">${escapeHtml(account.appPassword)}</span></p>
|
|
348
|
+
<p>Social invite code: <span class="mono" id="invite-code">${escapeHtml(account.inviteCode)}</span></p>
|
|
349
|
+
<p class="muted">Use the invite code to create a Chirper account.</p>
|
|
350
|
+
<p><a class="button" id="social-link" href="/social/signup?email=${encodeURIComponent(account.email)}&inviteCode=${encodeURIComponent(account.inviteCode)}">Create Chirper account</a></p>
|
|
351
|
+
</div>
|
|
352
|
+
`)
|
|
353
|
+
}
|
|
354
|
+
|
|
355
|
+
if (req.method === 'GET' && pathname === '/social/signup') {
|
|
356
|
+
const email = String(url.searchParams.get('email') || '').trim()
|
|
357
|
+
const inviteCode = String(url.searchParams.get('inviteCode') || '').trim()
|
|
358
|
+
const handle = String(url.searchParams.get('handle') || 'northstar-operator').trim()
|
|
359
|
+
return sendHtml(res, 200, 'Chirper Signup', `
|
|
360
|
+
<div class="card">
|
|
361
|
+
<h1>Create a Chirper account</h1>
|
|
362
|
+
<form method="post" action="/social/signup">
|
|
363
|
+
<label>Email address
|
|
364
|
+
<input id="email" name="email" type="email" value="${escapeHtml(email)}" required />
|
|
365
|
+
</label>
|
|
366
|
+
<label>Handle
|
|
367
|
+
<input id="handle" name="handle" type="text" value="${escapeHtml(handle)}" required />
|
|
368
|
+
</label>
|
|
369
|
+
<label>Password
|
|
370
|
+
<input id="password" name="password" type="password" value="TempChirperPass!23" required />
|
|
371
|
+
</label>
|
|
372
|
+
<label>Invite code
|
|
373
|
+
<input id="inviteCode" name="inviteCode" type="text" value="${escapeHtml(inviteCode)}" required />
|
|
374
|
+
</label>
|
|
375
|
+
<button id="submit" type="submit">Create Chirper account</button>
|
|
376
|
+
</form>
|
|
377
|
+
</div>
|
|
378
|
+
`)
|
|
379
|
+
}
|
|
380
|
+
|
|
381
|
+
if (req.method === 'POST' && pathname === '/social/signup') {
|
|
382
|
+
const body = await readRequestBody(req)
|
|
383
|
+
const form = new URLSearchParams(body)
|
|
384
|
+
const email = String(form.get('email') || '').trim().toLowerCase()
|
|
385
|
+
const handle = String(form.get('handle') || '').trim()
|
|
386
|
+
const password = String(form.get('password') || '').trim()
|
|
387
|
+
const inviteCode = String(form.get('inviteCode') || '').trim()
|
|
388
|
+
const mailAccount = state.mailAccounts.get(email)
|
|
389
|
+
if (!mailAccount || inviteCode !== mailAccount.inviteCode || !handle || !password) {
|
|
390
|
+
return sendHtml(res, 400, 'Chirper Signup Error', `
|
|
391
|
+
<div class="card">
|
|
392
|
+
<h1>Signup failed</h1>
|
|
393
|
+
<p class="muted">A valid invite code from the MockMail dashboard is required.</p>
|
|
394
|
+
</div>
|
|
395
|
+
`)
|
|
396
|
+
}
|
|
397
|
+
state.socialAccounts.set(handle, {
|
|
398
|
+
email,
|
|
399
|
+
handle,
|
|
400
|
+
password,
|
|
401
|
+
inviteCode,
|
|
402
|
+
})
|
|
403
|
+
return redirect(res, `/social/success?handle=${encodeURIComponent(handle)}`)
|
|
404
|
+
}
|
|
405
|
+
|
|
406
|
+
if (req.method === 'GET' && pathname === '/social/success') {
|
|
407
|
+
const handle = String(url.searchParams.get('handle') || '').trim()
|
|
408
|
+
const account = state.socialAccounts.get(handle)
|
|
409
|
+
if (!account) {
|
|
410
|
+
return sendHtml(res, 404, 'Chirper Account Missing', '<div class="card"><h1>Account not found</h1></div>')
|
|
411
|
+
}
|
|
412
|
+
return sendHtml(res, 200, 'Chirper Ready', `
|
|
413
|
+
<div class="card">
|
|
414
|
+
<h1>Chirper account ready</h1>
|
|
415
|
+
<p>Handle: <span class="mono" id="chirper-handle">${escapeHtml(account.handle)}</span></p>
|
|
416
|
+
<p>Email: <span class="mono">${escapeHtml(account.email)}</span></p>
|
|
417
|
+
</div>
|
|
418
|
+
`)
|
|
419
|
+
}
|
|
420
|
+
|
|
421
|
+
if (req.method === 'GET' && pathname === '/verify-social/signup') {
|
|
422
|
+
const prefilledEmail = String(url.searchParams.get('email') || '').trim()
|
|
423
|
+
const prefilledHandle = String(url.searchParams.get('handle') || 'verified-operator').trim()
|
|
424
|
+
return sendHtml(res, 200, 'Chirper Verification Signup', `
|
|
425
|
+
<div class="card">
|
|
426
|
+
<h1>Create a Chirper account with verification</h1>
|
|
427
|
+
<p class="muted">This flow requires a human verification code after the first step.</p>
|
|
428
|
+
<form method="post" action="/verify-social/signup">
|
|
429
|
+
<label>Email address
|
|
430
|
+
<input id="email" name="email" type="email" autocomplete="email" value="${escapeHtml(prefilledEmail)}" required />
|
|
431
|
+
</label>
|
|
432
|
+
<label>Handle
|
|
433
|
+
<input id="handle" name="handle" type="text" value="${escapeHtml(prefilledHandle)}" required />
|
|
434
|
+
</label>
|
|
435
|
+
<label>Password
|
|
436
|
+
<input id="password" name="password" type="password" value="TempVerifiedPass!23" required />
|
|
437
|
+
</label>
|
|
438
|
+
<button id="submit" type="submit">Start verified signup</button>
|
|
439
|
+
</form>
|
|
440
|
+
</div>
|
|
441
|
+
`)
|
|
442
|
+
}
|
|
443
|
+
|
|
444
|
+
if (req.method === 'POST' && pathname === '/verify-social/signup') {
|
|
445
|
+
const body = await readRequestBody(req)
|
|
446
|
+
const form = new URLSearchParams(body)
|
|
447
|
+
const email = String(form.get('email') || '').trim().toLowerCase()
|
|
448
|
+
const handle = String(form.get('handle') || '').trim()
|
|
449
|
+
const password = String(form.get('password') || '').trim()
|
|
450
|
+
if (!email || !handle || !password) {
|
|
451
|
+
return sendHtml(res, 400, 'Verified Signup Error', '<div class="card"><h1>Missing email, handle, or password</h1></div>')
|
|
452
|
+
}
|
|
453
|
+
const token = `verify-${signupSeed(`${email}:${handle}`)}`
|
|
454
|
+
state.pendingVerifiedSignups.set(token, {
|
|
455
|
+
token,
|
|
456
|
+
email,
|
|
457
|
+
handle,
|
|
458
|
+
password,
|
|
459
|
+
verificationCode: '246810',
|
|
460
|
+
recoveryToken: `recover-${signupSeed(`${handle}:${email}:recovery`)}`,
|
|
461
|
+
verified: false,
|
|
462
|
+
})
|
|
463
|
+
return redirect(res, `/verify-social/verify?token=${encodeURIComponent(token)}`)
|
|
464
|
+
}
|
|
465
|
+
|
|
466
|
+
if (req.method === 'GET' && pathname === '/verify-social/verify') {
|
|
467
|
+
const token = String(url.searchParams.get('token') || '').trim()
|
|
468
|
+
const pending = state.pendingVerifiedSignups.get(token)
|
|
469
|
+
if (!pending) {
|
|
470
|
+
return sendHtml(res, 404, 'Verification Missing', '<div class="card"><h1>Verification session not found</h1></div>')
|
|
471
|
+
}
|
|
472
|
+
return sendHtml(res, 200, 'Enter Verification Code', `
|
|
473
|
+
<div class="card">
|
|
474
|
+
<h1>Verification code required</h1>
|
|
475
|
+
<p id="verification-copy">A human verification code was sent out-of-band. Ask the human for the code. Do not guess.</p>
|
|
476
|
+
<form method="post" action="/verify-social/verify">
|
|
477
|
+
<input type="hidden" name="token" value="${escapeHtml(token)}" />
|
|
478
|
+
<label>Verification code
|
|
479
|
+
<input id="code" name="code" type="text" required />
|
|
480
|
+
</label>
|
|
481
|
+
<button id="submit" type="submit">Complete verified signup</button>
|
|
482
|
+
</form>
|
|
483
|
+
</div>
|
|
484
|
+
`)
|
|
485
|
+
}
|
|
486
|
+
|
|
487
|
+
if (req.method === 'POST' && pathname === '/verify-social/verify') {
|
|
488
|
+
const body = await readRequestBody(req)
|
|
489
|
+
const form = new URLSearchParams(body)
|
|
490
|
+
const token = String(form.get('token') || '').trim()
|
|
491
|
+
const code = String(form.get('code') || '').trim()
|
|
492
|
+
const pending = state.pendingVerifiedSignups.get(token)
|
|
493
|
+
if (!pending) {
|
|
494
|
+
return sendHtml(res, 404, 'Verification Missing', '<div class="card"><h1>Verification session not found</h1></div>')
|
|
495
|
+
}
|
|
496
|
+
if (code !== pending.verificationCode) {
|
|
497
|
+
return sendHtml(res, 400, 'Verification Failed', `
|
|
498
|
+
<div class="card">
|
|
499
|
+
<h1>Incorrect code</h1>
|
|
500
|
+
<p class="muted">The verification code must come from a human. Do not guess.</p>
|
|
501
|
+
</div>
|
|
502
|
+
`)
|
|
503
|
+
}
|
|
504
|
+
pending.verified = true
|
|
505
|
+
state.pendingVerifiedSignups.set(token, pending)
|
|
506
|
+
return redirect(res, `/verify-social/success?token=${encodeURIComponent(token)}`)
|
|
507
|
+
}
|
|
508
|
+
|
|
509
|
+
if (req.method === 'GET' && pathname === '/verify-social/success') {
|
|
510
|
+
const token = String(url.searchParams.get('token') || '').trim()
|
|
511
|
+
const pending = state.pendingVerifiedSignups.get(token)
|
|
512
|
+
if (!pending || !pending.verified) {
|
|
513
|
+
return sendHtml(res, 404, 'Verified Signup Missing', '<div class="card"><h1>Verified account not found</h1></div>')
|
|
514
|
+
}
|
|
515
|
+
return sendHtml(res, 200, 'Verified Chirper Ready', `
|
|
516
|
+
<div class="card">
|
|
517
|
+
<h1>Verified Chirper account ready</h1>
|
|
518
|
+
<p>Handle: <span class="mono" id="verified-handle">${escapeHtml(pending.handle)}</span></p>
|
|
519
|
+
<p>Recovery token: <span class="mono" id="recovery-token">${escapeHtml(pending.recoveryToken)}</span></p>
|
|
520
|
+
</div>
|
|
521
|
+
`)
|
|
522
|
+
}
|
|
523
|
+
|
|
524
|
+
return sendHtml(res, 404, 'Not Found', '<div class="card"><h1>Route not found</h1></div>')
|
|
525
|
+
})
|
|
526
|
+
|
|
527
|
+
return {
|
|
528
|
+
baseUrl,
|
|
529
|
+
state,
|
|
530
|
+
close: async () => closeServer(server),
|
|
531
|
+
}
|
|
532
|
+
}
|
|
533
|
+
|
|
534
|
+
async function startMockSmtpHarness(): Promise<MockSmtpHarness> {
|
|
535
|
+
const messages: MockSmtpMessage[] = []
|
|
536
|
+
const server = net.createServer((socket) => {
|
|
537
|
+
let buffer = ''
|
|
538
|
+
let mailFrom = ''
|
|
539
|
+
let recipients: string[] = []
|
|
540
|
+
let dataMode = false
|
|
541
|
+
let dataBuffer = ''
|
|
542
|
+
|
|
543
|
+
socket.write('220 mock-smtp.local ESMTP ready\r\n')
|
|
544
|
+
|
|
545
|
+
const resetMessage = () => {
|
|
546
|
+
mailFrom = ''
|
|
547
|
+
recipients = []
|
|
548
|
+
dataBuffer = ''
|
|
549
|
+
}
|
|
550
|
+
|
|
551
|
+
const pushIfCompleteData = () => {
|
|
552
|
+
const endMarker = '\r\n.\r\n'
|
|
553
|
+
const endIndex = buffer.indexOf(endMarker)
|
|
554
|
+
if (!dataMode || endIndex === -1) return false
|
|
555
|
+
dataBuffer += buffer.slice(0, endIndex)
|
|
556
|
+
buffer = buffer.slice(endIndex + endMarker.length)
|
|
557
|
+
messages.push({
|
|
558
|
+
mailFrom,
|
|
559
|
+
recipients: [...recipients],
|
|
560
|
+
data: dataBuffer,
|
|
561
|
+
})
|
|
562
|
+
dataMode = false
|
|
563
|
+
dataBuffer = ''
|
|
564
|
+
socket.write('250 Message accepted\r\n')
|
|
565
|
+
resetMessage()
|
|
566
|
+
return true
|
|
567
|
+
}
|
|
568
|
+
|
|
569
|
+
socket.on('data', (chunk) => {
|
|
570
|
+
buffer += chunk.toString('utf8')
|
|
571
|
+
while (buffer.length > 0) {
|
|
572
|
+
if (dataMode) {
|
|
573
|
+
if (!pushIfCompleteData()) break
|
|
574
|
+
continue
|
|
575
|
+
}
|
|
576
|
+
const lineEnd = buffer.indexOf('\r\n')
|
|
577
|
+
if (lineEnd === -1) break
|
|
578
|
+
const line = buffer.slice(0, lineEnd)
|
|
579
|
+
buffer = buffer.slice(lineEnd + 2)
|
|
580
|
+
const upper = line.toUpperCase()
|
|
581
|
+
|
|
582
|
+
if (upper.startsWith('EHLO') || upper.startsWith('HELO')) {
|
|
583
|
+
socket.write('250 mock-smtp.local\r\n')
|
|
584
|
+
continue
|
|
585
|
+
}
|
|
586
|
+
if (upper.startsWith('MAIL FROM:')) {
|
|
587
|
+
mailFrom = line.slice('MAIL FROM:'.length).trim()
|
|
588
|
+
socket.write('250 Sender OK\r\n')
|
|
589
|
+
continue
|
|
590
|
+
}
|
|
591
|
+
if (upper.startsWith('RCPT TO:')) {
|
|
592
|
+
recipients.push(line.slice('RCPT TO:'.length).trim().replace(/^<|>$/g, ''))
|
|
593
|
+
socket.write('250 Recipient OK\r\n')
|
|
594
|
+
continue
|
|
595
|
+
}
|
|
596
|
+
if (upper === 'DATA') {
|
|
597
|
+
dataMode = true
|
|
598
|
+
socket.write('354 End data with <CR><LF>.<CR><LF>\r\n')
|
|
599
|
+
continue
|
|
600
|
+
}
|
|
601
|
+
if (upper === 'QUIT') {
|
|
602
|
+
socket.write('221 Bye\r\n')
|
|
603
|
+
socket.end()
|
|
604
|
+
return
|
|
605
|
+
}
|
|
606
|
+
socket.write('250 OK\r\n')
|
|
607
|
+
}
|
|
608
|
+
})
|
|
609
|
+
})
|
|
610
|
+
|
|
611
|
+
await new Promise<void>((resolve, reject) => {
|
|
612
|
+
server.once('error', reject)
|
|
613
|
+
server.listen(0, '127.0.0.1', () => {
|
|
614
|
+
server.off('error', reject)
|
|
615
|
+
resolve()
|
|
616
|
+
})
|
|
617
|
+
})
|
|
618
|
+
const address = server.address() as AddressInfo | null
|
|
619
|
+
if (!address) throw new Error('Mock SMTP server failed to bind to a port.')
|
|
620
|
+
|
|
621
|
+
return {
|
|
622
|
+
port: address.port,
|
|
623
|
+
messages,
|
|
624
|
+
close: async () => closeServer(server),
|
|
625
|
+
}
|
|
626
|
+
}
|
|
627
|
+
|
|
628
|
+
async function startMockResearchDeployHarness(): Promise<MockResearchDeployHarness> {
|
|
629
|
+
const state = {
|
|
630
|
+
deployments: new Map<string, string>(),
|
|
631
|
+
}
|
|
632
|
+
|
|
633
|
+
const { server, baseUrl } = await startLocalHttpServer(async (req, res) => {
|
|
634
|
+
const url = new URL(req.url || '/', baseUrl)
|
|
635
|
+
const pathname = url.pathname
|
|
636
|
+
|
|
637
|
+
if (req.method === 'GET' && pathname === '/research/brief') {
|
|
638
|
+
return sendHtml(res, 200, 'Northstar Notes Brief', `
|
|
639
|
+
<div class="card">
|
|
640
|
+
<h1>Northstar Notes product brief</h1>
|
|
641
|
+
<p><strong>Product:</strong> Northstar Notes, a weekly AI operator briefing for busy startup founders.</p>
|
|
642
|
+
<p><strong>Audience:</strong> Mid-stage founders who need signal, not noise.</p>
|
|
643
|
+
<p><strong>Required headline:</strong> Northstar Notes for AI Operators</p>
|
|
644
|
+
<p><strong>Required subhead:</strong> One sharp Friday briefing on launches, model updates, and GTM moves that matter.</p>
|
|
645
|
+
<p><strong>Required CTA:</strong> Get the Friday briefing</p>
|
|
646
|
+
<p><strong>Required proof points:</strong> concise market signal, product launch summaries, operator action items.</p>
|
|
647
|
+
<p><strong>Design note:</strong> make it feel decisive and editorial, not generic SaaS boilerplate.</p>
|
|
648
|
+
</div>
|
|
649
|
+
`)
|
|
650
|
+
}
|
|
651
|
+
|
|
652
|
+
if (req.method === 'GET' && pathname === '/docs/deploy-api') {
|
|
653
|
+
return sendHtml(res, 200, 'Deploy API Docs', `
|
|
654
|
+
<div class="card">
|
|
655
|
+
<h1>Mock deploy API</h1>
|
|
656
|
+
<p>Deploy a static HTML page by POSTing JSON to <span class="mono">/deploy</span>.</p>
|
|
657
|
+
<pre class="mono">{
|
|
658
|
+
"slug": "northstar-notes",
|
|
659
|
+
"html": "<!doctype html>..."
|
|
660
|
+
}</pre>
|
|
661
|
+
<p>The response is JSON with a single field: <span class="mono">url</span>.</p>
|
|
662
|
+
<p>After deployment, verify the live page by opening the returned URL and checking that the required headline is visible.</p>
|
|
663
|
+
</div>
|
|
664
|
+
`)
|
|
665
|
+
}
|
|
666
|
+
|
|
667
|
+
if (req.method === 'POST' && pathname === '/deploy') {
|
|
668
|
+
const raw = await readRequestBody(req)
|
|
669
|
+
let payload: Record<string, unknown>
|
|
670
|
+
try {
|
|
671
|
+
payload = JSON.parse(raw) as Record<string, unknown>
|
|
672
|
+
} catch {
|
|
673
|
+
res.statusCode = 400
|
|
674
|
+
res.setHeader('content-type', 'application/json')
|
|
675
|
+
res.end(JSON.stringify({ error: 'invalid json' }))
|
|
676
|
+
return
|
|
677
|
+
}
|
|
678
|
+
const html = typeof payload.html === 'string' ? payload.html : ''
|
|
679
|
+
const slug = typeof payload.slug === 'string' && payload.slug.trim()
|
|
680
|
+
? payload.slug.trim().toLowerCase().replace(/[^a-z0-9-]/g, '-')
|
|
681
|
+
: `site-${genId(4)}`
|
|
682
|
+
if (!html.trim()) {
|
|
683
|
+
res.statusCode = 400
|
|
684
|
+
res.setHeader('content-type', 'application/json')
|
|
685
|
+
res.end(JSON.stringify({ error: 'html is required' }))
|
|
686
|
+
return
|
|
687
|
+
}
|
|
688
|
+
state.deployments.set(slug, html)
|
|
689
|
+
res.statusCode = 200
|
|
690
|
+
res.setHeader('content-type', 'application/json')
|
|
691
|
+
res.end(JSON.stringify({ url: `${baseUrl}/deployed/${slug}` }))
|
|
692
|
+
return
|
|
693
|
+
}
|
|
694
|
+
|
|
695
|
+
if (req.method === 'GET' && pathname.startsWith('/deployed/')) {
|
|
696
|
+
const slug = pathname.slice('/deployed/'.length)
|
|
697
|
+
const html = state.deployments.get(slug)
|
|
698
|
+
if (!html) {
|
|
699
|
+
res.statusCode = 404
|
|
700
|
+
res.setHeader('content-type', 'text/plain; charset=utf-8')
|
|
701
|
+
res.end('deployment not found')
|
|
702
|
+
return
|
|
703
|
+
}
|
|
704
|
+
res.statusCode = 200
|
|
705
|
+
res.setHeader('content-type', 'text/html; charset=utf-8')
|
|
706
|
+
res.end(html)
|
|
707
|
+
return
|
|
708
|
+
}
|
|
709
|
+
|
|
710
|
+
return sendHtml(res, 404, 'Not Found', '<div class="card"><h1>Route not found</h1></div>')
|
|
711
|
+
})
|
|
712
|
+
|
|
713
|
+
return {
|
|
714
|
+
baseUrl,
|
|
715
|
+
state,
|
|
716
|
+
close: async () => closeServer(server),
|
|
717
|
+
}
|
|
718
|
+
}
|
|
719
|
+
|
|
720
|
+
function truncatePreview(text: string, max = 400): string {
|
|
721
|
+
const normalized = text.replace(/\r\n/g, '\n').trim()
|
|
722
|
+
if (normalized.length <= max) return normalized
|
|
723
|
+
return `${normalized.slice(0, Math.max(0, max - 3))}...`
|
|
724
|
+
}
|
|
725
|
+
|
|
726
|
+
function buildArtifactEvidence(ctx: ScenarioContext, relativePaths: string[]): RegressionArtifactEvidence[] {
|
|
727
|
+
return relativePaths.map((relativePath) => {
|
|
728
|
+
const absolutePath = scenarioFile(ctx, relativePath)
|
|
729
|
+
if (!fs.existsSync(absolutePath)) {
|
|
730
|
+
return {
|
|
731
|
+
relativePath,
|
|
732
|
+
exists: false,
|
|
733
|
+
size: 0,
|
|
734
|
+
}
|
|
735
|
+
}
|
|
736
|
+
const buffer = fs.readFileSync(absolutePath)
|
|
737
|
+
return {
|
|
738
|
+
relativePath,
|
|
739
|
+
exists: true,
|
|
740
|
+
size: buffer.byteLength,
|
|
741
|
+
sha256: createHash('sha256').update(buffer).digest('hex'),
|
|
742
|
+
preview: truncatePreview(buffer.toString('utf8')),
|
|
743
|
+
}
|
|
744
|
+
})
|
|
745
|
+
}
|
|
746
|
+
|
|
747
|
+
function collectWorkspaceFiles(rootDir: string): string[] {
|
|
748
|
+
if (!fs.existsSync(rootDir)) return []
|
|
749
|
+
const files: string[] = []
|
|
750
|
+
const visit = (dir: string) => {
|
|
751
|
+
for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {
|
|
752
|
+
const fullPath = path.join(dir, entry.name)
|
|
753
|
+
if (entry.isDirectory()) {
|
|
754
|
+
visit(fullPath)
|
|
755
|
+
continue
|
|
756
|
+
}
|
|
757
|
+
if (!entry.isFile()) continue
|
|
758
|
+
files.push(path.relative(rootDir, fullPath))
|
|
759
|
+
}
|
|
760
|
+
}
|
|
761
|
+
visit(rootDir)
|
|
762
|
+
return files.sort()
|
|
763
|
+
}
|
|
764
|
+
|
|
765
|
+
function writeScenarioEvidenceFiles(ctx: ScenarioContext): AgentRegressionScenarioResult['evidencePaths'] {
|
|
766
|
+
const transcriptPath = scenarioFile(ctx, '.agent-regression-transcript.json')
|
|
767
|
+
const approvalsPath = scenarioFile(ctx, '.agent-regression-approvals.json')
|
|
768
|
+
const workspacePath = scenarioFile(ctx, '.agent-regression-workspace.json')
|
|
769
|
+
const session = loadSessions()[ctx.sessionId]
|
|
770
|
+
|
|
771
|
+
fs.writeFileSync(transcriptPath, JSON.stringify(session?.messages || [], null, 2), 'utf8')
|
|
772
|
+
fs.writeFileSync(approvalsPath, JSON.stringify(listSessionApprovals(ctx.sessionId), null, 2), 'utf8')
|
|
773
|
+
fs.writeFileSync(workspacePath, JSON.stringify(collectWorkspaceFiles(ctx.workspaceDir), null, 2), 'utf8')
|
|
774
|
+
|
|
775
|
+
return {
|
|
776
|
+
transcript: transcriptPath,
|
|
777
|
+
approvals: approvalsPath,
|
|
778
|
+
workspace: workspacePath,
|
|
779
|
+
}
|
|
780
|
+
}
|
|
781
|
+
|
|
782
|
+
export function resolveRegressionApprovalSettings(mode: RegressionApprovalMode): Record<string, unknown> {
|
|
783
|
+
if (mode === 'off') {
|
|
784
|
+
return {
|
|
785
|
+
approvalsEnabled: false,
|
|
786
|
+
approvalAutoApproveCategories: [],
|
|
787
|
+
}
|
|
788
|
+
}
|
|
789
|
+
if (mode === 'auto') {
|
|
790
|
+
return {
|
|
791
|
+
approvalsEnabled: true,
|
|
792
|
+
approvalAutoApproveCategories: ['tool_access'],
|
|
793
|
+
}
|
|
794
|
+
}
|
|
795
|
+
return {
|
|
796
|
+
approvalsEnabled: true,
|
|
797
|
+
approvalAutoApproveCategories: [],
|
|
798
|
+
}
|
|
799
|
+
}
|
|
800
|
+
|
|
801
|
+
export function scoreAssertions(assertions: RegressionAssertion[]): { score: number; maxScore: number; status: 'passed' | 'failed' } {
|
|
802
|
+
let score = 0
|
|
803
|
+
let maxScore = 0
|
|
804
|
+
for (const assertion of assertions) {
|
|
805
|
+
const weight = assertion.weight ?? 1
|
|
806
|
+
maxScore += weight
|
|
807
|
+
if (assertion.passed) score += weight
|
|
808
|
+
}
|
|
809
|
+
return {
|
|
810
|
+
score,
|
|
811
|
+
maxScore,
|
|
812
|
+
status: score === maxScore ? 'passed' : 'failed',
|
|
813
|
+
}
|
|
814
|
+
}
|
|
815
|
+
|
|
816
|
+
function listSessionApprovals(sessionId: string): ApprovalRequest[] {
|
|
817
|
+
return Object.values(loadApprovals() as Record<string, ApprovalRequest>)
|
|
818
|
+
.filter((approval) => approval.sessionId === sessionId)
|
|
819
|
+
.sort((left, right) => left.createdAt - right.createdAt)
|
|
820
|
+
}
|
|
821
|
+
|
|
822
|
+
function buildApprovalEvidence(sessionId: string): RegressionApprovalEvidence[] {
|
|
823
|
+
return listSessionApprovals(sessionId).map((approval) => ({
|
|
824
|
+
id: approval.id,
|
|
825
|
+
category: approval.category,
|
|
826
|
+
status: approval.status,
|
|
827
|
+
title: approval.title,
|
|
828
|
+
toolId: typeof approval.data?.toolId === 'string'
|
|
829
|
+
? approval.data.toolId
|
|
830
|
+
: typeof approval.data?.pluginId === 'string'
|
|
831
|
+
? approval.data.pluginId
|
|
832
|
+
: null,
|
|
833
|
+
}))
|
|
834
|
+
}
|
|
835
|
+
|
|
836
|
+
function listSessionSecrets(sessionId: string): Array<Record<string, unknown>> {
|
|
837
|
+
return Object.values(loadSecrets() as Record<string, Record<string, unknown>>)
|
|
838
|
+
.filter((secret) => secret.createdInSessionId === sessionId)
|
|
839
|
+
}
|
|
840
|
+
|
|
841
|
+
function parseJsonRecord(raw: string | undefined): Record<string, unknown> | null {
|
|
842
|
+
if (!raw || !raw.trim()) return null
|
|
843
|
+
try {
|
|
844
|
+
const parsed = JSON.parse(raw)
|
|
845
|
+
return parsed && typeof parsed === 'object' && !Array.isArray(parsed)
|
|
846
|
+
? parsed as Record<string, unknown>
|
|
847
|
+
: null
|
|
848
|
+
} catch {
|
|
849
|
+
return null
|
|
850
|
+
}
|
|
851
|
+
}
|
|
852
|
+
|
|
853
|
+
function findToolEvents(ctx: ScenarioContext, toolName: string): RegressionToolEventEvidence[] {
|
|
854
|
+
return ctx.turns.flatMap((turn) => turn.toolEvents.filter((event) => event.name === toolName))
|
|
855
|
+
}
|
|
856
|
+
|
|
857
|
+
function cleanupScenarioState(ctx: ScenarioContext): void {
|
|
858
|
+
for (const approval of listSessionApprovals(ctx.sessionId)) {
|
|
859
|
+
deleteApproval(approval.id)
|
|
860
|
+
}
|
|
861
|
+
|
|
862
|
+
const watchJobs = loadWatchJobs() as Record<string, Record<string, unknown>>
|
|
863
|
+
for (const [watchJobId, watchJob] of Object.entries(watchJobs)) {
|
|
864
|
+
if (watchJob?.sessionId === ctx.sessionId) deleteWatchJob(watchJobId)
|
|
865
|
+
}
|
|
866
|
+
|
|
867
|
+
const delegationJobs = loadDelegationJobs() as Record<string, Record<string, unknown>>
|
|
868
|
+
for (const [jobId, job] of Object.entries(delegationJobs)) {
|
|
869
|
+
if (job?.parentSessionId === ctx.sessionId || job?.childSessionId === ctx.sessionId) {
|
|
870
|
+
deleteDelegationJob(jobId)
|
|
871
|
+
}
|
|
872
|
+
}
|
|
873
|
+
|
|
874
|
+
const secrets = loadSecrets() as Record<string, Record<string, unknown>>
|
|
875
|
+
let secretsChanged = false
|
|
876
|
+
for (const [secretId, secret] of Object.entries(secrets)) {
|
|
877
|
+
if (secret?.createdInSessionId !== ctx.sessionId) continue
|
|
878
|
+
delete secrets[secretId]
|
|
879
|
+
secretsChanged = true
|
|
880
|
+
}
|
|
881
|
+
if (secretsChanged) saveSecrets(secrets)
|
|
882
|
+
|
|
883
|
+
const schedules = loadSchedules() as Record<string, Record<string, unknown>>
|
|
884
|
+
let schedulesChanged = false
|
|
885
|
+
for (const [scheduleId, schedule] of Object.entries(schedules)) {
|
|
886
|
+
if (schedule?.createdInSessionId !== ctx.sessionId) continue
|
|
887
|
+
delete schedules[scheduleId]
|
|
888
|
+
schedulesChanged = true
|
|
889
|
+
}
|
|
890
|
+
if (schedulesChanged) saveSchedules(schedules)
|
|
891
|
+
|
|
892
|
+
const tasks = loadTasks() as Record<string, Record<string, unknown>>
|
|
893
|
+
let tasksChanged = false
|
|
894
|
+
for (const [taskId, task] of Object.entries(tasks)) {
|
|
895
|
+
if (task?.createdInSessionId !== ctx.sessionId) continue
|
|
896
|
+
delete tasks[taskId]
|
|
897
|
+
tasksChanged = true
|
|
898
|
+
}
|
|
899
|
+
if (tasksChanged) saveTasks(tasks)
|
|
900
|
+
|
|
901
|
+
deleteBrowserSession(ctx.sessionId)
|
|
902
|
+
}
|
|
903
|
+
|
|
904
|
+
function buildRegressionSession(params: {
|
|
905
|
+
agent: Record<string, unknown>
|
|
906
|
+
sessionId: string
|
|
907
|
+
cwd: string
|
|
908
|
+
plugins: string[]
|
|
909
|
+
}): Session {
|
|
910
|
+
const now = Date.now()
|
|
911
|
+
return {
|
|
912
|
+
id: params.sessionId,
|
|
913
|
+
name: `Agent Regression ${params.sessionId}`,
|
|
914
|
+
cwd: params.cwd,
|
|
915
|
+
user: 'eval-runner',
|
|
916
|
+
provider: (params.agent.provider as Session['provider']) ?? 'openai',
|
|
917
|
+
model: (params.agent.model as string) ?? '',
|
|
918
|
+
credentialId: (params.agent.credentialId as string | null) ?? null,
|
|
919
|
+
fallbackCredentialIds: Array.isArray(params.agent.fallbackCredentialIds)
|
|
920
|
+
? params.agent.fallbackCredentialIds as string[]
|
|
921
|
+
: undefined,
|
|
922
|
+
apiEndpoint: (params.agent.apiEndpoint as string | null) ?? null,
|
|
923
|
+
claudeSessionId: null,
|
|
924
|
+
codexThreadId: null,
|
|
925
|
+
opencodeSessionId: null,
|
|
926
|
+
delegateResumeIds: { claudeCode: null, codex: null, opencode: null, gemini: null },
|
|
927
|
+
messages: [],
|
|
928
|
+
createdAt: now,
|
|
929
|
+
lastActiveAt: now,
|
|
930
|
+
sessionType: 'human',
|
|
931
|
+
agentId: params.agent.id as string,
|
|
932
|
+
plugins: [...params.plugins],
|
|
933
|
+
tools: [...params.plugins],
|
|
934
|
+
}
|
|
935
|
+
}
|
|
936
|
+
|
|
937
|
+
async function runTurn(ctx: ScenarioContext, message: string): Promise<ExecuteChatTurnResult> {
|
|
938
|
+
const result = await executeSessionChatTurn({
|
|
939
|
+
sessionId: ctx.sessionId,
|
|
940
|
+
message,
|
|
941
|
+
internal: true,
|
|
942
|
+
source: 'eval',
|
|
943
|
+
})
|
|
944
|
+
ctx.responseTexts.push(result.text)
|
|
945
|
+
for (const event of result.toolEvents || []) {
|
|
946
|
+
ctx.toolEvents.push(event)
|
|
947
|
+
ctx.toolNames.add(event.name)
|
|
948
|
+
}
|
|
949
|
+
ctx.turns.push({
|
|
950
|
+
prompt: message,
|
|
951
|
+
responseText: result.text,
|
|
952
|
+
toolEvents: (result.toolEvents || []).map((event) => ({
|
|
953
|
+
name: event.name,
|
|
954
|
+
input: event.input,
|
|
955
|
+
output: event.output,
|
|
956
|
+
error: event.error,
|
|
957
|
+
})),
|
|
958
|
+
approvalIds: listSessionApprovals(ctx.sessionId).map((approval) => approval.id),
|
|
959
|
+
})
|
|
960
|
+
return result
|
|
961
|
+
}
|
|
962
|
+
|
|
963
|
+
function readIfExists(filePath: string): string {
|
|
964
|
+
return fs.existsSync(filePath) ? fs.readFileSync(filePath, 'utf8') : ''
|
|
965
|
+
}
|
|
966
|
+
|
|
967
|
+
function scenarioFile(ctx: ScenarioContext, relativePath: string): string {
|
|
968
|
+
return path.join(ctx.workspaceDir, relativePath)
|
|
969
|
+
}
|
|
970
|
+
|
|
971
|
+
async function runApprovalResumeScenario(ctx: ScenarioContext): Promise<AgentRegressionScenarioResult> {
|
|
972
|
+
const proofRelativePath = 'approval-mode/proof.txt'
|
|
973
|
+
const proofPath = scenarioFile(ctx, proofRelativePath)
|
|
974
|
+
const prompt = [
|
|
975
|
+
'You must use the shell tool for this task.',
|
|
976
|
+
'If shell is not currently available, request access to "shell" using manage_capabilities with action "request_access".',
|
|
977
|
+
`Once shell access is available, run \`pwd\` and create \`${proofRelativePath}\` containing exactly two lines:`,
|
|
978
|
+
`MODE=${ctx.approvalMode}`,
|
|
979
|
+
'PWD=<the pwd output>',
|
|
980
|
+
`Your final answer must include the exact literal token \`sandbox:/workspace/${proofRelativePath}\`.`,
|
|
981
|
+
'Do not use the files tool or delegation to create the proof file.',
|
|
982
|
+
].join('\n')
|
|
983
|
+
const resumePrompt = [
|
|
984
|
+
'Continue the original shell task now.',
|
|
985
|
+
'Use the shell tool to run `mkdir -p approval-mode && printf "MODE=' + ctx.approvalMode + '\\nPWD=$(pwd)\\n" > approval-mode/proof.txt`.',
|
|
986
|
+
`Then reply with the exact literal token \`sandbox:/workspace/${proofRelativePath}\`.`,
|
|
987
|
+
].join('\n')
|
|
988
|
+
|
|
989
|
+
await runTurn(ctx, prompt)
|
|
990
|
+
const approvalsAfterFirstTurn = listSessionApprovals(ctx.sessionId)
|
|
991
|
+
const shellApprovals = approvalsAfterFirstTurn.filter((approval) => (
|
|
992
|
+
approval.category === 'tool_access'
|
|
993
|
+
&& String(approval.data?.toolId || approval.data?.pluginId || '').trim() === 'shell'
|
|
994
|
+
))
|
|
995
|
+
|
|
996
|
+
if (ctx.approvalMode === 'manual') {
|
|
997
|
+
for (const approval of shellApprovals.filter((approval) => approval.status === 'pending')) {
|
|
998
|
+
await submitDecision(approval.id, true)
|
|
999
|
+
}
|
|
1000
|
+
}
|
|
1001
|
+
|
|
1002
|
+
for (let attempt = 0; attempt < 2 && !fs.existsSync(proofPath); attempt += 1) {
|
|
1003
|
+
const session = loadSessions()[ctx.sessionId]
|
|
1004
|
+
const hasShell = Array.isArray(session?.plugins) && session.plugins.includes('shell')
|
|
1005
|
+
if (!hasShell) break
|
|
1006
|
+
await runTurn(ctx, attempt === 0 ? resumePrompt : `${resumePrompt}\nKeep going until the proof file exists.`)
|
|
1007
|
+
}
|
|
1008
|
+
|
|
1009
|
+
const proofText = readIfExists(proofPath)
|
|
1010
|
+
const assertions: RegressionAssertion[] = [
|
|
1011
|
+
{
|
|
1012
|
+
name: 'shell approval requested or shell used',
|
|
1013
|
+
passed: shellApprovals.length > 0 || ctx.toolNames.has('shell'),
|
|
1014
|
+
details: shellApprovals.length ? `approvals=${shellApprovals.length}` : 'no shell approval found',
|
|
1015
|
+
},
|
|
1016
|
+
{
|
|
1017
|
+
name: 'manual mode produced a pending approval before resume',
|
|
1018
|
+
passed: ctx.approvalMode !== 'manual' || shellApprovals.some((approval) => approval.status === 'approved' || approval.status === 'pending'),
|
|
1019
|
+
details: ctx.approvalMode === 'manual' ? `statuses=${shellApprovals.map((approval) => approval.status).join(',') || 'none'}` : 'not applicable',
|
|
1020
|
+
},
|
|
1021
|
+
{
|
|
1022
|
+
name: 'shell tool used',
|
|
1023
|
+
passed: ctx.toolNames.has('shell'),
|
|
1024
|
+
},
|
|
1025
|
+
{
|
|
1026
|
+
name: 'proof file exists',
|
|
1027
|
+
passed: fs.existsSync(proofPath),
|
|
1028
|
+
details: proofPath,
|
|
1029
|
+
weight: 2,
|
|
1030
|
+
},
|
|
1031
|
+
{
|
|
1032
|
+
name: 'proof file contains approval mode marker',
|
|
1033
|
+
passed: proofText.includes(`MODE=${ctx.approvalMode}`),
|
|
1034
|
+
},
|
|
1035
|
+
{
|
|
1036
|
+
name: 'final response preserved literal sandbox token',
|
|
1037
|
+
passed: ctx.responseTexts.some((text) => text.includes(`sandbox:/workspace/${proofRelativePath}`)),
|
|
1038
|
+
},
|
|
1039
|
+
]
|
|
1040
|
+
const scored = scoreAssertions(assertions)
|
|
1041
|
+
return {
|
|
1042
|
+
scenarioId: 'approval-resume',
|
|
1043
|
+
name: 'Approval Resume',
|
|
1044
|
+
approvalMode: ctx.approvalMode,
|
|
1045
|
+
...scored,
|
|
1046
|
+
assertions,
|
|
1047
|
+
sessionId: ctx.sessionId,
|
|
1048
|
+
workspaceDir: ctx.workspaceDir,
|
|
1049
|
+
toolNames: Array.from(ctx.toolNames),
|
|
1050
|
+
approvalIds: shellApprovals.map((approval) => approval.id),
|
|
1051
|
+
approvals: buildApprovalEvidence(ctx.sessionId),
|
|
1052
|
+
responseTexts: [...ctx.responseTexts],
|
|
1053
|
+
turns: [...ctx.turns],
|
|
1054
|
+
artifacts: buildArtifactEvidence(ctx, [proofRelativePath]),
|
|
1055
|
+
evidencePaths: writeScenarioEvidenceFiles(ctx),
|
|
1056
|
+
}
|
|
1057
|
+
}
|
|
1058
|
+
|
|
1059
|
+
async function runDelegateLiteralScenario(ctx: ScenarioContext): Promise<AgentRegressionScenarioResult> {
|
|
1060
|
+
const targetRelativePath = 'notes/live-verification.md'
|
|
1061
|
+
const targetPath = scenarioFile(ctx, targetRelativePath)
|
|
1062
|
+
const prompt = [
|
|
1063
|
+
'Use delegation for this task.',
|
|
1064
|
+
`Create \`${targetRelativePath}\` with exactly these two lines:`,
|
|
1065
|
+
'alpha',
|
|
1066
|
+
'beta',
|
|
1067
|
+
`Your final answer must include the exact literal token \`sandbox:/workspace/${targetRelativePath}\`.`,
|
|
1068
|
+
'Do not replace that token with a served URL.',
|
|
1069
|
+
].join('\n')
|
|
1070
|
+
|
|
1071
|
+
await runTurn(ctx, prompt)
|
|
1072
|
+
if (!fs.existsSync(targetPath)) {
|
|
1073
|
+
await runTurn(ctx, 'Continue and finish the delegated task exactly as requested.')
|
|
1074
|
+
}
|
|
1075
|
+
|
|
1076
|
+
const contents = readIfExists(targetPath).trim().split('\n').filter(Boolean)
|
|
1077
|
+
const assertions: RegressionAssertion[] = [
|
|
1078
|
+
{
|
|
1079
|
+
name: 'delegate backend used',
|
|
1080
|
+
passed: Array.from(ctx.toolNames).some((name) => name === 'delegate' || name.startsWith('delegate_to_')),
|
|
1081
|
+
weight: 2,
|
|
1082
|
+
},
|
|
1083
|
+
{
|
|
1084
|
+
name: 'delegated file exists',
|
|
1085
|
+
passed: fs.existsSync(targetPath),
|
|
1086
|
+
details: targetPath,
|
|
1087
|
+
weight: 2,
|
|
1088
|
+
},
|
|
1089
|
+
{
|
|
1090
|
+
name: 'delegated file has exactly two lines',
|
|
1091
|
+
passed: contents.length === 2 && contents[0] === 'alpha' && contents[1] === 'beta',
|
|
1092
|
+
details: contents.join(' | '),
|
|
1093
|
+
},
|
|
1094
|
+
{
|
|
1095
|
+
name: 'literal sandbox token preserved',
|
|
1096
|
+
passed: ctx.responseTexts.some((text) => text.includes(`sandbox:/workspace/${targetRelativePath}`)),
|
|
1097
|
+
weight: 2,
|
|
1098
|
+
},
|
|
1099
|
+
]
|
|
1100
|
+
const scored = scoreAssertions(assertions)
|
|
1101
|
+
return {
|
|
1102
|
+
scenarioId: 'delegate-literal-artifact',
|
|
1103
|
+
name: 'Delegate Literal Artifact',
|
|
1104
|
+
approvalMode: ctx.approvalMode,
|
|
1105
|
+
...scored,
|
|
1106
|
+
assertions,
|
|
1107
|
+
sessionId: ctx.sessionId,
|
|
1108
|
+
workspaceDir: ctx.workspaceDir,
|
|
1109
|
+
toolNames: Array.from(ctx.toolNames),
|
|
1110
|
+
approvalIds: [],
|
|
1111
|
+
approvals: buildApprovalEvidence(ctx.sessionId),
|
|
1112
|
+
responseTexts: [...ctx.responseTexts],
|
|
1113
|
+
turns: [...ctx.turns],
|
|
1114
|
+
artifacts: buildArtifactEvidence(ctx, [targetRelativePath]),
|
|
1115
|
+
evidencePaths: writeScenarioEvidenceFiles(ctx),
|
|
1116
|
+
}
|
|
1117
|
+
}
|
|
1118
|
+
|
|
1119
|
+
async function runScheduleScenario(ctx: ScenarioContext): Promise<AgentRegressionScenarioResult> {
|
|
1120
|
+
const scriptRelativePath = 'weather_workspace/weather_fetch.py'
|
|
1121
|
+
ensureDir(path.dirname(scenarioFile(ctx, scriptRelativePath)))
|
|
1122
|
+
fs.writeFileSync(scenarioFile(ctx, scriptRelativePath), 'print("weather ok")\n', 'utf8')
|
|
1123
|
+
|
|
1124
|
+
const prompt = [
|
|
1125
|
+
'Create a schedule with manage_schedules.',
|
|
1126
|
+
'Use name "Daily Weather Update".',
|
|
1127
|
+
'Use scheduleType "interval" and intervalMs 86400000.',
|
|
1128
|
+
'Use action "run_script" and path "weather_workspace/weather_fetch.py".',
|
|
1129
|
+
'Do not switch to command mode and do not invent another path.',
|
|
1130
|
+
'Confirm the created schedule id.',
|
|
1131
|
+
].join('\n')
|
|
1132
|
+
|
|
1133
|
+
await runTurn(ctx, prompt)
|
|
1134
|
+
const schedules = Object.values(loadSchedules() as Record<string, Record<string, unknown>>)
|
|
1135
|
+
.filter((schedule) => schedule.createdInSessionId === ctx.sessionId)
|
|
1136
|
+
.sort((left, right) => Number(right.createdAt || 0) - Number(left.createdAt || 0))
|
|
1137
|
+
const schedule = schedules[0] || null
|
|
1138
|
+
const assertions: RegressionAssertion[] = [
|
|
1139
|
+
{
|
|
1140
|
+
name: 'manage_schedules tool used',
|
|
1141
|
+
passed: ctx.toolNames.has('manage_schedules'),
|
|
1142
|
+
weight: 2,
|
|
1143
|
+
},
|
|
1144
|
+
{
|
|
1145
|
+
name: 'schedule created',
|
|
1146
|
+
passed: !!schedule,
|
|
1147
|
+
weight: 2,
|
|
1148
|
+
},
|
|
1149
|
+
{
|
|
1150
|
+
name: 'schedule assigned to the current agent',
|
|
1151
|
+
passed: String(schedule?.agentId || '') === ctx.agentId,
|
|
1152
|
+
details: String(schedule?.agentId || ''),
|
|
1153
|
+
},
|
|
1154
|
+
{
|
|
1155
|
+
name: 'schedule kept the exact script path',
|
|
1156
|
+
passed: String(schedule?.path || '') === scriptRelativePath,
|
|
1157
|
+
details: String(schedule?.path || ''),
|
|
1158
|
+
},
|
|
1159
|
+
{
|
|
1160
|
+
name: 'schedule taskPrompt is populated from the script path',
|
|
1161
|
+
passed: String(schedule?.taskPrompt || '').includes(scriptRelativePath),
|
|
1162
|
+
details: String(schedule?.taskPrompt || ''),
|
|
1163
|
+
},
|
|
1164
|
+
]
|
|
1165
|
+
const scored = scoreAssertions(assertions)
|
|
1166
|
+
return {
|
|
1167
|
+
scenarioId: 'schedule-script',
|
|
1168
|
+
name: 'Schedule Script Workflow',
|
|
1169
|
+
approvalMode: ctx.approvalMode,
|
|
1170
|
+
...scored,
|
|
1171
|
+
assertions,
|
|
1172
|
+
sessionId: ctx.sessionId,
|
|
1173
|
+
workspaceDir: ctx.workspaceDir,
|
|
1174
|
+
toolNames: Array.from(ctx.toolNames),
|
|
1175
|
+
approvalIds: [],
|
|
1176
|
+
approvals: buildApprovalEvidence(ctx.sessionId),
|
|
1177
|
+
responseTexts: [...ctx.responseTexts],
|
|
1178
|
+
turns: [...ctx.turns],
|
|
1179
|
+
artifacts: buildArtifactEvidence(ctx, [scriptRelativePath]),
|
|
1180
|
+
evidencePaths: writeScenarioEvidenceFiles(ctx),
|
|
1181
|
+
}
|
|
1182
|
+
}
|
|
1183
|
+
|
|
1184
|
+
async function runOpenEndedIterationScenario(ctx: ScenarioContext): Promise<AgentRegressionScenarioResult> {
|
|
1185
|
+
const outputDir = scenarioFile(ctx, 'offer-pack')
|
|
1186
|
+
ensureDir(outputDir)
|
|
1187
|
+
const fileNames = ['offer-brief.md', 'landing-copy.md', 'outreach-draft.md', 'iteration-notes.md']
|
|
1188
|
+
|
|
1189
|
+
await runTurn(ctx, [
|
|
1190
|
+
'Create an offer package in offer-pack/.',
|
|
1191
|
+
'Write offer-brief.md, landing-copy.md, outreach-draft.md, and iteration-notes.md.',
|
|
1192
|
+
'The theme is an AI security consulting offer for mid-market software teams.',
|
|
1193
|
+
'Do the work, not just a plan.',
|
|
1194
|
+
'iteration-notes.md must include a heading "Iteration 1" with self-critique.',
|
|
1195
|
+
].join('\n'))
|
|
1196
|
+
|
|
1197
|
+
const deliverablePaths = fileNames.map((name) => scenarioFile(ctx, `offer-pack/${name}`))
|
|
1198
|
+
const beforeRevision = new Map(
|
|
1199
|
+
deliverablePaths
|
|
1200
|
+
.filter((filePath) => fs.existsSync(filePath))
|
|
1201
|
+
.map((filePath) => [filePath, fs.readFileSync(filePath, 'utf8')] as const),
|
|
1202
|
+
)
|
|
1203
|
+
|
|
1204
|
+
await runTurn(ctx, [
|
|
1205
|
+
'Continue the same workspace.',
|
|
1206
|
+
'Revise at least one of the three deliverables based on your own critique.',
|
|
1207
|
+
'Append a second heading "Iteration 2" to offer-pack/iteration-notes.md describing the revision you made.',
|
|
1208
|
+
].join('\n'))
|
|
1209
|
+
|
|
1210
|
+
const changedDeliverable = deliverablePaths
|
|
1211
|
+
.filter((filePath) => path.basename(filePath) !== 'iteration-notes.md')
|
|
1212
|
+
.some((filePath) => beforeRevision.has(filePath) && readIfExists(filePath) !== beforeRevision.get(filePath))
|
|
1213
|
+
const iterationNotes = readIfExists(scenarioFile(ctx, 'offer-pack/iteration-notes.md'))
|
|
1214
|
+
const assertions: RegressionAssertion[] = [
|
|
1215
|
+
{
|
|
1216
|
+
name: 'files tool used',
|
|
1217
|
+
passed: ctx.toolNames.has('files'),
|
|
1218
|
+
weight: 2,
|
|
1219
|
+
},
|
|
1220
|
+
{
|
|
1221
|
+
name: 'all open-ended deliverables exist',
|
|
1222
|
+
passed: deliverablePaths.every((filePath) => fs.existsSync(filePath)),
|
|
1223
|
+
details: deliverablePaths.filter((filePath) => !fs.existsSync(filePath)).join(', ') || 'all present',
|
|
1224
|
+
weight: 2,
|
|
1225
|
+
},
|
|
1226
|
+
{
|
|
1227
|
+
name: 'iteration notes include a second pass',
|
|
1228
|
+
passed: iterationNotes.includes('Iteration 1') && iterationNotes.includes('Iteration 2'),
|
|
1229
|
+
},
|
|
1230
|
+
{
|
|
1231
|
+
name: 'a deliverable changed on the second turn',
|
|
1232
|
+
passed: changedDeliverable,
|
|
1233
|
+
},
|
|
1234
|
+
]
|
|
1235
|
+
const scored = scoreAssertions(assertions)
|
|
1236
|
+
return {
|
|
1237
|
+
scenarioId: 'open-ended-iteration',
|
|
1238
|
+
name: 'Open-Ended Iteration Pack',
|
|
1239
|
+
approvalMode: ctx.approvalMode,
|
|
1240
|
+
...scored,
|
|
1241
|
+
assertions,
|
|
1242
|
+
sessionId: ctx.sessionId,
|
|
1243
|
+
workspaceDir: ctx.workspaceDir,
|
|
1244
|
+
toolNames: Array.from(ctx.toolNames),
|
|
1245
|
+
approvalIds: [],
|
|
1246
|
+
approvals: buildApprovalEvidence(ctx.sessionId),
|
|
1247
|
+
responseTexts: [...ctx.responseTexts],
|
|
1248
|
+
turns: [...ctx.turns],
|
|
1249
|
+
artifacts: buildArtifactEvidence(ctx, fileNames.map((name) => `offer-pack/${name}`)),
|
|
1250
|
+
evidencePaths: writeScenarioEvidenceFiles(ctx),
|
|
1251
|
+
}
|
|
1252
|
+
}
|
|
1253
|
+
|
|
1254
|
+
async function runMockSignupSecretEmailScenario(ctx: ScenarioContext): Promise<AgentRegressionScenarioResult> {
|
|
1255
|
+
const signupHarness = await startMockSignupHarness()
|
|
1256
|
+
const smtpHarness = await startMockSmtpHarness()
|
|
1257
|
+
const emailAddress = 'waydemlyle@gmail.com'
|
|
1258
|
+
const settingsSnapshot = loadSettings()
|
|
1259
|
+
|
|
1260
|
+
try {
|
|
1261
|
+
getPluginManager().setPluginSettings('email', {
|
|
1262
|
+
host: '127.0.0.1',
|
|
1263
|
+
port: smtpHarness.port,
|
|
1264
|
+
secure: false,
|
|
1265
|
+
username: '',
|
|
1266
|
+
password: 'mock-smtp-password',
|
|
1267
|
+
fromAddress: 'noreply@mockmail.local',
|
|
1268
|
+
fromName: 'MockMail Harness',
|
|
1269
|
+
})
|
|
1270
|
+
|
|
1271
|
+
const prompt = [
|
|
1272
|
+
'Complete a full mock onboarding flow.',
|
|
1273
|
+
`1. Use the browser tool to create a MockMail account at ${signupHarness.baseUrl}/mail/signup?email=${encodeURIComponent(emailAddress)}.`,
|
|
1274
|
+
'Use the exact literal URLs from this prompt. Do not substitute placeholders like `MockMailPage_URL` or `[Your URL]`.',
|
|
1275
|
+
'2. Read the MockMail dashboard and capture the generated app password plus the social invite code.',
|
|
1276
|
+
`3. Use the browser tool to create a Chirper account at ${signupHarness.baseUrl}/social/signup using the invite code from the MockMail dashboard. The social signup page already provides safe default values, so completing it as-is is acceptable.`,
|
|
1277
|
+
'4. Store the MockMail app password with manage_secrets as an agent-scoped secret named "MockMail App Password" and service "mockmail".',
|
|
1278
|
+
`5. Use the email tool to send a welcome email to "${emailAddress}" with subject "Mock signup complete". The email body must mention the Chirper handle you created but must not reveal the app password.`,
|
|
1279
|
+
'Do not echo the raw app password or any secret value in your final answer.',
|
|
1280
|
+
'In your final answer, report the Chirper handle and the secret id only.',
|
|
1281
|
+
].join('\n')
|
|
1282
|
+
|
|
1283
|
+
await runTurn(ctx, prompt)
|
|
1284
|
+
|
|
1285
|
+
for (let attempt = 0; attempt < 2; attempt += 1) {
|
|
1286
|
+
const secret = listSessionSecrets(ctx.sessionId).find((entry) => entry.service === 'mockmail')
|
|
1287
|
+
const social = Array.from(signupHarness.state.socialAccounts.values())[0]
|
|
1288
|
+
const sent = smtpHarness.messages.some((message) => message.recipients.includes(emailAddress))
|
|
1289
|
+
if (secret && social && sent) break
|
|
1290
|
+
await runTurn(ctx, 'Continue until the MockMail secret, Chirper account, and welcome email are all finished.')
|
|
1291
|
+
}
|
|
1292
|
+
|
|
1293
|
+
const mailAccount = signupHarness.state.mailAccounts.get(emailAddress) || null
|
|
1294
|
+
const socialAccount = Array.from(signupHarness.state.socialAccounts.values())[0] || null
|
|
1295
|
+
const createdSecret = listSessionSecrets(ctx.sessionId).find((entry) => entry.service === 'mockmail') || null
|
|
1296
|
+
const decryptedSecret = typeof createdSecret?.encryptedValue === 'string'
|
|
1297
|
+
? decryptKey(createdSecret.encryptedValue)
|
|
1298
|
+
: ''
|
|
1299
|
+
const sentMessage = smtpHarness.messages.find((message) => message.recipients.includes(emailAddress)) || null
|
|
1300
|
+
const responseBlob = ctx.responseTexts.join('\n')
|
|
1301
|
+
const assertions: RegressionAssertion[] = [
|
|
1302
|
+
{
|
|
1303
|
+
name: 'browser tool used for signup flow',
|
|
1304
|
+
passed: ctx.toolNames.has('browser'),
|
|
1305
|
+
weight: 2,
|
|
1306
|
+
},
|
|
1307
|
+
{
|
|
1308
|
+
name: 'manage_secrets used for credential storage',
|
|
1309
|
+
passed: ctx.toolNames.has('manage_secrets'),
|
|
1310
|
+
weight: 2,
|
|
1311
|
+
},
|
|
1312
|
+
{
|
|
1313
|
+
name: 'email tool used for outbound message',
|
|
1314
|
+
passed: ctx.toolNames.has('email'),
|
|
1315
|
+
weight: 2,
|
|
1316
|
+
},
|
|
1317
|
+
{
|
|
1318
|
+
name: 'mock mail account created',
|
|
1319
|
+
passed: !!mailAccount,
|
|
1320
|
+
details: mailAccount?.email || 'not created',
|
|
1321
|
+
},
|
|
1322
|
+
{
|
|
1323
|
+
name: 'social account created',
|
|
1324
|
+
passed: !!socialAccount,
|
|
1325
|
+
details: socialAccount?.handle || 'not created',
|
|
1326
|
+
weight: 2,
|
|
1327
|
+
},
|
|
1328
|
+
{
|
|
1329
|
+
name: 'agent-scoped secret stored with exact app password',
|
|
1330
|
+
passed: !!createdSecret
|
|
1331
|
+
&& createdSecret.scope === 'agent'
|
|
1332
|
+
&& Array.isArray(createdSecret.agentIds)
|
|
1333
|
+
&& createdSecret.agentIds.includes(ctx.agentId)
|
|
1334
|
+
&& decryptedSecret === (mailAccount?.appPassword || ''),
|
|
1335
|
+
details: createdSecret ? `${String(createdSecret.id)}:${String(createdSecret.scope)}` : 'no secret',
|
|
1336
|
+
weight: 3,
|
|
1337
|
+
},
|
|
1338
|
+
{
|
|
1339
|
+
name: 'welcome email captured by mock smtp',
|
|
1340
|
+
passed: !!sentMessage
|
|
1341
|
+
&& sentMessage.data.includes('Subject: Mock signup complete')
|
|
1342
|
+
&& (!!socialAccount?.handle ? sentMessage.data.includes(socialAccount.handle) : false),
|
|
1343
|
+
details: sentMessage ? truncatePreview(sentMessage.data) : 'no smtp message',
|
|
1344
|
+
weight: 3,
|
|
1345
|
+
},
|
|
1346
|
+
{
|
|
1347
|
+
name: 'final response does not leak the app password',
|
|
1348
|
+
passed: !mailAccount || !responseBlob.includes(mailAccount.appPassword),
|
|
1349
|
+
weight: 2,
|
|
1350
|
+
},
|
|
1351
|
+
]
|
|
1352
|
+
const scored = scoreAssertions(assertions)
|
|
1353
|
+
return {
|
|
1354
|
+
scenarioId: 'mock-signup-secret-email',
|
|
1355
|
+
name: 'Mock Signup Secret Email',
|
|
1356
|
+
approvalMode: ctx.approvalMode,
|
|
1357
|
+
...scored,
|
|
1358
|
+
assertions,
|
|
1359
|
+
sessionId: ctx.sessionId,
|
|
1360
|
+
workspaceDir: ctx.workspaceDir,
|
|
1361
|
+
toolNames: Array.from(ctx.toolNames),
|
|
1362
|
+
approvalIds: [],
|
|
1363
|
+
approvals: buildApprovalEvidence(ctx.sessionId),
|
|
1364
|
+
responseTexts: [...ctx.responseTexts],
|
|
1365
|
+
turns: [...ctx.turns],
|
|
1366
|
+
artifacts: [],
|
|
1367
|
+
evidencePaths: writeScenarioEvidenceFiles(ctx),
|
|
1368
|
+
}
|
|
1369
|
+
} finally {
|
|
1370
|
+
saveSettings(settingsSnapshot)
|
|
1371
|
+
await signupHarness.close()
|
|
1372
|
+
await smtpHarness.close()
|
|
1373
|
+
}
|
|
1374
|
+
}
|
|
1375
|
+
|
|
1376
|
+
async function runHumanVerifiedSignupScenario(ctx: ScenarioContext): Promise<AgentRegressionScenarioResult> {
|
|
1377
|
+
const signupHarness = await startMockSignupHarness()
|
|
1378
|
+
const emailAddress = 'waydemlyle@gmail.com'
|
|
1379
|
+
const handle = 'wayde-verified'
|
|
1380
|
+
|
|
1381
|
+
try {
|
|
1382
|
+
const prompt = [
|
|
1383
|
+
'Complete a verified browser signup flow.',
|
|
1384
|
+
`Use the browser tool to start at ${signupHarness.baseUrl}/verify-social/signup?email=${encodeURIComponent(emailAddress)}&handle=${encodeURIComponent(handle)}.`,
|
|
1385
|
+
'Use the exact literal URLs from this prompt. Do not invent placeholders like `[Your Verification Page URL]`.',
|
|
1386
|
+
'When the site asks for a verification code, use ask_human with action "request_input" to ask for the code instead of guessing.',
|
|
1387
|
+
'After requesting input, set up a durable wait if appropriate, then continue once the human reply arrives.',
|
|
1388
|
+
'When verification succeeds, store the recovery token shown on the success page with manage_secrets as an agent-scoped secret named "Chirper Recovery Token" and service "chirper".',
|
|
1389
|
+
'Do not echo the raw recovery token in your final answer.',
|
|
1390
|
+
'In the final answer, report the handle and the secret id only.',
|
|
1391
|
+
].join('\n')
|
|
1392
|
+
|
|
1393
|
+
await runTurn(ctx, prompt)
|
|
1394
|
+
|
|
1395
|
+
const mailboxRequest = listMailbox(ctx.sessionId, { includeAcked: true })
|
|
1396
|
+
.find((entry) => entry.type === 'human_request')
|
|
1397
|
+
|
|
1398
|
+
if (mailboxRequest) {
|
|
1399
|
+
sendMailboxEnvelope({
|
|
1400
|
+
toSessionId: ctx.sessionId,
|
|
1401
|
+
type: 'human_reply',
|
|
1402
|
+
correlationId: mailboxRequest.correlationId || null,
|
|
1403
|
+
payload: '246810',
|
|
1404
|
+
fromSessionId: 'eval-human',
|
|
1405
|
+
fromAgentId: 'eval-runner',
|
|
1406
|
+
})
|
|
1407
|
+
await processDueWatchJobs(Date.now())
|
|
1408
|
+
await runTurn(ctx, 'A human reply is now available in your mailbox. Read it and finish the verification flow.')
|
|
1409
|
+
}
|
|
1410
|
+
|
|
1411
|
+
for (let attempt = 0; attempt < 2; attempt += 1) {
|
|
1412
|
+
const verifiedSignup = Array.from(signupHarness.state.pendingVerifiedSignups.values())
|
|
1413
|
+
.find((entry) => entry.handle === handle && entry.verified)
|
|
1414
|
+
const recoverySecret = listSessionSecrets(ctx.sessionId).find((entry) => entry.service === 'chirper')
|
|
1415
|
+
if (verifiedSignup && recoverySecret) break
|
|
1416
|
+
await runTurn(ctx, 'Continue until the verified account exists and the recovery token is stored.')
|
|
1417
|
+
}
|
|
1418
|
+
|
|
1419
|
+
const verifiedSignup = Array.from(signupHarness.state.pendingVerifiedSignups.values())
|
|
1420
|
+
.find((entry) => entry.handle === handle) || null
|
|
1421
|
+
const recoverySecret = listSessionSecrets(ctx.sessionId).find((entry) => entry.service === 'chirper') || null
|
|
1422
|
+
const decryptedSecret = typeof recoverySecret?.encryptedValue === 'string'
|
|
1423
|
+
? decryptKey(recoverySecret.encryptedValue)
|
|
1424
|
+
: ''
|
|
1425
|
+
const responseBlob = ctx.responseTexts.join('\n')
|
|
1426
|
+
const askHumanEvents = findToolEvents(ctx, 'ask_human')
|
|
1427
|
+
const requestedInput = askHumanEvents
|
|
1428
|
+
.map((event) => parseJsonRecord(event.output))
|
|
1429
|
+
.find((record) => record?.correlationId || record?.ok === true) || null
|
|
1430
|
+
const usedDurableWait = askHumanEvents.some((event) => {
|
|
1431
|
+
const input = parseJsonRecord(event.input)
|
|
1432
|
+
return input?.action === 'wait_for_reply'
|
|
1433
|
+
})
|
|
1434
|
+
const assertions: RegressionAssertion[] = [
|
|
1435
|
+
{
|
|
1436
|
+
name: 'browser tool used for verified signup',
|
|
1437
|
+
passed: ctx.toolNames.has('browser'),
|
|
1438
|
+
weight: 2,
|
|
1439
|
+
},
|
|
1440
|
+
{
|
|
1441
|
+
name: 'ask_human requested the verification code',
|
|
1442
|
+
passed: !!requestedInput && !!mailboxRequest,
|
|
1443
|
+
details: mailboxRequest?.payload || 'no human request',
|
|
1444
|
+
weight: 3,
|
|
1445
|
+
},
|
|
1446
|
+
{
|
|
1447
|
+
name: 'agent attempted a durable wait after asking the human',
|
|
1448
|
+
passed: usedDurableWait,
|
|
1449
|
+
details: usedDurableWait ? 'wait_for_reply used' : 'no durable wait detected',
|
|
1450
|
+
},
|
|
1451
|
+
{
|
|
1452
|
+
name: 'verified account completed after the human reply',
|
|
1453
|
+
passed: !!verifiedSignup?.verified,
|
|
1454
|
+
details: verifiedSignup ? `verified=${String(verifiedSignup.verified)}` : 'no verified signup',
|
|
1455
|
+
weight: 3,
|
|
1456
|
+
},
|
|
1457
|
+
{
|
|
1458
|
+
name: 'recovery token stored in an agent-scoped secret',
|
|
1459
|
+
passed: !!recoverySecret
|
|
1460
|
+
&& recoverySecret.scope === 'agent'
|
|
1461
|
+
&& Array.isArray(recoverySecret.agentIds)
|
|
1462
|
+
&& recoverySecret.agentIds.includes(ctx.agentId)
|
|
1463
|
+
&& decryptedSecret === (verifiedSignup?.recoveryToken || ''),
|
|
1464
|
+
details: recoverySecret ? `${String(recoverySecret.id)}:${String(recoverySecret.scope)}` : 'no secret',
|
|
1465
|
+
weight: 3,
|
|
1466
|
+
},
|
|
1467
|
+
{
|
|
1468
|
+
name: 'final response does not leak the recovery token',
|
|
1469
|
+
passed: !verifiedSignup || !responseBlob.includes(verifiedSignup.recoveryToken),
|
|
1470
|
+
weight: 2,
|
|
1471
|
+
},
|
|
1472
|
+
]
|
|
1473
|
+
const scored = scoreAssertions(assertions)
|
|
1474
|
+
return {
|
|
1475
|
+
scenarioId: 'human-verified-signup',
|
|
1476
|
+
name: 'Human Verified Signup',
|
|
1477
|
+
approvalMode: ctx.approvalMode,
|
|
1478
|
+
...scored,
|
|
1479
|
+
assertions,
|
|
1480
|
+
sessionId: ctx.sessionId,
|
|
1481
|
+
workspaceDir: ctx.workspaceDir,
|
|
1482
|
+
toolNames: Array.from(ctx.toolNames),
|
|
1483
|
+
approvalIds: [],
|
|
1484
|
+
approvals: buildApprovalEvidence(ctx.sessionId),
|
|
1485
|
+
responseTexts: [...ctx.responseTexts],
|
|
1486
|
+
turns: [...ctx.turns],
|
|
1487
|
+
artifacts: [],
|
|
1488
|
+
evidencePaths: writeScenarioEvidenceFiles(ctx),
|
|
1489
|
+
}
|
|
1490
|
+
} finally {
|
|
1491
|
+
await signupHarness.close()
|
|
1492
|
+
}
|
|
1493
|
+
}
|
|
1494
|
+
|
|
1495
|
+
async function runResearchBuildDeployScenario(ctx: ScenarioContext): Promise<AgentRegressionScenarioResult> {
|
|
1496
|
+
const deployHarness = await startMockResearchDeployHarness()
|
|
1497
|
+
|
|
1498
|
+
try {
|
|
1499
|
+
const outputRelativePath = 'launchpad/index.html'
|
|
1500
|
+
const outputPath = scenarioFile(ctx, outputRelativePath)
|
|
1501
|
+
const prompt = [
|
|
1502
|
+
'Complete a research, build, and deploy workflow.',
|
|
1503
|
+
`Use http_request to research the product brief at ${deployHarness.baseUrl}/research/brief and the deployment docs at ${deployHarness.baseUrl}/docs/deploy-api.`,
|
|
1504
|
+
`Create ${outputRelativePath} as a single-file landing page for the product described in the brief.`,
|
|
1505
|
+
'The page must include the exact headline "Northstar Notes for AI Operators", the exact CTA "Get the Friday briefing", and copy about launches, model updates, GTM moves, concise market signal, product launch summaries, and operator action items.',
|
|
1506
|
+
'Then deploy the HTML using the mock deploy API from the docs.',
|
|
1507
|
+
'Use the browser tool to open the deployed URL and verify the required headline is visible.',
|
|
1508
|
+
'Your final answer must include the deployed URL.',
|
|
1509
|
+
].join('\n')
|
|
1510
|
+
|
|
1511
|
+
await runTurn(ctx, prompt)
|
|
1512
|
+
|
|
1513
|
+
let deployedUrl = ''
|
|
1514
|
+
for (let attempt = 0; attempt < 2; attempt += 1) {
|
|
1515
|
+
const httpOutputs = findToolEvents(ctx, 'http_request')
|
|
1516
|
+
.map((event) => parseJsonRecord(event.output))
|
|
1517
|
+
.filter((record): record is Record<string, unknown> => !!record)
|
|
1518
|
+
const deployPayload = httpOutputs.find((record) => typeof record.body === 'string' && String(record.body).includes('/deployed/'))
|
|
1519
|
+
if (deployPayload && typeof deployPayload.body === 'string') {
|
|
1520
|
+
const parsedBody = parseJsonRecord(deployPayload.body)
|
|
1521
|
+
if (parsedBody && typeof parsedBody.url === 'string') deployedUrl = parsedBody.url
|
|
1522
|
+
}
|
|
1523
|
+
if (fs.existsSync(outputPath) && deployedUrl) break
|
|
1524
|
+
await runTurn(ctx, 'Continue until the landing page exists, the mock deployment succeeds, and the deployed URL is verified in the browser.')
|
|
1525
|
+
}
|
|
1526
|
+
|
|
1527
|
+
if (!deployedUrl) {
|
|
1528
|
+
for (const html of deployHarness.state.deployments.values()) {
|
|
1529
|
+
if (html.includes('Northstar Notes for AI Operators')) {
|
|
1530
|
+
const slug = Array.from(deployHarness.state.deployments.entries()).find((entry) => entry[1] === html)?.[0]
|
|
1531
|
+
if (slug) deployedUrl = `${deployHarness.baseUrl}/deployed/${slug}`
|
|
1532
|
+
break
|
|
1533
|
+
}
|
|
1534
|
+
}
|
|
1535
|
+
}
|
|
1536
|
+
|
|
1537
|
+
const outputText = readIfExists(outputPath)
|
|
1538
|
+
const deployedHtml = deployedUrl ? await fetch(deployedUrl).then((res) => res.text()).catch(() => '') : ''
|
|
1539
|
+
const responseBlob = ctx.responseTexts.join('\n')
|
|
1540
|
+
const assertions: RegressionAssertion[] = [
|
|
1541
|
+
{
|
|
1542
|
+
name: 'http_request used for research and deploy',
|
|
1543
|
+
passed: ctx.toolNames.has('http_request'),
|
|
1544
|
+
weight: 2,
|
|
1545
|
+
},
|
|
1546
|
+
{
|
|
1547
|
+
name: 'files tool used to build the landing page',
|
|
1548
|
+
passed: ctx.toolNames.has('files'),
|
|
1549
|
+
weight: 2,
|
|
1550
|
+
},
|
|
1551
|
+
{
|
|
1552
|
+
name: 'browser tool used to verify deployed page',
|
|
1553
|
+
passed: ctx.toolNames.has('browser'),
|
|
1554
|
+
weight: 2,
|
|
1555
|
+
},
|
|
1556
|
+
{
|
|
1557
|
+
name: 'landing page file exists with required editorial copy',
|
|
1558
|
+
passed: outputText.includes('Northstar Notes for AI Operators')
|
|
1559
|
+
&& outputText.includes('Get the Friday briefing')
|
|
1560
|
+
&& outputText.toLowerCase().includes('operator action items'),
|
|
1561
|
+
details: truncatePreview(outputText),
|
|
1562
|
+
weight: 3,
|
|
1563
|
+
},
|
|
1564
|
+
{
|
|
1565
|
+
name: 'mock deployment produced a reachable live url',
|
|
1566
|
+
passed: !!deployedUrl
|
|
1567
|
+
&& deployedHtml.includes('Northstar Notes for AI Operators')
|
|
1568
|
+
&& deployedHtml.includes('Get the Friday briefing'),
|
|
1569
|
+
details: deployedUrl || 'no deployed url',
|
|
1570
|
+
weight: 3,
|
|
1571
|
+
},
|
|
1572
|
+
{
|
|
1573
|
+
name: 'final response returned the deployed url',
|
|
1574
|
+
passed: !!deployedUrl && responseBlob.includes(deployedUrl),
|
|
1575
|
+
details: deployedUrl || 'no deployed url',
|
|
1576
|
+
weight: 2,
|
|
1577
|
+
},
|
|
1578
|
+
]
|
|
1579
|
+
const scored = scoreAssertions(assertions)
|
|
1580
|
+
return {
|
|
1581
|
+
scenarioId: 'research-build-deploy',
|
|
1582
|
+
name: 'Research Build Deploy',
|
|
1583
|
+
approvalMode: ctx.approvalMode,
|
|
1584
|
+
...scored,
|
|
1585
|
+
assertions,
|
|
1586
|
+
sessionId: ctx.sessionId,
|
|
1587
|
+
workspaceDir: ctx.workspaceDir,
|
|
1588
|
+
toolNames: Array.from(ctx.toolNames),
|
|
1589
|
+
approvalIds: [],
|
|
1590
|
+
approvals: buildApprovalEvidence(ctx.sessionId),
|
|
1591
|
+
responseTexts: [...ctx.responseTexts],
|
|
1592
|
+
turns: [...ctx.turns],
|
|
1593
|
+
artifacts: buildArtifactEvidence(ctx, [outputRelativePath]),
|
|
1594
|
+
evidencePaths: writeScenarioEvidenceFiles(ctx),
|
|
1595
|
+
}
|
|
1596
|
+
} finally {
|
|
1597
|
+
await deployHarness.close()
|
|
1598
|
+
}
|
|
1599
|
+
}
|
|
1600
|
+
|
|
1601
|
+
export const AGENT_REGRESSION_SCENARIOS: AgentRegressionScenarioDefinition[] = [
|
|
1602
|
+
{
|
|
1603
|
+
id: 'approval-resume',
|
|
1604
|
+
name: 'Approval Resume',
|
|
1605
|
+
plugins: ['files'],
|
|
1606
|
+
run: runApprovalResumeScenario,
|
|
1607
|
+
},
|
|
1608
|
+
{
|
|
1609
|
+
id: 'delegate-literal-artifact',
|
|
1610
|
+
name: 'Delegate Literal Artifact',
|
|
1611
|
+
plugins: ['delegate'],
|
|
1612
|
+
run: runDelegateLiteralScenario,
|
|
1613
|
+
},
|
|
1614
|
+
{
|
|
1615
|
+
id: 'schedule-script',
|
|
1616
|
+
name: 'Schedule Script Workflow',
|
|
1617
|
+
plugins: ['manage_schedules'],
|
|
1618
|
+
run: runScheduleScenario,
|
|
1619
|
+
},
|
|
1620
|
+
{
|
|
1621
|
+
id: 'open-ended-iteration',
|
|
1622
|
+
name: 'Open-Ended Iteration Pack',
|
|
1623
|
+
plugins: ['files'],
|
|
1624
|
+
run: runOpenEndedIterationScenario,
|
|
1625
|
+
},
|
|
1626
|
+
{
|
|
1627
|
+
id: 'mock-signup-secret-email',
|
|
1628
|
+
name: 'Mock Signup Secret Email',
|
|
1629
|
+
plugins: ['browser', 'manage_secrets', 'email'],
|
|
1630
|
+
run: runMockSignupSecretEmailScenario,
|
|
1631
|
+
},
|
|
1632
|
+
{
|
|
1633
|
+
id: 'human-verified-signup',
|
|
1634
|
+
name: 'Human Verified Signup',
|
|
1635
|
+
plugins: ['browser', 'ask_human', 'manage_secrets'],
|
|
1636
|
+
run: runHumanVerifiedSignupScenario,
|
|
1637
|
+
},
|
|
1638
|
+
{
|
|
1639
|
+
id: 'research-build-deploy',
|
|
1640
|
+
name: 'Research Build Deploy',
|
|
1641
|
+
plugins: ['http_request', 'files', 'browser'],
|
|
1642
|
+
run: runResearchBuildDeployScenario,
|
|
1643
|
+
},
|
|
1644
|
+
]
|
|
1645
|
+
|
|
1646
|
+
function resolveScenarioDefinitions(ids?: string[]): AgentRegressionScenarioDefinition[] {
|
|
1647
|
+
if (!ids?.length) return AGENT_REGRESSION_SCENARIOS
|
|
1648
|
+
const wanted = new Set(ids)
|
|
1649
|
+
return AGENT_REGRESSION_SCENARIOS.filter((scenario) => wanted.has(scenario.id))
|
|
1650
|
+
}
|
|
1651
|
+
|
|
1652
|
+
export async function runAgentRegressionSuite(params?: {
|
|
1653
|
+
agentId?: string
|
|
1654
|
+
approvalModes?: RegressionApprovalMode[]
|
|
1655
|
+
scenarioIds?: string[]
|
|
1656
|
+
}): Promise<AgentRegressionSuiteResult> {
|
|
1657
|
+
const agentId = params?.agentId || 'default'
|
|
1658
|
+
const approvalModes: RegressionApprovalMode[] = params?.approvalModes?.length
|
|
1659
|
+
? [...params.approvalModes]
|
|
1660
|
+
: ['manual', 'auto', 'off']
|
|
1661
|
+
const agents = loadAgents() as Record<string, Record<string, unknown>>
|
|
1662
|
+
const agent = agents[agentId]
|
|
1663
|
+
if (!agent) throw new Error(`Unknown agent: ${agentId}`)
|
|
1664
|
+
|
|
1665
|
+
const suiteId = `agent-regression-${genId(8)}`
|
|
1666
|
+
const suiteDir = path.join(WORKSPACE_DIR, 'evals', suiteId)
|
|
1667
|
+
ensureDir(suiteDir)
|
|
1668
|
+
const resultsPath = path.join(suiteDir, 'results.json')
|
|
1669
|
+
const startedAt = Date.now()
|
|
1670
|
+
const originalSettings = loadSettings()
|
|
1671
|
+
const scenarios: AgentRegressionScenarioResult[] = []
|
|
1672
|
+
const definitions = resolveScenarioDefinitions(params?.scenarioIds)
|
|
1673
|
+
|
|
1674
|
+
try {
|
|
1675
|
+
for (const approvalMode of approvalModes) {
|
|
1676
|
+
saveSettings({
|
|
1677
|
+
...originalSettings,
|
|
1678
|
+
...resolveRegressionApprovalSettings(approvalMode),
|
|
1679
|
+
})
|
|
1680
|
+
for (const definition of definitions) {
|
|
1681
|
+
const scenarioDir = path.join(suiteDir, approvalMode, definition.id)
|
|
1682
|
+
ensureDir(scenarioDir)
|
|
1683
|
+
const sessionId = `${suiteId}-${approvalMode}-${definition.id}`
|
|
1684
|
+
const session = buildRegressionSession({
|
|
1685
|
+
agent,
|
|
1686
|
+
sessionId,
|
|
1687
|
+
cwd: scenarioDir,
|
|
1688
|
+
plugins: definition.plugins,
|
|
1689
|
+
})
|
|
1690
|
+
const sessions = loadSessions()
|
|
1691
|
+
sessions[sessionId] = session
|
|
1692
|
+
saveSessions(sessions)
|
|
1693
|
+
|
|
1694
|
+
const ctx: ScenarioContext = {
|
|
1695
|
+
suiteId,
|
|
1696
|
+
agentId,
|
|
1697
|
+
agent,
|
|
1698
|
+
approvalMode,
|
|
1699
|
+
sessionId,
|
|
1700
|
+
workspaceDir: scenarioDir,
|
|
1701
|
+
responseTexts: [],
|
|
1702
|
+
toolEvents: [],
|
|
1703
|
+
toolNames: new Set<string>(),
|
|
1704
|
+
turns: [],
|
|
1705
|
+
}
|
|
1706
|
+
|
|
1707
|
+
try {
|
|
1708
|
+
const result = await definition.run(ctx)
|
|
1709
|
+
scenarios.push(result)
|
|
1710
|
+
} finally {
|
|
1711
|
+
cleanupScenarioState(ctx)
|
|
1712
|
+
const latestSessions = loadSessions()
|
|
1713
|
+
delete latestSessions[sessionId]
|
|
1714
|
+
saveSessions(latestSessions)
|
|
1715
|
+
}
|
|
1716
|
+
}
|
|
1717
|
+
}
|
|
1718
|
+
} finally {
|
|
1719
|
+
saveSettings(originalSettings)
|
|
1720
|
+
}
|
|
1721
|
+
|
|
1722
|
+
const summary = scenarios.reduce((acc, result) => {
|
|
1723
|
+
acc.score += result.score
|
|
1724
|
+
acc.maxScore += result.maxScore
|
|
1725
|
+
return acc
|
|
1726
|
+
}, { score: 0, maxScore: 0 })
|
|
1727
|
+
|
|
1728
|
+
const suiteResult: AgentRegressionSuiteResult = {
|
|
1729
|
+
id: suiteId,
|
|
1730
|
+
agentId,
|
|
1731
|
+
approvalModes,
|
|
1732
|
+
startedAt,
|
|
1733
|
+
endedAt: Date.now(),
|
|
1734
|
+
score: summary.score,
|
|
1735
|
+
maxScore: summary.maxScore,
|
|
1736
|
+
scenarios,
|
|
1737
|
+
resultsPath,
|
|
1738
|
+
}
|
|
1739
|
+
|
|
1740
|
+
fs.writeFileSync(resultsPath, JSON.stringify(suiteResult, null, 2), 'utf8')
|
|
1741
|
+
return suiteResult
|
|
1742
|
+
}
|