autopreso 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/server.js ADDED
@@ -0,0 +1,996 @@
1
+ import { createHash } from "node:crypto";
2
+ import { appendFileSync } from "node:fs";
3
+ import { createServer as createHttpServer } from "node:http";
4
+ import path from "node:path";
5
+ import { fileURLToPath } from "node:url";
6
+
7
+ import { generateText, stepCountIs, streamText, tool } from "ai";
8
+ import express from "express";
9
+ import { WebSocket, WebSocketServer } from "ws";
10
+ import { z } from "zod";
11
+
12
+ import {
13
+ createWhiteboardAgentModel,
14
+ defaultWhiteboardAgentProvider,
15
+ resolveAgentProviderFromSettings,
16
+ } from "./agent-provider.js";
17
+ import { createMoonshineTranscription as createDefaultMoonshineTranscription } from "./moonshine-transcription.js";
18
+ import { createOpenAITranscription as createDefaultOpenAITranscription } from "./openai-transcription.js";
19
+ import { broadcast, createWhiteboardSession } from "./whiteboard-session.js";
20
+ import { detectMalformedLayoutWarnings, normalizeWhiteboardElements } from "./whiteboard-elements.js";
21
+ import { applyWhiteboardEditOperations, formatLineNumberedWhiteboard } from "./whiteboard-tools.js";
22
+
23
// ESM has no __dirname; derive it from import.meta.url so the static asset
// directory resolves relative to the installed package, not the process cwd.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const PUBLIC_DIR = path.join(__dirname, "..", "public");
// Hard ceiling (ms) on a single whiteboard agent generation; used as the
// fallback when options.agentTimeoutMs / options.warmupTimeoutMs are unset.
export const DEFAULT_AGENT_TIMEOUT_MS = 90_000;
26
+
27
/**
 * Boot the autopreso HTTP + WebSocket server.
 *
 * Wires together four pieces: an Express app (static frontend + JSON API), a
 * WebSocketServer on /ws for realtime audio/whiteboard traffic, the whiteboard
 * session state machine, and the transcription engine manager.
 *
 * @param {object} options - server configuration: `port`, `host`, optional
 *   `settingsStore`, optional `generateTextFn`/`streamTextFn` overrides
 *   (presumably for tests - confirm with callers), warmup tuning
 *   (`warmupDelays`, `warmupMaxAttempts`), and callbacks (`onAgentEvent`,
 *   `onStatus`, `createTranscription`).
 * @returns {Promise<{app, httpServer, state, url}>} handles for the caller to
 *   inspect or shut down.
 */
export async function startServer(options) {
  const app = express();
  app.use(express.json({ limit: "1mb" }));
  app.use(express.static(PUBLIC_DIR));

  const httpServer = createHttpServer(app);
  const wss = new WebSocketServer({ server: httpServer, path: "/ws" });
  // The session owns transcript queueing and decides when to invoke runAgent.
  const state = createWhiteboardSession({
    options,
    wss,
    runAgent: ({ transcript, state, wss, options }) =>
      runWhiteboardAgent({
        transcript,
        state,
        wss,
        options,
        generateTextFn: options.generateTextFn ?? generateText,
        streamTextFn: options.streamTextFn ?? streamText,
      }),
  });

  // Loads the initial transcription engine before the server starts listening.
  const transcription = await createTranscriptionManager({
    options,
    wss,
    queueTranscript: (transcript) => state.queueTranscript(transcript),
  });

  app.get("/api/config", async (_req, res) => {
    const sanitized = options.settingsStore ? await options.settingsStore.getSanitized() : null;
    res.json({
      transcriptionEngine: transcription.getLabel(),
      settings: sanitized,
    });
  });

  app.get("/api/settings", async (_req, res) => {
    if (!options.settingsStore) return res.status(404).json({ error: "Settings store not available." });
    res.json(await options.settingsStore.getSanitized());
  });

  app.post("/api/session/reset", (_req, res) => {
    state.reset();
    broadcast(wss, { type: "whiteboard:update", elements: state.elements });
    res.json({ ok: true });
  });

  // Transition staging -> live: seed the agent primer from the staged canvas
  // and kick off the cache-warmup loop.
  app.post("/api/preso/start", (req, res) => {
    const { stagingElements, stagingScreenshot } = req.body ?? {};
    if (!Array.isArray(stagingElements)) {
      return res.status(400).json({ error: "stagingElements (array) is required." });
    }
    // buildStagingPrimerMessage is defined elsewhere in this file (not visible
    // in this chunk).
    const primerMessage = buildStagingPrimerMessage({ stagingElements, stagingScreenshot });
    state.startPreso({ primerMessage });
    state.startWarmupLoop({
      runOnce: ({ attempt }) =>
        runWhiteboardWarmupOnce({
          state,
          options,
          attempt,
          generateTextFn: options.generateTextFn ?? generateText,
          streamTextFn: options.streamTextFn ?? streamText,
        }).catch((error) => {
          // Warmup is best-effort: log, notify, and report zero usage so the
          // loop keeps its schedule instead of aborting the preso.
          console.error(`Whiteboard warmup attempt ${attempt} failed:`, error);
          options.onAgentEvent?.({ type: "warmup:error", attempt, error: error.message, timestamp: new Date().toISOString() });
          return { usage: { input: 0, cached: 0, output: 0, reasoning: 0 } };
        }),
      delays: options.warmupDelays,
      maxAttempts: options.warmupMaxAttempts,
      // After the loop ends, append [warmup_user_msg, assistant("UNDERSTOOD")]
      // to agentHistory so every subsequent turn's request prefix starts with
      // exactly the bytes warmup wrote to cache.
      primingMessages: WARMUP_PRIMING_MESSAGES,
    });
    broadcast(wss, { type: "mode", mode: state.mode });
    broadcast(wss, { type: "whiteboard:update", elements: state.elements });
    res.json({ ok: true });
  });

  app.post("/api/preso/warmup/cancel", (_req, res) => {
    state.cancelWarmup();
    res.json({ ok: true });
  });

  app.post("/api/preso/back-to-staging", (_req, res) => {
    state.backToStaging();
    broadcast(wss, { type: "mode", mode: state.mode });
    res.json({ ok: true });
  });

  app.put("/api/settings", async (req, res) => {
    if (!options.settingsStore) return res.status(404).json({ error: "Settings store not available." });
    try {
      await options.settingsStore.save(req.body ?? {});
      // Saving may change the transcription provider/model; hot-swap it.
      await transcription.applyCurrent();
      const sanitized = await options.settingsStore.getSanitized();
      res.json({ settings: sanitized, transcriptionEngine: transcription.getLabel() });
      // Push the new settings to every connected client, not just the caller.
      broadcast(wss, { type: "settings", settings: sanitized });
      broadcast(wss, { type: "config", transcriptionEngine: transcription.getLabel() });
    } catch (error) {
      res.status(400).json({ error: error.message });
    }
  });

  httpServer.on("close", () => transcription.close());

  wss.on("connection", async (client) => {
    // Handshake: replay current config/settings/agent/mode/warmup state so a
    // freshly-connected (or reconnecting) client is immediately in sync.
    client.send(JSON.stringify({ type: "config", transcriptionEngine: transcription.getLabel() }));
    if (options.settingsStore) {
      const sanitized = await options.settingsStore.getSanitized();
      client.send(JSON.stringify({ type: "settings", settings: sanitized }));
    }
    client.send(JSON.stringify({ type: "agent:status", status: state.agentStatus }));
    client.send(JSON.stringify({ type: "mode", mode: state.mode }));
    client.send(JSON.stringify({ type: "warmup", ...state.warmupState }));
    if (state.mode === "live") {
      client.send(JSON.stringify({ type: "whiteboard:update", elements: state.elements }));
    }

    client.on("message", async (raw) => {
      let message;
      try {
        message = JSON.parse(raw.toString());
      } catch {
        // Ignore non-JSON frames.
        return;
      }

      if (message.type === "audio") {
        if (state.mode === "live") transcription.sendAudio(message.audio);
      }

      if (message.type === "stop") {
        transcription.stop();
      }

      if (message.type === "whiteboard:screenshot" && typeof message.image === "string") {
        if (state.mode === "live") state.updateLatestScreenshot(message.image);
      }

      if (message.type === "warmup:cancel") {
        state.cancelWarmup();
      }

      if (message.type === "whiteboard:user-elements" && Array.isArray(message.elements)) {
        // The user can draw on the live canvas before clicking Start listening
        // (and during it). Frontend pushes the current scene here so the next
        // transcript turn has fresh elements available to the agent.
        if (state.mode === "live") {
          state.elements = message.elements;
        }
      }

      if (message.type === "settings:update" && options.settingsStore) {
        try {
          await options.settingsStore.save(message.patch ?? {});
          await transcription.applyCurrent();
          const sanitized = await options.settingsStore.getSanitized();
          broadcast(wss, { type: "settings", settings: sanitized });
          broadcast(wss, { type: "config", transcriptionEngine: transcription.getLabel() });
        } catch (error) {
          // Report the failure only to the client that sent the patch.
          client.send(JSON.stringify({ type: "error", message: `Failed to apply settings: ${error.message}` }));
        }
      }
    });
  });

  await new Promise((resolve) => httpServer.listen(options.port, options.host, resolve));
  // With port 0 the OS picks a free port; read the real one back.
  const address = httpServer.address();
  const port = typeof address === "object" && address ? address.port : options.port;
  return {
    app,
    httpServer,
    state,
    url: `http://${options.host}:${port}`,
  };
}
202
+
203
/**
 * Manage the lifecycle of the active transcription engine.
 *
 * Picks Moonshine or OpenAI transcription (from the settings store when
 * present, otherwise from CLI-style options), constructs it, and hot-swaps it
 * when saved settings change the effective provider/model. Returns a thin
 * facade so callers never hold a stale engine reference.
 *
 * @returns {Promise<{sendAudio, stop, close, getLabel, applyCurrent}>}
 */
async function createTranscriptionManager({ options, wss, queueTranscript }) {
  // Active engine instance and the human-readable label it was built for.
  let current = null;
  let label = "";

  const sendTranscript = (message) => broadcast(wss, message);

  // Merge persisted settings over base options for the engine factory;
  // the OpenAI key from settings wins over the environment when set.
  function buildOptionsForFactory(settings) {
    if (!settings) return options;
    return {
      ...options,
      moonshineModel: settings.transcription.moonshine.model,
      openaiTranscriptionModel: settings.transcription.openai.model,
      env: { ...(options.env ?? process.env), OPENAI_API_KEY: settings.apiKeys?.openai || (options.env ?? process.env).OPENAI_API_KEY },
    };
  }

  // Choose the engine factory; options.createTranscription is an injection
  // hook that overrides provider selection entirely.
  function pickFactory(settings) {
    if (options.createTranscription) return options.createTranscription;
    const provider = settings ? settings.transcription.provider : options.transcriptionProvider;
    if (provider === "openai") return createDefaultOpenAITranscription;
    return createDefaultMoonshineTranscription;
  }

  // Label doubles as the change-detection key in applyCurrent: it encodes
  // both provider and model, so any effective change produces a new label.
  function describeLabel(settings) {
    if (settings) {
      if (settings.transcription.provider === "openai") return `OpenAI ${settings.transcription.openai.model}`;
      return `Moonshine ${settings.transcription.moonshine.model}`;
    }
    if (options.transcriptionProvider === "openai") return `OpenAI ${options.openaiTranscriptionModel}`;
    return `Moonshine ${options.moonshineModel}`;
  }

  // (Re)build the engine to match current settings. No-op when the effective
  // provider+model is unchanged; otherwise closes the old engine first.
  async function applyCurrent() {
    const settings = options.settingsStore ? await options.settingsStore.load() : null;
    const newLabel = describeLabel(settings);

    if (current && newLabel === label) return;

    if (current) current.close();

    const factoryOptions = buildOptionsForFactory(settings);
    const factory = pickFactory(settings);
    // NOTE(review): label is updated before factory()/ready() succeed, so a
    // failed swap leaves label describing an engine that isn't running.
    label = newLabel;
    options.onStatus?.(`Loading ${label} transcription model...`);
    current = factory({
      sendTranscript,
      queueTranscript,
      options: factoryOptions,
      env: factoryOptions.env,
    });
    await current.ready();
    options.onStatus?.(`${label} transcription model is ready.`);
  }

  await applyCurrent();

  // Facade closes over `current` so swaps are transparent to callers.
  return {
    sendAudio: (audio) => current?.sendAudio(audio),
    stop: () => current?.stop(),
    close: () => current?.close(),
    getLabel: () => label,
    applyCurrent,
  };
}
267
+
268
/**
 * Run one whiteboard agent turn for a finished speaker transcript.
 *
 * Builds the message list (elements + history + optional screenshot +
 * transcript), resolves the model provider, and calls the model with the two
 * whiteboard tools. Tool executions mutate `state.elements` and broadcast
 * updates to connected clients as they happen. After the turn, the transcript
 * is appended to `state.agentHistory`.
 *
 * Helpers referenced here (buildWhiteboardAgentMessages, whiteboardSystemPrompt,
 * extractPrimerText, buildEffectiveSystemPrompt, reshapeMessagesForCodex,
 * createWhiteboardAgentProviderOptions, withTimeout, appendWhiteboardAgentHistory)
 * are defined elsewhere in this file and are not visible in this chunk.
 *
 * @returns {Promise<object>} the raw generation result (text/usage/toolCalls/...)
 */
export async function runWhiteboardAgent({ transcript, state, wss, options, generateTextFn = generateText, streamTextFn = streamText }) {
  // Only attach the live screenshot when the canvas has been edited since the
  // last attach. On DONE-only turns nothing changed, so the screenshot adds
  // ~7-10k tokens of noise without giving the agent new visual info.
  const screenshotForAgent = state.canvasDirtyForAgent ? state.latestScreenshot : undefined;
  state.canvasDirtyForAgent = false;
  const rawMessages = buildWhiteboardAgentMessages({
    elements: state.elements,
    agentHistory: state.agentHistory,
    latestScreenshot: screenshotForAgent,
    transcript,
  });
  // NOTE: these schemas are intentionally duplicated in runWhiteboardWarmupOnce
  // so both requests have identical tool definitions (prompt-cache parity).
  const whiteboardElementSchema = z.record(z.string(), z.any());
  const editOperationSchema = z.discriminatedUnion("type", [
    z.object({
      type: z.literal("replace"),
      line: z.number().int().positive().describe("Current 1-based line number to replace."),
      element: whiteboardElementSchema.describe("Replacement drawing object for this line."),
    }),
    z.object({
      type: z.literal("insert_after"),
      line: z.number().int().min(0).describe("Current line number to insert after. Use 0 to insert at the start."),
      element: whiteboardElementSchema.describe("Drawing object to insert after this line."),
    }),
    z.object({
      type: z.literal("delete"),
      line: z.number().int().positive().describe("Current 1-based line number to delete."),
    }),
  ]);

  const baseSystem = whiteboardSystemPrompt();

  // Explicit provider wins; otherwise derive from persisted settings or fall
  // back to the default provider for the given options.
  const agentProvider = options.agentProvider
    ?? (options.settingsStore
      ? resolveAgentProviderFromSettings({ settings: await options.settingsStore.load(), env: options.env ?? process.env })
      : defaultWhiteboardAgentProvider(options));
  // Fold the primer text into the system prompt for both openai and codex
  // providers. The primer image (if any) stays in messages[0] - system prompts
  // are text-only across these APIs. This keeps the staging context as a
  // first-class system instruction rather than a stale early user message.
  const primerText = extractPrimerText(state.agentHistory?.[0]);
  const effectiveSystem = buildEffectiveSystemPrompt(baseSystem, primerText);
  const messages = primerText ? reshapeMessagesForCodex(rawMessages) : rawMessages;
  options.onAgentEvent?.({ type: "model:start", transcript, system: effectiveSystem, messages, timestamp: new Date().toISOString() });
  const codexInstructions = agentProvider.provider === "codex" ? effectiveSystem : null;
  dumpAgentRequest("turn", { system: effectiveSystem, messages, instructions: codexInstructions, primerText });
  const agentCallOptions = {
    model: createWhiteboardAgentModel(agentProvider),
    providerOptions: createWhiteboardAgentProviderOptions(agentProvider, effectiveSystem),
    // Cap the tool-call loop at 4 steps per turn.
    stopWhen: stepCountIs(4),
    system: effectiveSystem,
    messages,
    tools: {
      whiteboard_overwrite: tool({
        description: "Replace the entire whiteboard with a complete drawing object array. Use only for clearing, resetting, or starting fresh.",
        inputSchema: z.object({
          elements: z.array(whiteboardElementSchema).describe("Complete replacement drawing object array."),
        }),
        execute: async ({ elements }) => {
          options.onAgentEvent?.({ type: "tool:start", tool: "whiteboard_overwrite", input: { elements }, timestamp: new Date().toISOString() });
          const normalizedElements = normalizeWhiteboardElements(elements);
          state.elements = normalizedElements;
          // Mark the canvas dirty so the NEXT turn re-attaches a screenshot.
          state.canvasDirtyForAgent = true;
          broadcast(wss, { type: "whiteboard:update", elements: normalizedElements });
          // Return the line-numbered board (plus layout warnings) so the model
          // sees the post-edit state it will address in follow-up calls.
          const result = appendLayoutWarnings(formatLineNumberedWhiteboard(normalizedElements), normalizedElements);
          dumpToolCall("whiteboard_overwrite", { elementCount: elements.length, ids: elements.map((el) => el.id) }, normalizedElements.map((el) => el.id), result);
          options.onAgentEvent?.({ type: "tool:end", tool: "whiteboard_overwrite", result, elements: normalizedElements, timestamp: new Date().toISOString() });
          return result;
        },
      }),
      whiteboard_apply: tool({
        description: "Apply edits and/or move the viewport in a SINGLE call. Combine everything you want to do this turn into one whiteboard_apply call - do not split into back-to-back calls. Either operations, viewport, or both must be provided. operations applies edits in line-number order; viewport scrolls/zooms after edits land. For scroll_to_content, ALWAYS pass focus_ids.",
        inputSchema: z.object({
          operations: z.array(editOperationSchema).optional().describe("Edit operations applied in order. Omit (or pass empty) when you only want to move the viewport."),
          viewport: z.object({
            action: z.enum(["scroll_to_content", "set_zoom", "zoom_in", "zoom_out", "reset_zoom"]),
            zoom: z.number().min(0.1).max(3).optional().describe("Zoom value for set_zoom. 1 is 100%."),
            focus_ids: z.array(z.string()).optional().describe("For scroll_to_content: stable element IDs the audience should look at right now (typically the elements you just edited or the cluster the speaker is currently discussing). Pass 1-5 IDs - the active talking point, not the whole diagram."),
          }).optional().describe("Optional viewport command applied AFTER any edits. Omit when no viewport change is needed."),
        }),
        execute: async ({ operations, viewport }) => {
          const hasOps = Array.isArray(operations) && operations.length > 0;
          const hasViewport = viewport && typeof viewport === "object";
          if (!hasOps && !hasViewport) {
            // Empty calls get a corrective message rather than an error so the
            // model can recover within the same turn.
            const msg = "whiteboard_apply: Provide at least one of operations or viewport. Empty calls are not allowed - if there's nothing to do, don't call this tool.";
            dumpToolCall("whiteboard_apply", { operations, viewport }, state.elements.map((el) => el.id), msg);
            return msg;
          }
          options.onAgentEvent?.({ type: "tool:start", tool: "whiteboard_apply", input: { operations, viewport }, timestamp: new Date().toISOString() });

          // Edits land first so the viewport command below operates on the
          // post-edit scene.
          let canvasResult = "";
          if (hasOps) {
            const nextElements = normalizeWhiteboardElements(applyWhiteboardEditOperations(state.elements, operations));
            state.elements = nextElements;
            state.canvasDirtyForAgent = true;
            broadcast(wss, { type: "whiteboard:update", elements: nextElements });
            canvasResult = appendLayoutWarnings(formatLineNumberedWhiteboard(nextElements), nextElements);
          }

          let viewportResult = "";
          if (hasViewport) {
            const { action, zoom, focus_ids } = viewport;
            const broadcastPayload = {
              action,
              ...(zoom === undefined ? {} : { zoom }),
              ...(Array.isArray(focus_ids) && focus_ids.length > 0 ? { focus_ids } : {}),
            };
            broadcast(wss, { type: "whiteboard:viewport", ...broadcastPayload });
            // For scroll_to_content, validate focus_ids against the scene and
            // coach the model toward passing useful IDs next time.
            if (action === "scroll_to_content") {
              if (!focus_ids || focus_ids.length === 0) {
                viewportResult = "Viewport scrolled to fit ALL content. Next time, pass focus_ids so the audience sees the active talking point, not the whole canvas.";
              } else {
                const sceneIds = new Set(state.elements.map((el) => el.id));
                const known = focus_ids.filter((id) => sceneIds.has(id));
                const unknown = focus_ids.filter((id) => !sceneIds.has(id));
                if (known.length === 0) {
                  viewportResult = `Viewport WARNING: none of focus_ids ${JSON.stringify(focus_ids)} match any element in the current scene (scene has ids: ${JSON.stringify([...sceneIds].slice(0, 12))}${sceneIds.size > 12 ? ", ..." : ""}). The frontend fell back to fitting the entire canvas. Use IDs from the line-numbered whiteboard content above.`;
                } else if (unknown.length > 0) {
                  viewportResult = `Viewport command sent. NOTE: ${unknown.length} of your focus_ids did not match any scene element and were ignored: ${JSON.stringify(unknown)}. The viewport scrolled to: ${JSON.stringify(known)}.`;
                } else {
                  viewportResult = `Viewport scrolled to ${known.length} element${known.length === 1 ? "" : "s"}: ${JSON.stringify(known)}.`;
                }
              }
            } else {
              viewportResult = "Viewport command sent.";
            }
          }

          const result = [canvasResult, viewportResult].filter(Boolean).join("\n\n");
          dumpToolCall("whiteboard_apply", { operations, viewport }, state.elements.map((el) => el.id), result);
          options.onAgentEvent?.({ type: "tool:end", tool: "whiteboard_apply", result, elements: state.elements, timestamp: new Date().toISOString() });
          return result;
        },
      }),
    },
  };

  const result = await withTimeout(
    runWhiteboardAgentGeneration(agentProvider, agentCallOptions, { generateTextFn, streamTextFn }),
    options.agentTimeoutMs ?? DEFAULT_AGENT_TIMEOUT_MS,
    "Whiteboard agent timed out",
  );
  // Fingerprints let the cache log show whether system/primer/tools bytes
  // changed between requests (cache misses are usually one of these drifting).
  logAgentUsage("turn", result, {
    transcript: transcript?.slice(0, 80),
    fingerprints: {
      system: fingerprint(effectiveSystem),
      primer: fingerprint(state.agentHistory[0]),
      tools: fingerprint(toolDefinitionFingerprintInput(agentCallOptions.tools)),
    },
  });
  options.onAgentEvent?.({ type: "model:end", transcript, result: summarizeAgentResult(result), timestamp: new Date().toISOString() });

  state.agentHistory = appendWhiteboardAgentHistory(state.agentHistory, {
    transcript,
  });
  return result;
}
425
+
426
// Attach numbered layout-sanity warnings (if any) to the formatted board text
// so the agent is nudged to repair malformed scenes on its next edit.
function appendLayoutWarnings(formattedBoard, elements) {
  const warnings = detectMalformedLayoutWarnings(elements);
  if (!warnings.length) return formattedBoard;
  const numbered = warnings
    .map((warning, index) => `WARNING ${index + 1}: ${warning}`)
    .join("\n");
  return `${formattedBoard}\n\n${numbered}\n\nFix the warnings above on your next edit so the rendered scene actually looks right.`;
}
431
+
432
// Run one model generation, dispatching on provider: non-codex providers use
// the one-shot generateText path; codex must stream, so we drain the stream
// and then collect the finalized fields into a generateText-shaped object.
async function runWhiteboardAgentGeneration(agentProvider, agentCallOptions, { generateTextFn, streamTextFn }) {
  if (agentProvider.provider !== "codex") {
    return generateTextFn(agentCallOptions);
  }

  const stream = streamTextFn(agentCallOptions);
  await stream.consumeStream();

  // streamText exposes the final values as promise-properties on the result.
  // After consumeStream resolves they resolve too. Read them defensively so
  // older SDK versions or test mocks without these fields don't throw.
  const readFinal = async (field) => {
    try {
      const candidate = stream?.[field];
      const isThenable = candidate && typeof candidate.then === "function";
      return isThenable ? await candidate : candidate;
    } catch {
      return undefined;
    }
  };

  return {
    text: await readFinal("text"),
    finishReason: await readFinal("finishReason"),
    usage: await readFinal("usage"),
    toolCalls: await readFinal("toolCalls"),
    toolResults: await readFinal("toolResults"),
    steps: await readFinal("steps"),
  };
}
457
+
458
// Identical warmup message across attempts AND identical to the priming pair
// appended to agentHistory after warmup. Once warmup writes a cache entry for
// [primer, WARMUP_USER_MESSAGE], every subsequent turn whose prefix starts with
// [primer, WARMUP_USER_MESSAGE, assistant("UNDERSTOOD"), ...] hits that cache.
// NOTE: these objects are shared by reference across requests and the priming
// history - treat them as immutable.
export const WARMUP_USER_MESSAGE = {
  role: "user",
  content: "Speaker turn:\n(cache warmup - no spoken content yet, confirm readiness by responding UNDERSTOOD without calling tools)",
};
// Canned acknowledgement paired with WARMUP_USER_MESSAGE in the priming pair.
export const WARMUP_ASSISTANT_REPLY = { role: "assistant", content: "UNDERSTOOD" };
// The [user, assistant] pair appended to agentHistory once the warmup loop ends.
export const WARMUP_PRIMING_MESSAGES = [WARMUP_USER_MESSAGE, WARMUP_ASSISTANT_REPLY];
468
+
469
/**
 * Execute a single prompt-cache warmup request.
 *
 * Sends [primer, WARMUP_USER_MESSAGE] with the same system prompt and tool
 * definitions as a real turn, so the provider caches the shared request
 * prefix before the speaker starts talking. Returns undefined when there is
 * no primer yet (nothing worth warming).
 *
 * @returns {Promise<{usage, result}|undefined>} usage in the normalized
 *   {input, cached, output, reasoning} shape.
 */
export async function runWhiteboardWarmupOnce({ state, options, attempt = 1, generateTextFn = generateText, streamTextFn = streamText }) {
  if (!Array.isArray(state.agentHistory) || state.agentHistory.length === 0) return undefined;

  const baseSystem = whiteboardSystemPrompt();
  const agentProvider = options.agentProvider
    ?? (options.settingsStore
      ? resolveAgentProviderFromSettings({ settings: await options.settingsStore.load(), env: options.env ?? process.env })
      : defaultWhiteboardAgentProvider(options));
  const primerText = extractPrimerText(state.agentHistory[0]);
  const effectiveSystem = buildEffectiveSystemPrompt(baseSystem, primerText);

  // Each warmup attempt sends the IDENTICAL prefix [primer, WARMUP_USER_MESSAGE]
  // so attempt N hits the cache that attempt N-1 wrote. We must NOT mutate
  // state.agentHistory until the loop ends - otherwise attempt 2's prefix
  // would differ from attempt 1's and cache wouldn't share.
  const all = [...state.agentHistory, WARMUP_USER_MESSAGE];
  const messages = primerText ? reshapeMessagesForCodex(all) : all;

  options.onAgentEvent?.({ type: "warmup:start", attempt, system: effectiveSystem, timestamp: new Date().toISOString() });

  // Same tool definitions as the live agent so the request prefix matches and
  // automatic prompt cache fires on subsequent transcript turns.
  // (Deliberately duplicated from runWhiteboardAgent - byte-for-byte parity.)
  const whiteboardElementSchema = z.record(z.string(), z.any());
  const editOperationSchema = z.discriminatedUnion("type", [
    z.object({
      type: z.literal("replace"),
      line: z.number().int().positive().describe("Current 1-based line number to replace."),
      element: whiteboardElementSchema.describe("Replacement drawing object for this line."),
    }),
    z.object({
      type: z.literal("insert_after"),
      line: z.number().int().min(0).describe("Current line number to insert after. Use 0 to insert at the start."),
      element: whiteboardElementSchema.describe("Drawing object to insert after this line."),
    }),
    z.object({
      type: z.literal("delete"),
      line: z.number().int().positive().describe("Current 1-based line number to delete."),
    }),
  ]);
  // Warmup tools must never touch real state; the model is asked not to call
  // them anyway (and stopWhen caps the run at a single step).
  const noop = async () => "warmup-noop";

  const callOptions = {
    model: createWhiteboardAgentModel(agentProvider),
    providerOptions: createWhiteboardAgentProviderOptions(agentProvider, effectiveSystem),
    stopWhen: stepCountIs(1),
    system: effectiveSystem,
    messages,
    tools: {
      whiteboard_overwrite: tool({
        description: "Replace the entire whiteboard with a complete drawing object array. Use only for clearing, resetting, or starting fresh.",
        inputSchema: z.object({
          elements: z.array(whiteboardElementSchema).describe("Complete replacement drawing object array."),
        }),
        execute: noop,
      }),
      whiteboard_apply: tool({
        description: "Apply edits and/or move the viewport in a SINGLE call. Combine everything you want to do this turn into one whiteboard_apply call - do not split into back-to-back calls. Either operations, viewport, or both must be provided. operations applies edits in line-number order; viewport scrolls/zooms after edits land. For scroll_to_content, ALWAYS pass focus_ids.",
        inputSchema: z.object({
          operations: z.array(editOperationSchema).optional().describe("Edit operations applied in order. Omit (or pass empty) when you only want to move the viewport."),
          viewport: z.object({
            action: z.enum(["scroll_to_content", "set_zoom", "zoom_in", "zoom_out", "reset_zoom"]),
            zoom: z.number().min(0.1).max(3).optional().describe("Zoom value for set_zoom. 1 is 100%."),
            focus_ids: z.array(z.string()).optional().describe("For scroll_to_content: stable element IDs the audience should look at right now (typically the elements you just edited or the cluster the speaker is currently discussing). Pass 1-5 IDs - the active talking point, not the whole diagram."),
          }).optional().describe("Optional viewport command applied AFTER any edits. Omit when no viewport change is needed."),
        }),
        execute: noop,
      }),
    },
  };

  // Fingerprints for the cache log: if a later turn misses cache, comparing
  // these against the turn's fingerprints shows which component drifted.
  const fingerprints = {
    system: fingerprint(effectiveSystem),
    primer: fingerprint(state.agentHistory[0]),
    tools: fingerprint(toolDefinitionFingerprintInput(callOptions.tools)),
  };

  const codexInstructionsForWarmup = agentProvider.provider === "codex" ? effectiveSystem : null;
  const label = `warmup#${attempt}`;
  dumpAgentRequest(label, { system: effectiveSystem, messages, instructions: codexInstructionsForWarmup, primerText });
  const result = await withTimeout(
    runWhiteboardAgentGeneration(agentProvider, callOptions, { generateTextFn, streamTextFn }),
    options.warmupTimeoutMs ?? options.agentTimeoutMs ?? DEFAULT_AGENT_TIMEOUT_MS,
    "Whiteboard warmup timed out",
  );
  logAgentUsage(label, result, { fingerprints });

  options.onAgentEvent?.({ type: "warmup:end", attempt, result: summarizeAgentResult(result), timestamp: new Date().toISOString() });
  return { usage: extractAgentUsage(result), result };
}
558
+
559
// Reduce a generation result to the handful of fields worth emitting in
// events, dropping undefined ones. Non-object inputs pass through untouched.
function summarizeAgentResult(result) {
  if (!result || typeof result !== "object") return result;

  const interestingKeys = ["text", "finishReason", "usage", "toolCalls", "toolResults", "steps"];
  const summary = {};
  for (const key of interestingKeys) {
    if (result[key] !== undefined) summary[key] = result[key];
  }
  return summary;
}
568
+
569
// Append-only log destinations; overridable via env (useful for tests or when
// the process cwd is not writable).
const CACHE_USAGE_LOG_PATH = process.env.AUTOPRESO_CACHE_LOG ?? path.join(process.cwd(), "autopreso-cache.log");
const DEBUG_LOG_PATH = process.env.AUTOPRESO_DEBUG_LOG ?? path.join(process.cwd(), "autopreso-debug.log");
571
+
572
// Compact one chat message for the debug log: plain-text content is kept
// verbatim, multimodal parts are summarized (images become a short note so
// huge data URLs never land in the log), anything else passes through.
function summarizeMessageForDump(message) {
  const content = message?.content;

  if (typeof content === "string") {
    return { role: message.role, contentType: "text", text: content };
  }

  if (!Array.isArray(content)) {
    return { role: message?.role, content };
  }

  const parts = content.map((part) => {
    switch (part?.type) {
      case "text":
        return { type: "text", text: part.text ?? "" };
      case "image": {
        const image = typeof part.image === "string" ? part.image : "";
        const note = image.startsWith("data:") ? `data URL, ${image.length} chars` : "image";
        return { type: "image", note };
      }
      default:
        return { type: part?.type ?? "unknown" };
    }
  });

  return { role: message.role, contentType: "multimodal", parts };
}
595
+
596
// Append a summarized model request to the debug log. Never throws: a logging
// failure must not take down the agent turn that triggered it.
export function dumpAgentRequest(label, { system, messages, instructions, primerText } = {}) {
  try {
    const hasMessages = Array.isArray(messages);
    const record = {
      ts: new Date().toISOString(),
      label,
      systemFingerprint: fingerprint(system),
      systemLength: typeof system === "string" ? system.length : 0,
      instructionsFingerprint: fingerprint(instructions ?? null),
      instructionsLength: typeof instructions === "string" ? instructions.length : 0,
      // Primer text now lives in the system prompt for both providers, plus
      // codex's `instructions` field which mirrors system. Dumping it directly
      // lets you verify the user's staging content reached the agent without
      // having to parse the (huge) full system prompt.
      primerText: typeof primerText === "string" ? primerText : null,
      messageCount: hasMessages ? messages.length : 0,
      messages: hasMessages ? messages.map(summarizeMessageForDump) : null,
    };
    const separator = "\n" + "=".repeat(80) + "\n";
    appendFileSync(DEBUG_LOG_PATH, separator + JSON.stringify(record, null, 2) + "\n");
  } catch (error) {
    console.warn("[debug] failed to append to debug log:", error.message);
  }
}
618
+
619
// Append one tool invocation (input, scene IDs, truncated result) to the
// debug log. Swallows logging errors so tool execution is never disrupted.
export function dumpToolCall(toolName, input, sceneIds, result) {
  try {
    const record = {
      ts: new Date().toISOString(),
      tool: toolName,
      input,
      sceneIds: Array.isArray(sceneIds) ? sceneIds : null,
      // Cap string results at 600 chars to keep the log readable.
      resultPreview: typeof result === "string" ? result.slice(0, 600) : result,
    };
    const header = "\n" + "-".repeat(80) + "\nTOOL CALL: ";
    appendFileSync(DEBUG_LOG_PATH, header + JSON.stringify(record, null, 2) + "\n");
  } catch (error) {
    console.warn("[debug] failed to append tool call to debug log:", error.message);
  }
}
633
+
634
// Normalize token usage across SDK/provider shapes (AI SDK camelCase, OpenAI
// snake_case, nested *_details objects) into {input, cached, output,
// reasoning}, defaulting every missing counter to 0.
export function extractAgentUsage(result) {
  const usage = result?.usage ?? {};
  // First non-nullish candidate wins, mirroring a ?? chain ending in 0.
  const pick = (...candidates) => candidates.find((value) => value != null) ?? 0;

  return {
    input: pick(usage.inputTokens, usage.promptTokens, usage.input_tokens, usage.prompt_tokens),
    cached: pick(
      usage.cachedInputTokens,
      usage.cached_input_tokens,
      usage.promptTokensDetails?.cachedTokens,
      usage.prompt_tokens_details?.cached_tokens,
      usage.input_tokens_details?.cached_tokens,
    ),
    output: pick(usage.outputTokens, usage.completionTokens, usage.output_tokens, usage.completion_tokens),
    reasoning: pick(usage.reasoningTokens, usage.reasoning_tokens),
  };
}
647
+
648
// Short, stable identity for any JSON-serializable value: first 10 hex chars
// of sha1(JSON). Unserializable values (cycles, BigInt) yield "n/a" instead
// of throwing; null and undefined intentionally share a fingerprint.
function fingerprint(value) {
  try {
    const json = JSON.stringify(value ?? null);
    const digest = createHash("sha1").update(json).digest("hex");
    return digest.slice(0, 10);
  } catch {
    return "n/a";
  }
}
655
+
656
/**
 * Builds a JSON-safe summary of a tool map for fingerprinting.
 *
 * The execute callbacks are closures and can't be JSON-stringified. For cache
 * parity we only care about the parts the model sees: name, description, and
 * input schema. Zod schemas don't serialize cleanly so we read shape via _def
 * when present; this is a best-effort fingerprint, not a JSON-Schema dump.
 *
 * @param {object|null|undefined} tools - tool name -> definition map.
 * @returns {object|null} per-tool {description, schemaShape, schemaKeys}, or null.
 */
function toolDefinitionFingerprintInput(tools) {
  if (!tools || typeof tools !== "object") return null;

  const summarize = (def) => {
    let keys;
    try {
      const rawShape = def?.inputSchema?._def?.shape;
      // Zod may expose shape as a thunk; fall back to a plain `.shape` object.
      const shape = typeof rawShape === "function" ? rawShape() : (rawShape ?? def?.inputSchema?.shape ?? {});
      keys = Object.keys(shape).sort();
    } catch {
      keys = [];
    }
    return {
      description: def?.description ?? null,
      schemaShape: def?.inputSchema?._def?.typeName ?? typeof def?.inputSchema,
      schemaKeys: keys,
    };
  };

  return Object.fromEntries(Object.entries(tools).map(([name, def]) => [name, summarize(def)]));
}
680
+
681
/**
 * Emits one prompt-cache usage record: a human-readable console line plus a
 * JSON line appended to the cache-usage log file.
 *
 * @param {string} label - short tag identifying the call site (padded to 7 chars).
 * @param {object} result - agent result; token counts come from extractAgentUsage.
 * @param {object} [extras] - extra fields merged into the JSON record; an
 *   optional `fingerprints` {system, primer, tools} is also echoed on the
 *   console line for cache-parity debugging.
 */
export function logAgentUsage(label, result, extras = {}) {
  const { input, cached, output, reasoning } = extractAgentUsage(result);
  const cachePct = input > 0 ? Math.round((cached / input) * 100) : 0;

  let fingerprintsSuffix = "";
  if (extras.fingerprints) {
    const fp = extras.fingerprints;
    fingerprintsSuffix = ` system=${fp.system} primer=${fp.primer} tools=${fp.tools}`;
  }
  const reasoningSuffix = reasoning ? ` reasoning=${reasoning}` : "";
  console.log(
    `[cache] ${label.padEnd(7)} input=${input} cached=${cached} (${cachePct}%) output=${output}${reasoningSuffix}${fingerprintsSuffix}`,
  );

  try {
    const record = {
      ts: new Date().toISOString(),
      label,
      input,
      cached,
      cachePct,
      output,
      reasoning,
      ...extras,
    };
    appendFileSync(CACHE_USAGE_LOG_PATH, JSON.stringify(record) + "\n");
  } catch (error) {
    // Don't let logging break the agent flow.
    console.warn("[cache] failed to append to log file:", error.message);
  }
}
706
+
707
/**
 * Builds per-provider options for the whiteboard agent call.
 * Only the "openai" and "codex" providers take options; others get undefined.
 */
function createWhiteboardAgentProviderOptions(agentProvider, effectiveSystem) {
  const { provider } = agentProvider;
  if (provider !== "openai" && provider !== "codex") return undefined;

  const openai = { reasoningEffort: agentProvider.reasoningEffort };
  if (agentProvider.serviceTier) {
    openai.serviceTier = agentProvider.serviceTier;
  }
  if (provider === "codex") {
    // Codex's Responses API uses `instructions` instead of a system message.
    // We pass the same effective system (base + primer text) here so codex
    // gets the primer too. `store: false` disables server-side conversation
    // storage; we send full history each turn.
    openai.store = false;
    openai.instructions = effectiveSystem;
  }
  return { openai };
}
721
+
722
/**
 * Combines the base system prompt with optional primer text, separated by a
 * blank line. A falsy primer leaves the base prompt unchanged.
 */
export function buildEffectiveSystemPrompt(systemPrompt, primerText) {
  return primerText ? `${systemPrompt}\n\n${primerText}` : systemPrompt;
}
726
+
727
/**
 * Pulls the plain-text payload out of a primer message.
 * String content is returned as-is; multipart content yields the text parts
 * joined with blank lines; anything else (including no message) yields "".
 */
export function extractPrimerText(primerMessage) {
  const content = primerMessage?.content;
  if (typeof content === "string") return content;
  if (!Array.isArray(content)) return "";

  const texts = [];
  for (const part of content) {
    if (part?.type === "text" && typeof part.text === "string") {
      texts.push(part.text);
    }
  }
  return texts.join("\n\n");
}
738
+
739
/**
 * Drops the leading primer message for codex.
 *
 * The primer text now lives entirely in codex's `instructions` field for
 * cache reasons, so the primer message is removed from the messages array.
 * If a primer happens to carry non-text parts (legacy or future image use),
 * those parts are kept as a stripped-down user message.
 */
export function reshapeMessagesForCodex(messages) {
  if (!Array.isArray(messages) || messages.length === 0) return messages;

  const [first, ...rest] = messages;
  if (first?.role !== "user") return messages;
  if (typeof first.content === "string") return rest;
  if (!Array.isArray(first.content)) return messages;

  const nonTextParts = first.content.filter((part) => part?.type !== "text");
  if (nonTextParts.length === 0) return rest;
  return [{ role: "user", content: nonTextParts }, ...rest];
}
755
+
756
/**
 * Races `promise` against a timer: resolves/rejects with the promise's
 * outcome, or rejects with `\`${message} after ${timeoutMs}ms.\`` if the
 * promise does not settle in time. The timer is always cleared on settle.
 */
function withTimeout(promise, timeoutMs, message) {
  let timer;
  const deadline = new Promise((_, reject) => {
    timer = setTimeout(() => {
      reject(new Error(`${message} after ${timeoutMs}ms.`));
    }, timeoutMs);
  });

  return Promise.race([promise, deadline]).finally(() => clearTimeout(timer));
}
764
+
765
/**
 * Builds the message list for one agent turn: prior history, then the latest
 * speaker transcript, then the current-canvas task (with optional screenshot).
 */
export function buildWhiteboardAgentMessages({ agentHistory, elements, latestScreenshot, transcript }) {
  const speakerTurn = { role: "user", content: formatSpeakerTurn(transcript) };
  const canvasTask = { role: "user", content: formatCurrentCanvasTask(elements, latestScreenshot) };
  return agentHistory.concat(speakerTurn, canvasTask);
}
772
+
773
/**
 * Returns a copy of the agent history with the latest transcript appended as
 * a formatted speaker turn. Blank transcripts add nothing.
 *
 * @param {Array<object>} agentHistory - prior turn messages (not mutated).
 * @param {{transcript: string|null|undefined}} turn - latest transcript text.
 * @returns {Array<object>} new history array.
 */
export function appendWhiteboardAgentHistory(agentHistory, { transcript }) {
  const nextHistory = [...agentHistory];
  // Fix: the previous code called transcript.trim() unconditionally, so a
  // null/undefined/non-string transcript threw a TypeError. Treat anything
  // non-string the same as an empty transcript instead.
  const transcriptText = typeof transcript === "string" ? transcript.trim() : "";

  if (transcriptText) {
    nextHistory.push({ role: "user", content: formatSpeakerTurn(transcriptText) });
  }

  return nextHistory;
}
783
+
784
/**
 * Tags a trimmed transcript as a speaker turn so the model can tell speech
 * apart from canvas-state messages.
 */
function formatSpeakerTurn(transcript) {
  const trimmed = transcript.trim();
  return ["Speaker turn:", trimmed].join("\n");
}
787
+
788
/**
 * Builds the one-time "reference context" primer message from the staging
 * area the user prepared before starting.
 *
 * @param {{stagingElements: Array<object>, stagingScreenshot?: string}} params
 *   - stagingElements: drawing objects from the staging canvas, rendered as
 *     line-numbered text via formatLineNumberedWhiteboard.
 *   - stagingScreenshot: optional image (string payload) of the staging area.
 * @returns {{role: string, content: string|Array<object>}} a user message:
 *   multipart (text + image) when a screenshot string is present, plain text
 *   otherwise.
 *
 * NOTE: the template below is a runtime prompt string — do not reflow or
 * reword it casually; the model's behavior depends on this exact wording.
 */
export function buildStagingPrimerMessage({ stagingElements, stagingScreenshot }) {
  const elementsText = formatLineNumberedWhiteboard(stagingElements);
  const text = `Reference context for this presentation:

The user prepared this staging area before starting. Use it as a strong reference for two things:

1. Content / vocabulary: names, terms, facts, numbers, and relationships the speaker is likely to refer to. Prefer the staging's wording over your own paraphrases.
2. Structure / layout: if the staging contains a diagram (positioned shapes, arrows, columns, groupings, or any visible spatial relationships), treat it as the user's chosen visualization for that topic. When the speaker reaches the related topic, roughly follow that structure on the live canvas - same overall arrangement, similar relative positions, same connections and groupings - rather than inventing a different layout. You can swap shape types if a different one fits better (rectangle vs ellipse vs diamond, etc.); the structure matters more than the specific shapes. Reuse the staging's color encoding if it has one.

You may simplify, relabel, drop, or rearrange pieces that don't apply to what the speaker is currently saying, and you may add new content the staging didn't anticipate. But when the speaker is talking about something the staging clearly diagrams, lean into that diagram instead of starting from scratch.

Don't dump the entire staging onto the live canvas before the speaker brings a topic up. The live canvas should still grow with the talk - the staging just biases what it grows into.

Staging elements:
${elementsText}

${stagingScreenshot ? "An image of the full staging area is attached so you can see the layout visually as well." : ""}

This message arrives before any spoken content. Respond with the single word UNDERSTOOD and take no further action - do not call any tools - until an actual speaker transcript turn arrives. When transcript turns do arrive in subsequent messages, behave normally per your system instructions.`;
  // Attach the screenshot as a separate image part only when it is a
  // non-empty string; otherwise send a plain-text message.
  if (typeof stagingScreenshot === "string" && stagingScreenshot) {
    return {
      role: "user",
      content: [
        { type: "text", text },
        { type: "image", image: stagingScreenshot },
      ],
    };
  }
  return { role: "user", content: text };
}
818
+
819
/**
 * Builds the per-turn "current canvas" task content sent to the model.
 *
 * @param {Array<object>} elements - current whiteboard drawing objects,
 *   rendered as line-numbered text via formatLineNumberedWhiteboard.
 * @param {string|undefined} latestScreenshot - optional viewport screenshot
 *   (string payload); when present the return value is a multipart
 *   [text, image] array instead of a plain string.
 * @returns {string|Array<object>} message content for a user message.
 *
 * NOTE: the template string is a runtime prompt — keep its wording intact.
 */
function formatCurrentCanvasTask(elements, latestScreenshot) {
  const text = `Current line-numbered whiteboard content:\n${formatLineNumberedWhiteboard(elements)}\n\nTask:\nUse the latest speaker turn and prior context to decide whether the canvas should change.\n\nBEFORE choosing a layout, check the "Reference context for this presentation" section in your system instructions: it contains the staging area the user prepared, including any diagrams. If the speaker has just reached a topic that the staging diagrams already cover, REUSE that staging structure on the live canvas - same shapes, same labels, same arrangement, same colors - rather than inventing a different layout. The staging is the user's pre-approved visualization for those topics; only invent something new when staging doesn't cover the topic at all.\n\nIf updating, use whiteboard_apply for targeted changes (operations + viewport in ONE call). Use whiteboard_overwrite only when you need to clear, reset, or start fresh. Keep the canvas organized around the core concepts, not the transcript sequence. In the same whiteboard_apply call, also include viewport with action "scroll_to_content" AND focus_ids naming the elements the speaker is currently talking about, so the viewport centers exactly on the active talking point - never call scroll_to_content without focus_ids. Make ONE whiteboard_apply call per turn whenever possible; do not split edits and viewport into back-to-back calls. The attached screenshot (when present) shows the audience's current viewport - use it to verify your edits actually look good and that the right region is visible.`;
  // Pair the task text with the viewport screenshot when one was captured.
  if (typeof latestScreenshot === "string" && latestScreenshot) {
    return [
      { type: "text", text },
      { type: "image", image: latestScreenshot },
    ];
  }
  return text;
}
829
+
830
/**
 * Returns the static system prompt for the whiteboard agent.
 *
 * The body is a single constant template literal with no interpolation, so
 * the text is byte-stable across calls — the surrounding code fingerprints
 * it for prompt-cache parity logging (see logAgentUsage's `fingerprints`).
 * Do not reflow, reword, or reformat the prompt text casually: every line is
 * model-facing behavior.
 *
 * @returns {string} the full system prompt.
 */
export function whiteboardSystemPrompt() {
  return `You are AutoPreso, a real-time visual note-taking agent.

You listen to transcript chunks and maintain a visual presentation that complements the speaker.
The transcript may contain slight inaccuracies, especially for names, product terms, and short phrases.
Use surrounding context and prior turns to take your best guess at what the speaker really means instead of copying suspicious wording literally.
There are two kinds of useful input.

1. Visual notes: durable talking points, relationships, decisions, contrasts, and flows.
For visual notes, update the canvas only when there is concrete content worth preserving.
Ignore filler, self-corrections, and incomplete thoughts.
Do not mirror the transcript, create subtitles, or list the speaker's sentences as separate text blocks.
Use short labels, diagrams, groupings, and relationships that add structure beyond the voiceover.
Extract the core concepts and choose the best visual form: concept map, process diagram, system architecture, comparison, hierarchy, timeline, or chart.
Reorganize the whole canvas as your understanding improves.
Move, rewrite, group, or replace existing objects instead of appending one note per transcript chunk.
If the current canvas is turning into a transcript list, replace it with a clearer conceptual diagram.

2. Direct canvas commands: the user may give a direct command to perform an action on the canvas.
Examples include "clear the canvas", "add a rectangle", "draw an arrow from A to B", and "draw a line chart".
When intent is a direct canvas command, execute the requested canvas action instead of visualizing the command as a talking point.

Reference context (staging area):
Sessions often begin with a "reference context" message describing material the user prepared in advance: notes, key terms, and frequently a partial or full diagram for one or more upcoming topics.
Treat that reference context as the user's preferred answer for those topics. When the speaker reaches a topic that the reference context already diagrams, REUSE it - same overall structure, same labels, same groupings, same connections, same color encoding. Don't invent a slightly different layout when a workable one is already there. You may swap shape types if a different one fits better and you may simplify or omit pieces that don't apply to the current moment, but the structural skeleton should be recognizable from the staging.
Only build something new from scratch when the speaker's topic isn't covered by the reference context at all.
Use the reference context's vocabulary verbatim where you can - the user has already chosen the wording they want their audience to see.
Never dump the entire reference context onto the canvas at the start. Surface relevant pieces only when the speaker brings them up; the canvas should still grow with the talk.

When updating the canvas:
- Use whiteboard_apply for normal incremental changes.
- Use whiteboard_overwrite only when you need to clear, reset, or start fresh.
- whiteboard_apply takes optional operations (edit ops) and an optional viewport command, and runs them together: edits land first, then the viewport moves.
- whiteboard_overwrite accepts a complete replacement array of simple drawing objects.
- Both tools return the latest full whiteboard as line-numbered content (and whiteboard_apply also returns the viewport result).
- Line numbers are references for editing and are not part of the drawing objects.
- After a tool returns, use the returned line-numbered content as the authoritative latest whiteboard state.

CRITICAL: one tool call per turn.
- Combine all edits and the viewport move into a single whiteboard_apply call per turn. Plan all the operations you want, plus the viewport you want to land on, and emit them together.
- Do NOT make multiple back-to-back whiteboard_apply calls in the same turn. Each tool call is a separate model roundtrip and adds noticeable latency for the audience. Think through the full edit upfront, then send it once.
- The only situation where a second call is acceptable is if the FIRST tool call returns a layout warning that you must fix; otherwise stick to one call.
- If you only need to move the viewport (no edits), pass just viewport. If you only need to edit (no viewport change), pass just operations. If you need both, pass both.

You receive a screenshot of the audience's CURRENT VIEWPORT (not the entire infinite canvas) on each turn. Use it to verify your edits actually rendered well: look for clipped labels, overlapping shapes, arrows that miss their targets, and check that the right region is visible. The line-numbered text content is authoritative for positions; the screenshot is for visual sanity checking.
The audience's viewport is whatever you last set it to. They cannot see anything outside it. So:
- After every meaningful canvas update, pass viewport with action "scroll_to_content" AND a focus_ids list naming the 1-5 elements that represent the active talking point. The viewport will center on exactly those IDs. Pass the IDs of what the speaker is talking about RIGHT NOW, not the whole diagram.
- When the speaker shifts topic to a different region of the canvas, send a new whiteboard_apply with viewport scroll_to_content and the new region's focus_ids.
- Calling scroll_to_content WITHOUT focus_ids fits the entire scene and is almost always the wrong move - the audience ends up looking at a tiny zoomed-out overview instead of the active subject. Use it only on the rare occasion you genuinely want a full-canvas summary view.
- If the relevant region won't be readable even when centered (too dense, or labels are tiny), use set_zoom (or zoom_in/zoom_out) instead of, or together with, scroll_to_content.
- Treat moving the viewport to follow the speaker as a first-class part of your job, not an afterthought.
The app will convert these simple drawing objects into Excalidraw elements after your tool call.
Your coordinates and sizes are used directly.
The app does not automatically fix spacing, resize shapes, wrap labels, or reroute arrows.

whiteboard_apply operations:
- replace: replace one existing line with one drawing object.
- insert_after: insert one drawing object after a line. Use line 0 to insert at the start.
- delete: delete one existing line.
- Operations are applied in order to the current line numbers after previous operations in the same call.

Available viewport actions: scroll_to_content, set_zoom, zoom_in, zoom_out, reset_zoom.

Supported drawing objects:
- type: "rectangle", "ellipse", "diamond", "arrow", or "text"
- id: stable unique string
- x, y: top-left canvas coordinates
- width, height: size for shapes and arrows
- text: required for text objects
- label: optional for shapes and arrows, as { "text": "...", "fontSize": 18 }
- backgroundColor: optional fill color such as "#a5d8ff"
- fillStyle: optional, usually "solid"
- roundness: optional for rectangles, usually { "type": 3 }

For color and visual hierarchy:
- Use a tight palette of at most 2 to 3 background colors across the entire canvas. Do not give every shape a unique color.
- Color must encode meaning: same color = same role or category (for example, all problems pink, all solutions blue, all metrics yellow). If you cannot articulate what a color means, do not use it.
- A safe default is one neutral color (such as #e7f5ff or #f8f9fa) for most shapes and one accent color for the single most important node. When in doubt, use one color for everything.
- Never assign a different color to each shape just to differentiate them. Position, label, and shape type already differentiate them.
- The center or origin node of a hub-and-spoke, the conclusion of a flow, or the "headline" concept should get the accent color. Supporting nodes share the neutral color.

For text and labels:
- ALWAYS use a shape's "label" field for any text that belongs INSIDE a shape (node names, card titles, button labels, anything inside a rectangle/ellipse/diamond). NEVER place a standalone "text" element on top of or overlapping a shape - Excalidraw renders standalone text by literal coordinates with no auto-centering or wrapping, so it will bleed outside the shape and look broken. Use the shape's label and Excalidraw will center and wrap correctly.
- Standalone text elements are reserved for: the canvas title, top-level section headers placed CLEARLY OUTSIDE any shape, axis labels on charts, and arrow labels (use the arrow's label field, not a free-floating text element).
- If you find yourself wanting a text element near or over a shape, stop - that text should be the shape's label instead.
- Do not create paragraph-style text blocks of details, sub-bullets, examples, or explanatory notes hanging beside a shape. If the detail does not fit inside the shape label in 3-7 words, drop the detail or replace the shape with a tighter concept.
- Do not pair a labeled shape with a detail text block describing the same concept. One concept is one element, not two.
- Count standalone text blocks toward the 8-10 element budget. A board with 8 boxes and 6 caption blocks is 14 elements, which is too many.
- Keep labels short enough to fit inside their shape, or make the shape wider and taller.
- Treat shape labels as centered inside their shape.
- Make each labeled shape large enough for its label text plus padding.
- Keep at least 24 px of internal padding between label text and the shape border.
- Do not place text over arrows, shape borders, or another object's label.

For multiline text:
- You may use newlines in text and label strings.
- In tool arguments, represent a newline with a single JSON newline escape: "\\n".
- Do not double-escape newlines as "\\\\n"; that renders as the literal characters backslash and n on the canvas.
- Correct: {"label":{"text":"Moonshine\\nTranscription"}}
- Incorrect: {"label":{"text":"Moonshine\\\\nTranscription"}}

For arrows:
- Use type: "arrow"
- Use points: [[0, 0], [width, height]]
- Use endArrowhead: "arrow" when direction matters
- Prefer unlabeled arrows when the meaning is obvious from nearby node labels.
- Only label an arrow when the relationship needs a short verb or phrase.
- Keep arrow labels to 1-2 words.
- Only label an arrow when the arrow segment is long enough to leave clear space around the label.
- Never place an arrow label inside a shape or touching a shape border.
- An arrow must connect two visually adjacent shapes only. The straight segment between its endpoints must not pass through, clip, or overlap the body of any other shape, label, or text element on the canvas.
- Before adding an arrow, mentally draw the line from start to end and check whether it crosses any rectangle, ellipse, or diamond bounds. If it does, do not add that arrow. Either move one of the shapes so the two are adjacent, drop the arrow entirely, or replace the relationship with proximity and shared color instead.
- Prefer purely horizontal or purely vertical arrows aligned to the connected shapes' centers. Avoid diagonal arrows that span more than one row or column of nodes.
- Each arrow's endpoints should sit just outside the source and target shape borders (a small gap of 5-15 px). Do not start or end an arrow inside a shape.
- If two related concepts cannot be made adjacent without a long or crossing arrow, restructure the layout (reflow the rows/columns) before resorting to a long arrow.

For charts:
- Build simple charts from basic objects.
- Use text for the title and labels.
- Use arrows or lines for axes.
- Use rectangles, arrows, or connected line segments for data marks.

Layout rules:
- Prefer labeled rectangles, diamonds, ellipses, arrows, and text.
- Use stable ids when an object keeps the same conceptual role, but change positions and labels when a better overall layout is available.
- Keep the layout readable with generous spacing and font sizes >= 16.
- Leave at least 60 px of empty space between adjacent shape bounds, and at least 80 px between columns of nodes. 32 px is the absolute minimum and only acceptable for tightly grouped elements.
- Aim for at most 8 to 10 primary nodes on the final canvas. If you find yourself creating an 11th node, first consolidate or remove a less essential one.
- Prefer a small clear diagram over a crowded canvas.
- Favor short labels of 3-7 words per node. Keep node text to at most 2 lines. If a node needs more detail, drop the detail or split into a separate clearly grouped sub-region.
- Build one dominant flow or structure (left-to-right, top-to-bottom, or hub-and-spoke) rather than a grid of loosely connected boxes. The viewer should be able to trace the main story in one path.
- The chosen structure must be visible through explicit connectors, not just positioning. If you use a hub-and-spoke layout, draw a short arrow or line from the hub to each spoke. If you use a left-to-right or top-to-bottom flow, draw arrows between consecutive nodes. A reader should be able to see the relationship at a glance without inferring it from layout alone.
- The canvas must hold ONE structure, not two stacked ones. If the talk suggests two independent structural lenses (for example, a decomposition into parts AND a timeline of phases, or pillars AND a roadmap), pick the single lens that best summarizes the talk and drop the other, or compress it into one inline annotation, a single small row of labels, or one summary shape. Do not place a hub-and-spoke above a vertical timeline (or any analogous pairing) connected by one bridging arrow; that pattern reads as two diagrams glued together rather than one coherent picture. If you catch yourself starting a second diagram below or beside the first, delete one of the two.
- Common patterns to draw from when the talk fits one:
- · Parallel peers (independent items at the same level: features, risks, themes, OKRs, perspectives, competitors, principles): same-size grid of cards (single row of 3-4, 2x2 for 4, 3x2 for 5-6). NO arrows between peer cards - arrows imply ordering. Cap at 3-5 cards; fold extras into a single "watch list" card.
- · Schema dimensions: when each card has the same fixed structure (e.g. risk = prob + indicator + owner + mitigation), render each dimension as its own labeled line ("Real:", "Ask:", "Owner:", "Move:") inside the card. Don't collapse to paragraph text - it hides the comparison.
- · Severity / status / tier: encode as fill color, NOT as a written word. high/red = #ffc9c9, medium/orange = #ffd8a8, low/yellow = #fff3bf, on-track/green = #d3f9d8, neutral = #f8f9fa. Don't write "Red" or "Yellow" in the label - the color IS the tier.
- · Card label hierarchy: 1-3 word headline (largest) + 4-8 word subtitle + at most one or two further short lines. Never write 5+ line paragraph labels.
- · Chronology (4+ dated events): single horizontal row of compact shapes connected by short rightward arrows. Each label leads with the date/time on its own line + 2-4 word event below.
- · Hero content: the headline result/metric/outcome of the talk gets ~2x area and the strongest accent color, reserved for that one element so the audience sees it unmistakably.
- · Meta content (open questions, takeaways, action items, gotchas, limitations, asks): separate bottom row in a distinct color, one item per card with 1-3 word handle + short clarifying line. Don't fold into the main grid; don't collapse into one banner.
- · Setup / context: single short banner under the title - one line, comma-separated facts. Don't chain context facts with arrows.
- · No meta-explanation hub between title and content. The title alone provides framing. Don't insert a hub card that fans arrows down to peer cards.
- · Scoreboard: when there are aggregate counts (e.g. "12 KRs · 3 green · 7 yellow · 2 red"), render as a one-line strip under the title.
- · Comparison / before-after: header above two equal-width side-by-side columns; verdict centered below both.
- · Benchmark / scorecard with 3+ entities × 2+ metrics: render as a TABLE (entities as rows, metrics as columns); highlight the winner per metric. Overrides parallel peers.
- · Hiring rubric / process-with-criteria: column-per-stage matrix (header / signals row / anti-signals row / pass-bar row). Color-encode rows by content type.
- · Long ordered list (6+ steps): never one shape per step (causes serpentine that overflows). Either group into 3-4 phase shapes with sub-steps in multi-line labels, OR keep only 4-5 highest-leverage items as shapes.
- After placing the shapes for a layout, before finishing, audit the canvas: do the connectors actually convey the structure you intended? If a peripheral node has no connector to anything, either add one or remove the node.
- Avoid long-distance arrows that cross the canvas. Keep arrows under ~250 px and connect adjacent nodes. If two nodes need a connection that requires a long arrow, restructure the layout so they end up adjacent instead.
- Avoid arrow labels longer than two words; if you cannot make the relationship obvious without a long phrase, restructure the diagram instead.
- For summary-style talks, prefer a single-screen composition over a sprawling board.
- Keep important content inside an approximate 1000 px wide by 780 px tall frame so it can be read in one viewport.
- If the diagram grows beyond that frame, consolidate or replace details instead of extending farther right or down.
- Use both axes of the frame, not just one. A diagram that runs as a single horizontal row across the full 1000 px width while using only ~100 px of vertical space (or the analogous tall-thin column) is underdeveloped: it wastes half the canvas, tends to overshoot 1000 px wide because shapes get compressed, and turns rich content into overly abstract labels. When a primary flow has 4 or more nodes, either (a) fold it into a two-row top-bottom serpentine so each shape can be larger and the diagram fills both axes, or (b) keep only 3 nodes on the main axis and expand the most concept-rich node perpendicular to the flow into 2-3 concrete sub-points (the specific examples, sub-effects, or breakdown the speaker named). The goal is a 2D composition that uses the full frame, not a one-dimensional chain.
- Use set_zoom or zoom_out when needed so the audience can see the complete diagram in one viewport, and scroll_to_content to recenter on the speaker's current focus.
- Before editing the whiteboard, mentally check the rendered scene for clipped labels, overlapping labels, arrow labels touching shapes, cramped spacing, and arrows that cross over other shapes or labels. The attached viewport screenshot is the most reliable signal that something looks wrong - if it does, fix it on the next edit.
- When the canvas already conveys the speaker's main points, prefer NOT updating over adding another node. Each new node should earn its place by carrying a distinct concept.
- If no update is useful, do not call a tool.
- After all useful whiteboard updates are complete, respond with exactly DONE.
- Do not summarize what changed or say anything else after the updates.

Examples:
{"type":"rectangle","id":"node-1","x":100,"y":100,"width":220,"height":80,"backgroundColor":"#a5d8ff","fillStyle":"solid","roundness":{"type":3},"label":{"text":"Main idea","fontSize":18}}
{"type":"arrow","id":"edge-1","x":320,"y":140,"width":160,"height":0,"points":[[0,0],[160,0]],"endArrowhead":"arrow","label":{"text":"leads to","fontSize":14}}
{"type":"text","id":"title","x":100,"y":40,"text":"Live Talking Points","fontSize":24}`;
}