@runuai/host 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +91 -0
  3. package/bin/uai-host.mjs +14 -0
  4. package/db/migrations/0000_host_tasks.sql +12 -0
  5. package/db/migrations/0001_host_ui.sql +11 -0
  6. package/db/migrations/0002_host_github_tokens.sql +8 -0
  7. package/db/migrations/0003_host_ssh_keys.sql +8 -0
  8. package/db/migrations/0004_host_owner_name.sql +1 -0
  9. package/db/migrations/meta/_journal.json +41 -0
  10. package/db/schema.ts +82 -0
  11. package/images/standard/Dockerfile +232 -0
  12. package/images/standard/README.md +122 -0
  13. package/images/standard/container/code-server-settings.json +36 -0
  14. package/images/standard/container/uai-init +215 -0
  15. package/images/standard/tool-versions +2 -0
  16. package/lib/agent.ts +292 -0
  17. package/lib/agents/claude.ts +343 -0
  18. package/lib/agents/codex.ts +522 -0
  19. package/lib/agents/factory.ts +34 -0
  20. package/lib/agents/mock.ts +133 -0
  21. package/lib/agents/proc.ts +172 -0
  22. package/lib/agents/registry.ts +109 -0
  23. package/lib/agents/types.ts +133 -0
  24. package/lib/attachments.ts +46 -0
  25. package/lib/cloud-state.ts +56 -0
  26. package/lib/command-db.ts +278 -0
  27. package/lib/db.ts +68 -0
  28. package/lib/env.ts +140 -0
  29. package/lib/git-diff.ts +370 -0
  30. package/lib/git-identity.ts +65 -0
  31. package/lib/github-tokens.ts +321 -0
  32. package/lib/orchestrator.ts +975 -0
  33. package/lib/preview-ports.ts +85 -0
  34. package/lib/repo-clone.ts +127 -0
  35. package/lib/runtime-state.ts +120 -0
  36. package/lib/secrets.ts +71 -0
  37. package/lib/ssh.ts +186 -0
  38. package/lib/standard-image.ts +152 -0
  39. package/lib/task-diff.ts +113 -0
  40. package/lib/task-status.ts +46 -0
  41. package/lib/transcript.ts +30 -0
  42. package/lib/ulid.ts +7 -0
  43. package/package.json +85 -0
  44. package/scripts/agent/_common.sh +248 -0
  45. package/scripts/agent/task-down.sh +113 -0
  46. package/scripts/agent/task-status.sh +54 -0
  47. package/scripts/agent/task-up.sh +457 -0
  48. package/scripts/install/darwin.ts +167 -0
  49. package/scripts/install/linux.ts +115 -0
  50. package/scripts/install/types.ts +35 -0
  51. package/scripts/install/util.ts +39 -0
  52. package/scripts/install/win.ts +130 -0
  53. package/src/cli.ts +445 -0
  54. package/src/index.ts +375 -0
  55. package/src/load-env.ts +52 -0
  56. package/src/main.ts +1156 -0
  57. package/src/paths.ts +64 -0
  58. package/src/protocol.ts +413 -0
  59. package/src/ui/server.ts +343 -0
  60. package/src/ui/types.ts +78 -0
  61. package/ui/app.js +264 -0
  62. package/ui/index.html +55 -0
  63. package/ui/style.css +359 -0
  64. package/ui/uai-logo-black.svg +9 -0
@@ -0,0 +1,975 @@
1
+ /**
2
+ * Orchestrator — the chat channel runtime.
3
+ *
4
+ * One **channel** per task. A channel holds an `AgentSession` per roster
5
+ * agent and emits protocol events for the cloud side to persist, stream,
6
+ * and route. It is the in-process stand-in for what becomes the host
7
+ * agent process in the hosted product (docs/hosted-architecture.md).
8
+ *
9
+ * Responsibilities:
10
+ * - Lazily build a channel: load the project roster, spawn sessions.
11
+ * - Deliver cloud-routed messages into live agent sessions.
12
+ * - Emit typed HostEvents for agent output, tools, permissions, exits.
13
+ *
14
+ * Module singleton, stashed on globalThis so Next.js HMR reuses it
15
+ * instead of leaking a second orchestrator.
16
+ */
17
+
18
+ import { spawnSync } from "node:child_process";
19
+ import { existsSync, readFileSync } from "node:fs";
20
+ import { homedir } from "node:os";
21
+ import { join } from "node:path";
22
+
23
+ import { inArray } from "drizzle-orm";
24
+
25
+ import { getDb, schema } from "./db";
26
+ import { mockAgentFactory } from "./agents/mock";
27
+ import { realAgentFactory } from "./agents/factory";
28
+ import {
29
+ type AgentEvent,
30
+ type AgentSession,
31
+ type AgentSessionFactory,
32
+ type Roster,
33
+ type RosterAgent,
34
+ } from "./agents/types";
35
+ import { ACTIVE_STATUSES } from "./task-status";
36
+ import { getHostTask, upsertHostTask } from "./runtime-state";
37
+ import { setupTaskGithub } from "./github-tokens";
38
+ import { setupTaskGitIdentity } from "./git-identity";
39
+ import type { ChannelEnsureInput, HostEvent } from "../src/protocol";
40
+
41
/** Callback signature for listeners that receive each `HostEvent` the orchestrator emits. */
export type HostEventSubscriber = (event: HostEvent) => void;
42
+
43
+ // ---------------------------------------------------------------------------
44
+ // Channel — one task's live conversation.
45
+ // ---------------------------------------------------------------------------
46
+
47
/**
 * In-memory state for one task's live conversation: who is in it, the
 * inputs needed to spawn their sessions, and the sessions themselves.
 */
interface Channel {
  // -- identity -------------------------------------------------------------

  /** Id of the task this channel belongs to. */
  taskId: string;
  /** The agents taking part in this channel. */
  roster: Roster;

  // -- session-spawn inputs (kept so sessions can be started lazily) ---------

  /** Name of the task container handed to the session factory. */
  containerName: string;
  /** System preamble per agent id (channel briefing + assembled persona). */
  preambles: Map<string, string>;
  /**
   * First-turn message per agent id. Only agents with a non-empty
   * `initialPrompt` have an entry; it is delivered once, when sessions start.
   */
  firstTurns: Map<string, string>;

  // -- live state -----------------------------------------------------------

  /** Live agent sessions, keyed by agent id. */
  sessions: Map<string, AgentSession>;
  /** Set once session startup has been kicked off for this channel. */
  sessionsStarted: boolean;
  /**
   * Automatic respawn count per agent id. Bounded so a broken agent cannot
   * loop forever rewriting its config.
   */
  respawns: Map<string, number>;
}
64
+
65
/**
 * Upper bound on automatic respawns of a single agent over the lifetime
 * of one channel.
 */
const MAX_RESPAWNS_PER_AGENT = 5;
67
+
68
/**
 * Patterns that, when found in an agent's error output, are taken to mean
 * "the Claude config file was unlinked between runs". A match triggers the
 * repair-and-respawn path, which covers the common Docker Desktop macOS
 * race where Claude's atomic writes briefly leave `.claude.json` missing.
 */
const CLAUDE_CONFIG_MISSING_PATTERNS = [
  // The CLI's own wording for the failure.
  /Claude configuration file not found/i,
  // Any message that names the config file path.
  /\/\.claude\.json/i,
];
76
+
77
/**
 * The chat-channel runtime. Holds one `Channel` per task, spawns an
 * `AgentSession` per roster agent through the injected factory, delivers
 * inbound messages into those sessions, and re-emits what the sessions
 * produce as `HostEvent`s to every registered subscriber.
 */
class Orchestrator {
  /** Live channels, keyed by task id. */
  private readonly channels = new Map<string, Channel>();
  /** Specs supplied through `registerChannelSpec`, keyed by task id. */
  private readonly channelSpecs = new Map<string, ChannelEnsureInput>();
  private readonly hostSubscribers = new Set<HostEventSubscriber>();
  /**
   * Session startups currently in flight. Concurrent callers of
   * `ensureSessions` share the same promise, so nobody is told "ready"
   * while the session map is still being filled.
   */
  private readonly sessionStartups = new Map<Channel, Promise<boolean>>();

  constructor(private readonly factory: AgentSessionFactory) {}

  // -- subscriptions --------------------------------------------------------

  /**
   * Register a listener for every emitted host event.
   *
   * @returns a function that removes the listener again.
   */
  subscribeHostEvents(fn: HostEventSubscriber): () => void {
    this.hostSubscribers.add(fn);
    return () => this.hostSubscribers.delete(fn);
  }

  /** Emit a host-originated system note into a task's channel (ADR-027). */
  emitSystemNote(taskId: string, text: string): void {
    this.emitHost({ kind: "system.note", taskId, text });
  }

  /** Fan an event out to all subscribers. Never throws. */
  private emitHost(event: HostEvent): void {
    for (const fn of this.hostSubscribers) {
      try {
        fn(event);
      } catch (err) {
        // A broken cloud subscriber must not take the channel down — but
        // leave a trace so the breakage can be diagnosed.
        console.error(
          `[orchestrator] host event subscriber threw on "${event.kind}": ${Orchestrator.describeError(err)}`,
        );
      }
    }
  }

  /** Render an unknown thrown value as a log-friendly string. */
  private static describeError(err: unknown): string {
    return err instanceof Error ? err.message : String(err);
  }

  /**
   * Run a fire-and-forget action without ever producing an unhandled
   * rejection. `action` is invoked synchronously; if it throws or its
   * promise rejects, the failure is logged and surfaced as a system note
   * in the task's channel.
   */
  private async runDetached(
    taskId: string,
    what: string,
    action: () => unknown,
  ): Promise<void> {
    try {
      await action();
    } catch (err) {
      const reason = Orchestrator.describeError(err);
      console.error(`[orchestrator] ${what} failed (task=${taskId}): ${reason}`);
      this.emitSystemNote(taskId, `${what} failed: ${reason}`);
    }
  }

  // -- channel lifecycle ----------------------------------------------------

  /** Remember the spec a channel for `spec.taskId` should be built from. */
  registerChannelSpec(spec: ChannelEnsureInput): void {
    this.channelSpecs.set(spec.taskId, spec);
  }

  /**
   * Return the task's channel, building it from its registered spec on
   * first use. Resolves to `null` when no spec has been registered.
   * Building a channel only prepares preambles and first-turn prompts; it
   * does not spawn any session.
   */
  private async getOrCreateChannel(taskId: string): Promise<Channel | null> {
    const existing = this.channels.get(taskId);
    if (existing) return existing;

    const spec = this.channelSpecs.get(taskId);
    if (!spec) return null;

    const roster = spec.agents;
    const preambles = new Map<string, string>();
    const firstTurns = new Map<string, string>();
    for (const agent of roster) {
      preambles.set(
        agent.id,
        buildSystemPreamble(
          roster,
          agent,
          spec.projects,
          spec.globalContext,
          spec.workspacePath,
          spec.branch,
        ),
      );
      // ADR-022: any agent with a non-empty initialPrompt opens a first
      // turn at container-ready. The rest stay silent until addressed.
      const firstTurn = assembleFirstTurnPrompt(
        spec.projects,
        spec.globalContext,
        agent,
      );
      if (firstTurn !== null) firstTurns.set(agent.id, firstTurn);
    }

    const channel: Channel = {
      taskId,
      roster,
      sessions: new Map(),
      containerName: `task-${taskId.toLowerCase()}-app-1`,
      preambles,
      firstTurns,
      sessionsStarted: false,
      respawns: new Map(),
    };
    this.channels.set(taskId, channel);
    return channel;
  }

  /**
   * Spawn the channel's agent sessions, lazily and at most once at a time.
   *
   * The real adapters `docker exec` into the task container, which only
   * exists after `task-up` has run `docker compose up`. So sessions are
   * gated on the task being `running` — calling this earlier is a no-op
   * that resolves to `false`.
   *
   * Callers that arrive while a startup is in flight wait for that same
   * startup instead of being told the sessions are ready prematurely. If
   * the startup fails, the promise rejects for every waiter and the
   * channel is rolled back so a later call can try again.
   *
   * @returns whether sessions are ready.
   */
  private async ensureSessions(channel: Channel): Promise<boolean> {
    const inFlight = this.sessionStartups.get(channel);
    if (inFlight) return inFlight;
    if (channel.sessionsStarted) return true;

    const task = getHostTask(channel.taskId);
    if (!task || task.statusMirror !== "running") return false;

    // Flag + in-flight promise are both set before the first await, so a
    // concurrent send can neither double-spawn nor observe a half-built
    // session map.
    channel.sessionsStarted = true;
    const startup = this.startSessions(channel, task).finally(() => {
      this.sessionStartups.delete(channel);
    });
    this.sessionStartups.set(channel, startup);
    return startup;
  }

  /**
   * Create one session per roster agent, wire up its event handler, then
   * deliver first-turn prompts. On any failure every session created so
   * far is closed and `sessionsStarted` is reset before rethrowing.
   */
  private async startSessions(
    channel: Channel,
    task: NonNullable<ReturnType<typeof getHostTask>>,
  ): Promise<boolean> {
    try {
      // Set the task creator's git author identity in the container
      // (ADR-029). The SSH key itself is installed earlier by task-up.sh
      // (host clone + container), using the creator's per-user key.
      setupTaskGitIdentity(channel.taskId, task.ownerName, task.ownerEmail);

      for (const agent of channel.roster) {
        const session = await this.factory.create({
          taskId: channel.taskId,
          agent,
          containerName: channel.containerName,
          systemPreamble: channel.preambles.get(agent.id) ?? "",
        });
        channel.sessions.set(agent.id, session);
        session.onEvent((event) => {
          this.dispatchAgentEvent(channel, agent.id, event);
        });
      }
    } catch (err) {
      // Roll back so the channel is not stuck "started" with a partial
      // (or empty) session map.
      await this.closeSessions(channel);
      channel.sessionsStarted = false;
      throw err;
    }

    // Container is ready: deliver each agent's first-turn prompt (ADR-022).
    // Only agents whose `initialPrompt` was non-empty have an entry.
    for (const agent of channel.roster) {
      const firstTurn = channel.firstTurns.get(agent.id);
      if (firstTurn === undefined) continue;
      const session = channel.sessions.get(agent.id);
      if (!session) continue;
      void this.runDetached(
        channel.taskId,
        `Delivering the first turn to @${agent.id}`,
        () => session.send(firstTurn),
      );
    }
    return true;
  }

  /**
   * Close every session of a channel, one after another, and empty the
   * session map. A `close()` that throws is logged and does not prevent
   * the remaining sessions from being closed.
   */
  private async closeSessions(channel: Channel): Promise<void> {
    for (const [agentId, session] of channel.sessions) {
      try {
        await session.close();
      } catch (err) {
        console.error(
          `[orchestrator] closing session failed (task=${channel.taskId}, agent=${agentId}): ${Orchestrator.describeError(err)}`,
        );
      }
    }
    channel.sessions.clear();
  }

  /**
   * Ensure the channel exists and, if the task is `running`, its agent
   * sessions are spawned. Idempotent — the task detail page's poll
   * calls this so agents auto-start the moment the task is running,
   * without the human having to send a message first.
   */
  async ensureStarted(taskId: string): Promise<void> {
    const channel = await this.getOrCreateChannel(taskId);
    if (channel) await this.ensureSessions(channel);
  }

  // -- inbound: a human (or routed peer) message ----------------------------

  /**
   * Deliver a cloud-routed message into one live agent session.
   *
   * The send itself is fire-and-forget: `{ ok: true }` means the message
   * was handed to the session, not that the agent finished processing it.
   * A send that fails afterwards is reported as a system note.
   */
  async deliver(
    taskId: string,
    agentId: string,
    text: string,
  ): Promise<{ ok: true } | { ok: false; error: string }> {
    // Slash command: re-run GitHub auth setup for this task (ADR-027). Handled
    // before sessions so it works even when an agent isn't ready.
    if (text.trim() === "/retry-gh") {
      void this.runDetached(taskId, "gh: authentication retry", () =>
        this.handleRetryGh(taskId),
      );
      return { ok: true };
    }

    const channel = await this.getOrCreateChannel(taskId);
    if (!channel) return { ok: false, error: "task or project not found" };

    const ready = await this.ensureSessions(channel);
    if (!ready) {
      return { ok: false, error: "task is not running" };
    }

    const session = channel.sessions.get(agentId);
    if (!session) return { ok: false, error: `no such agent: ${agentId}` };

    const rewritten = rewriteAttachmentRefs(text);
    void this.runDetached(taskId, `Delivering a message to @${agentId}`, () =>
      session.send(rewritten),
    );
    return { ok: true };
  }

  /** Interrupt an agent's current turn (ESC). Returns an error result when
   *  the task/agent has no live session (nothing to stop). */
  async interrupt(
    taskId: string,
    agentId: string,
  ): Promise<{ ok: true } | { ok: false; error: string }> {
    const session = this.channels.get(taskId)?.sessions.get(agentId);
    if (!session) return { ok: false, error: "no active session" };
    void this.runDetached(taskId, `Interrupting @${agentId}`, () =>
      session.interrupt(),
    );
    this.emitSystemNote(taskId, `Stopped @${agentId}.`);
    return { ok: true };
  }

  /** `/retry-gh` handler: re-mint + inject the task owner's GitHub token. */
  private async handleRetryGh(taskId: string): Promise<void> {
    const owner = getHostTask(taskId)?.ownerUserId;
    if (!owner) {
      this.emitSystemNote(taskId, "gh: no owner recorded for this task.");
      return;
    }
    this.emitSystemNote(taskId, "gh: retrying authentication…");
    const ok = await setupTaskGithub(taskId, owner);
    this.emitSystemNote(
      taskId,
      ok
        ? "gh: authentication restored."
        : "gh: still not connected — reconnect GitHub on Account, then try again.",
    );
  }

  // -- outbound: events from an agent session -------------------------------

  /**
   * Session event callback. Runs `handleAgentEvent` detached; if handling
   * fails (e.g. the respawn during recovery throws), the failure is logged
   * and reported as an `agent.exit` instead of becoming an unhandled
   * rejection.
   */
  private dispatchAgentEvent(
    channel: Channel,
    agentId: string,
    event: AgentEvent,
  ): void {
    this.handleAgentEvent(channel, agentId, event).catch((err: unknown) => {
      const reason = Orchestrator.describeError(err);
      console.error(
        `[orchestrator] handling agent event failed (task=${channel.taskId}, agent=${agentId}): ${reason}`,
      );
      this.emitHost({
        kind: "agent.exit",
        taskId: channel.taskId,
        agentId,
        reason,
      });
    });
  }

  /** Translate one `AgentEvent` into the matching `HostEvent`(s). */
  private async handleAgentEvent(
    channel: Channel,
    agentId: string,
    event: AgentEvent,
  ): Promise<void> {
    switch (event.type) {
      case "message_delta": {
        this.emitHost({
          kind: "agent.message_delta",
          taskId: channel.taskId,
          agentId,
          chunk: event.text,
        });
        break;
      }
      case "message_complete": {
        this.emitHost({
          kind: "agent.message_complete",
          taskId: channel.taskId,
          agentId,
          fullText: event.text,
          mentions: parseMentions(event.text, channel.roster),
        });
        break;
      }
      case "tool_call": {
        this.emitHost({
          kind: "agent.tool_call",
          taskId: channel.taskId,
          agentId,
          tool: event.title,
          meta: { detail: event.detail, toolId: event.id },
        });
        break;
      }
      case "permission_request": {
        this.emitHost({
          kind: "agent.permission_request",
          taskId: channel.taskId,
          agentId,
          requestId: event.id,
          meta: {
            title: event.title,
            detail: event.detail,
            requestId: event.id,
          },
        });
        break;
      }
      case "peer_message": {
        // A peer message is surfaced as a completed message addressed to
        // the target agent.
        const text = `@${event.toAgentId} ${event.text}`;
        this.emitHost({
          kind: "agent.message_complete",
          taskId: channel.taskId,
          agentId,
          fullText: text,
          mentions: [event.toAgentId],
        });
        break;
      }
      case "error": {
        // Claude under load (especially Docker Desktop macOS) occasionally
        // unlinks ~/.claude.json mid-write during atomic config rewrites,
        // and a concurrent claude spawn lands during the gap and exits 0
        // with "Claude configuration file not found". When we recognise
        // that pattern, repair the in-container config + respawn the
        // session so the user keeps working instead of staring at a dead
        // chat.
        const agent = channel.roster.find((a) => a.id === agentId);
        const isClaude = agent?.kind === "claude";
        const configMissing = CLAUDE_CONFIG_MISSING_PATTERNS.some((re) =>
          re.test(event.message),
        );
        if (isClaude && configMissing) {
          await this.recoverClaudeAgent(channel, agentId);
          break;
        }
        this.emitHost({
          kind: "agent.exit",
          taskId: channel.taskId,
          agentId,
          reason: event.message,
        });
        break;
      }
      case "turn_complete":
      case "exit":
        // Nothing is forwarded for these.
        break;
    }
  }

  // -- session recovery -----------------------------------------------------

  /**
   * Re-copy the host's `~/.claude.json` into the task container and
   * respawn the agent's session. Bounded by MAX_RESPAWNS_PER_AGENT so
   * a permanently-broken setup doesn't loop forever. The outcome (restart
   * or giving up) is reported through an `agent.exit` event.
   */
  private async recoverClaudeAgent(
    channel: Channel,
    agentId: string,
  ): Promise<void> {
    const tries = (channel.respawns.get(agentId) ?? 0) + 1;
    channel.respawns.set(agentId, tries);

    if (tries > MAX_RESPAWNS_PER_AGENT) {
      this.emitHost({
        kind: "agent.exit",
        taskId: channel.taskId,
        agentId,
        reason:
          `Claude exited with "configuration file not found" ${tries - 1} ` +
          "times in a row. Giving up automatic recovery — recreate the " +
          "task to start fresh.",
      });
      return;
    }

    const restored = repairClaudeConfigInContainer(channel.containerName);

    // Tear the dead session down and build a fresh one in its place.
    const old = channel.sessions.get(agentId);
    if (old) {
      try {
        await old.close();
      } catch {
        // Already exited — close is idempotent for our adapters.
      }
    }

    const agent = channel.roster.find((a) => a.id === agentId);
    if (!agent) return;
    const session = await this.factory.create({
      taskId: channel.taskId,
      agent,
      containerName: channel.containerName,
      systemPreamble: channel.preambles.get(agentId) ?? "",
    });
    channel.sessions.set(agentId, session);
    session.onEvent((event) => {
      this.dispatchAgentEvent(channel, agentId, event);
    });

    const message = restored
      ? `${agentId} restarted — config file was missing in the container, restored from the host.`
      : `${agentId} restarted — config file was missing in the container (host copy not found; spawned anyway).`;
    this.emitHost({
      kind: "agent.exit",
      taskId: channel.taskId,
      agentId,
      reason: message,
    });
  }

  // -- permission resolution ------------------------------------------------

  /**
   * Forward a human's accept/decline decision to the agent session that
   * asked for it.
   *
   * @returns `false` when the task has no channel or the agent no session.
   */
  async resolvePermission(
    taskId: string,
    agentId: string,
    requestId: string,
    decision: "accept" | "decline",
  ): Promise<boolean> {
    const channel = this.channels.get(taskId);
    const session = channel?.sessions.get(agentId);
    if (!channel || !session) return false;
    await session.resolvePermission(requestId, decision);
    return true;
  }

  /**
   * Tear a channel down (task killed). Every session gets a `close()`
   * attempt and the channel is always removed, even when individual
   * sessions fail to close.
   */
  async closeChannel(taskId: string): Promise<void> {
    const ch = this.channels.get(taskId);
    if (!ch) return;
    await this.closeSessions(ch);
    this.channels.delete(taskId);
  }
}
460
+
461
+ // ---------------------------------------------------------------------------
462
+ // `@mention` addressing.
463
+ // ---------------------------------------------------------------------------
464
+
465
/**
 * Collect the ids of roster agents that `text` explicitly `@mentions`.
 *
 * Ids come back in order of first appearance with duplicates removed;
 * the result is empty when nobody on the roster is mentioned. One
 * optional space between `@` and the id is accepted (`@ codex`) because
 * agents sometimes emit that form.
 */
export function parseMentions(text: string, roster: Roster): string[] {
  const rosterIds = new Set(roster.map((agent) => agent.id));
  // A Set keeps insertion order, which gives first-appearance ordering
  // and de-duplication in one go.
  const hits = new Set<string>();
  for (const match of text.matchAll(/@ ?([A-Za-z0-9_-]+)/g)) {
    const candidate = match[1];
    if (candidate && rosterIds.has(candidate)) hits.add(candidate);
  }
  return [...hits];
}
482
+
483
/**
 * Rewrite cloud attachment URLs (`…/api/tasks/<id>/uploads/<file>`) to the
 * in-container workspace path so the agent reads the file directly with its own
 * tools (ADR-015 attachments live in the workspace, mounted at /workspace). The
 * cloud URL would be Clerk-gated and unreachable from inside the container.
 *
 * Both absolute (`https://host/api/tasks/…`) and host-relative
 * (`/api/tasks/…`) references are rewritten. The captured file name must
 * end in a letter, digit, `_` or `-`, so punctuation that merely ends a
 * sentence (`…/uploads/shot.png.`) stays outside the rewritten path, and
 * the pseudo-names `.` / `..` are never rewritten.
 */
export function rewriteAttachmentRefs(text: string): string {
  return text.replace(
    /(?:https?:\/\/[^/\s)\]]+)?\/api\/tasks\/[^/\s)\]]+\/uploads\/([A-Za-z0-9._-]*[A-Za-z0-9_-])/g,
    (_m, filename: string) => `/workspace/.uai/attachments/${filename}`,
  );
}

/**
 * Resolve who an inbound human message is addressed to: the explicit
 * `@mentions`, else `fallbackAgentId` (only when that id is on the
 * roster), else the first roster agent. Returns `[]` for an empty roster
 * with no usable fallback.
 */
export function resolveAddressing(
  text: string,
  roster: Roster,
  fallbackAgentId?: string,
): string[] {
  const mentioned = parseMentions(text, roster);
  if (mentioned.length > 0) return mentioned;
  if (fallbackAgentId && roster.some((a) => a.id === fallbackAgentId)) {
    return [fallbackAgentId];
  }
  const first = roster[0];
  return first ? [first.id] : [];
}
513
+
514
/**
 * Re-copy the host's `~/.claude.json` into the task container at
 * `/home/node/.claude.json`. Used when an agent exited with
 * "configuration file not found" — Claude under Docker Desktop macOS
 * sometimes unlinks its own config mid-write, and a concurrent spawn
 * lands in the gap.
 *
 * The host file is looked up under `UAI_OWNER_HOME` when that variable is
 * set and non-empty, otherwise under `os.homedir()`.
 *
 * NOTE(review): no `chown` is performed here; the recreated file belongs
 * to whichever user `docker exec` runs as in the container. Confirm that
 * this is `node`, which is what claude needs.
 *
 * @param containerName name of the running task container.
 * @returns true when the write inside the container exited with status 0;
 *   false when the host file is missing/unreadable or the write failed
 *   (each failure is logged to stderr).
 */
function repairClaudeConfigInContainer(containerName: string): boolean {
  // Upper bound for the blocking `docker exec`; without it a wedged
  // Docker daemon would freeze the whole event loop.
  const writeTimeoutMs = 30000;

  // Operator can override the source path via UAI_OWNER_HOME so the
  // server doesn't have to guess. `||` (not `??`) on purpose: an empty
  // value falls back to the real home directory.
  const home = process.env.UAI_OWNER_HOME || homedir();
  const hostConfig = join(home, ".claude.json");
  if (!existsSync(hostConfig)) {
    console.error(
      `[orchestrator] repair: host .claude.json not found at ${hostConfig} ` +
        `(homedir=${homedir()}, UAI_OWNER_HOME=${
          process.env.UAI_OWNER_HOME ?? "<unset>"
        })`,
    );
    return false;
  }

  let content: Buffer;
  try {
    content = readFileSync(hostConfig);
  } catch (err) {
    console.error(
      `[orchestrator] repair: read host .claude.json failed: ${
        err instanceof Error ? err.message : String(err)
      }`,
    );
    return false;
  }

  // Why not `docker cp`: it does unlink + create on the destination,
  // and when a stuck claude process still has the in-container file
  // open we get `Error response from daemon: unlinkat …: device or
  // resource busy` and the new bytes never land.
  //
  // Instead the host file's bytes are piped into a shell inside the
  // container. `rm -f` first clears the existing dentry: Docker Desktop
  // macOS occasionally ends up with a stale one where `ls -la` shows
  // `-????????? ?` and `open(O_CREAT)` fails with "Directory
  // nonexistent" — `rm -f` punches through it (or no-ops on a genuinely
  // missing file). `cat >` then creates the file afresh from stdin. The
  // `&&` keeps `cat` from running when the `rm` failed.
  const write = spawnSync(
    "docker",
    [
      "exec",
      "-i",
      containerName,
      "sh",
      "-c",
      "rm -f /home/node/.claude.json && cat > /home/node/.claude.json",
    ],
    { input: content, encoding: "buffer", timeout: writeTimeoutMs },
  );
  if (write.status !== 0) {
    const stderr =
      write.stderr instanceof Buffer
        ? write.stderr.toString("utf8")
        : String(write.stderr ?? "");
    // `status` is null when docker could not be spawned or was killed
    // (e.g. by the timeout); stderr is empty then, so include the spawn
    // error / signal to keep the log actionable.
    const details = [
      stderr.trim(),
      write.error ? `spawn error: ${write.error.message}` : "",
      write.signal ? `killed by ${write.signal}` : "",
    ]
      .filter((part) => part.length > 0)
      .join("; ");
    console.error(
      `[orchestrator] repair: in-place write to .claude.json failed: ${details}`,
    );
    return false;
  }
  return true;
}
593
+
594
/** One project entry of a channel spec: a repo plus its prompt. */
type ChannelProject = {
  /** Project slug; also used as the project's directory name under the workspace path. */
  slug: string;
  /** The project's default prompt text. */
  defaultPrompt: string;
};
596
+
597
/**
 * Build the leading layer of the ADR-022 prompt-assembly chain: every
 * project's trimmed `defaultPrompt`, in the order given (the spec already
 * orders them), with empty prompts skipped and a blank line between the
 * rest.
 */
function concatProjectPrompts(projects: ChannelProject[]): string {
  const prompts: string[] = [];
  for (const project of projects) {
    const prompt = project.defaultPrompt.trim();
    if (prompt.length > 0) prompts.push(prompt);
  }
  return prompts.join("\n\n");
}
608
+
609
/**
 * Assemble the message an agent receives as its first turn, following the
 * ADR-022 prompt-assembly chain. The layers, joined by blank lines with
 * empty ones skipped, are in order:
 *
 *   1. the concatenated project `defaultPrompt`s
 *   2. `globalContext`       (task-level mission, if present)
 *   3. `agent.defaultPrompt` (the agent's persona, if present)
 *   4. `agent.initialPrompt` (this-turn instructions)
 *
 * @returns the assembled prompt, or `null` when the agent's
 *   `initialPrompt` is missing or blank — such agents stay silent until
 *   addressed. There is no "first speaker" / coder rule: any agent with
 *   an `initialPrompt` opens a first turn.
 */
export function assembleFirstTurnPrompt(
  projects: ChannelProject[],
  globalContext: string | undefined,
  agent: RosterAgent,
): string | null {
  const initialPrompt = agent.initialPrompt?.trim() ?? "";
  if (initialPrompt === "") return null;

  const layers = [
    concatProjectPrompts(projects),
    globalContext?.trim() ?? "",
    agent.defaultPrompt?.trim() ?? "",
    initialPrompt,
  ];
  return layers.filter((layer) => layer !== "").join("\n\n");
}
638
+
639
/**
 * Produce the system preamble handed to an agent when its session starts.
 *
 * The first part is a fixed briefing: how the uai channel works (agents
 * hand off via `@mention`; there is no `peer` command / shared tmux any
 * more, ADR-008), the workspace layout, and the commit policy. When any
 * persona / mission layer is non-empty — project defaultPrompts,
 * `globalContext`, `agent.defaultPrompt` — those follow after a `---`
 * separator. They live in the always-on system prompt so they apply to
 * every turn; only `initialPrompt` is delivered as the first user turn
 * (see `assembleFirstTurnPrompt`).
 */
export function buildSystemPreamble(
  roster: Roster,
  agent: RosterAgent,
  projects: ChannelProject[],
  globalContext: string | undefined,
  workspacePath: string,
  taskBranch: string,
): string {
  // Every agent on the roster, rendered as "@id (label)".
  const rosterList = roster
    .map((member) => `@${member.id} (${member.label})`)
    .join(", ");

  // One bullet per project worktree, or a placeholder when there are none.
  let projectLines: string[];
  if (projects.length === 0) {
    projectLines = ["(none mounted)"];
  } else {
    projectLines = projects.map(
      (project) =>
        `- \`${workspacePath}/${project.slug}\` — git worktree on \`${taskBranch}\``,
    );
  }

  const briefingLines = [
    "## uai task channel",
    "",
    "You are one agent in a uai task chat channel, shared with the human",
    "and the other agents. To hand work to or ask another agent, mention",
    "it by id at the start of a line — e.g. `@codex please review the",
    "diff`. uai routes that message into that agent's input.",
    "",
    `Agents in this channel: ${rosterList}.`,
    "",
    "An agent only receives a message when it is explicitly @-mentioned",
    "(or addressed by the human) — so always @-mention the agent you mean.",
    "There is NO `peer` command and no shared tmux session; hand-offs are",
    "just @-mentions in your replies.",
    "",
    "Because your input is only what you're addressed, you may be missing",
    "context from messages between the human and the other agents. The full",
    "channel transcript — every message + who wrote it (no tool calls) — is",
    "logged at `/workspace/.uai/chat.md`. Read it whenever you need that",
    "context (e.g. the human shared a file or instruction with another",
    "agent); it's appended live, so re-read it for the latest.",
    "",
    "Hand off when you finish your part of the work. When you've made",
    "and committed your changes, or completed a review, end your reply by",
    "@-mentioning the agent who should act next and telling them what you",
    "did and what you need (e.g. `@codex changes committed on <branch> —",
    "please review`, or `@claude review done, N issues to fix`). Never",
    "stop silently — if genuinely no agent needs to act, say so to the",
    "human.",
    "",
    "## Workspace layout",
    "",
    `Your shell starts in \`${workspacePath}\` (the task workspace).`,
    "That directory is **not** itself a git repo — it holds one git",
    "worktree per project this task spans. Every project below is on the",
    "same task branch. To run git commands, **`cd` into one of the",
    "project directories first**:",
    "",
    ...projectLines,
    "",
    `The task branch is \`${taskBranch}\`. Push with \`git push -u origin`,
    `${taskBranch}\` from inside the project, then open a PR with \`gh pr`,
    "create\` (the container has gh authenticated). For multi-project",
    "tasks, each project's PR is independent — open one per project whose",
    "worktree you actually changed.",
    "",
    "The `.uai/` directory under each task is uai's own scaffolding",
    "(rendered Dockerfile, compose file, container scripts) — it is NOT",
    "part of the project. Never review, edit, stage, commit, or flag it;",
    "treat it as ignored, even though git may show it as untracked.",
    "",
    "## Commit policy",
    "",
    "Commits are SSH-signed automatically (git is configured for it) — do",
    "not disable or override signing. Do NOT add any `Co-Authored-By:`",
    "trailers to commit messages, and do NOT add 'Generated with …' or any",
    "tool/agent attribution footer to commit messages or PR/issue bodies.",
    "Write commit messages and PR descriptions plainly, as the author, with",
    "no agent attribution.",
  ];
  const briefing = briefingLines.join("\n");

  // Always-on persona / mission layers: project context, then the task's
  // globalContext, then this agent's persona. The per-turn instructions
  // (initialPrompt) are delivered separately.
  const personaLayers = [
    concatProjectPrompts(projects),
    (globalContext ?? "").trim(),
    (agent.defaultPrompt ?? "").trim(),
  ];
  const persona = personaLayers
    .filter((layer) => layer.length > 0)
    .join("\n\n");

  if (persona.length === 0) return briefing;
  return `${briefing}\n\n---\n\n${persona}`;
}
740
+
741
+ // ---------------------------------------------------------------------------
742
+ // Singleton — survives Next.js HMR by living on globalThis.
743
+ // ---------------------------------------------------------------------------
744
+
745
/** The extra slots this module keeps on `globalThis`. */
interface OrchestratorGlobalSlots {
  /** The process-wide orchestrator instance, once created. */
  __uaiOrchestrator?: Orchestrator;
  /** Set once boot-time recovery has been kicked off in this process. */
  __uaiRecoverRan?: boolean;
}

/** `globalThis`, viewed through the slots above. */
const globalForOrchestrator = globalThis as unknown as OrchestratorGlobalSlots;
749
+
750
/**
 * Return the process-wide orchestrator, creating it on first use.
 *
 * The instance is cached on `globalThis`. The first call in a process also
 * kicks off boot-time recovery (`recoverRunningTasks`) without waiting for
 * it to finish.
 */
export function getOrchestrator(): Orchestrator {
  let instance = globalForOrchestrator.__uaiOrchestrator;
  if (!instance) {
    // `UAI_AGENTS=real` drives the real Claude/Codex CLIs inside the
    // task containers; anything else uses the mock (no Docker / no CLI
    // needed — the default until the adapters are verified on a host).
    const useRealAgents = process.env.UAI_AGENTS === "real";
    instance = new Orchestrator(
      useRealAgents ? realAgentFactory : mockAgentFactory,
    );
    globalForOrchestrator.__uaiOrchestrator = instance;
  }

  if (!globalForOrchestrator.__uaiRecoverRan) {
    // Mark first, then start: the flag keeps recovery to one run per process.
    globalForOrchestrator.__uaiRecoverRan = true;
    // Fire-and-forget — the orchestrator is usable while recovery runs.
    void recoverRunningTasks();
  }

  return instance;
}
766
+
767
+ // ---------------------------------------------------------------------------
768
+ // Boot-time recovery
769
+ //
770
+ // When uai starts after a machine reboot (or a hard `pnpm dev` restart)
771
+ // task containers are gone but the DB still claims status='running'.
772
+ // Walk the active rows and reconcile each against actual Docker state:
773
+ //
774
+ // container up → re-discover the host port, keep
775
+ // the row at `running`.
776
+ // container exited (preserved) → `docker start` it, re-run uai-init,
777
+ // discover the fresh port, keep
778
+ // `running`. AI sessions respawn on
779
+ // the next ensureSessions() call.
780
+ // container gone, worktree on → mark `stopped`. User hits Resume,
781
+ // disk which runs task-up against the same
782
+ // id (task-up is idempotent).
783
+ // container gone, worktree gone → mark `error`. Data loss.
784
+ //
785
+ // Idempotent per process — guarded by `__uaiRecoverRan` on globalThis
786
+ // so a Next.js HMR rebuild doesn't trigger it twice.
787
+ // ---------------------------------------------------------------------------
788
+
789
/**
 * docker ps --all output entry — Names is unique per task because we
 * scope by compose-project label (set via `-p task-<id>` at task-up).
 * Only the fields this file reads are declared.
 */
interface DockerPs {
  Names: string;
  State: string; // "running" | "exited" | "created" | "paused"
}

/** Runtime check that a parsed `docker ps` JSON line has the fields we read. */
function isDockerPs(value: unknown): value is DockerPs {
  if (typeof value !== "object" || value === null) return false;
  const entry = value as Record<string, unknown>;
  return typeof entry.Names === "string" && typeof entry.State === "string";
}

/**
 * List every container (running or not) carrying the given label.
 *
 * Runs `docker ps --all --filter label=<label> --format '{{json .}}'`
 * and parses one JSON object per output line.
 *
 * @param label - label filter in `key=value` form.
 * @returns the parsed entries; an empty array when nothing matches or
 *   when the docker command fails (the failure is logged).
 */
function dockerListContainersByLabel(label: string): DockerPs[] {
  const res = spawnSync(
    "docker",
    ["ps", "--all", "--filter", `label=${label}`, "--format", "{{json .}}"],
    { encoding: "utf8" },
  );
  if (res.status !== 0) {
    // When the binary cannot be spawned at all, `res.error` is set and
    // stdout/stderr are null, so never dereference stderr blindly.
    const reason =
      res.error?.message ??
      (typeof res.stderr === "string" ? res.stderr.trim() : "");
    console.error(
      `[orchestrator] docker ps (label=${label}) failed: ${reason}`,
    );
    return [];
  }

  const entries: DockerPs[] = [];
  for (const rawLine of res.stdout.split("\n")) {
    const line = rawLine.trim();
    if (line.length === 0) continue;
    try {
      const parsed: unknown = JSON.parse(line);
      // Skip anything that is valid JSON but not the expected shape
      // instead of letting it masquerade as a DockerPs.
      if (isDockerPs(parsed)) entries.push(parsed);
    } catch {
      // Non-JSON line in the output — ignore it and keep the rest.
    }
  }
  return entries;
}
818
+
819
/**
 * Start an existing (stopped) container with `docker start`.
 *
 * @param containerName - name of the container to start.
 * @returns `true` when docker exits with status 0; `false` otherwise,
 *   after logging the reason.
 */
function dockerStart(containerName: string): boolean {
  const res = spawnSync("docker", ["start", containerName], {
    encoding: "utf8",
  });
  if (res.status === 0) return true;

  // If the docker binary could not be spawned, `res.error` is set and
  // `res.stderr` is null; calling `.trim()` on it would throw instead of
  // reporting the failure. Prefer the spawn error, fall back to stderr.
  const reason =
    res.error?.message ??
    (typeof res.stderr === "string" ? res.stderr.trim() : "");
  console.error(
    `[orchestrator] docker start ${containerName} failed: ${reason}`,
  );
  return false;
}
831
+
832
/**
 * Look up the host port that a container port is published on, via
 * `docker port <container> <port>`.
 *
 * @param containerName - container to inspect.
 * @param containerPort - port number inside the container.
 * @returns the host port from the first mapping line, or `null` when the
 *   command fails or the output does not end in a valid port number.
 */
function dockerPort(
  containerName: string,
  containerPort: number,
): number | null {
  const res = spawnSync(
    "docker",
    ["port", containerName, String(containerPort)],
    { encoding: "utf8" },
  );
  if (res.status !== 0) return null;

  // Output: "127.0.0.1:32785\n" (possibly several lines; use the first).
  const firstLine = res.stdout.split("\n")[0]?.trim() ?? "";
  // Anchor on the trailing ":<digits>" so bracketed IPv6 hosts still
  // work, and so an empty or non-numeric suffix is rejected rather than
  // being coerced (Number("") is 0).
  const match = /:(\d+)$/.exec(firstLine);
  if (!match) return null;
  const port = Number(match[1]);
  return Number.isInteger(port) && port >= 1 && port <= 65535 ? port : null;
}
849
+
850
/**
 * Run a command inside a container with `docker exec` and wait for it
 * to finish.
 *
 * @param containerName - container to execute in.
 * @param cmd - program and arguments to run inside the container.
 * @returns `true` only when the docker process exits with status 0.
 */
function dockerExec(containerName: string, cmd: string[]): boolean {
  const argv = ["exec", containerName].concat(cmd);
  const { status } = spawnSync("docker", argv, { encoding: "utf8" });
  return status === 0;
}
856
+
857
/**
 * Reconcile every task row whose mirrored status is one of
 * `ACTIVE_STATUSES`, one task at a time, via `recoverOneTask`.
 *
 * Never rejects: a failure on one task is logged and the loop moves on
 * to the next; a failure outside the loop (e.g. the DB query) is logged
 * as a top-level failure.
 */
async function recoverRunningTasks(): Promise<void> {
  // Log the message for real Errors, the raw value for anything else.
  const describe = (err: unknown): unknown =>
    err instanceof Error ? err.message : err;

  try {
    const activeRows = getDb()
      .select()
      .from(schema.hostTasks)
      .where(inArray(schema.hostTasks.statusMirror, [...ACTIVE_STATUSES]))
      .all();
    if (activeRows.length === 0) {
      return;
    }

    console.log(
      `[orchestrator] recovery: scanning ${activeRows.length} active task row(s)`,
    );

    for (const row of activeRows) {
      try {
        await recoverOneTask(row);
      } catch (err) {
        console.error(
          `[orchestrator] recovery: ${row.taskId} failed:`,
          describe(err),
        );
      }
    }
  } catch (err) {
    console.error(
      "[orchestrator] recovery: top-level failure",
      describe(err),
    );
  }
}
886
+
887
/**
 * Reconcile a single active task row with what Docker reports for its
 * compose project.
 *
 * Outcomes:
 *  - no compose project recorded: row is left untouched;
 *  - no containers for the project: row becomes `stopped` if the
 *    worktree path still exists on disk, otherwise `error` (and the
 *    compose project is cleared);
 *  - some container running: the host port for container port 8080 is
 *    re-read and stored if it changed;
 *  - containers exist but none running: the app container is started,
 *    `uai-init` is re-run inside it, and the fresh port is stored; if
 *    the start fails the row becomes `stopped`.
 *
 * @param task - the host task row to reconcile.
 */
async function recoverOneTask(
  task: typeof schema.hostTasks.$inferSelect,
): Promise<void> {
  const { taskId, composeProject } = task;

  // We never wrote a compose project name for this row — must be a row
  // stuck in `queued`/`starting` from before task-up got past step 2.
  // Leave it; the user can recreate.
  if (!composeProject) return;

  const containerName = `${composeProject}-app-1`;
  const containers = dockerListContainersByLabel(
    `com.docker.compose.project=${composeProject}`,
  );

  // --- Container gone: worktree state decides whether resume is viable.
  if (containers.length === 0) {
    const worktreePresent = task.worktreePath
      ? existsSync(task.worktreePath)
      : false;
    const next = worktreePresent ? "stopped" : "error";
    db_setStatus(taskId, next, {
      codeServerPort: null,
      previewPorts: "[]",
      // Nothing left to resume against once the worktree is gone.
      composeProject: worktreePresent ? composeProject : null,
    });
    console.log(
      `[orchestrator] recovery: ${taskId} -> ${next} (container gone, ` +
        `worktree ${worktreePresent ? "present" : "gone"})`,
    );
    return;
  }

  // --- Container up: re-discover the host port in case Docker remapped
  // it across restarts (it usually does for ephemeral bindings).
  const anyRunning = containers.findIndex((c) => c.State === "running") !== -1;
  if (anyRunning) {
    const livePort = dockerPort(containerName, 8080);
    if (!livePort || livePort === task.codeServerPort) return;
    db_setRuntime(taskId, { codeServerPort: livePort });
    console.log(`[orchestrator] recovery: ${taskId} ports refreshed`);
    return;
  }

  // --- Container exists but exited: bring it back up and re-launch the
  // in-container init (deps + code-server).
  console.log(
    `[orchestrator] recovery: ${taskId} starting exited container ${containerName}`,
  );
  const started = dockerStart(containerName);
  if (!started) {
    db_setStatus(taskId, "stopped", {
      codeServerPort: null,
      previewPorts: "[]",
    });
    return;
  }

  const initOk = dockerExec(containerName, ["/usr/local/bin/uai-init"]);
  if (!initOk) {
    // Not fatal: the container itself is running.
    console.warn(
      `[orchestrator] recovery: ${taskId} uai-init failed; container is up but Editor may be down`,
    );
  }

  const resumedPort = dockerPort(containerName, 8080);
  db_setRuntime(taskId, { codeServerPort: resumedPort });
  console.log(
    `[orchestrator] recovery: ${taskId} resumed (port ${resumedPort ?? "?"})`,
  );
}
959
+
960
/**
 * Write a new mirrored status for a task together with any extra column
 * updates, in a single `upsertHostTask` call.
 *
 * @param taskId - id of the task row to update.
 * @param status - value stored in `statusMirror`; it overrides any
 *   `statusMirror` present in `extras`.
 * @param extras - additional columns to write alongside the status.
 */
function db_setStatus(
  taskId: string,
  status: string,
  extras: Partial<typeof schema.hostTasks.$inferInsert>,
): void {
  const patch = { ...extras, statusMirror: status };
  upsertHostTask(taskId, patch);
}
967
+
968
/**
 * Update runtime columns of a task row without touching its status.
 * Thin pass-through to `upsertHostTask`.
 *
 * @param taskId - id of the task row to update.
 * @param extras - columns to write.
 */
function db_setRuntime(
  taskId: string,
  extras: Partial<typeof schema.hostTasks.$inferInsert>,
): void {
  const patch = extras;
  upsertHostTask(taskId, patch);
}
974
+
975
+ export type { Orchestrator };