pi-crew 0.9.7 → 0.9.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,208 @@
1
+ /**
2
+ * Crash Classification Taxonomy — pure function for categorizing worker exits.
3
+ *
4
+ * Distilled from gajae-code's `debug/crash-diagnostics.ts` (P0 item #1).
5
+ * Unlike the original, this module is pure: it does NOT write crash reports or
6
+ * touch the filesystem. The file-I/O layer is intentionally omitted; callers
7
+ * that want durable crash logs can layer them on top of {@link classifyProcessCrash}.
8
+ *
9
+ * The classification precedence (most-significant first) mirrors the
10
+ * reference implementation:
11
+ *
12
+ * 1. timeout — process was terminated by the response-timeout guard
13
+ * 2. cancelled — cooperative cancellation (AbortSignal) triggered exit
14
+ * 3. spawn_error — child_process emitted an `error` event before `exit`
15
+ * 4. native_panic — stderr indicates a native crash (SIGSEGV / abort / panic)
16
+ * 5. signal_exit — the process was terminated by an OS signal
17
+ * 6. clean_exit — exit code 0
18
+ * 7. non_zero_exit — exit code != 0 (and != null)
19
+ * 8. protocol_exit — exit code is null with no signal (protocol/stream
20
+ * ended before a normal exit was observed)
21
+ * 9. unknown — defensive fallback (should not occur in practice)
22
+ *
23
+ * NOTE on timeout-vs-cancel precedence: when BOTH `timedOut` and `cancelled`
24
+ * are true, `timeout` wins (the timeout terminated the process). This matches
25
+ * gajae-code and the existing child-pi.ts response-timeout guard, which fires
26
+ * the hard kill and is the proximate cause.
27
+ */
28
+
29
/**
 * Machine-readable category describing why a worker process ended.
 *
 * Members are listed in the precedence order documented in the module header
 * (most significant first).
 *
 * @see classifyProcessCrash
 */
export type CrashClass =
  /** Terminated by the response-timeout guard. */
  | "timeout"
  /** Cooperative cancellation (AbortSignal) triggered the exit. */
  | "cancelled"
  /** The child emitted an `error` event before `exit`. */
  | "spawn_error"
  /** Stderr indicates a native crash (SIGSEGV / abort / panic). */
  | "native_panic"
  /** Terminated by an OS signal. */
  | "signal_exit"
  /** Exit code 0. */
  | "clean_exit"
  /** Exit code other than 0 (and not null). */
  | "non_zero_exit"
  /** Exit code is null and no signal was recorded. */
  | "protocol_exit"
  /** Defensive fallback. */
  | "unknown";
44
+
45
/**
 * Inputs to {@link classifyProcessCrash}.
 *
 * Every field is optional so callers can pass a partial view
 * (for example just `{ exitCode: 0 }`).
 */
export interface CrashClassificationInput {
  /** The OS exit code, or `null` when no code was observed. */
  exitCode?: number | null;
  /** The terminating signal name (e.g. `"SIGTERM"`), or `null`. */
  signal?: string | null;
  /** True when cooperative cancellation (AbortSignal) was requested. */
  cancelled?: boolean;
  /** True when the response-timeout guard fired (and likely killed the child). */
  timedOut?: boolean;
  /** True when the parent explicitly killed the child (best-effort). */
  killed?: boolean;
  /** Set when the child emitted a spawn/process `error` event. */
  spawnError?: unknown;
  /** Tail of captured stderr, used to detect native panics. */
  stderrSnippet?: string;
}
67
+
68
/** Outcome of classifying a worker exit. */
export interface CrashClassification {
  /** Machine-readable category. */
  crashClass: CrashClass;
  /** Human-friendly one-liner suitable for logs/diagnostics. */
  reason: string;
}
76
+
77
// ── native-panic detection ──────────────────────────────────────────────────
//
// The stderr tail is scanned for a short list of native-crash signatures.
// The list is kept small on purpose: a false positive would relabel an
// ordinary non-zero exit as a native panic.

/** One entry of the native-crash signature table. */
interface NativePanicSignature {
  /** Lower-case substring looked up in the lower-cased stderr tail. */
  pattern: string;
  /** Human-readable label reported when `pattern` is found. */
  label: string;
}

/**
 * Signature table, in match-priority order: the first entry whose pattern
 * occurs in the stderr tail wins.
 */
const NATIVE_PANIC_SIGNATURES: readonly NativePanicSignature[] = (
  [
    ["sigsegv", "segmentation fault"],
    ["segfault", "segmentation fault"],
    ["segmentation fault", "segmentation fault"],
    ["sigabrt", "abort signal"],
    ["abort(", "abort"],
    ["fatal error", "V8/node fatal error"],
    ["panic:", "rust/go panic"],
    ["thread '", "rust panic (thread context)"],
    ["illegal instruction", "illegal instruction"],
    ["double free", "heap corruption (double free)"],
  ] as const
).map(([pattern, label]) => ({ pattern, label }));

/**
 * Look for a known native-crash signature in the stderr tail.
 *
 * @param stderrSnippet Captured stderr tail; may be absent or empty.
 * @returns The label of the first matching signature (case-insensitive
 *   substring match), or `null` when nothing matches or there is no stderr.
 */
function detectNativePanic(stderrSnippet: string | undefined): string | null {
  if (!stderrSnippet) return null;
  const haystack = stderrSnippet.toLowerCase();
  const hit = NATIVE_PANIC_SIGNATURES.find(({ pattern }) => haystack.includes(pattern));
  return hit === undefined ? null : hit.label;
}
116
+
117
/**
 * Normalize an optional signal value to `string | null`.
 *
 * `undefined`, `null`, and empty or whitespace-only strings all become `null`,
 * because a blank name carries no information and must not be reported as a
 * real terminating signal. Surrounding whitespace is trimmed from real names.
 *
 * @param signal Raw signal name as supplied by the caller.
 * @returns The trimmed signal name, or `null` when none was provided.
 */
function normalizeSignal(signal: string | null | undefined): string | null {
  if (signal === null || signal === undefined) return null;
  const name = signal.trim();
  return name === "" ? null : name;
}
121
+
122
/**
 * Classify a worker exit into a {@link CrashClass}.
 *
 * The function only inspects its argument and returns a fresh result object;
 * it performs no I/O and mutates nothing.
 *
 * Checks run in this order and the first match wins:
 * timeout → cancelled → spawn_error → native_panic → signal_exit →
 * clean_exit → non_zero_exit → protocol_exit.
 *
 * @param input Partial description of how the process ended.
 * @returns The category plus a one-line human-readable reason.
 *
 * @example
 * classifyProcessCrash({ exitCode: 0 })                             // → clean_exit
 * classifyProcessCrash({ exitCode: 1 })                             // → non_zero_exit
 * classifyProcessCrash({ signal: "SIGTERM" })                       // → signal_exit
 * classifyProcessCrash({ timedOut: true, exitCode: null })          // → timeout
 * classifyProcessCrash({ cancelled: true, exitCode: null })         // → cancelled
 * classifyProcessCrash({ spawnError: new Error("ENOENT") })         // → spawn_error
 * classifyProcessCrash({ exitCode: null })                          // → protocol_exit
 * classifyProcessCrash({ exitCode: 139, signal: "SIGSEGV" })        // → signal_exit
 * classifyProcessCrash({ exitCode: 134, stderrSnippet: "abort()" }) // → native_panic
 */
export function classifyProcessCrash(input: CrashClassificationInput): CrashClassification {
  const exitCode = input.exitCode ?? null;
  const signal = normalizeSignal(input.signal);

  // 1. Timeout wins even when cancellation was also requested: the timeout
  //    guard is treated as the proximate cause of death.
  if (input.timedOut) {
    return { crashClass: "timeout", reason: "process timed out (response timeout guard fired)" };
  }

  // 2. Cooperative cancellation.
  if (input.cancelled) {
    return { crashClass: "cancelled", reason: "process was cancelled (abort requested)" };
  }

  // 3. Spawn error. The field is documented as "truthy when an error event was
  //    emitted", so falsy placeholders (`false`, `""`, `0`) — e.g. produced by
  //    `failed && err` — are not treated as errors.
  if (input.spawnError) {
    return {
      crashClass: "spawn_error",
      reason: `spawn error: ${stringifyError(input.spawnError)}`,
    };
  }

  // 4. Native panic from stderr. Only consulted for abnormal exits, so stderr
  //    noise can never turn a clean exit into a panic.
  const abnormalExit = signal !== null || (exitCode !== null && exitCode !== 0);
  if (abnormalExit) {
    const panic = detectNativePanic(input.stderrSnippet);
    if (panic !== null) {
      return { crashClass: "native_panic", reason: `native panic detected: ${panic}` };
    }
  }

  // 5. Signal exit.
  if (signal !== null) {
    return { crashClass: "signal_exit", reason: `process exited after signal ${signal}` };
  }

  // 6. Clean exit.
  if (exitCode === 0) {
    return { crashClass: "clean_exit", reason: "process exited cleanly" };
  }

  // 7. Non-zero exit.
  if (exitCode !== null) {
    return { crashClass: "non_zero_exit", reason: `process exited with code ${exitCode}` };
  }

  // 8. Protocol exit: no exit code and no signal. When the parent reports it
  //    killed the child, say so in the reason; the class is the same.
  if (input.killed) {
    return { crashClass: "protocol_exit", reason: "process was killed but no signal/exit code was captured" };
  }
  return { crashClass: "protocol_exit", reason: "process exited before protocol completion (exit code unknown)" };
}
198
+
199
/**
 * Render an unknown error value to a short message string.
 *
 * Resolution order:
 *  1. `Error` instances → `message`, falling back to `name` when empty.
 *  2. Strings → returned unchanged.
 *  3. Non-null objects → a non-empty string `message` property if present,
 *     else their own `String()` rendering when it is informative, else JSON
 *     (so plain objects do not collapse to `"[object Object]"`).
 *  4. Everything else → `String(value)`.
 *
 * @param error Value received from a throw site or an `error` event.
 * @returns A message string; `"(unstringifiable error)"` when the value
 *   throws while being converted.
 */
function stringifyError(error: unknown): string {
  if (error instanceof Error) return error.message || error.name;
  if (typeof error === "string") return error;
  try {
    if (typeof error !== "object" || error === null) return String(error);

    // Error-like objects (e.g. values that crossed a serialization boundary).
    const message = (error as { message?: unknown }).message;
    if (typeof message === "string" && message !== "") return message;

    // Respect custom toString implementations (arrays, Date, user classes).
    const rendered = String(error);
    if (rendered !== "[object Object]") return rendered;

    // Plain object: JSON is more useful than the default tag. It can throw
    // (cycles, BigInt) or yield undefined; fall back to the tag in both cases.
    try {
      const json: string | undefined = JSON.stringify(error);
      return json ?? rendered;
    } catch {
      return rendered;
    }
  } catch {
    return "(unstringifiable error)";
  }
}
@@ -14,7 +14,7 @@
14
14
 
15
15
  import { defineTool, type ToolDefinition } from "@earendil-works/pi-coding-agent";
16
16
  import { Type, type Static } from "@sinclair/typebox";
17
- import { listLiveAgents, sendIrcMessage, broadcastIrcMessage } from "../live-agent-manager.ts";
17
+ import { listLiveAgents, sendIrcMessage, broadcastIrcMessage, respondAsBackground } from "../live-agent-manager.ts";
18
18
  import type { IrcMessage } from "../live-irc.ts";
19
19
 
20
20
  const IrcParams = Type.Object({
@@ -37,7 +37,7 @@ const IrcParams = Type.Object({
37
37
  ),
38
38
  awaitReply: Type.Optional(
39
39
  Type.Boolean({
40
- description: "Wait for a reply (default: true for DM, false for broadcast). Not yet supported messages are fire-and-forget.",
40
+ description: "Wait for a prose reply (default: true for DM, false for broadcast). For DMs the recipient receives the message as a non-blocking background turn and its reply is returned to the caller. Broadcast always ignores this flag.",
41
41
  }),
42
42
  ),
43
43
  });
@@ -64,6 +64,8 @@ interface IrcDetails {
64
64
  delivered?: string[];
65
65
  notFound?: string[];
66
66
  peers?: Array<{ id: string; status: string }>;
67
+ /** Replies received from recipients (awaitReply DM path). */
68
+ replies?: Array<{ from: string; text: string }>;
67
69
  error?: string;
68
70
  }
69
71
 
@@ -130,10 +132,10 @@ function executeList(selfId: string): { content: Array<{ type: "text"; text: str
130
132
  };
131
133
  }
132
134
 
133
- function executeSend(
135
+ async function executeSend(
134
136
  selfId: string,
135
137
  params: IrcParams,
136
- ): { content: Array<{ type: "text"; text: string }>; details: IrcDetails } {
138
+ ): Promise<{ content: Array<{ type: "text"; text: string }>; details: IrcDetails }> {
137
139
  const to = params.to?.trim();
138
140
  const message = params.message?.trim();
139
141
 
@@ -156,23 +158,52 @@ function executeSend(
156
158
  };
157
159
  }
158
160
 
161
+ // awaitReply defaults to true for DMs, false for broadcast. Broadcast
162
+ // always ignores the flag (fire-and-forget) — there is no single sender
163
+ // to receive a correlated reply from.
164
+ const isBroadcast = to === "all";
165
+ const wantsReply = !isBroadcast && (params.awaitReply ?? true);
166
+
159
167
  const ircMessage: IrcMessage = {
160
168
  from: selfId,
161
169
  to,
162
170
  content: message,
163
171
  timestamp: new Date().toISOString(),
164
- awaitReply: params.awaitReply,
172
+ awaitReply: wantsReply,
165
173
  };
166
174
 
167
175
  const notFound: string[] = [];
168
176
  const delivered: string[] = [];
177
+ const replies: Array<{ from: string; text: string }> = [];
169
178
 
170
179
  try {
171
- if (to === "all") {
180
+ if (isBroadcast) {
181
+ // Broadcast: always fire-and-forget regardless of awaitReply.
172
182
  const recipients = broadcastIrcMessage(selfId, ircMessage);
173
183
  delivered.push(...recipients);
184
+ } else if (wantsReply) {
185
+ // DM with reply: use the non-blocking side-channel.
186
+ const agents = listLiveAgents();
187
+ const target = agents.find((a) => a.agentId === to);
188
+ if (!target || (target.status !== "running" && target.status !== "queued")) {
189
+ notFound.push(to);
190
+ } else {
191
+ const result = await respondAsBackground(to, selfId, message, { awaitReply: true });
192
+ if (result.ok) {
193
+ delivered.push(to);
194
+ if (result.replyContent) replies.push({ from: to, text: result.replyContent });
195
+ } else if (result.timedOut) {
196
+ // Message was delivered (non-blocking), but no reply in time.
197
+ delivered.push(to);
198
+ replies.push({ from: to, text: `(no reply — timed out)` });
199
+ } else {
200
+ // Delivery channel unavailable or cancelled.
201
+ if (result.error === "cancelled") delivered.push(to);
202
+ else notFound.push(to);
203
+ }
204
+ }
174
205
  } else {
175
- // DM to specific agent
206
+ // DM fire-and-forget (awaitReply explicitly false).
176
207
  const agents = listLiveAgents();
177
208
  const target = agents.find((a) => a.agentId === to);
178
209
  if (!target || (target.status !== "running" && target.status !== "queued")) {
@@ -197,6 +228,14 @@ function executeSend(
197
228
  } else {
198
229
  lines.push("No recipients received the message.");
199
230
  }
231
+ if (replies.length > 0) {
232
+ lines.push("");
233
+ lines.push("## Replies");
234
+ for (const reply of replies) {
235
+ lines.push(`### ${reply.from}`);
236
+ lines.push(reply.text);
237
+ }
238
+ }
200
239
  if (notFound.length > 0) {
201
240
  lines.push(`Unknown / unavailable peers: ${notFound.join(", ")}`);
202
241
  }
@@ -209,6 +248,7 @@ function executeSend(
209
248
  to,
210
249
  delivered: delivered.length > 0 ? delivered : undefined,
211
250
  notFound: notFound.length > 0 ? notFound : undefined,
251
+ replies: replies.length > 0 ? replies : undefined,
212
252
  },
213
253
  };
214
254
  }
@@ -416,3 +416,188 @@ function drainIrcMessages(agentIdOrTaskId: string): IrcMessage[] {
416
416
  handle.pendingMessages.length = 0;
417
417
  return messages;
418
418
  }
419
+
420
+ /* ── IRC reply support (side-channel Q&A) ─────────────────────────── */
421
+
422
/** How long a side-channel reply is awaited when the caller gives no timeout: 60 seconds, in milliseconds. */
const DEFAULT_REPLY_TIMEOUT_MS = 60 * 1_000;
424
+
425
/** Outcome of a background delivery / reply attempt. */
export interface BackgroundReplyResult {
  /** Whether the attempt succeeded. */
  ok: boolean;
  /** Correlation id of the pending reply; set once one has been registered. */
  corrId?: string;
  /** Prose of the reply; set on success when a reply was awaited. */
  replyContent?: string;
  /** Human-readable description of what went wrong. */
  error?: string;
  /** True when no reply arrived before the timeout elapsed. */
  timedOut?: boolean;
}
437
+
438
/** Bookkeeping for one reply that a sender is still waiting on. */
interface PendingReply {
  /** Correlation id; also the key in `pendingReplies`. */
  corrId: string;
  /** Agent the reply is expected from. */
  targetAgentId: string;
  /** Sender that is waiting for the reply. */
  fromId: string;
  /** Epoch-millisecond timestamp at which the wait times out. */
  deadline: number;
  /** Settles the waiting promise with the given result. */
  resolve: (result: BackgroundReplyResult) => void;
  /** Timeout timer handle, when one is armed. */
  timer?: ReturnType<typeof setTimeout>;
}

/** In-process registry of pending replies, keyed by correlation id. */
const pendingReplies: Map<string, PendingReply> = new Map();

/** Reverse index: target agent id → correlation ids awaiting a reply from it. */
const pendingRepliesByTarget: Map<string, Set<string>> = new Map();
451
+
452
/**
 * Build a correlation id of the form `irc_reply_<time>_<random>`, where both
 * parts are base-36: the current epoch milliseconds and up to eight digits
 * taken from `Math.random()`.
 */
function makeCorrelationId(): string {
  const stamp = Date.now().toString(36);
  const nonce = Math.random().toString(36).slice(2, 10);
  return ["irc_reply", stamp, nonce].join("_");
}
455
+
456
/**
 * Hand a direct message to a live agent without waiting for that agent's
 * turn, and optionally wait for a prose reply through the side-channel.
 *
 * Delivery tries two channels in order:
 *  1. `session.sendCustomMessage` (when present), called with
 *     `triggerTurn: false`; a synchronous throw falls through to channel 2.
 *  2. `session.prompt` (when present), started fire-and-forget; a rejection
 *     is logged via `logInternalError`. The correlation id is appended to the
 *     prompt text only when a reply is awaited.
 *
 * When `opts.awaitReply` is set, the returned promise settles through
 * {@link awaitPendingReply}: on reply, timeout, or abort of `opts.signal`.
 *
 * NOTE(review): the original comment states that mailbox.ts's file-based reply
 * fields serve cross-process workers and coexist with this in-memory
 * registry, and that this mirrors gajae-code's `respondAsBackground`; neither
 * is verifiable from this block.
 *
 * @param targetAgentId Id of the recipient live agent.
 * @param fromId Id of the sender, embedded in the delivered text.
 * @param message Message body.
 * @param opts `awaitReply` (default false), `timeoutMs` (default
 *   `DEFAULT_REPLY_TIMEOUT_MS`), and an optional abort `signal`.
 * @returns `{ ok: false, error }` when the agent is unknown or has no message
 *   channel; `{ ok: true, corrId }` for fire-and-forget delivery; otherwise
 *   the result of the awaited reply.
 */
export async function respondAsBackground(
  targetAgentId: string,
  fromId: string,
  message: string,
  opts?: { awaitReply?: boolean; timeoutMs?: number; signal?: AbortSignal },
): Promise<BackgroundReplyResult> {
  const agent = getLiveAgent(targetAgentId);
  if (!agent) {
    return { ok: false, error: `Live agent '${targetAgentId}' not found.` };
  }

  const shouldAwait = opts?.awaitReply ?? false;
  const replyTimeoutMs = opts?.timeoutMs ?? DEFAULT_REPLY_TIMEOUT_MS;
  const corrId = makeCorrelationId();
  const taggedMessage = `[DM from ${fromId}] ${message}`;

  // --- Non-blocking delivery -------------------------------------------
  const looseSession = agent.session as Record<string, unknown>;
  let channelUsed: "custom" | "prompt" | "none" = "none";

  if (typeof looseSession.sendCustomMessage === "function") {
    try {
      // Invoked as a member call so the session stays bound as `this`.
      (looseSession.sendCustomMessage as (msg: unknown, o?: unknown) => void)(
        { customType: "irc", content: taggedMessage, display: "collapsed", corrId },
        { deliverAs: "followUp", triggerTurn: false },
      );
      channelUsed = "custom";
    } catch {
      // Custom-message delivery failed; try the prompt channel next.
    }
  }

  if (channelUsed === "none" && typeof agent.session.prompt === "function") {
    const correlationSuffix = shouldAwait ? ` (reply correlation: ${corrId})` : "";
    void agent.session
      .prompt(`${taggedMessage}${correlationSuffix}`, { source: "api", expandPromptTemplates: false })
      .catch((error) => logInternalError("live-agent-manager.respondAsBackground", error, `agentId=${agent.agentId}`));
    channelUsed = "prompt";
  }

  if (channelUsed === "none") {
    return { ok: false, error: `Target '${targetAgentId}' has no message channel.` };
  }
  agent.updatedAt = new Date().toISOString();

  // --- Await reply (event-driven, bounded by timeout) ------------------
  return shouldAwait
    ? awaitPendingReply(corrId, targetAgentId, fromId, replyTimeoutMs, opts?.signal)
    : { ok: true, corrId };
}
515
+
516
/**
 * Register a pending reply under `corrId` and return a promise that settles
 * exactly once, with whichever happens first:
 *  - the registered `resolve` is invoked (e.g. by `resolveIrcReply` or
 *    `cancelIrcReply`),
 *  - the timeout elapses → `{ ok: false, corrId, timedOut: true }`,
 *  - `signal` aborts (or is already aborted) →
 *    `{ ok: false, corrId, error: "cancelled" }`.
 *
 * The promise never rejects. Settling clears the timer, detaches the abort
 * listener, and removes the entry from both registries.
 *
 * @param corrId Correlation id to register.
 * @param targetAgentId Agent the reply is expected from.
 * @param fromId Sender waiting for the reply.
 * @param timeoutMs Maximum wait in milliseconds. Clamped to `[0, 2^31 - 1]`;
 *   `NaN` is treated as 0.
 * @param signal Optional abort signal that cancels the wait.
 *
 * @internal exported for testing
 */
export function awaitPendingReply(
  corrId: string,
  targetAgentId: string,
  fromId: string,
  timeoutMs: number,
  signal?: AbortSignal,
): Promise<BackgroundReplyResult> {
  // Node's setTimeout coerces any delay above 2^31 - 1 ms (including Infinity)
  // to 1 ms, which would turn "wait very long" into an immediate timeout.
  const MAX_TIMER_DELAY_MS = 2_147_483_647;
  const boundedTimeoutMs = Number.isNaN(timeoutMs)
    ? 0
    : Math.min(Math.max(timeoutMs, 0), MAX_TIMER_DELAY_MS);

  return new Promise((resolve) => {
    const deadline = Date.now() + boundedTimeoutMs;
    let settled = false;
    let timer: ReturnType<typeof setTimeout> | undefined;
    let signalListener: (() => void) | undefined;

    // Single exit point: the first call wins, later calls are ignored.
    const finish = (result: BackgroundReplyResult): void => {
      if (settled) return;
      settled = true;
      if (timer) clearTimeout(timer);
      if (signalListener && signal) signal.removeEventListener("abort", signalListener);
      pendingReplies.delete(corrId);
      const set = pendingRepliesByTarget.get(targetAgentId);
      set?.delete(corrId);
      // Drop the per-target set once it is empty.
      if (set && set.size === 0) pendingRepliesByTarget.delete(targetAgentId);
      resolve(result);
    };

    // Already cancelled: settle before arming a timer or registering anything.
    if (signal?.aborted) {
      finish({ ok: false, corrId, error: "cancelled" });
      return;
    }

    timer = setTimeout(() => finish({ ok: false, corrId, timedOut: true }), boundedTimeoutMs);

    if (signal) {
      signalListener = () => finish({ ok: false, corrId, error: "cancelled" });
      signal.addEventListener("abort", signalListener, { once: true });
    }

    // Store `finish` as the entry's resolver so external settlement runs the
    // same cleanup path as timeout and abort.
    pendingReplies.set(corrId, { corrId, targetAgentId, fromId, deadline, resolve: finish, timer });
    const set = pendingRepliesByTarget.get(targetAgentId) ?? new Set<string>();
    set.add(corrId);
    pendingRepliesByTarget.set(targetAgentId, set);
  });
}
564
+
565
/**
 * Settle the pending reply registered under `corrId` with a successful
 * result carrying `replyContent`.
 *
 * @param corrId Correlation id of the pending reply.
 * @param replyContent Prose of the reply.
 * @returns `true` when a pending entry existed and its resolver was invoked;
 *   `false` when no entry is registered under that id (for instance because
 *   it already timed out or was cancelled, or the id is unknown).
 */
export function resolveIrcReply(corrId: string, replyContent: string): boolean {
  const waiter = pendingReplies.get(corrId);
  waiter?.resolve({ ok: true, corrId, replyContent });
  return waiter !== undefined;
}
579
+
580
/**
 * Settle the pending reply registered under `corrId` as failed, using
 * `reason` as the error text (e.g. when the sender gives up).
 *
 * @param corrId Correlation id of the pending reply.
 * @param reason Error text placed on the result; defaults to `"cancelled"`.
 * @returns `true` when a pending entry existed and its resolver was invoked,
 *   `false` when no entry is registered under that id.
 */
export function cancelIrcReply(corrId: string, reason = "cancelled"): boolean {
  const waiter = pendingReplies.get(corrId);
  waiter?.resolve({ ok: false, corrId, error: reason });
  return waiter !== undefined;
}
590
+
591
/**
 * List the correlation ids currently awaiting a reply from `targetAgentId`.
 *
 * @returns A fresh array (empty when the agent has no pending replies).
 */
export function pendingReplyCorrIdsForTarget(targetAgentId: string): string[] {
  const waiting = pendingRepliesByTarget.get(targetAgentId);
  return waiting === undefined ? [] : Array.from(waiting);
}
595
+
596
/**
 * Clear all pending replies (test helper).
 *
 * Each entry is settled as `{ ok: false, corrId, error: "cancelled" }` through
 * its own `resolve` callback before the registries are emptied. Merely
 * dropping the entries would leave their promises pending forever. For
 * entries created by `awaitPendingReply`, that callback also clears the timer
 * and detaches the abort listener.
 */
export function clearPendingRepliesForTest(): void {
  // Iterate over a snapshot: settling an entry removes it from the maps.
  for (const pending of [...pendingReplies.values()]) {
    if (pending.timer) clearTimeout(pending.timer);
    pending.resolve({ ok: false, corrId: pending.corrId, error: "cancelled" });
  }
  // Defensive: drop anything whose resolver did not unregister itself.
  pendingReplies.clear();
  pendingRepliesByTarget.clear();
}