@cordfuse/crosstalk 5.0.0-alpha.5 → 5.0.0-alpha.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@cordfuse/crosstalk",
3
- "version": "5.0.0-alpha.5",
3
+ "version": "5.0.0-alpha.7",
4
4
  "description": "Crosstalk runtime — async messaging between agents over git. The crosstalk CLI plus dispatch, send, attach, chat, and supporting tools.",
5
5
  "type": "module",
6
6
  "license": "MIT",
package/src/dispatch.ts CHANGED
@@ -124,6 +124,158 @@ function recipients(toField: unknown): string[] {
124
124
  return [];
125
125
  }
126
126
 
127
// Recipient grammar for the `to:` field: an entry is either a bare actor
// name (`junior-developer`) or an `actor@host` pair
// (`junior-developer@cachy`). A bare name broadcasts to every host that
// declares the actor; the `@host` form narrows delivery to that one host.
//
// Documented in concierge.md "Host-aware routing" and honored by the runtime
// as of alpha.7 step 1. Before that, the recipient string was compared
// verbatim against the actor name, so `junior-developer@cachy` never matched
// the cachy dispatcher's `junior-developer` actor declaration — the
// harness's first cross-host bug.
//
// extractActor returns the actor portion of a recipient: the text before the
// first `@`, or the whole string when no `@` is present.
function extractActor(recipient: string): string {
  const separator = recipient.indexOf('@');
  if (separator < 0) return recipient;
  return recipient.substring(0, separator);
}
140
+
141
// Host portion of a recipient: the text after the first `@`, or null when
// the recipient is a bare actor name with no `@host` suffix.
function targetHost(recipient: string): string | null {
  const separator = recipient.indexOf('@');
  if (separator < 0) return null;
  return recipient.substring(separator + 1);
}
145
+
146
// Decides whether `recipientList` addresses `actorName` on `thisHost`.
//
// A recipient addresses the actor here when its actor portion equals
// `actorName` and it either carries no host suffix or its host equals
// `thisHost`. The result also carries `wrongHost`: true when the actor was
// named at least once but every such entry targeted a different host. That
// flag is a diagnostic, so silent wrong-host routes can be logged rather
// than dropped without trace.
function matchHostRouting(
  recipientList: string[],
  actorName: string,
  thisHost: string,
): { addressed: boolean; wrongHost: boolean } {
  let namedForOtherHost = false;
  for (const entry of recipientList) {
    if (extractActor(entry) !== actorName) continue;
    const requestedHost = targetHost(entry);
    if (requestedHost === null || requestedHost === thisHost) {
      // First matching entry settles it; wrongHost is only meaningful
      // when nothing addressed this host.
      return { addressed: true, wrongHost: false };
    }
    namedForOtherHost = true;
  }
  return { addressed: false, wrongHost: namedForOtherHost };
}
168
+
169
// Host-agnostic check: does any entry of `recipientList` name `actorName`,
// ignoring any `@host` suffix? Used by the causality scans (isCausalReply,
// hasPriorWork), where the host is irrelevant because the `from` field of a
// reply carries no host suffix either.
function namesActor(recipientList: string[], actorName: string): boolean {
  return recipientList.some((entry) => extractActor(entry) === actorName);
}
179
+
180
// Declared lifecycle kind of a message, as tagged by its sender. Anything
// other than the exact string `result` (including a missing field, as on
// legacy messages) reads as `work`. The runtime does NOT use this value
// directly for the activation decision — effectiveKind() does that — it is
// kept as the seed of the effective-kind computation.
function messageKind(msg: ChannelMessage): 'work' | 'result' {
  if (msg.data['kind'] === 'result') return 'result';
  return 'work';
}
188
+
189
// Is `msg` causally a reply to a prior ask? True iff some message strictly
// before `msg` (by relPath) was sent FROM one of `msg`'s recipients TO
// `msg`'s sender and was not declared `kind: result`. When that holds, `msg`
// is that recipient's answer coming back — regardless of how its sender (a
// fallible LLM actor, or `crosstalk send`'s `work` default) labelled it.
//
// Conservative on multi-recipient `to:` lists: if ANY recipient previously
// tasked the sender, the message counts as a reply for all recipients. The
// per-addressee check in hasPriorWork compensates — only the recipient that
// actually asked wakes on it. Known v1 limitation: a genuine multi-recipient
// fan-out where one recipient happens to have prior unrelated work to the
// sender is demoted to result and suppresses wakes for the other
// recipients. Not observed in Monte Carlo; revisit if it surfaces.
//
// The scan stops at the first message whose relPath is not before `msg`'s,
// so it relies on `channelMessages` being ordered by relPath.
function isCausalReply(channelMessages: ChannelMessage[], msg: ChannelMessage): boolean {
  const rawSender = msg.data['from'];
  if (typeof rawSender !== 'string' || rawSender === '') return false;
  const addressees = recipients(msg.data['to']);
  for (const earlier of channelMessages) {
    if (earlier.relPath >= msg.relPath) break;
    // Read receipts are bookkeeping, never causal evidence. Without this
    // guard a receipt from one of msg's recipients to msg's sender would
    // forge a false reply edge, demoting a legitimate `work` to `result`
    // and silently skipping it (the alpha.7 step 2 cross-host finding).
    if (earlier.data['type'] === 'read') continue;
    const rawFrom = earlier.data['from'];
    const earlierFrom = typeof rawFrom === 'string' ? rawFrom : '';
    // Host-agnostic match: `from` is a bare actor name, while `to` entries
    // may carry an `@host` suffix that does not change causal semantics.
    const sentByAddressee = namesActor(addressees, earlierFrom);
    const declaredWork = (earlier.data['kind'] ?? 'work') !== 'result';
    if (
      sentByAddressee &&
      declaredWork &&
      namesActor(recipients(earlier.data['to']), rawSender)
    ) {
      return true;
    }
  }
  return false;
}
229
+
230
// Effective lifecycle kind. The runtime INFERS kind from the causality graph
// instead of trusting the declared field: a message that is causally a reply
// is a `result` even when labelled `work` (actors routinely report results
// via `crosstalk send`, which defaults to `work`, and that mislabel forges
// false reply-causality edges → wake-up loops). Genuine unsolicited tasks
// (kickoffs, fresh dispatches) have no prior opposite-direction work and
// keep their `work` kind. See PROTOCOL.md "Message kinds".
//
// This is the load-bearing principle the rest of the activation rule rides
// on: the dispatcher derives semantics from the interaction graph; it never
// trusts an actor's declaration.
function effectiveKind(channelMessages: ChannelMessage[], msg: ChannelMessage): 'work' | 'result' {
  const declared = messageKind(msg);
  // A declared `result` is accepted as-is; only `work` can be demoted.
  if (declared === 'work' && isCausalReply(channelMessages, msg)) return 'result';
  return declared;
}
245
+
246
// Reply causality — did `addressee` send `sender` a message of effective
// kind `work` somewhere in the channel strictly before relPath `before`? If
// yes, an inbound `kind: result` from `sender` to `addressee` answers that
// ask and the addressee should wake on it. If no, the result is unsolicited
// from the addressee's point of view and is informational only.
//
// Prior messages are judged by effectiveKind, not messageKind: a mislabeled
// "work" reply from a prior peer would otherwise forge a false causality
// edge here, which was the ping-pong root.
//
// The channel is already sorted by relPath ascending in
// listChannelMessages(), so the scan walks chronologically and can stop at
// the first message that is not before `before`.
function hasPriorWork(
  channelMessages: ChannelMessage[],
  addressee: string,
  sender: string,
  before: string,
): boolean {
  for (const earlier of channelMessages) {
    if (earlier.relPath >= before) return false;
    // Same receipt filter as isCausalReply — a receipt from `addressee` to
    // `sender` would otherwise look like a prior work outbound.
    const isReceipt = earlier.data['type'] === 'read';
    const sentByAddressee = earlier.data['from'] === addressee;
    if (isReceipt || !sentByAddressee) continue;
    if (effectiveKind(channelMessages, earlier) !== 'work') continue;
    if (namesActor(recipients(earlier.data['to']), sender)) return true;
  }
  return false;
}
278
+
127
279
  function composeSystemPrompt(actorPrompt: string): string {
128
280
  return [protocolPrompt, actorPrompt]
129
281
  .filter((p) => p.length > 0)
@@ -145,7 +297,12 @@ interface CliResult {
145
297
  stderr: string;
146
298
  }
147
299
 
148
- function invokeCli(cli: string, systemPrompt: string, userMessage: string): Promise<CliResult> {
300
+ function invokeCli(
301
+ cli: string,
302
+ systemPrompt: string,
303
+ userMessage: string,
304
+ actorName: string,
305
+ ): Promise<CliResult> {
149
306
  return new Promise((res) => {
150
307
  const fullPrompt = `${systemPrompt}\n\n---\n\n${userMessage}`;
151
308
  const parts = tokenizeCli(cli);
@@ -153,14 +310,34 @@ function invokeCli(cli: string, systemPrompt: string, userMessage: string): Prom
153
310
  res({ status: 1, stdout: '', stderr: 'tokenized cli is empty' });
154
311
  return;
155
312
  }
156
- const child = spawn(parts[0], parts.slice(1), { stdio: ['pipe', 'pipe', 'pipe'] });
313
+ // detached: true creates a new process group so we can SIGKILL the
314
+ // group (not just the parent) on timeout — orphan children writing
315
+ // to the transport after parent SIGKILL was an observed alpha.5 hazard.
316
+ // Env: CROSSTALK_DISPATCH_ACTOR tells send.ts what to use as --from when
317
+ // the dispatched actor invokes `crosstalk send` without explicit --from.
318
+ const child = spawn(parts[0], parts.slice(1), {
319
+ stdio: ['pipe', 'pipe', 'pipe'],
320
+ detached: true,
321
+ env: { ...process.env, CROSSTALK_DISPATCH_ACTOR: actorName },
322
+ });
157
323
  let stdout = '';
158
324
  let stderr = '';
159
325
  let resolved = false;
160
326
  const timeout = setTimeout(() => {
161
327
  if (resolved) return;
162
328
  resolved = true;
163
- child.kill('SIGKILL');
329
+ // SIGKILL the process group (negative pid) so any children the actor
330
+ // spawned (e.g. crosstalk send subprocesses) die with the parent.
331
+ // Fallback to single-pid kill if the group signal fails (some envs).
332
+ try {
333
+ if (typeof child.pid === 'number') {
334
+ process.kill(-child.pid, 'SIGKILL');
335
+ } else {
336
+ child.kill('SIGKILL');
337
+ }
338
+ } catch {
339
+ try { child.kill('SIGKILL'); } catch { /* already dead */ }
340
+ }
164
341
  res({ status: 124, stdout, stderr: stderr + '\n[timeout]' });
165
342
  }, 5 * 60_000);
166
343
  child.stdout.on('data', (d) => { stdout += d.toString(); });
@@ -193,14 +370,19 @@ function invokeCli(cli: string, systemPrompt: string, userMessage: string): Prom
193
370
  function writeReply(
194
371
  channelUuid: string,
195
372
  fromActor: string,
196
- toActor: string,
373
+ toActor: string | string[],
197
374
  body: string,
198
375
  ): void {
199
376
  const ts = now();
200
377
  const dir = join(transportRoot, 'data', 'channels', channelUuid, ts.pathDate);
201
378
  mkdirSync(dir, { recursive: true });
379
+ // Auto-replies emitted via stdout are `kind: result` by default — the actor
380
+ // is answering, not initiating new work. Recipients only wake on a result if
381
+ // they previously asked the sender for work in this channel (reply
382
+ // causality, see activation rule below). Actors that want to dispatch new
383
+ // work do so explicitly via `crosstalk send --kind work`.
202
384
  const content = serializeFrontmatter(
203
- { from: fromActor, to: toActor, type: 'text', timestamp: ts.iso },
385
+ { from: fromActor, to: toActor, type: 'text', kind: 'result', timestamp: ts.iso },
204
386
  body,
205
387
  );
206
388
  writeFileSync(join(dir, messageFilename(ts)), content);
@@ -225,16 +407,76 @@ function writeReadReceipt(
225
407
  interface PendingDispatch {
226
408
  actorName: string;
227
409
  channelUuid: string;
228
- msg: ChannelMessage;
229
- from: string;
410
+ msgs: ChannelMessage[]; // all unread messages addressed to this actor in this channel
230
411
  tiers: HostActorTiers;
231
412
  }
232
413
 
414
// Sender of a message: its `from` field when that is a string (an empty
// string is returned unchanged), otherwise the placeholder 'unknown'.
function messageSender(msg: ChannelMessage): string {
  const from = msg.data['from'];
  if (typeof from === 'string') return from;
  return 'unknown';
}
417
+
418
// Builds the user prompt for one dispatch. A single-message batch is passed
// through as the bare message body. Any other batch gets a header line
// followed, per message, by a `--- Message K of N (...) ---` delimiter
// (sender, relPath, and the timestamp when it is a non-empty string) and the
// message body, all joined by blank lines.
function formatBatchedUserMessage(msgs: ChannelMessage[]): string {
  if (msgs.length === 1) return msgs[0].body;
  const total = msgs.length;
  const sections: string[] = [
    `You have ${total} new messages in this channel. Process them collectively and reply once.`,
  ];
  msgs.forEach((m, index) => {
    const rawTs = m.data['timestamp'];
    const stamp = typeof rawTs === 'string' ? rawTs : '';
    const tsSuffix = stamp ? `, ts: ${stamp}` : '';
    sections.push(
      `--- Message ${index + 1} of ${total} (from: ${messageSender(m)}, ref: ${m.relPath}${tsSuffix}) ---`,
    );
    sections.push(m.body);
  });
  return sections.join('\n\n');
}
431
+
432
// Split a channel's pending messages (already sorted by relPath) into
// contiguous batches sized for the actor's concurrency. Contiguous (not
// round-robin) so each batch's highest relPath is monotone across batches —
// the cursor can advance after the dispatch loop's per-batch writes without
// leaving a gap that would re-dispatch on the next tick.
//
// When pending fits within concurrency, every batch is a single message
// (preserves parallel fan-out — junior-developer with count: 10 and 10
// pending fan-out messages dispatches 10 parallel CLI invocations of 1
// message each). When pending exceeds concurrency, pending collapses into
// ~concurrency batches of ceil(N/concurrency) messages each (preserves the
// fan-in collapse — concierge with count: 1 and 10 pending replies
// dispatches 1 invocation of 10 messages).
//
// Edge cases:
//   - no messages           → no batches (never a zero-message batch)
//   - concurrency NaN or ≤1 → one batch holding every message
//
// Returns the batches in order; concatenating them yields `msgs`.
function splitForConcurrency(
  msgs: ChannelMessage[],
  concurrency: number,
): ChannelMessage[][] {
  // A zero-message batch would make the dispatcher read msgs[0] of nothing.
  if (msgs.length === 0) return [];
  // NaN fails every comparison, so `concurrency <= 1` alone lets it through;
  // the chunk size would then be NaN and the loop would emit a single EMPTY
  // batch, silently dropping every pending message.
  if (Number.isNaN(concurrency) || concurrency <= 1 || msgs.length === 1) {
    return [msgs];
  }
  const chunkSize = Math.max(1, Math.ceil(msgs.length / concurrency));
  const out: ChannelMessage[][] = [];
  for (let i = 0; i < msgs.length; i += chunkSize) {
    out.push(msgs.slice(i, i + chunkSize));
  }
  return out;
}
457
+
458
// Distinct senders across a batch, in order of first appearance. Messages
// whose sender resolves to the 'unknown' placeholder are left out.
function distinctSenders(msgs: ChannelMessage[]): string[] {
  // A Set iterates in insertion order, which keeps first-appearance order.
  const unique = new Set<string>();
  for (const m of msgs) {
    const sender = messageSender(m);
    if (sender === 'unknown') continue;
    unique.add(sender);
  }
  return Array.from(unique);
}
470
+
233
471
  async function dispatchOne(p: PendingDispatch): Promise<boolean> {
234
- // Resolve CLI per-message message frontmatter may request a specific
235
- // tier via `tier: <name>`. Falls back to first declared tier.
236
- const preferredTier = typeof p.msg.data['tier'] === 'string'
237
- ? (p.msg.data['tier'] as string)
472
+ // Tier resolution uses the first message's `tier:` hint (if any). Batched
473
+ // dispatches assume homogeneous tier preference within an (actor, channel)
474
+ // pairing — true for fan-in (all peer replies omit tier) and for explicit
475
+ // single-message dispatches alike.
476
+ const firstMsg = p.msgs[0];
477
+ const lastMsg = p.msgs[p.msgs.length - 1];
478
+ const preferredTier = typeof firstMsg.data['tier'] === 'string'
479
+ ? (firstMsg.data['tier'] as string)
238
480
  : undefined;
239
481
  let resolved;
240
482
  try {
@@ -257,11 +499,17 @@ async function dispatchOne(p: PendingDispatch): Promise<boolean> {
257
499
  return false;
258
500
  }
259
501
  const cli = resolved.cli;
260
- if (isQuarantined(transportRoot, 'dispatch', p.actorName, p.channelUuid, p.msg.relPath)) {
502
+
503
+ // Quarantine check uses the LAST message's relPath as the batch's identity.
504
+ // Per-message quarantine semantics are preserved because batch boundaries
505
+ // align with cursor checkpoints; if a single message in a batch keeps
506
+ // failing, the cursor never advances past it and it surfaces as a singleton
507
+ // batch on the next tick.
508
+ if (isQuarantined(transportRoot, 'dispatch', p.actorName, p.channelUuid, lastMsg.relPath)) {
261
509
  log('dispatch_skipped_quarantined', {
262
510
  actor: p.actorName,
263
511
  channel: p.channelUuid.slice(0, 8),
264
- msg: p.msg.relPath,
512
+ msg: lastMsg.relPath,
265
513
  });
266
514
  return false;
267
515
  }
@@ -269,10 +517,17 @@ async function dispatchOne(p: PendingDispatch): Promise<boolean> {
269
517
  log('dispatch', {
270
518
  actor: p.actorName,
271
519
  channel: p.channelUuid.slice(0, 8),
272
- msg: p.msg.relPath,
520
+ batch_size: p.msgs.length,
521
+ first_msg: firstMsg.relPath,
522
+ last_msg: lastMsg.relPath,
273
523
  });
274
524
 
275
- writeReadReceipt(p.channelUuid, p.actorName, p.from, p.msg.relPath);
525
+ // Read receipt per message — preserves the audit trail (each original
526
+ // message gets exactly one receipt) and keeps the stale-receipt sweep
527
+ // correct.
528
+ for (const m of p.msgs) {
529
+ writeReadReceipt(p.channelUuid, p.actorName, messageSender(m), m.relPath);
530
+ }
276
531
 
277
532
  let profile;
278
533
  try {
@@ -296,7 +551,8 @@ async function dispatchOne(p: PendingDispatch): Promise<boolean> {
296
551
  }
297
552
 
298
553
  const systemPrompt = composeSystemPrompt(profile.systemPrompt);
299
- const result = await invokeCli(cli, systemPrompt, p.msg.body);
554
+ const userMessage = formatBatchedUserMessage(p.msgs);
555
+ const result = await invokeCli(cli, systemPrompt, userMessage, p.actorName);
300
556
 
301
557
  if (result.status !== 0) {
302
558
  const r = writeDlqEntry(
@@ -304,12 +560,13 @@ async function dispatchOne(p: PendingDispatch): Promise<boolean> {
304
560
  'dispatch',
305
561
  p.actorName,
306
562
  p.channelUuid,
307
- p.msg.relPath,
563
+ lastMsg.relPath,
308
564
  `cli exit=${result.status}\n${result.stderr.slice(0, 1000)}`,
309
565
  );
310
566
  log('dispatch_failed', {
311
567
  actor: p.actorName,
312
568
  channel: p.channelUuid.slice(0, 8),
569
+ batch_size: p.msgs.length,
313
570
  dlq_id: r.id,
314
571
  attempts: r.attempts,
315
572
  quarantined: r.quarantined,
@@ -320,12 +577,25 @@ async function dispatchOne(p: PendingDispatch): Promise<boolean> {
320
577
 
321
578
  const reply = result.stdout.trim();
322
579
  if (reply.length === 0) {
580
+ // Empty stdout on a multi-message batch is treated as success — the
581
+ // actor likely routed via `crosstalk send` and has nothing to add as
582
+ // an auto-reply. For a single-message batch we keep the prior DLQ
583
+ // semantics: a single dispatched message that produces no reply is a
584
+ // protocol violation.
585
+ if (p.msgs.length > 1) {
586
+ log('dispatch_batch_silent_ok', {
587
+ actor: p.actorName,
588
+ channel: p.channelUuid.slice(0, 8),
589
+ batch_size: p.msgs.length,
590
+ });
591
+ return true;
592
+ }
323
593
  const r = writeDlqEntry(
324
594
  transportRoot,
325
595
  'dispatch',
326
596
  p.actorName,
327
597
  p.channelUuid,
328
- p.msg.relPath,
598
+ lastMsg.relPath,
329
599
  'cli returned empty reply',
330
600
  );
331
601
  log('dispatch_empty_reply', {
@@ -338,7 +608,14 @@ async function dispatchOne(p: PendingDispatch): Promise<boolean> {
338
608
  return false;
339
609
  }
340
610
 
341
- writeReply(p.channelUuid, p.actorName, p.from, reply);
611
+ // Auto-reply addressing: single-sender batches reply to that sender
612
+ // (preserves prior behavior). Multi-sender batches address all distinct
613
+ // senders so each peer sees the response.
614
+ const senders = distinctSenders(p.msgs);
615
+ const replyTo: string | string[] = senders.length <= 1
616
+ ? (senders[0] ?? messageSender(firstMsg))
617
+ : senders;
618
+ writeReply(p.channelUuid, p.actorName, replyTo, reply);
342
619
  return true;
343
620
  }
344
621
 
@@ -397,6 +674,13 @@ async function dispatchTick(): Promise<TickResult> {
397
674
  const tiers = host.actors[actorName];
398
675
  const concurrency = actorConcurrency(tiers);
399
676
 
677
+ // Mailbox batch-drain: for each channel, collect ALL unread messages
678
+ // addressed to this actor into a single PendingDispatch. This collapses
679
+ // fan-in O(N) into O(1) CLI invocations and prevents one actor's deep
680
+ // backlog from starving its peers in the (actor, channel) scan order.
681
+ // Read receipts and self-sent messages are filtered here — receipts
682
+ // are bookkeeping the actor already produced, and self-messages would
683
+ // create a wake-up loop.
400
684
  const pending: PendingDispatch[] = [];
401
685
  const channels = discoverChannels(transportRoot);
402
686
  for (const channelUuid of channels) {
@@ -412,27 +696,70 @@ async function dispatchTick(): Promise<TickResult> {
412
696
  post_cursor_msgs: post.length,
413
697
  });
414
698
 
699
+ const channelBatch: ChannelMessage[] = [];
415
700
  for (const msg of post) {
416
701
  const to = recipients(msg.data['to']);
417
702
  const from = typeof msg.data['from'] === 'string' ? msg.data['from'] : 'unknown';
418
- if (!to.includes(actorName) || from === actorName) {
703
+ const msgType = typeof msg.data['type'] === 'string' ? msg.data['type'] : 'text';
704
+ // Host-aware routing match. A recipient may target this actor
705
+ // either by bare name (`junior-developer` — broadcast to every
706
+ // host that declares the actor) or by `actor@host` (narrowed to
707
+ // a specific host). Bare-name match always succeeds when the
708
+ // actor name matches; @host match succeeds only when the host
709
+ // alias also matches this dispatcher's host. A recipient that
710
+ // names this actor but targets a different host is flagged as
711
+ // `host_routing_mismatch` so silent wrong-host routes are
712
+ // surfaced rather than dropped without trace. See concierge.md
713
+ // "Host-aware routing" + PROTOCOL.md.
714
+ const routing = matchHostRouting(to, actorName, host.alias);
715
+ if (!routing.addressed || from === actorName || msgType === 'read') {
716
+ if (routing.wrongHost) {
717
+ log('host_routing_mismatch', {
718
+ actor: actorName,
719
+ this_host: host.alias,
720
+ channel: channelUuid.slice(0, 8),
721
+ msg: msg.relPath,
722
+ to,
723
+ });
724
+ }
419
725
  writeCursor(transportRoot, actorName, channelUuid, msg.relPath);
420
726
  continue;
421
727
  }
422
- pending.push({ actorName, channelUuid, msg, from, tiers });
728
+ // Lifecycle activation rule. `work` always wakes. `result` wakes
729
+ // only if reply-causal — actor previously sent the sender a `work`
730
+ // in this channel. The kind used here is the runtime's INFERRED
731
+ // effective kind, not the actor's declared kind: a message that's
732
+ // causally a reply is treated as `result` even when an actor (or
733
+ // `crosstalk send`'s default) labelled it `work`, so a fan-in peer
734
+ // mislabeling its reply can't forge a wake-up loop. See PROTOCOL.md
735
+ // "Message kinds".
736
+ const kind = effectiveKind(messages, msg);
737
+ if (kind === 'result' && !hasPriorWork(messages, actorName, from, msg.relPath)) {
738
+ writeCursor(transportRoot, actorName, channelUuid, msg.relPath);
739
+ continue;
740
+ }
741
+ channelBatch.push(msg);
742
+ }
743
+ if (channelBatch.length > 0) {
744
+ const groups = splitForConcurrency(channelBatch, concurrency);
745
+ for (const g of groups) {
746
+ pending.push({ actorName, channelUuid, msgs: g, tiers });
747
+ }
423
748
  }
424
749
  }
425
750
 
751
+ // Concurrency now applies across (channel) batches, not individual
752
+ // messages. Each batch is one CLI invocation regardless of how many
753
+ // messages it carries. Cursor advances to the last message in the batch
754
+ // on success or skip. NOTE(review): the loop below also writes the cursor when
755
+ // dispatchOne() returns false (DLQ), so the cursor does not hold a failed batch back — confirm intent.
426
756
  for (let i = 0; i < pending.length; i += concurrency) {
427
757
  const batch = pending.slice(i, i + concurrency);
428
758
  const results = await Promise.all(batch.map((p) => dispatchOne(p)));
429
759
  for (let j = 0; j < batch.length; j++) {
430
- writeCursor(
431
- transportRoot,
432
- batch[j].actorName,
433
- batch[j].channelUuid,
434
- batch[j].msg.relPath,
435
- );
760
+ const p = batch[j];
761
+ const lastRelPath = p.msgs[p.msgs.length - 1].relPath;
762
+ writeCursor(transportRoot, p.actorName, p.channelUuid, lastRelPath);
436
763
  if (results[j]) didWork = true;
437
764
  }
438
765
  }
package/src/send.ts CHANGED
@@ -17,17 +17,40 @@ function flag(name: string): string | undefined {
17
17
  async function main(): Promise<void> {
18
18
  const channelUuid = flag('--channel');
19
19
  const to = flag('--to');
20
- const from = flag('--from') ?? 'steve';
20
+ // Sender identity precedence:
21
+ // 1. --from on the command line (explicit operator/actor choice)
22
+ // 2. CROSSTALK_DISPATCH_ACTOR env var (set by dispatch.ts when it spawns
23
+ // an actor's CLI — so the actor's outbound messages route as itself,
24
+ // not as the operator). Fixes the alpha.5 finding where concierge's
25
+ // fan-out messages went out as `from=steve` because send.ts fell
26
+ // through to USER instead.
27
+ // 3. $USER (interactive operator default)
28
+ // 4. literal 'steve' as last resort
29
+ const from = flag('--from')
30
+ ?? process.env['CROSSTALK_DISPATCH_ACTOR']
31
+ ?? process.env['USER']
32
+ ?? 'steve';
21
33
  const tier = flag('--tier');
34
+ // Lifecycle kind. `work` (default) — recipient is being asked to act, will
35
+ // wake on receipt. `result` — informational reply, wakes the recipient only
36
+ // if it previously asked the sender for work (reply causality). See
37
+ // PROTOCOL.md "Message kinds". Proactive sends default to `work`; the
38
+ // runtime's auto-reply path defaults to `result`.
39
+ const kind = flag('--kind') ?? 'work';
22
40
  const body = argv[argv.length - 1];
23
41
 
24
42
  if (!channelUuid || !to || !body || body.startsWith('--')) {
25
43
  console.error(
26
- 'Usage: npx tsx runtime/src/send.ts --channel <uuid> --to <actor> [--from <actor>] [--tier <name>] "<message body>"',
44
+ 'Usage: crosstalk send --channel <uuid> --to <actor> [--from <actor>] [--tier <name>] [--kind work|result] "<message body>"',
27
45
  );
28
46
  process.exit(1);
29
47
  }
30
48
 
49
+ if (kind !== 'work' && kind !== 'result') {
50
+ console.error(`Invalid --kind '${kind}'. Must be 'work' or 'result'.`);
51
+ process.exit(1);
52
+ }
53
+
31
54
  await withLock('dispatch', async () => {
32
55
  const ts = now();
33
56
  const dir = join(transportRoot, 'data', 'channels', channelUuid, ts.pathDate);
@@ -37,6 +60,7 @@ async function main(): Promise<void> {
37
60
  from,
38
61
  to,
39
62
  type: 'text',
63
+ kind,
40
64
  timestamp: ts.iso,
41
65
  };
42
66
  if (tier) frontmatter.tier = tier;
@@ -50,8 +50,30 @@ This does NOT apply to:
50
50
 
51
51
  If you are *authoring* an actor profile for a compute role, write the system prompt to require evidence. Without that requirement, downstream validators can't distinguish shortcut results from honest ones — and shortcut results silently corrupt aggregates.
52
52
 
53
+ ### PRNG requirement for compute tasks
54
+
55
+ When a compute task requires pseudo-random numbers (Monte Carlo simulations, sampling, statistical estimation, etc.), **do not pick an ad-hoc PRNG.** Many languages' default `random()` functions, naive LCGs (`a*seed + c mod m` with arbitrary constants), or homegrown XOR-shift schemes produce streams with statistical defects that bias aggregates — particularly when multiple instances run with adjacent seeds and produce correlated streams.
56
+
57
+ Use one of these:
58
+
59
+ - **JavaScript/Node:** `mulberry32(seed)` (one canonical implementation: `function mulberry32(a){return function(){a|=0;a=a+0x6D2B79F5|0;var t=Math.imul(a^a>>>15,1|a);t=t+Math.imul(t^t>>>7,61|t)^t;return((t^t>>>14)>>>0)/4294967296}}`). Derive distinct seeds per instance via a large multiplier (e.g., `instance_index * 1000003`) to decorrelate streams.
60
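+ - **Python:** `random.Random(seed)` (a separate generator object per instance, NOT the module-level `random.random`). For higher-quality requirements use `numpy.random.Generator(numpy.random.PCG64(seed))`. `secrets.SystemRandom()` is also statistically sound, but it draws from OS entropy and cannot be seeded, so use it only when the run does not need to be reproducible.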
61
+ - **Other:** any well-documented PRNG that passes TestU01's BigCrush battery (PCG, xoshiro256++, ChaCha20-based, etc.), seeded explicitly.
62
+
63
+ If you are unsure, ask. Better to ask once than to pollute an aggregate with biased samples.
64
+
53
65
  **Worked example from this protocol's UAT.** 10 junior-developer instances were given "throw 100M darts" with a loose prompt. 7 ran the canonical Monte Carlo loop and produced statistically clean results. 1 produced an estimate 633σ from the expected mean — almost certainly a shortcut. 2 others produced identical wrong values, suggesting a shared shortcut path. When the same 10 were re-prompted with "show your code" plus literal pseudocode, all 10 produced canonical implementations and clean results. The senior validator caught the original outlier; without it, the aggregate would have been silently corrupted.
54
66
 
67
+ **Second UAT worked example (PRNG-quality).** A subsequent 10-junior fan-out without PRNG guidance got 5/10 valid: instance 1 used a 16-bit-truncated LCG (π≈3.032, badly broken); instances 2/5/8 picked the same `a=1103515245 / 0x7fffffff` LCG and produced **identical** inside-counts from adjacent seeds (correlated streams); instance 9 picked a third biased option. After moving the PRNG requirement into the spec, the same 10-junior fan-out hit 10/10 valid (every instance used the prescribed mulberry32 with the prescribed seed formula). This is why this section exists.
68
+
69
+ ### Echo assigned identifiers verbatim
70
+
71
+ If an orchestrator assigns you an identifier — instance number, seed value, task token, anything specific — **echo it back exactly as given.** Do not paraphrase, renumber, substitute, or pick your own. If you were asked to be instance 8 with seed 8000024, your reply names instance 8 with seed 8000024.
72
+
73
+ This rule is for *your* honesty about your own identity. **The orchestrator does not depend on it.** Concierge (and any other orchestrator) reconciles fan-in by the relPath of the dispatched work message, not by what you write in your body — so a lie about your identifier doesn't break the system; it just makes the log harder to read and you look like an unreliable peer. The runtime is robust to peer mislabeling by design (the alpha.7 multi-host harness verified this), but reliable peers cost less to debug.
74
+
75
+ If you genuinely cannot tell what your assigned identifier was (e.g. the orchestrator's prompt was ambiguous), say so explicitly rather than invent one. Inventing an identifier and hoping the orchestrator sorts it out is the worst case.
76
+
55
77
  ## Available tools
56
78
 
57
79
  You have shell access. You can invoke these tools any time you decide they help with your reply. All of them run from the transport root (the current working directory). The tools are documented here so you can pick the right one from natural-language intent — e.g. "check what the dispatch state looks like" → `crosstalk status`.
@@ -130,6 +152,51 @@ Subchannels exist for focused work — a channel with a `parent:` field in its `
130
152
 
131
153
  When dispatch processes your message, it writes a `type: read` receipt before invoking you and a `type: text` reply after you respond. You only ever produce the text reply — the read receipt is the runtime's signal that a message was claimed. You can rely on read receipts when reasoning about whether a previous message was actually processed.
132
154
 
155
+ Read receipts do NOT themselves trigger a dispatch. They are bookkeeping artefacts; the runtime filters them out of the dispatch scan so a receipt addressed to you will not wake you. If you want a peer to act on something, send a `type: text` message via `crosstalk send`.
156
+
157
+ ## Batched delivery
158
+
159
+ When the runtime activates you, it hands you **all** the unread messages addressed to you in the same channel — not one at a time. If there are N pending messages, you'll see one prompt containing all N, prefixed by `--- Message K of N (from: ..., ref: ...) ---` delimiters. Process them collectively and emit a single reply that addresses the batch.
160
+
161
+ This is the mailbox semantics that aggregating actors depend on: a coordinator that fans out to 10 peers wakes once after all 10 reply, sees all 10 results in one prompt, and dispatches the aggregator exactly once. It is the same actor model as Erlang / Akka — one activation drains the mailbox.
162
+
163
+ If your work for the batch is fully routed via `crosstalk send` (e.g. you forwarded results to an aggregator) and you have nothing further to say, you may leave stdout empty — for multi-message batches this is treated as success, not a DLQ entry. For a single-message dispatch, empty stdout remains a protocol violation (you were addressed; respond).
164
+
165
+ ## Message kinds
166
+
167
+ Every message carries a `kind:` field describing its purpose. Two kinds are defined:
168
+
169
+ | Kind | Meaning | Wakes recipient? |
170
+ |---|---|---|
171
+ | `work` | A task. Recipient is being asked to act. | **Always.** |
172
+ | `result` | The output of work. Informational. | **Iff reply-causal** — the recipient previously sent the sender a `kind: work` in this channel. |
173
+
174
+ Plus `type: read` (receipts; never wake — already documented above).
175
+
176
+ **The kind is RUNTIME-INFERRED, not authoritative as declared.** When the dispatcher considers waking an actor on a message, it does not trust the declared `kind:` field directly. Instead, it computes the *effective* kind from the channel's interaction graph: if some earlier message in the channel was sent FROM one of this message's recipients TO this message's sender with declared kind `work`, then this message is *causally a reply* and is treated as `kind: result` regardless of how it was labelled. Only genuine unsolicited messages (no prior opposite-direction work) are treated as `work`.
177
+
178
+ This is the load-bearing principle of the dispatch layer: **the runtime derives message semantics from the interaction graph; it never trusts an actor's declaration.** Actors are fallible declarers — LLMs given two valid reply paths (stdout vs. `crosstalk send`) pick between them probabilistically, and `crosstalk send`'s `--kind work` default is the wrong tag when the actor is using `send` to reply. Inferring kind structurally neutralizes mislabels at the dispatcher level so a fan-in peer mis-tagging its reply can't forge a wake-up loop.
179
+
180
+ **Reply causality:** a `kind: result` (declared or inferred) wakes its addressee **only if** the addressee previously sent a `kind: work` (effective kind) to the result's sender in the same channel. Replies wake the asker. If you never asked, a result addressed to you is informational and the runtime will not activate you on it.
181
+
182
+ This is what stops fan-in patterns from oscillating. When a coordinator processes a batch of N peer results and emits a stdout reply, the runtime auto-addresses that reply to the peers — but none of them woke the coordinator with a `work`, so the reply doesn't wake them back. Loop closed at the dispatcher level, no actor profile heroics required.
183
+
184
+ **Defaults:**
185
+ - `crosstalk send` without `--kind` produces `kind: work` (proactive dispatches are tasks).
186
+ - Runtime auto-replies from stdout are `kind: result` (the actor is answering, not initiating).
187
+ - Override either with `crosstalk send --kind work` or `--kind result`.
188
+
189
+ Because kind is runtime-inferred, getting the declared field "wrong" rarely hurts — the effective-kind computation usually fixes it. But declaring it correctly still helps log readability and DLQ diagnostics, and remains the right thing to do.
190
+
191
+ **Backwards compat:** legacy messages without a `kind:` field are treated as `kind: work` declared; effective-kind inference still applies. Existing transports continue to function; older history doesn't need to be re-tagged.
192
+
193
+ **When to use which (still useful as intent, even if non-load-bearing):**
194
+ - Dispatching a peer to do something → `--kind work` (or omit; default).
195
+ - Forwarding finished output to whoever asked (operator, aggregator) → `--kind result` (or rely on the stdout auto-reply).
196
+ - Broadcasting an FYI you don't want peers to act on → `--kind result` addressed to peers who never asked you for work; they won't wake.
197
+
198
+ **Known v1 limitation:** the causality scan is conservative on multi-recipient `to:` lists — if ANY recipient previously tasked the sender, the message is treated as causally a reply for ALL recipients. In practice the per-addressee causality check (only the actual asker wakes on a result) compensates correctly. The edge case worth knowing: in a genuine multi-recipient fan-out, if one recipient happens to have previously sent the sender unrelated work, wakes are suppressed for the other recipients as well. Not observed in Monte Carlo; revisit if it surfaces in a real topology.
199
+
133
200
  ## Other actors
134
201
 
135
202
  - Host files at `hosts/<alias>.md` declare which actors run on which machines.
@@ -139,6 +206,19 @@ When dispatch processes your message, it writes a `type: read` receipt before in
139
206
 
140
207
  A host file can declare `count: N` under an actor's tier. That means the dispatch loop may spawn up to N concurrent CLI invocations of that actor per tick — useful for fan-out workloads (e.g. 10 junior-developer instances processing 10 separate messages in parallel). You behave the same regardless of which slot you occupy.
141
208
 
209
+ ## Host-aware routing
210
+
211
+ When the transport is shared by multiple dispatchers on different hosts (each running its own `hosts/<alias>.md` declaration), the `to:` field accepts two forms:
212
+
213
+ - **Bare actor name** — `to: junior-developer`. Broadcast to every host whose host file declares this actor. Every matching dispatcher will wake an instance on every such message.
214
+ - **Actor@host** — `to: junior-developer@cachy`. Narrowed to the named host only. Only the dispatcher whose host file's `alias:` equals `cachy` will wake an instance; others see the message addressed to a different host and skip it.
215
+
216
+ The runtime parses recipients by splitting on the first `@` — the part before is the actor name, the part after (if present) is the target host alias. Causality scans (the `effectiveKind` / `hasPriorWork` activation logic) ignore the host suffix; only the actual addressing decision honors it.
217
+
218
+ If a dispatcher sees a message that names its actor but targets a different host, it logs `host_routing_mismatch` with the recipient list, this host's alias, and the message path — so silent wrong-host drops are surfaced rather than disappearing without trace.
219
+
220
+ **When to use which.** Use bare names for stateless work-pool patterns where any matching host is fine. Use `@host` when the orchestration depends on which machine runs the work (resource locality, host-specific state, validating cross-host behavior). Profile authors orchestrating fan-out across hosts should prefer `@host` so the topology is explicit in the message frontmatter.
221
+
142
222
  ## Failure handling and where to look
143
223
 
144
224
  There are two persistent failure logs in the transport:
@@ -103,3 +103,28 @@ You receive requests from any participant — human or machine — and act on th
103
103
  - **Empty hosts directory:** if `manifest/hosts/` is absent or empty, proceed normally — routing table is empty, use bare actor names only.
104
104
 
105
105
  You do not write production code. You do not make architectural decisions.
106
+
107
+ **Orchestration termination — this is load-bearing.** When your job in a given dispatch is to ROUTE work (fan-out to specialists, forward replies, dispatch follow-ups), the right pattern is:
108
+
109
+ 1. Read the incoming message and decide what to dispatch.
110
+ 2. Write the dispatch messages via `crosstalk send` (one per recipient).
111
+ 3. Write a short stdout reply to the original sender confirming what you dispatched (e.g. "dispatched 10 Monte Carlo runs to junior-developer; will forward aggregation when results arrive").
112
+ 4. **EXIT immediately. Do NOT poll the channel waiting for replies.** Do NOT loop reading messages "until the work is done." The runtime re-dispatches you the moment any new message addresses you. Your job in any single dispatch is one turn of work, not a long-running orchestration loop.
113
+
114
+ If you stay alive after step 3 — polling, waiting, "checking back" — you burn the dispatch wall-clock budget until SIGKILL fires at the timeout. That looks like a hang to the operator and pollutes the DLQ even though your useful work already landed. Exit promptly.
115
+
116
+ When the peers reply, the runtime dispatches you a NEW turn. Use THAT turn to read replies and continue the work. Each step is its own dispatch. You don't have to keep state in memory across them — the channel IS the state.
117
+
118
+ **Routing topology — do not fragment fan-outs into subchannels or chain replies through other actors.** When you orchestrate N peers, hold this shape:
119
+
120
+ - **Same channel.** Dispatch all N peers in the SAME channel as the original request. The runtime routes by `to:` field; you do not need a subchannel for isolation. Subchannels are for the operator's narrative organization (e.g. "weekly planning", "incident review"), not for orchestration topology. Creating a fan-out subchannel makes the cursor space sprawl and complicates aggregation.
121
+ - **Peers reply to YOU, not to downstream consumers.** When you dispatch peers, include explicit reply-to guidance in each message body (e.g. "reply to concierge with your result; do NOT send your result to any other actor"). You are the collection point. If peers also send copies to a downstream aggregator, that aggregator will be re-dispatched once per peer message — wasting calls and producing redundant aggregations.
122
+ - **Aggregate exactly once — reconcile by dispatched message identity, never by what the peer says.** Wait until you've received N replies, then dispatch the aggregator (e.g. senior-software-engineer) in a SINGLE message containing the collected results. Never N messages, never one per peer reply. **How you count "N replies" is load-bearing:**
123
+ - **Track the relPaths of the N work messages YOU dispatched.** When you call `crosstalk send --to <peer> --kind work ...` the tool prints `Sent: <relPath>` — those relPaths (e.g. `2026/06/09/123802614Z-5a16ec07.md`) are your dispatched-identity set. Scan the channel directory for them so you have a precise list of what you asked for.
124
+ - **Count replies by causal predecessor, not by peer-reported content.** A "reply to dispatch X" is a `type: text` message from one of X's recipients (host-agnostic match — `junior-developer` matches `to: junior-developer@cachy`) addressed back to you, and landing AFTER X in `relPath` order. The runtime's reply-causality fix from alpha.6 enforces this same notion at the activation level — you're applying the same reasoning at the application level.
125
+ - **Do NOT count by peer-reported seed, instance number, content fingerprint, or any other identifier the peer wrote in its body.** LLM peers will lie — they will report seeds you didn't assign, claim to be instance 8 while computing for seed 7000021, or echo what they think you wanted to hear. *"Would this still work if every peer lied about what it is?"* Yes — when reconciliation is by your dispatched relPath, not by what the peer claims about itself.
126
+ - **A peer that sends multiple replies counts as one** if you only dispatched it one work message — pick its latest causally-paired reply and discard the rest. The runtime can't dedupe these for you; you must.
127
+ - When the dispatched-relPath set is fully covered (every dispatched relPath has at least one causally-paired reply), aggregate. Until then, this dispatch's job is "wait" — exit and let the runtime re-dispatch you when more replies land.
128
+ - **Forward the aggregator's final reply to the operator — explicitly, via `crosstalk send`.** When the aggregator replies to you, your stdout auto-reply goes back to the *aggregator*, NOT to the operator — so a stdout-only response means the operator never sees the answer. On the dispatch turn where you read the aggregator's final reply you MUST run `crosstalk send --to <original-requester> --kind result "<the aggregator's final answer, quoted in full>"` so the operator actually receives it. The original requester is the `from:` of the kickoff message that started this orchestration (e.g. `steve`). Do this, then exit. Delivering the final only as a reply to the aggregator is an orchestration failure — the operator asked the question and must get the answer.
129
+
130
+ If you find yourself dispatching the aggregator multiple times for a single orchestration task, you have the topology wrong — peers must reply to you, you must collect, and you must dispatch the aggregator exactly once.