@cordfuse/crosstalk 5.0.0-alpha.3 → 5.0.0-alpha.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/chat.ts +6 -4
- package/src/dispatch.ts +340 -35
- package/src/send.ts +39 -12
- package/src/transport.ts +29 -2
- package/template/upstream/PROTOCOL.md +73 -14
- package/template/upstream/actors/concierge.md +20 -0
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@cordfuse/crosstalk",
|
|
3
|
-
"version": "5.0.0-alpha.3",
|
|
3
|
+
"version": "5.0.0-alpha.6",
|
|
4
4
|
"description": "Crosstalk runtime — async messaging between agents over git. The crosstalk CLI plus dispatch, send, attach, chat, and supporting tools.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"license": "MIT",
|
package/src/chat.ts
CHANGED
|
@@ -25,7 +25,7 @@ import { createInterface } from 'readline/promises';
|
|
|
25
25
|
import { spawnSync } from 'child_process';
|
|
26
26
|
import { now, messageFilename } from './filenames.js';
|
|
27
27
|
import { serializeFrontmatter, parseFrontmatter } from './frontmatter.js';
|
|
28
|
-
import { gitCommitAndPush
|
|
28
|
+
import { gitCommitAndPush } from './transport.js';
|
|
29
29
|
import { withLock } from './turnq.js';
|
|
30
30
|
|
|
31
31
|
const transportRoot = resolve(process.cwd());
|
|
@@ -132,9 +132,11 @@ async function sendMessage(body: string): Promise<void> {
|
|
|
132
132
|
`chat: ${fromName} -> ${toActor} in ${channelUuid!.slice(0, 8)}`,
|
|
133
133
|
);
|
|
134
134
|
if (!r.ok && r.error) {
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
135
|
+
// Same anti-pattern as send.ts: writing to errors/ from an operator-
|
|
136
|
+
// side command dirties the working tree and breaks subsequent
|
|
137
|
+
// git pull --rebase. Stay on stderr only.
|
|
138
|
+
const kind = r.committed ? 'push' : 'commit';
|
|
139
|
+
console.error(`(${kind} failed: ${r.error.slice(0, 200)} — message is local-only)`);
|
|
138
140
|
}
|
|
139
141
|
});
|
|
140
142
|
}
|
package/src/dispatch.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { resolve, join } from 'path';
|
|
1
|
+
import { resolve, join, dirname } from 'path';
|
|
2
2
|
import { spawn } from 'child_process';
|
|
3
3
|
import {
|
|
4
4
|
mkdirSync,
|
|
@@ -10,6 +10,21 @@ import {
|
|
|
10
10
|
closeSync,
|
|
11
11
|
} from 'fs';
|
|
12
12
|
import { watch } from 'fs/promises';
|
|
13
|
+
import { fileURLToPath } from 'url';
|
|
14
|
+
|
|
15
|
+
// Read runtime version from the installed package's package.json at startup
|
|
16
|
+
// so dispatch_start logs and heartbeat content always match the actual
|
|
17
|
+
// installed @cordfuse/crosstalk version. Avoids hand-editing on every release.
|
|
18
|
+
const RUNTIME_VERSION: string = (() => {
|
|
19
|
+
try {
|
|
20
|
+
const thisFileDir = dirname(fileURLToPath(import.meta.url));
|
|
21
|
+
const pkgPath = join(thisFileDir, '..', 'package.json');
|
|
22
|
+
const pkg = JSON.parse(readFileSync(pkgPath, 'utf-8')) as { version?: string };
|
|
23
|
+
return pkg.version ?? 'unknown';
|
|
24
|
+
} catch {
|
|
25
|
+
return 'unknown';
|
|
26
|
+
}
|
|
27
|
+
})();
|
|
13
28
|
import {
|
|
14
29
|
findHostFile,
|
|
15
30
|
loadActorProfile,
|
|
@@ -53,6 +68,16 @@ const logFile = flag('--log-file');
|
|
|
53
68
|
const MAX_BACKOFF_MULTIPLIER = 10; // cap: pollSeconds * 10
|
|
54
69
|
const BACKOFF_GRACE = 2; // first N failures don't trigger backoff
|
|
55
70
|
|
|
71
|
+
// Per-tick heal: when N consecutive infra failures pile up, the dispatch
|
|
72
|
+
// loop is stuck in a deadlock that entrypoint's boot-time auto-recovery
|
|
73
|
+
// can't break (because dispatch is already running). At HEAL_THRESHOLD
|
|
74
|
+
// consecutive failures, attempt a `git fetch && reset --hard origin/<branch>
|
|
75
|
+
// && clean -fd` from inside the tick loop. Mirrors the entrypoint logic.
|
|
76
|
+
// Throttled — won't reattempt until HEAL_THRESHOLD further consecutive
|
|
77
|
+
// failures pile up after a heal, to avoid heal-loop-storms.
|
|
78
|
+
const HEAL_THRESHOLD = 5;
|
|
79
|
+
let lastHealAtFailureCount = 0;
|
|
80
|
+
|
|
56
81
|
// Stale-read-receipt sweep config — runs at most every SWEEP_INTERVAL_MS
|
|
57
82
|
// of wall-clock to surface read receipts that never produced a reply
|
|
58
83
|
// (indicates dispatch crashed mid-tick or CLI hung silently).
|
|
@@ -80,7 +105,7 @@ function writeHeartbeat(): void {
|
|
|
80
105
|
try {
|
|
81
106
|
const dir = join(transportRoot, '.turnq');
|
|
82
107
|
mkdirSync(dir, { recursive: true });
|
|
83
|
-
const data = { ts: new Date().toISOString(), pid: process.pid, version:
|
|
108
|
+
const data = { ts: new Date().toISOString(), pid: process.pid, version: RUNTIME_VERSION };
|
|
84
109
|
writeFileSync(join(dir, 'heartbeat'), JSON.stringify(data) + '\n');
|
|
85
110
|
} catch { /* best-effort */ }
|
|
86
111
|
}
|
|
@@ -99,6 +124,87 @@ function recipients(toField: unknown): string[] {
|
|
|
99
124
|
return [];
|
|
100
125
|
}
|
|
101
126
|
|
|
127
|
+
// Declared lifecycle kind for a message. `work` (default for legacy messages
|
|
128
|
+
// without the field) is the as-tagged intent. The runtime does NOT trust this
|
|
129
|
+
// value directly for the activation decision — see effectiveKind() below.
|
|
130
|
+
// Kept for use as the seed of the effective-kind computation.
|
|
131
|
+
function messageKind(msg: ChannelMessage): 'work' | 'result' {
|
|
132
|
+
const raw = msg.data['kind'];
|
|
133
|
+
return raw === 'result' ? 'result' : 'work';
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
// Is `msg` causally a reply to a prior ask? True iff some message strictly
|
|
137
|
+
// before `msg` was sent FROM one of `msg`'s recipients TO `msg`'s sender with
|
|
138
|
+
// declared kind `work`. If so, `msg` is that recipient's answer coming back —
|
|
139
|
+
// regardless of how its sender (a fallible LLM actor, or `crosstalk send`'s
|
|
140
|
+
// `work` default) labelled it.
|
|
141
|
+
//
|
|
142
|
+
// Conservative on multi-recipient `to:` lists: if ANY recipient previously
|
|
143
|
+
// tasked the sender, the message is treated as causally a reply for all
|
|
144
|
+
// recipients. The per-addressee asymmetry in hasPriorWork (below) compensates
|
|
145
|
+
// — only the recipient that actually asked wakes on it. Known v1 limitation:
|
|
146
|
+
// genuine multi-recipient fan-out where one recipient happens to have prior
|
|
147
|
+
// unrelated work to the sender will be demoted to result and suppress wakes
|
|
148
|
+
// for the other recipients. Not observed in Monte Carlo; revisit if it
|
|
149
|
+
// surfaces.
|
|
150
|
+
function isCausalReply(channelMessages: ChannelMessage[], msg: ChannelMessage): boolean {
|
|
151
|
+
const sender = typeof msg.data['from'] === 'string' ? msg.data['from'] : '';
|
|
152
|
+
if (!sender) return false;
|
|
153
|
+
const toList = recipients(msg.data['to']);
|
|
154
|
+
for (const m of channelMessages) {
|
|
155
|
+
if (m.relPath >= msg.relPath) break;
|
|
156
|
+
const mFrom = typeof m.data['from'] === 'string' ? m.data['from'] : '';
|
|
157
|
+
if (!toList.includes(mFrom)) continue;
|
|
158
|
+
if ((m.data['kind'] ?? 'work') === 'result') continue;
|
|
159
|
+
if (recipients(m.data['to']).includes(sender)) return true;
|
|
160
|
+
}
|
|
161
|
+
return false;
|
|
162
|
+
}
|
|
163
|
+
|
|
164
|
+
// Effective lifecycle kind. The runtime INFERS kind from the causality graph
|
|
165
|
+
// rather than trusting the declared field: a message that is causally a reply
|
|
166
|
+
// is a `result` even if it was labelled `work` (actors routinely report
|
|
167
|
+
// results via `crosstalk send`, which defaults to `work`, and that mislabel
|
|
168
|
+
// forges false reply-causality edges → wake-up loops). Genuine unsolicited
|
|
169
|
+
// tasks (kickoffs, fresh dispatches) have no prior opposite-direction work
|
|
170
|
+
// and keep their `work` kind. See PROTOCOL.md "Message kinds".
|
|
171
|
+
//
|
|
172
|
+
// This is the load-bearing principle the rest of the activation rule rides
|
|
173
|
+
// on: the dispatcher derives semantics from the interaction graph; it never
|
|
174
|
+
// trusts an actor's declaration.
|
|
175
|
+
function effectiveKind(channelMessages: ChannelMessage[], msg: ChannelMessage): 'work' | 'result' {
|
|
176
|
+
if (messageKind(msg) === 'result') return 'result';
|
|
177
|
+
return isCausalReply(channelMessages, msg) ? 'result' : 'work';
|
|
178
|
+
}
|
|
179
|
+
|
|
180
|
+
// Reply causality — does `addressee` have a prior `kind: work` outbound to
|
|
181
|
+
// `sender` somewhere in the channel's history strictly before `before`? If
|
|
182
|
+
// yes, an inbound `kind: result` from `sender` to `addressee` is the answer
|
|
183
|
+
// to that ask, and the addressee should wake on it. If no, the result is
|
|
184
|
+
// unsolicited from addressee's POV and is informational only.
|
|
185
|
+
//
|
|
186
|
+
// Uses effectiveKind (not messageKind) when checking prior messages — a
|
|
187
|
+
// mislabeled "work" reply from a prior peer would otherwise forge a false
|
|
188
|
+
// causality edge here, which was the ping-pong root.
|
|
189
|
+
//
|
|
190
|
+
// The channel is already sorted by relPath ascending in
|
|
191
|
+
// listChannelMessages(), so the scan walks chronologically.
|
|
192
|
+
function hasPriorWork(
|
|
193
|
+
channelMessages: ChannelMessage[],
|
|
194
|
+
addressee: string,
|
|
195
|
+
sender: string,
|
|
196
|
+
before: string,
|
|
197
|
+
): boolean {
|
|
198
|
+
for (const m of channelMessages) {
|
|
199
|
+
if (m.relPath >= before) break;
|
|
200
|
+
if (typeof m.data['from'] !== 'string' || m.data['from'] !== addressee) continue;
|
|
201
|
+
if (effectiveKind(channelMessages, m) !== 'work') continue;
|
|
202
|
+
const toList = recipients(m.data['to']);
|
|
203
|
+
if (toList.includes(sender)) return true;
|
|
204
|
+
}
|
|
205
|
+
return false;
|
|
206
|
+
}
|
|
207
|
+
|
|
102
208
|
function composeSystemPrompt(actorPrompt: string): string {
|
|
103
209
|
return [protocolPrompt, actorPrompt]
|
|
104
210
|
.filter((p) => p.length > 0)
|
|
@@ -120,7 +226,12 @@ interface CliResult {
|
|
|
120
226
|
stderr: string;
|
|
121
227
|
}
|
|
122
228
|
|
|
123
|
-
function invokeCli(cli: string, systemPrompt: string, userMessage: string): Promise<CliResult> {
|
|
229
|
+
function invokeCli(
|
|
230
|
+
cli: string,
|
|
231
|
+
systemPrompt: string,
|
|
232
|
+
userMessage: string,
|
|
233
|
+
actorName: string,
|
|
234
|
+
): Promise<CliResult> {
|
|
124
235
|
return new Promise((res) => {
|
|
125
236
|
const fullPrompt = `${systemPrompt}\n\n---\n\n${userMessage}`;
|
|
126
237
|
const parts = tokenizeCli(cli);
|
|
@@ -128,14 +239,34 @@ function invokeCli(cli: string, systemPrompt: string, userMessage: string): Prom
|
|
|
128
239
|
res({ status: 1, stdout: '', stderr: 'tokenized cli is empty' });
|
|
129
240
|
return;
|
|
130
241
|
}
|
|
131
|
-
|
|
242
|
+
// detached: true creates a new process group so we can SIGKILL the
|
|
243
|
+
// group (not just the parent) on timeout — orphan children writing
|
|
244
|
+
// to the transport after parent SIGKILL was an observed alpha.5 hazard.
|
|
245
|
+
// Env: CROSSTALK_DISPATCH_ACTOR tells send.ts what to use as --from when
|
|
246
|
+
// the dispatched actor invokes `crosstalk send` without explicit --from.
|
|
247
|
+
const child = spawn(parts[0], parts.slice(1), {
|
|
248
|
+
stdio: ['pipe', 'pipe', 'pipe'],
|
|
249
|
+
detached: true,
|
|
250
|
+
env: { ...process.env, CROSSTALK_DISPATCH_ACTOR: actorName },
|
|
251
|
+
});
|
|
132
252
|
let stdout = '';
|
|
133
253
|
let stderr = '';
|
|
134
254
|
let resolved = false;
|
|
135
255
|
const timeout = setTimeout(() => {
|
|
136
256
|
if (resolved) return;
|
|
137
257
|
resolved = true;
|
|
138
|
-
|
|
258
|
+
// SIGKILL the process group (negative pid) so any children the actor
|
|
259
|
+
// spawned (e.g. crosstalk send subprocesses) die with the parent.
|
|
260
|
+
// Fallback to single-pid kill if the group signal fails (some envs).
|
|
261
|
+
try {
|
|
262
|
+
if (typeof child.pid === 'number') {
|
|
263
|
+
process.kill(-child.pid, 'SIGKILL');
|
|
264
|
+
} else {
|
|
265
|
+
child.kill('SIGKILL');
|
|
266
|
+
}
|
|
267
|
+
} catch {
|
|
268
|
+
try { child.kill('SIGKILL'); } catch { /* already dead */ }
|
|
269
|
+
}
|
|
139
270
|
res({ status: 124, stdout, stderr: stderr + '\n[timeout]' });
|
|
140
271
|
}, 5 * 60_000);
|
|
141
272
|
child.stdout.on('data', (d) => { stdout += d.toString(); });
|
|
@@ -168,14 +299,19 @@ function invokeCli(cli: string, systemPrompt: string, userMessage: string): Prom
|
|
|
168
299
|
function writeReply(
|
|
169
300
|
channelUuid: string,
|
|
170
301
|
fromActor: string,
|
|
171
|
-
toActor: string,
|
|
302
|
+
toActor: string | string[],
|
|
172
303
|
body: string,
|
|
173
304
|
): void {
|
|
174
305
|
const ts = now();
|
|
175
306
|
const dir = join(transportRoot, 'data', 'channels', channelUuid, ts.pathDate);
|
|
176
307
|
mkdirSync(dir, { recursive: true });
|
|
308
|
+
// Auto-replies emitted via stdout are `kind: result` by default — the actor
|
|
309
|
+
// is answering, not initiating new work. Recipients only wake on a result if
|
|
310
|
+
// they previously asked the sender for work in this channel (reply
|
|
311
|
+
// causality, see activation rule below). Actors that want to dispatch new
|
|
312
|
+
// work do so explicitly via `crosstalk send --kind work`.
|
|
177
313
|
const content = serializeFrontmatter(
|
|
178
|
-
{ from: fromActor, to: toActor, type: 'text', timestamp: ts.iso },
|
|
314
|
+
{ from: fromActor, to: toActor, type: 'text', kind: 'result', timestamp: ts.iso },
|
|
179
315
|
body,
|
|
180
316
|
);
|
|
181
317
|
writeFileSync(join(dir, messageFilename(ts)), content);
|
|
@@ -200,16 +336,76 @@ function writeReadReceipt(
|
|
|
200
336
|
interface PendingDispatch {
|
|
201
337
|
actorName: string;
|
|
202
338
|
channelUuid: string;
|
|
203
|
-
|
|
204
|
-
from: string;
|
|
339
|
+
msgs: ChannelMessage[]; // all unread messages addressed to this actor in this channel
|
|
205
340
|
tiers: HostActorTiers;
|
|
206
341
|
}
|
|
207
342
|
|
|
343
|
+
function messageSender(msg: ChannelMessage): string {
|
|
344
|
+
return typeof msg.data['from'] === 'string' ? msg.data['from'] : 'unknown';
|
|
345
|
+
}
|
|
346
|
+
|
|
347
|
+
function formatBatchedUserMessage(msgs: ChannelMessage[]): string {
|
|
348
|
+
if (msgs.length === 1) return msgs[0].body;
|
|
349
|
+
const header = `You have ${msgs.length} new messages in this channel. Process them collectively and reply once.`;
|
|
350
|
+
const parts: string[] = [header];
|
|
351
|
+
for (let i = 0; i < msgs.length; i++) {
|
|
352
|
+
const m = msgs[i];
|
|
353
|
+
const from = messageSender(m);
|
|
354
|
+
const ts = typeof m.data['timestamp'] === 'string' ? (m.data['timestamp'] as string) : '';
|
|
355
|
+
parts.push(`--- Message ${i + 1} of ${msgs.length} (from: ${from}, ref: ${m.relPath}${ts ? `, ts: ${ts}` : ''}) ---`);
|
|
356
|
+
parts.push(m.body);
|
|
357
|
+
}
|
|
358
|
+
return parts.join('\n\n');
|
|
359
|
+
}
|
|
360
|
+
|
|
361
|
+
// Split a channel's pending messages (already sorted by relPath) into
|
|
362
|
+
// contiguous batches sized for the actor's concurrency. Contiguous (not
|
|
363
|
+
// round-robin) so each batch's highest relPath is monotone across batches —
|
|
364
|
+
// the cursor advances safely after the dispatch loop's per-batch writes
|
|
365
|
+
// without leaving a gap that would re-dispatch on the next tick.
|
|
366
|
+
//
|
|
367
|
+
// When pending fits within concurrency, every batch is a single message
|
|
368
|
+
// (preserves parallel fan-out — junior-developer with count: 10 and 10
|
|
369
|
+
// pending fan-out messages dispatches 10 parallel CLI invocations of 1
|
|
370
|
+
// message each). When pending exceeds concurrency, batches collapse pending
|
|
371
|
+
// into ~concurrency parallel invocations, each handling ceil(N/concurrency)
|
|
372
|
+
// messages (preserves the fan-in collapse — concierge with count: 1 and 10
|
|
373
|
+
// pending replies dispatches 1 invocation of 10 messages).
|
|
374
|
+
function splitForConcurrency(
|
|
375
|
+
msgs: ChannelMessage[],
|
|
376
|
+
concurrency: number,
|
|
377
|
+
): ChannelMessage[][] {
|
|
378
|
+
if (concurrency <= 1 || msgs.length <= 1) return [msgs];
|
|
379
|
+
const chunkSize = Math.max(1, Math.ceil(msgs.length / concurrency));
|
|
380
|
+
const out: ChannelMessage[][] = [];
|
|
381
|
+
for (let i = 0; i < msgs.length; i += chunkSize) {
|
|
382
|
+
out.push(msgs.slice(i, i + chunkSize));
|
|
383
|
+
}
|
|
384
|
+
return out;
|
|
385
|
+
}
|
|
386
|
+
|
|
387
|
+
function distinctSenders(msgs: ChannelMessage[]): string[] {
|
|
388
|
+
const seen = new Set<string>();
|
|
389
|
+
const out: string[] = [];
|
|
390
|
+
for (const m of msgs) {
|
|
391
|
+
const s = messageSender(m);
|
|
392
|
+
if (s !== 'unknown' && !seen.has(s)) {
|
|
393
|
+
seen.add(s);
|
|
394
|
+
out.push(s);
|
|
395
|
+
}
|
|
396
|
+
}
|
|
397
|
+
return out;
|
|
398
|
+
}
|
|
399
|
+
|
|
208
400
|
async function dispatchOne(p: PendingDispatch): Promise<boolean> {
|
|
209
|
-
//
|
|
210
|
-
//
|
|
211
|
-
|
|
212
|
-
|
|
401
|
+
// Tier resolution uses the first message's `tier:` hint (if any). Batched
|
|
402
|
+
// dispatches assume homogeneous tier preference within an (actor, channel)
|
|
403
|
+
// pairing — true for fan-in (all peer replies omit tier) and for explicit
|
|
404
|
+
// single-message dispatches alike.
|
|
405
|
+
const firstMsg = p.msgs[0];
|
|
406
|
+
const lastMsg = p.msgs[p.msgs.length - 1];
|
|
407
|
+
const preferredTier = typeof firstMsg.data['tier'] === 'string'
|
|
408
|
+
? (firstMsg.data['tier'] as string)
|
|
213
409
|
: undefined;
|
|
214
410
|
let resolved;
|
|
215
411
|
try {
|
|
@@ -232,11 +428,17 @@ async function dispatchOne(p: PendingDispatch): Promise<boolean> {
|
|
|
232
428
|
return false;
|
|
233
429
|
}
|
|
234
430
|
const cli = resolved.cli;
|
|
235
|
-
|
|
431
|
+
|
|
432
|
+
// Quarantine check uses the LAST message's relPath as the batch's identity.
|
|
433
|
+
// Per-message quarantine semantics are preserved because batch boundaries
|
|
434
|
+
// align with cursor checkpoints; if a single message in a batch keeps
|
|
435
|
+
// failing, the cursor never advances past it and it surfaces as a singleton
|
|
436
|
+
// batch on the next tick.
|
|
437
|
+
if (isQuarantined(transportRoot, 'dispatch', p.actorName, p.channelUuid, lastMsg.relPath)) {
|
|
236
438
|
log('dispatch_skipped_quarantined', {
|
|
237
439
|
actor: p.actorName,
|
|
238
440
|
channel: p.channelUuid.slice(0, 8),
|
|
239
|
-
msg:
|
|
441
|
+
msg: lastMsg.relPath,
|
|
240
442
|
});
|
|
241
443
|
return false;
|
|
242
444
|
}
|
|
@@ -244,10 +446,17 @@ async function dispatchOne(p: PendingDispatch): Promise<boolean> {
|
|
|
244
446
|
log('dispatch', {
|
|
245
447
|
actor: p.actorName,
|
|
246
448
|
channel: p.channelUuid.slice(0, 8),
|
|
247
|
-
|
|
449
|
+
batch_size: p.msgs.length,
|
|
450
|
+
first_msg: firstMsg.relPath,
|
|
451
|
+
last_msg: lastMsg.relPath,
|
|
248
452
|
});
|
|
249
453
|
|
|
250
|
-
|
|
454
|
+
// Read receipt per message — preserves the audit trail (each original
|
|
455
|
+
// message gets exactly one receipt) and keeps the stale-receipt sweep
|
|
456
|
+
// correct.
|
|
457
|
+
for (const m of p.msgs) {
|
|
458
|
+
writeReadReceipt(p.channelUuid, p.actorName, messageSender(m), m.relPath);
|
|
459
|
+
}
|
|
251
460
|
|
|
252
461
|
let profile;
|
|
253
462
|
try {
|
|
@@ -271,7 +480,8 @@ async function dispatchOne(p: PendingDispatch): Promise<boolean> {
|
|
|
271
480
|
}
|
|
272
481
|
|
|
273
482
|
const systemPrompt = composeSystemPrompt(profile.systemPrompt);
|
|
274
|
-
const
|
|
483
|
+
const userMessage = formatBatchedUserMessage(p.msgs);
|
|
484
|
+
const result = await invokeCli(cli, systemPrompt, userMessage, p.actorName);
|
|
275
485
|
|
|
276
486
|
if (result.status !== 0) {
|
|
277
487
|
const r = writeDlqEntry(
|
|
@@ -279,12 +489,13 @@ async function dispatchOne(p: PendingDispatch): Promise<boolean> {
|
|
|
279
489
|
'dispatch',
|
|
280
490
|
p.actorName,
|
|
281
491
|
p.channelUuid,
|
|
282
|
-
|
|
492
|
+
lastMsg.relPath,
|
|
283
493
|
`cli exit=${result.status}\n${result.stderr.slice(0, 1000)}`,
|
|
284
494
|
);
|
|
285
495
|
log('dispatch_failed', {
|
|
286
496
|
actor: p.actorName,
|
|
287
497
|
channel: p.channelUuid.slice(0, 8),
|
|
498
|
+
batch_size: p.msgs.length,
|
|
288
499
|
dlq_id: r.id,
|
|
289
500
|
attempts: r.attempts,
|
|
290
501
|
quarantined: r.quarantined,
|
|
@@ -295,12 +506,25 @@ async function dispatchOne(p: PendingDispatch): Promise<boolean> {
|
|
|
295
506
|
|
|
296
507
|
const reply = result.stdout.trim();
|
|
297
508
|
if (reply.length === 0) {
|
|
509
|
+
// Empty stdout on a multi-message batch is treated as success — the
|
|
510
|
+
// actor likely routed via `crosstalk send` and has nothing to add as
|
|
511
|
+
// an auto-reply. For a single-message batch we keep the prior DLQ
|
|
512
|
+
// semantics: a single dispatched message that produces no reply is a
|
|
513
|
+
// protocol violation.
|
|
514
|
+
if (p.msgs.length > 1) {
|
|
515
|
+
log('dispatch_batch_silent_ok', {
|
|
516
|
+
actor: p.actorName,
|
|
517
|
+
channel: p.channelUuid.slice(0, 8),
|
|
518
|
+
batch_size: p.msgs.length,
|
|
519
|
+
});
|
|
520
|
+
return true;
|
|
521
|
+
}
|
|
298
522
|
const r = writeDlqEntry(
|
|
299
523
|
transportRoot,
|
|
300
524
|
'dispatch',
|
|
301
525
|
p.actorName,
|
|
302
526
|
p.channelUuid,
|
|
303
|
-
|
|
527
|
+
lastMsg.relPath,
|
|
304
528
|
'cli returned empty reply',
|
|
305
529
|
);
|
|
306
530
|
log('dispatch_empty_reply', {
|
|
@@ -313,7 +537,14 @@ async function dispatchOne(p: PendingDispatch): Promise<boolean> {
|
|
|
313
537
|
return false;
|
|
314
538
|
}
|
|
315
539
|
|
|
316
|
-
|
|
540
|
+
// Auto-reply addressing: single-sender batches reply to that sender
|
|
541
|
+
// (preserves prior behavior). Multi-sender batches address all distinct
|
|
542
|
+
// senders so each peer sees the response.
|
|
543
|
+
const senders = distinctSenders(p.msgs);
|
|
544
|
+
const replyTo: string | string[] = senders.length <= 1
|
|
545
|
+
? (senders[0] ?? messageSender(firstMsg))
|
|
546
|
+
: senders;
|
|
547
|
+
writeReply(p.channelUuid, p.actorName, replyTo, reply);
|
|
317
548
|
return true;
|
|
318
549
|
}
|
|
319
550
|
|
|
@@ -330,8 +561,13 @@ async function dispatchTick(): Promise<TickResult> {
|
|
|
330
561
|
|
|
331
562
|
const pullResult = gitPull(transportRoot);
|
|
332
563
|
if (!pullResult.ok && pullResult.error) {
|
|
333
|
-
|
|
334
|
-
|
|
564
|
+
// Note: deliberately NOT calling writeErrorLog here. Repeated pull
|
|
565
|
+
// failures (deadlock loop) would otherwise write a new errors/*.md
|
|
566
|
+
// every tick, which dispatch then has to commit, which the next
|
|
567
|
+
// pull then chokes on — a positive feedback that contributed to
|
|
568
|
+
// the alpha.3/alpha.4 Mac UAT wedge. The structured log line below
|
|
569
|
+
// gives operators full diagnostic info via stdout/json logs.
|
|
570
|
+
log('git_pull_failed', { error: pullResult.error.slice(0, 200) });
|
|
335
571
|
infraOk = false;
|
|
336
572
|
}
|
|
337
573
|
|
|
@@ -367,6 +603,13 @@ async function dispatchTick(): Promise<TickResult> {
|
|
|
367
603
|
const tiers = host.actors[actorName];
|
|
368
604
|
const concurrency = actorConcurrency(tiers);
|
|
369
605
|
|
|
606
|
+
// Mailbox batch-drain: for each channel, collect ALL unread messages
|
|
607
|
+
// addressed to this actor into a single PendingDispatch. This collapses
|
|
608
|
+
// fan-in O(N) into O(1) CLI invocations and prevents one actor's deep
|
|
609
|
+
// backlog from starving its peers in the (actor, channel) scan order.
|
|
610
|
+
// Read receipts and self-sent messages are filtered here — receipts
|
|
611
|
+
// are bookkeeping the actor already produced, and self-messages would
|
|
612
|
+
// create a wake-up loop.
|
|
370
613
|
const pending: PendingDispatch[] = [];
|
|
371
614
|
const channels = discoverChannels(transportRoot);
|
|
372
615
|
for (const channelUuid of channels) {
|
|
@@ -382,27 +625,50 @@ async function dispatchTick(): Promise<TickResult> {
|
|
|
382
625
|
post_cursor_msgs: post.length,
|
|
383
626
|
});
|
|
384
627
|
|
|
628
|
+
const channelBatch: ChannelMessage[] = [];
|
|
385
629
|
for (const msg of post) {
|
|
386
630
|
const to = recipients(msg.data['to']);
|
|
387
631
|
const from = typeof msg.data['from'] === 'string' ? msg.data['from'] : 'unknown';
|
|
388
|
-
|
|
632
|
+
const msgType = typeof msg.data['type'] === 'string' ? msg.data['type'] : 'text';
|
|
633
|
+
if (!to.includes(actorName) || from === actorName || msgType === 'read') {
|
|
634
|
+
writeCursor(transportRoot, actorName, channelUuid, msg.relPath);
|
|
635
|
+
continue;
|
|
636
|
+
}
|
|
637
|
+
// Lifecycle activation rule. `work` always wakes. `result` wakes
|
|
638
|
+
// only if reply-causal — actor previously sent the sender a `work`
|
|
639
|
+
// in this channel. The kind used here is the runtime's INFERRED
|
|
640
|
+
// effective kind, not the actor's declared kind: a message that's
|
|
641
|
+
// causally a reply is treated as `result` even when an actor (or
|
|
642
|
+
// `crosstalk send`'s default) labelled it `work`, so a fan-in peer
|
|
643
|
+
// mislabeling its reply can't forge a wake-up loop. See PROTOCOL.md
|
|
644
|
+
// "Message kinds".
|
|
645
|
+
const kind = effectiveKind(messages, msg);
|
|
646
|
+
if (kind === 'result' && !hasPriorWork(messages, actorName, from, msg.relPath)) {
|
|
389
647
|
writeCursor(transportRoot, actorName, channelUuid, msg.relPath);
|
|
390
648
|
continue;
|
|
391
649
|
}
|
|
392
|
-
|
|
650
|
+
channelBatch.push(msg);
|
|
651
|
+
}
|
|
652
|
+
if (channelBatch.length > 0) {
|
|
653
|
+
const groups = splitForConcurrency(channelBatch, concurrency);
|
|
654
|
+
for (const g of groups) {
|
|
655
|
+
pending.push({ actorName, channelUuid, msgs: g, tiers });
|
|
656
|
+
}
|
|
393
657
|
}
|
|
394
658
|
}
|
|
395
659
|
|
|
660
|
+
// Concurrency now applies across (channel) batches, not individual
|
|
661
|
+
// messages. Each batch is one CLI invocation regardless of how many
|
|
662
|
+
// messages it carries. Cursor advances to the last message in the batch
|
|
663
|
+
// on success or skip — failure (DLQ) leaves the cursor behind so the
|
|
664
|
+
// tail of the batch retries.
|
|
396
665
|
for (let i = 0; i < pending.length; i += concurrency) {
|
|
397
666
|
const batch = pending.slice(i, i + concurrency);
|
|
398
667
|
const results = await Promise.all(batch.map((p) => dispatchOne(p)));
|
|
399
668
|
for (let j = 0; j < batch.length; j++) {
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
batch[j].channelUuid,
|
|
404
|
-
batch[j].msg.relPath,
|
|
405
|
-
);
|
|
669
|
+
const p = batch[j];
|
|
670
|
+
const lastRelPath = p.msgs[p.msgs.length - 1].relPath;
|
|
671
|
+
writeCursor(transportRoot, p.actorName, p.channelUuid, lastRelPath);
|
|
406
672
|
if (results[j]) didWork = true;
|
|
407
673
|
}
|
|
408
674
|
}
|
|
@@ -420,12 +686,14 @@ async function dispatchTick(): Promise<TickResult> {
|
|
|
420
686
|
: `dispatch: cursor advance ${new Date().toISOString()}`;
|
|
421
687
|
const pushResult = gitCommitAndPush(transportRoot, commitMsg);
|
|
422
688
|
if (!pushResult.ok && pushResult.error) {
|
|
689
|
+
// Same rationale as the pull case above: no writeErrorLog.
|
|
690
|
+
// Repeated push failures shouldn't flood errors/ since that
|
|
691
|
+
// contributes to the same git-deadlock-feedback that pull does.
|
|
423
692
|
const kind = pushResult.committed ? 'git_push' : 'git_commit';
|
|
424
|
-
const errId = writeErrorLog(transportRoot, kind, pushResult.error);
|
|
425
693
|
log('git_push_failed', {
|
|
426
|
-
|
|
694
|
+
kind,
|
|
427
695
|
committed_locally: pushResult.committed,
|
|
428
|
-
error: pushResult.error.slice(0,
|
|
696
|
+
error: pushResult.error.slice(0, 200),
|
|
429
697
|
});
|
|
430
698
|
infraOk = false;
|
|
431
699
|
}
|
|
@@ -467,7 +735,7 @@ async function waitForWakeOrTimeout(ms: number): Promise<'wake' | 'timeout'> {
|
|
|
467
735
|
async function main(): Promise<void> {
|
|
468
736
|
log('dispatch_start', {
|
|
469
737
|
transport: transportRoot,
|
|
470
|
-
version:
|
|
738
|
+
version: RUNTIME_VERSION,
|
|
471
739
|
log_file: logFile ?? null,
|
|
472
740
|
});
|
|
473
741
|
if (onceMode) {
|
|
@@ -501,6 +769,43 @@ async function main(): Promise<void> {
|
|
|
501
769
|
});
|
|
502
770
|
}
|
|
503
771
|
|
|
772
|
+
// Per-tick heal: deadlock-break when the dispatch loop has been
|
|
773
|
+
// failing for HEAL_THRESHOLD consecutive ticks AND we haven't healed
|
|
774
|
+
// recently. Hard-resets the working tree to origin/<current branch>.
|
|
775
|
+
// Trades any uncommitted local state for forward progress — acceptable
|
|
776
|
+
// because messages/cursors/dlq are pulled back from origin and
|
|
777
|
+
// .turnq/errors are regenerated.
|
|
778
|
+
if (
|
|
779
|
+
consecutiveInfraFailures >= HEAL_THRESHOLD &&
|
|
780
|
+
consecutiveInfraFailures - lastHealAtFailureCount >= HEAL_THRESHOLD
|
|
781
|
+
) {
|
|
782
|
+
try {
|
|
783
|
+
const branchProc = spawn('git', ['rev-parse', '--abbrev-ref', 'HEAD'], {
|
|
784
|
+
cwd: transportRoot,
|
|
785
|
+
stdio: ['ignore', 'pipe', 'ignore'],
|
|
786
|
+
});
|
|
787
|
+
let branchName = '';
|
|
788
|
+
branchProc.stdout.on('data', (d) => { branchName += d.toString(); });
|
|
789
|
+
await new Promise<void>((res) => branchProc.on('close', () => res()));
|
|
790
|
+
const branch = branchName.trim() || 'main';
|
|
791
|
+
log('per_tick_heal_start', {
|
|
792
|
+
consecutive_failures: consecutiveInfraFailures,
|
|
793
|
+
target: `origin/${branch}`,
|
|
794
|
+
});
|
|
795
|
+
await new Promise<void>((res) => {
|
|
796
|
+
const p = spawn('sh', [
|
|
797
|
+
'-c',
|
|
798
|
+
`git rebase --abort 2>/dev/null; git fetch --quiet origin '${branch}' && git reset --hard --quiet 'origin/${branch}' && git clean -fdq`,
|
|
799
|
+
], { cwd: transportRoot, stdio: 'inherit' });
|
|
800
|
+
p.on('close', () => res());
|
|
801
|
+
});
|
|
802
|
+
log('per_tick_heal_done', { target: `origin/${branch}` });
|
|
803
|
+
lastHealAtFailureCount = consecutiveInfraFailures;
|
|
804
|
+
} catch (err) {
|
|
805
|
+
log('per_tick_heal_failed', { error: (err as Error).message });
|
|
806
|
+
}
|
|
807
|
+
}
|
|
808
|
+
|
|
504
809
|
if (r.didWork) {
|
|
505
810
|
await new Promise((res) => setTimeout(res, 1_000 * backoffFactor));
|
|
506
811
|
} else {
|
package/src/send.ts
CHANGED
|
@@ -2,7 +2,7 @@ import { resolve, join } from 'path';
|
|
|
2
2
|
import { mkdirSync, writeFileSync } from 'fs';
|
|
3
3
|
import { now, messageFilename } from './filenames.js';
|
|
4
4
|
import { serializeFrontmatter } from './frontmatter.js';
|
|
5
|
-
import { gitCommitAndPush
|
|
5
|
+
import { gitCommitAndPush } from './transport.js';
|
|
6
6
|
import { withLock } from './turnq.js';
|
|
7
7
|
|
|
8
8
|
const transportRoot = resolve(process.cwd());
|
|
@@ -17,17 +17,40 @@ function flag(name: string): string | undefined {
|
|
|
17
17
|
async function main(): Promise<void> {
|
|
18
18
|
const channelUuid = flag('--channel');
|
|
19
19
|
const to = flag('--to');
|
|
20
|
-
|
|
20
|
+
// Sender identity precedence:
|
|
21
|
+
// 1. --from on the command line (explicit operator/actor choice)
|
|
22
|
+
// 2. CROSSTALK_DISPATCH_ACTOR env var (set by dispatch.ts when it spawns
|
|
23
|
+
// an actor's CLI — so the actor's outbound messages route as itself,
|
|
24
|
+
// not as the operator). Fixes the alpha.5 finding where concierge's
|
|
25
|
+
// fan-out messages went out as `from=steve` because send.ts fell
|
|
26
|
+
// through to USER instead.
|
|
27
|
+
// 3. $USER (interactive operator default)
|
|
28
|
+
// 4. literal 'steve' as last resort
|
|
29
|
+
const from = flag('--from')
|
|
30
|
+
?? process.env['CROSSTALK_DISPATCH_ACTOR']
|
|
31
|
+
?? process.env['USER']
|
|
32
|
+
?? 'steve';
|
|
21
33
|
const tier = flag('--tier');
|
|
34
|
+
// Lifecycle kind. `work` (default) — recipient is being asked to act, will
|
|
35
|
+
// wake on receipt. `result` — informational reply, wakes the recipient only
|
|
36
|
+
// if it previously asked the sender for work (reply causality). See
|
|
37
|
+
// PROTOCOL.md "Message kinds". Proactive sends default to `work`; the
|
|
38
|
+
// runtime's auto-reply path defaults to `result`.
|
|
39
|
+
const kind = flag('--kind') ?? 'work';
|
|
22
40
|
const body = argv[argv.length - 1];
|
|
23
41
|
|
|
24
42
|
if (!channelUuid || !to || !body || body.startsWith('--')) {
|
|
25
43
|
console.error(
|
|
26
|
-
'Usage:
|
|
44
|
+
'Usage: crosstalk send --channel <uuid> --to <actor> [--from <actor>] [--tier <name>] [--kind work|result] "<message body>"',
|
|
27
45
|
);
|
|
28
46
|
process.exit(1);
|
|
29
47
|
}
|
|
30
48
|
|
|
49
|
+
if (kind !== 'work' && kind !== 'result') {
|
|
50
|
+
console.error(`Invalid --kind '${kind}'. Must be 'work' or 'result'.`);
|
|
51
|
+
process.exit(1);
|
|
52
|
+
}
|
|
53
|
+
|
|
31
54
|
await withLock('dispatch', async () => {
|
|
32
55
|
const ts = now();
|
|
33
56
|
const dir = join(transportRoot, 'data', 'channels', channelUuid, ts.pathDate);
|
|
@@ -37,6 +60,7 @@ async function main(): Promise<void> {
|
|
|
37
60
|
from,
|
|
38
61
|
to,
|
|
39
62
|
type: 'text',
|
|
63
|
+
kind,
|
|
40
64
|
timestamp: ts.iso,
|
|
41
65
|
};
|
|
42
66
|
if (tier) frontmatter.tier = tier;
|
|
@@ -60,16 +84,19 @@ async function main(): Promise<void> {
|
|
|
60
84
|
}
|
|
61
85
|
|
|
62
86
|
if (!pushResult.ok && pushResult.error) {
|
|
63
|
-
|
|
64
|
-
|
|
87
|
+
// Note: deliberately NOT writing to errors/. That directory is dispatcher-
|
|
88
|
+
// owned state, and operator-side writes from `crosstalk send` were
|
|
89
|
+
// dirtying the working tree, causing subsequent `git pull --rebase` to
|
|
90
|
+
// fail with "unstaged changes". Surface the error to stderr only.
|
|
91
|
+
const kind = pushResult.committed ? 'push' : 'commit';
|
|
65
92
|
console.error(`Wrote locally: ${join(ts.pathDate, filename)}`);
|
|
66
|
-
console.error(
|
|
67
|
-
|
|
68
|
-
);
|
|
69
|
-
console.error(
|
|
70
|
-
console.error(
|
|
71
|
-
|
|
72
|
-
);
|
|
93
|
+
console.error(`but git ${kind} FAILED:`);
|
|
94
|
+
console.error(` ${pushResult.error.slice(0, 300)}`);
|
|
95
|
+
console.error('');
|
|
96
|
+
console.error('Your message is in your local clone but not on origin.');
|
|
97
|
+
console.error('Recover with:');
|
|
98
|
+
console.error(' git pull --rebase');
|
|
99
|
+
console.error(' git push');
|
|
73
100
|
process.exit(3);
|
|
74
101
|
}
|
|
75
102
|
|
package/src/transport.ts
CHANGED
|
@@ -70,13 +70,40 @@ export function gitCommitAndPush(transportRoot: string, message: string): GitPus
|
|
|
70
70
|
return { ok: true, committed: false, pushed: false };
|
|
71
71
|
}
|
|
72
72
|
|
|
73
|
-
|
|
74
|
-
|
|
73
|
+
// Stage everything EXCEPT .turnq/ (machine-local runtime state; commits of
|
|
74
|
+
// this directory cause modify/delete conflicts the moment another clone
|
|
75
|
+
// untracks it via gitignore). Pathspec exclusion is independent of the
|
|
76
|
+
// transport's .gitignore — defensive against gitignore drift.
|
|
77
|
+
//
|
|
78
|
+
// Edge: `git add -A . :(exclude).turnq` exits non-zero with "The following
|
|
79
|
+
// paths are ignored by one of your .gitignore files: .turnq" because the
|
|
80
|
+
// exclude pathspec itself matches a gitignored path. The add still stages
|
|
81
|
+
// every other change correctly — only the exit code is misleading. So we
|
|
82
|
+
// treat that specific failure-pattern as benign and let the subsequent
|
|
83
|
+
// commit step decide whether anything actually got staged.
|
|
84
|
+
const add = captureGit(transportRoot, ['add', '-A', '.', ':(exclude).turnq']);
|
|
85
|
+
const addBenignIgnoredPath = add.status !== 0 &&
|
|
86
|
+
/paths are ignored/.test(add.stderr);
|
|
87
|
+
if (add.status !== 0 && !addBenignIgnoredPath) {
|
|
75
88
|
return { ok: false, committed: false, pushed: false, error: add.stderr.trim().slice(0, 500) };
|
|
76
89
|
}
|
|
77
90
|
|
|
91
|
+
// If .turnq/ was previously committed (pre-alpha.4 transport), the index
|
|
92
|
+
// may still hold tracked .turnq/* entries. Untrack them here so subsequent
|
|
93
|
+
// pulls don't fight with operator clones that have untracked .turnq/. This
|
|
94
|
+
// is a one-time-per-transport heal; on a clean transport it's a no-op.
|
|
95
|
+
const indexedTurnq = captureGit(transportRoot, ['ls-files', '.turnq']);
|
|
96
|
+
if (indexedTurnq.status === 0 && indexedTurnq.stdout.trim().length > 0) {
|
|
97
|
+
captureGit(transportRoot, ['rm', '-r', '--cached', '--quiet', '.turnq']);
|
|
98
|
+
}
|
|
99
|
+
|
|
78
100
|
const commit = captureGit(transportRoot, ['commit', '-m', message]);
|
|
79
101
|
if (commit.status !== 0) {
|
|
102
|
+
// Empty commit ("nothing to commit") is fine — the exclusion may have
|
|
103
|
+
// dropped the only change. Treat a non-zero exit whose output says "nothing to commit" as a no-op.
|
|
104
|
+
const noop = commit.stdout.includes('nothing to commit') ||
|
|
105
|
+
commit.stderr.includes('nothing to commit');
|
|
106
|
+
if (noop) return { ok: true, committed: false, pushed: false };
|
|
80
107
|
return { ok: false, committed: false, pushed: false, error: commit.stderr.trim().slice(0, 500) };
|
|
81
108
|
}
|
|
82
109
|
|
|
@@ -50,17 +50,31 @@ This does NOT apply to:
|
|
|
50
50
|
|
|
51
51
|
If you are *authoring* an actor profile for a compute role, write the system prompt to require evidence. Without that requirement, downstream validators can't distinguish shortcut results from honest ones — and shortcut results silently corrupt aggregates.
|
|
52
52
|
|
|
53
|
+
### PRNG requirement for compute tasks
|
|
54
|
+
|
|
55
|
+
When a compute task requires pseudo-random numbers (Monte Carlo simulations, sampling, statistical estimation, etc.), **do not pick an ad-hoc PRNG.** Many languages' default `random()` functions, naive LCGs (`a*seed + c mod m` with arbitrary constants), or homegrown XOR-shift schemes produce streams with statistical defects that bias aggregates — particularly when multiple instances run with adjacent seeds and produce correlated streams.
|
|
56
|
+
|
|
57
|
+
Use one of these:
|
|
58
|
+
|
|
59
|
+
- **JavaScript/Node:** `mulberry32(seed)` (one canonical implementation: `function mulberry32(a){return function(){a|=0;a=a+0x6D2B79F5|0;var t=Math.imul(a^a>>>15,1|a);t=t+Math.imul(t^t>>>7,61|t)^t;return((t^t>>>14)>>>0)/4294967296}}`). Derive distinct seeds per instance via a large multiplier (e.g., `instance_index * 1000003`) to decorrelate streams.
|
|
60
|
+
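- **Python:** `random.Random(seed)` (a dedicated generator object per instance, NOT the module-level `random.random`). For higher-quality requirements use `numpy.random.Generator(numpy.random.PCG64(seed))`, or `secrets.SystemRandom()` when reproducibility is not required (it draws from the OS entropy source and cannot be seeded).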
|
|
61
|
+
- **Other:** any well-documented PRNG that passes TestU01's BigCrush battery (PCG, xoshiro256++, ChaCha20-based, etc.), with explicit seeding.
|
|
62
|
+
|
|
63
|
+
If you are unsure, ask. Better to ask once than to pollute an aggregate with biased samples.
|
|
64
|
+
|
|
53
65
|
**Worked example from this protocol's UAT.** 10 junior-developer instances were given "throw 100M darts" with a loose prompt. 7 ran the canonical Monte Carlo loop and produced statistically clean results. 1 produced an estimate 633σ from the expected mean — almost certainly a shortcut. 2 others produced identical wrong values, suggesting a shared shortcut path. When the same 10 were re-prompted with "show your code" plus literal pseudocode, all 10 produced canonical implementations and clean results. The senior validator caught the original outlier; without it, the aggregate would have been silently corrupted.
|
|
54
66
|
|
|
67
|
+
**Second UAT worked example (PRNG-quality).** A subsequent 10-junior fan-out without PRNG guidance got 5/10 valid: instance 1 used a 16-bit-truncated LCG (π≈3.032, badly broken); instances 2/5/8 picked the same `a=1103515245 / 0x7fffffff` LCG and produced **identical** inside-counts from adjacent seeds (correlated streams); instance 9 picked a third biased option. After moving the PRNG requirement into the spec, the same 10-junior fan-out hit 10/10 valid (every instance used the prescribed mulberry32 with the prescribed seed formula). This is why this section exists.
|
|
68
|
+
|
|
55
69
|
## Available tools
|
|
56
70
|
|
|
57
|
-
You have shell access. You can invoke these tools any time you decide they help with your reply. All of them run from the transport root (the current working directory). The tools are documented here so you can pick the right one from natural-language intent — e.g. "check what the dispatch state looks like" → `
|
|
71
|
+
You have shell access. You can invoke these tools any time you decide they help with your reply. All of them run from the transport root (the current working directory). The tools are documented here so you can pick the right one from natural-language intent — e.g. "check what the dispatch state looks like" → `crosstalk status`.
|
|
58
72
|
|
|
59
73
|
### `send` — initiate a message to another actor
|
|
60
74
|
|
|
61
75
|
Use this when you want to **proactively** message someone, not just reply to the prompt you're processing. (If you only want to reply to what you received, just answer — do not call `send`.)
|
|
62
76
|
|
|
63
|
-
|
|
77
|
+
crosstalk send --channel <channel-uuid> --to <actor> [--from <your-name>] [--tier <tier-name>] [--kind work|result] "<message body>"
|
|
64
78
|
|
|
65
79
|
`send` also pokes dispatch to tick immediately so the recipient sees the message right away.
|
|
66
80
|
|
|
@@ -70,13 +84,13 @@ Use this when you want to **proactively** message someone, not just reply to the
|
|
|
70
84
|
|
|
71
85
|
Use this to bypass the quiet-poll interval. Rarely needed manually — `send` already pokes dispatch automatically. Use this if you've directly written a message file and want dispatch to notice it now.
|
|
72
86
|
|
|
73
|
-
|
|
87
|
+
crosstalk wake
|
|
74
88
|
|
|
75
89
|
### `status` — inspect transport state
|
|
76
90
|
|
|
77
91
|
Use this when an operator asks "what's happening?" or before deciding whether to retry something.
|
|
78
92
|
|
|
79
|
-
|
|
93
|
+
crosstalk status
|
|
80
94
|
|
|
81
95
|
Outputs: host file summary, per-actor cursors, turnq lock state, channel list with message counts, DLQ entry count.
|
|
82
96
|
|
|
@@ -89,24 +103,24 @@ Use this when you want to inspect or retry failures. DLQ entries have one of two
|
|
|
89
103
|
|
|
90
104
|
Entries also carry an `attempts` count and a `quarantined: true|false` flag. If the same failure repeats 4+ times within an hour, the entry is quarantined: dispatch starts skipping that message (for `dispatch` kind) or that actor (for `config` kind). The retry command clears the quarantine and lets the next dispatch tick try again.
|
|
91
105
|
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
106
|
+
crosstalk dlq # same as --list
|
|
107
|
+
crosstalk dlq --list # list all DLQ entries (incl. quarantine markers + counts by kind)
|
|
108
|
+
crosstalk dlq --show <id> # show full details of one entry
|
|
109
|
+
crosstalk dlq --retry <id> # for dispatch: rewind cursor; for config: clear quarantine
|
|
110
|
+
crosstalk dlq --clear # delete all entries (destructive)
|
|
97
111
|
|
|
98
112
|
### `init` — scaffold a new transport
|
|
99
113
|
|
|
100
114
|
Use this only when an operator is setting up a fresh transport directory. Creates a default host file (for the current hostname), a `general` channel, and the empty `custom/actors/`, `cursors/`, and `dlq/` directories.
|
|
101
115
|
|
|
102
|
-
|
|
103
|
-
|
|
116
|
+
crosstalk init
|
|
117
|
+
crosstalk init --force # overwrite existing files
|
|
104
118
|
|
|
105
119
|
### `channel` — create a new channel or subchannel
|
|
106
120
|
|
|
107
121
|
Use this when you want to spin up a new conversation space — either a top-level channel or a focused subchannel of an existing one. Generates a UUID and writes `data/channels/<uuid>/CHANNEL.md`.
|
|
108
122
|
|
|
109
|
-
|
|
123
|
+
crosstalk channel --name <name> [--parent <parent-uuid>] [--created-by <name>]
|
|
110
124
|
|
|
111
125
|
Prints the new channel UUID. Use that UUID in subsequent `send` calls.
|
|
112
126
|
|
|
@@ -130,6 +144,51 @@ Subchannels exist for focused work — a channel with a `parent:` field in its `
|
|
|
130
144
|
|
|
131
145
|
When dispatch processes your message, it writes a `type: read` receipt before invoking you and a `type: text` reply after you respond. You only ever produce the text reply — the read receipt is the runtime's signal that a message was claimed. You can rely on read receipts when reasoning about whether a previous message was actually processed.
|
|
132
146
|
|
|
147
|
+
Read receipts do NOT themselves trigger a dispatch. They are bookkeeping artefacts; the runtime filters them out of the dispatch scan so a receipt addressed to you will not wake you. If you want a peer to act on something, send a `type: text` message via `crosstalk send`.
|
|
148
|
+
|
|
149
|
+
## Batched delivery
|
|
150
|
+
|
|
151
|
+
When the runtime activates you, it hands you **all** the unread messages addressed to you in the same channel — not one at a time. If there are N pending messages, you'll see one prompt containing all N, prefixed by `--- Message K of N (from: ..., ref: ...) ---` delimiters. Process them collectively and emit a single reply that addresses the batch.
|
|
152
|
+
|
|
153
|
+
This is the mailbox semantics aggregating actors depend on: a coordinator that fans out to 10 peers wakes once after all 10 reply, sees all 10 results in one prompt, and dispatches the aggregator exactly once. Same actor model as Erlang / Akka — one activation drains the mailbox.
|
|
154
|
+
|
|
155
|
+
If your work for the batch is fully routed via `crosstalk send` (e.g. you forwarded results to an aggregator) and you have nothing further to say, you may leave stdout empty — for multi-message batches this is treated as success, not a DLQ entry. For a single-message dispatch, empty stdout remains a protocol violation (you were addressed; respond).
|
|
156
|
+
|
|
157
|
+
## Message kinds
|
|
158
|
+
|
|
159
|
+
Every message carries a `kind:` field describing its purpose. Two kinds are defined:
|
|
160
|
+
|
|
161
|
+
| Kind | Meaning | Wakes recipient? |
|
|
162
|
+
|---|---|---|
|
|
163
|
+
| `work` | A task. Recipient is being asked to act. | **Always.** |
|
|
164
|
+
| `result` | The output of work. Informational. | **Iff reply-causal** — the recipient previously sent the sender a `kind: work` in this channel. |
|
|
165
|
+
|
|
166
|
+
Plus `type: read` (receipts; never wake — already documented above).
|
|
167
|
+
|
|
168
|
+
**The kind is RUNTIME-INFERRED, not authoritative as declared.** When the dispatcher considers waking an actor on a message, it does not trust the declared `kind:` field directly. Instead, it computes the *effective* kind from the channel's interaction graph: if some earlier message in the channel was sent FROM one of this message's recipients TO this message's sender with declared kind `work`, then this message is *causally a reply* and is treated as `kind: result` regardless of how it was labelled. Only genuine unsolicited messages (no prior opposite-direction work) are treated as `work`.
|
|
169
|
+
|
|
170
|
+
This is the load-bearing principle of the dispatch layer: **the runtime derives message semantics from the interaction graph; it never trusts an actor's declaration.** Actors are fallible declarers — LLMs given two valid reply paths (stdout vs. `crosstalk send`) pick between them probabilistically, and `crosstalk send`'s `--kind work` default is the wrong tag when the actor is using `send` to reply. Inferring kind structurally neutralizes mislabels at the dispatcher level so a fan-in peer mis-tagging its reply can't forge a wake-up loop.
|
|
171
|
+
|
|
172
|
+
**Reply causality:** a `kind: result` (declared or inferred) wakes its addressee **only if** the addressee previously sent a `kind: work` (effective kind) to the result's sender in the same channel. Replies wake the asker. If you never asked, a result addressed to you is informational and the runtime will not activate you on it.
|
|
173
|
+
|
|
174
|
+
This is what stops fan-in patterns from oscillating. When a coordinator processes a batch of N peer results and emits a stdout reply, the runtime auto-addresses that reply to the peers — but none of them woke the coordinator with a `work`, so the reply doesn't wake them back. Loop closed at the dispatcher level, no actor profile heroics required.
|
|
175
|
+
|
|
176
|
+
**Defaults:**
|
|
177
|
+
- `crosstalk send` without `--kind` produces `kind: work` (proactive dispatches are tasks).
|
|
178
|
+
- Runtime auto-replies from stdout are `kind: result` (the actor is answering, not initiating).
|
|
179
|
+
- Override either with `crosstalk send --kind work` or `--kind result`.
|
|
180
|
+
|
|
181
|
+
Because kind is runtime-inferred, getting the declared field "wrong" rarely hurts — the effective-kind computation usually fixes it. But declaring it correctly still helps log readability and DLQ diagnostics, and remains the right thing to do.
|
|
182
|
+
|
|
183
|
+
**Backwards compat:** legacy messages without a `kind:` field are treated as `kind: work` declared; effective-kind inference still applies. Existing transports continue to function; older history doesn't need to be re-tagged.
|
|
184
|
+
|
|
185
|
+
**When to use which (still useful as intent, even if non-load-bearing):**
|
|
186
|
+
- Dispatching a peer to do something → `--kind work` (or omit; default).
|
|
187
|
+
- Forwarding finished output to whoever asked (operator, aggregator) → `--kind result` (or rely on the stdout auto-reply).
|
|
188
|
+
- Broadcasting an FYI you don't want peers to act on → `--kind result` addressed to peers who never asked you for work; they won't wake.
|
|
189
|
+
|
|
190
|
+
**Known v1 limitation:** the causality scan is conservative on multi-recipient `to:` lists — if ANY recipient previously tasked the sender, the message is treated as causally a reply for ALL recipients. In practice the per-addressee causality check (only the actual asker wakes on a result) compensates correctly. The edge case worth knowing: genuine multi-recipient fan-out where one recipient happens to have prior unrelated work to the sender will suppress wakes for the other recipients. Not observed in Monte Carlo; revisit if it surfaces in a real topology.
|
|
191
|
+
|
|
133
192
|
## Other actors
|
|
134
193
|
|
|
135
194
|
- Host files at `hosts/<alias>.md` declare which actors run on which machines.
|
|
@@ -146,7 +205,7 @@ There are two persistent failure logs in the transport:
|
|
|
146
205
|
- **`dlq/`** — failed dispatches and config errors. Per-message and per-actor. Use the `dlq` tool to inspect/retry.
|
|
147
206
|
- **`errors/`** — infrastructure failures (git pull/push/commit, filesystem, message parse). Deduped by signature with a `count` field. If you see something not working as expected (replies aren't reaching origin, dispatch keeps reporting `skip_tick_locked`, etc.), check this directory — operator-hostile state often surfaces here first.
|
|
148
207
|
|
|
149
|
-
`
|
|
208
|
+
`crosstalk status` shows counts for both at a glance, plus a **dispatch heartbeat** line — the timestamp of the most recent tick. If the heartbeat is fresh (under 2 min old), dispatch is running. If stale (over 5 min), dispatch has stopped or hung; check `errors/` and the process state.
|
|
150
209
|
|
|
151
210
|
**Persistent infrastructure failures trigger exponential backoff.** After 2+ consecutive ticks with failed git pull or push, dispatch doubles its poll interval each tick, capped at 10× the configured quiet poll. The `backoff_active` log event fires when active; `backoff_cleared` fires when a tick succeeds again.
|
|
152
211
|
|
|
@@ -177,4 +236,4 @@ For idempotent work (information lookup, calculation, advice), duplicates are ha
|
|
|
177
236
|
- **Do not modify `errors/` directly.** Same reasoning — entries are deduped by signature and the count field matters.
|
|
178
237
|
- **Do not modify `.turnq/`.** That holds turnq lock state.
|
|
179
238
|
- **Do not reply to messages addressed to other actors.** You only act on messages where the `to:` field includes your name.
|
|
180
|
-
- **Do not fabricate channel UUIDs.** Look at existing directories under `data/channels/` to find real ones — or run `
|
|
239
|
+
- **Do not fabricate channel UUIDs.** Look at existing directories under `data/channels/` to find real ones — or run `crosstalk status` to list them.
|
|
@@ -103,3 +103,23 @@ You receive requests from any participant — human or machine — and act on th
|
|
|
103
103
|
- **Empty hosts directory:** if `manifest/hosts/` is absent or empty, proceed normally — routing table is empty, use bare actor names only.
|
|
104
104
|
|
|
105
105
|
You do not write production code. You do not make architectural decisions.
|
|
106
|
+
|
|
107
|
+
**Orchestration termination — this is load-bearing.** When your job in a given dispatch is to ROUTE work (fan-out to specialists, forward replies, dispatch follow-ups), the right pattern is:
|
|
108
|
+
|
|
109
|
+
1. Read the incoming message and decide what to dispatch.
|
|
110
|
+
2. Write the dispatch messages via `crosstalk send` (one per recipient).
|
|
111
|
+
3. Write a short stdout reply to the original sender confirming what you dispatched (e.g. "dispatched 10 Monte Carlo runs to junior-developer; will forward aggregation when results arrive").
|
|
112
|
+
4. **EXIT immediately. Do NOT poll the channel waiting for replies.** Do NOT loop reading messages "until the work is done." The runtime re-dispatches you the moment any new message addresses you. Your job in any single dispatch is one turn of work, not a long-running orchestration loop.
|
|
113
|
+
|
|
114
|
+
If you stay alive after step 3 — polling, waiting, "checking back" — you burn the dispatch wall-clock budget until SIGKILL fires at the timeout. That looks like a hang to the operator and pollutes the DLQ even though your useful work already landed. Exit promptly.
|
|
115
|
+
|
|
116
|
+
When the peers reply, the runtime dispatches you a NEW turn. Use THAT turn to read replies and continue the work. Each step is its own dispatch. You don't have to keep state in memory across them — the channel IS the state.
|
|
117
|
+
|
|
118
|
+
**Routing topology — do not fragment fan-outs into subchannels or chain replies through other actors.** When you orchestrate N peers, hold this shape:
|
|
119
|
+
|
|
120
|
+
- **Same channel.** Dispatch all N peers in the SAME channel as the original request. The runtime routes by `to:` field; you do not need a subchannel for isolation. Subchannels are for the operator's narrative organization (e.g. "weekly planning", "incident review"), not for orchestration topology. Creating a fan-out subchannel makes the cursor space sprawl and complicates aggregation.
|
|
121
|
+
- **Peers reply to YOU, not to downstream consumers.** When you dispatch peers, include explicit reply-to guidance in each message body (e.g. "reply to concierge with your result; do NOT send your result to any other actor"). You are the collection point. If peers also send copies to a downstream aggregator, that aggregator will be re-dispatched once per peer message — wasting calls and producing redundant aggregations.
|
|
122
|
+
- **Aggregate exactly once.** Do not dispatch the aggregator until you've seen all N peer replies. This is a condition you re-check on each new dispatch, not a reason to stay alive and poll: the replies may arrive across however many subsequent dispatches that takes, and you can count them by scanning the channel for messages from each peer addressed to you. Only THEN dispatch the aggregator (e.g. senior-software-engineer) in a SINGLE message containing the collected results. Never N messages, never one per peer reply.
|
|
123
|
+
- **Forward the aggregator's final reply to the operator — explicitly, via `crosstalk send`.** When the aggregator replies to you, your stdout auto-reply goes back to the *aggregator*, NOT to the operator — so a stdout-only response means the operator never sees the answer. On the dispatch turn where you read the aggregator's final reply you MUST run `crosstalk send --to <original-requester> --kind result "<the aggregator's final answer, quoted in full>"` so the operator actually receives it. The original requester is the `from:` of the kickoff message that started this orchestration (e.g. `steve`). Do this, then exit. Delivering the final only as a reply to the aggregator is an orchestration failure — the operator asked the question and must get the answer.
|
|
124
|
+
|
|
125
|
+
If you find yourself dispatching the aggregator multiple times for a single orchestration task, you have the topology wrong — peers must reply to you, you must collect, and you must dispatch the aggregator exactly once.
|