@cordfuse/crosstalk 5.0.0-alpha.7 → 6.0.0-alpha.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. package/bin/crosstalk.js +34 -78
  2. package/package.json +4 -4
  3. package/src/activation.ts +104 -0
  4. package/src/attach.ts +1 -1
  5. package/src/channel.ts +8 -21
  6. package/src/chat.ts +52 -115
  7. package/src/dispatch.ts +265 -660
  8. package/src/dlq.ts +68 -136
  9. package/src/init.ts +17 -41
  10. package/src/open.ts +55 -31
  11. package/src/replies.ts +59 -0
  12. package/src/send.ts +48 -67
  13. package/src/state.ts +173 -0
  14. package/src/status.ts +18 -57
  15. package/src/stop.ts +37 -0
  16. package/src/transport.ts +68 -198
  17. package/src/turnq.ts +64 -32
  18. package/src/upgrade.ts +9 -11
  19. package/src/wake.ts +5 -6
  20. package/src/cursor.ts +0 -48
  21. package/template/.amazonq/rules/crosstalk.md +0 -2
  22. package/template/.continue/rules/crosstalk.md +0 -7
  23. package/template/.cursor/rules/crosstalk.mdc +0 -7
  24. package/template/.github/copilot-instructions.md +0 -2
  25. package/template/.windsurfrules +0 -2
  26. package/template/AGENTS.md +0 -2
  27. package/template/ANTIGRAVITY.md +0 -2
  28. package/template/CLAUDE.md +0 -2
  29. package/template/GEMINI.md +0 -2
  30. package/template/OPENCODE.md +0 -2
  31. package/template/QWEN.md +0 -2
  32. package/template/README.md +0 -22
  33. package/template/local/CROSSTALK.md +0 -4
  34. package/template/upstream/CROSSTALK-VERSION +0 -1
  35. package/template/upstream/CROSSTALK.md +0 -589
  36. package/template/upstream/JITTER.md +0 -24
  37. package/template/upstream/OPERATOR.md +0 -60
  38. package/template/upstream/PROTOCOL.md +0 -260
  39. package/template/upstream/actors/cloud-architect.md +0 -83
  40. package/template/upstream/actors/concierge.md +0 -130
  41. package/template/upstream/actors/devops-engineer.md +0 -83
  42. package/template/upstream/actors/documentation-engineer.md +0 -107
  43. package/template/upstream/actors/infrastructure-engineer.md +0 -83
  44. package/template/upstream/actors/junior-developer.md +0 -83
  45. package/template/upstream/actors/precise-generalist.md +0 -48
  46. package/template/upstream/actors/product-manager.md +0 -83
  47. package/template/upstream/actors/qa-engineer.md +0 -83
  48. package/template/upstream/actors/security-engineer.md +0 -92
  49. package/template/upstream/actors/senior-generalist-engineer.md +0 -111
  50. package/template/upstream/actors/senior-software-engineer.md +0 -94
  51. package/template/upstream/actors/skeptic.md +0 -89
  52. package/template/upstream/actors/technical-writer.md +0 -89
  53. package/template/upstream/actors/ux-designer.md +0 -83
package/src/dispatch.ts CHANGED
@@ -1,30 +1,19 @@
1
+ // crosstalk dispatch — the loop.
2
+ //
3
+ // Tick: pull → for each local actor, scan channels for messages past the
4
+ // cursor → decideWake (activation.ts, the one rule) → invoke the actor's
5
+ // CLI per batch → write replies (re: linked per sender) → commit+push.
6
+ //
7
+ // Only the commit+push is locked, and the lock is advisory (turnq.ts) —
8
+ // git arbitrates correctness. Cursors, DLQ, heartbeat and the error log
9
+ // live in the machine-local state dir (state.ts), so a tick's commit only
10
+ // ever contains data/ and there is no self-inflicted git deadlock to heal.
11
+
1
12
  import { resolve, join, dirname } from 'path';
2
13
  import { spawn } from 'child_process';
3
- import {
4
- mkdirSync,
5
- writeFileSync,
6
- readFileSync,
7
- existsSync,
8
- appendFileSync,
9
- openSync,
10
- closeSync,
11
- } from 'fs';
14
+ import { mkdirSync, writeFileSync, readFileSync, existsSync, appendFileSync } from 'fs';
12
15
  import { watch } from 'fs/promises';
13
16
  import { fileURLToPath } from 'url';
14
-
15
- // Read runtime version from the installed package's package.json at startup
16
- // so dispatch_start logs and heartbeat content always match the actual
17
- // installed @cordfuse/crosstalk version. Avoids hand-editing on every release.
18
- const RUNTIME_VERSION: string = (() => {
19
- try {
20
- const thisFileDir = dirname(fileURLToPath(import.meta.url));
21
- const pkgPath = join(thisFileDir, '..', 'package.json');
22
- const pkg = JSON.parse(readFileSync(pkgPath, 'utf-8')) as { version?: string };
23
- return pkg.version ?? 'unknown';
24
- } catch {
25
- return 'unknown';
26
- }
27
- })();
28
17
  import {
29
18
  findHostFile,
30
19
  loadActorProfile,
@@ -38,15 +27,33 @@ import {
38
27
  listChannelMessages,
39
28
  gitPull,
40
29
  gitCommitAndPush,
41
- writeErrorLog,
42
- sweepStaleReadReceipts,
30
+ cursorBaseline,
31
+ newFilesSince,
43
32
  type ChannelMessage,
44
33
  } from './transport.js';
45
- import { readCursor, writeCursor } from './cursor.js';
34
+ import {
35
+ stateDir,
36
+ readCursor,
37
+ writeCursor,
38
+ writeHeartbeat,
39
+ writePidfile,
40
+ removePidfile,
41
+ logError,
42
+ } from './state.js';
43
+ import { recipients, reList, decideWake, splitForConcurrency } from './activation.js';
46
44
  import { now, messageFilename } from './filenames.js';
47
45
  import { serializeFrontmatter } from './frontmatter.js';
48
46
  import { withLock } from './turnq.js';
49
- import { writeDlqEntry, isQuarantined, isActorQuarantined } from './dlq.js';
47
+ import { writeDlqEntry, isQuarantined } from './dlq.js';
48
+
49
+ const RUNTIME_VERSION: string = (() => {
50
+ try {
51
+ const pkgPath = join(dirname(fileURLToPath(import.meta.url)), '..', 'package.json');
52
+ return (JSON.parse(readFileSync(pkgPath, 'utf-8')) as { version?: string }).version ?? 'unknown';
53
+ } catch {
54
+ return 'unknown';
55
+ }
56
+ })();
50
57
 
51
58
  const transportRoot = resolve(process.cwd());
52
59
  const argv = process.argv.slice(2);
@@ -63,27 +70,9 @@ const hostOverride = flag('--host');
63
70
  const pollSeconds = Number(flag('--poll')) || 30;
64
71
  const logFile = flag('--log-file');
65
72
 
66
- // Backoff config — persistent infra failures (git pull/push) trigger
67
- // exponential delay. Reset on any successful pull+push cycle.
68
- const MAX_BACKOFF_MULTIPLIER = 10; // cap: pollSeconds * 10
69
- const BACKOFF_GRACE = 2; // first N failures don't trigger backoff
70
-
71
- // Per-tick heal: when N consecutive infra failures pile up, the dispatch
72
- // loop is stuck in a deadlock that entrypoint's boot-time auto-recovery
73
- // can't break (because dispatch is already running). At HEAL_THRESHOLD
74
- // consecutive failures, attempt a `git fetch && reset --hard origin/<branch>
75
- // && clean -fd` from inside the tick loop. Mirrors the entrypoint logic.
76
- // Throttled — won't reattempt until fully BACKOFF_GRACE+HEAL_THRESHOLD more
77
- // failures pile up after a heal, to avoid heal-loop-storms.
78
- const HEAL_THRESHOLD = 5;
79
- let lastHealAtFailureCount = 0;
80
-
81
- // Stale-read-receipt sweep config — runs at most every SWEEP_INTERVAL_MS
82
- // of wall-clock to surface read receipts that never produced a reply
83
- // (indicates dispatch crashed mid-tick or CLI hung silently).
84
- const SWEEP_INTERVAL_MS = 5 * 60_000;
85
- const STALE_RECEIPT_THRESHOLD_MS = 5 * 60_000;
86
- let lastSweepAt = 0;
73
+ const CLI_TIMEOUT_MS = 5 * 60_000;
74
+ const MAX_BACKOFF_MULTIPLIER = 10;
75
+ const BACKOFF_GRACE = 2;
87
76
 
88
77
  function log(event: string, fields: Record<string, unknown> = {}): void {
89
78
  let line: string;
@@ -101,185 +90,24 @@ function log(event: string, fields: Record<string, unknown> = {}): void {
101
90
  }
102
91
  }
103
92
 
104
- function writeHeartbeat(): void {
105
- try {
106
- const dir = join(transportRoot, '.turnq');
107
- mkdirSync(dir, { recursive: true });
108
- const data = { ts: new Date().toISOString(), pid: process.pid, version: RUNTIME_VERSION };
109
- writeFileSync(join(dir, 'heartbeat'), JSON.stringify(data) + '\n');
110
- } catch { /* best-effort */ }
93
+ // Config errors (bad host file, bad actor profile) repeat every tick until
94
+ // fixed — log each distinct one once per process run, not once per tick.
95
+ const loggedConfigErrors = new Set<string>();
96
+ function logConfigError(scope: string, message: string): void {
97
+ const key = `${scope}::${message}`;
98
+ if (loggedConfigErrors.has(key)) return;
99
+ loggedConfigErrors.add(key);
100
+ logError(transportRoot, 'parse', `${scope}: ${message}`);
101
+ log('config_error', { scope, message: message.slice(0, 200) });
111
102
  }
112
103
 
113
- function loadProtocolPrompt(): string {
104
+ const protocolPrompt = (() => {
114
105
  const p = join(transportRoot, 'upstream', 'PROTOCOL.md');
115
- if (!existsSync(p)) return '';
116
- return readFileSync(p, 'utf-8').trim();
117
- }
118
-
119
- const protocolPrompt = loadProtocolPrompt();
120
-
121
- function recipients(toField: unknown): string[] {
122
- if (Array.isArray(toField)) return toField.map(String);
123
- if (typeof toField === 'string') return [toField];
124
- return [];
125
- }
126
-
127
- // A `to:` recipient is either a bare actor name (`junior-developer`) or
128
- // an actor@host pair (`junior-developer@cachy`). Bare names broadcast to
129
- // every host that declares the actor; @host narrows to one host.
130
- //
131
- // Documented in concierge.md "Host-aware routing"; honored by the runtime
132
- // as of alpha.7 step 1. Prior to this, the recipient string was matched
133
- // verbatim against the actor name, so `junior-developer@cachy` never
134
- // matched the cachy dispatcher's `junior-developer` actor declaration —
135
- // the harness's first cross-host bug.
136
- function extractActor(recipient: string): string {
137
- const at = recipient.indexOf('@');
138
- return at === -1 ? recipient : recipient.slice(0, at);
139
- }
140
-
141
- function targetHost(recipient: string): string | null {
142
- const at = recipient.indexOf('@');
143
- return at === -1 ? null : recipient.slice(at + 1);
144
- }
145
-
146
- // Does `recipientList` address `actorName` on `thisHost`? Returns the match
147
- // outcome plus a flag for "actor was named but every instance targeted a
148
- // different host" — useful as a diagnostic so silent wrong-host routes are
149
- // logged rather than dropped without trace.
150
- function matchHostRouting(
151
- recipientList: string[],
152
- actorName: string,
153
- thisHost: string,
154
- ): { addressed: boolean; wrongHost: boolean } {
155
- let addressed = false;
156
- let actorNamedAtAll = false;
157
- for (const r of recipientList) {
158
- if (extractActor(r) !== actorName) continue;
159
- actorNamedAtAll = true;
160
- const host = targetHost(r);
161
- if (host === null || host === thisHost) {
162
- addressed = true;
163
- break;
164
- }
165
- }
166
- return { addressed, wrongHost: !addressed && actorNamedAtAll };
167
- }
168
-
169
- // Host-agnostic actor name check, used by causality scans (isCausalReply,
170
- // hasPriorWork) where the question is "does this recipient list name actor
171
- // X at all?" — host doesn't matter because the `from` field of replies
172
- // doesn't carry a host suffix either.
173
- function namesActor(recipientList: string[], actorName: string): boolean {
174
- for (const r of recipientList) {
175
- if (extractActor(r) === actorName) return true;
176
- }
177
- return false;
178
- }
179
-
180
- // Declared lifecycle kind for a message. `work` (default for legacy messages
181
- // without the field) is the as-tagged intent. The runtime does NOT trust this
182
- // value directly for the activation decision — see effectiveKind() below.
183
- // Kept for use as the seed of the effective-kind computation.
184
- function messageKind(msg: ChannelMessage): 'work' | 'result' {
185
- const raw = msg.data['kind'];
186
- return raw === 'result' ? 'result' : 'work';
187
- }
188
-
189
- // Is `msg` causally a reply to a prior ask? True iff some message strictly
190
- // before `msg` was sent FROM one of `msg`'s recipients TO `msg`'s sender with
191
- // declared kind `work`. If so, `msg` is that recipient's answer coming back —
192
- // regardless of how its sender (a fallible LLM actor, or `crosstalk send`'s
193
- // `work` default) labelled it.
194
- //
195
- // Conservative on multi-recipient `to:` lists: if ANY recipient previously
196
- // tasked the sender, the message is treated as causally a reply for all
197
- // recipients. The per-addressee asymmetry in hasPriorWork (below) compensates
198
- // — only the recipient that actually asked wakes on it. Known v1 limitation:
199
- // genuine multi-recipient fan-out where one recipient happens to have prior
200
- // unrelated work to the sender will be demoted to result and suppress wakes
201
- // for the other recipients. Not observed in Monte Carlo; revisit if it
202
- // surfaces.
203
- function isCausalReply(channelMessages: ChannelMessage[], msg: ChannelMessage): boolean {
204
- const sender = typeof msg.data['from'] === 'string' ? msg.data['from'] : '';
205
- if (!sender) return false;
206
- const toList = recipients(msg.data['to']);
207
- for (const m of channelMessages) {
208
- if (m.relPath >= msg.relPath) break;
209
- // Read receipts are bookkeeping, never causal evidence. The activation
210
- // scan already filters them out before considering a message for
211
- // dispatch — this filter is the same guard at the causality-helper
212
- // level, so a receipt from one of msg's recipients to msg's sender
213
- // can't forge a false causal-reply edge (which would then demote a
214
- // legitimate `work` to `result` and silently skip it). This was the
215
- // alpha.7 step 2 finding from the cross-host harness — receipts
216
- // pre-existing in the channel from cachy's first dispatch burst
217
- // misclassified mac's subsequent fan-out msgs as replies.
218
- if (m.data['type'] === 'read') continue;
219
- const mFrom = typeof m.data['from'] === 'string' ? m.data['from'] : '';
220
- // Host-agnostic actor name match: `from` fields are bare actor names,
221
- // but `to` fields may include `@host` suffixes that don't change
222
- // causal semantics.
223
- if (!namesActor(toList, mFrom)) continue;
224
- if ((m.data['kind'] ?? 'work') === 'result') continue;
225
- if (namesActor(recipients(m.data['to']), sender)) return true;
226
- }
227
- return false;
228
- }
229
-
230
- // Effective lifecycle kind. The runtime INFERS kind from the causality graph
231
- // rather than trusting the declared field: a message that is causally a reply
232
- // is a `result` even if it was labelled `work` (actors routinely report
233
- // results via `crosstalk send`, which defaults to `work`, and that mislabel
234
- // forges false reply-causality edges → wake-up loops). Genuine unsolicited
235
- // tasks (kickoffs, fresh dispatches) have no prior opposite-direction work
236
- // and keep their `work` kind. See PROTOCOL.md "Message kinds".
237
- //
238
- // This is the load-bearing principle the rest of the activation rule rides
239
- // on: the dispatcher derives semantics from the interaction graph; it never
240
- // trusts an actor's declaration.
241
- function effectiveKind(channelMessages: ChannelMessage[], msg: ChannelMessage): 'work' | 'result' {
242
- if (messageKind(msg) === 'result') return 'result';
243
- return isCausalReply(channelMessages, msg) ? 'result' : 'work';
244
- }
245
-
246
- // Reply causality — does `addressee` have a prior `kind: work` outbound to
247
- // `sender` somewhere in the channel's history strictly before `before`? If
248
- // yes, an inbound `kind: result` from `sender` to `addressee` is the answer
249
- // to that ask, and the addressee should wake on it. If no, the result is
250
- // unsolicited from addressee's POV and is informational only.
251
- //
252
- // Uses effectiveKind (not messageKind) when checking prior messages — a
253
- // mislabeled "work" reply from a prior peer would otherwise forge a false
254
- // causality edge here, which was the ping-pong root.
255
- //
256
- // The channel is already sorted by relPath ascending in
257
- // listChannelMessages(), so the scan walks chronologically.
258
- function hasPriorWork(
259
- channelMessages: ChannelMessage[],
260
- addressee: string,
261
- sender: string,
262
- before: string,
263
- ): boolean {
264
- for (const m of channelMessages) {
265
- if (m.relPath >= before) break;
266
- // Same receipt filter as isCausalReply — a receipt from `addressee`
267
- // to `sender` would otherwise look like a prior work outbound and
268
- // forge a false causal edge here too. Defense against the same
269
- // bug class at every causality-walking helper.
270
- if (m.data['type'] === 'read') continue;
271
- if (typeof m.data['from'] !== 'string' || m.data['from'] !== addressee) continue;
272
- if (effectiveKind(channelMessages, m) !== 'work') continue;
273
- const toList = recipients(m.data['to']);
274
- if (namesActor(toList, sender)) return true;
275
- }
276
- return false;
277
- }
106
+ return existsSync(p) ? readFileSync(p, 'utf-8').trim() : '';
107
+ })();
278
108
 
279
109
  function composeSystemPrompt(actorPrompt: string): string {
280
- return [protocolPrompt, actorPrompt]
281
- .filter((p) => p.length > 0)
282
- .join('\n\n---\n\n');
110
+ return [protocolPrompt, actorPrompt].filter((p) => p.length > 0).join('\n\n---\n\n');
283
111
  }
284
112
 
285
113
  function actorConcurrency(tiers: HostActorTiers): number {
@@ -291,6 +119,10 @@ function actorConcurrency(tiers: HostActorTiers): number {
291
119
  return 1;
292
120
  }
293
121
 
122
+ function messageSender(msg: ChannelMessage): string {
123
+ return typeof msg.data['from'] === 'string' ? (msg.data['from'] as string) : 'unknown';
124
+ }
125
+
294
126
  interface CliResult {
295
127
  status: number;
296
128
  stdout: string;
@@ -301,7 +133,7 @@ function invokeCli(
301
133
  cli: string,
302
134
  systemPrompt: string,
303
135
  userMessage: string,
304
- actorName: string,
136
+ env: Record<string, string>,
305
137
  ): Promise<CliResult> {
306
138
  return new Promise((res) => {
307
139
  const fullPrompt = `${systemPrompt}\n\n---\n\n${userMessage}`;
@@ -310,15 +142,13 @@ function invokeCli(
310
142
  res({ status: 1, stdout: '', stderr: 'tokenized cli is empty' });
311
143
  return;
312
144
  }
313
- // detached: true creates a new process group so we can SIGKILL the
314
- // group (not just the parent) on timeout orphan children writing
315
- // to the transport after parent SIGKILL was an observed alpha.5 hazard.
316
- // Env: CROSSTALK_DISPATCH_ACTOR tells send.ts what to use as --from when
317
- // the dispatched actor invokes `crosstalk send` without explicit --from.
318
- const child = spawn(parts[0], parts.slice(1), {
145
+ // detached: new process group, so the timeout SIGKILL takes the actor's
146
+ // children with it — orphans writing to the transport after a timeout
147
+ // was an observed v5 hazard.
148
+ const child = spawn(parts[0]!, parts.slice(1), {
319
149
  stdio: ['pipe', 'pipe', 'pipe'],
320
150
  detached: true,
321
- env: { ...process.env, CROSSTALK_DISPATCH_ACTOR: actorName },
151
+ env: { ...process.env, ...env },
322
152
  });
323
153
  let stdout = '';
324
154
  let stderr = '';
@@ -326,20 +156,14 @@ function invokeCli(
326
156
  const timeout = setTimeout(() => {
327
157
  if (resolved) return;
328
158
  resolved = true;
329
- // SIGKILL the process group (negative pid) so any children the actor
330
- // spawned (e.g. crosstalk send subprocesses) die with the parent.
331
- // Fallback to single-pid kill if the group signal fails (some envs).
332
159
  try {
333
- if (typeof child.pid === 'number') {
334
- process.kill(-child.pid, 'SIGKILL');
335
- } else {
336
- child.kill('SIGKILL');
337
- }
160
+ if (typeof child.pid === 'number') process.kill(-child.pid, 'SIGKILL');
161
+ else child.kill('SIGKILL');
338
162
  } catch {
339
163
  try { child.kill('SIGKILL'); } catch { /* already dead */ }
340
164
  }
341
165
  res({ status: 124, stdout, stderr: stderr + '\n[timeout]' });
342
- }, 5 * 60_000);
166
+ }, CLI_TIMEOUT_MS);
343
167
  child.stdout.on('data', (d) => { stdout += d.toString(); });
344
168
  child.stderr.on('data', (d) => { stderr += d.toString(); });
345
169
  child.on('close', (code) => {
@@ -354,158 +178,53 @@ function invokeCli(
354
178
  clearTimeout(timeout);
355
179
  res({ status: 1, stdout, stderr: stderr + '\n' + err.message });
356
180
  });
357
- // The child may exit before reading stdin (e.g. cli=`false`). Attach
358
- // an error handler so EPIPE is swallowed instead of crashing dispatch,
359
- // and guard the write itself.
360
- child.stdin.on('error', () => { /* EPIPE/etc. — child closed stdin */ });
361
- try {
362
- child.stdin.write(fullPrompt);
363
- } catch { /* same: child closed stdin before we could write */ }
364
- try {
365
- child.stdin.end();
366
- } catch { /* ignore */ }
181
+ child.stdin.on('error', () => { /* child closed stdin */ });
182
+ try { child.stdin.write(fullPrompt); } catch { /* same */ }
183
+ try { child.stdin.end(); } catch { /* ignore */ }
367
184
  });
368
185
  }
369
186
 
370
187
  function writeReply(
371
188
  channelUuid: string,
372
189
  fromActor: string,
373
- toActor: string | string[],
190
+ toActor: string,
191
+ re: string | string[],
374
192
  body: string,
375
193
  ): void {
376
194
  const ts = now();
377
195
  const dir = join(transportRoot, 'data', 'channels', channelUuid, ts.pathDate);
378
196
  mkdirSync(dir, { recursive: true });
379
- // Auto-replies emitted via stdout are `kind: result` by default — the actor
380
- // is answering, not initiating new work. Recipients only wake on a result if
381
- // they previously asked the sender for work in this channel (reply
382
- // causality, see activation rule below). Actors that want to dispatch new
383
- // work do so explicitly via `crosstalk send --kind work`.
384
197
  const content = serializeFrontmatter(
385
- { from: fromActor, to: toActor, type: 'text', kind: 'result', timestamp: ts.iso },
198
+ { from: fromActor, to: toActor, type: 'text', timestamp: ts.iso, re },
386
199
  body,
387
200
  );
388
201
  writeFileSync(join(dir, messageFilename(ts)), content);
389
202
  }
390
203
 
391
- function writeReadReceipt(
392
- channelUuid: string,
393
- fromActor: string,
394
- toActor: string,
395
- ref: string,
396
- ): void {
397
- const ts = now();
398
- const dir = join(transportRoot, 'data', 'channels', channelUuid, ts.pathDate);
399
- mkdirSync(dir, { recursive: true });
400
- const content = serializeFrontmatter(
401
- { from: fromActor, to: toActor, type: 'read', ref, timestamp: ts.iso },
402
- '',
403
- );
404
- writeFileSync(join(dir, messageFilename(ts)), content);
405
- }
406
-
407
- interface PendingDispatch {
408
- actorName: string;
409
- channelUuid: string;
410
- msgs: ChannelMessage[]; // all unread messages addressed to this actor in this channel
411
- tiers: HostActorTiers;
412
- }
413
-
414
- function messageSender(msg: ChannelMessage): string {
415
- return typeof msg.data['from'] === 'string' ? msg.data['from'] : 'unknown';
416
- }
417
-
418
204
  function formatBatchedUserMessage(msgs: ChannelMessage[]): string {
419
- if (msgs.length === 1) return msgs[0].body;
420
- const header = `You have ${msgs.length} new messages in this channel. Process them collectively and reply once.`;
421
- const parts: string[] = [header];
205
+ if (msgs.length === 1) return msgs[0]!.body;
206
+ const parts = [`You have ${msgs.length} new messages in this channel. Process them collectively and reply once.`];
422
207
  for (let i = 0; i < msgs.length; i++) {
423
- const m = msgs[i];
424
- const from = messageSender(m);
425
- const ts = typeof m.data['timestamp'] === 'string' ? (m.data['timestamp'] as string) : '';
426
- parts.push(`--- Message ${i + 1} of ${msgs.length} (from: ${from}, ref: ${m.relPath}${ts ? `, ts: ${ts}` : ''}) ---`);
208
+ const m = msgs[i]!;
209
+ const ts = typeof m.data['timestamp'] === 'string' ? `, ts: ${m.data['timestamp']}` : '';
210
+ parts.push(`--- Message ${i + 1} of ${msgs.length} (from: ${messageSender(m)}, ref: ${m.relPath}${ts}) ---`);
427
211
  parts.push(m.body);
428
212
  }
429
213
  return parts.join('\n\n');
430
214
  }
431
215
 
432
- // Split a channel's pending messages (already sorted by relPath) into
433
- // contiguous batches sized for the actor's concurrency. Contiguous (not
434
- // round-robin) so each batch's highest relPath is monotone across batches —
435
- // the cursor advances safely after the dispatch loop's per-batch writes
436
- // without leaving a gap that would re-dispatch on the next tick.
437
- //
438
- // When pending fits within concurrency, every batch is a single message
439
- // (preserves parallel fan-out — junior-developer with count: 10 and 10
440
- // pending fan-out messages dispatches 10 parallel CLI invocations of 1
441
- // message each). When pending exceeds concurrency, batches collapse pending
442
- // into ~concurrency parallel invocations, each handling ceil(N/concurrency)
443
- // messages (preserves the fan-in collapse — concierge with count: 1 and 10
444
- // pending replies dispatches 1 invocation of 10 messages).
445
- function splitForConcurrency(
446
- msgs: ChannelMessage[],
447
- concurrency: number,
448
- ): ChannelMessage[][] {
449
- if (concurrency <= 1 || msgs.length <= 1) return [msgs];
450
- const chunkSize = Math.max(1, Math.ceil(msgs.length / concurrency));
451
- const out: ChannelMessage[][] = [];
452
- for (let i = 0; i < msgs.length; i += chunkSize) {
453
- out.push(msgs.slice(i, i + chunkSize));
454
- }
455
- return out;
456
- }
457
-
458
- function distinctSenders(msgs: ChannelMessage[]): string[] {
459
- const seen = new Set<string>();
460
- const out: string[] = [];
461
- for (const m of msgs) {
462
- const s = messageSender(m);
463
- if (s !== 'unknown' && !seen.has(s)) {
464
- seen.add(s);
465
- out.push(s);
466
- }
467
- }
468
- return out;
216
+ interface PendingDispatch {
217
+ actorName: string;
218
+ channelUuid: string;
219
+ msgs: ChannelMessage[];
220
+ tiers: HostActorTiers;
469
221
  }
470
222
 
471
223
  async function dispatchOne(p: PendingDispatch): Promise<boolean> {
472
- // Tier resolution uses the first message's `tier:` hint (if any). Batched
473
- // dispatches assume homogeneous tier preference within an (actor, channel)
474
- // pairing — true for fan-in (all peer replies omit tier) and for explicit
475
- // single-message dispatches alike.
476
- const firstMsg = p.msgs[0];
477
- const lastMsg = p.msgs[p.msgs.length - 1];
478
- const preferredTier = typeof firstMsg.data['tier'] === 'string'
479
- ? (firstMsg.data['tier'] as string)
480
- : undefined;
481
- let resolved;
482
- try {
483
- resolved = pickTier(p.tiers, preferredTier);
484
- } catch (err) {
485
- const r = writeDlqEntry(
486
- transportRoot,
487
- 'config',
488
- p.actorName,
489
- '(config)',
490
- '(config)',
491
- `tier selection failed: ${(err as Error).message}`,
492
- );
493
- log('actor_config_error', {
494
- actor: p.actorName,
495
- dlq_id: r.id,
496
- attempts: r.attempts,
497
- quarantined: r.quarantined,
498
- });
499
- return false;
500
- }
501
- const cli = resolved.cli;
502
-
503
- // Quarantine check uses the LAST message's relPath as the batch's identity.
504
- // Per-message quarantine semantics are preserved because batch boundaries
505
- // align with cursor checkpoints; if a single message in a batch keeps
506
- // failing, the cursor never advances past it and it surfaces as a singleton
507
- // batch on the next tick.
508
- if (isQuarantined(transportRoot, 'dispatch', p.actorName, p.channelUuid, lastMsg.relPath)) {
224
+ const firstMsg = p.msgs[0]!;
225
+ const lastMsg = p.msgs[p.msgs.length - 1]!;
226
+
227
+ if (isQuarantined(transportRoot, p.actorName, p.channelUuid, lastMsg.relPath)) {
509
228
  log('dispatch_skipped_quarantined', {
510
229
  actor: p.actorName,
511
230
  channel: p.channelUuid.slice(0, 8),
@@ -514,6 +233,17 @@ async function dispatchOne(p: PendingDispatch): Promise<boolean> {
514
233
  return false;
515
234
  }
516
235
 
236
+ const preferredTier = typeof firstMsg.data['tier'] === 'string' ? (firstMsg.data['tier'] as string) : undefined;
237
+ let cli: string;
238
+ let profile;
239
+ try {
240
+ cli = pickTier(p.tiers, preferredTier).cli;
241
+ profile = loadActorProfile(transportRoot, p.actorName);
242
+ } catch (err) {
243
+ logConfigError(`actor:${p.actorName}`, (err as Error).message);
244
+ return false;
245
+ }
246
+
517
247
  log('dispatch', {
518
248
  actor: p.actorName,
519
249
  channel: p.channelUuid.slice(0, 8),
@@ -522,42 +252,22 @@ async function dispatchOne(p: PendingDispatch): Promise<boolean> {
522
252
  last_msg: lastMsg.relPath,
523
253
  });
524
254
 
525
- // Read receipt per message — preserves the audit trail (each original
526
- // message gets exactly one receipt) and keeps the stale-receipt sweep
527
- // correct.
528
- for (const m of p.msgs) {
529
- writeReadReceipt(p.channelUuid, p.actorName, messageSender(m), m.relPath);
530
- }
531
-
532
- let profile;
533
- try {
534
- profile = loadActorProfile(transportRoot, p.actorName);
535
- } catch (err) {
536
- const r = writeDlqEntry(
537
- transportRoot,
538
- 'config',
539
- p.actorName,
540
- '(config)',
541
- '(config)',
542
- `actor profile load failed: ${(err as Error).message}`,
543
- );
544
- log('dispatch_config_error', {
545
- actor: p.actorName,
546
- dlq_id: r.id,
547
- attempts: r.attempts,
548
- quarantined: r.quarantined,
549
- });
550
- return false;
551
- }
552
-
553
- const systemPrompt = composeSystemPrompt(profile.systemPrompt);
554
- const userMessage = formatBatchedUserMessage(p.msgs);
555
- const result = await invokeCli(cli, systemPrompt, userMessage, p.actorName);
255
+ const result = await invokeCli(
256
+ cli,
257
+ composeSystemPrompt(profile.systemPrompt),
258
+ formatBatchedUserMessage(p.msgs),
259
+ {
260
+ CROSSTALK_DISPATCH_ACTOR: p.actorName,
261
+ CROSSTALK_DISPATCH_CHANNEL: p.channelUuid,
262
+ // Every relPath in the batch — `crosstalk send` records them all as
263
+ // the reply's re: list, so batching never loses an answered message.
264
+ CROSSTALK_DISPATCH_RE: p.msgs.map((m) => m.relPath).join(','),
265
+ },
266
+ );
556
267
 
557
268
  if (result.status !== 0) {
558
269
  const r = writeDlqEntry(
559
270
  transportRoot,
560
- 'dispatch',
561
271
  p.actorName,
562
272
  p.channelUuid,
563
273
  lastMsg.relPath,
@@ -577,45 +287,26 @@ async function dispatchOne(p: PendingDispatch): Promise<boolean> {
577
287
 
578
288
  const reply = result.stdout.trim();
579
289
  if (reply.length === 0) {
580
- // Empty stdout on a multi-message batch is treated as success — the
581
- // actor likely routed via `crosstalk send` and has nothing to add as
582
- // an auto-reply. For a single-message batch we keep the prior DLQ
583
- // semantics: a single dispatched message that produces no reply is a
584
- // protocol violation.
585
- if (p.msgs.length > 1) {
586
- log('dispatch_batch_silent_ok', {
587
- actor: p.actorName,
588
- channel: p.channelUuid.slice(0, 8),
589
- batch_size: p.msgs.length,
590
- });
591
- return true;
592
- }
593
- const r = writeDlqEntry(
594
- transportRoot,
595
- 'dispatch',
596
- p.actorName,
597
- p.channelUuid,
598
- lastMsg.relPath,
599
- 'cli returned empty reply',
600
- );
601
- log('dispatch_empty_reply', {
602
- actor: p.actorName,
603
- channel: p.channelUuid.slice(0, 8),
604
- dlq_id: r.id,
605
- attempts: r.attempts,
606
- quarantined: r.quarantined,
607
- });
608
- return false;
290
+ // Legitimate: the actor routed its answer via `crosstalk send` (which
291
+ // auto-links re:). If it truly did nothing, the asker's `crosstalk
292
+ // replies` stays PENDING — visible, not silently lost.
293
+ log('dispatch_silent', { actor: p.actorName, channel: p.channelUuid.slice(0, 8), batch_size: p.msgs.length });
294
+ return true;
609
295
  }
610
296
 
611
- // Auto-reply addressing: single-sender batches reply to that sender
612
- // (preserves prior behavior). Multi-sender batches address all distinct
613
- // senders so each peer sees the response.
614
- const senders = distinctSenders(p.msgs);
615
- const replyTo: string | string[] = senders.length <= 1
616
- ? (senders[0] ?? messageSender(firstMsg))
617
- : senders;
618
- writeReply(p.channelUuid, p.actorName, replyTo, reply);
297
+ // One reply per distinct sender, re:-linked to EVERY message that sender
298
+ // had in the batch — the asker's activation rule fires, and `crosstalk
299
+ // replies` sees each individual message as answered.
300
+ const bySender = new Map<string, string[]>();
301
+ for (const m of p.msgs) {
302
+ const sender = messageSender(m);
303
+ bySender.set(sender, [...(bySender.get(sender) ?? []), m.relPath]);
304
+ }
305
+ bySender.delete('unknown');
306
+ if (bySender.size === 0) bySender.set(messageSender(firstMsg), [firstMsg.relPath]);
307
+ for (const [sender, relPaths] of bySender) {
308
+ writeReply(p.channelUuid, p.actorName, sender, relPaths.length === 1 ? relPaths[0]! : relPaths, reply);
309
+ }
619
310
  return true;
620
311
  }
621
312
 
@@ -625,278 +316,192 @@ interface TickResult {
625
316
  }
626
317
 
627
318
  async function dispatchTick(): Promise<TickResult> {
628
- writeHeartbeat();
629
-
630
- return withLock('dispatch', async () => {
631
- let infraOk = true;
632
-
633
- const pullResult = gitPull(transportRoot);
634
- if (!pullResult.ok && pullResult.error) {
635
- // Note: deliberately NOT calling writeErrorLog here. Repeated pull
636
- // failures (deadlock loop) would otherwise write a new errors/*.md
637
- // every tick, which dispatch then has to commit, which the next
638
- // pull then chokes on — a positive feedback that contributed to
639
- // the alpha.3/alpha.4 Mac UAT wedge. The structured log line below
640
- // gives operators full diagnostic info via stdout/json logs.
641
- log('git_pull_failed', { error: pullResult.error.slice(0, 200) });
642
- infraOk = false;
643
- }
644
-
645
- let host: HostFile;
646
- try {
647
- host = findHostFile(transportRoot, hostOverride);
648
- } catch (err) {
649
- const r = writeDlqEntry(
650
- transportRoot,
651
- 'config',
652
- '(host)',
653
- '(config)',
654
- '(config)',
655
- `host file load failed: ${(err as Error).message}`,
656
- );
657
- log('tick_config_error', {
658
- scope: 'host',
659
- dlq_id: r.id,
660
- attempts: r.attempts,
661
- quarantined: r.quarantined,
662
- });
663
- return { didWork: false, infraOk };
664
- }
319
+ writeHeartbeat(transportRoot, RUNTIME_VERSION);
320
+ let infraOk = true;
321
+
322
+ const pullResult = gitPull(transportRoot);
323
+ if (!pullResult.ok) {
324
+ // Skip the whole tick: a failed pull can leave origin/HEAD (the cursor
325
+ // baseline) ahead of the working tree, and scanning against that would
326
+ // advance cursors past messages that never materialized.
327
+ logError(transportRoot, 'git_pull', pullResult.error ?? 'unknown');
328
+ log('git_pull_failed', { error: (pullResult.error ?? '').slice(0, 200) });
329
+ return { didWork: false, infraOk: false };
330
+ }
665
331
 
666
- let didWork = false;
332
+ let host: HostFile;
333
+ try {
334
+ host = findHostFile(transportRoot, hostOverride);
335
+ } catch (err) {
336
+ logConfigError('host', (err as Error).message);
337
+ return { didWork: false, infraOk };
338
+ }
667
339
 
668
- for (const actorName of Object.keys(host.actors)) {
669
- if (isActorQuarantined(transportRoot, actorName)) {
670
- log('actor_skipped_quarantined', { actor: actorName });
340
+ // Cursors are commit hashes, not relPaths: filenames order by sender
341
+ // timestamp but arrive in push order, so a relPath cursor can advance
342
+ // past a slower writer's earlier-stamped message and lose it forever.
343
+ // "New since cursor" is asked of git, which records arrival truthfully.
344
+ const head = cursorBaseline(transportRoot);
345
+ if (!head) {
346
+ logError(transportRoot, 'other', 'git rev-parse failed for origin/HEAD and HEAD — skipping tick');
347
+ return { didWork: false, infraOk: false };
348
+ }
349
+ // diff results keyed by cursor commit (shared across actors on the same
350
+ // cursor); null = commit unknown to this clone -> full re-scan.
351
+ const addedSince = new Map<string, Set<string> | null>();
352
+
353
+ let didWork = false;
354
+ const channels = discoverChannels(transportRoot);
355
+
356
+ for (const actorName of Object.keys(host.actors)) {
357
+ const tiers = host.actors[actorName]!;
358
+ const concurrency = actorConcurrency(tiers);
359
+ const pending: PendingDispatch[] = [];
360
+
361
+ for (const channelUuid of channels) {
362
+ const cursor = readCursor(transportRoot, actorName, channelUuid);
363
+ if (cursor === head) continue;
364
+
365
+ // First encounter: seed to HEAD so only future messages are dispatched.
366
+ // Without this, a null cursor falls through to `post = messages` and
367
+ // replays the full channel history on every fresh-state boot.
368
+ if (cursor === null) {
369
+ writeCursor(transportRoot, actorName, channelUuid, head);
671
370
  continue;
672
371
  }
673
372
 
674
- const tiers = host.actors[actorName];
675
- const concurrency = actorConcurrency(tiers);
676
-
677
- // Mailbox batch-drain: for each channel, collect ALL unread messages
678
- // addressed to this actor into a single PendingDispatch. This collapses
679
- // fan-in O(N) into O(1) CLI invocations and prevents one actor's deep
680
- // backlog from starving its peers in the (actor, channel) scan order.
681
- // Read receipts and self-sent messages are filtered here — receipts
682
- // are bookkeeping the actor already produced, and self-messages would
683
- // create a wake-up loop.
684
- const pending: PendingDispatch[] = [];
685
- const channels = discoverChannels(transportRoot);
686
- for (const channelUuid of channels) {
687
- const cursor = readCursor(transportRoot, actorName, channelUuid);
688
- const messages = listChannelMessages(transportRoot, channelUuid);
689
- const post = cursor ? messages.filter((m) => m.relPath > cursor) : messages;
690
-
691
- log('tick_scan', {
692
- actor: actorName,
693
- channel: channelUuid.slice(0, 8),
694
- cursor: cursor ?? '(none)',
695
- total_msgs: messages.length,
696
- post_cursor_msgs: post.length,
697
- });
698
-
699
- const channelBatch: ChannelMessage[] = [];
700
- for (const msg of post) {
701
- const to = recipients(msg.data['to']);
702
- const from = typeof msg.data['from'] === 'string' ? msg.data['from'] : 'unknown';
703
- const msgType = typeof msg.data['type'] === 'string' ? msg.data['type'] : 'text';
704
- // Host-aware routing match. A recipient may target this actor
705
- // either by bare name (`junior-developer` — broadcast to every
706
- // host that declares the actor) or by `actor@host` (narrowed to
707
- // a specific host). Bare-name match always succeeds when the
708
- // actor name matches; @host match succeeds only when the host
709
- // alias also matches this dispatcher's host. A recipient that
710
- // names this actor but targets a different host is flagged as
711
- // `host_routing_mismatch` so silent wrong-host routes are
712
- // surfaced rather than dropped without trace. See concierge.md
713
- // "Host-aware routing" + PROTOCOL.md.
714
- const routing = matchHostRouting(to, actorName, host.alias);
715
- if (!routing.addressed || from === actorName || msgType === 'read') {
716
- if (routing.wrongHost) {
717
- log('host_routing_mismatch', {
718
- actor: actorName,
719
- this_host: host.alias,
720
- channel: channelUuid.slice(0, 8),
721
- msg: msg.relPath,
722
- to,
723
- });
724
- }
725
- writeCursor(transportRoot, actorName, channelUuid, msg.relPath);
726
- continue;
727
- }
728
- // Lifecycle activation rule. `work` always wakes. `result` wakes
729
- // only if reply-causal — actor previously sent the sender a `work`
730
- // in this channel. The kind used here is the runtime's INFERRED
731
- // effective kind, not the actor's declared kind: a message that's
732
- // causally a reply is treated as `result` even when an actor (or
733
- // `crosstalk send`'s default) labelled it `work`, so a fan-in peer
734
- // mislabeling its reply can't forge a wake-up loop. See PROTOCOL.md
735
- // "Message kinds".
736
- const kind = effectiveKind(messages, msg);
737
- if (kind === 'result' && !hasPriorWork(messages, actorName, from, msg.relPath)) {
738
- writeCursor(transportRoot, actorName, channelUuid, msg.relPath);
739
- continue;
740
- }
741
- channelBatch.push(msg);
742
- }
743
- if (channelBatch.length > 0) {
744
- const groups = splitForConcurrency(channelBatch, concurrency);
745
- for (const g of groups) {
746
- pending.push({ actorName, channelUuid, msgs: g, tiers });
747
- }
373
+ const messages = listChannelMessages(transportRoot, channelUuid);
374
+ const senderByRelPath = new Map(messages.map((m) => [m.relPath, messageSender(m)]));
375
+ const senderOf = (relPath: string) => senderByRelPath.get(relPath);
376
+
377
+ let added = addedSince.get(cursor);
378
+ if (added === undefined) {
379
+ const files = newFilesSince(transportRoot, cursor);
380
+ added = files === null ? null : new Set(files);
381
+ addedSince.set(cursor, added);
382
+ if (added === null) {
383
+ logError(transportRoot, 'other', `cursor commit ${cursor.slice(0, 12)} unknown to this clone — full channel re-scan`);
748
384
  }
749
385
  }
386
+ let post = messages;
387
+ if (added !== null) {
388
+ const prefix = `data/channels/${channelUuid}/`;
389
+ post = messages.filter((m) => added.has(prefix + m.relPath));
390
+ }
391
+ if (post.length === 0) {
392
+ writeCursor(transportRoot, actorName, channelUuid, head);
393
+ continue;
394
+ }
750
395
 
751
- // Concurrency now applies across (channel) batches, not individual
752
- // messages. Each batch is one CLI invocation regardless of how many
753
- // messages it carries. Cursor advances to the last message in the batch
754
- // on success or skip — failure (DLQ) leaves the cursor behind so the
755
- // tail of the batch retries.
756
- for (let i = 0; i < pending.length; i += concurrency) {
757
- const batch = pending.slice(i, i + concurrency);
758
- const results = await Promise.all(batch.map((p) => dispatchOne(p)));
759
- for (let j = 0; j < batch.length; j++) {
760
- const p = batch[j];
761
- const lastRelPath = p.msgs[p.msgs.length - 1].relPath;
762
- writeCursor(transportRoot, p.actorName, p.channelUuid, lastRelPath);
763
- if (results[j]) didWork = true;
396
+ const batch: ChannelMessage[] = [];
397
+ for (const msg of post) {
398
+ if (msg.data['type'] !== 'text') continue;
399
+ const decision = decideWake(
400
+ {
401
+ from: messageSender(msg),
402
+ to: recipients(msg.data['to']),
403
+ re: reList(msg.data['re']),
404
+ },
405
+ actorName,
406
+ host.alias,
407
+ senderOf,
408
+ );
409
+ if (decision === 'wake') {
410
+ batch.push(msg);
411
+ } else if (decision === 'wrong-host') {
412
+ log('host_routing_mismatch', {
413
+ actor: actorName,
414
+ this_host: host.alias,
415
+ channel: channelUuid.slice(0, 8),
416
+ msg: msg.relPath,
417
+ to: recipients(msg.data['to']),
418
+ });
764
419
  }
765
420
  }
421
+
422
+ if (batch.length === 0) {
423
+ writeCursor(transportRoot, actorName, channelUuid, head);
424
+ continue;
425
+ }
426
+ for (const g of splitForConcurrency(batch, concurrency)) {
427
+ pending.push({ actorName, channelUuid, msgs: g, tiers });
428
+ }
766
429
  }
767
430
 
768
- // Always attempt commit+push at end of tick — gitCommitAndPush
769
- // short-circuits if the working tree is clean. This is required
770
- // even when no replies were produced, because cursors advance for
771
- // messages addressed to other actors (the actor's own replies and
772
- // read receipts appear on the next pull and need to be skipped past).
773
- // Without this commit, the orphan cursor change blocks the next
774
- // git pull --rebase and dispatch dead-ends in backoff.
775
- const commitMsg = didWork
776
- ? `dispatch: replies + cursor advance ${new Date().toISOString()}`
777
- : `dispatch: cursor advance ${new Date().toISOString()}`;
778
- const pushResult = gitCommitAndPush(transportRoot, commitMsg);
779
- if (!pushResult.ok && pushResult.error) {
780
- // Same rationale as the pull case above: no writeErrorLog.
781
- // Repeated push failures shouldn't flood errors/ since that
782
- // contributes to the same git-deadlock-feedback that pull does.
783
- const kind = pushResult.committed ? 'git_push' : 'git_commit';
784
- log('git_push_failed', {
785
- kind,
786
- committed_locally: pushResult.committed,
787
- error: pushResult.error.slice(0, 200),
788
- });
789
- infraOk = false;
431
+ // Waves of `concurrency` parallel CLI invocations. The cursor advances
432
+ // to the scanned commit whether each batch succeeded or DLQ'd —
433
+ // at-least-once was attempted; `crosstalk dlq --retry` rewinds the
434
+ // cursor explicitly. A crash mid-wave leaves the cursor behind, so the
435
+ // whole span replays next tick (at-least-once, never lost).
436
+ for (let i = 0; i < pending.length; i += concurrency) {
437
+ const wave = pending.slice(i, i + concurrency);
438
+ const results = await Promise.all(wave.map((p) => dispatchOne(p)));
439
+ if (results.some(Boolean)) didWork = true;
440
+ }
441
+ for (const p of pending) {
442
+ writeCursor(transportRoot, p.actorName, p.channelUuid, head);
790
443
  }
444
+ }
791
445
 
792
- // Periodic stale-read-receipt sweep
793
- if (Date.now() - lastSweepAt > SWEEP_INTERVAL_MS) {
794
- const surfaced = sweepStaleReadReceipts(transportRoot, STALE_RECEIPT_THRESHOLD_MS);
795
- lastSweepAt = Date.now();
796
- if (surfaced > 0) {
797
- log('stale_receipts_surfaced', { count: surfaced });
798
- }
446
+ if (didWork) {
447
+ const pushResult = await withLock(transportRoot, 'git', async () =>
448
+ gitCommitAndPush(transportRoot, `dispatch: replies ${new Date().toISOString()}`),
449
+ );
450
+ if (!pushResult.ok && pushResult.error) {
451
+ logError(transportRoot, pushResult.committed ? 'git_push' : 'git_commit', pushResult.error);
452
+ log('git_push_failed', { committed_locally: pushResult.committed, error: pushResult.error.slice(0, 200) });
453
+ infraOk = false;
799
454
  }
455
+ }
800
456
 
801
- return { didWork, infraOk };
802
- });
457
+ return { didWork, infraOk };
803
458
  }
804
459
 
805
- async function waitForWakeOrTimeout(ms: number): Promise<'wake' | 'timeout'> {
806
- const wakeDir = join(transportRoot, '.turnq');
807
- mkdirSync(wakeDir, { recursive: true });
460
+ async function waitForWakeOrTimeout(ms: number): Promise<void> {
461
+ const dir = stateDir(transportRoot);
808
462
  const ac = new AbortController();
809
463
  const timer = setTimeout(() => ac.abort(), ms);
810
464
  try {
811
- const watcher = watch(wakeDir, { signal: ac.signal });
465
+ const watcher = watch(dir, { signal: ac.signal });
812
466
  for await (const ev of watcher) {
813
- if (ev.filename === 'wake.signal') {
814
- clearTimeout(timer);
815
- return 'wake';
816
- }
467
+ if (ev.filename === 'wake.signal') return;
817
468
  }
818
- return 'timeout';
819
469
  } catch {
820
- return 'timeout';
470
+ /* abort = timeout */
821
471
  } finally {
822
472
  clearTimeout(timer);
823
473
  }
824
474
  }
825
475
 
826
476
  async function main(): Promise<void> {
827
- log('dispatch_start', {
828
- transport: transportRoot,
829
- version: RUNTIME_VERSION,
830
- log_file: logFile ?? null,
831
- });
477
+ writePidfile(transportRoot);
478
+ const cleanup = () => removePidfile(transportRoot);
479
+ process.on('exit', cleanup);
480
+ process.on('SIGTERM', () => { cleanup(); process.exit(0); });
481
+ process.on('SIGINT', () => { cleanup(); process.exit(0); });
482
+
483
+ log('dispatch_start', { transport: transportRoot, version: RUNTIME_VERSION, state_dir: stateDir(transportRoot) });
832
484
  if (onceMode) {
833
485
  await dispatchTick();
834
- return;
486
+ process.exit(0);
835
487
  }
836
- log('coordinator_running', { quiet_poll_s: pollSeconds, active_poll_s: 1 });
488
+ log('dispatch_running', { quiet_poll_s: pollSeconds });
837
489
 
838
490
  let consecutiveInfraFailures = 0;
839
-
840
491
  while (true) {
841
492
  try {
842
493
  const r = await dispatchTick();
843
494
  if (r.infraOk) {
844
- if (consecutiveInfraFailures > 0) {
845
- log('backoff_cleared', { previous_consecutive_failures: consecutiveInfraFailures });
846
- }
495
+ if (consecutiveInfraFailures > 0) log('backoff_cleared', { previous_failures: consecutiveInfraFailures });
847
496
  consecutiveInfraFailures = 0;
848
497
  } else {
849
498
  consecutiveInfraFailures++;
850
499
  }
851
-
852
- // Backoff kicks in only after a grace period of failures.
853
500
  const beyondGrace = Math.max(0, consecutiveInfraFailures - BACKOFF_GRACE);
854
501
  const backoffFactor = Math.min(MAX_BACKOFF_MULTIPLIER, 2 ** beyondGrace);
855
-
856
- if (consecutiveInfraFailures > BACKOFF_GRACE) {
857
- log('backoff_active', {
858
- consecutive_failures: consecutiveInfraFailures,
859
- factor: backoffFactor,
860
- });
502
+ if (backoffFactor > 1) {
503
+ log('backoff_active', { consecutive_failures: consecutiveInfraFailures, factor: backoffFactor });
861
504
  }
862
-
863
- // Per-tick heal: deadlock-break when the dispatch loop has been
864
- // failing for HEAL_THRESHOLD consecutive ticks AND we haven't healed
865
- // recently. Hard-resets the working tree to origin/<current branch>.
866
- // Trades any uncommitted local state for forward progress — acceptable
867
- // because messages/cursors/dlq are pulled back from origin and
868
- // .turnq/errors are regenerated.
869
- if (
870
- consecutiveInfraFailures >= HEAL_THRESHOLD &&
871
- consecutiveInfraFailures - lastHealAtFailureCount >= HEAL_THRESHOLD
872
- ) {
873
- try {
874
- const branchProc = spawn('git', ['rev-parse', '--abbrev-ref', 'HEAD'], {
875
- cwd: transportRoot,
876
- stdio: ['ignore', 'pipe', 'ignore'],
877
- });
878
- let branchName = '';
879
- branchProc.stdout.on('data', (d) => { branchName += d.toString(); });
880
- await new Promise<void>((res) => branchProc.on('close', () => res()));
881
- const branch = branchName.trim() || 'main';
882
- log('per_tick_heal_start', {
883
- consecutive_failures: consecutiveInfraFailures,
884
- target: `origin/${branch}`,
885
- });
886
- await new Promise<void>((res) => {
887
- const p = spawn('sh', [
888
- '-c',
889
- `git rebase --abort 2>/dev/null; git fetch --quiet origin '${branch}' && git reset --hard --quiet 'origin/${branch}' && git clean -fdq`,
890
- ], { cwd: transportRoot, stdio: 'inherit' });
891
- p.on('close', () => res());
892
- });
893
- log('per_tick_heal_done', { target: `origin/${branch}` });
894
- lastHealAtFailureCount = consecutiveInfraFailures;
895
- } catch (err) {
896
- log('per_tick_heal_failed', { error: (err as Error).message });
897
- }
898
- }
899
-
900
505
  if (r.didWork) {
901
506
  await new Promise((res) => setTimeout(res, 1_000 * backoffFactor));
902
507
  } else {
@@ -904,7 +509,7 @@ async function main(): Promise<void> {
904
509
  }
905
510
  } catch (err) {
906
511
  const msg = (err as Error).message;
907
- writeErrorLog(transportRoot, 'other', `tick error: ${msg}`);
512
+ logError(transportRoot, 'other', `tick error: ${msg}`);
908
513
  log('tick_error', { message: msg });
909
514
  consecutiveInfraFailures++;
910
515
  await new Promise((res) => setTimeout(res, pollSeconds * 1_000));