@cotal-ai/core 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/endpoint.js CHANGED
@@ -1,13 +1,14 @@
1
1
  import { EventEmitter } from "node:events";
2
2
  import { randomUUID } from "node:crypto";
3
- import { connect, credsAuthenticator, nanos, AuthorizationError, PermissionViolationError, UserAuthenticationExpiredError, } from "@nats-io/transport-node";
3
+ import { connect, credsAuthenticator, nanos, AuthorizationError, PermissionViolationError, UserAuthenticationExpiredError, NoRespondersError, RequestError, } from "@nats-io/transport-node";
4
4
  import { idFromCreds } from "./identity.js";
5
5
  import { assertValidName } from "./resolve.js";
6
- import { createSpaceStreams, chatDurableConfig, dmDurableConfig, taskDurableConfig, MAX_MSGS_PER_SUBJECT } from "./streams.js";
6
+ import { createSpaceStreams, dmDurableConfig, dlvDurableConfig, taskDurableConfig, fanoutDurableConfig, inboxReaderConfig, MAX_MSGS_PER_SUBJECT } from "./streams.js";
7
7
  import { jetstream, jetstreamManager, AckPolicy, DeliverPolicy, } from "@nats-io/jetstream";
8
8
  import { Kvm } from "@nats-io/kv";
9
- import { openChannelRegistry, effectiveReplay, effectiveReplayWindowMs, readChannelConfig, readChannelDefaults, } from "./channels.js";
10
- import { anycastSubject, CHANNEL_DEFAULTS_KEY, chatStream, chatDurable, chatHistDurable, chatSubject, collapseFilterSubjects, controlServiceSubject, CONTROL_SELF_SERVICE, dmStream, dmDurable, isConcreteChannel, normalizeMentions, parseSubject, presenceBucket, spacePrefix, spaceWildcard, subjectMatches, taskStream, taskDurable, token, unicastSubject, } from "./subjects.js";
9
+ import { openMembersRegistry, commitMember, tombstoneMember, activateMember, readMember, listMembers, durableEligible, StaleMembershipWrite, } from "./members.js";
10
+ import { openChannelRegistry, effectiveReplay, effectiveReplayWindowMs, effectiveDeliveryClass, readChannelConfig, readChannelDefaults, } from "./channels.js";
11
+ import { anycastSubject, CHANNEL_DEFAULTS_KEY, chatStream, chatHistDurable, chatSubject, controlServiceSubject, CONTROL_SELF_SERVICE, dmStream, dmDurable, dlvStream, dlvDurable, dlvSubject, dinboxSubject, inboxStream, parseDinboxOwner, FANOUT_DURABLE, INBOX_READER_DURABLE, chatWildcard, channelInAllow, isConcreteChannel, normalizeMentions, parseSubject, presenceBucket, spacePrefix, spaceWildcard, subjectMatches, taskStream, taskDurable, token, unicastSubject, } from "./subjects.js";
11
12
  export const DEFAULT_SERVER = "nats://127.0.0.1:4222";
12
13
  /** Space joined when none is given on the CLI (the `cotal-<space>` cmux tab, etc.). */
13
14
  export const DEFAULT_SPACE = "main";
@@ -23,6 +24,10 @@ export const DEFAULT_SPACE = "main";
23
24
  * synchronously on an unhandled "error" — a missing listener turns any such fault into a
24
25
  * process crash instead of a logged denial.
25
26
  */
27
+ /** Plane-3 trusted-reader redelivery ceiling: a dinbox entry that keeps failing re-auth-defer
28
+ * (unknown owner) or DELIVER transfer is `term()`d + surfaced after this many redeliveries, so one
29
+ * stuck/poison entry can't head-of-line the single shared reader forever. */
30
+ const READER_MAX_REDELIVERIES = 10;
26
31
  export class CotalEndpoint extends EventEmitter {
27
32
  card;
28
33
  space;
@@ -45,6 +50,11 @@ export class CotalEndpoint extends EventEmitter {
45
50
  jsm;
46
51
  kv;
47
52
  channelKv;
53
+ /** Plane-3 durable-membership registry KV — lazily opened by the privileged (manager) endpoint. */
54
+ membersKv;
55
+ /** When set, this endpoint hosts the Plane-3 fan-out writer + trusted reader (the manager). `aclFor`
56
+ * maps an owner id to its current read ACL (`allowSubscribe`) for the reader's re-authorization. */
57
+ plane3;
48
58
  /** Live local cache of the channel registry (key = channel token), kept by a KV watch. */
49
59
  channelConfigs = new Map();
50
60
  channelDefaults = {};
@@ -58,11 +68,45 @@ export class CotalEndpoint extends EventEmitter {
58
68
  histLock = Promise.resolve();
59
69
  subs = [];
60
70
  streamMsgs = [];
71
+ /** Per-channel native core subscriptions (SPEC v0.3) — the manager-free live read path for boot +
72
+ * runtime channels (there is no per-instance chat durable). Keyed by channel so leave unsubscribes
73
+ * just one. */
74
+ chatSubs = new Map();
75
+ /** Channels whose core-sub the broker refused (async sub.allow violation) — read by the
76
+ * broker-confirmed join: a denied subscribe is NOT a successful join (SPEC conformance #13). */
77
+ chatSubDenied = new Set();
78
+ /** Channels this session has a Plane-3 durable backstop for (per-channel join GENERATION, from
79
+ * durableJoin, so leave passes it back for the stale-leave guard). A durable channel's core-sub is
80
+ * NOT coverage-dropped — it stays a live wake-hint, dedup-coalesced with the Plane-3 durable copy by
81
+ * id-dedup. Drives the durable-state surface + routes leave to `durableLeave`. PERSISTS across
82
+ * reconnect (like `this.channels`): the membership record + the `dlv_<id>` durable are persistent so
83
+ * the backstop survives a reconnect on its own; the agent can't re-read the privileged members KV,
84
+ * so this in-memory mirror is kept, not rebuilt. Cleared only on full stop. */
85
+ plane3Channels = new Map();
86
+ /** Channels whose live sub was REFUSED while they held a Plane-3 durable membership, whose §7
87
+ * tombstone has not yet confirmed (channel → join generation). {@link closeRefusedMembership} retries
88
+ * the tombstone until it lands; until then this is a `durable-unclosed` state surfaced via
89
+ * {@link pendingDurableLeaves} (the connector shows it in `cotal_channels`, never as ordinary
90
+ * absence). Persists across reconnect; cleared on tombstone success or full stop. */
91
+ pendingDurableLeave = new Map();
92
+ /** Chat-join subjects currently being broker-confirmed. An out-of-ACL subscribe among these trips an
93
+ * EXPECTED async permission violation that joinChannel turns into a clean throw, so watchStatus
94
+ * suppresses it rather than surfacing a spurious connection error. */
95
+ confirmingChatSubs = new Set();
96
+ /** True until the first successful connect completes its boot backfill — distinguishes first-connect
97
+ * (backfill the boot channels' history) from a reconnect (reopen the core-subs, no re-backfill).
98
+ * Persists across reconnect (NOT connection-scoped). Replaces the legacy chat-durable consumed-cursor
99
+ * signal now that there is no per-instance chat durable. */
100
+ firstConnect = true;
61
101
  heartbeatTimer;
62
102
  sweepTimer;
63
103
  roster = new Map();
64
104
  status = "idle";
65
105
  activity;
106
+ /** Mirror of the connector's authoritative attention state, published in presence (advisory). The
107
+ * endpoint never reads these back into delivery — they exist only to broadcast. */
108
+ attentionMode;
109
+ channelModes;
66
110
  stopped = false;
67
111
  /** In-flight rebuild (drain+rebind) — serializes manual reconnect, the supervisor's
68
112
  * closed(), and reestablishLoop so only ONE rebuild runs at a time (a second trigger
@@ -106,6 +150,9 @@ export class CotalEndpoint extends EventEmitter {
106
150
  this.doRegister = opts.registerPresence ?? true;
107
151
  this.doWatch = opts.watchPresence ?? true;
108
152
  this.doConsume = opts.consume ?? true;
153
+ // Seed the presence mirror so file-default channel modes are visible from the first publish
154
+ // (not only after the first runtime toggle). Mirror only — delivery reads the connector's state.
155
+ this.channelModes = opts.channelModes && Object.keys(opts.channelModes).length ? opts.channelModes : undefined;
109
156
  this.ackWaitMs = opts.ackWaitMs ?? 60_000;
110
157
  this.inactiveThresholdMs = opts.inactiveThresholdMs ?? 600_000;
111
158
  }
@@ -173,6 +220,10 @@ export class CotalEndpoint extends EventEmitter {
173
220
  await this.ensureStreams();
174
221
  await this.startConsumers();
175
222
  }
223
+ // Re-arm Plane-3 (manager-hosted fan-out + trusted reader) on every (re)connect — no-op unless this
224
+ // endpoint hosts it. The first arm comes from startPlane3 (after start()); this re-binds the loops
225
+ // a reconnect's clearConnectionScoped() tore down, so a broker blip doesn't silently kill the backstop.
226
+ await this.armPlane3();
176
227
  // Bound and live — covers initial start, manual reconnect, AND background self-heal (every
177
228
  // path lands here). The single signal an in-process agent's connected flag tracks.
178
229
  this.emit("connection", { connected: true });
@@ -198,6 +249,17 @@ export class CotalEndpoint extends EventEmitter {
198
249
  }
199
250
  }
200
251
  this.streamMsgs.length = 0;
252
+ for (const sub of this.chatSubs.values()) {
253
+ try {
254
+ sub.unsubscribe();
255
+ }
256
+ catch {
257
+ /* already closed with the connection */
258
+ }
259
+ }
260
+ this.chatSubs.clear();
261
+ this.chatSubDenied.clear();
262
+ this.confirmingChatSubs.clear();
201
263
  this.roster.clear();
202
264
  this.joinSeq.clear();
203
265
  this.channelConfigs.clear();
@@ -515,6 +577,30 @@ export class CotalEndpoint extends EventEmitter {
515
577
  this.status = status;
516
578
  await this.publishPresence();
517
579
  }
580
+ /** Publish the agent's global attention mode into presence (advisory observability). Mirror only —
581
+ * delivery decisions stay in the connector's authoritative state. */
582
+ async setAttention(attention) {
583
+ this.attentionMode = attention;
584
+ await this.publishPresence();
585
+ }
586
+ /** Publish the agent's per-channel attention overrides into presence (advisory). An empty map drops
587
+ * the field. Mirror only — never read back into delivery. */
588
+ async setChannelModes(modes) {
589
+ this.channelModes = Object.keys(modes).length ? modes : undefined;
590
+ await this.publishPresence();
591
+ }
592
+ /** Overlay the host's live model onto the card's display-only `meta.model` and republish presence.
593
+ * For connectors that learn the actual model only *after* launch (e.g. Claude Code's `SessionStart`
594
+ * hook payload) rather than from an operator pin. Display-only discovery metadata; a no-op when the
595
+ * value is empty or already current (no redundant publish). The mutated card is read live by every
596
+ * later publish, so even a pre-connect call surfaces on the first presence write. */
597
+ async setCardModel(model) {
598
+ const m = model.trim();
599
+ if (!m || this.card.meta?.model === m)
600
+ return;
601
+ this.card.meta = { ...(this.card.meta ?? {}), model: m };
602
+ await this.publishPresence();
603
+ }
518
604
  // ---- channel discovery ---------------------------------------------------
519
605
  /** This channel's registry config from the live local cache (undefined if unset). */
520
606
  getChannelConfig(channel) {
@@ -531,78 +617,105 @@ export class CotalEndpoint extends EventEmitter {
531
617
  return [...this.channels];
532
618
  }
533
619
  /**
534
- * Join a channel mid-session: add it to our chat durable's `filter_subjects` (same durable,
535
- * same ack-floor, no teardown — `update` rides the self-scoped create grant), capture the
536
- * stream frontier as this channel's join watermark, and backfill its history if replay is on.
537
- * Idempotent: re-joining a channel already in our filter is a no-op (no re-backfill). Returns
538
- * the number of historical messages backfilled (emitted as `historical` "message" events).
620
+ * Join a channel mid-session: open a native core subscription (manager-free live read, broker-
621
+ * confirmed against `sub.allow`), capture the stream frontier as the join watermark, backfill its
622
+ * history if replay is on, and for a `durable`-class channel under a manager — request a Plane-3
623
+ * durable backstop. Idempotent: re-joining is a no-op (no re-backfill). Returns the backfill count +
624
+ * whether the durable backstop is active (+ a `reason` when a durable channel couldn't get one).
539
625
  */
540
626
  async joinChannel(channel) {
541
627
  if (!this.jsm)
542
628
  throw new Error(this.notLiveMsg());
543
629
  if (this.channels.includes(channel))
544
- return { joined: false, backfilled: 0 };
545
- // Arm the watermark BEFORE the filter flip (single-delivery: a tail message on the new
546
- // channel is then either ≤ frontier → backfill-only or > frontier → tail-only, never both),
547
- // and filter BEFORE backfill (gap-safe: backfill-first leaves a window in neither stream).
630
+ return { joined: false, backfilled: 0, durable: this.plane3Channels.has(channel) };
631
+ // Arm the watermark BEFORE going live: the backfill reads ≤ frontier and the core-sub only ever
632
+ // delivers post-subscribe live messages (> frontier), so the two never overlap.
548
633
  const armed = await this.armJoin([channel]);
634
+ // Live read (SPEC v0.3): open the native core subscription — MANAGER-FREE, broker-enforced by
635
+ // sub.allow. This is what lets an agent join a channel's live feed on its own. The sub.allow
636
+ // refusal is async — broker-confirm before committing local join state; the subscribe handler
637
+ // ALSO drops a channel on ANY refusal (incl. a late one), so this is not a timing gamble (#13).
638
+ this.subscribeChat(channel);
549
639
  try {
550
- await this.setChatFilter([...this.channels, channel]);
640
+ await this.confirmChatSub();
551
641
  }
552
642
  catch (e) {
553
- this.joinSeq.delete(channel); // the flip was rejected (e.g. outside allowSubscribe) — undo the arm
554
- throw e;
643
+ // The confirm boundary (flush) failed — the connection drained/closed mid-join, so we have NO
644
+ // confirmation the subscribe was accepted. Fail closed: undo the half-open join rather than
645
+ // returning as if it were confirmed (a reconnect re-confirms from this.channels, which we never
646
+ // pushed to). unsubscribeChat clears chatSubs + confirmingChatSubs.
647
+ this.unsubscribeChat(channel);
648
+ this.joinSeq.delete(channel);
649
+ throw new Error(`cannot join "${channel}": live subscription could not be confirmed (${e.message})`);
650
+ }
651
+ this.confirmingChatSubs.delete(chatSubject(this.space, "*", channel));
652
+ if (this.chatSubDenied.has(channel)) {
653
+ this.unsubscribeChat(channel);
654
+ this.joinSeq.delete(channel);
655
+ throw new Error(`cannot join "${channel}": not within this agent's read ACL (allowSubscribe)`);
555
656
  }
556
657
  this.channels.push(channel);
658
+ // Durable backstop. The live core-sub above already delivers (manager-free). For a `durable`-class
659
+ // channel, request a Plane-3 per-member backstop from the manager (durableJoin) so a post reaches a
660
+ // busy/offline turn — the core-sub stays as the live wake-hint, dedup-coalesced with the Plane-3
661
+ // copy by id-dedup. No manager (open dev / manager-less) ⇒ joined LIVE only, surfaced via `reason`
662
+ // (never silent). A `live`-class channel takes no backstop (joined live is the contract).
663
+ let durable = false;
664
+ let reason;
665
+ if (effectiveDeliveryClass(this.channelConfigs.get(channel), this.channelDefaults) === "durable") {
666
+ try {
667
+ const r = await this.durableJoinChannel(channel);
668
+ if (r.durable) {
669
+ this.plane3Channels.set(channel, r.generation ?? 0);
670
+ durable = true;
671
+ }
672
+ else {
673
+ reason = r.reason ?? "durable backstop unavailable";
674
+ }
675
+ }
676
+ catch (e) {
677
+ // No privileged writer (manager-less) or the write was rejected — joined live, backstop
678
+ // unavailable. NOT a join failure: the live subscription is up and authorized.
679
+ reason = `durable backstop unavailable (${e.message})`;
680
+ }
681
+ }
557
682
  const backfilled = await this.backfillArmed(armed);
558
- return { joined: true, backfilled };
559
- }
560
- /** Leave a channel mid-session: drop it from the durable's `filter_subjects`. Refuses to leave
561
- * the *last* channel (an empty filter would match every chat subject — the opposite of
562
- * leaving). Returns whether anything changed. */
683
+ return { joined: true, backfilled, durable, ...(reason !== undefined ? { reason } : {}) };
684
+ }
685
+ /** Leave a channel mid-session — MANAGER-FREE for the live read: close the core subscription. For a
686
+ * Plane-3 durable channel, the membership is tombstoned FIRST at the leave cursor (SPEC §7: leave is
687
+ * a hard read boundary for the backstop — a pre-leave entry stays deliverable, `seq > leaveCursor` is
688
+ * denied). FAIL-CLOSED: if the tombstone can't be confirmed the call throws and the leave is NOT
689
+ * applied (live sub stays up, local mirror intact) so the caller can retry — never close the live
690
+ * read while the backstop keeps delivering. */
563
691
  async leaveChannel(channel) {
564
692
  if (!this.jsm)
565
693
  throw new Error(this.notLiveMsg());
566
- const i = this.channels.indexOf(channel);
567
- if (i < 0)
694
+ if (!this.channels.includes(channel))
568
695
  return { left: false };
569
- if (this.channels.length === 1)
570
- throw new Error(`cannot leave "${channel}" it is your only channel (an empty filter would subscribe to all)`);
571
- const remaining = this.channels.filter((c) => c !== channel);
572
- await this.setChatFilter(remaining);
573
- this.channels.splice(i, 1);
696
+ // Auth + durable-class ⇒ a Plane-3 membership may exist; tombstone it BEFORE touching local state.
697
+ // The join generation comes from the local mirror, but a BOOT membership whose hydration was missed
698
+ // (transient manager error at connect) is NOT in the mirror — so re-resolve it from the manager on
699
+ // demand. FAIL-CLOSED: fetchMemberships throws on a responder-present error, so a leave whose
700
+ // tombstone can't be confirmed propagates (live sub stays up, mirror intact) for the caller to retry
701
+ // — reporting `left` while the trusted reader keeps transferring to DLV is the fail-open leak. A
702
+ // genuine no-responder (open / manager-less, no Plane-3) means there is no membership to tombstone.
703
+ if (this.creds && effectiveDeliveryClass(this.channelConfigs.get(channel), this.channelDefaults) === "durable") {
704
+ let generation = this.plane3Channels.get(channel);
705
+ if (generation === undefined)
706
+ generation = (await this.fetchMemberships())?.find((m) => m.channel === channel)?.generation;
707
+ if (generation !== undefined) {
708
+ await this.durableLeaveChannel(channel, generation);
709
+ this.plane3Channels.delete(channel);
710
+ }
711
+ }
712
+ this.unsubscribeChat(channel);
713
+ const i = this.channels.indexOf(channel);
714
+ if (i >= 0)
715
+ this.channels.splice(i, 1);
574
716
  this.joinSeq.delete(channel);
575
717
  return { left: true };
576
718
  }
577
- /** Move the chat live-tail durable to a new channel set. OPEN mode self-serves the
578
- * `consumers.update` (the agent owns its durable). AUTH mode is bind-only — the agent has no
579
- * UPDATE grant — so it sends a mediated control request to the manager, which validates the set
580
- * ⊆ its `allowSubscribe` before moving the filter. Throws clearly when no privileged responder is
581
- * present: a manager-less standalone auth session is fixed to its boot subscribe set — a
582
- * documented limitation, not a silent degrade. */
583
- async setChatFilter(channels) {
584
- if (!this.jsm)
585
- throw new Error(this.notLiveMsg());
586
- if (!this.creds) {
587
- await this.jsm.consumers.update(chatStream(this.space), chatDurable(this.card.id), {
588
- filter_subjects: collapseFilterSubjects(channels.map((ch) => chatSubject(this.space, "*", ch))),
589
- });
590
- return;
591
- }
592
- let reply;
593
- try {
594
- reply = await this.requestControl(CONTROL_SELF_SERVICE, { op: "setChannels", args: { channels } });
595
- }
596
- catch (e) {
597
- const msg = e.message;
598
- if (/no responders/i.test(msg))
599
- throw new Error("cannot change channels at runtime: no privileged provisioner (manager) is serving the mesh — " +
600
- "this session is fixed to its boot subscribe set");
601
- throw e;
602
- }
603
- if (!reply.ok)
604
- throw new Error(reply.error ?? "channel change rejected");
605
- }
606
719
  /** One coherent channel model for dashboards: every channel that has messages OR a registry
607
720
  * entry (configured-but-empty), each tagged with its {@link ChannelConfig}. Works even on
608
721
  * observer endpoints (no consumers needed). */
@@ -636,56 +749,32 @@ export class CotalEndpoint extends EventEmitter {
636
749
  .sort((a, b) => a.channel.localeCompare(b.channel));
637
750
  }
638
751
  async channelMembers(channel) {
639
- const mgr = await this.manager();
640
- // Group channel patterns by each consumer's durable id-token (chat_<id> → token(id)).
641
- // One peer has one chat consumer, so this is a straight per-peer collection; join/leave
642
- // just mutates that consumer's filter_subjects, which the next call re-reads live.
643
- const byTok = new Map();
644
- for await (const ci of mgr.consumers.list(chatStream(this.space))) {
645
- const tok = chatDurableToken(ci.config.durable_name ?? ci.name);
646
- if (tok === null)
647
- continue;
648
- // The server may report a single filter as `filter_subject` or `filter_subjects` — both
649
- // are the same datum; read whichever is present. Filters are already collapsed (the
650
- // effective subscription), so parse the channel straight out of each.
651
- const filters = ci.config.filter_subjects ?? (ci.config.filter_subject ? [ci.config.filter_subject] : []);
652
- const set = byTok.get(tok) ?? new Set();
653
- for (const f of filters) {
654
- const p = parseSubject(f);
655
- if (p?.kind === "chat")
656
- set.add(p.rest);
657
- }
658
- byTok.set(tok, set);
659
- }
660
- // Join with presence for liveness. token() is lossy, so match forward: index the roster
661
- // by token(id). A durable with no roster match is a ghost/foreign id — keep its token,
662
- // never drop it.
663
- const byToken = new Map();
752
+ const members = (await listMembers(await this.membersRegistry())).filter((r) => r.leaveCursor === undefined && r.activated === true);
753
+ const byId = new Map();
664
754
  for (const p of this.roster.values())
665
- byToken.set(token(p.card.id), p);
666
- const memberFor = (tok) => {
667
- const p = byToken.get(tok);
755
+ byId.set(p.card.id, p);
756
+ const memberForId = (id) => {
757
+ const p = byId.get(id);
668
758
  return p
669
759
  ? { id: p.card.id, name: p.card.name, role: p.card.role, live: p.status !== "offline" }
670
- : { id: tok, name: tok, live: false };
760
+ : { id, name: id, live: false };
671
761
  };
672
762
  const byName = (a, b) => a.name.localeCompare(b.name);
673
- if (channel !== undefined) {
674
- const out = [];
675
- for (const [tok, patterns] of byTok)
676
- if ([...patterns].some((pat) => subjectMatches(pat, channel)))
677
- out.push(memberFor(tok));
678
- return out.sort(byName);
679
- }
763
+ if (channel !== undefined)
764
+ return members
765
+ .filter((r) => subjectMatches(r.channel, channel))
766
+ .map((r) => memberForId(r.owner))
767
+ .sort(byName);
680
768
  const map = new Map();
681
- for (const [tok, patterns] of byTok) {
682
- const m = memberFor(tok);
683
- for (const pat of patterns) {
684
- const arr = map.get(pat);
685
- if (arr)
769
+ for (const r of members) {
770
+ const arr = map.get(r.channel);
771
+ const m = memberForId(r.owner);
772
+ if (arr) {
773
+ if (!arr.some((x) => x.id === m.id))
686
774
  arr.push(m);
687
- else
688
- map.set(pat, [m]);
775
+ }
776
+ else {
777
+ map.set(r.channel, [m]);
689
778
  }
690
779
  }
691
780
  for (const arr of map.values())
@@ -746,8 +835,14 @@ export class CotalEndpoint extends EventEmitter {
746
835
  return;
747
836
  void (async () => {
748
837
  for await (const s of this.nc.status()) {
749
- if (s.type === "error")
750
- this.emit("error", describeStatusError(s.error));
838
+ if (s.type !== "error")
839
+ continue;
840
+ // Suppress the EXPECTED permission violation from a manager-free join we're confirming: an
841
+ // out-of-ACL `nc.subscribe` is refused async on its chat subject, which joinChannel catches
842
+ // and turns into a clean throw — it is not a connection error to surface.
843
+ if (s.error instanceof PermissionViolationError && this.confirmingChatSubs.has(s.error.subject))
844
+ continue;
845
+ this.emit("error", describeStatusError(s.error));
751
846
  }
752
847
  })().catch((e) => {
753
848
  if (!this.stopped)
@@ -777,27 +872,26 @@ export class CotalEndpoint extends EventEmitter {
777
872
  await createSpaceStreams(this.jsm, this.space);
778
873
  }
779
874
  /**
780
- * Privileged: pre-create an agent's bind-only chat live-tail durable (auth mode), filtered to its
781
- * `subscribe` set, so the agent can BIND it without holding CONSUMER.CREATE/UPDATE on CHAT — its
782
- * live read can't be self-widened past `allowSubscribe`. The creator sets the filter; the agent
783
- * never does (mirrors {@link provisionDmInbox}). Idempotent. The caller must be permissive on CHAT.
784
- */
785
- async provisionChatDurable(targetId, subscribe) {
786
- const jsm = await this.manager();
787
- await jsm.consumers.add(chatStream(this.space), chatDurableConfig(this.space, targetId, subscribe));
788
- }
789
- /**
790
- * Privileged: move an agent's bind-only chat durable to a new channel set — the write half of the
791
- * mediated join/leave. The manager calls this AFTER validating the set ⊆ the agent's
792
- * `allowSubscribe`; the agent itself has no UPDATE grant, so this trusted path is the only way its
793
- * live filter moves. The filter is rebuilt from channel names here (not from agent-supplied
794
- * subjects) so a caller can't smuggle a hand-built filter.
875
+ * Privileged: write an agent's BOOT durable membership — each `durable`-class channel in its boot
876
+ * subscribe set gets a Plane-3 durable-active record (via {@link durableJoinFor}: cursor capture +
877
+ * activation catch-up), so it receives durable backstop copies from boot exactly like a runtime
878
+ * `durableJoin`. `live`-class (and non-concrete) channels are skipped. Idempotent.
879
+ *
880
+ * Writes the durable RECORDS with the caller's privileged creds — it does NOT require this endpoint
881
+ * to host the runtime fan-out/reader loops (a space-level manager service), so EVERY auth launcher
882
+ * provisions identically: the manager AND the short-lived `cotal spawn` provisioner both write boot
883
+ * records, which the space's manager then delivers (no silent no-op — that would hide a boot
884
+ * membership; AGENTS.md "no fallbacks"). A space running no manager is live-only for everyone (the
885
+ * records exist; nothing delivers them until a manager hosts the loops).
795
886
  */
796
- async setChatFilterFor(targetId, channels) {
797
- const jsm = await this.manager();
798
- await jsm.consumers.update(chatStream(this.space), chatDurable(targetId), {
799
- filter_subjects: collapseFilterSubjects(channels.map((ch) => chatSubject(this.space, "*", ch))),
800
- });
887
+ async provisionMembership(targetId, channels) {
888
+ for (const ch of channels) {
889
+ if (!isConcreteChannel(ch))
890
+ continue; // durable membership is per-concrete-channel
891
+ if ((await this.deliveryClassFresh(ch)) !== "durable")
892
+ continue;
893
+ await this.durableJoinFor(targetId, ch);
894
+ }
801
895
  }
802
896
  /**
803
897
  * Privileged: pre-create an agent's DM inbox durable (auth mode), so the agent can BIND
@@ -810,6 +904,17 @@ export class CotalEndpoint extends EventEmitter {
810
904
  const jsm = await this.manager();
811
905
  await jsm.consumers.add(dmStream(this.space), dmDurableConfig(this.space, targetId));
812
906
  }
907
+ /**
908
+ * Privileged: pre-create an agent's bind-only Plane-3 DELIVER durable (`dlv_<id>`, filtered to
909
+ * `dlv.<id>`), so the agent can BIND its per-member durable handoff without holding CONSUMER.CREATE
910
+ * on the DLV stream. Same bind-only model as {@link provisionDmInbox}: the creator sets the filter,
911
+ * the agent never does. The trusted reader transfers re-authorized copies onto `dlv.<id>`; the agent
912
+ * acks them via native JetStream (SPEC §8). Idempotent. The caller must be permissive on DLV.
913
+ */
914
+ async provisionDlvInbox(targetId) {
915
+ const jsm = await this.manager();
916
+ await jsm.consumers.add(dlvStream(this.space), dlvDurableConfig(this.space, targetId));
917
+ }
813
918
  /**
814
919
  * Privileged: pre-create a role's shared TASK work-queue durable (auth mode), so agents
815
920
  * of that role can BIND it without holding CONSUMER.CREATE on TASK_<space>. The creator
@@ -820,6 +925,524 @@ export class CotalEndpoint extends EventEmitter {
820
925
  const jsm = await this.manager();
821
926
  await jsm.consumers.add(taskStream(this.space), taskDurableConfig(this.space, role));
822
927
  }
928
+ // ---- Plane-3: durable backstop (SPEC §8) — privileged, manager-hosted ----------------------------
929
+ //
930
+ // Two manager loops + two privileged membership ops. The FAN-OUT writer (routing, not auth) reads
931
+ // every chat message and copies it into each eligible owner's MIXED inbox (`dinbox.<owner>`); the
932
+ // TRUSTED READER (the auth gate) re-authorizes each entry against the CURRENT ACL + membership
933
+ // interval and TRANSFERS the authorized copy to the owner's per-member DELIVER store
934
+ // (`dlv.<owner>`), which the agent binds + acks via native JetStream. The agent holds no read on the
935
+ // mixed store. See `.internal/research/stage4-impl-design.md`.
936
+ /** Lazily open the privileged members registry KV (manager / open-mode self). */
937
+ async membersRegistry() {
938
+ if (!this.nc)
939
+ throw new Error("endpoint not started");
940
+ this.membersKv ??= await openMembersRegistry(this.nc, this.space);
941
+ return this.membersKv;
942
+ }
943
+ /** Privileged: one owner's NON-TOMBSTONED durable memberships as `{channel, generation, activated}` —
944
+ * the manager serves this to a connecting agent (via the `listMemberships` self-service op). The agent
945
+ * hydrates its leave mirror from the ACTIVATED ones (the confirmed backstops), but the non-activated
946
+ * ones are returned too so `leaveChannel` can discover + close a record that still routes under the
947
+ * pure-interval predicate (a crash-stuck pending activation) — without reading the privileged KV. */
948
+ async ownerMemberships(owner) {
949
+ const recs = await listMembers(await this.membersRegistry(), { owner });
950
+ return recs
951
+ .filter((r) => r.leaveCursor === undefined)
952
+ .map((r) => ({ channel: r.channel, generation: r.generation, activated: r.activated === true }));
953
+ }
954
+ /** Effective delivery class read AUTHORITATIVELY from the registry KV (not the watch cache) — so a
955
+ * `live`→`durable` flip is seen by fan-out without a cache-propagation gap (red-team MED-3). */
956
+ async deliveryClassFresh(channel) {
957
+ if (!this.channelKv)
958
+ return effectiveDeliveryClass(undefined, undefined);
959
+ const [cfg, defaults] = await Promise.all([
960
+ isConcreteChannel(channel) ? readChannelConfig(this.channelKv, channel) : Promise.resolve(undefined),
961
+ readChannelDefaults(this.channelKv),
962
+ ]);
963
+ return effectiveDeliveryClass(cfg, defaults);
964
+ }
965
+ /** Collision-safe `@mention` → owner-id resolution: a name that resolves to exactly one present
966
+ * peer wins; 0 or >1 matches drop (never fan a directed durable copy to an unrelated same-named
967
+ * bystander — red-team LOW; SPEC §4 unique instance id). */
968
+ resolveOwnerByName(name) {
969
+ const matches = [...this.roster.values()].filter((p) => p.card.name.toLowerCase() === name.toLowerCase());
970
+ return matches.length === 1 ? matches[0].card.id : undefined;
971
+ }
972
+ /** Publish one fan-out entry into an owner's mixed inbox, idempotent via `Nats-Msg-Id`
973
+ * (`<msgId>:<owner>:<generation>`) so a catch-up copy and a racing fan-out copy collapse. */
974
+ async publishDinbox(owner, entry) {
975
+ if (!this.js)
976
+ return;
977
+ await this.js.publish(dinboxSubject(this.space, owner), JSON.stringify(entry), {
978
+ msgID: `${entry.msg.id}:${owner}:${entry.generation}`,
979
+ });
980
+ }
981
+ /** The fan-out consumer's delivered stream-seq — the activation-fence upper bound (red-team
982
+ * BLOCKER-1: the shared fan-out cursor advances independently of the stream frontier). */
983
+ async fanoutDeliveredSeq() {
984
+ const info = await this.consumerInfo(chatStream(this.space), FANOUT_DURABLE);
985
+ return info?.delivered?.stream_seq ?? 0;
986
+ }
987
+ /**
988
+ * Privileged durable-JOIN write (the manager calls this after validating channel ⊆ allowSubscribe;
989
+ * {@link provisionMembership} calls it at provision time for boot channels): capture `joinCursor`,
990
+ * commit a `durable-active` record (CAS + generation bump), then ACTIVATION CATCH-UP idempotently
991
+ * copies `(joinCursor, fence]` into the owner inbox where `fence = max(frontier, fanoutDelivered)` —
992
+ * fan-out owns `seq > fence`. Idempotent against a timeout-retry (an already-activated membership
993
+ * no-ops). Returns `{durable:false}` (honest degrade) only if the catch-up window was evicted.
994
+ *
995
+ * This writes durable KV + dinbox state with the caller's privileged creds; it does NOT require THIS
996
+ * endpoint to host the fan-out/reader loops (those are a space-level manager service). So a
997
+ * short-lived provisioner can write a boot membership a separate long-lived manager then delivers.
998
+ */
999
+ async durableJoinFor(owner, channel) {
1000
+ if (!this.js)
1001
+ throw new Error("endpoint not started");
1002
+ await this.manager(); // ensure jsm — a non-consuming provisioner inits it lazily; catch-up + fence need it
1003
+ const kv = await this.membersRegistry();
1004
+ const existing = await readMember(kv, channel, owner);
1005
+ const open = existing?.record.state === "durable-active" && existing.record.leaveCursor === undefined;
1006
+ if (open && existing.record.activated)
1007
+ return { durable: true, generation: existing.record.generation }; // fully activated — idempotent
1008
+ // Either a NEW join (no record / a tombstone to supersede) → fresh joinCursor + bumped generation,
1009
+ // OR a retry of an INCOMPLETE activation (durable-active but not yet activated, from an earlier
1010
+ // eviction/crash) → re-run catch-up over the SAME join window, no bump. The record is committed
1011
+ // `activated:false` first and routes IN-INTERVAL immediately (fan-out + reader deliver via the
1012
+ // pure-interval durableEligible) so no live message published during catch-up is lost. `activated`
1013
+ // gates only the REPORT — durableJoin returns true / channelMembers lists the owner only after the
1014
+ // catch-up confirms. A join that never completes catch-up still routes live (harmless: the agent is
1015
+ // live-subscribed and DLV is id-deduped) but honestly reports durable:false and stays hidden.
1016
+ const joinCursor = open ? existing.record.joinCursor : await this.chatFrontier();
1017
+ const generation = open ? existing.record.generation : (existing?.record.generation ?? 0) + 1;
1018
+ const base = {
1019
+ channel, owner, state: "durable-active", joinCursor, generation,
1020
+ activated: false, writerIdentity: this.card.id, updatedAt: Date.now(),
1021
+ };
1022
+ if (!open)
1023
+ await commitMember(kv, base);
1024
+ const fence = Math.max(await this.chatFrontier(), await this.fanoutDeliveredSeq());
1025
+ const cu = await this.catchupCopy(owner, channel, joinCursor, fence, generation);
1026
+ if (cu.evicted) {
1027
+ // Catch-up window irreparably evicted (the oldest in-window message aged out) — this join can never
1028
+ // be a complete backstop. TOMBSTONE the just-committed record at `fence` so it does NOT route:
1029
+ // pure-interval durableEligible would otherwise keep delivering to a record the agent was told is
1030
+ // durable:false AND can't discover to leave (critic BLOCKER-1). Pass `generation` as the expected
1031
+ // generation (ux stale-write guard) so this cleanup can't tombstone a concurrent NEWER rejoin — if
1032
+ // one won, StaleMembershipWrite is the correct no-op (the rejoin is the live record). Then degrade
1033
+ // honestly — a retry is a fresh join (no longer `open`, so a current joinCursor is captured).
1034
+ try {
1035
+ await tombstoneMember(kv, channel, owner, fence, this.card.id, generation);
1036
+ }
1037
+ catch (e) {
1038
+ if (!(e instanceof StaleMembershipWrite))
1039
+ throw e;
1040
+ }
1041
+ return { durable: false, reason: "activation catch-up window partially evicted by retention", generation };
1042
+ }
1043
+ // Flip → reported durable, ATOMICALLY: refuse if a concurrent SAME-generation leave (tombstone) or a
1044
+ // rejoin superseded this pending join while catch-up ran. A blind same-gen commit would clobber the
1045
+ // tombstone (clear leaveCursor) and resurrect the membership, reopening §7 (review-general-2 BLOCKER).
1046
+ const activated = await activateMember(kv, channel, owner, generation, joinCursor);
1047
+ if (!activated)
1048
+ return { durable: false, reason: "activation superseded by a concurrent leave or rejoin", generation };
1049
+ return { durable: true, generation };
1050
+ }
1051
+ /** Privileged durable-LEAVE write: tombstone the membership at `leaveCursor = frontier` so the
1052
+ * backstop denies `seq > leaveCursor` while a pre-leave entry stays deliverable (SPEC §7 interval). */
1053
+ async durableLeaveFor(owner, channel, expectedGeneration) {
1054
+ if (!this.plane3)
1055
+ return; // not a Plane-3 host — no membership to tombstone
1056
+ const kv = await this.membersRegistry();
1057
+ // expectedGeneration (captured by the agent at durableJoin) refuses a stale leave from tombstoning
1058
+ // a newer rejoin (StaleMembershipWrite) — a durable-disable primitive otherwise.
1059
+ await tombstoneMember(kv, channel, owner, await this.chatFrontier(), this.card.id, expectedGeneration);
1060
+ }
1061
+ /** Idempotently copy the eligible chat messages in `(fromSeqExcl, toSeqIncl]` for `channel` into the
1062
+ * owner inbox, via a DEDICATED per-(owner,join) ephemeral consumer (NOT the agent-scoped
1063
+ * `chathist_<id>`/`histLock` — red-team HIGH-8). `evicted` ⇒ the oldest eligible seq aged out under
1064
+ * `discard=Old` (the start seq could not be served), a durable shortfall the caller surfaces. */
1065
+ async catchupCopy(owner, channel, fromSeqExcl, toSeqIncl, generation) {
1066
+ if (!this.js || !this.jsm || toSeqIncl <= fromSeqExcl)
1067
+ return { copied: 0, evicted: false };
1068
+ const subject = chatSubject(this.space, "*", channel);
1069
+ // Eviction = a message in `(joinCursor, …]` on THIS channel's subject aged out under discard=Old.
1070
+ // Judged PER-SUBJECT (reuse channelDropped: oldest-retained-for-subject vs the watermark, only at
1071
+ // the per-subject cap), NOT against the stream-global joinCursor+1 — other channels' traffic
1072
+ // inflates the global seq, so a naive "first delivered seq > joinCursor+1" false-positives on any
1073
+ // busy multi-channel space (impl-review HIGH-2). A true eviction → durableJoin reports durable:false.
1074
+ const evicted = await this.channelDropped(subject, fromSeqExcl);
1075
+ const name = `cu_${token(owner)}_${generation}`;
1076
+ try {
1077
+ await this.jsm.consumers.delete(chatStream(this.space), name);
1078
+ }
1079
+ catch { /* none */ }
1080
+ await this.jsm.consumers.add(chatStream(this.space), {
1081
+ name, filter_subject: subject, ack_policy: AckPolicy.None, mem_storage: true,
1082
+ inactive_threshold: nanos(30_000), deliver_policy: DeliverPolicy.StartSequence, opt_start_seq: fromSeqExcl + 1,
1083
+ });
1084
+ let copied = 0;
1085
+ try {
1086
+ const consumer = await this.js.consumers.get(chatStream(this.space), name);
1087
+ let pending = (await consumer.info()).num_pending;
1088
+ while (pending > 0) {
1089
+ const want = Math.min(pending, 256);
1090
+ const iter = await consumer.fetch({ max_messages: want, expires: 5_000 });
1091
+ let got = 0;
1092
+ for await (const m of iter) {
1093
+ got++;
1094
+ if (m.seq > toSeqIncl)
1095
+ return { copied, evicted };
1096
+ let msg;
1097
+ try {
1098
+ msg = m.json();
1099
+ }
1100
+ catch {
1101
+ continue;
1102
+ }
1103
+ const parsed = parseSubject(m.subject);
1104
+ if (!parsed || msg.from?.id !== parsed.sender || msg.from.id === owner)
1105
+ continue;
1106
+ await this.publishDinbox(owner, { msg, channel, seq: m.seq, reason: "durable-channel", generation });
1107
+ copied++;
1108
+ }
1109
+ if (got < want)
1110
+ break;
1111
+ pending -= got;
1112
+ }
1113
+ }
1114
+ finally {
1115
+ try {
1116
+ await this.jsm.consumers.delete(chatStream(this.space), name);
1117
+ }
1118
+ catch { /* gone */ }
1119
+ }
1120
+ return { copied, evicted };
1121
+ }
1122
+ /** Start the Plane-3 fan-out writer + trusted reader on THIS (privileged) endpoint. `aclFor` maps an
1123
+ * owner id to its current read ACL for the reader's re-authorization (the manager passes its managed
1124
+ * set). Call once after connect; idempotent durable creation lets it resume on a manager restart. */
1125
+ async startPlane3(aclFor) {
1126
+ if (!this.js)
1127
+ throw new Error("endpoint not started");
1128
+ this.plane3 = { aclFor };
1129
+ await this.armPlane3();
1130
+ }
1131
+ /** (Re)bind the Plane-3 fan-out writer + trusted reader. Idempotent — the durables resume from their
1132
+ * cursor. Called by {@link startPlane3} once AND by {@link connectAndBind} on every (re)connect, so
1133
+ * a manager-endpoint reconnect RE-ARMS the backstop. Without this, a broker blip would silently kill
1134
+ * the loops while `durableJoinFor` kept reporting `durable:true` (the impl-review's BLOCKER-1). No-op
1135
+ * unless this endpoint hosts Plane-3 (`this.plane3` set). */
1136
+ async armPlane3() {
1137
+ if (!this.plane3 || !this.js)
1138
+ return;
1139
+ await this.manager(); // the manager runs consume:false, so this.jsm is lazy — ensure it
1140
+ await this.runFanout();
1141
+ await this.runReader();
1142
+ }
1143
+ /** Fan-out loop: bind the privileged `fanout` durable on CHAT and route each message (routing only —
1144
+ * the trusted reader is the auth gate). */
1145
+ async runFanout() {
1146
+ if (!this.js || !this.jsm)
1147
+ return;
1148
+ try {
1149
+ await this.jsm.consumers.add(chatStream(this.space), fanoutDurableConfig(this.space, { ackWaitMs: this.ackWaitMs }));
1150
+ }
1151
+ catch { /* exists */ }
1152
+ const consumer = await this.js.consumers.get(chatStream(this.space), FANOUT_DURABLE);
1153
+ const msgs = await consumer.consume();
1154
+ this.streamMsgs.push(msgs);
1155
+ void (async () => {
1156
+ for await (const m of msgs) {
1157
+ try {
1158
+ await this.fanOutMessage(m);
1159
+ }
1160
+ catch (e) {
1161
+ if (!this.stopped)
1162
+ this.emit("error", e);
1163
+ try {
1164
+ m.nak();
1165
+ }
1166
+ catch { /* draining */ }
1167
+ }
1168
+ }
1169
+ })().catch((e) => { if (!this.stopped)
1170
+ this.emit("error", e); });
1171
+ }
1172
+ /** Route ONE chat message to eligible owners' mixed inboxes. `durable` channel → its `durable-active`
1173
+ * members within interval; `live` channel → `@mention` targets authorized to read it (ACL only).
1174
+ * Members KV is scanned FRESH per message (no cache — red-team BLOCKER-1 catch-up correctness). */
1175
+ async fanOutMessage(m) {
1176
+ const parsed = parseSubject(m.subject);
1177
+ if (!parsed || parsed.kind !== "chat") {
1178
+ m.ack();
1179
+ return;
1180
+ }
1181
+ const channel = parsed.rest;
1182
+ let msg;
1183
+ try {
1184
+ msg = m.json();
1185
+ }
1186
+ catch {
1187
+ m.ack();
1188
+ return;
1189
+ }
1190
+ if (!msg.from || msg.from.id !== parsed.sender) {
1191
+ m.ack();
1192
+ return;
1193
+ } // authenticity
1194
+ const seq = m.seq;
1195
+ if ((await this.deliveryClassFresh(channel)) === "durable") {
1196
+ for (const rec of await listMembers(await this.membersRegistry(), { channel })) {
1197
+ if (rec.owner === msg.from.id)
1198
+ continue; // never backstop the sender's own post
1199
+ if (!durableEligible(rec, seq))
1200
+ continue; // routing fast-filter (reader re-checks)
1201
+ await this.publishDinbox(rec.owner, { msg, channel, seq, reason: "durable-channel", generation: rec.generation });
1202
+ }
1203
+ }
1204
+ else {
1205
+ for (const name of msg.mentions ?? []) {
1206
+ const owner = this.resolveOwnerByName(name);
1207
+ if (!owner || owner === msg.from.id)
1208
+ continue;
1209
+ const acl = this.plane3?.aclFor(owner);
1210
+ if (!acl || !channelInAllow(acl, channel))
1211
+ continue; // @mention can't bypass the read ACL
1212
+ await this.publishDinbox(owner, { msg, channel, seq, reason: "live-mention", generation: 0 });
1213
+ }
1214
+ }
1215
+ m.ack();
1216
+ }
1217
+ /** Trusted-reader loop: bind the single privileged `reader` durable over `dinbox.>` and re-authorize
1218
+ * + transfer each entry. */
1219
+ async runReader() {
1220
+ if (!this.js || !this.jsm)
1221
+ return;
1222
+ try {
1223
+ await this.jsm.consumers.add(inboxStream(this.space), inboxReaderConfig(this.space, { ackWaitMs: this.ackWaitMs }));
1224
+ }
1225
+ catch { /* exists */ }
1226
+ const consumer = await this.js.consumers.get(inboxStream(this.space), INBOX_READER_DURABLE);
1227
+ const msgs = await consumer.consume();
1228
+ this.streamMsgs.push(msgs);
1229
+ void (async () => {
1230
+ for await (const m of msgs) {
1231
+ try {
1232
+ await this.readerHandle(m);
1233
+ }
1234
+ catch (e) {
1235
+ if (!this.stopped)
1236
+ this.emit("error", e);
1237
+ try {
1238
+ m.nak();
1239
+ }
1240
+ catch { /* draining */ }
1241
+ }
1242
+ }
1243
+ })().catch((e) => { if (!this.stopped)
1244
+ this.emit("error", e); });
1245
+ }
1246
+ /** Re-authorize ONE mixed-inbox entry and transfer it to the owner's DELIVER store. Deny (drop) on a
1247
+ * revoked/narrowed ACL or out-of-interval seq; on transfer success, ack the mixed entry (durability
1248
+ * has moved to DLV — an §8 equivalent per-member at-least-once mechanism). The agent acks DLV. */
1249
+ async readerHandle(m) {
1250
+ const owner = parseDinboxOwner(m.subject);
1251
+ if (!owner) {
1252
+ m.ack();
1253
+ return;
1254
+ } // unparseable subject — not a real entry
1255
+ let entry;
1256
+ try {
1257
+ entry = m.json();
1258
+ }
1259
+ catch {
1260
+ m.ack();
1261
+ return;
1262
+ } // undecodable — drop
1263
+ const redeliveries = m.info?.deliveryCount ?? 1; // JsMsg delivery attempts (1 on first delivery)
1264
+ const acl = this.plane3?.aclFor(owner);
1265
+ if (acl === undefined) {
1266
+ // UNKNOWN owner — the manager has not (re)hydrated this owner's ACL yet (e.g. right after a
1267
+ // manager PROCESS restart). This is NOT a revocation: DEFER (redeliver), never drop — an ack here
1268
+ // would lose at-least-once on restart (impl-review BLOCKER-2). A delayed nak + a redelivery
1269
+ // ceiling stops one perma-unknown owner from head-of-lining the shared reader.
1270
+ // (Follow-up: the manager does not yet rehydrate its managed set across a process restart — until
1271
+ // it does, a long-unknown owner's entries term after the ceiling; tracked, not a silent ack-drop.)
1272
+ if (redeliveries >= READER_MAX_REDELIVERIES) {
1273
+ m.term();
1274
+ this.emit("error", new Error(`plane-3 reader: gave up on entry for unknown owner ${owner} after ${redeliveries} redeliveries`));
1275
+ return;
1276
+ }
1277
+ m.nak(2000);
1278
+ return;
1279
+ }
1280
+ // KNOWN owner whose CURRENT ACL no longer covers the channel — a revocation/narrowing. Drop: the
1281
+ // entry is no longer authorized (SPEC §7 current-ACL gate before surfacing).
1282
+ if (!channelInAllow(acl, entry.channel)) {
1283
+ m.ack();
1284
+ return;
1285
+ }
1286
+ if (entry.reason === "durable-channel") {
1287
+ const rec = await readMember(await this.membersRegistry(), entry.channel, owner);
1288
+ // INTERVAL re-auth (not a current-member boolean): a pre-leave entry (seq ≤ leaveCursor) stays
1289
+ // deliverable; seq > leaveCursor (or after a rejoin's newer joinCursor) is the hard cut.
1290
+ if (!rec || !durableEligible(rec.record, entry.seq)) {
1291
+ m.ack();
1292
+ return;
1293
+ }
1294
+ }
1295
+ try {
1296
+ await this.js.publish(dlvSubject(this.space, owner), JSON.stringify(entry.msg), {
1297
+ msgID: `${entry.msg.id}:${owner}:${entry.generation}`,
1298
+ });
1299
+ }
1300
+ catch {
1301
+ // Transfer failed — keep the entry pending (redeliver), bounded by the same ceiling so a poison
1302
+ // entry can't head-of-line the shared reader forever.
1303
+ if (redeliveries >= READER_MAX_REDELIVERIES) {
1304
+ m.term();
1305
+ this.emit("error", new Error(`plane-3 reader: gave up transferring ${entry.msg.id} for ${owner} after ${redeliveries} redeliveries`));
1306
+ return;
1307
+ }
1308
+ m.nak(2000);
1309
+ return;
1310
+ }
1311
+ m.ack();
1312
+ }
1313
+ /** Agent-side: bind + pump our pre-created Plane-3 DELIVER durable (`dlv_<id>`). Every message here is
1314
+ * manager-written (DLV is manager-write-only, broker-enforced) and is a CHANNEL message by contract
1315
+ * (the backstop never carries DMs), so `kind=channel` is path-derived (SPEC §4) and the body is
1316
+ * trusted (no spoof-guard). `durable:true` — real JetStream ack, coalesced with the core-sub live
1317
+ * copy by `MeshAgent.ingest`. No-op when the durable isn't present (open mode / not provisioned). */
1318
+ async pumpDlv() {
1319
+ if (!this.js)
1320
+ return;
1321
+ let consumer;
1322
+ try {
1323
+ consumer = await this.js.consumers.get(dlvStream(this.space), dlvDurable(this.card.id));
1324
+ }
1325
+ catch {
1326
+ return;
1327
+ } // no DLV durable — Plane-3 not active for us
1328
+ const msgs = await consumer.consume();
1329
+ this.streamMsgs.push(msgs);
1330
+ void (async () => {
1331
+ for await (const m of msgs) {
1332
+ let msg;
1333
+ try {
1334
+ msg = m.json();
1335
+ }
1336
+ catch (e) {
1337
+ this.emit("error", e);
1338
+ try {
1339
+ m.term();
1340
+ }
1341
+ catch { /* draining */ }
1342
+ continue;
1343
+ }
1344
+ if (msg.from?.id === this.card.id) {
1345
+ m.ack();
1346
+ continue;
1347
+ } // own echo (defensive)
1348
+ const delivery = { ack: () => m.ack(), nak: () => m.nak(), durable: true };
1349
+ this.emit("message", msg, delivery, { historical: false, kind: "channel" });
1350
+ }
1351
+ })().catch((e) => { if (!this.stopped)
1352
+ this.emit("error", e); });
1353
+ }
1354
+ /** Agent-side: request a Plane-3 durable backstop for a channel via the manager (ctl.self). Throws
1355
+ * when no privileged writer is present (open / manager-less). 30s timeout — activation catch-up may
1356
+ * run before the reply (the window is small, but a busy channel can take more than the 5s default). */
1357
+ async durableJoinChannel(channel) {
1358
+ const reply = await this.requestControl(CONTROL_SELF_SERVICE, { op: "durableJoin", args: { channel } }, 30_000);
1359
+ if (!reply.ok)
1360
+ throw new Error(reply.error ?? "durable join rejected");
1361
+ return reply.data ?? { durable: false };
1362
+ }
1363
+ /** Agent-side: release a Plane-3 durable backstop (tombstone membership at the leave cursor). Passes
1364
+ * the join generation so a stale leave can't tombstone a newer rejoin (the manager validates it). */
1365
+ async durableLeaveChannel(channel, generation) {
1366
+ const reply = await this.requestControl(CONTROL_SELF_SERVICE, { op: "durableLeave", args: { channel, generation } });
1367
+ if (!reply.ok)
1368
+ throw new Error(reply.error ?? "durable leave rejected");
1369
+ }
1370
+ /** Fail-closed async cleanup for a channel forced out by a LATE sub.allow refusal (the broker revoked
1371
+ * the live read). The sync sub callback can't await, so this RETRIES the Plane-3 tombstone with capped
1372
+ * backoff UNTIL IT SUCCEEDS (or the endpoint stops) — the §7 boundary always closes once the manager
1373
+ * is reachable, never a silent give-up. While pending, the channel is tracked in
1374
+ * {@link pendingDurableLeave} and surfaced via {@link pendingDurableLeaves} (the connector shows it in
1375
+ * `cotal_channels` as `durable-unclosed`, never ordinary absence). The generation is kept the whole
1376
+ * time. Authoritative closure of a revoked membership is also the manager's job (revocation). */
1377
+ async closeRefusedMembership(channel, generation) {
1378
+ this.pendingDurableLeave.set(channel, generation);
1379
+ for (let attempt = 0;; attempt++) {
1380
+ if (this.stopped)
1381
+ return;
1382
+ try {
1383
+ await this.durableLeaveChannel(channel, generation);
1384
+ this.plane3Channels.delete(channel);
1385
+ this.pendingDurableLeave.delete(channel);
1386
+ return;
1387
+ }
1388
+ catch (e) {
1389
+ if (attempt === 0)
1390
+ this.emit("error", new Error(`channel "${channel}": Plane-3 durable membership (generation ${generation}) not yet tombstoned after a refused live sub — retrying; §7 boundary may be open until it succeeds (${e.message})`));
1391
+ await new Promise((r) => setTimeout(r, Math.min(30_000, 1000 * 2 ** attempt)));
1392
+ }
1393
+ }
1394
+ }
1395
+ /** Channels with a Plane-3 durable membership whose §7 tombstone is still pending after a refused live
1396
+ * sub (see {@link closeRefusedMembership}) — surfaced by the connector as a `durable-unclosed` state so
1397
+ * it is never presented as ordinary "not subscribed". */
1398
+ pendingDurableLeaves() {
1399
+ return [...this.pendingDurableLeave.keys()];
1400
+ }
1401
+ /** A control request that found NO responder — open / manager-less (no privileged control plane),
1402
+ * distinct from a responder that errored. nats.js surfaces it as NoRespondersError, or a RequestError
1403
+ * whose `isNoResponders()` is true. */
1404
+ isNoResponders(e) {
1405
+ return e instanceof NoRespondersError || (e instanceof RequestError && e.isNoResponders());
1406
+ }
1407
+ /** Agent-side: this session's CURRENT durable memberships (channel + join generation) from the
1408
+ * manager — the agent holds no read on the privileged members KV. `undefined` ⇒ NO control responder
1409
+ * (open / manager-less, so there is no Plane-3 and no memberships). THROWS on a responder-present RPC
1410
+ * failure, so a caller can FAIL-CLOSED rather than mistaking a transient error for "no membership". */
1411
+ async fetchMemberships() {
1412
+ let reply;
1413
+ try {
1414
+ reply = await this.requestControl(CONTROL_SELF_SERVICE, { op: "listMemberships", args: {} }, 5_000);
1415
+ }
1416
+ catch (e) {
1417
+ if (this.isNoResponders(e))
1418
+ return undefined; // no manager — open / manager-less, no Plane-3
1419
+ throw e; // responder present but errored — surface it (leaveChannel fails closed)
1420
+ }
1421
+ if (!reply.ok)
1422
+ throw new Error(reply.error ?? "listMemberships failed");
1423
+ return reply.data?.memberships ?? [];
1424
+ }
1425
+ /** Agent-side: seed `plane3Channels` with this session's boot durable memberships + generations on
1426
+ * first connect (the agent holds no read on the privileged members KV). A best-effort OPTIMIZATION: it
1427
+ * pre-fills the leave-generation mirror + the durable-state surface. If it can't (a transient manager
1428
+ * error), {@link leaveChannel} re-resolves the generation on demand and fails closed there — so a
1429
+ * missed hydration never silently leaves a boot durable channel untombstonable. */
1430
+ async hydrateMemberships() {
1431
+ let memberships;
1432
+ try {
1433
+ memberships = await this.fetchMemberships();
1434
+ }
1435
+ catch {
1436
+ return; // transient manager error at boot — leaveChannel re-resolves on demand (fail-closed there)
1437
+ }
1438
+ if (!memberships)
1439
+ return; // no manager — live-only
1440
+ // Seed the mirror (+ durable-state surface) with CONFIRMED backstops only; leaveChannel re-resolves a
1441
+ // non-activated record on demand if it ever needs to close one.
1442
+ for (const m of memberships)
1443
+ if (m.activated && this.channels.includes(m.channel))
1444
+ this.plane3Channels.set(m.channel, m.generation);
1445
+ }
823
1446
  /** Lazily obtain a JetStream manager — so a non-consuming endpoint (e.g. the supervisor,
824
1447
  * consume:false) can still pre-create others' durables. */
825
1448
  async manager() {
@@ -843,64 +1466,36 @@ export class CotalEndpoint extends EventEmitter {
843
1466
  }));
844
1467
  }
845
1468
  await this.pump(dmStream(this.space), dmDurable(id));
846
- // Multicast: a DeliverPolicy.New *tail* of our channels. History is NOT a durable replay —
847
- // it's an explicit, per-channel backfill on join (replay-policy gated, below), the only
848
- // shape that can honor per-channel policy given deliver_policy is consumer-wide.
1469
+ // Plane-3 (SPEC §8): bind + pump our per-member DELIVER durable (`dlv_<id>`) the re-authorized
1470
+ // durable-backstop channel copies the trusted reader transfers to us. No-op when it isn't present
1471
+ // (open mode / un-provisioned). Auth-only feature; the pump self-guards on the durable's existence.
1472
+ await this.pumpDlv();
1473
+ // Multicast: open a native CORE subscription for each channel (live, manager-free, broker-enforced
1474
+ // by sub.allow) — boot + runtime joins use the SAME path; there is no per-instance chat durable.
1475
+ // The durable backstop (a busy/offline turn) is Plane-3 (auth: membership written at provision, the
1476
+ // manager's fan-out writer + trusted reader deliver via the `dlv_<id>` pump above; open dev mode is
1477
+ // live-only — the durable plane needs the manager's trusted reader, the security boundary). Per-
1478
+ // channel history is the explicit replay-gated backfill, on FIRST connect only; a reconnect reopens
1479
+ // the subs without re-backfilling (the durable backstop redelivers any missed window via dlv).
849
1480
  if (this.channels.length) {
850
- const durable = chatDurable(id);
851
- const want = collapseFilterSubjects(this.channels.map((ch) => chatSubject(this.space, "*", ch)));
852
- // Auth mode: the chat live-tail durable is pre-created BIND-ONLY by the provisioner (the agent
853
- // is denied CONSUMER.CREATE/UPDATE on CHAT — its filter is the read boundary). Open mode: the
854
- // agent owns it and self-creates. Either way it is a DeliverPolicy.New tail; per-channel
855
- // history is the explicit backfill below (the only shape that honors per-channel replay
856
- // policy given deliver_policy is consumer-wide).
857
- const info = await this.consumerInfo(chatStream(this.space), durable);
858
- if (!info) {
859
- if (this.creds)
860
- throw new Error(`chat durable ${durable} not pre-created — a launcher must call provisionChatDurable ` +
861
- `(auth mode binds the durable, it never self-creates)`);
862
- await this.jsm.consumers.add(chatStream(this.space), chatDurableConfig(this.space, id, this.channels, {
863
- ackWaitMs: this.ackWaitMs,
864
- inactiveThresholdMs: this.inactiveThresholdMs,
865
- }));
866
- }
867
- // First bind to this durable (open self-create, or an auth pre-create never consumed) ⇒
868
- // backfill the full subscribe set. A later reconnect (the consumed cursor has advanced)
869
- // backfills only channels the config GAINED — un-acked live messages auto-redeliver, so a full
870
- // re-backfill would double up. With pre-create, `info` always exists under auth, so the
871
- // consumed cursor — not the durable's existence — is what tells first-bind from reconnect.
872
- //
873
- // Caveat (best-effort, by design): `consumer_seq > 0` proves the durable has delivered at
874
- // least once, NOT that the initial backfill completed. A crash between the first delivery and
875
- // backfillArmed() makes the next bind take the reconnect path and skip the full pre-bind
876
- // backfill. This is unchanged from the prior self-create path (which keyed on durable
877
- // existence and had the same gap — and was actually weaker: a crash before any delivery left
878
- // the durable existing, so it never re-backfilled; consumer_seq still 0 here re-backfills).
879
- // Reliable FORWARD delivery is the durable's job (un-acked redelivery); pre-bind history is
880
- // opportunistic. A backfill-completion marker would make it reliable — a deferred follow-up.
881
- const consumed = (info?.delivered?.consumer_seq ?? 0) > 0;
882
- if (!consumed) {
883
- // Arm the tail-drop watermarks BEFORE pump starts, so the tail can never deliver a
884
- // just-bound channel's message un-watermarked (which would double-emit: live + backfill).
885
- const armed = await this.armJoin(this.channels);
886
- await this.pump(chatStream(this.space), durable);
1481
+ // Arm the per-channel join watermarks BEFORE opening the subs: the backfill reads <= frontier and
1482
+ // the core-sub delivers > frontier, so they never overlap (first connect). On reconnect we reopen
1483
+ // without arming/backfilling.
1484
+ const armed = this.firstConnect ? await this.armJoin(this.channels) : undefined;
1485
+ for (const ch of this.channels)
1486
+ this.subscribeChat(ch);
1487
+ await this.confirmChatSub();
1488
+ for (const ch of this.channels)
1489
+ this.confirmingChatSubs.delete(chatSubject(this.space, "*", ch));
1490
+ if (armed)
887
1491
  await this.backfillArmed(armed);
888
- }
889
- else {
890
- // Reconnect: resume the tail, then backfill any channels the config GAINED since.
891
- await this.pump(chatStream(this.space), durable);
892
- const haveFilters = info.config.filter_subjects ?? (info.config.filter_subject ? [info.config.filter_subject] : []);
893
- const gained = this.channels.filter((c) => !haveFilters.some((f) => subjectMatches(f, chatSubject(this.space, "*", c))));
894
- const armed = gained.length ? await this.armJoin(gained) : undefined;
895
- // Reconcile the durable's filter to the CURRENT config — OPEN MODE ONLY. Auth mode is
896
- // bind-only (no UPDATE grant): the durable's filter is authoritative, moved solely by the
897
- // mediated join/leave control op, so the agent never self-reconciles it.
898
- if (!this.creds && !sameSet(haveFilters, want))
899
- await this.jsm.consumers.update(chatStream(this.space), durable, { filter_subjects: want });
900
- if (armed)
901
- await this.backfillArmed(armed);
902
- }
903
1492
  }
1493
+ // First connect, auth mode: hydrate the local generation mirror for BOOT durable memberships (the
1494
+ // manager provisioned them server-side, so they are not in plane3Channels yet) — without it,
1495
+ // leaving a boot durable channel could not tombstone its §7 boundary. Open mode has no Plane-3.
1496
+ if (this.firstConnect && this.creds && this.channels.length)
1497
+ await this.hydrateMemberships();
1498
+ this.firstConnect = false;
904
1499
  // Anycast: a shared work-queue consumer for our role — one instance grabs each task.
905
1500
  // Open mode self-creates; auth mode BINDS the provisioner-pre-created svc_<role>
906
1501
  // durable (agents are denied CONSUMER.CREATE on TASK_<space>, since the create-time
@@ -955,8 +1550,14 @@ export class CotalEndpoint extends EventEmitter {
955
1550
  m.ack();
956
1551
  continue;
957
1552
  }
1553
+ // No pre-commit dedup here: the durable is the at-least-once path, so it must NEVER ack a copy
1554
+ // just because an id was "seen" — that would drop an unhandled message (the security/critic
1555
+ // HIGH). Steady state is single-path (coverage-partition: the core-sub drops durable-covered
1556
+ // channels). The only overlap is the brief live-first transition window, and a duplicate there
1557
+ // is coalesced downstream by the receiver's commit-aware id-dedup (MeshAgent.ingest keeps ONE
1558
+ // entry and takes THIS durable ack handle) — so the durable copy is acked only once handled.
958
1559
  }
959
- const delivery = { ack: () => m.ack(), nak: () => m.nak() };
1560
+ const delivery = { ack: () => m.ack(), nak: () => m.nak(), durable: true };
960
1561
  this.emit("message", msg, delivery, {
961
1562
  historical: false,
962
1563
  kind: kindFromParsed(parsed.kind),
@@ -967,6 +1568,98 @@ export class CotalEndpoint extends EventEmitter {
967
1568
  this.emit("error", e);
968
1569
  });
969
1570
  }
1571
+ /** Open a native core subscription to a channel's live feed (the manager-free live read path,
1572
+ * broker-enforced by `sub.allow`). At-most-once — no replay, no ack; it is the live delivery for
1573
+ * every channel (boot + runtime). For a `durable` channel it is also the low-latency wake-hint
1574
+ * alongside the Plane-3 durable copy, coalesced by the receiver's id-dedup. Drops our own echo +
1575
+ * spoofed senders. */
1576
+ subscribeChat(channel) {
1577
+ if (!this.nc || this.chatSubs.has(channel))
1578
+ return;
1579
+ this.chatSubDenied.delete(channel);
1580
+ const subject = chatSubject(this.space, "*", channel);
1581
+ this.confirmingChatSubs.add(subject);
1582
+ const sub = this.nc.subscribe(subject, {
1583
+ callback: (err, m) => {
1584
+ if (err) {
1585
+ // async sub.allow refusal (or sub error): the live feed for this channel is dead — never a
1586
+ // leak (the broker refused it). Drop the channel from local joined state even if it was
1587
+ // already treated as joined — a LATE refusal beyond the confirm window: conformance #13
1588
+ // "drop on late refusal". (During the join's own confirm the channel isn't pushed yet, so
1589
+ // this fires nothing then; joinChannel reads `chatSubDenied` and throws cleanly.)
1590
+ this.chatSubDenied.add(channel);
1591
+ this.chatSubs.delete(channel);
1592
+ // NOTE: do NOT remove `subject` from confirmingChatSubs here — that set gates watchStatus's
1593
+ // suppression of this expected violation, and is cleared by joinChannel after confirm (or by
1594
+ // unsubscribeChat). Removing it in the callback races the watcher and leaks a spurious error.
1595
+ const i = this.channels.indexOf(channel);
1596
+ if (i >= 0) {
1597
+ this.channels.splice(i, 1);
1598
+ this.joinSeq.delete(channel);
1599
+ // A late sub.allow refusal forces this agent out of the channel (the broker revoked its live
1600
+ // read). If it held a Plane-3 durable membership, the §7 boundary must close too. This sub
1601
+ // callback can't await, so a fail-closed async helper RETRIES the tombstone (backoff) UNTIL it
1602
+ // succeeds, clearing the mirror only then; while pending it is surfaced via cotal_channels —
1603
+ // never a silent drop, never lost retry state.
1604
+ const gen = this.plane3Channels.get(channel);
1605
+ if (gen !== undefined)
1606
+ void this.closeRefusedMembership(channel, gen);
1607
+ this.emit("error", new Error(`left channel "${channel}": its live subscription was refused by the broker`));
1608
+ }
1609
+ return;
1610
+ }
1611
+ const parsed = parseSubject(m.subject);
1612
+ if (!parsed || parsed.kind !== "chat")
1613
+ return;
1614
+ let msg;
1615
+ try {
1616
+ msg = m.json();
1617
+ }
1618
+ catch (e) {
1619
+ this.emit("error", e);
1620
+ return;
1621
+ }
1622
+ if (!msg.from || msg.from.id !== parsed.sender)
1623
+ return; // spoof/malformed — drop (at-most-once)
1624
+ if (msg.from.id === this.card.id)
1625
+ return; // our own echo
1626
+ const delivery = { ack: () => { }, nak: () => { }, durable: false }; // live = at-most-once, not acked
1627
+ this.emit("message", msg, delivery, {
1628
+ historical: false,
1629
+ kind: kindFromParsed(parsed.kind),
1630
+ });
1631
+ },
1632
+ });
1633
+ this.chatSubs.set(channel, sub);
1634
+ }
1635
+ /** Close a channel's core subscription (manager-free leave). */
1636
+ unsubscribeChat(channel) {
1637
+ this.confirmingChatSubs.delete(chatSubject(this.space, "*", channel));
1638
+ const sub = this.chatSubs.get(channel);
1639
+ if (sub) {
1640
+ try {
1641
+ sub.unsubscribe();
1642
+ }
1643
+ catch {
1644
+ /* closing with the connection */
1645
+ }
1646
+ this.chatSubs.delete(channel);
1647
+ }
1648
+ this.chatSubDenied.delete(channel);
1649
+ }
1650
+ /** Confirm a just-opened core subscription was accepted by the broker. A `sub.allow` violation is
1651
+ * async in NATS, so flush (round-trips the SUB) then settle briefly to let the refusal land — a
1652
+ * denied subscribe must not read as a successful join (SPEC conformance #13). */
1653
+ async confirmChatSub() {
1654
+ if (!this.nc)
1655
+ throw new Error("connection not established");
1656
+ // flush() is the deterministic boundary: the broker's -ERR for an out-of-ACL SUB arrives BEFORE the
1657
+ // PONG, so once flush resolves the subscribe callback has already recorded any denial. A flush
1658
+ // FAILURE means the connection drained/closed mid-join — we have no confirmation, so let it throw
1659
+ // (joinChannel fails closed) instead of swallowing it and continuing as if confirmed.
1660
+ await this.nc.flush();
1661
+ await new Promise((r) => setTimeout(r, 50));
1662
+ }
970
1663
  /** The highest join watermark among the joined subscriptions that cover `concreteChannel`
971
1664
  * (a wildcard sub like `team.>` covers `team.backend`), or undefined if none — the tail
972
1665
  * drops a chat message with `seq <= ` this. */
@@ -997,8 +1690,8 @@ export class CotalEndpoint extends EventEmitter {
997
1690
  return (await this.jsm.streams.info(chatStream(this.space))).state.last_seq;
998
1691
  }
999
1692
  /** Phase 1 of a join — arm each channel's tail-drop watermark at the current frontier. MUST run
1000
- * BEFORE the filter flip (consumers.update, or pump on a fresh create) so the tail can never
1001
- * carry a just-joined message un-watermarked — which would double-emit it (live + backfill).
1693
+ * BEFORE opening the core subscription so the live tail can never carry a just-joined message
1694
+ * un-watermarked — which would double-emit it (live + backfill).
1002
1695
  * Returns the per-channel frontiers for {@link backfillArmed}. */
1003
1696
  async armJoin(channels) {
1004
1697
  const frontiers = new Map();
@@ -1123,7 +1816,7 @@ export class CotalEndpoint extends EventEmitter {
1123
1816
  this.emit("error", e);
1124
1817
  return 0;
1125
1818
  }
1126
- const noop = { ack: () => { }, nak: () => { } };
1819
+ const noop = { ack: () => { }, nak: () => { }, durable: false };
1127
1820
  let n = 0;
1128
1821
  for (const sm of msgs) {
1129
1822
  let msg;
@@ -1238,9 +1931,15 @@ export class CotalEndpoint extends EventEmitter {
1238
1931
  card: this.card,
1239
1932
  status: this.status,
1240
1933
  activity: this.activity,
1934
+ attention: this.attentionMode,
1935
+ channelModes: this.channelModes,
1241
1936
  ts: Date.now(),
1242
1937
  };
1243
- await this.kv.put(this.card.id, JSON.stringify(p));
1938
+ // Wire contract (SPEC §6): an OFFLINE record must not carry the advisory attention fields. Scrub at
1939
+ // the publisher — this covers stop(), setStatus("offline"), and any future offline publish site, so
1940
+ // the raw KV record is compliant, not only the observer-side roster materialization.
1941
+ const record = this.status === "offline" ? this.toOffline(p) : p;
1942
+ await this.kv.put(this.card.id, JSON.stringify(record));
1244
1943
  }
1245
1944
  async startPresenceWatch() {
1246
1945
  if (!this.kv)
@@ -1305,7 +2004,9 @@ export class CotalEndpoint extends EventEmitter {
1305
2004
  applyPresence(id, raw) {
1306
2005
  const prev = this.roster.get(id);
1307
2006
  const stale = Date.now() - raw.ts > this.ttlMs;
1308
- const p = stale && raw.status !== "offline" ? { ...raw, status: "offline" } : raw;
2007
+ // Any offline materialization (a stale snapshot OR a graceful-leave record) drops the advisory
2008
+ // attention fields — an offline peer must not carry a stale `[focus]`/`locally muted` hint.
2009
+ const p = stale || raw.status === "offline" ? this.toOffline(raw) : raw;
1309
2010
  // First time we hear about an already-offline peer (stale snapshot): record quietly.
1310
2011
  if (!prev && p.status === "offline") {
1311
2012
  this.roster.set(id, p);
@@ -1318,7 +2019,9 @@ export class CotalEndpoint extends EventEmitter {
1318
2019
  prev.status !== "offline" &&
1319
2020
  p.status !== "offline" &&
1320
2021
  prev.status === p.status &&
1321
- prev.activity === p.activity) {
2022
+ prev.activity === p.activity &&
2023
+ prev.attention === p.attention &&
2024
+ sameChannelModes(prev.channelModes, p.channelModes)) {
1322
2025
  this.roster.set(id, p);
1323
2026
  return;
1324
2027
  }
@@ -1331,12 +2034,18 @@ export class CotalEndpoint extends EventEmitter {
1331
2034
  this.emit("presence", { type, presence: p });
1332
2035
  this.emit("roster", this.getRoster());
1333
2036
  }
2037
+ /** Materialize an OFFLINE presence record: drop the advisory attention fields. An offline peer must
2038
+ * not show a stale `[focus]` or "locally muted #x" hint — SPEC: attention removed on offline sweep,
2039
+ * channel modes reset on restart. card/activity/ts are kept. */
2040
+ toOffline(p) {
2041
+ return { ...p, status: "offline", attention: undefined, channelModes: undefined };
2042
+ }
1334
2043
  /** Mark a known peer offline (on KV delete/purge), keeping it in the roster. */
1335
2044
  markOffline(id) {
1336
2045
  const prev = this.roster.get(id);
1337
2046
  if (!prev || prev.status === "offline")
1338
2047
  return;
1339
- const offline = { ...prev, status: "offline" };
2048
+ const offline = this.toOffline(prev);
1340
2049
  this.roster.set(id, offline);
1341
2050
  this.emit("presence", { type: "offline", presence: offline });
1342
2051
  this.emit("roster", this.getRoster());
@@ -1344,10 +2053,11 @@ export class CotalEndpoint extends EventEmitter {
1344
2053
  sweep() {
1345
2054
  const now = Date.now();
1346
2055
  let changed = false;
1347
- for (const [, p] of this.roster) {
2056
+ for (const [id, p] of this.roster) {
1348
2057
  if (p.status !== "offline" && now - p.ts > this.ttlMs) {
1349
- p.status = "offline";
1350
- this.emit("presence", { type: "offline", presence: p });
2058
+ const offline = this.toOffline(p);
2059
+ this.roster.set(id, offline);
2060
+ this.emit("presence", { type: "offline", presence: offline });
1351
2061
  changed = true;
1352
2062
  }
1353
2063
  }
@@ -1355,13 +2065,6 @@ export class CotalEndpoint extends EventEmitter {
1355
2065
  this.emit("roster", this.getRoster());
1356
2066
  }
1357
2067
  }
1358
- /** The id token of a chat-stream durable, or null if it isn't one — the inverse of
1359
- * `chatDurable` (`chat_<token(id)>`). token() is lossy, so this returns the token, not the
1360
- * original id; callers match it forward against `token(card.id)`. */
1361
- function chatDurableToken(durable) {
1362
- const prefix = "chat_";
1363
- return durable.startsWith(prefix) ? durable.slice(prefix.length) : null;
1364
- }
1365
2068
  /** Map an authenticated parsed-subject kind to the message class surfaced to "message" listeners.
1366
2069
  * Throws on `ctl` (control-plane is request/reply, never a "message") — per repo convention, no
1367
2070
  * silent default: an unexpected delivering kind is a bug, not something to swallow. */
@@ -1377,12 +2080,14 @@ function kindFromParsed(kind) {
1377
2080
  throw new Error(`cannot derive a message kind from subject kind "${kind}"`);
1378
2081
  }
1379
2082
  }
1380
- /** Set equality over two subject lists (order/duplicate-insensitive). */
1381
- function sameSet(a, b) {
1382
- if (a.length !== b.length)
2083
/**
 * Shallow-equal two per-channel-mode maps (used for presence dedup, so that a mode change is not
 * swallowed as a quiet heartbeat). An absent map and an empty map compare equal.
 *
 * Two maps are equal only when they have the same own keys and strictly equal values per key.
 *
 * @param {Record<string, unknown> | null | undefined} a - First mode map.
 * @param {Record<string, unknown> | null | undefined} b - Second mode map.
 * @returns {boolean} True when both maps carry the same keys with the same values.
 */
function sameChannelModes(a, b) {
    const ak = a ? Object.keys(a) : [];
    const bk = b ? Object.keys(b) : [];
    if (ak.length !== bk.length)
        return false;
    // Equal key counts do not imply equal key sets: require each key to actually exist on `b`, or a
    // key missing from `b` would read as undefined and match an explicitly-undefined value in `a`.
    // (When this callback runs, `ak` is non-empty, so `bk` is too and `b` is a real object.)
    return ak.every((k) => Object.prototype.hasOwnProperty.call(b, k) && a[k] === b[k]);
}
1387
2092
  function authOpts(a) {
1388
2093
  const tls = a.tls ? {} : undefined;