@cotal-ai/core 0.5.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. package/dist/acls.d.ts +45 -0
  2. package/dist/acls.d.ts.map +1 -0
  3. package/dist/acls.js +86 -0
  4. package/dist/acls.js.map +1 -0
  5. package/dist/command.d.ts +3 -0
  6. package/dist/command.d.ts.map +1 -1
  7. package/dist/connector.d.ts +10 -0
  8. package/dist/connector.d.ts.map +1 -1
  9. package/dist/endpoint.d.ts +197 -54
  10. package/dist/endpoint.d.ts.map +1 -1
  11. package/dist/endpoint.js +443 -100
  12. package/dist/endpoint.js.map +1 -1
  13. package/dist/index.d.ts +5 -0
  14. package/dist/index.d.ts.map +1 -1
  15. package/dist/index.js +5 -0
  16. package/dist/index.js.map +1 -1
  17. package/dist/lease.d.ts +40 -0
  18. package/dist/lease.d.ts.map +1 -0
  19. package/dist/lease.js +64 -0
  20. package/dist/lease.js.map +1 -0
  21. package/dist/membership-feed.d.ts +30 -0
  22. package/dist/membership-feed.d.ts.map +1 -0
  23. package/dist/membership-feed.js +315 -0
  24. package/dist/membership-feed.js.map +1 -0
  25. package/dist/mesh-registry.d.ts +45 -0
  26. package/dist/mesh-registry.d.ts.map +1 -0
  27. package/dist/mesh-registry.js +78 -0
  28. package/dist/mesh-registry.js.map +1 -0
  29. package/dist/mesh-target.d.ts +42 -0
  30. package/dist/mesh-target.d.ts.map +1 -0
  31. package/dist/mesh-target.js +95 -0
  32. package/dist/mesh-target.js.map +1 -0
  33. package/dist/provision.d.ts +45 -21
  34. package/dist/provision.d.ts.map +1 -1
  35. package/dist/provision.js +177 -15
  36. package/dist/provision.js.map +1 -1
  37. package/dist/streams.d.ts +16 -0
  38. package/dist/streams.d.ts.map +1 -1
  39. package/dist/streams.js +29 -5
  40. package/dist/streams.js.map +1 -1
  41. package/dist/subjects.d.ts +89 -2
  42. package/dist/subjects.d.ts.map +1 -1
  43. package/dist/subjects.js +132 -3
  44. package/dist/subjects.js.map +1 -1
  45. package/dist/types.d.ts +52 -0
  46. package/dist/types.d.ts.map +1 -1
  47. package/package.json +1 -1
package/dist/endpoint.js CHANGED
@@ -7,8 +7,10 @@ import { createSpaceStreams, dmDurableConfig, dlvDurableConfig, taskDurableConfi
7
7
  import { jetstream, jetstreamManager, AckPolicy, DeliverPolicy, } from "@nats-io/jetstream";
8
8
  import { Kvm } from "@nats-io/kv";
9
9
  import { openMembersRegistry, commitMember, tombstoneMember, activateMember, readMember, listMembers, durableEligible, StaleMembershipWrite, } from "./members.js";
10
+ import { openAclRegistry, readAcl, commitAcl as writeAclRecord } from "./acls.js";
11
+ import { openDeliveryRegistry } from "./lease.js";
10
12
  import { openChannelRegistry, effectiveReplay, effectiveReplayWindowMs, effectiveDeliveryClass, readChannelConfig, readChannelDefaults, } from "./channels.js";
11
- import { anycastSubject, CHANNEL_DEFAULTS_KEY, chatStream, chatHistDurable, chatSubject, controlServiceSubject, CONTROL_SELF_SERVICE, dmStream, dmDurable, dlvStream, dlvDurable, dlvSubject, dinboxSubject, inboxStream, parseDinboxOwner, FANOUT_DURABLE, INBOX_READER_DURABLE, chatWildcard, channelInAllow, isConcreteChannel, normalizeMentions, parseSubject, presenceBucket, spacePrefix, spaceWildcard, subjectMatches, taskStream, taskDurable, token, unicastSubject, } from "./subjects.js";
13
+ import { anycastSubject, CHANNEL_DEFAULTS_KEY, chatStream, chatHistDurable, chatSubject, controlServiceSubject, CONTROL_SELF_SERVICE, CONTROL_DELIVERY, dmStream, dmDurable, dlvStream, dlvDurable, dlvSubject, dinboxSubject, inboxStream, parseDinboxOwner, FANOUT_DURABLE, INBOX_READER_DURABLE, leaseKey, chatWildcard, assertValidChannel, channelInAllow, isConcreteChannel, normalizeMentions, parseSubject, presenceBucket, membershipBucket, MEMBERSHIP_FEED_KEY, spacePrefix, spaceWildcard, subjectMatches, taskStream, taskDurable, token, unicastSubject, } from "./subjects.js";
12
14
  export const DEFAULT_SERVER = "nats://127.0.0.1:4222";
13
15
  /** Space joined when none is given on the CLI (the `cotal-<space>` cmux tab, etc.). */
14
16
  export const DEFAULT_SPACE = "main";
@@ -50,10 +52,18 @@ export class CotalEndpoint extends EventEmitter {
50
52
  jsm;
51
53
  kv;
52
54
  channelKv;
53
- /** Plane-3 durable-membership registry KV — lazily opened by the privileged (manager) endpoint. */
55
+ /** Plane-3 durable-membership registry KV — lazily opened by the privileged delivery daemon (or a
56
+ * short-lived provisioner). */
54
57
  membersKv;
55
- /** When set, this endpoint hosts the Plane-3 fan-out writer + trusted reader (the manager). `aclFor`
56
- * maps an owner id to its current read ACL (`allowSubscribe`) for the reader's re-authorization. */
58
+ aclKv;
59
+ deliveryKv;
60
+ membershipKv;
61
+ /** The live `ctl.delivery` serve subscription (delivery daemon) — re-created on every (re)connect by
62
+ * {@link armDeliveryControl}; tracked so the stale one is dropped on reconnect. */
63
+ deliveryServeSub;
64
+ /** When set, this endpoint hosts the Plane-3 fan-out writer + trusted reader (the server-side delivery
65
+ * daemon). `aclFor` maps an owner id to its current read ACL (`allowSubscribe`) for the reader's
66
+ * re-authorization — read FRESH per entry from the durable ACL registry KV, hence async. */
57
67
  plane3;
58
68
  /** Live local cache of the channel registry (key = channel token), kept by a KV watch. */
59
69
  channelConfigs = new Map();
@@ -89,6 +99,12 @@ export class CotalEndpoint extends EventEmitter {
89
99
  * {@link pendingDurableLeaves} (the connector shows it in `cotal_channels`, never as ordinary
90
100
  * absence). Persists across reconnect; cleared on tombstone success or full stop. */
91
101
  pendingDurableLeave = new Map();
102
+ /** Boot durable channels whose self-join hasn't yet established a membership (daemon down/absent at
103
+ * first connect, or a transient `durable:false`). {@link reconcileBootJoin} retries with capped
104
+ * backoff until the membership exists or the channel is left — so a first-connect daemon outage
105
+ * self-heals on recovery instead of leaving the channel silently live-only. Surfaced to the connector
106
+ * via {@link hasDurableMembership} (a joined durable channel NOT yet a member renders degraded). */
107
+ pendingBootJoins = new Set();
92
108
  /** Chat-join subjects currently being broker-confirmed. An out-of-ACL subscribe among these trips an
93
109
  * EXPECTED async permission violation that joinChannel turns into a clean throw, so watchStatus
94
110
  * suppresses it rather than surfacing a spurious connection error. */
@@ -220,7 +236,7 @@ export class CotalEndpoint extends EventEmitter {
220
236
  await this.ensureStreams();
221
237
  await this.startConsumers();
222
238
  }
223
- // Re-arm Plane-3 (manager-hosted fan-out + trusted reader) on every (re)connect — no-op unless this
239
+ // Re-arm Plane-3 (delivery-daemon-hosted fan-out + trusted reader + ctl.delivery) on every (re)connect — no-op unless this
224
240
  // endpoint hosts it. The first arm comes from startPlane3 (after start()); this re-binds the loops
225
241
  // a reconnect's clearConnectionScoped() tore down, so a broker blip doesn't silently kill the backstop.
226
242
  await this.armPlane3();
@@ -332,6 +348,12 @@ export class CotalEndpoint extends EventEmitter {
332
348
  this.jsm = undefined;
333
349
  this.kv = undefined;
334
350
  this.channelKv = undefined;
351
+ // Plane-3 KV handles are bound to the old connection too — drop them so the daemon re-opens them on
352
+ // the fresh nc (else durableJoin/leave/list, the reader's ACL re-auth, and lease renew use a dead
353
+ // handle after a reconnect).
354
+ this.membersKv = undefined;
355
+ this.aclKv = undefined;
356
+ this.deliveryKv = undefined;
335
357
  this.emit("connection", { connected: false }); // null window opened — not live until the rebind below
336
358
  try {
337
359
  await oldNc?.drain();
@@ -518,8 +540,16 @@ export class CotalEndpoint extends EventEmitter {
518
540
  })().catch((e) => this.emit("error", e));
519
541
  }
520
542
  // ---- control plane (request/reply) --------------------------------------
521
- /** Serve control requests for a service (manager side). */
522
- serveControl(service, handler) {
543
+ /** Serve control requests for a service. Returns the subscription so a caller that re-registers on
544
+ * reconnect (the delivery daemon) can drop the stale one. `boundReply` is REQUIRED for any service
545
+ * whose responder holds a wildcard publish grant over the service subtree (the delivery daemon's
546
+ * `ctl.delivery.*.reply.>`): without it, an authenticated caller could set its reply target to a
547
+ * PEER's reply lane (`ctl.delivery.<victim>.reply.<n>`) and turn the responder into a confused
548
+ * deputy — the broker does NOT permission-check the requester's embedded reply subject. With it, a
549
+ * reply is published only when `m.reply` is under the AUTHENTICATED request subject
550
+ * (`${m.subject}.reply.…`), binding the reply to the broker-policed sender token. (The manager's
551
+ * tiers reply into the per-id `_INBOX` and leave it off.) */
552
+ serveControl(service, handler, opts = {}) {
523
553
  if (!this.nc)
524
554
  throw new Error("endpoint not started");
525
555
  const sub = this.nc.subscribe(controlServiceSubject(this.space, service, "*"), {
@@ -528,6 +558,12 @@ export class CotalEndpoint extends EventEmitter {
528
558
  this.subs.push(sub);
529
559
  void (async () => {
530
560
  for await (const m of sub) {
561
+ // Sender-bound reply guard (confused-deputy fix): never respond to a reply target outside the
562
+ // authenticated request subject's own `.reply.` subtree. Drop silently (don't inject elsewhere).
563
+ if (opts.boundReply && (!m.reply || !m.reply.startsWith(`${m.subject}.reply.`))) {
564
+ this.emit("error", new Error(`rejected ${service} request on ${m.subject}: reply target "${m.reply ?? "(none)"}" is not under the sender's own reply subtree`));
565
+ continue;
566
+ }
531
567
  let reply;
532
568
  try {
533
569
  const req = m.json();
@@ -556,6 +592,7 @@ export class CotalEndpoint extends EventEmitter {
556
592
  }
557
593
  }
558
594
  })().catch((e) => this.emit("error", e));
595
+ return sub;
559
596
  }
560
597
  /** Send a control request to a service and await its reply (client side). */
561
598
  async requestControl(service, req, timeoutMs = 5000) {
@@ -565,6 +602,26 @@ export class CotalEndpoint extends EventEmitter {
565
602
  const m = await this.nc.request(controlServiceSubject(this.space, service, this.card.id), JSON.stringify(body), { timeout: timeoutMs });
566
603
  return m.json();
567
604
  }
605
+ /** Send a durable-membership request to the SERVER-SIDE delivery daemon (`ctl.delivery`) and await its
606
+ * reply. Unlike {@link requestControl}, the reply rides a subject UNDER `ctl.delivery.<id>.>` (not the
607
+ * per-id `_INBOX`), so the scoped delivery cred can answer without broad inbox-publish — see
608
+ * CONTROL_DELIVERY. `noMux` lets us name the reply subject while keeping NoResponders detection (so a
609
+ * caller can fail-closed vs. degrade to live-only when no daemon is present). */
610
+ async requestDelivery(op, args, timeoutMs = 5000) {
611
+ if (!this.nc)
612
+ throw new Error(this.notLiveMsg());
613
+ const reqSubject = controlServiceSubject(this.space, CONTROL_DELIVERY, this.card.id); // ctl.delivery.<id>
614
+ // Reply rides the sender's OWN subtree so the daemon's serveControl boundReply guard accepts it
615
+ // (`${reqSubject}.reply.…`). The sender-bound guard is the COMPLETE confused-deputy closure. The
616
+ // random suffix is genuine defense-in-depth (NOT cosmetic): `noMux` subscribes this SPECIFIC named
617
+ // reply subject (not a standing `.reply.>` wildcard), so a predictable suffix would let a peer target
618
+ // an in-flight reply subscription — randomUUID brings it to parity with the nuid-protected `_INBOX`
619
+ // model. Keep both; don't regress to a counter. (Confirmed by the review panel's fact-check.)
620
+ const reply = `${reqSubject}.reply.${randomUUID()}`;
621
+ const body = { op, args, from: this.ref() };
622
+ const m = await this.nc.request(reqSubject, JSON.stringify(body), { timeout: timeoutMs, noMux: true, reply });
623
+ return m.json();
624
+ }
568
625
  // ---- presence ------------------------------------------------------------
569
626
  getRoster() {
570
627
  return [...this.roster.values()].sort((a, b) => a.card.name.localeCompare(b.card.name));
@@ -611,6 +668,12 @@ export class CotalEndpoint extends EventEmitter {
611
668
  channelReplay(channel) {
612
669
  return effectiveReplay(this.channelConfigs.get(channel), this.channelDefaults);
613
670
  }
671
+ /** Effective delivery class for a channel (per-channel override ?? space default ?? "durable"),
672
+ * from the live watch cache — drives the non-gating delivery-health surface (only durable-class
673
+ * channels have a Plane-3 backstop to report on). */
674
+ channelDeliveryClass(channel) {
675
+ return effectiveDeliveryClass(this.channelConfigs.get(channel), this.channelDefaults);
676
+ }
614
677
  // ---- dynamic subscription (join / leave mid-session) ---------------------
615
678
  /** The channels this endpoint is currently subscribed to (live — reflects join/leave). */
616
679
  joinedChannels() {
@@ -619,9 +682,10 @@ export class CotalEndpoint extends EventEmitter {
619
682
  /**
620
683
  * Join a channel mid-session: open a native core subscription (manager-free live read, broker-
621
684
  * confirmed against `sub.allow`), capture the stream frontier as the join watermark, backfill its
622
- * history if replay is on, and — for a `durable`-class channel under a manager — request a Plane-3
623
- * durable backstop. Idempotent: re-joining is a no-op (no re-backfill). Returns the backfill count +
624
- * whether the durable backstop is active (+ a `reason` when a durable channel couldn't get one).
685
+ * history if replay is on, and — for a `durable`-class channel when a delivery daemon is present —
686
+ * request a Plane-3 durable backstop (via `ctl.delivery`). Idempotent: re-joining is a no-op (no
687
+ * re-backfill). Returns the backfill count + whether the durable backstop is active (+ a `reason`
688
+ * when a durable channel couldn't get one).
625
689
  */
626
690
  async joinChannel(channel) {
627
691
  if (!this.jsm)
@@ -656,7 +720,7 @@ export class CotalEndpoint extends EventEmitter {
656
720
  }
657
721
  this.channels.push(channel);
658
722
  // Durable backstop. The live core-sub above already delivers (manager-free). For a `durable`-class
659
- // channel, request a Plane-3 per-member backstop from the manager (durableJoin) so a post reaches a
723
+ // channel, request a Plane-3 per-member backstop from the server-side delivery daemon (durableJoin via ctl.delivery) so a post reaches a
660
724
  // busy/offline turn — the core-sub stays as the live wake-hint, dedup-coalesced with the Plane-3
661
725
  // copy by id-dedup. No manager (open dev / manager-less) ⇒ joined LIVE only, surfaced via `reason`
662
726
  // (never silent). A `live`-class channel takes no backstop (joined live is the contract).
@@ -674,7 +738,7 @@ export class CotalEndpoint extends EventEmitter {
674
738
  }
675
739
  }
676
740
  catch (e) {
677
- // No privileged writer (manager-less) or the write was rejected — joined live, backstop
741
+ // No privileged writer (no delivery daemon) or the write was rejected — joined live, backstop
678
742
  // unavailable. NOT a join failure: the live subscription is up and authorized.
679
743
  reason = `durable backstop unavailable (${e.message})`;
680
744
  }
@@ -695,11 +759,11 @@ export class CotalEndpoint extends EventEmitter {
695
759
  return { left: false };
696
760
  // Auth + durable-class ⇒ a Plane-3 membership may exist; tombstone it BEFORE touching local state.
697
761
  // The join generation comes from the local mirror, but a BOOT membership whose hydration was missed
698
- // (transient manager error at connect) is NOT in the mirror — so re-resolve it from the manager on
762
+ // (daemon down at connect) is NOT in the mirror — so re-resolve it from the delivery service on
699
763
  // demand. FAIL-CLOSED: fetchMemberships throws on a responder-present error, so a leave whose
700
764
  // tombstone can't be confirmed propagates (live sub stays up, mirror intact) for the caller to retry
701
765
  // — reporting `left` while the trusted reader keeps transferring to DLV is the fail-open leak. A
702
- // genuine no-responder (open / manager-less, no Plane-3) means there is no membership to tombstone.
766
+ // genuine no-responder (open / no delivery daemon, no Plane-3) means there is no membership to tombstone.
703
767
  if (this.creds && effectiveDeliveryClass(this.channelConfigs.get(channel), this.channelDefaults) === "durable") {
704
768
  let generation = this.plane3Channels.get(channel);
705
769
  if (generation === undefined)
@@ -781,6 +845,57 @@ export class CotalEndpoint extends EventEmitter {
781
845
  arr.sort(byName);
782
846
  return map;
783
847
  }
848
+ /** Lazily open the derived membership feed KV (admin/observer read; the delivery daemon writes it).
849
+ * Read-only here — the dashboard consumes it; agents hold no grant and never call this. */
850
+ async membershipRegistry() {
851
+ if (!this.nc)
852
+ throw new Error("endpoint not started");
853
+ this.membershipKv ??= await new Kvm(this.nc).open(membershipBucket(this.space));
854
+ return this.membershipKv;
855
+ }
856
+ /**
857
+ * Snapshot the broker-sourced channel-membership feed (admin/observer read): every agent's
858
+ * `{live, durable}` record plus `asOf` — the feed's freshness heartbeat (epoch ms of the daemon's last
859
+ * successful poll, from the reserved {@link MEMBERSHIP_FEED_KEY}). `live` patterns are kept as-is
860
+ * (wildcards preserved); the consumer expands them against the channel registry. `asOf` is undefined
861
+ * when the feed has never been written (no daemon → the dashboard degrades to traffic-only).
862
+ */
863
+ async readMembership() {
864
+ const kv = await this.membershipRegistry();
865
+ const members = [];
866
+ let asOf;
867
+ for await (const key of await kv.keys()) {
868
+ const e = await kv.get(key);
869
+ if (!e || e.operation === "DEL" || e.operation === "PURGE")
870
+ continue;
871
+ if (key === MEMBERSHIP_FEED_KEY) {
872
+ try {
873
+ asOf = e.json().observedAt;
874
+ }
875
+ catch { /* heartbeat garbled — leave undefined */ }
876
+ continue;
877
+ }
878
+ try {
879
+ const rec = e.json();
880
+ members.push({ id: key, live: rec.live ?? [], durable: rec.durable ?? [], observedAt: rec.observedAt });
881
+ }
882
+ catch { /* skip undecodable */ }
883
+ }
884
+ return { asOf, members };
885
+ }
886
+ /** Watch the membership feed for changes (admin/observer): `onChange` fires on every KV entry,
887
+ * including the initial replay — the caller debounces + re-reads {@link readMembership}. Returns a
888
+ * stop handle. Best-effort: a feed the cred can't read (or absent) surfaces as an `error` event and
889
+ * the dashboard keeps its last snapshot. */
890
+ async watchMembership(onChange) {
891
+ const kv = await this.membershipRegistry();
892
+ const iter = await kv.watch();
893
+ void (async () => {
894
+ for await (const _ of iter)
895
+ onChange();
896
+ })().catch((err) => this.emit("error", err));
897
+ return { stop: () => iter.stop() };
898
+ }
784
899
  /** Fetch recent messages from a channel's JetStream backlog. */
785
900
  async channelHistory(channel, opts) {
786
901
  // history from any sender
@@ -871,28 +986,10 @@ export class CotalEndpoint extends EventEmitter {
871
986
  throw new Error("endpoint not started");
872
987
  await createSpaceStreams(this.jsm, this.space);
873
988
  }
874
- /**
875
- * Privileged: write an agent's BOOT durable membership — each `durable`-class channel in its boot
876
- * subscribe set gets a Plane-3 durable-active record (via {@link durableJoinFor}: cursor capture +
877
- * activation catch-up), so it receives durable backstop copies from boot exactly like a runtime
878
- * `durableJoin`. `live`-class (and non-concrete) channels are skipped. Idempotent.
879
- *
880
- * Writes the durable RECORDS with the caller's privileged creds — it does NOT require this endpoint
881
- * to host the runtime fan-out/reader loops (a space-level manager service), so EVERY auth launcher
882
- * provisions identically: the manager AND the short-lived `cotal spawn` provisioner both write boot
883
- * records, which the space's manager then delivers (no silent no-op — that would hide a boot
884
- * membership; AGENTS.md "no fallbacks"). A space running no manager is live-only for everyone (the
885
- * records exist; nothing delivers them until a manager hosts the loops).
886
- */
887
- async provisionMembership(targetId, channels) {
888
- for (const ch of channels) {
889
- if (!isConcreteChannel(ch))
890
- continue; // durable membership is per-concrete-channel
891
- if ((await this.deliveryClassFresh(ch)) !== "durable")
892
- continue;
893
- await this.durableJoinFor(targetId, ch);
894
- }
895
- }
989
+ // (v3) The old `provisionMembership` — manager/provisioner-written boot membership at spawn — is GONE.
990
+ // Boot durable membership is now the AGENT self-joining its durable boot channels via the daemon's
991
+ // `ctl.delivery` op at connect ({@link armBootDurableMemberships}), reconciled on outage. The
992
+ // primitive it wrapped, {@link durableJoinFor}, is now driven by the daemon's `ctl.delivery` handler.
896
993
  /**
897
994
  * Privileged: pre-create an agent's DM inbox durable (auth mode), so the agent can BIND
898
995
  * it without holding CONSUMER.CREATE on DM_<space>. The creator sets the filter to
@@ -925,26 +1022,102 @@ export class CotalEndpoint extends EventEmitter {
925
1022
  const jsm = await this.manager();
926
1023
  await jsm.consumers.add(taskStream(this.space), taskDurableConfig(this.space, role));
927
1024
  }
928
- // ---- Plane-3: durable backstop (SPEC §8) — privileged, manager-hosted ----------------------------
1025
+ // ---- Plane-3: durable backstop (SPEC §8) — privileged, hosted by the server-side DELIVERY DAEMON ----
929
1026
  //
930
- // Two manager loops + two privileged membership ops. The FAN-OUT writer (routing, not auth) reads
931
- // every chat message and copies it into each eligible owner's MIXED inbox (`dinbox.<owner>`); the
932
- // TRUSTED READER (the auth gate) re-authorizes each entry against the CURRENT ACL + membership
933
- // interval and TRANSFERS the authorized copy to the owner's per-member DELIVER store
934
- // (`dlv.<owner>`), which the agent binds + acks via native JetStream. The agent holds no read on the
935
- // mixed store. See `.internal/research/stage4-impl-design.md`.
936
- /** Lazily open the privileged members registry KV (manager / open-mode self). */
1027
+ // Two daemon loops + two privileged membership ops (served to agents on `ctl.delivery`). The FAN-OUT
1028
+ // writer (routing, not auth) reads every chat message and copies it into each eligible owner's MIXED
1029
+ // inbox (`dinbox.<owner>`); the TRUSTED READER (the auth gate) re-authorizes each entry against the
1030
+ // CURRENT ACL + membership interval and TRANSFERS the authorized copy to the owner's per-member
1031
+ // DELIVER store (`dlv.<owner>`), which the agent binds + acks via native JetStream. The agent holds no
1032
+ // read on the mixed store. (v3: this all moved off the manager — the manager is lifecycle-only; it
1033
+ // records the read-ACL at mint via commitAcl.) See `.internal/research/stage4-impl-design.md`.
1034
+ /** Lazily open the privileged members registry KV (delivery daemon / open-mode self). */
937
1035
  async membersRegistry() {
938
1036
  if (!this.nc)
939
1037
  throw new Error("endpoint not started");
940
1038
  this.membersKv ??= await openMembersRegistry(this.nc, this.space);
941
1039
  return this.membersKv;
942
1040
  }
1041
+ /** Lazily open the durable read-ACL registry KV. Privileged write (the manager records an agent's
1042
+ * ACL at mint); the delivery daemon reads it fresh per durable entry to re-authorize. */
1043
+ async aclRegistry() {
1044
+ if (!this.nc)
1045
+ throw new Error("endpoint not started");
1046
+ this.aclKv ??= await openAclRegistry(this.nc, this.space);
1047
+ return this.aclKv;
1048
+ }
1049
+ /** Privileged ({@link DurableProvisioner}): record an agent's read ACL in the durable registry at
1050
+ * provision/mint time — the same act as baking it into the JWT, persisted so the server-side
1051
+ * delivery daemon can re-authorize the agent's durable entries and validate its runtime
1052
+ * durable-joins without holding any in-memory ledger. Written ATOMICALLY ({@link writeAclRecord}),
1053
+ * so a present record is always complete (`[]` = known no-read, never a half-write). */
1054
+ async commitAcl(targetId, allowSubscribe) {
1055
+ await writeAclRecord(await this.aclRegistry(), targetId, allowSubscribe);
1056
+ }
1057
+ /** The server-side delivery daemon's fresh-per-entry ACL read: an owner's CURRENT read ACL
1058
+ * (`allowSubscribe`) from the durable registry, or `undefined` if no record (an unknown owner — the
1059
+ * reader DEFERS, never drops). A present `[]` (known no-read) returns `[]` (the reader DROPS). */
1060
+ async aclForOwner(owner) {
1061
+ return (await readAcl(await this.aclRegistry(), owner))?.record.allowSubscribe;
1062
+ }
1063
+ /** Lazily open the delivery lease/readiness KV (pre-created at `cotal up`; bind, never create). */
1064
+ async deliveryRegistry() {
1065
+ if (!this.nc)
1066
+ throw new Error("endpoint not started");
1067
+ this.deliveryKv ??= await openDeliveryRegistry(this.nc, this.space);
1068
+ return this.deliveryKv;
1069
+ }
1070
+ encodeLease(ready) {
1071
+ return new TextEncoder().encode(JSON.stringify({ holder: this.card.id, since: Date.now(), ready }));
1072
+ }
1073
+ /** Acquire the single-flight delivery lease for a shard via an ATOMIC CAS create, marked NOT-ready.
1074
+ * THROWS if a live lease exists — a loud refusal-to-bind (the daemon exits), never a retry, so two
1075
+ * daemons can't split a durable's delivery. A crashed holder's lease auto-expires (bucket TTL),
1076
+ * freeing a re-acquire. Acquired BEFORE binding (single-flight gate); {@link markDeliveryLeaseReady}
1077
+ * flips it ready AFTER the loops + `ctl.delivery` are bound. Returns the lease revision. */
1078
+ async acquireDeliveryLease(shardIndex) {
1079
+ return (await this.deliveryRegistry()).create(leaseKey(shardIndex), this.encodeLease(false));
1080
+ }
1081
+ /** Flip the held lease to READY (CAS `kv.update`) AFTER `startPlane3` has bound the loops + the
1082
+ * `ctl.delivery` responder — so "lease ready" proves the responder is up, not just that the slot was
1083
+ * claimed. Returns the new revision. */
1084
+ async markDeliveryLeaseReady(shardIndex, revision) {
1085
+ return (await this.deliveryRegistry()).update(leaseKey(shardIndex), this.encodeLease(true), revision);
1086
+ }
1087
+ /** Renew the held lease (CAS `kv.update` against `revision`, keeping `ready:true`) to refresh it before
1088
+ * the bucket TTL expires it. Returns the new revision. Throws if the revision moved (lost the lease —
1089
+ * the daemon should exit). */
1090
+ async renewDeliveryLease(shardIndex, revision) {
1091
+ return (await this.deliveryRegistry()).update(leaseKey(shardIndex), this.encodeLease(true), revision);
1092
+ }
1093
+ /** Release the held lease on clean shutdown so a replacement daemon re-acquires immediately (best
1094
+ * effort — a crash just lets the bucket TTL expire it). */
1095
+ async releaseDeliveryLease(shardIndex) {
1096
+ try {
1097
+ await (await this.deliveryRegistry()).delete(leaseKey(shardIndex));
1098
+ }
1099
+ catch { /* already gone */ }
1100
+ }
1101
+ /** Read a shard's delivery lease (the daemon-availability signal), or `undefined` if none is live.
1102
+ * READ-ONLY surface — drives Component 6's `cotal_channels` delivery-health field (an agent reads it
1103
+ * under its own cred, which holds lease-bucket read but no write). */
1104
+ async readDeliveryLease(shardIndex) {
1105
+ const e = await (await this.deliveryRegistry()).get(leaseKey(shardIndex));
1106
+ if (!e || e.operation === "DEL" || e.operation === "PURGE")
1107
+ return undefined;
1108
+ try {
1109
+ return e.json();
1110
+ }
1111
+ catch {
1112
+ return undefined;
1113
+ }
1114
+ }
943
1115
  /** Privileged: one owner's NON-TOMBSTONED durable memberships as `{channel, generation, activated}` —
944
- * the manager serves this to a connecting agent (via the `listMemberships` self-service op). The agent
945
- * hydrates its leave mirror from the ACTIVATED ones (the confirmed backstops), but the non-activated
946
- * ones are returned too so `leaveChannel` can discover + close a record that still routes under the
947
- * pure-interval predicate (a crash-stuck pending activation) — without reading the privileged KV. */
1116
+ * the server-side delivery daemon serves this to a connecting agent (the `listMemberships` op on
1117
+ * `ctl.delivery`). The agent seeds its leave mirror from the ACTIVATED ones (the confirmed backstops),
1118
+ * but the non-activated ones are returned too so `leaveChannel` can discover + close a record that
1119
+ * still routes under the pure-interval predicate (a crash-stuck pending activation) — without reading
1120
+ * the privileged KV itself. */
948
1121
  async ownerMemberships(owner) {
949
1122
  const recs = await listMembers(await this.membersRegistry(), { owner });
950
1123
  return recs
@@ -985,16 +1158,15 @@ export class CotalEndpoint extends EventEmitter {
985
1158
  return info?.delivered?.stream_seq ?? 0;
986
1159
  }
987
1160
  /**
988
- * Privileged durable-JOIN write (the manager calls this after validating channel ⊆ allowSubscribe;
989
- * {@link provisionMembership} calls it at provision time for boot channels): capture `joinCursor`,
990
- * commit a `durable-active` record (CAS + generation bump), then ACTIVATION CATCH-UP idempotently
991
- * copies `(joinCursor, fence]` into the owner inbox where `fence = max(frontier, fanoutDelivered)` —
992
- * fan-out owns `seq > fence`. Idempotent against a timeout-retry (an already-activated membership
993
- * no-ops). Returns `{durable:false}` (honest degrade) only if the catch-up window was evicted.
1161
+ * Privileged durable-JOIN write (v3: the delivery daemon calls this from its `ctl.delivery` handler
1162
+ * after validating channel ⊆ the caller's read ACL): capture `joinCursor`, commit a `durable-active`
1163
+ * record (CAS + generation bump), then ACTIVATION CATCH-UP idempotently copies `(joinCursor, fence]`
1164
+ * into the owner inbox where `fence = max(frontier, fanoutDelivered)` — fan-out owns `seq > fence`.
1165
+ * Idempotent against a timeout-retry (an already-activated membership no-ops). Returns `{durable:false}`
1166
+ * (honest degrade) only if the catch-up window was evicted.
994
1167
  *
995
- * This writes durable KV + dinbox state with the caller's privileged creds; it does NOT require THIS
996
- * endpoint to host the fan-out/reader loops (those are a space-level manager service). So a
997
- * short-lived provisioner can write a boot membership a separate long-lived manager then delivers.
1168
+ * Runs on the daemon (which hosts the fan-out/reader loops + the members KV), so catch-up + the
1169
+ * activation fence read are in-process — no cross-process cursor read.
998
1170
  */
999
1171
  async durableJoinFor(owner, channel) {
1000
1172
  if (!this.js)
@@ -1119,27 +1291,122 @@ export class CotalEndpoint extends EventEmitter {
1119
1291
  }
1120
1292
  return { copied, evicted };
1121
1293
  }
1122
- /** Start the Plane-3 fan-out writer + trusted reader on THIS (privileged) endpoint. `aclFor` maps an
1123
- * owner id to its current read ACL for the reader's re-authorization (the manager passes its managed
1124
- * set). Call once after connect; idempotent durable creation lets it resume on a manager restart. */
1294
+ /** Start the Plane-3 fan-out writer + trusted reader on THIS (privileged, server-side delivery-daemon)
1295
+ * endpoint, AND serve the `ctl.delivery` control service (runtime durable join/leave/list). `aclFor`
1296
+ * maps an owner id to its current read ACL for the reader's re-authorization — read FRESH per entry
1297
+ * from the durable ACL registry (async). Call once after connect; idempotent durable creation lets it
1298
+ * resume on a daemon restart. Both the JS loops AND the `ctl.delivery` subscription are (re)bound by
1299
+ * {@link armPlane3} on EVERY (re)connect — a reconnect drains the old connection, so re-binding both
1300
+ * is required, not optional (the responder would otherwise be lost on a broker blip). */
1125
1301
  async startPlane3(aclFor) {
1126
1302
  if (!this.js)
1127
1303
  throw new Error("endpoint not started");
1128
1304
  this.plane3 = { aclFor };
1129
1305
  await this.armPlane3();
1130
1306
  }
1307
+ /** Serve one runtime durable-membership control request (the server-side delivery daemon). The caller
1308
+ * id is the authenticated subject sender ({@link serveControl} fail-closes on a mismatch). Validation
1309
+ * is against the durable ACL registry — the SAME KV the reader re-auths against (single source of
1310
+ * truth, no in-memory ledger to drift). */
1311
+ async handleDeliveryControl(req) {
1312
+ const caller = req.from.id;
1313
+ const args = req.args ?? {};
1314
+ if (req.op === "durableJoin")
1315
+ return this.deliveryJoin(caller, args);
1316
+ if (req.op === "durableLeave")
1317
+ return this.deliveryLeave(caller, args);
1318
+ if (req.op === "listMemberships")
1319
+ return { ok: true, data: { memberships: await this.ownerMemberships(caller) } };
1320
+ return { ok: false, error: `op "${req.op}" not supported on the delivery control service` };
1321
+ }
1322
+ /** Validate the channel ARG shape only — non-blank, valid, concrete (NO ACL check, that is op-specific).
1323
+ * Returns the channel on success or a ControlReply error to short-circuit. */
1324
+ checkDurableChannelArg(args, op) {
1325
+ const channel = typeof args.channel === "string" ? args.channel.trim() : "";
1326
+ if (!channel)
1327
+ return { ok: false, error: `${op}: channel must be a non-blank string` };
1328
+ try {
1329
+ assertValidChannel(channel);
1330
+ }
1331
+ catch (e) {
1332
+ return { ok: false, error: e.message };
1333
+ }
1334
+ if (!isConcreteChannel(channel))
1335
+ return { ok: false, error: `${op}: "${channel}" must be a concrete channel (durable membership is per-concrete-channel, not wildcard)` };
1336
+ return channel;
1337
+ }
1338
+ /** JOIN requires the channel be within the caller's CURRENT read ACL (you can't durable-subscribe a
1339
+ * channel you may not read). */
1340
+ async deliveryJoin(caller, args) {
1341
+ const channel = this.checkDurableChannelArg(args, "durableJoin");
1342
+ if (typeof channel !== "string")
1343
+ return channel; // a ControlReply error
1344
+ const acl = await readAcl(await this.aclRegistry(), caller);
1345
+ if (acl === undefined)
1346
+ return { ok: false, error: `durableJoin: no read ACL on record for ${caller} (not provisioned for durable delivery)` };
1347
+ if (!channelInAllow(acl.record.allowSubscribe, channel))
1348
+ return { ok: false, error: `channel "${channel}" is not within your read ACL [${acl.record.allowSubscribe.join(", ")}]` };
1349
+ try {
1350
+ return { ok: true, data: await this.durableJoinFor(caller, channel) };
1351
+ }
1352
+ catch (e) {
1353
+ return { ok: false, error: e.message };
1354
+ }
1355
+ }
1356
+ /** LEAVE must NOT require current-ACL coverage. Leave fires precisely when the ACL was narrowed/revoked
1357
+ * (a refused live sub → {@link closeRefusedMembership}); gating the tombstone on the current ACL would
1358
+ * loop forever and leave the SPEC §7 boundary open (the membership could resume if the ACL is later
1359
+ * restored). The guards are: authenticated caller (serveControl), concrete channel, a finite generation
1360
+ * (the join epoch — without it a stale/replayed leave could tombstone a newer rejoin), and an EXISTING
1361
+ * own membership; `durableLeaveFor` → `tombstoneMember` then enforces the generation match. */
1362
+ async deliveryLeave(caller, args) {
1363
+ const channel = this.checkDurableChannelArg(args, "durableLeave");
1364
+ if (typeof channel !== "string")
1365
+ return channel; // a ControlReply error
1366
+ if (typeof args.generation !== "number" || !Number.isFinite(args.generation))
1367
+ return { ok: false, error: "durableLeave: a finite generation is required (fail-closed stale-leave guard)" };
1368
+ const existing = await readMember(await this.membersRegistry(), channel, caller);
1369
+ if (!existing)
1370
+ return { ok: true, data: { channel, alreadyLeft: true } }; // nothing to tombstone — idempotent
1371
+ try {
1372
+ await this.durableLeaveFor(caller, channel, args.generation);
1373
+ }
1374
+ catch (e) {
1375
+ return { ok: false, error: e.message };
1376
+ }
1377
+ return { ok: true, data: { channel } };
1378
+ }
1131
1379
  /** (Re)bind the Plane-3 fan-out writer + trusted reader. Idempotent — the durables resume from their
1132
1380
  * cursor. Called by {@link startPlane3} once AND by {@link connectAndBind} on every (re)connect, so
1133
- * a manager-endpoint reconnect RE-ARMS the backstop. Without this, a broker blip would silently kill
1381
+ * the delivery daemon's reconnect RE-ARMS the backstop + the ctl.delivery responder. Without this, a broker blip would silently kill
1134
1382
  * the loops while `durableJoinFor` kept reporting `durable:true` (the impl-review's BLOCKER-1). No-op
1135
1383
  * unless this endpoint hosts Plane-3 (`this.plane3` set). */
1136
1384
  async armPlane3() {
1137
1385
  if (!this.plane3 || !this.js)
1138
1386
  return;
1139
1387
  await this.manager(); // the manager runs consume:false, so this.jsm is lazy — ensure it
1388
+ this.armDeliveryControl();
1140
1389
  await this.runFanout();
1141
1390
  await this.runReader();
1142
1391
  }
1392
+ /** (Re)register the `ctl.delivery` control responder on the CURRENT connection. A reconnect drains the
1393
+ * old connection (the old sub is dead and `clearConnectionScoped` leaves caller-owned subs alone), so
1394
+ * this MUST run on every arm — otherwise durable join/leave/list silently lose their responder after a
1395
+ * broker blip. The stale sub is dropped (unsubscribed + removed from `this.subs`) before re-creating.
1396
+ * `boundReply` is essential here: the daemon holds a wildcard reply-publish grant, so the serve path
1397
+ * must reject any reply target outside the authenticated sender's own subtree (confused-deputy fix). */
1398
+ armDeliveryControl() {
1399
+ if (this.deliveryServeSub) {
1400
+ try {
1401
+ this.deliveryServeSub.unsubscribe();
1402
+ }
1403
+ catch { /* dead with the old connection */ }
1404
+ const i = this.subs.indexOf(this.deliveryServeSub);
1405
+ if (i >= 0)
1406
+ this.subs.splice(i, 1);
1407
+ }
1408
+ this.deliveryServeSub = this.serveControl(CONTROL_DELIVERY, (req) => this.handleDeliveryControl(req), { boundReply: true });
1409
+ }
1143
1410
  /** Fan-out loop: bind the privileged `fanout` durable on CHAT and route each message (routing only —
1144
1411
  * the trusted reader is the auth gate). */
1145
1412
  async runFanout() {
@@ -1206,7 +1473,7 @@ export class CotalEndpoint extends EventEmitter {
1206
1473
  const owner = this.resolveOwnerByName(name);
1207
1474
  if (!owner || owner === msg.from.id)
1208
1475
  continue;
1209
- const acl = this.plane3?.aclFor(owner);
1476
+ const acl = await this.plane3?.aclFor(owner);
1210
1477
  if (!acl || !channelInAllow(acl, channel))
1211
1478
  continue; // @mention can't bypass the read ACL
1212
1479
  await this.publishDinbox(owner, { msg, channel, seq, reason: "live-mention", generation: 0 });
@@ -1261,7 +1528,7 @@ export class CotalEndpoint extends EventEmitter {
1261
1528
  return;
1262
1529
  } // undecodable — drop
1263
1530
  const redeliveries = m.info?.deliveryCount ?? 1; // JsMsg delivery attempts (1 on first delivery)
1264
- const acl = this.plane3?.aclFor(owner);
1531
+ const acl = await this.plane3?.aclFor(owner);
1265
1532
  if (acl === undefined) {
1266
1533
  // UNKNOWN owner — the manager has not (re)hydrated this owner's ACL yet (e.g. right after a
1267
1534
  // manager PROCESS restart). This is NOT a revocation: DEFER (redeliver), never drop — an ack here
@@ -1311,7 +1578,7 @@ export class CotalEndpoint extends EventEmitter {
1311
1578
  m.ack();
1312
1579
  }
1313
1580
  /** Agent-side: bind + pump our pre-created Plane-3 DELIVER durable (`dlv_<id>`). Every message here is
1314
- * manager-written (DLV is manager-write-only, broker-enforced) and is a CHANNEL message by contract
1581
+ * delivery-daemon-written (DLV is delivery-write-only, broker-enforced) and is a CHANNEL message by contract
1315
1582
  * (the backstop never carries DMs), so `kind=channel` is path-derived (SPEC §4) and the body is
1316
1583
  * trusted (no spoof-guard). `durable:true` — real JetStream ack, coalesced with the core-sub live
1317
1584
  * copy by `MeshAgent.ingest`. No-op when the durable isn't present (open mode / not provisioned). */
@@ -1351,19 +1618,19 @@ export class CotalEndpoint extends EventEmitter {
1351
1618
  })().catch((e) => { if (!this.stopped)
1352
1619
  this.emit("error", e); });
1353
1620
  }
1354
- /** Agent-side: request a Plane-3 durable backstop for a channel via the manager (ctl.self). Throws
1355
- * when no privileged writer is present (open / manager-less). 30s timeout — activation catch-up may
1621
+ /** Agent-side: request a Plane-3 durable backstop for a channel via the server-side delivery daemon (ctl.delivery). Throws
1622
+ * when no privileged writer is present (open / no delivery daemon). 30s timeout — activation catch-up may
1356
1623
  * run before the reply (the window is small, but a busy channel can take more than the 5s default). */
1357
1624
  async durableJoinChannel(channel) {
1358
- const reply = await this.requestControl(CONTROL_SELF_SERVICE, { op: "durableJoin", args: { channel } }, 30_000);
1625
+ const reply = await this.requestDelivery("durableJoin", { channel }, 30_000);
1359
1626
  if (!reply.ok)
1360
1627
  throw new Error(reply.error ?? "durable join rejected");
1361
1628
  return reply.data ?? { durable: false };
1362
1629
  }
1363
1630
  /** Agent-side: release a Plane-3 durable backstop (tombstone membership at the leave cursor). Passes
1364
- * the join generation so a stale leave can't tombstone a newer rejoin (the manager validates it). */
1631
+ * the join generation so a stale leave can't tombstone a newer rejoin (the delivery daemon validates it). */
1365
1632
  async durableLeaveChannel(channel, generation) {
1366
- const reply = await this.requestControl(CONTROL_SELF_SERVICE, { op: "durableLeave", args: { channel, generation } });
1633
+ const reply = await this.requestDelivery("durableLeave", { channel, generation });
1367
1634
  if (!reply.ok)
1368
1635
  throw new Error(reply.error ?? "durable leave rejected");
1369
1636
  }
@@ -1373,7 +1640,7 @@ export class CotalEndpoint extends EventEmitter {
1373
1640
  * is reachable, never a silent give-up. While pending, the channel is tracked in
1374
1641
  * {@link pendingDurableLeave} and surfaced via {@link pendingDurableLeaves} (the connector shows it in
1375
1642
  * `cotal_channels` as `durable-unclosed`, never ordinary absence). The generation is kept the whole
1376
- * time. Authoritative closure of a revoked membership is also the manager's job (revocation). */
1643
+ * time. Authoritative closure of a revoked membership is also handled by revocation (rotate creds + tear down). */
1377
1644
  async closeRefusedMembership(channel, generation) {
1378
1645
  this.pendingDurableLeave.set(channel, generation);
1379
1646
  for (let attempt = 0;; attempt++) {
@@ -1406,42 +1673,94 @@ export class CotalEndpoint extends EventEmitter {
1406
1673
  }
1407
1674
  /** Agent-side: this session's CURRENT durable memberships (channel + join generation) from the
1408
1675
  * manager — the agent holds no read on the privileged members KV. `undefined` ⇒ NO control responder
1409
- * (open / manager-less, so there is no Plane-3 and no memberships). THROWS on a responder-present RPC
1676
+ * (open / no delivery daemon, so there is no Plane-3 and no memberships). THROWS on a responder-present RPC
1410
1677
  * failure, so a caller can FAIL-CLOSED rather than mistaking a transient error for "no membership". */
1411
1678
  async fetchMemberships() {
1412
1679
  let reply;
1413
1680
  try {
1414
- reply = await this.requestControl(CONTROL_SELF_SERVICE, { op: "listMemberships", args: {} }, 5_000);
1681
+ reply = await this.requestDelivery("listMemberships", {}, 5_000);
1415
1682
  }
1416
1683
  catch (e) {
1417
1684
  if (this.isNoResponders(e))
1418
- return undefined; // no manager — open / manager-less, no Plane-3
1685
+ return undefined; // no delivery daemon — open / daemon-less, no Plane-3
1419
1686
  throw e; // responder present but errored — surface it (leaveChannel fails closed)
1420
1687
  }
1421
1688
  if (!reply.ok)
1422
1689
  throw new Error(reply.error ?? "listMemberships failed");
1423
1690
  return reply.data?.memberships ?? [];
1424
1691
  }
1425
- /** Agent-side: seed `plane3Channels` with this session's boot durable memberships + generations on
1426
- * first connect (the agent holds no read on the privileged members KV). A best-effort OPTIMIZATION: it
1427
- * pre-fills the leave-generation mirror + the durable-state surface. If it can't (a transient manager
1428
- * error), {@link leaveChannel} re-resolves the generation on demand and fails closed there so a
1429
- * missed hydration never silently leaves a boot durable channel untombstonable. */
1430
- async hydrateMemberships() {
1431
- let memberships;
1432
- try {
1433
- memberships = await this.fetchMemberships();
1692
+ /** Agent-side, first connect (auth): SELF-JOIN this session's durable boot channels via the
1693
+ * server-side delivery daemon — replacing the old manager-written boot membership. Each concrete
1694
+ * `durable`-class boot channel gets a `durableJoin` whose returned generation seeds the leave mirror
1695
+ * + durable-state surface; an already-active membership (a relaunch) is idempotent (no re-catch-up).
1696
+ * If the daemon is down/absent at first connect (or reports a transient `durable:false`), the channel
1697
+ * is handed to {@link reconcileBootJoin} for capped-backoff retry — so the backstop is RESTORED once
1698
+ * the daemon recovers, not left silently live-only. Until a membership exists the channel renders
1699
+ * degraded in `cotal_channels` ({@link hasDurableMembership}). */
1700
+ async armBootDurableMemberships() {
1701
+ for (const channel of this.channels) {
1702
+ if (!isConcreteChannel(channel) || this.plane3Channels.has(channel))
1703
+ continue;
1704
+ let cls;
1705
+ try {
1706
+ cls = await this.deliveryClassFresh(channel);
1707
+ }
1708
+ catch {
1709
+ continue;
1710
+ }
1711
+ if (cls !== "durable")
1712
+ continue;
1713
+ try {
1714
+ const r = await this.durableJoinChannel(channel);
1715
+ if (r.durable)
1716
+ this.plane3Channels.set(channel, r.generation ?? 0);
1717
+ else
1718
+ void this.reconcileBootJoin(channel); // present but not yet durable — reconcile to recovery
1719
+ }
1720
+ catch (e) {
1721
+ if (!this.isNoResponders(e))
1722
+ this.emit("error", e); // no daemon ⇒ retry until it recovers
1723
+ void this.reconcileBootJoin(channel);
1724
+ }
1434
1725
  }
1435
- catch {
1436
- return; // transient manager error at boot — leaveChannel re-resolves on demand (fail-closed there)
1726
+ }
1727
+ /** Retry a boot durable self-join with capped backoff until a membership EXISTS (success → seed
1728
+ * `plane3Channels`) or the channel is left / the endpoint stops. Mirrors {@link closeRefusedMembership}:
1729
+ * a one-shot first-connect attempt that swallowed a daemon outage would leave the boot channel live-only
1730
+ * forever after the daemon recovers (and the lease-based health could then read "active" with no owner
1731
+ * membership). This loop is the reconcile that closes that gap. Idempotent — a channel already pending
1732
+ * is not double-driven; survives reconnect (it re-issues `durableJoinChannel` on the current connection). */
1733
+ async reconcileBootJoin(channel) {
1734
+ if (this.pendingBootJoins.has(channel))
1735
+ return; // already reconciling
1736
+ this.pendingBootJoins.add(channel);
1737
+ for (let attempt = 0;; attempt++) {
1738
+ await new Promise((r) => setTimeout(r, Math.min(30_000, 1000 * 2 ** attempt)));
1739
+ if (this.stopped || !this.channels.includes(channel) || this.plane3Channels.has(channel)) {
1740
+ this.pendingBootJoins.delete(channel);
1741
+ return; // stopped, left, or another path established it
1742
+ }
1743
+ try {
1744
+ const r = await this.durableJoinChannel(channel);
1745
+ if (r.durable) {
1746
+ this.plane3Channels.set(channel, r.generation ?? 0);
1747
+ this.pendingBootJoins.delete(channel);
1748
+ return;
1749
+ }
1750
+ // present but durable:false (e.g. catch-up window evicted) — keep retrying; the channel stays
1751
+ // honestly degraded meanwhile, never silently "active".
1752
+ }
1753
+ catch (e) {
1754
+ if (attempt === 0 && !this.isNoResponders(e))
1755
+ this.emit("error", new Error(`channel "${channel}": boot durable self-join not yet established — retrying until the delivery daemon is reachable (${e.message})`));
1756
+ }
1437
1757
  }
1438
- if (!memberships)
1439
- return; // no manager — live-only
1440
- // Seed the mirror (+ durable-state surface) with CONFIRMED backstops only; leaveChannel re-resolves a
1441
- // non-activated record on demand if it ever needs to close one.
1442
- for (const m of memberships)
1443
- if (m.activated && this.channels.includes(m.channel))
1444
- this.plane3Channels.set(m.channel, m.generation);
1758
+ }
1759
+ /** True if this session holds an established Plane-3 durable membership for `channel` (in `plane3Channels`).
1760
+ * Drives the membership-aware delivery-health surface: a joined durable channel that is NOT yet a member
1761
+ * (boot self-join pending / daemon down) must render degraded, never "active" off a live lease alone. */
1762
+ hasDurableMembership(channel) {
1763
+ return this.plane3Channels.has(channel);
1445
1764
  }
1446
1765
  /** Lazily obtain a JetStream manager — so a non-consuming endpoint (e.g. the supervisor,
1447
1766
  * consume:false) can still pre-create others' durables. */
@@ -1472,9 +1791,10 @@ export class CotalEndpoint extends EventEmitter {
1472
1791
  await this.pumpDlv();
1473
1792
  // Multicast: open a native CORE subscription for each channel (live, manager-free, broker-enforced
1474
1793
  // by sub.allow) — boot + runtime joins use the SAME path; there is no per-instance chat durable.
1475
- // The durable backstop (a busy/offline turn) is Plane-3 (auth: membership written at provision, the
1476
- // manager's fan-out writer + trusted reader deliver via the `dlv_<id>` pump above; open dev mode is
1477
- // live-only — the durable plane needs the manager's trusted reader, the security boundary). Per-
1794
+ // The durable backstop (a busy/offline turn) is Plane-3 (auth: membership established by the agent's
1795
+ // self-join, the delivery daemon's fan-out writer + trusted reader deliver via the `dlv_<id>` pump
1796
+ // above; open dev mode is live-only — the durable plane needs the daemon's trusted reader, the
1797
+ // security boundary). Per-
1478
1798
  // channel history is the explicit replay-gated backfill, on FIRST connect only; a reconnect reopens
1479
1799
  // the subs without re-backfilling (the durable backstop redelivers any missed window via dlv).
1480
1800
  if (this.channels.length) {
@@ -1490,11 +1810,11 @@ export class CotalEndpoint extends EventEmitter {
1490
1810
  if (armed)
1491
1811
  await this.backfillArmed(armed);
1492
1812
  }
1493
- // First connect, auth mode: hydrate the local generation mirror for BOOT durable memberships (the
1494
- // manager provisioned them server-side, so they are not in plane3Channels yet) — without it,
1495
- // leaving a boot durable channel could not tombstone its §7 boundary. Open mode has no Plane-3.
1813
+ // First connect, auth mode: self-join BOOT durable channels via the server-side delivery daemon
1814
+ // (it owns membership now — there is no manager-written boot membership). Seeds plane3Channels so a
1815
+ // later leave can tombstone the §7 boundary; idempotent on relaunch. Open mode has no Plane-3.
1496
1816
  if (this.firstConnect && this.creds && this.channels.length)
1497
- await this.hydrateMemberships();
1817
+ await this.armBootDurableMemberships();
1498
1818
  this.firstConnect = false;
1499
1819
  // Anycast: a shared work-queue consumer for our role — one instance grabs each task.
1500
1820
  // Open mode self-creates; auth mode BINDS the provisioner-pre-created svc_<role>
@@ -1925,8 +2245,8 @@ export class CotalEndpoint extends EventEmitter {
1925
2245
  }
1926
2246
  }
1927
2247
  async publishPresence() {
1928
- if (!this.kv)
1929
- return;
2248
+ if (!this.doRegister || !this.kv)
2249
+ return; // observers watch but never publish their own record
1930
2250
  const p = {
1931
2251
  card: this.card,
1932
2252
  status: this.status,
@@ -2143,4 +2463,27 @@ export async function isReachable(servers = DEFAULT_SERVER, opts = {}) {
2143
2463
  return e instanceof AuthorizationError || e instanceof UserAuthenticationExpiredError;
2144
2464
  }
2145
2465
  }
2466
+ /** Like {@link isReachable}, but distinguishes "up but won't take these creds" from "nothing there".
2467
+ * `spawn` needs the difference: auth-required → name the trust dir + next step; unreachable → the
2468
+ * mesh is down (prune the stale entry, tell the user to `cotal up`). Pass `creds` to confirm a
2469
+ * specific identity is accepted (`ok`); omit them to probe mere liveness (an auth broker answers
2470
+ * `auth-required`, which still proves it's up). */
2471
+ export async function probeConnect(server = DEFAULT_SERVER, opts = {}) {
2472
+ try {
2473
+ const nc = await connect({
2474
+ servers: server,
2475
+ timeout: opts.timeoutMs ?? 1000,
2476
+ reconnect: false,
2477
+ maxReconnectAttempts: 0,
2478
+ ...authOpts(opts),
2479
+ });
2480
+ await nc.close();
2481
+ return { ok: true };
2482
+ }
2483
+ catch (e) {
2484
+ if (e instanceof AuthorizationError || e instanceof UserAuthenticationExpiredError)
2485
+ return { ok: false, reason: "auth-required" };
2486
+ return { ok: false, reason: "unreachable" };
2487
+ }
2488
+ }
2146
2489
  //# sourceMappingURL=endpoint.js.map