@cotal-ai/core 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/acls.d.ts +45 -0
- package/dist/acls.d.ts.map +1 -0
- package/dist/acls.js +86 -0
- package/dist/acls.js.map +1 -0
- package/dist/command.d.ts +3 -0
- package/dist/command.d.ts.map +1 -1
- package/dist/endpoint.d.ts +158 -54
- package/dist/endpoint.d.ts.map +1 -1
- package/dist/endpoint.js +366 -98
- package/dist/endpoint.js.map +1 -1
- package/dist/index.d.ts +2 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +2 -0
- package/dist/index.js.map +1 -1
- package/dist/lease.d.ts +40 -0
- package/dist/lease.d.ts.map +1 -0
- package/dist/lease.js +64 -0
- package/dist/lease.js.map +1 -0
- package/dist/provision.d.ts +29 -19
- package/dist/provision.d.ts.map +1 -1
- package/dist/provision.js +94 -12
- package/dist/provision.js.map +1 -1
- package/dist/streams.d.ts +11 -0
- package/dist/streams.d.ts.map +1 -1
- package/dist/streams.js +19 -5
- package/dist/streams.js.map +1 -1
- package/dist/subjects.d.ts +45 -2
- package/dist/subjects.d.ts.map +1 -1
- package/dist/subjects.js +67 -3
- package/dist/subjects.js.map +1 -1
- package/dist/types.d.ts +17 -0
- package/dist/types.d.ts.map +1 -1
- package/package.json +1 -1
package/dist/endpoint.js
CHANGED
|
@@ -7,8 +7,10 @@ import { createSpaceStreams, dmDurableConfig, dlvDurableConfig, taskDurableConfi
|
|
|
7
7
|
import { jetstream, jetstreamManager, AckPolicy, DeliverPolicy, } from "@nats-io/jetstream";
|
|
8
8
|
import { Kvm } from "@nats-io/kv";
|
|
9
9
|
import { openMembersRegistry, commitMember, tombstoneMember, activateMember, readMember, listMembers, durableEligible, StaleMembershipWrite, } from "./members.js";
|
|
10
|
+
import { openAclRegistry, readAcl, commitAcl as writeAclRecord } from "./acls.js";
|
|
11
|
+
import { openDeliveryRegistry } from "./lease.js";
|
|
10
12
|
import { openChannelRegistry, effectiveReplay, effectiveReplayWindowMs, effectiveDeliveryClass, readChannelConfig, readChannelDefaults, } from "./channels.js";
|
|
11
|
-
import { anycastSubject, CHANNEL_DEFAULTS_KEY, chatStream, chatHistDurable, chatSubject, controlServiceSubject, CONTROL_SELF_SERVICE, dmStream, dmDurable, dlvStream, dlvDurable, dlvSubject, dinboxSubject, inboxStream, parseDinboxOwner, FANOUT_DURABLE, INBOX_READER_DURABLE, chatWildcard, channelInAllow, isConcreteChannel, normalizeMentions, parseSubject, presenceBucket, spacePrefix, spaceWildcard, subjectMatches, taskStream, taskDurable, token, unicastSubject, } from "./subjects.js";
|
|
13
|
+
import { anycastSubject, CHANNEL_DEFAULTS_KEY, chatStream, chatHistDurable, chatSubject, controlServiceSubject, CONTROL_SELF_SERVICE, CONTROL_DELIVERY, dmStream, dmDurable, dlvStream, dlvDurable, dlvSubject, dinboxSubject, inboxStream, parseDinboxOwner, FANOUT_DURABLE, INBOX_READER_DURABLE, leaseKey, chatWildcard, assertValidChannel, channelInAllow, isConcreteChannel, normalizeMentions, parseSubject, presenceBucket, spacePrefix, spaceWildcard, subjectMatches, taskStream, taskDurable, token, unicastSubject, } from "./subjects.js";
|
|
12
14
|
export const DEFAULT_SERVER = "nats://127.0.0.1:4222";
|
|
13
15
|
/** Space joined when none is given on the CLI (the `cotal-<space>` cmux tab, etc.). */
|
|
14
16
|
export const DEFAULT_SPACE = "main";
|
|
@@ -50,10 +52,17 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
50
52
|
jsm;
|
|
51
53
|
kv;
|
|
52
54
|
channelKv;
|
|
53
|
-
/** Plane-3 durable-membership registry KV — lazily opened by the privileged (
|
|
55
|
+
/** Plane-3 durable-membership registry KV — lazily opened by the privileged delivery daemon (or a
|
|
56
|
+
* short-lived provisioner). */
|
|
54
57
|
membersKv;
|
|
55
|
-
|
|
56
|
-
|
|
58
|
+
aclKv;
|
|
59
|
+
deliveryKv;
|
|
60
|
+
/** The live `ctl.delivery` serve subscription (delivery daemon) — re-created on every (re)connect by
|
|
61
|
+
* {@link armDeliveryControl}; tracked so the stale one is dropped on reconnect. */
|
|
62
|
+
deliveryServeSub;
|
|
63
|
+
/** When set, this endpoint hosts the Plane-3 fan-out writer + trusted reader (the server-side delivery
|
|
64
|
+
* daemon). `aclFor` maps an owner id to its current read ACL (`allowSubscribe`) for the reader's
|
|
65
|
+
* re-authorization — read FRESH per entry from the durable ACL registry KV, hence async. */
|
|
57
66
|
plane3;
|
|
58
67
|
/** Live local cache of the channel registry (key = channel token), kept by a KV watch. */
|
|
59
68
|
channelConfigs = new Map();
|
|
@@ -89,6 +98,12 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
89
98
|
* {@link pendingDurableLeaves} (the connector shows it in `cotal_channels`, never as ordinary
|
|
90
99
|
* absence). Persists across reconnect; cleared on tombstone success or full stop. */
|
|
91
100
|
pendingDurableLeave = new Map();
|
|
101
|
+
/** Boot durable channels whose self-join hasn't yet established a membership (daemon down/absent at
|
|
102
|
+
* first connect, or a transient `durable:false`). {@link reconcileBootJoin} retries with capped
|
|
103
|
+
* backoff until the membership exists or the channel is left — so a first-connect daemon outage
|
|
104
|
+
* self-heals on recovery instead of leaving the channel silently live-only. Surfaced to the connector
|
|
105
|
+
* via {@link hasDurableMembership} (a joined durable channel NOT yet a member renders degraded). */
|
|
106
|
+
pendingBootJoins = new Set();
|
|
92
107
|
/** Chat-join subjects currently being broker-confirmed. An out-of-ACL subscribe among these trips an
|
|
93
108
|
* EXPECTED async permission violation that joinChannel turns into a clean throw, so watchStatus
|
|
94
109
|
* suppresses it rather than surfacing a spurious connection error. */
|
|
@@ -220,7 +235,7 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
220
235
|
await this.ensureStreams();
|
|
221
236
|
await this.startConsumers();
|
|
222
237
|
}
|
|
223
|
-
// Re-arm Plane-3 (
|
|
238
|
+
// Re-arm Plane-3 (delivery-daemon-hosted fan-out + trusted reader + ctl.delivery) on every (re)connect — no-op unless this
|
|
224
239
|
// endpoint hosts it. The first arm comes from startPlane3 (after start()); this re-binds the loops
|
|
225
240
|
// a reconnect's clearConnectionScoped() tore down, so a broker blip doesn't silently kill the backstop.
|
|
226
241
|
await this.armPlane3();
|
|
@@ -332,6 +347,12 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
332
347
|
this.jsm = undefined;
|
|
333
348
|
this.kv = undefined;
|
|
334
349
|
this.channelKv = undefined;
|
|
350
|
+
// Plane-3 KV handles are bound to the old connection too — drop them so the daemon re-opens them on
|
|
351
|
+
// the fresh nc (else durableJoin/leave/list, the reader's ACL re-auth, and lease renew use a dead
|
|
352
|
+
// handle after a reconnect).
|
|
353
|
+
this.membersKv = undefined;
|
|
354
|
+
this.aclKv = undefined;
|
|
355
|
+
this.deliveryKv = undefined;
|
|
335
356
|
this.emit("connection", { connected: false }); // null window opened — not live until the rebind below
|
|
336
357
|
try {
|
|
337
358
|
await oldNc?.drain();
|
|
@@ -518,8 +539,16 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
518
539
|
})().catch((e) => this.emit("error", e));
|
|
519
540
|
}
|
|
520
541
|
// ---- control plane (request/reply) --------------------------------------
|
|
521
|
-
/** Serve control requests for a service
|
|
522
|
-
|
|
542
|
+
/** Serve control requests for a service. Returns the subscription so a caller that re-registers on
|
|
543
|
+
* reconnect (the delivery daemon) can drop the stale one. `boundReply` is REQUIRED for any service
|
|
544
|
+
* whose responder holds a wildcard publish grant over the service subtree (the delivery daemon's
|
|
545
|
+
* `ctl.delivery.*.reply.>`): without it, an authenticated caller could set its reply target to a
|
|
546
|
+
* PEER's reply lane (`ctl.delivery.<victim>.reply.<n>`) and turn the responder into a confused
|
|
547
|
+
* deputy — the broker does NOT permission-check the requester's embedded reply subject. With it, a
|
|
548
|
+
* reply is published only when `m.reply` is under the AUTHENTICATED request subject
|
|
549
|
+
* (`${m.subject}.reply.…`), binding the reply to the broker-policed sender token. (The manager's
|
|
550
|
+
* tiers reply into the per-id `_INBOX` and leave it off.) */
|
|
551
|
+
serveControl(service, handler, opts = {}) {
|
|
523
552
|
if (!this.nc)
|
|
524
553
|
throw new Error("endpoint not started");
|
|
525
554
|
const sub = this.nc.subscribe(controlServiceSubject(this.space, service, "*"), {
|
|
@@ -528,6 +557,12 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
528
557
|
this.subs.push(sub);
|
|
529
558
|
void (async () => {
|
|
530
559
|
for await (const m of sub) {
|
|
560
|
+
// Sender-bound reply guard (confused-deputy fix): never respond to a reply target outside the
|
|
561
|
+
// authenticated request subject's own `.reply.` subtree. Drop silently (don't inject elsewhere).
|
|
562
|
+
if (opts.boundReply && (!m.reply || !m.reply.startsWith(`${m.subject}.reply.`))) {
|
|
563
|
+
this.emit("error", new Error(`rejected ${service} request on ${m.subject}: reply target "${m.reply ?? "(none)"}" is not under the sender's own reply subtree`));
|
|
564
|
+
continue;
|
|
565
|
+
}
|
|
531
566
|
let reply;
|
|
532
567
|
try {
|
|
533
568
|
const req = m.json();
|
|
@@ -556,6 +591,7 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
556
591
|
}
|
|
557
592
|
}
|
|
558
593
|
})().catch((e) => this.emit("error", e));
|
|
594
|
+
return sub;
|
|
559
595
|
}
|
|
560
596
|
/** Send a control request to a service and await its reply (client side). */
|
|
561
597
|
async requestControl(service, req, timeoutMs = 5000) {
|
|
@@ -565,6 +601,26 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
565
601
|
const m = await this.nc.request(controlServiceSubject(this.space, service, this.card.id), JSON.stringify(body), { timeout: timeoutMs });
|
|
566
602
|
return m.json();
|
|
567
603
|
}
|
|
604
|
+
/** Send a durable-membership request to the SERVER-SIDE delivery daemon (`ctl.delivery`) and await its
|
|
605
|
+
* reply. Unlike {@link requestControl}, the reply rides a subject UNDER `ctl.delivery.<id>.>` (not the
|
|
606
|
+
* per-id `_INBOX`), so the scoped delivery cred can answer without broad inbox-publish — see
|
|
607
|
+
* CONTROL_DELIVERY. `noMux` lets us name the reply subject while keeping NoResponders detection (so a
|
|
608
|
+
* caller can fail-closed vs. degrade to live-only when no daemon is present). */
|
|
609
|
+
async requestDelivery(op, args, timeoutMs = 5000) {
|
|
610
|
+
if (!this.nc)
|
|
611
|
+
throw new Error(this.notLiveMsg());
|
|
612
|
+
const reqSubject = controlServiceSubject(this.space, CONTROL_DELIVERY, this.card.id); // ctl.delivery.<id>
|
|
613
|
+
// Reply rides the sender's OWN subtree so the daemon's serveControl boundReply guard accepts it
|
|
614
|
+
// (`${reqSubject}.reply.…`). The sender-bound guard is the COMPLETE confused-deputy closure. The
|
|
615
|
+
// random suffix is genuine defense-in-depth (NOT cosmetic): `noMux` subscribes this SPECIFIC named
|
|
616
|
+
// reply subject (not a standing `.reply.>` wildcard), so a predictable suffix would let a peer target
|
|
617
|
+
// an in-flight reply subscription — randomUUID brings it to parity with the nuid-protected `_INBOX`
|
|
618
|
+
// model. Keep both; don't regress to a counter. (Confirmed by the review panel's fact-check.)
|
|
619
|
+
const reply = `${reqSubject}.reply.${randomUUID()}`;
|
|
620
|
+
const body = { op, args, from: this.ref() };
|
|
621
|
+
const m = await this.nc.request(reqSubject, JSON.stringify(body), { timeout: timeoutMs, noMux: true, reply });
|
|
622
|
+
return m.json();
|
|
623
|
+
}
|
|
568
624
|
// ---- presence ------------------------------------------------------------
|
|
569
625
|
getRoster() {
|
|
570
626
|
return [...this.roster.values()].sort((a, b) => a.card.name.localeCompare(b.card.name));
|
|
@@ -611,6 +667,12 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
611
667
|
channelReplay(channel) {
|
|
612
668
|
return effectiveReplay(this.channelConfigs.get(channel), this.channelDefaults);
|
|
613
669
|
}
|
|
670
|
+
/** Effective delivery class for a channel (per-channel override ?? space default ?? "durable"),
|
|
671
|
+
* from the live watch cache — drives the non-gating delivery-health surface (only durable-class
|
|
672
|
+
* channels have a Plane-3 backstop to report on). */
|
|
673
|
+
channelDeliveryClass(channel) {
|
|
674
|
+
return effectiveDeliveryClass(this.channelConfigs.get(channel), this.channelDefaults);
|
|
675
|
+
}
|
|
614
676
|
// ---- dynamic subscription (join / leave mid-session) ---------------------
|
|
615
677
|
/** The channels this endpoint is currently subscribed to (live — reflects join/leave). */
|
|
616
678
|
joinedChannels() {
|
|
@@ -619,9 +681,10 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
619
681
|
/**
|
|
620
682
|
* Join a channel mid-session: open a native core subscription (manager-free live read, broker-
|
|
621
683
|
* confirmed against `sub.allow`), capture the stream frontier as the join watermark, backfill its
|
|
622
|
-
* history if replay is on, and — for a `durable`-class channel
|
|
623
|
-
* durable backstop. Idempotent: re-joining is a no-op (no
|
|
624
|
-
* whether the durable backstop is active (+ a `reason`
|
|
684
|
+
* history if replay is on, and — for a `durable`-class channel when a delivery daemon is present —
|
|
685
|
+
* request a Plane-3 durable backstop (via `ctl.delivery`). Idempotent: re-joining is a no-op (no
|
|
686
|
+
* re-backfill). Returns the backfill count + whether the durable backstop is active (+ a `reason`
|
|
687
|
+
* when a durable channel couldn't get one).
|
|
625
688
|
*/
|
|
626
689
|
async joinChannel(channel) {
|
|
627
690
|
if (!this.jsm)
|
|
@@ -656,7 +719,7 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
656
719
|
}
|
|
657
720
|
this.channels.push(channel);
|
|
658
721
|
// Durable backstop. The live core-sub above already delivers (manager-free). For a `durable`-class
|
|
659
|
-
// channel, request a Plane-3 per-member backstop from the
|
|
722
|
+
// channel, request a Plane-3 per-member backstop from the server-side delivery daemon (durableJoin via ctl.delivery) so a post reaches a
|
|
660
723
|
// busy/offline turn — the core-sub stays as the live wake-hint, dedup-coalesced with the Plane-3
|
|
661
724
|
// copy by id-dedup. No manager (open dev / manager-less) ⇒ joined LIVE only, surfaced via `reason`
|
|
662
725
|
// (never silent). A `live`-class channel takes no backstop (joined live is the contract).
|
|
@@ -674,7 +737,7 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
674
737
|
}
|
|
675
738
|
}
|
|
676
739
|
catch (e) {
|
|
677
|
-
// No privileged writer (
|
|
740
|
+
// No privileged writer (no delivery daemon) or the write was rejected — joined live, backstop
|
|
678
741
|
// unavailable. NOT a join failure: the live subscription is up and authorized.
|
|
679
742
|
reason = `durable backstop unavailable (${e.message})`;
|
|
680
743
|
}
|
|
@@ -695,11 +758,11 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
695
758
|
return { left: false };
|
|
696
759
|
// Auth + durable-class ⇒ a Plane-3 membership may exist; tombstone it BEFORE touching local state.
|
|
697
760
|
// The join generation comes from the local mirror, but a BOOT membership whose hydration was missed
|
|
698
|
-
// (
|
|
761
|
+
// (daemon down at connect) is NOT in the mirror — so re-resolve it from the delivery service on
|
|
699
762
|
// demand. FAIL-CLOSED: fetchMemberships throws on a responder-present error, so a leave whose
|
|
700
763
|
// tombstone can't be confirmed propagates (live sub stays up, mirror intact) for the caller to retry
|
|
701
764
|
// — reporting `left` while the trusted reader keeps transferring to DLV is the fail-open leak. A
|
|
702
|
-
// genuine no-responder (open /
|
|
765
|
+
// genuine no-responder (open / no delivery daemon, no Plane-3) means there is no membership to tombstone.
|
|
703
766
|
if (this.creds && effectiveDeliveryClass(this.channelConfigs.get(channel), this.channelDefaults) === "durable") {
|
|
704
767
|
let generation = this.plane3Channels.get(channel);
|
|
705
768
|
if (generation === undefined)
|
|
@@ -871,28 +934,10 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
871
934
|
throw new Error("endpoint not started");
|
|
872
935
|
await createSpaceStreams(this.jsm, this.space);
|
|
873
936
|
}
|
|
874
|
-
|
|
875
|
-
|
|
876
|
-
|
|
877
|
-
|
|
878
|
-
* `durableJoin`. `live`-class (and non-concrete) channels are skipped. Idempotent.
|
|
879
|
-
*
|
|
880
|
-
* Writes the durable RECORDS with the caller's privileged creds — it does NOT require this endpoint
|
|
881
|
-
* to host the runtime fan-out/reader loops (a space-level manager service), so EVERY auth launcher
|
|
882
|
-
* provisions identically: the manager AND the short-lived `cotal spawn` provisioner both write boot
|
|
883
|
-
* records, which the space's manager then delivers (no silent no-op — that would hide a boot
|
|
884
|
-
* membership; AGENTS.md "no fallbacks"). A space running no manager is live-only for everyone (the
|
|
885
|
-
* records exist; nothing delivers them until a manager hosts the loops).
|
|
886
|
-
*/
|
|
887
|
-
async provisionMembership(targetId, channels) {
|
|
888
|
-
for (const ch of channels) {
|
|
889
|
-
if (!isConcreteChannel(ch))
|
|
890
|
-
continue; // durable membership is per-concrete-channel
|
|
891
|
-
if ((await this.deliveryClassFresh(ch)) !== "durable")
|
|
892
|
-
continue;
|
|
893
|
-
await this.durableJoinFor(targetId, ch);
|
|
894
|
-
}
|
|
895
|
-
}
|
|
937
|
+
// (v3) The old `provisionMembership` — manager/provisioner-written boot membership at spawn — is GONE.
|
|
938
|
+
// Boot durable membership is now the AGENT self-joining its durable boot channels via the daemon's
|
|
939
|
+
// `ctl.delivery` op at connect ({@link armBootDurableMemberships}), reconciled on outage. The
|
|
940
|
+
// primitive it wrapped, {@link durableJoinFor}, is now driven by the daemon's `ctl.delivery` handler.
|
|
896
941
|
/**
|
|
897
942
|
* Privileged: pre-create an agent's DM inbox durable (auth mode), so the agent can BIND
|
|
898
943
|
* it without holding CONSUMER.CREATE on DM_<space>. The creator sets the filter to
|
|
@@ -925,26 +970,102 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
925
970
|
const jsm = await this.manager();
|
|
926
971
|
await jsm.consumers.add(taskStream(this.space), taskDurableConfig(this.space, role));
|
|
927
972
|
}
|
|
928
|
-
// ---- Plane-3: durable backstop (SPEC §8) — privileged,
|
|
973
|
+
// ---- Plane-3: durable backstop (SPEC §8) — privileged, hosted by the server-side DELIVERY DAEMON ----
|
|
929
974
|
//
|
|
930
|
-
// Two
|
|
931
|
-
// every chat message and copies it into each eligible owner's MIXED
|
|
932
|
-
// TRUSTED READER (the auth gate) re-authorizes each entry against the
|
|
933
|
-
// interval and TRANSFERS the authorized copy to the owner's per-member
|
|
934
|
-
// (`dlv.<owner>`), which the agent binds + acks via native JetStream. The agent holds no
|
|
935
|
-
// mixed store.
|
|
936
|
-
|
|
975
|
+
// Two daemon loops + two privileged membership ops (served to agents on `ctl.delivery`). The FAN-OUT
|
|
976
|
+
// writer (routing, not auth) reads every chat message and copies it into each eligible owner's MIXED
|
|
977
|
+
// inbox (`dinbox.<owner>`); the TRUSTED READER (the auth gate) re-authorizes each entry against the
|
|
978
|
+
// CURRENT ACL + membership interval and TRANSFERS the authorized copy to the owner's per-member
|
|
979
|
+
// DELIVER store (`dlv.<owner>`), which the agent binds + acks via native JetStream. The agent holds no
|
|
980
|
+
// read on the mixed store. (v3: this all moved off the manager — the manager is lifecycle-only; it
|
|
981
|
+
// records the read-ACL at mint via commitAcl.) See `.internal/research/stage4-impl-design.md`.
|
|
982
|
+
/** Lazily open the privileged members registry KV (delivery daemon / open-mode self). */
|
|
937
983
|
async membersRegistry() {
|
|
938
984
|
if (!this.nc)
|
|
939
985
|
throw new Error("endpoint not started");
|
|
940
986
|
this.membersKv ??= await openMembersRegistry(this.nc, this.space);
|
|
941
987
|
return this.membersKv;
|
|
942
988
|
}
|
|
989
|
+
/** Lazily open the durable read-ACL registry KV. Privileged write (the manager records an agent's
|
|
990
|
+
* ACL at mint); the delivery daemon reads it fresh per durable entry to re-authorize. */
|
|
991
|
+
async aclRegistry() {
|
|
992
|
+
if (!this.nc)
|
|
993
|
+
throw new Error("endpoint not started");
|
|
994
|
+
this.aclKv ??= await openAclRegistry(this.nc, this.space);
|
|
995
|
+
return this.aclKv;
|
|
996
|
+
}
|
|
997
|
+
/** Privileged ({@link DurableProvisioner}): record an agent's read ACL in the durable registry at
|
|
998
|
+
* provision/mint time — the same act as baking it into the JWT, persisted so the server-side
|
|
999
|
+
* delivery daemon can re-authorize the agent's durable entries and validate its runtime
|
|
1000
|
+
* durable-joins without holding any in-memory ledger. Written ATOMICALLY ({@link writeAclRecord}),
|
|
1001
|
+
* so a present record is always complete (`[]` = known no-read, never a half-write). */
|
|
1002
|
+
async commitAcl(targetId, allowSubscribe) {
|
|
1003
|
+
await writeAclRecord(await this.aclRegistry(), targetId, allowSubscribe);
|
|
1004
|
+
}
|
|
1005
|
+
/** The server-side delivery daemon's fresh-per-entry ACL read: an owner's CURRENT read ACL
|
|
1006
|
+
* (`allowSubscribe`) from the durable registry, or `undefined` if no record (an unknown owner — the
|
|
1007
|
+
* reader DEFERS, never drops). A present `[]` (known no-read) returns `[]` (the reader DROPS). */
|
|
1008
|
+
async aclForOwner(owner) {
|
|
1009
|
+
return (await readAcl(await this.aclRegistry(), owner))?.record.allowSubscribe;
|
|
1010
|
+
}
|
|
1011
|
+
/** Lazily open the delivery lease/readiness KV (pre-created at `cotal up`; bind, never create). */
|
|
1012
|
+
async deliveryRegistry() {
|
|
1013
|
+
if (!this.nc)
|
|
1014
|
+
throw new Error("endpoint not started");
|
|
1015
|
+
this.deliveryKv ??= await openDeliveryRegistry(this.nc, this.space);
|
|
1016
|
+
return this.deliveryKv;
|
|
1017
|
+
}
|
|
1018
|
+
encodeLease(ready) {
|
|
1019
|
+
return new TextEncoder().encode(JSON.stringify({ holder: this.card.id, since: Date.now(), ready }));
|
|
1020
|
+
}
|
|
1021
|
+
/** Acquire the single-flight delivery lease for a shard via an ATOMIC CAS create, marked NOT-ready.
|
|
1022
|
+
* THROWS if a live lease exists — a loud refusal-to-bind (the daemon exits), never a retry, so two
|
|
1023
|
+
* daemons can't split a durable's delivery. A crashed holder's lease auto-expires (bucket TTL),
|
|
1024
|
+
* freeing a re-acquire. Acquired BEFORE binding (single-flight gate); {@link markDeliveryLeaseReady}
|
|
1025
|
+
* flips it ready AFTER the loops + `ctl.delivery` are bound. Returns the lease revision. */
|
|
1026
|
+
async acquireDeliveryLease(shardIndex) {
|
|
1027
|
+
return (await this.deliveryRegistry()).create(leaseKey(shardIndex), this.encodeLease(false));
|
|
1028
|
+
}
|
|
1029
|
+
/** Flip the held lease to READY (CAS `kv.update`) AFTER `startPlane3` has bound the loops + the
|
|
1030
|
+
* `ctl.delivery` responder — so "lease ready" proves the responder is up, not just that the slot was
|
|
1031
|
+
* claimed. Returns the new revision. */
|
|
1032
|
+
async markDeliveryLeaseReady(shardIndex, revision) {
|
|
1033
|
+
return (await this.deliveryRegistry()).update(leaseKey(shardIndex), this.encodeLease(true), revision);
|
|
1034
|
+
}
|
|
1035
|
+
/** Renew the held lease (CAS `kv.update` against `revision`, keeping `ready:true`) to refresh it before
|
|
1036
|
+
* the bucket TTL expires it. Returns the new revision. Throws if the revision moved (lost the lease —
|
|
1037
|
+
* the daemon should exit). */
|
|
1038
|
+
async renewDeliveryLease(shardIndex, revision) {
|
|
1039
|
+
return (await this.deliveryRegistry()).update(leaseKey(shardIndex), this.encodeLease(true), revision);
|
|
1040
|
+
}
|
|
1041
|
+
/** Release the held lease on clean shutdown so a replacement daemon re-acquires immediately (best
|
|
1042
|
+
* effort — a crash just lets the bucket TTL expire it). */
|
|
1043
|
+
async releaseDeliveryLease(shardIndex) {
|
|
1044
|
+
try {
|
|
1045
|
+
await (await this.deliveryRegistry()).delete(leaseKey(shardIndex));
|
|
1046
|
+
}
|
|
1047
|
+
catch { /* already gone */ }
|
|
1048
|
+
}
|
|
1049
|
+
/** Read a shard's delivery lease (the daemon-availability signal), or `undefined` if none is live.
|
|
1050
|
+
* READ-ONLY surface — drives Component 6's `cotal_channels` delivery-health field (an agent reads it
|
|
1051
|
+
* under its own cred, which holds lease-bucket read but no write). */
|
|
1052
|
+
async readDeliveryLease(shardIndex) {
|
|
1053
|
+
const e = await (await this.deliveryRegistry()).get(leaseKey(shardIndex));
|
|
1054
|
+
if (!e || e.operation === "DEL" || e.operation === "PURGE")
|
|
1055
|
+
return undefined;
|
|
1056
|
+
try {
|
|
1057
|
+
return e.json();
|
|
1058
|
+
}
|
|
1059
|
+
catch {
|
|
1060
|
+
return undefined;
|
|
1061
|
+
}
|
|
1062
|
+
}
|
|
943
1063
|
/** Privileged: one owner's NON-TOMBSTONED durable memberships as `{channel, generation, activated}` —
|
|
944
|
-
* the
|
|
945
|
-
*
|
|
946
|
-
* ones are returned too so `leaveChannel` can discover + close a record that
|
|
947
|
-
* pure-interval predicate (a crash-stuck pending activation) — without reading
|
|
1064
|
+
* the server-side delivery daemon serves this to a connecting agent (the `listMemberships` op on
|
|
1065
|
+
* `ctl.delivery`). The agent seeds its leave mirror from the ACTIVATED ones (the confirmed backstops),
|
|
1066
|
+
* but the non-activated ones are returned too so `leaveChannel` can discover + close a record that
|
|
1067
|
+
* still routes under the pure-interval predicate (a crash-stuck pending activation) — without reading
|
|
1068
|
+
* the privileged KV itself. */
|
|
948
1069
|
async ownerMemberships(owner) {
|
|
949
1070
|
const recs = await listMembers(await this.membersRegistry(), { owner });
|
|
950
1071
|
return recs
|
|
@@ -985,16 +1106,15 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
985
1106
|
return info?.delivered?.stream_seq ?? 0;
|
|
986
1107
|
}
|
|
987
1108
|
/**
|
|
988
|
-
* Privileged durable-JOIN write (the
|
|
989
|
-
*
|
|
990
|
-
*
|
|
991
|
-
*
|
|
992
|
-
*
|
|
993
|
-
*
|
|
1109
|
+
* Privileged durable-JOIN write (v3: the delivery daemon calls this from its `ctl.delivery` handler
|
|
1110
|
+
* after validating channel ⊆ the caller's read ACL): capture `joinCursor`, commit a `durable-active`
|
|
1111
|
+
* record (CAS + generation bump), then ACTIVATION CATCH-UP idempotently copies `(joinCursor, fence]`
|
|
1112
|
+
* into the owner inbox where `fence = max(frontier, fanoutDelivered)` — fan-out owns `seq > fence`.
|
|
1113
|
+
* Idempotent against a timeout-retry (an already-activated membership no-ops). Returns `{durable:false}`
|
|
1114
|
+
* (honest degrade) only if the catch-up window was evicted.
|
|
994
1115
|
*
|
|
995
|
-
*
|
|
996
|
-
*
|
|
997
|
-
* short-lived provisioner can write a boot membership a separate long-lived manager then delivers.
|
|
1116
|
+
* Runs on the daemon (which hosts the fan-out/reader loops + the members KV), so catch-up + the
|
|
1117
|
+
* activation fence read are in-process — no cross-process cursor read.
|
|
998
1118
|
*/
|
|
999
1119
|
async durableJoinFor(owner, channel) {
|
|
1000
1120
|
if (!this.js)
|
|
@@ -1119,27 +1239,122 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
1119
1239
|
}
|
|
1120
1240
|
return { copied, evicted };
|
|
1121
1241
|
}
|
|
1122
|
-
/** Start the Plane-3 fan-out writer + trusted reader on THIS (privileged
|
|
1123
|
-
*
|
|
1124
|
-
*
|
|
1242
|
+
/** Start the Plane-3 fan-out writer + trusted reader on THIS (privileged, server-side delivery-daemon)
|
|
1243
|
+
* endpoint, AND serve the `ctl.delivery` control service (runtime durable join/leave/list). `aclFor`
|
|
1244
|
+
* maps an owner id to its current read ACL for the reader's re-authorization — read FRESH per entry
|
|
1245
|
+
* from the durable ACL registry (async). Call once after connect; idempotent durable creation lets it
|
|
1246
|
+
* resume on a daemon restart. Both the JS loops AND the `ctl.delivery` subscription are (re)bound by
|
|
1247
|
+
* {@link armPlane3} on EVERY (re)connect — a reconnect drains the old connection, so re-binding both
|
|
1248
|
+
* is required, not optional (the responder would otherwise be lost on a broker blip). */
|
|
1125
1249
|
async startPlane3(aclFor) {
|
|
1126
1250
|
if (!this.js)
|
|
1127
1251
|
throw new Error("endpoint not started");
|
|
1128
1252
|
this.plane3 = { aclFor };
|
|
1129
1253
|
await this.armPlane3();
|
|
1130
1254
|
}
|
|
1255
|
+
/** Serve one runtime durable-membership control request (the server-side delivery daemon). The caller
|
|
1256
|
+
* id is the authenticated subject sender ({@link serveControl} fail-closes on a mismatch). Validation
|
|
1257
|
+
* is against the durable ACL registry — the SAME KV the reader re-auths against (single source of
|
|
1258
|
+
* truth, no in-memory ledger to drift). */
|
|
1259
|
+
async handleDeliveryControl(req) {
|
|
1260
|
+
const caller = req.from.id;
|
|
1261
|
+
const args = req.args ?? {};
|
|
1262
|
+
if (req.op === "durableJoin")
|
|
1263
|
+
return this.deliveryJoin(caller, args);
|
|
1264
|
+
if (req.op === "durableLeave")
|
|
1265
|
+
return this.deliveryLeave(caller, args);
|
|
1266
|
+
if (req.op === "listMemberships")
|
|
1267
|
+
return { ok: true, data: { memberships: await this.ownerMemberships(caller) } };
|
|
1268
|
+
return { ok: false, error: `op "${req.op}" not supported on the delivery control service` };
|
|
1269
|
+
}
|
|
1270
|
+
/** Validate the channel ARG shape only — non-blank, valid, concrete (NO ACL check, that is op-specific).
|
|
1271
|
+
* Returns the channel on success or a ControlReply error to short-circuit. */
|
|
1272
|
+
checkDurableChannelArg(args, op) {
|
|
1273
|
+
const channel = typeof args.channel === "string" ? args.channel.trim() : "";
|
|
1274
|
+
if (!channel)
|
|
1275
|
+
return { ok: false, error: `${op}: channel must be a non-blank string` };
|
|
1276
|
+
try {
|
|
1277
|
+
assertValidChannel(channel);
|
|
1278
|
+
}
|
|
1279
|
+
catch (e) {
|
|
1280
|
+
return { ok: false, error: e.message };
|
|
1281
|
+
}
|
|
1282
|
+
if (!isConcreteChannel(channel))
|
|
1283
|
+
return { ok: false, error: `${op}: "${channel}" must be a concrete channel (durable membership is per-concrete-channel, not wildcard)` };
|
|
1284
|
+
return channel;
|
|
1285
|
+
}
|
|
1286
|
+
/** JOIN requires the channel be within the caller's CURRENT read ACL (you can't durable-subscribe a
|
|
1287
|
+
* channel you may not read). */
|
|
1288
|
+
async deliveryJoin(caller, args) {
|
|
1289
|
+
const channel = this.checkDurableChannelArg(args, "durableJoin");
|
|
1290
|
+
if (typeof channel !== "string")
|
|
1291
|
+
return channel; // a ControlReply error
|
|
1292
|
+
const acl = await readAcl(await this.aclRegistry(), caller);
|
|
1293
|
+
if (acl === undefined)
|
|
1294
|
+
return { ok: false, error: `durableJoin: no read ACL on record for ${caller} (not provisioned for durable delivery)` };
|
|
1295
|
+
if (!channelInAllow(acl.record.allowSubscribe, channel))
|
|
1296
|
+
return { ok: false, error: `channel "${channel}" is not within your read ACL [${acl.record.allowSubscribe.join(", ")}]` };
|
|
1297
|
+
try {
|
|
1298
|
+
return { ok: true, data: await this.durableJoinFor(caller, channel) };
|
|
1299
|
+
}
|
|
1300
|
+
catch (e) {
|
|
1301
|
+
return { ok: false, error: e.message };
|
|
1302
|
+
}
|
|
1303
|
+
}
|
|
1304
|
+
/** LEAVE must NOT require current-ACL coverage. Leave fires precisely when the ACL was narrowed/revoked
|
|
1305
|
+
* (a refused live sub → {@link closeRefusedMembership}); gating the tombstone on the current ACL would
|
|
1306
|
+
* loop forever and leave the SPEC §7 boundary open (the membership could resume if the ACL is later
|
|
1307
|
+
* restored). The guards are: authenticated caller (serveControl), concrete channel, a finite generation
|
|
1308
|
+
* (the join epoch — without it a stale/replayed leave could tombstone a newer rejoin), and an EXISTING
|
|
1309
|
+
* own membership; `durableLeaveFor` → `tombstoneMember` then enforces the generation match. */
|
|
1310
|
+
async deliveryLeave(caller, args) {
|
|
1311
|
+
const channel = this.checkDurableChannelArg(args, "durableLeave");
|
|
1312
|
+
if (typeof channel !== "string")
|
|
1313
|
+
return channel; // a ControlReply error
|
|
1314
|
+
if (typeof args.generation !== "number" || !Number.isFinite(args.generation))
|
|
1315
|
+
return { ok: false, error: "durableLeave: a finite generation is required (fail-closed stale-leave guard)" };
|
|
1316
|
+
const existing = await readMember(await this.membersRegistry(), channel, caller);
|
|
1317
|
+
if (!existing)
|
|
1318
|
+
return { ok: true, data: { channel, alreadyLeft: true } }; // nothing to tombstone — idempotent
|
|
1319
|
+
try {
|
|
1320
|
+
await this.durableLeaveFor(caller, channel, args.generation);
|
|
1321
|
+
}
|
|
1322
|
+
catch (e) {
|
|
1323
|
+
return { ok: false, error: e.message };
|
|
1324
|
+
}
|
|
1325
|
+
return { ok: true, data: { channel } };
|
|
1326
|
+
}
|
|
1131
1327
|
/** (Re)bind the Plane-3 fan-out writer + trusted reader. Idempotent — the durables resume from their
|
|
1132
1328
|
* cursor. Called by {@link startPlane3} once AND by {@link connectAndBind} on every (re)connect, so
|
|
1133
|
-
*
|
|
1329
|
+
* the delivery daemon's reconnect RE-ARMS the backstop + the ctl.delivery responder. Without this, a broker blip would silently kill
|
|
1134
1330
|
* the loops while `durableJoinFor` kept reporting `durable:true` (the impl-review's BLOCKER-1). No-op
|
|
1135
1331
|
* unless this endpoint hosts Plane-3 (`this.plane3` set). */
|
|
1136
1332
|
async armPlane3() {
|
|
1137
1333
|
if (!this.plane3 || !this.js)
|
|
1138
1334
|
return;
|
|
1139
1335
|
await this.manager(); // the manager runs consume:false, so this.jsm is lazy — ensure it
|
|
1336
|
+
this.armDeliveryControl();
|
|
1140
1337
|
await this.runFanout();
|
|
1141
1338
|
await this.runReader();
|
|
1142
1339
|
}
|
|
1340
|
+
/** (Re)register the `ctl.delivery` control responder on the CURRENT connection. A reconnect drains the
|
|
1341
|
+
* old connection (the old sub is dead and `clearConnectionScoped` leaves caller-owned subs alone), so
|
|
1342
|
+
* this MUST run on every arm — otherwise durable join/leave/list silently lose their responder after a
|
|
1343
|
+
* broker blip. The stale sub is dropped (unsubscribed + removed from `this.subs`) before re-creating.
|
|
1344
|
+
* `boundReply` is essential here: the daemon holds a wildcard reply-publish grant, so the serve path
|
|
1345
|
+
* must reject any reply target outside the authenticated sender's own subtree (confused-deputy fix). */
|
|
1346
|
+
armDeliveryControl() {
|
|
1347
|
+
if (this.deliveryServeSub) {
|
|
1348
|
+
try {
|
|
1349
|
+
this.deliveryServeSub.unsubscribe();
|
|
1350
|
+
}
|
|
1351
|
+
catch { /* dead with the old connection */ }
|
|
1352
|
+
const i = this.subs.indexOf(this.deliveryServeSub);
|
|
1353
|
+
if (i >= 0)
|
|
1354
|
+
this.subs.splice(i, 1);
|
|
1355
|
+
}
|
|
1356
|
+
this.deliveryServeSub = this.serveControl(CONTROL_DELIVERY, (req) => this.handleDeliveryControl(req), { boundReply: true });
|
|
1357
|
+
}
|
|
1143
1358
|
/** Fan-out loop: bind the privileged `fanout` durable on CHAT and route each message (routing only —
|
|
1144
1359
|
* the trusted reader is the auth gate). */
|
|
1145
1360
|
async runFanout() {
|
|
@@ -1206,7 +1421,7 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
1206
1421
|
const owner = this.resolveOwnerByName(name);
|
|
1207
1422
|
if (!owner || owner === msg.from.id)
|
|
1208
1423
|
continue;
|
|
1209
|
-
const acl = this.plane3?.aclFor(owner);
|
|
1424
|
+
const acl = await this.plane3?.aclFor(owner);
|
|
1210
1425
|
if (!acl || !channelInAllow(acl, channel))
|
|
1211
1426
|
continue; // @mention can't bypass the read ACL
|
|
1212
1427
|
await this.publishDinbox(owner, { msg, channel, seq, reason: "live-mention", generation: 0 });
|
|
@@ -1261,7 +1476,7 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
1261
1476
|
return;
|
|
1262
1477
|
} // undecodable — drop
|
|
1263
1478
|
const redeliveries = m.info?.deliveryCount ?? 1; // JsMsg delivery attempts (1 on first delivery)
|
|
1264
|
-
const acl = this.plane3?.aclFor(owner);
|
|
1479
|
+
const acl = await this.plane3?.aclFor(owner);
|
|
1265
1480
|
if (acl === undefined) {
|
|
1266
1481
|
// UNKNOWN owner — the manager has not (re)hydrated this owner's ACL yet (e.g. right after a
|
|
1267
1482
|
// manager PROCESS restart). This is NOT a revocation: DEFER (redeliver), never drop — an ack here
|
|
@@ -1311,7 +1526,7 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
1311
1526
|
m.ack();
|
|
1312
1527
|
}
|
|
1313
1528
|
/** Agent-side: bind + pump our pre-created Plane-3 DELIVER durable (`dlv_<id>`). Every message here is
|
|
1314
|
-
*
|
|
1529
|
+
* delivery-daemon-written (DLV is delivery-write-only, broker-enforced) and is a CHANNEL message by contract
|
|
1315
1530
|
* (the backstop never carries DMs), so `kind=channel` is path-derived (SPEC §4) and the body is
|
|
1316
1531
|
* trusted (no spoof-guard). `durable:true` — real JetStream ack, coalesced with the core-sub live
|
|
1317
1532
|
* copy by `MeshAgent.ingest`. No-op when the durable isn't present (open mode / not provisioned). */
|
|
@@ -1351,19 +1566,19 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
1351
1566
|
})().catch((e) => { if (!this.stopped)
|
|
1352
1567
|
this.emit("error", e); });
|
|
1353
1568
|
}
|
|
1354
|
-
/** Agent-side: request a Plane-3 durable backstop for a channel via the
|
|
1355
|
-
* when no privileged writer is present (open /
|
|
1569
|
+
/** Agent-side: request a Plane-3 durable backstop for a channel via the server-side delivery daemon (ctl.delivery). Throws
|
|
1570
|
+
* when no privileged writer is present (open / no delivery daemon). 30s timeout — activation catch-up may
|
|
1356
1571
|
* run before the reply (the window is small, but a busy channel can take more than the 5s default). */
|
|
1357
1572
|
async durableJoinChannel(channel) {
|
|
1358
|
-
const reply = await this.
|
|
1573
|
+
const reply = await this.requestDelivery("durableJoin", { channel }, 30_000);
|
|
1359
1574
|
if (!reply.ok)
|
|
1360
1575
|
throw new Error(reply.error ?? "durable join rejected");
|
|
1361
1576
|
return reply.data ?? { durable: false };
|
|
1362
1577
|
}
|
|
1363
1578
|
/** Agent-side: release a Plane-3 durable backstop (tombstone membership at the leave cursor). Passes
|
|
1364
|
-
* the join generation so a stale leave can't tombstone a newer rejoin (the
|
|
1579
|
+
* the join generation so a stale leave can't tombstone a newer rejoin (the delivery daemon validates it). */
|
|
1365
1580
|
async durableLeaveChannel(channel, generation) {
|
|
1366
|
-
const reply = await this.
|
|
1581
|
+
const reply = await this.requestDelivery("durableLeave", { channel, generation });
|
|
1367
1582
|
if (!reply.ok)
|
|
1368
1583
|
throw new Error(reply.error ?? "durable leave rejected");
|
|
1369
1584
|
}
|
|
@@ -1373,7 +1588,7 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
1373
1588
|
* is reachable, never a silent give-up. While pending, the channel is tracked in
|
|
1374
1589
|
* {@link pendingDurableLeave} and surfaced via {@link pendingDurableLeaves} (the connector shows it in
|
|
1375
1590
|
* `cotal_channels` as `durable-unclosed`, never ordinary absence). The generation is kept the whole
|
|
1376
|
-
* time. Authoritative closure of a revoked membership is also
|
|
1591
|
+
* time. Authoritative closure of a revoked membership is also handled by revocation (rotate creds + tear down). */
|
|
1377
1592
|
async closeRefusedMembership(channel, generation) {
|
|
1378
1593
|
this.pendingDurableLeave.set(channel, generation);
|
|
1379
1594
|
for (let attempt = 0;; attempt++) {
|
|
@@ -1406,42 +1621,94 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
1406
1621
|
}
|
|
1407
1622
|
/** Agent-side: this session's CURRENT durable memberships (channel + join generation) from the
|
|
1408
1623
|
* manager — the agent holds no read on the privileged members KV. `undefined` ⇒ NO control responder
|
|
1409
|
-
* (open /
|
|
1624
|
+
* (open / no delivery daemon, so there is no Plane-3 and no memberships). THROWS on a responder-present RPC
|
|
1410
1625
|
* failure, so a caller can FAIL-CLOSED rather than mistaking a transient error for "no membership". */
|
|
1411
1626
|
async fetchMemberships() {
|
|
1412
1627
|
let reply;
|
|
1413
1628
|
try {
|
|
1414
|
-
reply = await this.
|
|
1629
|
+
reply = await this.requestDelivery("listMemberships", {}, 5_000);
|
|
1415
1630
|
}
|
|
1416
1631
|
catch (e) {
|
|
1417
1632
|
if (this.isNoResponders(e))
|
|
1418
|
-
return undefined; // no
|
|
1633
|
+
return undefined; // no delivery daemon — open / daemon-less, no Plane-3
|
|
1419
1634
|
throw e; // responder present but errored — surface it (leaveChannel fails closed)
|
|
1420
1635
|
}
|
|
1421
1636
|
if (!reply.ok)
|
|
1422
1637
|
throw new Error(reply.error ?? "listMemberships failed");
|
|
1423
1638
|
return reply.data?.memberships ?? [];
|
|
1424
1639
|
}
|
|
1425
|
-
/** Agent-side
|
|
1426
|
-
*
|
|
1427
|
-
*
|
|
1428
|
-
*
|
|
1429
|
-
*
|
|
1430
|
-
|
|
1431
|
-
|
|
1432
|
-
|
|
1433
|
-
|
|
1640
|
+
/** Agent-side, first connect (auth): SELF-JOIN this session's durable boot channels via the
|
|
1641
|
+
* server-side delivery daemon — replacing the old manager-written boot membership. Each concrete
|
|
1642
|
+
* `durable`-class boot channel gets a `durableJoin` whose returned generation seeds the leave mirror
|
|
1643
|
+
* + durable-state surface; an already-active membership (a relaunch) is idempotent (no re-catch-up).
|
|
1644
|
+
* If the daemon is down/absent at first connect (or reports a transient `durable:false`), the channel
|
|
1645
|
+
* is handed to {@link reconcileBootJoin} for capped-backoff retry — so the backstop is RESTORED once
|
|
1646
|
+
* the daemon recovers, not left silently live-only. Until a membership exists the channel renders
|
|
1647
|
+
* degraded in `cotal_channels` ({@link hasDurableMembership}). */
|
|
1648
|
+
async armBootDurableMemberships() {
|
|
1649
|
+
for (const channel of this.channels) {
|
|
1650
|
+
if (!isConcreteChannel(channel) || this.plane3Channels.has(channel))
|
|
1651
|
+
continue;
|
|
1652
|
+
let cls;
|
|
1653
|
+
try {
|
|
1654
|
+
cls = await this.deliveryClassFresh(channel);
|
|
1655
|
+
}
|
|
1656
|
+
catch {
|
|
1657
|
+
continue;
|
|
1658
|
+
}
|
|
1659
|
+
if (cls !== "durable")
|
|
1660
|
+
continue;
|
|
1661
|
+
try {
|
|
1662
|
+
const r = await this.durableJoinChannel(channel);
|
|
1663
|
+
if (r.durable)
|
|
1664
|
+
this.plane3Channels.set(channel, r.generation ?? 0);
|
|
1665
|
+
else
|
|
1666
|
+
void this.reconcileBootJoin(channel); // present but not yet durable — reconcile to recovery
|
|
1667
|
+
}
|
|
1668
|
+
catch (e) {
|
|
1669
|
+
if (!this.isNoResponders(e))
|
|
1670
|
+
this.emit("error", e); // no daemon ⇒ retry until it recovers
|
|
1671
|
+
void this.reconcileBootJoin(channel);
|
|
1672
|
+
}
|
|
1434
1673
|
}
|
|
1435
|
-
|
|
1436
|
-
|
|
1674
|
+
}
|
|
1675
|
+
/** Retry a boot durable self-join with capped backoff until a membership EXISTS (success → seed
|
|
1676
|
+
* `plane3Channels`) or the channel is left / the endpoint stops. Mirrors {@link closeRefusedMembership}:
|
|
1677
|
+
* a one-shot first-connect attempt that swallowed a daemon outage would leave the boot channel live-only
|
|
1678
|
+
* forever after the daemon recovers (and the lease-based health could then read "active" with no owner
|
|
1679
|
+
* membership). This loop is the reconcile that closes that gap. Idempotent — a channel already pending
|
|
1680
|
+
* is not double-driven; survives reconnect (it re-issues `durableJoinChannel` on the current connection). */
|
|
1681
|
+
async reconcileBootJoin(channel) {
|
|
1682
|
+
if (this.pendingBootJoins.has(channel))
|
|
1683
|
+
return; // already reconciling
|
|
1684
|
+
this.pendingBootJoins.add(channel);
|
|
1685
|
+
for (let attempt = 0;; attempt++) {
|
|
1686
|
+
await new Promise((r) => setTimeout(r, Math.min(30_000, 1000 * 2 ** attempt)));
|
|
1687
|
+
if (this.stopped || !this.channels.includes(channel) || this.plane3Channels.has(channel)) {
|
|
1688
|
+
this.pendingBootJoins.delete(channel);
|
|
1689
|
+
return; // stopped, left, or another path established it
|
|
1690
|
+
}
|
|
1691
|
+
try {
|
|
1692
|
+
const r = await this.durableJoinChannel(channel);
|
|
1693
|
+
if (r.durable) {
|
|
1694
|
+
this.plane3Channels.set(channel, r.generation ?? 0);
|
|
1695
|
+
this.pendingBootJoins.delete(channel);
|
|
1696
|
+
return;
|
|
1697
|
+
}
|
|
1698
|
+
// present but durable:false (e.g. catch-up window evicted) — keep retrying; the channel stays
|
|
1699
|
+
// honestly degraded meanwhile, never silently "active".
|
|
1700
|
+
}
|
|
1701
|
+
catch (e) {
|
|
1702
|
+
if (attempt === 0 && !this.isNoResponders(e))
|
|
1703
|
+
this.emit("error", new Error(`channel "${channel}": boot durable self-join not yet established — retrying until the delivery daemon is reachable (${e.message})`));
|
|
1704
|
+
}
|
|
1437
1705
|
}
|
|
1438
|
-
|
|
1439
|
-
|
|
1440
|
-
|
|
1441
|
-
|
|
1442
|
-
|
|
1443
|
-
|
|
1444
|
-
this.plane3Channels.set(m.channel, m.generation);
|
|
1706
|
+
}
|
|
1707
|
+
/** True if this session holds an established Plane-3 durable membership for `channel` (in `plane3Channels`).
|
|
1708
|
+
* Drives the membership-aware delivery-health surface: a joined durable channel that is NOT yet a member
|
|
1709
|
+
* (boot self-join pending / daemon down) must render degraded, never "active" off a live lease alone. */
|
|
1710
|
+
hasDurableMembership(channel) {
|
|
1711
|
+
return this.plane3Channels.has(channel);
|
|
1445
1712
|
}
|
|
1446
1713
|
/** Lazily obtain a JetStream manager — so a non-consuming endpoint (e.g. the supervisor,
|
|
1447
1714
|
* consume:false) can still pre-create others' durables. */
|
|
@@ -1472,9 +1739,10 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
1472
1739
|
await this.pumpDlv();
|
|
1473
1740
|
// Multicast: open a native CORE subscription for each channel (live, manager-free, broker-enforced
|
|
1474
1741
|
// by sub.allow) — boot + runtime joins use the SAME path; there is no per-instance chat durable.
|
|
1475
|
-
// The durable backstop (a busy/offline turn) is Plane-3 (auth: membership
|
|
1476
|
-
//
|
|
1477
|
-
// live-only — the durable plane needs the
|
|
1742
|
+
// The durable backstop (a busy/offline turn) is Plane-3 (auth: membership established by the agent's
|
|
1743
|
+
// self-join, the delivery daemon's fan-out writer + trusted reader deliver via the `dlv_<id>` pump
|
|
1744
|
+
// above; open dev mode is live-only — the durable plane needs the daemon's trusted reader, the
|
|
1745
|
+
// security boundary). Per-
|
|
1478
1746
|
// channel history is the explicit replay-gated backfill, on FIRST connect only; a reconnect reopens
|
|
1479
1747
|
// the subs without re-backfilling (the durable backstop redelivers any missed window via dlv).
|
|
1480
1748
|
if (this.channels.length) {
|
|
@@ -1490,11 +1758,11 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
1490
1758
|
if (armed)
|
|
1491
1759
|
await this.backfillArmed(armed);
|
|
1492
1760
|
}
|
|
1493
|
-
// First connect, auth mode:
|
|
1494
|
-
//
|
|
1495
|
-
//
|
|
1761
|
+
// First connect, auth mode: self-join BOOT durable channels via the server-side delivery daemon
|
|
1762
|
+
// (it owns membership now — there is no manager-written boot membership). Seeds plane3Channels so a
|
|
1763
|
+
// later leave can tombstone the §7 boundary; idempotent on relaunch. Open mode has no Plane-3.
|
|
1496
1764
|
if (this.firstConnect && this.creds && this.channels.length)
|
|
1497
|
-
await this.
|
|
1765
|
+
await this.armBootDurableMemberships();
|
|
1498
1766
|
this.firstConnect = false;
|
|
1499
1767
|
// Anycast: a shared work-queue consumer for our role — one instance grabs each task.
|
|
1500
1768
|
// Open mode self-creates; auth mode BINDS the provisioner-pre-created svc_<role>
|