@cotal-ai/core 0.5.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/acls.d.ts +45 -0
- package/dist/acls.d.ts.map +1 -0
- package/dist/acls.js +86 -0
- package/dist/acls.js.map +1 -0
- package/dist/command.d.ts +3 -0
- package/dist/command.d.ts.map +1 -1
- package/dist/connector.d.ts +10 -0
- package/dist/connector.d.ts.map +1 -1
- package/dist/endpoint.d.ts +197 -54
- package/dist/endpoint.d.ts.map +1 -1
- package/dist/endpoint.js +443 -100
- package/dist/endpoint.js.map +1 -1
- package/dist/index.d.ts +5 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +5 -0
- package/dist/index.js.map +1 -1
- package/dist/lease.d.ts +40 -0
- package/dist/lease.d.ts.map +1 -0
- package/dist/lease.js +64 -0
- package/dist/lease.js.map +1 -0
- package/dist/membership-feed.d.ts +30 -0
- package/dist/membership-feed.d.ts.map +1 -0
- package/dist/membership-feed.js +315 -0
- package/dist/membership-feed.js.map +1 -0
- package/dist/mesh-registry.d.ts +45 -0
- package/dist/mesh-registry.d.ts.map +1 -0
- package/dist/mesh-registry.js +78 -0
- package/dist/mesh-registry.js.map +1 -0
- package/dist/mesh-target.d.ts +42 -0
- package/dist/mesh-target.d.ts.map +1 -0
- package/dist/mesh-target.js +95 -0
- package/dist/mesh-target.js.map +1 -0
- package/dist/provision.d.ts +45 -21
- package/dist/provision.d.ts.map +1 -1
- package/dist/provision.js +177 -15
- package/dist/provision.js.map +1 -1
- package/dist/streams.d.ts +16 -0
- package/dist/streams.d.ts.map +1 -1
- package/dist/streams.js +29 -5
- package/dist/streams.js.map +1 -1
- package/dist/subjects.d.ts +89 -2
- package/dist/subjects.d.ts.map +1 -1
- package/dist/subjects.js +132 -3
- package/dist/subjects.js.map +1 -1
- package/dist/types.d.ts +52 -0
- package/dist/types.d.ts.map +1 -1
- package/package.json +1 -1
package/dist/endpoint.js
CHANGED
|
@@ -7,8 +7,10 @@ import { createSpaceStreams, dmDurableConfig, dlvDurableConfig, taskDurableConfi
|
|
|
7
7
|
import { jetstream, jetstreamManager, AckPolicy, DeliverPolicy, } from "@nats-io/jetstream";
|
|
8
8
|
import { Kvm } from "@nats-io/kv";
|
|
9
9
|
import { openMembersRegistry, commitMember, tombstoneMember, activateMember, readMember, listMembers, durableEligible, StaleMembershipWrite, } from "./members.js";
|
|
10
|
+
import { openAclRegistry, readAcl, commitAcl as writeAclRecord } from "./acls.js";
|
|
11
|
+
import { openDeliveryRegistry } from "./lease.js";
|
|
10
12
|
import { openChannelRegistry, effectiveReplay, effectiveReplayWindowMs, effectiveDeliveryClass, readChannelConfig, readChannelDefaults, } from "./channels.js";
|
|
11
|
-
import { anycastSubject, CHANNEL_DEFAULTS_KEY, chatStream, chatHistDurable, chatSubject, controlServiceSubject, CONTROL_SELF_SERVICE, dmStream, dmDurable, dlvStream, dlvDurable, dlvSubject, dinboxSubject, inboxStream, parseDinboxOwner, FANOUT_DURABLE, INBOX_READER_DURABLE, chatWildcard, channelInAllow, isConcreteChannel, normalizeMentions, parseSubject, presenceBucket, spacePrefix, spaceWildcard, subjectMatches, taskStream, taskDurable, token, unicastSubject, } from "./subjects.js";
|
|
13
|
+
import { anycastSubject, CHANNEL_DEFAULTS_KEY, chatStream, chatHistDurable, chatSubject, controlServiceSubject, CONTROL_SELF_SERVICE, CONTROL_DELIVERY, dmStream, dmDurable, dlvStream, dlvDurable, dlvSubject, dinboxSubject, inboxStream, parseDinboxOwner, FANOUT_DURABLE, INBOX_READER_DURABLE, leaseKey, chatWildcard, assertValidChannel, channelInAllow, isConcreteChannel, normalizeMentions, parseSubject, presenceBucket, membershipBucket, MEMBERSHIP_FEED_KEY, spacePrefix, spaceWildcard, subjectMatches, taskStream, taskDurable, token, unicastSubject, } from "./subjects.js";
|
|
12
14
|
export const DEFAULT_SERVER = "nats://127.0.0.1:4222";
|
|
13
15
|
/** Space joined when none is given on the CLI (the `cotal-<space>` cmux tab, etc.). */
|
|
14
16
|
export const DEFAULT_SPACE = "main";
|
|
@@ -50,10 +52,18 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
50
52
|
jsm;
|
|
51
53
|
kv;
|
|
52
54
|
channelKv;
|
|
53
|
-
/** Plane-3 durable-membership registry KV — lazily opened by the privileged (
|
|
55
|
+
/** Plane-3 durable-membership registry KV — lazily opened by the privileged delivery daemon (or a
|
|
56
|
+
* short-lived provisioner). */
|
|
54
57
|
membersKv;
|
|
55
|
-
|
|
56
|
-
|
|
58
|
+
aclKv;
|
|
59
|
+
deliveryKv;
|
|
60
|
+
membershipKv;
|
|
61
|
+
/** The live `ctl.delivery` serve subscription (delivery daemon) — re-created on every (re)connect by
|
|
62
|
+
* {@link armDeliveryControl}; tracked so the stale one is dropped on reconnect. */
|
|
63
|
+
deliveryServeSub;
|
|
64
|
+
/** When set, this endpoint hosts the Plane-3 fan-out writer + trusted reader (the server-side delivery
|
|
65
|
+
* daemon). `aclFor` maps an owner id to its current read ACL (`allowSubscribe`) for the reader's
|
|
66
|
+
* re-authorization — read FRESH per entry from the durable ACL registry KV, hence async. */
|
|
57
67
|
plane3;
|
|
58
68
|
/** Live local cache of the channel registry (key = channel token), kept by a KV watch. */
|
|
59
69
|
channelConfigs = new Map();
|
|
@@ -89,6 +99,12 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
89
99
|
* {@link pendingDurableLeaves} (the connector shows it in `cotal_channels`, never as ordinary
|
|
90
100
|
* absence). Persists across reconnect; cleared on tombstone success or full stop. */
|
|
91
101
|
pendingDurableLeave = new Map();
|
|
102
|
+
/** Boot durable channels whose self-join hasn't yet established a membership (daemon down/absent at
|
|
103
|
+
* first connect, or a transient `durable:false`). {@link reconcileBootJoin} retries with capped
|
|
104
|
+
* backoff until the membership exists or the channel is left — so a first-connect daemon outage
|
|
105
|
+
* self-heals on recovery instead of leaving the channel silently live-only. Surfaced to the connector
|
|
106
|
+
* via {@link hasDurableMembership} (a joined durable channel NOT yet a member renders degraded). */
|
|
107
|
+
pendingBootJoins = new Set();
|
|
92
108
|
/** Chat-join subjects currently being broker-confirmed. An out-of-ACL subscribe among these trips an
|
|
93
109
|
* EXPECTED async permission violation that joinChannel turns into a clean throw, so watchStatus
|
|
94
110
|
* suppresses it rather than surfacing a spurious connection error. */
|
|
@@ -220,7 +236,7 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
220
236
|
await this.ensureStreams();
|
|
221
237
|
await this.startConsumers();
|
|
222
238
|
}
|
|
223
|
-
// Re-arm Plane-3 (
|
|
239
|
+
// Re-arm Plane-3 (delivery-daemon-hosted fan-out + trusted reader + ctl.delivery) on every (re)connect — no-op unless this
|
|
224
240
|
// endpoint hosts it. The first arm comes from startPlane3 (after start()); this re-binds the loops
|
|
225
241
|
// a reconnect's clearConnectionScoped() tore down, so a broker blip doesn't silently kill the backstop.
|
|
226
242
|
await this.armPlane3();
|
|
@@ -332,6 +348,12 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
332
348
|
this.jsm = undefined;
|
|
333
349
|
this.kv = undefined;
|
|
334
350
|
this.channelKv = undefined;
|
|
351
|
+
// Plane-3 KV handles are bound to the old connection too — drop them so the daemon re-opens them on
|
|
352
|
+
// the fresh nc (else durableJoin/leave/list, the reader's ACL re-auth, and lease renew use a dead
|
|
353
|
+
// handle after a reconnect).
|
|
354
|
+
this.membersKv = undefined;
|
|
355
|
+
this.aclKv = undefined;
|
|
356
|
+
this.deliveryKv = undefined;
|
|
335
357
|
this.emit("connection", { connected: false }); // null window opened — not live until the rebind below
|
|
336
358
|
try {
|
|
337
359
|
await oldNc?.drain();
|
|
@@ -518,8 +540,16 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
518
540
|
})().catch((e) => this.emit("error", e));
|
|
519
541
|
}
|
|
520
542
|
// ---- control plane (request/reply) --------------------------------------
|
|
521
|
-
/** Serve control requests for a service
|
|
522
|
-
|
|
543
|
+
/** Serve control requests for a service. Returns the subscription so a caller that re-registers on
|
|
544
|
+
* reconnect (the delivery daemon) can drop the stale one. `boundReply` is REQUIRED for any service
|
|
545
|
+
* whose responder holds a wildcard publish grant over the service subtree (the delivery daemon's
|
|
546
|
+
* `ctl.delivery.*.reply.>`): without it, an authenticated caller could set its reply target to a
|
|
547
|
+
* PEER's reply lane (`ctl.delivery.<victim>.reply.<n>`) and turn the responder into a confused
|
|
548
|
+
* deputy — the broker does NOT permission-check the requester's embedded reply subject. With it, a
|
|
549
|
+
* reply is published only when `m.reply` is under the AUTHENTICATED request subject
|
|
550
|
+
* (`${m.subject}.reply.…`), binding the reply to the broker-policed sender token. (The manager's
|
|
551
|
+
* tiers reply into the per-id `_INBOX` and leave it off.) */
|
|
552
|
+
serveControl(service, handler, opts = {}) {
|
|
523
553
|
if (!this.nc)
|
|
524
554
|
throw new Error("endpoint not started");
|
|
525
555
|
const sub = this.nc.subscribe(controlServiceSubject(this.space, service, "*"), {
|
|
@@ -528,6 +558,12 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
528
558
|
this.subs.push(sub);
|
|
529
559
|
void (async () => {
|
|
530
560
|
for await (const m of sub) {
|
|
561
|
+
// Sender-bound reply guard (confused-deputy fix): never respond to a reply target outside the
|
|
562
|
+
// authenticated request subject's own `.reply.` subtree. Drop silently (don't inject elsewhere).
|
|
563
|
+
if (opts.boundReply && (!m.reply || !m.reply.startsWith(`${m.subject}.reply.`))) {
|
|
564
|
+
this.emit("error", new Error(`rejected ${service} request on ${m.subject}: reply target "${m.reply ?? "(none)"}" is not under the sender's own reply subtree`));
|
|
565
|
+
continue;
|
|
566
|
+
}
|
|
531
567
|
let reply;
|
|
532
568
|
try {
|
|
533
569
|
const req = m.json();
|
|
@@ -556,6 +592,7 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
556
592
|
}
|
|
557
593
|
}
|
|
558
594
|
})().catch((e) => this.emit("error", e));
|
|
595
|
+
return sub;
|
|
559
596
|
}
|
|
560
597
|
/** Send a control request to a service and await its reply (client side). */
|
|
561
598
|
async requestControl(service, req, timeoutMs = 5000) {
|
|
@@ -565,6 +602,26 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
565
602
|
const m = await this.nc.request(controlServiceSubject(this.space, service, this.card.id), JSON.stringify(body), { timeout: timeoutMs });
|
|
566
603
|
return m.json();
|
|
567
604
|
}
|
|
605
|
+
/** Send a durable-membership request to the SERVER-SIDE delivery daemon (`ctl.delivery`) and await its
|
|
606
|
+
* reply. Unlike {@link requestControl}, the reply rides a subject UNDER `ctl.delivery.<id>.>` (not the
|
|
607
|
+
* per-id `_INBOX`), so the scoped delivery cred can answer without broad inbox-publish — see
|
|
608
|
+
* CONTROL_DELIVERY. `noMux` lets us name the reply subject while keeping NoResponders detection (so a
|
|
609
|
+
* caller can fail-closed vs. degrade to live-only when no daemon is present). */
|
|
610
|
+
async requestDelivery(op, args, timeoutMs = 5000) {
|
|
611
|
+
if (!this.nc)
|
|
612
|
+
throw new Error(this.notLiveMsg());
|
|
613
|
+
const reqSubject = controlServiceSubject(this.space, CONTROL_DELIVERY, this.card.id); // ctl.delivery.<id>
|
|
614
|
+
// Reply rides the sender's OWN subtree so the daemon's serveControl boundReply guard accepts it
|
|
615
|
+
// (`${reqSubject}.reply.…`). The sender-bound guard is the COMPLETE confused-deputy closure. The
|
|
616
|
+
// random suffix is genuine defense-in-depth (NOT cosmetic): `noMux` subscribes this SPECIFIC named
|
|
617
|
+
// reply subject (not a standing `.reply.>` wildcard), so a predictable suffix would let a peer target
|
|
618
|
+
// an in-flight reply subscription — randomUUID brings it to parity with the nuid-protected `_INBOX`
|
|
619
|
+
// model. Keep both; don't regress to a counter. (Confirmed by the review panel's fact-check.)
|
|
620
|
+
const reply = `${reqSubject}.reply.${randomUUID()}`;
|
|
621
|
+
const body = { op, args, from: this.ref() };
|
|
622
|
+
const m = await this.nc.request(reqSubject, JSON.stringify(body), { timeout: timeoutMs, noMux: true, reply });
|
|
623
|
+
return m.json();
|
|
624
|
+
}
|
|
568
625
|
// ---- presence ------------------------------------------------------------
|
|
569
626
|
getRoster() {
|
|
570
627
|
return [...this.roster.values()].sort((a, b) => a.card.name.localeCompare(b.card.name));
|
|
@@ -611,6 +668,12 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
611
668
|
channelReplay(channel) {
|
|
612
669
|
return effectiveReplay(this.channelConfigs.get(channel), this.channelDefaults);
|
|
613
670
|
}
|
|
671
|
+
/** Effective delivery class for a channel (per-channel override ?? space default ?? "durable"),
|
|
672
|
+
* from the live watch cache — drives the non-gating delivery-health surface (only durable-class
|
|
673
|
+
* channels have a Plane-3 backstop to report on). */
|
|
674
|
+
channelDeliveryClass(channel) {
|
|
675
|
+
return effectiveDeliveryClass(this.channelConfigs.get(channel), this.channelDefaults);
|
|
676
|
+
}
|
|
614
677
|
// ---- dynamic subscription (join / leave mid-session) ---------------------
|
|
615
678
|
/** The channels this endpoint is currently subscribed to (live — reflects join/leave). */
|
|
616
679
|
joinedChannels() {
|
|
@@ -619,9 +682,10 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
619
682
|
/**
|
|
620
683
|
* Join a channel mid-session: open a native core subscription (manager-free live read, broker-
|
|
621
684
|
* confirmed against `sub.allow`), capture the stream frontier as the join watermark, backfill its
|
|
622
|
-
* history if replay is on, and — for a `durable`-class channel
|
|
623
|
-
* durable backstop. Idempotent: re-joining is a no-op (no
|
|
624
|
-
* whether the durable backstop is active (+ a `reason`
|
|
685
|
+
* history if replay is on, and — for a `durable`-class channel when a delivery daemon is present —
|
|
686
|
+
* request a Plane-3 durable backstop (via `ctl.delivery`). Idempotent: re-joining is a no-op (no
|
|
687
|
+
* re-backfill). Returns the backfill count + whether the durable backstop is active (+ a `reason`
|
|
688
|
+
* when a durable channel couldn't get one).
|
|
625
689
|
*/
|
|
626
690
|
async joinChannel(channel) {
|
|
627
691
|
if (!this.jsm)
|
|
@@ -656,7 +720,7 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
656
720
|
}
|
|
657
721
|
this.channels.push(channel);
|
|
658
722
|
// Durable backstop. The live core-sub above already delivers (manager-free). For a `durable`-class
|
|
659
|
-
// channel, request a Plane-3 per-member backstop from the
|
|
723
|
+
// channel, request a Plane-3 per-member backstop from the server-side delivery daemon (durableJoin via ctl.delivery) so a post reaches a
|
|
660
724
|
// busy/offline turn — the core-sub stays as the live wake-hint, dedup-coalesced with the Plane-3
|
|
661
725
|
// copy by id-dedup. No manager (open dev / manager-less) ⇒ joined LIVE only, surfaced via `reason`
|
|
662
726
|
// (never silent). A `live`-class channel takes no backstop (joined live is the contract).
|
|
@@ -674,7 +738,7 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
674
738
|
}
|
|
675
739
|
}
|
|
676
740
|
catch (e) {
|
|
677
|
-
// No privileged writer (
|
|
741
|
+
// No privileged writer (no delivery daemon) or the write was rejected — joined live, backstop
|
|
678
742
|
// unavailable. NOT a join failure: the live subscription is up and authorized.
|
|
679
743
|
reason = `durable backstop unavailable (${e.message})`;
|
|
680
744
|
}
|
|
@@ -695,11 +759,11 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
695
759
|
return { left: false };
|
|
696
760
|
// Auth + durable-class ⇒ a Plane-3 membership may exist; tombstone it BEFORE touching local state.
|
|
697
761
|
// The join generation comes from the local mirror, but a BOOT membership whose hydration was missed
|
|
698
|
-
// (
|
|
762
|
+
// (daemon down at connect) is NOT in the mirror — so re-resolve it from the delivery service on
|
|
699
763
|
// demand. FAIL-CLOSED: fetchMemberships throws on a responder-present error, so a leave whose
|
|
700
764
|
// tombstone can't be confirmed propagates (live sub stays up, mirror intact) for the caller to retry
|
|
701
765
|
// — reporting `left` while the trusted reader keeps transferring to DLV is the fail-open leak. A
|
|
702
|
-
// genuine no-responder (open /
|
|
766
|
+
// genuine no-responder (open / no delivery daemon, no Plane-3) means there is no membership to tombstone.
|
|
703
767
|
if (this.creds && effectiveDeliveryClass(this.channelConfigs.get(channel), this.channelDefaults) === "durable") {
|
|
704
768
|
let generation = this.plane3Channels.get(channel);
|
|
705
769
|
if (generation === undefined)
|
|
@@ -781,6 +845,57 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
781
845
|
arr.sort(byName);
|
|
782
846
|
return map;
|
|
783
847
|
}
|
|
848
|
+
/** Lazily open the derived membership feed KV (admin/observer read; the delivery daemon writes it).
|
|
849
|
+
* Read-only here — the dashboard consumes it; agents hold no grant and never call this. */
|
|
850
|
+
async membershipRegistry() {
|
|
851
|
+
if (!this.nc)
|
|
852
|
+
throw new Error("endpoint not started");
|
|
853
|
+
this.membershipKv ??= await new Kvm(this.nc).open(membershipBucket(this.space));
|
|
854
|
+
return this.membershipKv;
|
|
855
|
+
}
|
|
856
|
+
/**
|
|
857
|
+
* Snapshot the broker-sourced channel-membership feed (admin/observer read): every agent's
|
|
858
|
+
* `{live, durable}` record plus `asOf` — the feed's freshness heartbeat (epoch ms of the daemon's last
|
|
859
|
+
* successful poll, from the reserved {@link MEMBERSHIP_FEED_KEY}). `live` patterns are kept as-is
|
|
860
|
+
* (wildcards preserved); the consumer expands them against the channel registry. `asOf` is undefined
|
|
861
|
+
* when the feed has never been written (no daemon → the dashboard degrades to traffic-only).
|
|
862
|
+
*/
|
|
863
|
+
async readMembership() {
|
|
864
|
+
const kv = await this.membershipRegistry();
|
|
865
|
+
const members = [];
|
|
866
|
+
let asOf;
|
|
867
|
+
for await (const key of await kv.keys()) {
|
|
868
|
+
const e = await kv.get(key);
|
|
869
|
+
if (!e || e.operation === "DEL" || e.operation === "PURGE")
|
|
870
|
+
continue;
|
|
871
|
+
if (key === MEMBERSHIP_FEED_KEY) {
|
|
872
|
+
try {
|
|
873
|
+
asOf = e.json().observedAt;
|
|
874
|
+
}
|
|
875
|
+
catch { /* heartbeat garbled — leave undefined */ }
|
|
876
|
+
continue;
|
|
877
|
+
}
|
|
878
|
+
try {
|
|
879
|
+
const rec = e.json();
|
|
880
|
+
members.push({ id: key, live: rec.live ?? [], durable: rec.durable ?? [], observedAt: rec.observedAt });
|
|
881
|
+
}
|
|
882
|
+
catch { /* skip undecodable */ }
|
|
883
|
+
}
|
|
884
|
+
return { asOf, members };
|
|
885
|
+
}
|
|
886
|
+
/** Watch the membership feed for changes (admin/observer): `onChange` fires on every KV entry,
|
|
887
|
+
* including the initial replay — the caller debounces + re-reads {@link readMembership}. Returns a
|
|
888
|
+
* stop handle. Best-effort: a feed the cred can't read (or absent) surfaces as an `error` event and
|
|
889
|
+
* the dashboard keeps its last snapshot. */
|
|
890
|
+
async watchMembership(onChange) {
|
|
891
|
+
const kv = await this.membershipRegistry();
|
|
892
|
+
const iter = await kv.watch();
|
|
893
|
+
void (async () => {
|
|
894
|
+
for await (const _ of iter)
|
|
895
|
+
onChange();
|
|
896
|
+
})().catch((err) => this.emit("error", err));
|
|
897
|
+
return { stop: () => iter.stop() };
|
|
898
|
+
}
|
|
784
899
|
/** Fetch recent messages from a channel's JetStream backlog. */
|
|
785
900
|
async channelHistory(channel, opts) {
|
|
786
901
|
// history from any sender
|
|
@@ -871,28 +986,10 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
871
986
|
throw new Error("endpoint not started");
|
|
872
987
|
await createSpaceStreams(this.jsm, this.space);
|
|
873
988
|
}
|
|
874
|
-
|
|
875
|
-
|
|
876
|
-
|
|
877
|
-
|
|
878
|
-
* `durableJoin`. `live`-class (and non-concrete) channels are skipped. Idempotent.
|
|
879
|
-
*
|
|
880
|
-
* Writes the durable RECORDS with the caller's privileged creds — it does NOT require this endpoint
|
|
881
|
-
* to host the runtime fan-out/reader loops (a space-level manager service), so EVERY auth launcher
|
|
882
|
-
* provisions identically: the manager AND the short-lived `cotal spawn` provisioner both write boot
|
|
883
|
-
* records, which the space's manager then delivers (no silent no-op — that would hide a boot
|
|
884
|
-
* membership; AGENTS.md "no fallbacks"). A space running no manager is live-only for everyone (the
|
|
885
|
-
* records exist; nothing delivers them until a manager hosts the loops).
|
|
886
|
-
*/
|
|
887
|
-
async provisionMembership(targetId, channels) {
|
|
888
|
-
for (const ch of channels) {
|
|
889
|
-
if (!isConcreteChannel(ch))
|
|
890
|
-
continue; // durable membership is per-concrete-channel
|
|
891
|
-
if ((await this.deliveryClassFresh(ch)) !== "durable")
|
|
892
|
-
continue;
|
|
893
|
-
await this.durableJoinFor(targetId, ch);
|
|
894
|
-
}
|
|
895
|
-
}
|
|
989
|
+
// (v3) The old `provisionMembership` — manager/provisioner-written boot membership at spawn — is GONE.
|
|
990
|
+
// Boot durable membership is now the AGENT self-joining its durable boot channels via the daemon's
|
|
991
|
+
// `ctl.delivery` op at connect ({@link armBootDurableMemberships}), reconciled on outage. The
|
|
992
|
+
// primitive it wrapped, {@link durableJoinFor}, is now driven by the daemon's `ctl.delivery` handler.
|
|
896
993
|
/**
|
|
897
994
|
* Privileged: pre-create an agent's DM inbox durable (auth mode), so the agent can BIND
|
|
898
995
|
* it without holding CONSUMER.CREATE on DM_<space>. The creator sets the filter to
|
|
@@ -925,26 +1022,102 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
925
1022
|
const jsm = await this.manager();
|
|
926
1023
|
await jsm.consumers.add(taskStream(this.space), taskDurableConfig(this.space, role));
|
|
927
1024
|
}
|
|
928
|
-
// ---- Plane-3: durable backstop (SPEC §8) — privileged,
|
|
1025
|
+
// ---- Plane-3: durable backstop (SPEC §8) — privileged, hosted by the server-side DELIVERY DAEMON ----
|
|
929
1026
|
//
|
|
930
|
-
// Two
|
|
931
|
-
// every chat message and copies it into each eligible owner's MIXED
|
|
932
|
-
// TRUSTED READER (the auth gate) re-authorizes each entry against the
|
|
933
|
-
// interval and TRANSFERS the authorized copy to the owner's per-member
|
|
934
|
-
// (`dlv.<owner>`), which the agent binds + acks via native JetStream. The agent holds no
|
|
935
|
-
// mixed store.
|
|
936
|
-
|
|
1027
|
+
// Two daemon loops + two privileged membership ops (served to agents on `ctl.delivery`). The FAN-OUT
|
|
1028
|
+
// writer (routing, not auth) reads every chat message and copies it into each eligible owner's MIXED
|
|
1029
|
+
// inbox (`dinbox.<owner>`); the TRUSTED READER (the auth gate) re-authorizes each entry against the
|
|
1030
|
+
// CURRENT ACL + membership interval and TRANSFERS the authorized copy to the owner's per-member
|
|
1031
|
+
// DELIVER store (`dlv.<owner>`), which the agent binds + acks via native JetStream. The agent holds no
|
|
1032
|
+
// read on the mixed store. (v3: this all moved off the manager — the manager is lifecycle-only; it
|
|
1033
|
+
// records the read-ACL at mint via commitAcl.) See `.internal/research/stage4-impl-design.md`.
|
|
1034
|
+
/** Lazily open the privileged members registry KV (delivery daemon / open-mode self). */
|
|
937
1035
|
async membersRegistry() {
|
|
938
1036
|
if (!this.nc)
|
|
939
1037
|
throw new Error("endpoint not started");
|
|
940
1038
|
this.membersKv ??= await openMembersRegistry(this.nc, this.space);
|
|
941
1039
|
return this.membersKv;
|
|
942
1040
|
}
|
|
1041
|
+
/** Lazily open the durable read-ACL registry KV. Privileged write (the manager records an agent's
|
|
1042
|
+
* ACL at mint); the delivery daemon reads it fresh per durable entry to re-authorize. */
|
|
1043
|
+
async aclRegistry() {
|
|
1044
|
+
if (!this.nc)
|
|
1045
|
+
throw new Error("endpoint not started");
|
|
1046
|
+
this.aclKv ??= await openAclRegistry(this.nc, this.space);
|
|
1047
|
+
return this.aclKv;
|
|
1048
|
+
}
|
|
1049
|
+
/** Privileged ({@link DurableProvisioner}): record an agent's read ACL in the durable registry at
|
|
1050
|
+
* provision/mint time — the same act as baking it into the JWT, persisted so the server-side
|
|
1051
|
+
* delivery daemon can re-authorize the agent's durable entries and validate its runtime
|
|
1052
|
+
* durable-joins without holding any in-memory ledger. Written ATOMICALLY ({@link writeAclRecord}),
|
|
1053
|
+
* so a present record is always complete (`[]` = known no-read, never a half-write). */
|
|
1054
|
+
async commitAcl(targetId, allowSubscribe) {
|
|
1055
|
+
await writeAclRecord(await this.aclRegistry(), targetId, allowSubscribe);
|
|
1056
|
+
}
|
|
1057
|
+
/** The server-side delivery daemon's fresh-per-entry ACL read: an owner's CURRENT read ACL
|
|
1058
|
+
* (`allowSubscribe`) from the durable registry, or `undefined` if no record (an unknown owner — the
|
|
1059
|
+
* reader DEFERS, never drops). A present `[]` (known no-read) returns `[]` (the reader DROPS). */
|
|
1060
|
+
async aclForOwner(owner) {
|
|
1061
|
+
return (await readAcl(await this.aclRegistry(), owner))?.record.allowSubscribe;
|
|
1062
|
+
}
|
|
1063
|
+
/** Lazily open the delivery lease/readiness KV (pre-created at `cotal up`; bind, never create). */
|
|
1064
|
+
async deliveryRegistry() {
|
|
1065
|
+
if (!this.nc)
|
|
1066
|
+
throw new Error("endpoint not started");
|
|
1067
|
+
this.deliveryKv ??= await openDeliveryRegistry(this.nc, this.space);
|
|
1068
|
+
return this.deliveryKv;
|
|
1069
|
+
}
|
|
1070
|
+
encodeLease(ready) {
|
|
1071
|
+
return new TextEncoder().encode(JSON.stringify({ holder: this.card.id, since: Date.now(), ready }));
|
|
1072
|
+
}
|
|
1073
|
+
/** Acquire the single-flight delivery lease for a shard via an ATOMIC CAS create, marked NOT-ready.
|
|
1074
|
+
* THROWS if a live lease exists — a loud refusal-to-bind (the daemon exits), never a retry, so two
|
|
1075
|
+
* daemons can't split a durable's delivery. A crashed holder's lease auto-expires (bucket TTL),
|
|
1076
|
+
* freeing a re-acquire. Acquired BEFORE binding (single-flight gate); {@link markDeliveryLeaseReady}
|
|
1077
|
+
* flips it ready AFTER the loops + `ctl.delivery` are bound. Returns the lease revision. */
|
|
1078
|
+
async acquireDeliveryLease(shardIndex) {
|
|
1079
|
+
return (await this.deliveryRegistry()).create(leaseKey(shardIndex), this.encodeLease(false));
|
|
1080
|
+
}
|
|
1081
|
+
/** Flip the held lease to READY (CAS `kv.update`) AFTER `startPlane3` has bound the loops + the
|
|
1082
|
+
* `ctl.delivery` responder — so "lease ready" proves the responder is up, not just that the slot was
|
|
1083
|
+
* claimed. Returns the new revision. */
|
|
1084
|
+
async markDeliveryLeaseReady(shardIndex, revision) {
|
|
1085
|
+
return (await this.deliveryRegistry()).update(leaseKey(shardIndex), this.encodeLease(true), revision);
|
|
1086
|
+
}
|
|
1087
|
+
/** Renew the held lease (CAS `kv.update` against `revision`, keeping `ready:true`) to refresh it before
|
|
1088
|
+
* the bucket TTL expires it. Returns the new revision. Throws if the revision moved (lost the lease —
|
|
1089
|
+
* the daemon should exit). */
|
|
1090
|
+
async renewDeliveryLease(shardIndex, revision) {
|
|
1091
|
+
return (await this.deliveryRegistry()).update(leaseKey(shardIndex), this.encodeLease(true), revision);
|
|
1092
|
+
}
|
|
1093
|
+
/** Release the held lease on clean shutdown so a replacement daemon re-acquires immediately (best
|
|
1094
|
+
* effort — a crash just lets the bucket TTL expire it). */
|
|
1095
|
+
async releaseDeliveryLease(shardIndex) {
|
|
1096
|
+
try {
|
|
1097
|
+
await (await this.deliveryRegistry()).delete(leaseKey(shardIndex));
|
|
1098
|
+
}
|
|
1099
|
+
catch { /* already gone */ }
|
|
1100
|
+
}
|
|
1101
|
+
/** Read a shard's delivery lease (the daemon-availability signal), or `undefined` if none is live.
|
|
1102
|
+
* READ-ONLY surface — drives Component 6's `cotal_channels` delivery-health field (an agent reads it
|
|
1103
|
+
* under its own cred, which holds lease-bucket read but no write). */
|
|
1104
|
+
async readDeliveryLease(shardIndex) {
|
|
1105
|
+
const e = await (await this.deliveryRegistry()).get(leaseKey(shardIndex));
|
|
1106
|
+
if (!e || e.operation === "DEL" || e.operation === "PURGE")
|
|
1107
|
+
return undefined;
|
|
1108
|
+
try {
|
|
1109
|
+
return e.json();
|
|
1110
|
+
}
|
|
1111
|
+
catch {
|
|
1112
|
+
return undefined;
|
|
1113
|
+
}
|
|
1114
|
+
}
|
|
943
1115
|
/** Privileged: one owner's NON-TOMBSTONED durable memberships as `{channel, generation, activated}` —
|
|
944
|
-
* the
|
|
945
|
-
*
|
|
946
|
-
* ones are returned too so `leaveChannel` can discover + close a record that
|
|
947
|
-
* pure-interval predicate (a crash-stuck pending activation) — without reading
|
|
1116
|
+
* the server-side delivery daemon serves this to a connecting agent (the `listMemberships` op on
|
|
1117
|
+
* `ctl.delivery`). The agent seeds its leave mirror from the ACTIVATED ones (the confirmed backstops),
|
|
1118
|
+
* but the non-activated ones are returned too so `leaveChannel` can discover + close a record that
|
|
1119
|
+
* still routes under the pure-interval predicate (a crash-stuck pending activation) — without reading
|
|
1120
|
+
* the privileged KV itself. */
|
|
948
1121
|
async ownerMemberships(owner) {
|
|
949
1122
|
const recs = await listMembers(await this.membersRegistry(), { owner });
|
|
950
1123
|
return recs
|
|
@@ -985,16 +1158,15 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
985
1158
|
return info?.delivered?.stream_seq ?? 0;
|
|
986
1159
|
}
|
|
987
1160
|
/**
|
|
988
|
-
* Privileged durable-JOIN write (the
|
|
989
|
-
*
|
|
990
|
-
*
|
|
991
|
-
*
|
|
992
|
-
*
|
|
993
|
-
*
|
|
1161
|
+
* Privileged durable-JOIN write (v3: the delivery daemon calls this from its `ctl.delivery` handler
|
|
1162
|
+
* after validating channel ⊆ the caller's read ACL): capture `joinCursor`, commit a `durable-active`
|
|
1163
|
+
* record (CAS + generation bump), then ACTIVATION CATCH-UP idempotently copies `(joinCursor, fence]`
|
|
1164
|
+
* into the owner inbox where `fence = max(frontier, fanoutDelivered)` — fan-out owns `seq > fence`.
|
|
1165
|
+
* Idempotent against a timeout-retry (an already-activated membership no-ops). Returns `{durable:false}`
|
|
1166
|
+
* (honest degrade) only if the catch-up window was evicted.
|
|
994
1167
|
*
|
|
995
|
-
*
|
|
996
|
-
*
|
|
997
|
-
* short-lived provisioner can write a boot membership a separate long-lived manager then delivers.
|
|
1168
|
+
* Runs on the daemon (which hosts the fan-out/reader loops + the members KV), so catch-up + the
|
|
1169
|
+
* activation fence read are in-process — no cross-process cursor read.
|
|
998
1170
|
*/
|
|
999
1171
|
async durableJoinFor(owner, channel) {
|
|
1000
1172
|
if (!this.js)
|
|
@@ -1119,27 +1291,122 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
1119
1291
|
}
|
|
1120
1292
|
return { copied, evicted };
|
|
1121
1293
|
}
|
|
1122
|
-
/** Start the Plane-3 fan-out writer + trusted reader on THIS (privileged
|
|
1123
|
-
*
|
|
1124
|
-
*
|
|
1294
|
+
/** Start the Plane-3 fan-out writer + trusted reader on THIS (privileged, server-side delivery-daemon)
|
|
1295
|
+
* endpoint, AND serve the `ctl.delivery` control service (runtime durable join/leave/list). `aclFor`
|
|
1296
|
+
* maps an owner id to its current read ACL for the reader's re-authorization — read FRESH per entry
|
|
1297
|
+
* from the durable ACL registry (async). Call once after connect; idempotent durable creation lets it
|
|
1298
|
+
* resume on a daemon restart. Both the JS loops AND the `ctl.delivery` subscription are (re)bound by
|
|
1299
|
+
* {@link armPlane3} on EVERY (re)connect — a reconnect drains the old connection, so re-binding both
|
|
1300
|
+
* is required, not optional (the responder would otherwise be lost on a broker blip). */
|
|
1125
1301
|
async startPlane3(aclFor) {
|
|
1126
1302
|
if (!this.js)
|
|
1127
1303
|
throw new Error("endpoint not started");
|
|
1128
1304
|
this.plane3 = { aclFor };
|
|
1129
1305
|
await this.armPlane3();
|
|
1130
1306
|
}
|
|
1307
|
+
/** Serve one runtime durable-membership control request (the server-side delivery daemon). The caller
|
|
1308
|
+
* id is the authenticated subject sender ({@link serveControl} fail-closes on a mismatch). Validation
|
|
1309
|
+
* is against the durable ACL registry — the SAME KV the reader re-auths against (single source of
|
|
1310
|
+
* truth, no in-memory ledger to drift). */
|
|
1311
|
+
async handleDeliveryControl(req) {
|
|
1312
|
+
const caller = req.from.id;
|
|
1313
|
+
const args = req.args ?? {};
|
|
1314
|
+
if (req.op === "durableJoin")
|
|
1315
|
+
return this.deliveryJoin(caller, args);
|
|
1316
|
+
if (req.op === "durableLeave")
|
|
1317
|
+
return this.deliveryLeave(caller, args);
|
|
1318
|
+
if (req.op === "listMemberships")
|
|
1319
|
+
return { ok: true, data: { memberships: await this.ownerMemberships(caller) } };
|
|
1320
|
+
return { ok: false, error: `op "${req.op}" not supported on the delivery control service` };
|
|
1321
|
+
}
|
|
1322
|
+
/** Validate the channel ARG shape only — non-blank, valid, concrete (NO ACL check, that is op-specific).
|
|
1323
|
+
* Returns the channel on success or a ControlReply error to short-circuit. */
|
|
1324
|
+
checkDurableChannelArg(args, op) {
|
|
1325
|
+
const channel = typeof args.channel === "string" ? args.channel.trim() : "";
|
|
1326
|
+
if (!channel)
|
|
1327
|
+
return { ok: false, error: `${op}: channel must be a non-blank string` };
|
|
1328
|
+
try {
|
|
1329
|
+
assertValidChannel(channel);
|
|
1330
|
+
}
|
|
1331
|
+
catch (e) {
|
|
1332
|
+
return { ok: false, error: e.message };
|
|
1333
|
+
}
|
|
1334
|
+
if (!isConcreteChannel(channel))
|
|
1335
|
+
return { ok: false, error: `${op}: "${channel}" must be a concrete channel (durable membership is per-concrete-channel, not wildcard)` };
|
|
1336
|
+
return channel;
|
|
1337
|
+
}
|
|
1338
|
+
/** JOIN requires the channel be within the caller's CURRENT read ACL (you can't durable-subscribe a
|
|
1339
|
+
* channel you may not read). */
|
|
1340
|
+
async deliveryJoin(caller, args) {
|
|
1341
|
+
const channel = this.checkDurableChannelArg(args, "durableJoin");
|
|
1342
|
+
if (typeof channel !== "string")
|
|
1343
|
+
return channel; // a ControlReply error
|
|
1344
|
+
const acl = await readAcl(await this.aclRegistry(), caller);
|
|
1345
|
+
if (acl === undefined)
|
|
1346
|
+
return { ok: false, error: `durableJoin: no read ACL on record for ${caller} (not provisioned for durable delivery)` };
|
|
1347
|
+
if (!channelInAllow(acl.record.allowSubscribe, channel))
|
|
1348
|
+
return { ok: false, error: `channel "${channel}" is not within your read ACL [${acl.record.allowSubscribe.join(", ")}]` };
|
|
1349
|
+
try {
|
|
1350
|
+
return { ok: true, data: await this.durableJoinFor(caller, channel) };
|
|
1351
|
+
}
|
|
1352
|
+
catch (e) {
|
|
1353
|
+
return { ok: false, error: e.message };
|
|
1354
|
+
}
|
|
1355
|
+
}
|
|
1356
|
+
/** LEAVE must NOT require current-ACL coverage. Leave fires precisely when the ACL was narrowed/revoked
|
|
1357
|
+
* (a refused live sub → {@link closeRefusedMembership}); gating the tombstone on the current ACL would
|
|
1358
|
+
* loop forever and leave the SPEC §7 boundary open (the membership could resume if the ACL is later
|
|
1359
|
+
* restored). The guards are: authenticated caller (serveControl), concrete channel, a finite generation
|
|
1360
|
+
* (the join epoch — without it a stale/replayed leave could tombstone a newer rejoin), and an EXISTING
|
|
1361
|
+
* own membership; `durableLeaveFor` → `tombstoneMember` then enforces the generation match. */
|
|
1362
|
+
async deliveryLeave(caller, args) {
|
|
1363
|
+
const channel = this.checkDurableChannelArg(args, "durableLeave");
|
|
1364
|
+
if (typeof channel !== "string")
|
|
1365
|
+
return channel; // a ControlReply error
|
|
1366
|
+
if (typeof args.generation !== "number" || !Number.isFinite(args.generation))
|
|
1367
|
+
return { ok: false, error: "durableLeave: a finite generation is required (fail-closed stale-leave guard)" };
|
|
1368
|
+
const existing = await readMember(await this.membersRegistry(), channel, caller);
|
|
1369
|
+
if (!existing)
|
|
1370
|
+
return { ok: true, data: { channel, alreadyLeft: true } }; // nothing to tombstone — idempotent
|
|
1371
|
+
try {
|
|
1372
|
+
await this.durableLeaveFor(caller, channel, args.generation);
|
|
1373
|
+
}
|
|
1374
|
+
catch (e) {
|
|
1375
|
+
return { ok: false, error: e.message };
|
|
1376
|
+
}
|
|
1377
|
+
return { ok: true, data: { channel } };
|
|
1378
|
+
}
|
|
1131
1379
|
/** (Re)bind the Plane-3 fan-out writer + trusted reader. Idempotent — the durables resume from their
|
|
1132
1380
|
* cursor. Called by {@link startPlane3} once AND by {@link connectAndBind} on every (re)connect, so
|
|
1133
|
-
*
|
|
1381
|
+
* the delivery daemon's reconnect RE-ARMS the backstop + the ctl.delivery responder. Without this, a broker blip would silently kill
|
|
1134
1382
|
* the loops while `durableJoinFor` kept reporting `durable:true` (the impl-review's BLOCKER-1). No-op
|
|
1135
1383
|
* unless this endpoint hosts Plane-3 (`this.plane3` set). */
|
|
1136
1384
|
async armPlane3() {
|
|
1137
1385
|
if (!this.plane3 || !this.js)
|
|
1138
1386
|
return;
|
|
1139
1387
|
await this.manager(); // the manager runs consume:false, so this.jsm is lazy — ensure it
|
|
1388
|
+
this.armDeliveryControl();
|
|
1140
1389
|
await this.runFanout();
|
|
1141
1390
|
await this.runReader();
|
|
1142
1391
|
}
|
|
1392
|
+
/** (Re)register the `ctl.delivery` control responder on the CURRENT connection. A reconnect drains the
|
|
1393
|
+
* old connection (the old sub is dead and `clearConnectionScoped` leaves caller-owned subs alone), so
|
|
1394
|
+
* this MUST run on every arm — otherwise durable join/leave/list silently lose their responder after a
|
|
1395
|
+
* broker blip. The stale sub is dropped (unsubscribed + removed from `this.subs`) before re-creating.
|
|
1396
|
+
* `boundReply` is essential here: the daemon holds a wildcard reply-publish grant, so the serve path
|
|
1397
|
+
* must reject any reply target outside the authenticated sender's own subtree (confused-deputy fix). */
|
|
1398
|
+
armDeliveryControl() {
|
|
1399
|
+
if (this.deliveryServeSub) {
|
|
1400
|
+
try {
|
|
1401
|
+
this.deliveryServeSub.unsubscribe();
|
|
1402
|
+
}
|
|
1403
|
+
catch { /* dead with the old connection */ }
|
|
1404
|
+
const i = this.subs.indexOf(this.deliveryServeSub);
|
|
1405
|
+
if (i >= 0)
|
|
1406
|
+
this.subs.splice(i, 1);
|
|
1407
|
+
}
|
|
1408
|
+
this.deliveryServeSub = this.serveControl(CONTROL_DELIVERY, (req) => this.handleDeliveryControl(req), { boundReply: true });
|
|
1409
|
+
}
|
|
1143
1410
|
/** Fan-out loop: bind the privileged `fanout` durable on CHAT and route each message (routing only —
|
|
1144
1411
|
* the trusted reader is the auth gate). */
|
|
1145
1412
|
async runFanout() {
|
|
@@ -1206,7 +1473,7 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
1206
1473
|
const owner = this.resolveOwnerByName(name);
|
|
1207
1474
|
if (!owner || owner === msg.from.id)
|
|
1208
1475
|
continue;
|
|
1209
|
-
const acl = this.plane3?.aclFor(owner);
|
|
1476
|
+
const acl = await this.plane3?.aclFor(owner);
|
|
1210
1477
|
if (!acl || !channelInAllow(acl, channel))
|
|
1211
1478
|
continue; // @mention can't bypass the read ACL
|
|
1212
1479
|
await this.publishDinbox(owner, { msg, channel, seq, reason: "live-mention", generation: 0 });
|
|
@@ -1261,7 +1528,7 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
1261
1528
|
return;
|
|
1262
1529
|
} // undecodable — drop
|
|
1263
1530
|
const redeliveries = m.info?.deliveryCount ?? 1; // JsMsg delivery attempts (1 on first delivery)
|
|
1264
|
-
const acl = this.plane3?.aclFor(owner);
|
|
1531
|
+
const acl = await this.plane3?.aclFor(owner);
|
|
1265
1532
|
if (acl === undefined) {
|
|
1266
1533
|
// UNKNOWN owner — the manager has not (re)hydrated this owner's ACL yet (e.g. right after a
|
|
1267
1534
|
// manager PROCESS restart). This is NOT a revocation: DEFER (redeliver), never drop — an ack here
|
|
@@ -1311,7 +1578,7 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
1311
1578
|
m.ack();
|
|
1312
1579
|
}
|
|
1313
1580
|
/** Agent-side: bind + pump our pre-created Plane-3 DELIVER durable (`dlv_<id>`). Every message here is
|
|
1314
|
-
*
|
|
1581
|
+
* delivery-daemon-written (DLV is delivery-write-only, broker-enforced) and is a CHANNEL message by contract
|
|
1315
1582
|
* (the backstop never carries DMs), so `kind=channel` is path-derived (SPEC §4) and the body is
|
|
1316
1583
|
* trusted (no spoof-guard). `durable:true` — real JetStream ack, coalesced with the core-sub live
|
|
1317
1584
|
* copy by `MeshAgent.ingest`. No-op when the durable isn't present (open mode / not provisioned). */
|
|
@@ -1351,19 +1618,19 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
1351
1618
|
})().catch((e) => { if (!this.stopped)
|
|
1352
1619
|
this.emit("error", e); });
|
|
1353
1620
|
}
|
|
1354
|
-
/** Agent-side: request a Plane-3 durable backstop for a channel via the
|
|
1355
|
-
* when no privileged writer is present (open /
|
|
1621
|
+
/** Agent-side: request a Plane-3 durable backstop for a channel via the server-side delivery daemon (ctl.delivery). Throws
|
|
1622
|
+
* when no privileged writer is present (open / no delivery daemon). 30s timeout — activation catch-up may
|
|
1356
1623
|
* run before the reply (the window is small, but a busy channel can take more than the 5s default). */
|
|
1357
1624
|
async durableJoinChannel(channel) {
|
|
1358
|
-
const reply = await this.
|
|
1625
|
+
const reply = await this.requestDelivery("durableJoin", { channel }, 30_000);
|
|
1359
1626
|
if (!reply.ok)
|
|
1360
1627
|
throw new Error(reply.error ?? "durable join rejected");
|
|
1361
1628
|
return reply.data ?? { durable: false };
|
|
1362
1629
|
}
|
|
1363
1630
|
/** Agent-side: release a Plane-3 durable backstop (tombstone membership at the leave cursor). Passes
|
|
1364
|
-
* the join generation so a stale leave can't tombstone a newer rejoin (the
|
|
1631
|
+
* the join generation so a stale leave can't tombstone a newer rejoin (the delivery daemon validates it). */
|
|
1365
1632
|
async durableLeaveChannel(channel, generation) {
|
|
1366
|
-
const reply = await this.
|
|
1633
|
+
const reply = await this.requestDelivery("durableLeave", { channel, generation });
|
|
1367
1634
|
if (!reply.ok)
|
|
1368
1635
|
throw new Error(reply.error ?? "durable leave rejected");
|
|
1369
1636
|
}
|
|
@@ -1373,7 +1640,7 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
1373
1640
|
* is reachable, never a silent give-up. While pending, the channel is tracked in
|
|
1374
1641
|
* {@link pendingDurableLeave} and surfaced via {@link pendingDurableLeaves} (the connector shows it in
|
|
1375
1642
|
* `cotal_channels` as `durable-unclosed`, never ordinary absence). The generation is kept the whole
|
|
1376
|
-
* time. Authoritative closure of a revoked membership is also
|
|
1643
|
+
* time. Authoritative closure of a revoked membership is also handled by revocation (rotate creds + tear down). */
|
|
1377
1644
|
async closeRefusedMembership(channel, generation) {
|
|
1378
1645
|
this.pendingDurableLeave.set(channel, generation);
|
|
1379
1646
|
for (let attempt = 0;; attempt++) {
|
|
@@ -1406,42 +1673,94 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
1406
1673
|
}
|
|
1407
1674
|
/** Agent-side: this session's CURRENT durable memberships (channel + join generation) from the
|
|
1408
1675
|
* manager — the agent holds no read on the privileged members KV. `undefined` ⇒ NO control responder
|
|
1409
|
-
* (open /
|
|
1676
|
+
* (open / no delivery daemon, so there is no Plane-3 and no memberships). THROWS on a responder-present RPC
|
|
1410
1677
|
* failure, so a caller can FAIL-CLOSED rather than mistaking a transient error for "no membership". */
|
|
1411
1678
|
async fetchMemberships() {
|
|
1412
1679
|
let reply;
|
|
1413
1680
|
try {
|
|
1414
|
-
reply = await this.
|
|
1681
|
+
reply = await this.requestDelivery("listMemberships", {}, 5_000);
|
|
1415
1682
|
}
|
|
1416
1683
|
catch (e) {
|
|
1417
1684
|
if (this.isNoResponders(e))
|
|
1418
|
-
return undefined; // no
|
|
1685
|
+
return undefined; // no delivery daemon — open / daemon-less, no Plane-3
|
|
1419
1686
|
throw e; // responder present but errored — surface it (leaveChannel fails closed)
|
|
1420
1687
|
}
|
|
1421
1688
|
if (!reply.ok)
|
|
1422
1689
|
throw new Error(reply.error ?? "listMemberships failed");
|
|
1423
1690
|
return reply.data?.memberships ?? [];
|
|
1424
1691
|
}
|
|
1425
|
-
/** Agent-side
|
|
1426
|
-
*
|
|
1427
|
-
*
|
|
1428
|
-
*
|
|
1429
|
-
*
|
|
1430
|
-
|
|
1431
|
-
|
|
1432
|
-
|
|
1433
|
-
|
|
1692
|
+
/** Agent-side, first connect (auth): SELF-JOIN this session's durable boot channels via the
|
|
1693
|
+
* server-side delivery daemon — replacing the old manager-written boot membership. Each concrete
|
|
1694
|
+
* `durable`-class boot channel gets a `durableJoin` whose returned generation seeds the leave mirror
|
|
1695
|
+
* + durable-state surface; an already-active membership (a relaunch) is idempotent (no re-catch-up).
|
|
1696
|
+
* If the daemon is down/absent at first connect (or reports a transient `durable:false`), the channel
|
|
1697
|
+
* is handed to {@link reconcileBootJoin} for capped-backoff retry — so the backstop is RESTORED once
|
|
1698
|
+
* the daemon recovers, not left silently live-only. Until a membership exists the channel renders
|
|
1699
|
+
* degraded in `cotal_channels` ({@link hasDurableMembership}). */
|
|
1700
|
+
async armBootDurableMemberships() {
|
|
1701
|
+
for (const channel of this.channels) {
|
|
1702
|
+
if (!isConcreteChannel(channel) || this.plane3Channels.has(channel))
|
|
1703
|
+
continue;
|
|
1704
|
+
let cls;
|
|
1705
|
+
try {
|
|
1706
|
+
cls = await this.deliveryClassFresh(channel);
|
|
1707
|
+
}
|
|
1708
|
+
catch {
|
|
1709
|
+
continue;
|
|
1710
|
+
}
|
|
1711
|
+
if (cls !== "durable")
|
|
1712
|
+
continue;
|
|
1713
|
+
try {
|
|
1714
|
+
const r = await this.durableJoinChannel(channel);
|
|
1715
|
+
if (r.durable)
|
|
1716
|
+
this.plane3Channels.set(channel, r.generation ?? 0);
|
|
1717
|
+
else
|
|
1718
|
+
void this.reconcileBootJoin(channel); // present but not yet durable — reconcile to recovery
|
|
1719
|
+
}
|
|
1720
|
+
catch (e) {
|
|
1721
|
+
if (!this.isNoResponders(e))
|
|
1722
|
+
this.emit("error", e); // no daemon ⇒ retry until it recovers
|
|
1723
|
+
void this.reconcileBootJoin(channel);
|
|
1724
|
+
}
|
|
1434
1725
|
}
|
|
1435
|
-
|
|
1436
|
-
|
|
1726
|
+
}
|
|
1727
|
+
/** Retry a boot durable self-join with capped backoff until a membership EXISTS (success → seed
|
|
1728
|
+
* `plane3Channels`) or the channel is left / the endpoint stops. Mirrors {@link closeRefusedMembership}:
|
|
1729
|
+
* a one-shot first-connect attempt that swallowed a daemon outage would leave the boot channel live-only
|
|
1730
|
+
* forever after the daemon recovers (and the lease-based health could then read "active" with no owner
|
|
1731
|
+
* membership). This loop is the reconcile that closes that gap. Idempotent — a channel already pending
|
|
1732
|
+
* is not double-driven; survives reconnect (it re-issues `durableJoinChannel` on the current connection). */
|
|
1733
|
+
async reconcileBootJoin(channel) {
|
|
1734
|
+
if (this.pendingBootJoins.has(channel))
|
|
1735
|
+
return; // already reconciling
|
|
1736
|
+
this.pendingBootJoins.add(channel);
|
|
1737
|
+
for (let attempt = 0;; attempt++) {
|
|
1738
|
+
await new Promise((r) => setTimeout(r, Math.min(30_000, 1000 * 2 ** attempt)));
|
|
1739
|
+
if (this.stopped || !this.channels.includes(channel) || this.plane3Channels.has(channel)) {
|
|
1740
|
+
this.pendingBootJoins.delete(channel);
|
|
1741
|
+
return; // stopped, left, or another path established it
|
|
1742
|
+
}
|
|
1743
|
+
try {
|
|
1744
|
+
const r = await this.durableJoinChannel(channel);
|
|
1745
|
+
if (r.durable) {
|
|
1746
|
+
this.plane3Channels.set(channel, r.generation ?? 0);
|
|
1747
|
+
this.pendingBootJoins.delete(channel);
|
|
1748
|
+
return;
|
|
1749
|
+
}
|
|
1750
|
+
// present but durable:false (e.g. catch-up window evicted) — keep retrying; the channel stays
|
|
1751
|
+
// honestly degraded meanwhile, never silently "active".
|
|
1752
|
+
}
|
|
1753
|
+
catch (e) {
|
|
1754
|
+
if (attempt === 0 && !this.isNoResponders(e))
|
|
1755
|
+
this.emit("error", new Error(`channel "${channel}": boot durable self-join not yet established — retrying until the delivery daemon is reachable (${e.message})`));
|
|
1756
|
+
}
|
|
1437
1757
|
}
|
|
1438
|
-
|
|
1439
|
-
|
|
1440
|
-
|
|
1441
|
-
|
|
1442
|
-
|
|
1443
|
-
|
|
1444
|
-
this.plane3Channels.set(m.channel, m.generation);
|
|
1758
|
+
}
|
|
1759
|
+
/** True if this session holds an established Plane-3 durable membership for `channel` (in `plane3Channels`).
|
|
1760
|
+
* Drives the membership-aware delivery-health surface: a joined durable channel that is NOT yet a member
|
|
1761
|
+
* (boot self-join pending / daemon down) must render degraded, never "active" off a live lease alone. */
|
|
1762
|
+
hasDurableMembership(channel) {
|
|
1763
|
+
return this.plane3Channels.has(channel);
|
|
1445
1764
|
}
|
|
1446
1765
|
/** Lazily obtain a JetStream manager — so a non-consuming endpoint (e.g. the supervisor,
|
|
1447
1766
|
* consume:false) can still pre-create others' durables. */
|
|
@@ -1472,9 +1791,10 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
1472
1791
|
await this.pumpDlv();
|
|
1473
1792
|
// Multicast: open a native CORE subscription for each channel (live, manager-free, broker-enforced
|
|
1474
1793
|
// by sub.allow) — boot + runtime joins use the SAME path; there is no per-instance chat durable.
|
|
1475
|
-
// The durable backstop (a busy/offline turn) is Plane-3 (auth: membership
|
|
1476
|
-
//
|
|
1477
|
-
// live-only — the durable plane needs the
|
|
1794
|
+
// The durable backstop (a busy/offline turn) is Plane-3 (auth: membership established by the agent's
|
|
1795
|
+
// self-join, the delivery daemon's fan-out writer + trusted reader deliver via the `dlv_<id>` pump
|
|
1796
|
+
// above; open dev mode is live-only — the durable plane needs the daemon's trusted reader, the
|
|
1797
|
+
// security boundary). Per-
|
|
1478
1798
|
// channel history is the explicit replay-gated backfill, on FIRST connect only; a reconnect reopens
|
|
1479
1799
|
// the subs without re-backfilling (the durable backstop redelivers any missed window via dlv).
|
|
1480
1800
|
if (this.channels.length) {
|
|
@@ -1490,11 +1810,11 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
1490
1810
|
if (armed)
|
|
1491
1811
|
await this.backfillArmed(armed);
|
|
1492
1812
|
}
|
|
1493
|
-
// First connect, auth mode:
|
|
1494
|
-
//
|
|
1495
|
-
//
|
|
1813
|
+
// First connect, auth mode: self-join BOOT durable channels via the server-side delivery daemon
|
|
1814
|
+
// (it owns membership now — there is no manager-written boot membership). Seeds plane3Channels so a
|
|
1815
|
+
// later leave can tombstone the §7 boundary; idempotent on relaunch. Open mode has no Plane-3.
|
|
1496
1816
|
if (this.firstConnect && this.creds && this.channels.length)
|
|
1497
|
-
await this.
|
|
1817
|
+
await this.armBootDurableMemberships();
|
|
1498
1818
|
this.firstConnect = false;
|
|
1499
1819
|
// Anycast: a shared work-queue consumer for our role — one instance grabs each task.
|
|
1500
1820
|
// Open mode self-creates; auth mode BINDS the provisioner-pre-created svc_<role>
|
|
@@ -1925,8 +2245,8 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
1925
2245
|
}
|
|
1926
2246
|
}
|
|
1927
2247
|
async publishPresence() {
|
|
1928
|
-
if (!this.kv)
|
|
1929
|
-
return;
|
|
2248
|
+
if (!this.doRegister || !this.kv)
|
|
2249
|
+
return; // observers watch but never publish their own record
|
|
1930
2250
|
const p = {
|
|
1931
2251
|
card: this.card,
|
|
1932
2252
|
status: this.status,
|
|
@@ -2143,4 +2463,27 @@ export async function isReachable(servers = DEFAULT_SERVER, opts = {}) {
|
|
|
2143
2463
|
return e instanceof AuthorizationError || e instanceof UserAuthenticationExpiredError;
|
|
2144
2464
|
}
|
|
2145
2465
|
}
|
|
2466
|
+
/** Like {@link isReachable}, but distinguishes "up but won't take these creds" from "nothing there".
|
|
2467
|
+
* `spawn` needs the difference: auth-required → name the trust dir + next step; unreachable → the
|
|
2468
|
+
* mesh is down (prune the stale entry, tell the user to `cotal up`). Pass `creds` to confirm a
|
|
2469
|
+
* specific identity is accepted (`ok`); omit them to probe mere liveness (an auth broker answers
|
|
2470
|
+
* `auth-required`, which still proves it's up). */
|
|
2471
|
+
export async function probeConnect(server = DEFAULT_SERVER, opts = {}) {
|
|
2472
|
+
try {
|
|
2473
|
+
const nc = await connect({
|
|
2474
|
+
servers: server,
|
|
2475
|
+
timeout: opts.timeoutMs ?? 1000,
|
|
2476
|
+
reconnect: false,
|
|
2477
|
+
maxReconnectAttempts: 0,
|
|
2478
|
+
...authOpts(opts),
|
|
2479
|
+
});
|
|
2480
|
+
await nc.close();
|
|
2481
|
+
return { ok: true };
|
|
2482
|
+
}
|
|
2483
|
+
catch (e) {
|
|
2484
|
+
if (e instanceof AuthorizationError || e instanceof UserAuthenticationExpiredError)
|
|
2485
|
+
return { ok: false, reason: "auth-required" };
|
|
2486
|
+
return { ok: false, reason: "unreachable" };
|
|
2487
|
+
}
|
|
2488
|
+
}
|
|
2146
2489
|
//# sourceMappingURL=endpoint.js.map
|