@cotal-ai/core 0.4.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/acls.d.ts +45 -0
- package/dist/acls.d.ts.map +1 -0
- package/dist/acls.js +86 -0
- package/dist/acls.js.map +1 -0
- package/dist/agent-file.d.ts +7 -0
- package/dist/agent-file.d.ts.map +1 -1
- package/dist/agent-file.js +29 -2
- package/dist/agent-file.js.map +1 -1
- package/dist/channels.d.ts +13 -2
- package/dist/channels.d.ts.map +1 -1
- package/dist/channels.js +24 -1
- package/dist/channels.js.map +1 -1
- package/dist/command.d.ts +3 -0
- package/dist/command.d.ts.map +1 -1
- package/dist/endpoint.d.ts +341 -61
- package/dist/endpoint.d.ts.map +1 -1
- package/dist/endpoint.js +1178 -205
- package/dist/endpoint.js.map +1 -1
- package/dist/index.d.ts +3 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +3 -0
- package/dist/index.js.map +1 -1
- package/dist/lease.d.ts +40 -0
- package/dist/lease.d.ts.map +1 -0
- package/dist/lease.js +64 -0
- package/dist/lease.js.map +1 -0
- package/dist/members.d.ts +93 -0
- package/dist/members.d.ts.map +1 -0
- package/dist/members.js +193 -0
- package/dist/members.js.map +1 -0
- package/dist/provision.d.ts +38 -13
- package/dist/provision.d.ts.map +1 -1
- package/dist/provision.js +121 -17
- package/dist/provision.js.map +1 -1
- package/dist/streams.d.ts +48 -23
- package/dist/streams.d.ts.map +1 -1
- package/dist/streams.js +101 -32
- package/dist/streams.js.map +1 -1
- package/dist/subjects.d.ts +85 -4
- package/dist/subjects.d.ts.map +1 -1
- package/dist/subjects.js +134 -4
- package/dist/subjects.js.map +1 -1
- package/dist/types.d.ts +128 -5
- package/dist/types.d.ts.map +1 -1
- package/package.json +2 -2
package/dist/endpoint.js
CHANGED
|
@@ -1,13 +1,16 @@
|
|
|
1
1
|
import { EventEmitter } from "node:events";
|
|
2
2
|
import { randomUUID } from "node:crypto";
|
|
3
|
-
import { connect, credsAuthenticator, nanos, AuthorizationError, PermissionViolationError, UserAuthenticationExpiredError, } from "@nats-io/transport-node";
|
|
3
|
+
import { connect, credsAuthenticator, nanos, AuthorizationError, PermissionViolationError, UserAuthenticationExpiredError, NoRespondersError, RequestError, } from "@nats-io/transport-node";
|
|
4
4
|
import { idFromCreds } from "./identity.js";
|
|
5
5
|
import { assertValidName } from "./resolve.js";
|
|
6
|
-
import { createSpaceStreams,
|
|
6
|
+
import { createSpaceStreams, dmDurableConfig, dlvDurableConfig, taskDurableConfig, fanoutDurableConfig, inboxReaderConfig, MAX_MSGS_PER_SUBJECT } from "./streams.js";
|
|
7
7
|
import { jetstream, jetstreamManager, AckPolicy, DeliverPolicy, } from "@nats-io/jetstream";
|
|
8
8
|
import { Kvm } from "@nats-io/kv";
|
|
9
|
-
import {
|
|
10
|
-
import {
|
|
9
|
+
import { openMembersRegistry, commitMember, tombstoneMember, activateMember, readMember, listMembers, durableEligible, StaleMembershipWrite, } from "./members.js";
|
|
10
|
+
import { openAclRegistry, readAcl, commitAcl as writeAclRecord } from "./acls.js";
|
|
11
|
+
import { openDeliveryRegistry } from "./lease.js";
|
|
12
|
+
import { openChannelRegistry, effectiveReplay, effectiveReplayWindowMs, effectiveDeliveryClass, readChannelConfig, readChannelDefaults, } from "./channels.js";
|
|
13
|
+
import { anycastSubject, CHANNEL_DEFAULTS_KEY, chatStream, chatHistDurable, chatSubject, controlServiceSubject, CONTROL_SELF_SERVICE, CONTROL_DELIVERY, dmStream, dmDurable, dlvStream, dlvDurable, dlvSubject, dinboxSubject, inboxStream, parseDinboxOwner, FANOUT_DURABLE, INBOX_READER_DURABLE, leaseKey, chatWildcard, assertValidChannel, channelInAllow, isConcreteChannel, normalizeMentions, parseSubject, presenceBucket, spacePrefix, spaceWildcard, subjectMatches, taskStream, taskDurable, token, unicastSubject, } from "./subjects.js";
|
|
11
14
|
export const DEFAULT_SERVER = "nats://127.0.0.1:4222";
|
|
12
15
|
/** Space joined when none is given on the CLI (the `cotal-<space>` cmux tab, etc.). */
|
|
13
16
|
export const DEFAULT_SPACE = "main";
|
|
@@ -23,6 +26,10 @@ export const DEFAULT_SPACE = "main";
|
|
|
23
26
|
* synchronously on an unhandled "error" — a missing listener turns any such fault into a
|
|
24
27
|
* process crash instead of a logged denial.
|
|
25
28
|
*/
|
|
29
|
+
/** Plane-3 trusted-reader redelivery ceiling: a dinbox entry that keeps failing re-auth-defer
|
|
30
|
+
* (unknown owner) or DELIVER transfer is `term()`d + surfaced after this many redeliveries, so one
|
|
31
|
+
* stuck/poison entry can't head-of-line the single shared reader forever. */
|
|
32
|
+
const READER_MAX_REDELIVERIES = 10;
|
|
26
33
|
export class CotalEndpoint extends EventEmitter {
|
|
27
34
|
card;
|
|
28
35
|
space;
|
|
@@ -45,6 +52,18 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
45
52
|
jsm;
|
|
46
53
|
kv;
|
|
47
54
|
channelKv;
|
|
55
|
+
/** Plane-3 durable-membership registry KV — lazily opened by the privileged delivery daemon (or a
|
|
56
|
+
* short-lived provisioner). */
|
|
57
|
+
membersKv;
|
|
58
|
+
aclKv;
|
|
59
|
+
deliveryKv;
|
|
60
|
+
/** The live `ctl.delivery` serve subscription (delivery daemon) — re-created on every (re)connect by
|
|
61
|
+
* {@link armDeliveryControl}; tracked so the stale one is dropped on reconnect. */
|
|
62
|
+
deliveryServeSub;
|
|
63
|
+
/** When set, this endpoint hosts the Plane-3 fan-out writer + trusted reader (the server-side delivery
|
|
64
|
+
* daemon). `aclFor` maps an owner id to its current read ACL (`allowSubscribe`) for the reader's
|
|
65
|
+
* re-authorization — read FRESH per entry from the durable ACL registry KV, hence async. */
|
|
66
|
+
plane3;
|
|
48
67
|
/** Live local cache of the channel registry (key = channel token), kept by a KV watch. */
|
|
49
68
|
channelConfigs = new Map();
|
|
50
69
|
channelDefaults = {};
|
|
@@ -58,11 +77,51 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
58
77
|
histLock = Promise.resolve();
|
|
59
78
|
subs = [];
|
|
60
79
|
streamMsgs = [];
|
|
80
|
+
/** Per-channel native core subscriptions (SPEC v0.3) — the manager-free live read path for boot +
|
|
81
|
+
* runtime channels (there is no per-instance chat durable). Keyed by channel so leave unsubscribes
|
|
82
|
+
* just one. */
|
|
83
|
+
chatSubs = new Map();
|
|
84
|
+
/** Channels whose core-sub the broker refused (async sub.allow violation) — read by the
|
|
85
|
+
* broker-confirmed join: a denied subscribe is NOT a successful join (SPEC conformance #13). */
|
|
86
|
+
chatSubDenied = new Set();
|
|
87
|
+
/** Channels this session has a Plane-3 durable backstop for (per-channel join GENERATION, from
|
|
88
|
+
* durableJoin, so leave passes it back for the stale-leave guard). A durable channel's core-sub is
|
|
89
|
+
* NOT coverage-dropped — it stays a live wake-hint, dedup-coalesced with the Plane-3 durable copy by
|
|
90
|
+
* id-dedup. Drives the durable-state surface + routes leave to `durableLeave`. PERSISTS across
|
|
91
|
+
* reconnect (like `this.channels`): the membership record + the `dlv_<id>` durable are persistent so
|
|
92
|
+
* the backstop survives a reconnect on its own; the agent can't re-read the privileged members KV,
|
|
93
|
+
* so this in-memory mirror is kept, not rebuilt. Cleared only on full stop. */
|
|
94
|
+
plane3Channels = new Map();
|
|
95
|
+
/** Channels whose live sub was REFUSED while they held a Plane-3 durable membership, whose §7
|
|
96
|
+
* tombstone has not yet confirmed (channel → join generation). {@link closeRefusedMembership} retries
|
|
97
|
+
* the tombstone until it lands; until then this is a `durable-unclosed` state surfaced via
|
|
98
|
+
* {@link pendingDurableLeaves} (the connector shows it in `cotal_channels`, never as ordinary
|
|
99
|
+
* absence). Persists across reconnect; cleared on tombstone success or full stop. */
|
|
100
|
+
pendingDurableLeave = new Map();
|
|
101
|
+
/** Boot durable channels whose self-join hasn't yet established a membership (daemon down/absent at
|
|
102
|
+
* first connect, or a transient `durable:false`). {@link reconcileBootJoin} retries with capped
|
|
103
|
+
* backoff until the membership exists or the channel is left — so a first-connect daemon outage
|
|
104
|
+
* self-heals on recovery instead of leaving the channel silently live-only. Surfaced to the connector
|
|
105
|
+
* via {@link hasDurableMembership} (a joined durable channel NOT yet a member renders degraded). */
|
|
106
|
+
pendingBootJoins = new Set();
|
|
107
|
+
/** Chat-join subjects currently being broker-confirmed. An out-of-ACL subscribe among these trips an
|
|
108
|
+
* EXPECTED async permission violation that joinChannel turns into a clean throw, so watchStatus
|
|
109
|
+
* suppresses it rather than surfacing a spurious connection error. */
|
|
110
|
+
confirmingChatSubs = new Set();
|
|
111
|
+
/** True until the first successful connect completes its boot backfill — distinguishes first-connect
|
|
112
|
+
* (backfill the boot channels' history) from a reconnect (reopen the core-subs, no re-backfill).
|
|
113
|
+
* Persists across reconnect (NOT connection-scoped). Replaces the legacy chat-durable consumed-cursor
|
|
114
|
+
* signal now that there is no per-instance chat durable. */
|
|
115
|
+
firstConnect = true;
|
|
61
116
|
heartbeatTimer;
|
|
62
117
|
sweepTimer;
|
|
63
118
|
roster = new Map();
|
|
64
119
|
status = "idle";
|
|
65
120
|
activity;
|
|
121
|
+
/** Mirror of the connector's authoritative attention state, published in presence (advisory). The
|
|
122
|
+
* endpoint never reads these back into delivery — they exist only to broadcast. */
|
|
123
|
+
attentionMode;
|
|
124
|
+
channelModes;
|
|
66
125
|
stopped = false;
|
|
67
126
|
/** In-flight rebuild (drain+rebind) — serializes manual reconnect, the supervisor's
|
|
68
127
|
* closed(), and reestablishLoop so only ONE rebuild runs at a time (a second trigger
|
|
@@ -106,6 +165,9 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
106
165
|
this.doRegister = opts.registerPresence ?? true;
|
|
107
166
|
this.doWatch = opts.watchPresence ?? true;
|
|
108
167
|
this.doConsume = opts.consume ?? true;
|
|
168
|
+
// Seed the presence mirror so file-default channel modes are visible from the first publish
|
|
169
|
+
// (not only after the first runtime toggle). Mirror only — delivery reads the connector's state.
|
|
170
|
+
this.channelModes = opts.channelModes && Object.keys(opts.channelModes).length ? opts.channelModes : undefined;
|
|
109
171
|
this.ackWaitMs = opts.ackWaitMs ?? 60_000;
|
|
110
172
|
this.inactiveThresholdMs = opts.inactiveThresholdMs ?? 600_000;
|
|
111
173
|
}
|
|
@@ -173,6 +235,10 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
173
235
|
await this.ensureStreams();
|
|
174
236
|
await this.startConsumers();
|
|
175
237
|
}
|
|
238
|
+
// Re-arm Plane-3 (delivery-daemon-hosted fan-out + trusted reader + ctl.delivery) on every (re)connect — no-op unless this
|
|
239
|
+
// endpoint hosts it. The first arm comes from startPlane3 (after start()); this re-binds the loops
|
|
240
|
+
// a reconnect's clearConnectionScoped() tore down, so a broker blip doesn't silently kill the backstop.
|
|
241
|
+
await this.armPlane3();
|
|
176
242
|
// Bound and live — covers initial start, manual reconnect, AND background self-heal (every
|
|
177
243
|
// path lands here). The single signal an in-process agent's connected flag tracks.
|
|
178
244
|
this.emit("connection", { connected: true });
|
|
@@ -198,6 +264,17 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
198
264
|
}
|
|
199
265
|
}
|
|
200
266
|
this.streamMsgs.length = 0;
|
|
267
|
+
for (const sub of this.chatSubs.values()) {
|
|
268
|
+
try {
|
|
269
|
+
sub.unsubscribe();
|
|
270
|
+
}
|
|
271
|
+
catch {
|
|
272
|
+
/* already closed with the connection */
|
|
273
|
+
}
|
|
274
|
+
}
|
|
275
|
+
this.chatSubs.clear();
|
|
276
|
+
this.chatSubDenied.clear();
|
|
277
|
+
this.confirmingChatSubs.clear();
|
|
201
278
|
this.roster.clear();
|
|
202
279
|
this.joinSeq.clear();
|
|
203
280
|
this.channelConfigs.clear();
|
|
@@ -270,6 +347,12 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
270
347
|
this.jsm = undefined;
|
|
271
348
|
this.kv = undefined;
|
|
272
349
|
this.channelKv = undefined;
|
|
350
|
+
// Plane-3 KV handles are bound to the old connection too — drop them so the daemon re-opens them on
|
|
351
|
+
// the fresh nc (else durableJoin/leave/list, the reader's ACL re-auth, and lease renew use a dead
|
|
352
|
+
// handle after a reconnect).
|
|
353
|
+
this.membersKv = undefined;
|
|
354
|
+
this.aclKv = undefined;
|
|
355
|
+
this.deliveryKv = undefined;
|
|
273
356
|
this.emit("connection", { connected: false }); // null window opened — not live until the rebind below
|
|
274
357
|
try {
|
|
275
358
|
await oldNc?.drain();
|
|
@@ -456,8 +539,16 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
456
539
|
})().catch((e) => this.emit("error", e));
|
|
457
540
|
}
|
|
458
541
|
// ---- control plane (request/reply) --------------------------------------
|
|
459
|
-
/** Serve control requests for a service
|
|
460
|
-
|
|
542
|
+
/** Serve control requests for a service. Returns the subscription so a caller that re-registers on
|
|
543
|
+
* reconnect (the delivery daemon) can drop the stale one. `boundReply` is REQUIRED for any service
|
|
544
|
+
* whose responder holds a wildcard publish grant over the service subtree (the delivery daemon's
|
|
545
|
+
* `ctl.delivery.*.reply.>`): without it, an authenticated caller could set its reply target to a
|
|
546
|
+
* PEER's reply lane (`ctl.delivery.<victim>.reply.<n>`) and turn the responder into a confused
|
|
547
|
+
* deputy — the broker does NOT permission-check the requester's embedded reply subject. With it, a
|
|
548
|
+
* reply is published only when `m.reply` is under the AUTHENTICATED request subject
|
|
549
|
+
* (`${m.subject}.reply.…`), binding the reply to the broker-policed sender token. (The manager's
|
|
550
|
+
* tiers reply into the per-id `_INBOX` and leave it off.) */
|
|
551
|
+
serveControl(service, handler, opts = {}) {
|
|
461
552
|
if (!this.nc)
|
|
462
553
|
throw new Error("endpoint not started");
|
|
463
554
|
const sub = this.nc.subscribe(controlServiceSubject(this.space, service, "*"), {
|
|
@@ -466,6 +557,12 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
466
557
|
this.subs.push(sub);
|
|
467
558
|
void (async () => {
|
|
468
559
|
for await (const m of sub) {
|
|
560
|
+
// Sender-bound reply guard (confused-deputy fix): never respond to a reply target outside the
|
|
561
|
+
// authenticated request subject's own `.reply.` subtree. Drop silently (don't inject elsewhere).
|
|
562
|
+
if (opts.boundReply && (!m.reply || !m.reply.startsWith(`${m.subject}.reply.`))) {
|
|
563
|
+
this.emit("error", new Error(`rejected ${service} request on ${m.subject}: reply target "${m.reply ?? "(none)"}" is not under the sender's own reply subtree`));
|
|
564
|
+
continue;
|
|
565
|
+
}
|
|
469
566
|
let reply;
|
|
470
567
|
try {
|
|
471
568
|
const req = m.json();
|
|
@@ -494,6 +591,7 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
494
591
|
}
|
|
495
592
|
}
|
|
496
593
|
})().catch((e) => this.emit("error", e));
|
|
594
|
+
return sub;
|
|
497
595
|
}
|
|
498
596
|
/** Send a control request to a service and await its reply (client side). */
|
|
499
597
|
async requestControl(service, req, timeoutMs = 5000) {
|
|
@@ -503,6 +601,26 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
503
601
|
const m = await this.nc.request(controlServiceSubject(this.space, service, this.card.id), JSON.stringify(body), { timeout: timeoutMs });
|
|
504
602
|
return m.json();
|
|
505
603
|
}
|
|
604
|
+
/** Send a durable-membership request to the SERVER-SIDE delivery daemon (`ctl.delivery`) and await its
|
|
605
|
+
* reply. Unlike {@link requestControl}, the reply rides a subject UNDER `ctl.delivery.<id>.>` (not the
|
|
606
|
+
* per-id `_INBOX`), so the scoped delivery cred can answer without broad inbox-publish — see
|
|
607
|
+
* CONTROL_DELIVERY. `noMux` lets us name the reply subject while keeping NoResponders detection (so a
|
|
608
|
+
* caller can fail-closed vs. degrade to live-only when no daemon is present). */
|
|
609
|
+
async requestDelivery(op, args, timeoutMs = 5000) {
|
|
610
|
+
if (!this.nc)
|
|
611
|
+
throw new Error(this.notLiveMsg());
|
|
612
|
+
const reqSubject = controlServiceSubject(this.space, CONTROL_DELIVERY, this.card.id); // ctl.delivery.<id>
|
|
613
|
+
// Reply rides the sender's OWN subtree so the daemon's serveControl boundReply guard accepts it
|
|
614
|
+
// (`${reqSubject}.reply.…`). The sender-bound guard is the COMPLETE confused-deputy closure. The
|
|
615
|
+
// random suffix is genuine defense-in-depth (NOT cosmetic): `noMux` subscribes this SPECIFIC named
|
|
616
|
+
// reply subject (not a standing `.reply.>` wildcard), so a predictable suffix would let a peer target
|
|
617
|
+
// an in-flight reply subscription — randomUUID brings it to parity with the nuid-protected `_INBOX`
|
|
618
|
+
// model. Keep both; don't regress to a counter. (Confirmed by the review panel's fact-check.)
|
|
619
|
+
const reply = `${reqSubject}.reply.${randomUUID()}`;
|
|
620
|
+
const body = { op, args, from: this.ref() };
|
|
621
|
+
const m = await this.nc.request(reqSubject, JSON.stringify(body), { timeout: timeoutMs, noMux: true, reply });
|
|
622
|
+
return m.json();
|
|
623
|
+
}
|
|
506
624
|
// ---- presence ------------------------------------------------------------
|
|
507
625
|
getRoster() {
|
|
508
626
|
return [...this.roster.values()].sort((a, b) => a.card.name.localeCompare(b.card.name));
|
|
@@ -515,6 +633,30 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
515
633
|
this.status = status;
|
|
516
634
|
await this.publishPresence();
|
|
517
635
|
}
|
|
636
|
+
/** Publish the agent's global attention mode into presence (advisory observability). Mirror only —
|
|
637
|
+
* delivery decisions stay in the connector's authoritative state. */
|
|
638
|
+
async setAttention(attention) {
|
|
639
|
+
this.attentionMode = attention;
|
|
640
|
+
await this.publishPresence();
|
|
641
|
+
}
|
|
642
|
+
/** Publish the agent's per-channel attention overrides into presence (advisory). An empty map drops
|
|
643
|
+
* the field. Mirror only — never read back into delivery. */
|
|
644
|
+
async setChannelModes(modes) {
|
|
645
|
+
this.channelModes = Object.keys(modes).length ? modes : undefined;
|
|
646
|
+
await this.publishPresence();
|
|
647
|
+
}
|
|
648
|
+
/** Overlay the host's live model onto the card's display-only `meta.model` and republish presence.
|
|
649
|
+
* For connectors that learn the actual model only *after* launch (e.g. Claude Code's `SessionStart`
|
|
650
|
+
* hook payload) rather than from an operator pin. Display-only discovery metadata; a no-op when the
|
|
651
|
+
* value is empty or already current (no redundant publish). The mutated card is read live by every
|
|
652
|
+
* later publish, so even a pre-connect call surfaces on the first presence write. */
|
|
653
|
+
async setCardModel(model) {
|
|
654
|
+
const m = model.trim();
|
|
655
|
+
if (!m || this.card.meta?.model === m)
|
|
656
|
+
return;
|
|
657
|
+
this.card.meta = { ...(this.card.meta ?? {}), model: m };
|
|
658
|
+
await this.publishPresence();
|
|
659
|
+
}
|
|
518
660
|
// ---- channel discovery ---------------------------------------------------
|
|
519
661
|
/** This channel's registry config from the live local cache (undefined if unset). */
|
|
520
662
|
getChannelConfig(channel) {
|
|
@@ -525,84 +667,118 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
525
667
|
channelReplay(channel) {
|
|
526
668
|
return effectiveReplay(this.channelConfigs.get(channel), this.channelDefaults);
|
|
527
669
|
}
|
|
670
|
+
/** Effective delivery class for a channel (per-channel override ?? space default ?? "durable"),
|
|
671
|
+
* from the live watch cache — drives the non-gating delivery-health surface (only durable-class
|
|
672
|
+
* channels have a Plane-3 backstop to report on). */
|
|
673
|
+
channelDeliveryClass(channel) {
|
|
674
|
+
return effectiveDeliveryClass(this.channelConfigs.get(channel), this.channelDefaults);
|
|
675
|
+
}
|
|
528
676
|
// ---- dynamic subscription (join / leave mid-session) ---------------------
|
|
529
677
|
/** The channels this endpoint is currently subscribed to (live — reflects join/leave). */
|
|
530
678
|
joinedChannels() {
|
|
531
679
|
return [...this.channels];
|
|
532
680
|
}
|
|
533
681
|
/**
|
|
534
|
-
* Join a channel mid-session:
|
|
535
|
-
*
|
|
536
|
-
*
|
|
537
|
-
*
|
|
538
|
-
* the
|
|
682
|
+
* Join a channel mid-session: open a native core subscription (manager-free live read, broker-
|
|
683
|
+
* confirmed against `sub.allow`), capture the stream frontier as the join watermark, backfill its
|
|
684
|
+
* history if replay is on, and — for a `durable`-class channel when a delivery daemon is present —
|
|
685
|
+
* request a Plane-3 durable backstop (via `ctl.delivery`). Idempotent: re-joining is a no-op (no
|
|
686
|
+
* re-backfill). Returns the backfill count + whether the durable backstop is active (+ a `reason`
|
|
687
|
+
* when a durable channel couldn't get one).
|
|
539
688
|
*/
|
|
540
689
|
async joinChannel(channel) {
|
|
541
690
|
if (!this.jsm)
|
|
542
691
|
throw new Error(this.notLiveMsg());
|
|
543
692
|
if (this.channels.includes(channel))
|
|
544
|
-
return { joined: false, backfilled: 0 };
|
|
545
|
-
// Arm the watermark BEFORE
|
|
546
|
-
//
|
|
547
|
-
// and filter BEFORE backfill (gap-safe: backfill-first leaves a window in neither stream).
|
|
693
|
+
return { joined: false, backfilled: 0, durable: this.plane3Channels.has(channel) };
|
|
694
|
+
// Arm the watermark BEFORE going live: the backfill reads ≤ frontier and the core-sub only ever
|
|
695
|
+
// delivers post-subscribe live messages (> frontier), so the two never overlap.
|
|
548
696
|
const armed = await this.armJoin([channel]);
|
|
697
|
+
// Live read (SPEC v0.3): open the native core subscription — MANAGER-FREE, broker-enforced by
|
|
698
|
+
// sub.allow. This is what lets an agent join a channel's live feed on its own. The sub.allow
|
|
699
|
+
// refusal is async — broker-confirm before committing local join state; the subscribe handler
|
|
700
|
+
// ALSO drops a channel on ANY refusal (incl. a late one), so this is not a timing gamble (#13).
|
|
701
|
+
this.subscribeChat(channel);
|
|
549
702
|
try {
|
|
550
|
-
await this.
|
|
703
|
+
await this.confirmChatSub();
|
|
551
704
|
}
|
|
552
705
|
catch (e) {
|
|
553
|
-
|
|
554
|
-
|
|
706
|
+
// The confirm boundary (flush) failed — the connection drained/closed mid-join, so we have NO
|
|
707
|
+
// confirmation the subscribe was accepted. Fail closed: undo the half-open join rather than
|
|
708
|
+
// returning as if it were confirmed (a reconnect re-confirms from this.channels, which we never
|
|
709
|
+
// pushed to). unsubscribeChat clears chatSubs + confirmingChatSubs.
|
|
710
|
+
this.unsubscribeChat(channel);
|
|
711
|
+
this.joinSeq.delete(channel);
|
|
712
|
+
throw new Error(`cannot join "${channel}": live subscription could not be confirmed (${e.message})`);
|
|
713
|
+
}
|
|
714
|
+
this.confirmingChatSubs.delete(chatSubject(this.space, "*", channel));
|
|
715
|
+
if (this.chatSubDenied.has(channel)) {
|
|
716
|
+
this.unsubscribeChat(channel);
|
|
717
|
+
this.joinSeq.delete(channel);
|
|
718
|
+
throw new Error(`cannot join "${channel}": not within this agent's read ACL (allowSubscribe)`);
|
|
555
719
|
}
|
|
556
720
|
this.channels.push(channel);
|
|
721
|
+
// Durable backstop. The live core-sub above already delivers (manager-free). For a `durable`-class
|
|
722
|
+
// channel, request a Plane-3 per-member backstop from the server-side delivery daemon (durableJoin via ctl.delivery) so a post reaches a
|
|
723
|
+
// busy/offline turn — the core-sub stays as the live wake-hint, dedup-coalesced with the Plane-3
|
|
724
|
+
// copy by id-dedup. No manager (open dev / manager-less) ⇒ joined LIVE only, surfaced via `reason`
|
|
725
|
+
// (never silent). A `live`-class channel takes no backstop (joined live is the contract).
|
|
726
|
+
let durable = false;
|
|
727
|
+
let reason;
|
|
728
|
+
if (effectiveDeliveryClass(this.channelConfigs.get(channel), this.channelDefaults) === "durable") {
|
|
729
|
+
try {
|
|
730
|
+
const r = await this.durableJoinChannel(channel);
|
|
731
|
+
if (r.durable) {
|
|
732
|
+
this.plane3Channels.set(channel, r.generation ?? 0);
|
|
733
|
+
durable = true;
|
|
734
|
+
}
|
|
735
|
+
else {
|
|
736
|
+
reason = r.reason ?? "durable backstop unavailable";
|
|
737
|
+
}
|
|
738
|
+
}
|
|
739
|
+
catch (e) {
|
|
740
|
+
// No privileged writer (no delivery daemon) or the write was rejected — joined live, backstop
|
|
741
|
+
// unavailable. NOT a join failure: the live subscription is up and authorized.
|
|
742
|
+
reason = `durable backstop unavailable (${e.message})`;
|
|
743
|
+
}
|
|
744
|
+
}
|
|
557
745
|
const backfilled = await this.backfillArmed(armed);
|
|
558
|
-
return { joined: true, backfilled };
|
|
559
|
-
}
|
|
560
|
-
/** Leave a channel mid-session
|
|
561
|
-
*
|
|
562
|
-
*
|
|
746
|
+
return { joined: true, backfilled, durable, ...(reason !== undefined ? { reason } : {}) };
|
|
747
|
+
}
|
|
748
|
+
/** Leave a channel mid-session — MANAGER-FREE for the live read: close the core subscription. For a
|
|
749
|
+
* Plane-3 durable channel, the membership is tombstoned FIRST at the leave cursor (SPEC §7: leave is
|
|
750
|
+
* a hard read boundary for the backstop — a pre-leave entry stays deliverable, `seq > leaveCursor` is
|
|
751
|
+
* denied). FAIL-CLOSED: if the tombstone can't be confirmed the call throws and the leave is NOT
|
|
752
|
+
* applied (live sub stays up, local mirror intact) so the caller can retry — never close the live
|
|
753
|
+
* read while the backstop keeps delivering. */
|
|
563
754
|
async leaveChannel(channel) {
|
|
564
755
|
if (!this.jsm)
|
|
565
756
|
throw new Error(this.notLiveMsg());
|
|
566
|
-
|
|
567
|
-
if (i < 0)
|
|
757
|
+
if (!this.channels.includes(channel))
|
|
568
758
|
return { left: false };
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
759
|
+
// Auth + durable-class ⇒ a Plane-3 membership may exist; tombstone it BEFORE touching local state.
|
|
760
|
+
// The join generation comes from the local mirror, but a BOOT membership whose hydration was missed
|
|
761
|
+
// (daemon down at connect) is NOT in the mirror — so re-resolve it from the delivery service on
|
|
762
|
+
// demand. FAIL-CLOSED: fetchMemberships throws on a responder-present error, so a leave whose
|
|
763
|
+
// tombstone can't be confirmed propagates (live sub stays up, mirror intact) for the caller to retry
|
|
764
|
+
// — reporting `left` while the trusted reader keeps transferring to DLV is the fail-open leak. A
|
|
765
|
+
// genuine no-responder (open / no delivery daemon, no Plane-3) means there is no membership to tombstone.
|
|
766
|
+
if (this.creds && effectiveDeliveryClass(this.channelConfigs.get(channel), this.channelDefaults) === "durable") {
|
|
767
|
+
let generation = this.plane3Channels.get(channel);
|
|
768
|
+
if (generation === undefined)
|
|
769
|
+
generation = (await this.fetchMemberships())?.find((m) => m.channel === channel)?.generation;
|
|
770
|
+
if (generation !== undefined) {
|
|
771
|
+
await this.durableLeaveChannel(channel, generation);
|
|
772
|
+
this.plane3Channels.delete(channel);
|
|
773
|
+
}
|
|
774
|
+
}
|
|
775
|
+
this.unsubscribeChat(channel);
|
|
776
|
+
const i = this.channels.indexOf(channel);
|
|
777
|
+
if (i >= 0)
|
|
778
|
+
this.channels.splice(i, 1);
|
|
574
779
|
this.joinSeq.delete(channel);
|
|
575
780
|
return { left: true };
|
|
576
781
|
}
|
|
577
|
-
/** Move the chat live-tail durable to a new channel set. OPEN mode self-serves the
|
|
578
|
-
* `consumers.update` (the agent owns its durable). AUTH mode is bind-only — the agent has no
|
|
579
|
-
* UPDATE grant — so it sends a mediated control request to the manager, which validates the set
|
|
580
|
-
* ⊆ its `allowSubscribe` before moving the filter. Throws clearly when no privileged responder is
|
|
581
|
-
* present: a manager-less standalone auth session is fixed to its boot subscribe set — a
|
|
582
|
-
* documented limitation, not a silent degrade. */
|
|
583
|
-
async setChatFilter(channels) {
|
|
584
|
-
if (!this.jsm)
|
|
585
|
-
throw new Error(this.notLiveMsg());
|
|
586
|
-
if (!this.creds) {
|
|
587
|
-
await this.jsm.consumers.update(chatStream(this.space), chatDurable(this.card.id), {
|
|
588
|
-
filter_subjects: collapseFilterSubjects(channels.map((ch) => chatSubject(this.space, "*", ch))),
|
|
589
|
-
});
|
|
590
|
-
return;
|
|
591
|
-
}
|
|
592
|
-
let reply;
|
|
593
|
-
try {
|
|
594
|
-
reply = await this.requestControl(CONTROL_SELF_SERVICE, { op: "setChannels", args: { channels } });
|
|
595
|
-
}
|
|
596
|
-
catch (e) {
|
|
597
|
-
const msg = e.message;
|
|
598
|
-
if (/no responders/i.test(msg))
|
|
599
|
-
throw new Error("cannot change channels at runtime: no privileged provisioner (manager) is serving the mesh — " +
|
|
600
|
-
"this session is fixed to its boot subscribe set");
|
|
601
|
-
throw e;
|
|
602
|
-
}
|
|
603
|
-
if (!reply.ok)
|
|
604
|
-
throw new Error(reply.error ?? "channel change rejected");
|
|
605
|
-
}
|
|
606
782
|
/** One coherent channel model for dashboards: every channel that has messages OR a registry
|
|
607
783
|
* entry (configured-but-empty), each tagged with its {@link ChannelConfig}. Works even on
|
|
608
784
|
* observer endpoints (no consumers needed). */
|
|
@@ -636,56 +812,32 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
636
812
|
.sort((a, b) => a.channel.localeCompare(b.channel));
|
|
637
813
|
}
|
|
638
814
|
async channelMembers(channel) {
|
|
639
|
-
const
|
|
640
|
-
|
|
641
|
-
// One peer has one chat consumer, so this is a straight per-peer collection; join/leave
|
|
642
|
-
// just mutates that consumer's filter_subjects, which the next call re-reads live.
|
|
643
|
-
const byTok = new Map();
|
|
644
|
-
for await (const ci of mgr.consumers.list(chatStream(this.space))) {
|
|
645
|
-
const tok = chatDurableToken(ci.config.durable_name ?? ci.name);
|
|
646
|
-
if (tok === null)
|
|
647
|
-
continue;
|
|
648
|
-
// The server may report a single filter as `filter_subject` or `filter_subjects` — both
|
|
649
|
-
// are the same datum; read whichever is present. Filters are already collapsed (the
|
|
650
|
-
// effective subscription), so parse the channel straight out of each.
|
|
651
|
-
const filters = ci.config.filter_subjects ?? (ci.config.filter_subject ? [ci.config.filter_subject] : []);
|
|
652
|
-
const set = byTok.get(tok) ?? new Set();
|
|
653
|
-
for (const f of filters) {
|
|
654
|
-
const p = parseSubject(f);
|
|
655
|
-
if (p?.kind === "chat")
|
|
656
|
-
set.add(p.rest);
|
|
657
|
-
}
|
|
658
|
-
byTok.set(tok, set);
|
|
659
|
-
}
|
|
660
|
-
// Join with presence for liveness. token() is lossy, so match forward: index the roster
|
|
661
|
-
// by token(id). A durable with no roster match is a ghost/foreign id — keep its token,
|
|
662
|
-
// never drop it.
|
|
663
|
-
const byToken = new Map();
|
|
815
|
+
const members = (await listMembers(await this.membersRegistry())).filter((r) => r.leaveCursor === undefined && r.activated === true);
|
|
816
|
+
const byId = new Map();
|
|
664
817
|
for (const p of this.roster.values())
|
|
665
|
-
|
|
666
|
-
const
|
|
667
|
-
const p =
|
|
818
|
+
byId.set(p.card.id, p);
|
|
819
|
+
const memberForId = (id) => {
|
|
820
|
+
const p = byId.get(id);
|
|
668
821
|
return p
|
|
669
822
|
? { id: p.card.id, name: p.card.name, role: p.card.role, live: p.status !== "offline" }
|
|
670
|
-
: { id
|
|
823
|
+
: { id, name: id, live: false };
|
|
671
824
|
};
|
|
672
825
|
const byName = (a, b) => a.name.localeCompare(b.name);
|
|
673
|
-
if (channel !== undefined)
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
return out.sort(byName);
|
|
679
|
-
}
|
|
826
|
+
if (channel !== undefined)
|
|
827
|
+
return members
|
|
828
|
+
.filter((r) => subjectMatches(r.channel, channel))
|
|
829
|
+
.map((r) => memberForId(r.owner))
|
|
830
|
+
.sort(byName);
|
|
680
831
|
const map = new Map();
|
|
681
|
-
for (const
|
|
682
|
-
const
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
if (arr)
|
|
832
|
+
for (const r of members) {
|
|
833
|
+
const arr = map.get(r.channel);
|
|
834
|
+
const m = memberForId(r.owner);
|
|
835
|
+
if (arr) {
|
|
836
|
+
if (!arr.some((x) => x.id === m.id))
|
|
686
837
|
arr.push(m);
|
|
687
|
-
|
|
688
|
-
|
|
838
|
+
}
|
|
839
|
+
else {
|
|
840
|
+
map.set(r.channel, [m]);
|
|
689
841
|
}
|
|
690
842
|
}
|
|
691
843
|
for (const arr of map.values())
|
|
@@ -746,8 +898,14 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
746
898
|
return;
|
|
747
899
|
void (async () => {
|
|
748
900
|
for await (const s of this.nc.status()) {
|
|
749
|
-
if (s.type
|
|
750
|
-
|
|
901
|
+
if (s.type !== "error")
|
|
902
|
+
continue;
|
|
903
|
+
// Suppress the EXPECTED permission violation from a manager-free join we're confirming: an
|
|
904
|
+
// out-of-ACL `nc.subscribe` is refused async on its chat subject, which joinChannel catches
|
|
905
|
+
// and turns into a clean throw — it is not a connection error to surface.
|
|
906
|
+
if (s.error instanceof PermissionViolationError && this.confirmingChatSubs.has(s.error.subject))
|
|
907
|
+
continue;
|
|
908
|
+
this.emit("error", describeStatusError(s.error));
|
|
751
909
|
}
|
|
752
910
|
})().catch((e) => {
|
|
753
911
|
if (!this.stopped)
|
|
@@ -776,29 +934,10 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
776
934
|
throw new Error("endpoint not started");
|
|
777
935
|
await createSpaceStreams(this.jsm, this.space);
|
|
778
936
|
}
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
* never does (mirrors {@link provisionDmInbox}). Idempotent. The caller must be permissive on CHAT.
|
|
784
|
-
*/
|
|
785
|
-
async provisionChatDurable(targetId, subscribe) {
|
|
786
|
-
const jsm = await this.manager();
|
|
787
|
-
await jsm.consumers.add(chatStream(this.space), chatDurableConfig(this.space, targetId, subscribe));
|
|
788
|
-
}
|
|
789
|
-
/**
|
|
790
|
-
* Privileged: move an agent's bind-only chat durable to a new channel set — the write half of the
|
|
791
|
-
* mediated join/leave. The manager calls this AFTER validating the set ⊆ the agent's
|
|
792
|
-
* `allowSubscribe`; the agent itself has no UPDATE grant, so this trusted path is the only way its
|
|
793
|
-
* live filter moves. The filter is rebuilt from channel names here (not from agent-supplied
|
|
794
|
-
* subjects) so a caller can't smuggle a hand-built filter.
|
|
795
|
-
*/
|
|
796
|
-
async setChatFilterFor(targetId, channels) {
|
|
797
|
-
const jsm = await this.manager();
|
|
798
|
-
await jsm.consumers.update(chatStream(this.space), chatDurable(targetId), {
|
|
799
|
-
filter_subjects: collapseFilterSubjects(channels.map((ch) => chatSubject(this.space, "*", ch))),
|
|
800
|
-
});
|
|
801
|
-
}
|
|
937
|
+
// (v3) The old `provisionMembership` — manager/provisioner-written boot membership at spawn — is GONE.
|
|
938
|
+
// Boot durable membership is now the AGENT self-joining its durable boot channels via the daemon's
|
|
939
|
+
// `ctl.delivery` op at connect ({@link armBootDurableMemberships}), reconciled on outage. The
|
|
940
|
+
// primitive it wrapped, {@link durableJoinFor}, is now driven by the daemon's `ctl.delivery` handler.
|
|
802
941
|
/**
|
|
803
942
|
* Privileged: pre-create an agent's DM inbox durable (auth mode), so the agent can BIND
|
|
804
943
|
* it without holding CONSUMER.CREATE on DM_<space>. The creator sets the filter to
|
|
@@ -810,6 +949,17 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
810
949
|
const jsm = await this.manager();
|
|
811
950
|
await jsm.consumers.add(dmStream(this.space), dmDurableConfig(this.space, targetId));
|
|
812
951
|
}
|
|
952
|
+
/**
|
|
953
|
+
* Privileged: pre-create an agent's bind-only Plane-3 DELIVER durable (`dlv_<id>`, filtered to
|
|
954
|
+
* `dlv.<id>`), so the agent can BIND its per-member durable handoff without holding CONSUMER.CREATE
|
|
955
|
+
* on the DLV stream. Same bind-only model as {@link provisionDmInbox}: the creator sets the filter,
|
|
956
|
+
* the agent never does. The trusted reader transfers re-authorized copies onto `dlv.<id>`; the agent
|
|
957
|
+
* acks them via native JetStream (SPEC §8). Idempotent. The caller must be permissive on DLV.
|
|
958
|
+
*/
|
|
959
|
+
async provisionDlvInbox(targetId) {
|
|
960
|
+
const jsm = await this.manager();
|
|
961
|
+
await jsm.consumers.add(dlvStream(this.space), dlvDurableConfig(this.space, targetId));
|
|
962
|
+
}
|
|
813
963
|
/**
|
|
814
964
|
* Privileged: pre-create a role's shared TASK work-queue durable (auth mode), so agents
|
|
815
965
|
* of that role can BIND it without holding CONSUMER.CREATE on TASK_<space>. The creator
|
|
@@ -820,6 +970,746 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
820
970
|
const jsm = await this.manager();
|
|
821
971
|
await jsm.consumers.add(taskStream(this.space), taskDurableConfig(this.space, role));
|
|
822
972
|
}
|
|
973
|
+
// ---- Plane-3: durable backstop (SPEC §8) — privileged, hosted by the server-side DELIVERY DAEMON ----
|
|
974
|
+
//
|
|
975
|
+
// Two daemon loops + two privileged membership ops (served to agents on `ctl.delivery`). The FAN-OUT
|
|
976
|
+
// writer (routing, not auth) reads every chat message and copies it into each eligible owner's MIXED
|
|
977
|
+
// inbox (`dinbox.<owner>`); the TRUSTED READER (the auth gate) re-authorizes each entry against the
|
|
978
|
+
// CURRENT ACL + membership interval and TRANSFERS the authorized copy to the owner's per-member
|
|
979
|
+
// DELIVER store (`dlv.<owner>`), which the agent binds + acks via native JetStream. The agent holds no
|
|
980
|
+
// read on the mixed store. (v3: this all moved off the manager — the manager is lifecycle-only; it
|
|
981
|
+
// records the read-ACL at mint via commitAcl.) See `.internal/research/stage4-impl-design.md`.
|
|
982
|
+
/** Lazily open the privileged members registry KV (delivery daemon / open-mode self). */
|
|
983
|
+
async membersRegistry() {
|
|
984
|
+
if (!this.nc)
|
|
985
|
+
throw new Error("endpoint not started");
|
|
986
|
+
this.membersKv ??= await openMembersRegistry(this.nc, this.space);
|
|
987
|
+
return this.membersKv;
|
|
988
|
+
}
|
|
989
|
+
/** Lazily open the durable read-ACL registry KV. Privileged write (the manager records an agent's
|
|
990
|
+
* ACL at mint); the delivery daemon reads it fresh per durable entry to re-authorize. */
|
|
991
|
+
async aclRegistry() {
|
|
992
|
+
if (!this.nc)
|
|
993
|
+
throw new Error("endpoint not started");
|
|
994
|
+
this.aclKv ??= await openAclRegistry(this.nc, this.space);
|
|
995
|
+
return this.aclKv;
|
|
996
|
+
}
|
|
997
|
+
/** Privileged ({@link DurableProvisioner}): record an agent's read ACL in the durable registry at
|
|
998
|
+
* provision/mint time — the same act as baking it into the JWT, persisted so the server-side
|
|
999
|
+
* delivery daemon can re-authorize the agent's durable entries and validate its runtime
|
|
1000
|
+
* durable-joins without holding any in-memory ledger. Written ATOMICALLY ({@link writeAclRecord}),
|
|
1001
|
+
* so a present record is always complete (`[]` = known no-read, never a half-write). */
|
|
1002
|
+
async commitAcl(targetId, allowSubscribe) {
|
|
1003
|
+
await writeAclRecord(await this.aclRegistry(), targetId, allowSubscribe);
|
|
1004
|
+
}
|
|
1005
|
+
/** The server-side delivery daemon's fresh-per-entry ACL read: an owner's CURRENT read ACL
|
|
1006
|
+
* (`allowSubscribe`) from the durable registry, or `undefined` if no record (an unknown owner — the
|
|
1007
|
+
* reader DEFERS, never drops). A present `[]` (known no-read) returns `[]` (the reader DROPS). */
|
|
1008
|
+
async aclForOwner(owner) {
|
|
1009
|
+
return (await readAcl(await this.aclRegistry(), owner))?.record.allowSubscribe;
|
|
1010
|
+
}
|
|
1011
|
+
/** Lazily open the delivery lease/readiness KV (pre-created at `cotal up`; bind, never create). */
|
|
1012
|
+
async deliveryRegistry() {
|
|
1013
|
+
if (!this.nc)
|
|
1014
|
+
throw new Error("endpoint not started");
|
|
1015
|
+
this.deliveryKv ??= await openDeliveryRegistry(this.nc, this.space);
|
|
1016
|
+
return this.deliveryKv;
|
|
1017
|
+
}
|
|
1018
|
+
encodeLease(ready) {
|
|
1019
|
+
return new TextEncoder().encode(JSON.stringify({ holder: this.card.id, since: Date.now(), ready }));
|
|
1020
|
+
}
|
|
1021
|
+
/** Acquire the single-flight delivery lease for a shard via an ATOMIC CAS create, marked NOT-ready.
|
|
1022
|
+
* THROWS if a live lease exists — a loud refusal-to-bind (the daemon exits), never a retry, so two
|
|
1023
|
+
* daemons can't split a durable's delivery. A crashed holder's lease auto-expires (bucket TTL),
|
|
1024
|
+
* freeing a re-acquire. Acquired BEFORE binding (single-flight gate); {@link markDeliveryLeaseReady}
|
|
1025
|
+
* flips it ready AFTER the loops + `ctl.delivery` are bound. Returns the lease revision. */
|
|
1026
|
+
async acquireDeliveryLease(shardIndex) {
|
|
1027
|
+
return (await this.deliveryRegistry()).create(leaseKey(shardIndex), this.encodeLease(false));
|
|
1028
|
+
}
|
|
1029
|
+
/** Flip the held lease to READY (CAS `kv.update`) AFTER `startPlane3` has bound the loops + the
|
|
1030
|
+
* `ctl.delivery` responder — so "lease ready" proves the responder is up, not just that the slot was
|
|
1031
|
+
* claimed. Returns the new revision. */
|
|
1032
|
+
async markDeliveryLeaseReady(shardIndex, revision) {
|
|
1033
|
+
return (await this.deliveryRegistry()).update(leaseKey(shardIndex), this.encodeLease(true), revision);
|
|
1034
|
+
}
|
|
1035
|
+
/** Renew the held lease (CAS `kv.update` against `revision`, keeping `ready:true`) to refresh it before
|
|
1036
|
+
* the bucket TTL expires it. Returns the new revision. Throws if the revision moved (lost the lease —
|
|
1037
|
+
* the daemon should exit). */
|
|
1038
|
+
async renewDeliveryLease(shardIndex, revision) {
|
|
1039
|
+
return (await this.deliveryRegistry()).update(leaseKey(shardIndex), this.encodeLease(true), revision);
|
|
1040
|
+
}
|
|
1041
|
+
/** Release the held lease on clean shutdown so a replacement daemon re-acquires immediately (best
|
|
1042
|
+
* effort — a crash just lets the bucket TTL expire it). */
|
|
1043
|
+
async releaseDeliveryLease(shardIndex) {
|
|
1044
|
+
try {
|
|
1045
|
+
await (await this.deliveryRegistry()).delete(leaseKey(shardIndex));
|
|
1046
|
+
}
|
|
1047
|
+
catch { /* already gone */ }
|
|
1048
|
+
}
|
|
1049
|
+
/** Read a shard's delivery lease (the daemon-availability signal), or `undefined` if none is live.
|
|
1050
|
+
* READ-ONLY surface — drives Component 6's `cotal_channels` delivery-health field (an agent reads it
|
|
1051
|
+
* under its own cred, which holds lease-bucket read but no write). */
|
|
1052
|
+
async readDeliveryLease(shardIndex) {
|
|
1053
|
+
const e = await (await this.deliveryRegistry()).get(leaseKey(shardIndex));
|
|
1054
|
+
if (!e || e.operation === "DEL" || e.operation === "PURGE")
|
|
1055
|
+
return undefined;
|
|
1056
|
+
try {
|
|
1057
|
+
return e.json();
|
|
1058
|
+
}
|
|
1059
|
+
catch {
|
|
1060
|
+
return undefined;
|
|
1061
|
+
}
|
|
1062
|
+
}
|
|
1063
|
+
/** Privileged: one owner's NON-TOMBSTONED durable memberships as `{channel, generation, activated}` —
|
|
1064
|
+
* the server-side delivery daemon serves this to a connecting agent (the `listMemberships` op on
|
|
1065
|
+
* `ctl.delivery`). The agent seeds its leave mirror from the ACTIVATED ones (the confirmed backstops),
|
|
1066
|
+
* but the non-activated ones are returned too so `leaveChannel` can discover + close a record that
|
|
1067
|
+
* still routes under the pure-interval predicate (a crash-stuck pending activation) — without reading
|
|
1068
|
+
* the privileged KV itself. */
|
|
1069
|
+
async ownerMemberships(owner) {
|
|
1070
|
+
const recs = await listMembers(await this.membersRegistry(), { owner });
|
|
1071
|
+
return recs
|
|
1072
|
+
.filter((r) => r.leaveCursor === undefined)
|
|
1073
|
+
.map((r) => ({ channel: r.channel, generation: r.generation, activated: r.activated === true }));
|
|
1074
|
+
}
|
|
1075
|
+
/** Effective delivery class read AUTHORITATIVELY from the registry KV (not the watch cache) — so a
|
|
1076
|
+
* `live`→`durable` flip is seen by fan-out without a cache-propagation gap (red-team MED-3). */
|
|
1077
|
+
async deliveryClassFresh(channel) {
|
|
1078
|
+
if (!this.channelKv)
|
|
1079
|
+
return effectiveDeliveryClass(undefined, undefined);
|
|
1080
|
+
const [cfg, defaults] = await Promise.all([
|
|
1081
|
+
isConcreteChannel(channel) ? readChannelConfig(this.channelKv, channel) : Promise.resolve(undefined),
|
|
1082
|
+
readChannelDefaults(this.channelKv),
|
|
1083
|
+
]);
|
|
1084
|
+
return effectiveDeliveryClass(cfg, defaults);
|
|
1085
|
+
}
|
|
1086
|
+
/** Collision-safe `@mention` → owner-id resolution: a name that resolves to exactly one present
|
|
1087
|
+
* peer wins; 0 or >1 matches drop (never fan a directed durable copy to an unrelated same-named
|
|
1088
|
+
* bystander — red-team LOW; SPEC §4 unique instance id). */
|
|
1089
|
+
resolveOwnerByName(name) {
|
|
1090
|
+
const matches = [...this.roster.values()].filter((p) => p.card.name.toLowerCase() === name.toLowerCase());
|
|
1091
|
+
return matches.length === 1 ? matches[0].card.id : undefined;
|
|
1092
|
+
}
|
|
1093
|
+
/** Publish one fan-out entry into an owner's mixed inbox, idempotent via `Nats-Msg-Id`
|
|
1094
|
+
* (`<msgId>:<owner>:<generation>`) so a catch-up copy and a racing fan-out copy collapse. */
|
|
1095
|
+
async publishDinbox(owner, entry) {
|
|
1096
|
+
if (!this.js)
|
|
1097
|
+
return;
|
|
1098
|
+
await this.js.publish(dinboxSubject(this.space, owner), JSON.stringify(entry), {
|
|
1099
|
+
msgID: `${entry.msg.id}:${owner}:${entry.generation}`,
|
|
1100
|
+
});
|
|
1101
|
+
}
|
|
1102
|
+
/** The fan-out consumer's delivered stream-seq — the activation-fence upper bound (red-team
|
|
1103
|
+
* BLOCKER-1: the shared fan-out cursor advances independently of the stream frontier). */
|
|
1104
|
+
async fanoutDeliveredSeq() {
|
|
1105
|
+
const info = await this.consumerInfo(chatStream(this.space), FANOUT_DURABLE);
|
|
1106
|
+
return info?.delivered?.stream_seq ?? 0;
|
|
1107
|
+
}
|
|
1108
|
+
/**
|
|
1109
|
+
* Privileged durable-JOIN write (v3: the delivery daemon calls this from its `ctl.delivery` handler
|
|
1110
|
+
* after validating channel ⊆ the caller's read ACL): capture `joinCursor`, commit a `durable-active`
|
|
1111
|
+
* record (CAS + generation bump), then ACTIVATION CATCH-UP idempotently copies `(joinCursor, fence]`
|
|
1112
|
+
* into the owner inbox where `fence = max(frontier, fanoutDelivered)` — fan-out owns `seq > fence`.
|
|
1113
|
+
* Idempotent against a timeout-retry (an already-activated membership no-ops). Returns `{durable:false}`
|
|
1114
|
+
* (honest degrade) only if the catch-up window was evicted.
|
|
1115
|
+
*
|
|
1116
|
+
* Runs on the daemon (which hosts the fan-out/reader loops + the members KV), so catch-up + the
|
|
1117
|
+
* activation fence read are in-process — no cross-process cursor read.
|
|
1118
|
+
*/
|
|
1119
|
+
async durableJoinFor(owner, channel) {
|
|
1120
|
+
if (!this.js)
|
|
1121
|
+
throw new Error("endpoint not started");
|
|
1122
|
+
await this.manager(); // ensure jsm — a non-consuming provisioner inits it lazily; catch-up + fence need it
|
|
1123
|
+
const kv = await this.membersRegistry();
|
|
1124
|
+
const existing = await readMember(kv, channel, owner);
|
|
1125
|
+
const open = existing?.record.state === "durable-active" && existing.record.leaveCursor === undefined;
|
|
1126
|
+
if (open && existing.record.activated)
|
|
1127
|
+
return { durable: true, generation: existing.record.generation }; // fully activated — idempotent
|
|
1128
|
+
// Either a NEW join (no record / a tombstone to supersede) → fresh joinCursor + bumped generation,
|
|
1129
|
+
// OR a retry of an INCOMPLETE activation (durable-active but not yet activated, from an earlier
|
|
1130
|
+
// eviction/crash) → re-run catch-up over the SAME join window, no bump. The record is committed
|
|
1131
|
+
// `activated:false` first and routes IN-INTERVAL immediately (fan-out + reader deliver via the
|
|
1132
|
+
// pure-interval durableEligible) so no live message published during catch-up is lost. `activated`
|
|
1133
|
+
// gates only the REPORT — durableJoin returns true / channelMembers lists the owner only after the
|
|
1134
|
+
// catch-up confirms. A join that never completes catch-up still routes live (harmless: the agent is
|
|
1135
|
+
// live-subscribed and DLV is id-deduped) but honestly reports durable:false and stays hidden.
|
|
1136
|
+
const joinCursor = open ? existing.record.joinCursor : await this.chatFrontier();
|
|
1137
|
+
const generation = open ? existing.record.generation : (existing?.record.generation ?? 0) + 1;
|
|
1138
|
+
const base = {
|
|
1139
|
+
channel, owner, state: "durable-active", joinCursor, generation,
|
|
1140
|
+
activated: false, writerIdentity: this.card.id, updatedAt: Date.now(),
|
|
1141
|
+
};
|
|
1142
|
+
if (!open)
|
|
1143
|
+
await commitMember(kv, base);
|
|
1144
|
+
const fence = Math.max(await this.chatFrontier(), await this.fanoutDeliveredSeq());
|
|
1145
|
+
const cu = await this.catchupCopy(owner, channel, joinCursor, fence, generation);
|
|
1146
|
+
if (cu.evicted) {
|
|
1147
|
+
// Catch-up window irreparably evicted (the oldest in-window message aged out) — this join can never
|
|
1148
|
+
// be a complete backstop. TOMBSTONE the just-committed record at `fence` so it does NOT route:
|
|
1149
|
+
// pure-interval durableEligible would otherwise keep delivering to a record the agent was told is
|
|
1150
|
+
// durable:false AND can't discover to leave (critic BLOCKER-1). Pass `generation` as the expected
|
|
1151
|
+
// generation (ux stale-write guard) so this cleanup can't tombstone a concurrent NEWER rejoin — if
|
|
1152
|
+
// one won, StaleMembershipWrite is the correct no-op (the rejoin is the live record). Then degrade
|
|
1153
|
+
// honestly — a retry is a fresh join (no longer `open`, so a current joinCursor is captured).
|
|
1154
|
+
try {
|
|
1155
|
+
await tombstoneMember(kv, channel, owner, fence, this.card.id, generation);
|
|
1156
|
+
}
|
|
1157
|
+
catch (e) {
|
|
1158
|
+
if (!(e instanceof StaleMembershipWrite))
|
|
1159
|
+
throw e;
|
|
1160
|
+
}
|
|
1161
|
+
return { durable: false, reason: "activation catch-up window partially evicted by retention", generation };
|
|
1162
|
+
}
|
|
1163
|
+
// Flip → reported durable, ATOMICALLY: refuse if a concurrent SAME-generation leave (tombstone) or a
|
|
1164
|
+
// rejoin superseded this pending join while catch-up ran. A blind same-gen commit would clobber the
|
|
1165
|
+
// tombstone (clear leaveCursor) and resurrect the membership, reopening §7 (review-general-2 BLOCKER).
|
|
1166
|
+
const activated = await activateMember(kv, channel, owner, generation, joinCursor);
|
|
1167
|
+
if (!activated)
|
|
1168
|
+
return { durable: false, reason: "activation superseded by a concurrent leave or rejoin", generation };
|
|
1169
|
+
return { durable: true, generation };
|
|
1170
|
+
}
|
|
1171
|
+
/** Privileged durable-LEAVE write: tombstone the membership at `leaveCursor = frontier` so the
|
|
1172
|
+
* backstop denies `seq > leaveCursor` while a pre-leave entry stays deliverable (SPEC §7 interval). */
|
|
1173
|
+
async durableLeaveFor(owner, channel, expectedGeneration) {
|
|
1174
|
+
if (!this.plane3)
|
|
1175
|
+
return; // not a Plane-3 host — no membership to tombstone
|
|
1176
|
+
const kv = await this.membersRegistry();
|
|
1177
|
+
// expectedGeneration (captured by the agent at durableJoin) refuses a stale leave from tombstoning
|
|
1178
|
+
// a newer rejoin (StaleMembershipWrite) — a durable-disable primitive otherwise.
|
|
1179
|
+
await tombstoneMember(kv, channel, owner, await this.chatFrontier(), this.card.id, expectedGeneration);
|
|
1180
|
+
}
|
|
1181
|
+
/** Idempotently copy the eligible chat messages in `(fromSeqExcl, toSeqIncl]` for `channel` into the
|
|
1182
|
+
* owner inbox, via a DEDICATED per-(owner,join) ephemeral consumer (NOT the agent-scoped
|
|
1183
|
+
* `chathist_<id>`/`histLock` — red-team HIGH-8). `evicted` ⇒ the oldest eligible seq aged out under
|
|
1184
|
+
* `discard=Old` (the start seq could not be served), a durable shortfall the caller surfaces. */
|
|
1185
|
+
async catchupCopy(owner, channel, fromSeqExcl, toSeqIncl, generation) {
|
|
1186
|
+
if (!this.js || !this.jsm || toSeqIncl <= fromSeqExcl)
|
|
1187
|
+
return { copied: 0, evicted: false };
|
|
1188
|
+
const subject = chatSubject(this.space, "*", channel);
|
|
1189
|
+
// Eviction = a message in `(joinCursor, …]` on THIS channel's subject aged out under discard=Old.
|
|
1190
|
+
// Judged PER-SUBJECT (reuse channelDropped: oldest-retained-for-subject vs the watermark, only at
|
|
1191
|
+
// the per-subject cap), NOT against the stream-global joinCursor+1 — other channels' traffic
|
|
1192
|
+
// inflates the global seq, so a naive "first delivered seq > joinCursor+1" false-positives on any
|
|
1193
|
+
// busy multi-channel space (impl-review HIGH-2). A true eviction → durableJoin reports durable:false.
|
|
1194
|
+
const evicted = await this.channelDropped(subject, fromSeqExcl);
|
|
1195
|
+
const name = `cu_${token(owner)}_${generation}`;
|
|
1196
|
+
try {
|
|
1197
|
+
await this.jsm.consumers.delete(chatStream(this.space), name);
|
|
1198
|
+
}
|
|
1199
|
+
catch { /* none */ }
|
|
1200
|
+
await this.jsm.consumers.add(chatStream(this.space), {
|
|
1201
|
+
name, filter_subject: subject, ack_policy: AckPolicy.None, mem_storage: true,
|
|
1202
|
+
inactive_threshold: nanos(30_000), deliver_policy: DeliverPolicy.StartSequence, opt_start_seq: fromSeqExcl + 1,
|
|
1203
|
+
});
|
|
1204
|
+
let copied = 0;
|
|
1205
|
+
try {
|
|
1206
|
+
const consumer = await this.js.consumers.get(chatStream(this.space), name);
|
|
1207
|
+
let pending = (await consumer.info()).num_pending;
|
|
1208
|
+
while (pending > 0) {
|
|
1209
|
+
const want = Math.min(pending, 256);
|
|
1210
|
+
const iter = await consumer.fetch({ max_messages: want, expires: 5_000 });
|
|
1211
|
+
let got = 0;
|
|
1212
|
+
for await (const m of iter) {
|
|
1213
|
+
got++;
|
|
1214
|
+
if (m.seq > toSeqIncl)
|
|
1215
|
+
return { copied, evicted };
|
|
1216
|
+
let msg;
|
|
1217
|
+
try {
|
|
1218
|
+
msg = m.json();
|
|
1219
|
+
}
|
|
1220
|
+
catch {
|
|
1221
|
+
continue;
|
|
1222
|
+
}
|
|
1223
|
+
const parsed = parseSubject(m.subject);
|
|
1224
|
+
if (!parsed || msg.from?.id !== parsed.sender || msg.from.id === owner)
|
|
1225
|
+
continue;
|
|
1226
|
+
await this.publishDinbox(owner, { msg, channel, seq: m.seq, reason: "durable-channel", generation });
|
|
1227
|
+
copied++;
|
|
1228
|
+
}
|
|
1229
|
+
if (got < want)
|
|
1230
|
+
break;
|
|
1231
|
+
pending -= got;
|
|
1232
|
+
}
|
|
1233
|
+
}
|
|
1234
|
+
finally {
|
|
1235
|
+
try {
|
|
1236
|
+
await this.jsm.consumers.delete(chatStream(this.space), name);
|
|
1237
|
+
}
|
|
1238
|
+
catch { /* gone */ }
|
|
1239
|
+
}
|
|
1240
|
+
return { copied, evicted };
|
|
1241
|
+
}
|
|
1242
|
+
/** Start the Plane-3 fan-out writer + trusted reader on THIS (privileged, server-side delivery-daemon)
|
|
1243
|
+
* endpoint, AND serve the `ctl.delivery` control service (runtime durable join/leave/list). `aclFor`
|
|
1244
|
+
* maps an owner id to its current read ACL for the reader's re-authorization — read FRESH per entry
|
|
1245
|
+
* from the durable ACL registry (async). Call once after connect; idempotent durable creation lets it
|
|
1246
|
+
* resume on a daemon restart. Both the JS loops AND the `ctl.delivery` subscription are (re)bound by
|
|
1247
|
+
* {@link armPlane3} on EVERY (re)connect — a reconnect drains the old connection, so re-binding both
|
|
1248
|
+
* is required, not optional (the responder would otherwise be lost on a broker blip). */
|
|
1249
|
+
async startPlane3(aclFor) {
|
|
1250
|
+
if (!this.js)
|
|
1251
|
+
throw new Error("endpoint not started");
|
|
1252
|
+
this.plane3 = { aclFor };
|
|
1253
|
+
await this.armPlane3();
|
|
1254
|
+
}
|
|
1255
|
+
/** Serve one runtime durable-membership control request (the server-side delivery daemon). The caller
|
|
1256
|
+
* id is the authenticated subject sender ({@link serveControl} fail-closes on a mismatch). Validation
|
|
1257
|
+
* is against the durable ACL registry — the SAME KV the reader re-auths against (single source of
|
|
1258
|
+
* truth, no in-memory ledger to drift). */
|
|
1259
|
+
async handleDeliveryControl(req) {
|
|
1260
|
+
const caller = req.from.id;
|
|
1261
|
+
const args = req.args ?? {};
|
|
1262
|
+
if (req.op === "durableJoin")
|
|
1263
|
+
return this.deliveryJoin(caller, args);
|
|
1264
|
+
if (req.op === "durableLeave")
|
|
1265
|
+
return this.deliveryLeave(caller, args);
|
|
1266
|
+
if (req.op === "listMemberships")
|
|
1267
|
+
return { ok: true, data: { memberships: await this.ownerMemberships(caller) } };
|
|
1268
|
+
return { ok: false, error: `op "${req.op}" not supported on the delivery control service` };
|
|
1269
|
+
}
|
|
1270
|
+
/** Validate the channel ARG shape only — non-blank, valid, concrete (NO ACL check, that is op-specific).
|
|
1271
|
+
* Returns the channel on success or a ControlReply error to short-circuit. */
|
|
1272
|
+
checkDurableChannelArg(args, op) {
|
|
1273
|
+
const channel = typeof args.channel === "string" ? args.channel.trim() : "";
|
|
1274
|
+
if (!channel)
|
|
1275
|
+
return { ok: false, error: `${op}: channel must be a non-blank string` };
|
|
1276
|
+
try {
|
|
1277
|
+
assertValidChannel(channel);
|
|
1278
|
+
}
|
|
1279
|
+
catch (e) {
|
|
1280
|
+
return { ok: false, error: e.message };
|
|
1281
|
+
}
|
|
1282
|
+
if (!isConcreteChannel(channel))
|
|
1283
|
+
return { ok: false, error: `${op}: "${channel}" must be a concrete channel (durable membership is per-concrete-channel, not wildcard)` };
|
|
1284
|
+
return channel;
|
|
1285
|
+
}
|
|
1286
|
+
/** JOIN requires the channel be within the caller's CURRENT read ACL (you can't durable-subscribe a
|
|
1287
|
+
* channel you may not read). */
|
|
1288
|
+
async deliveryJoin(caller, args) {
|
|
1289
|
+
const channel = this.checkDurableChannelArg(args, "durableJoin");
|
|
1290
|
+
if (typeof channel !== "string")
|
|
1291
|
+
return channel; // a ControlReply error
|
|
1292
|
+
const acl = await readAcl(await this.aclRegistry(), caller);
|
|
1293
|
+
if (acl === undefined)
|
|
1294
|
+
return { ok: false, error: `durableJoin: no read ACL on record for ${caller} (not provisioned for durable delivery)` };
|
|
1295
|
+
if (!channelInAllow(acl.record.allowSubscribe, channel))
|
|
1296
|
+
return { ok: false, error: `channel "${channel}" is not within your read ACL [${acl.record.allowSubscribe.join(", ")}]` };
|
|
1297
|
+
try {
|
|
1298
|
+
return { ok: true, data: await this.durableJoinFor(caller, channel) };
|
|
1299
|
+
}
|
|
1300
|
+
catch (e) {
|
|
1301
|
+
return { ok: false, error: e.message };
|
|
1302
|
+
}
|
|
1303
|
+
}
|
|
1304
|
+
/** LEAVE must NOT require current-ACL coverage. Leave fires precisely when the ACL was narrowed/revoked
|
|
1305
|
+
* (a refused live sub → {@link closeRefusedMembership}); gating the tombstone on the current ACL would
|
|
1306
|
+
* loop forever and leave the SPEC §7 boundary open (the membership could resume if the ACL is later
|
|
1307
|
+
* restored). The guards are: authenticated caller (serveControl), concrete channel, a finite generation
|
|
1308
|
+
* (the join epoch — without it a stale/replayed leave could tombstone a newer rejoin), and an EXISTING
|
|
1309
|
+
* own membership; `durableLeaveFor` → `tombstoneMember` then enforces the generation match. */
|
|
1310
|
+
async deliveryLeave(caller, args) {
|
|
1311
|
+
const channel = this.checkDurableChannelArg(args, "durableLeave");
|
|
1312
|
+
if (typeof channel !== "string")
|
|
1313
|
+
return channel; // a ControlReply error
|
|
1314
|
+
if (typeof args.generation !== "number" || !Number.isFinite(args.generation))
|
|
1315
|
+
return { ok: false, error: "durableLeave: a finite generation is required (fail-closed stale-leave guard)" };
|
|
1316
|
+
const existing = await readMember(await this.membersRegistry(), channel, caller);
|
|
1317
|
+
if (!existing)
|
|
1318
|
+
return { ok: true, data: { channel, alreadyLeft: true } }; // nothing to tombstone — idempotent
|
|
1319
|
+
try {
|
|
1320
|
+
await this.durableLeaveFor(caller, channel, args.generation);
|
|
1321
|
+
}
|
|
1322
|
+
catch (e) {
|
|
1323
|
+
return { ok: false, error: e.message };
|
|
1324
|
+
}
|
|
1325
|
+
return { ok: true, data: { channel } };
|
|
1326
|
+
}
|
|
1327
|
+
/** (Re)bind the Plane-3 fan-out writer + trusted reader. Idempotent — the durables resume from their
|
|
1328
|
+
* cursor. Called by {@link startPlane3} once AND by {@link connectAndBind} on every (re)connect, so
|
|
1329
|
+
* the delivery daemon's reconnect RE-ARMS the backstop + the ctl.delivery responder. Without this, a broker blip would silently kill
|
|
1330
|
+
* the loops while `durableJoinFor` kept reporting `durable:true` (the impl-review's BLOCKER-1). No-op
|
|
1331
|
+
* unless this endpoint hosts Plane-3 (`this.plane3` set). */
|
|
1332
|
+
async armPlane3() {
|
|
1333
|
+
if (!this.plane3 || !this.js)
|
|
1334
|
+
return;
|
|
1335
|
+
await this.manager(); // the manager runs consume:false, so this.jsm is lazy — ensure it
|
|
1336
|
+
this.armDeliveryControl();
|
|
1337
|
+
await this.runFanout();
|
|
1338
|
+
await this.runReader();
|
|
1339
|
+
}
|
|
1340
|
+
/** (Re)register the `ctl.delivery` control responder on the CURRENT connection. A reconnect drains the
|
|
1341
|
+
* old connection (the old sub is dead and `clearConnectionScoped` leaves caller-owned subs alone), so
|
|
1342
|
+
* this MUST run on every arm — otherwise durable join/leave/list silently lose their responder after a
|
|
1343
|
+
* broker blip. The stale sub is dropped (unsubscribed + removed from `this.subs`) before re-creating.
|
|
1344
|
+
* `boundReply` is essential here: the daemon holds a wildcard reply-publish grant, so the serve path
|
|
1345
|
+
* must reject any reply target outside the authenticated sender's own subtree (confused-deputy fix). */
|
|
1346
|
+
armDeliveryControl() {
|
|
1347
|
+
if (this.deliveryServeSub) {
|
|
1348
|
+
try {
|
|
1349
|
+
this.deliveryServeSub.unsubscribe();
|
|
1350
|
+
}
|
|
1351
|
+
catch { /* dead with the old connection */ }
|
|
1352
|
+
const i = this.subs.indexOf(this.deliveryServeSub);
|
|
1353
|
+
if (i >= 0)
|
|
1354
|
+
this.subs.splice(i, 1);
|
|
1355
|
+
}
|
|
1356
|
+
this.deliveryServeSub = this.serveControl(CONTROL_DELIVERY, (req) => this.handleDeliveryControl(req), { boundReply: true });
|
|
1357
|
+
}
|
|
1358
|
+
/** Fan-out loop: bind the privileged `fanout` durable on CHAT and route each message (routing only —
|
|
1359
|
+
* the trusted reader is the auth gate). */
|
|
1360
|
+
async runFanout() {
|
|
1361
|
+
if (!this.js || !this.jsm)
|
|
1362
|
+
return;
|
|
1363
|
+
try {
|
|
1364
|
+
await this.jsm.consumers.add(chatStream(this.space), fanoutDurableConfig(this.space, { ackWaitMs: this.ackWaitMs }));
|
|
1365
|
+
}
|
|
1366
|
+
catch { /* exists */ }
|
|
1367
|
+
const consumer = await this.js.consumers.get(chatStream(this.space), FANOUT_DURABLE);
|
|
1368
|
+
const msgs = await consumer.consume();
|
|
1369
|
+
this.streamMsgs.push(msgs);
|
|
1370
|
+
void (async () => {
|
|
1371
|
+
for await (const m of msgs) {
|
|
1372
|
+
try {
|
|
1373
|
+
await this.fanOutMessage(m);
|
|
1374
|
+
}
|
|
1375
|
+
catch (e) {
|
|
1376
|
+
if (!this.stopped)
|
|
1377
|
+
this.emit("error", e);
|
|
1378
|
+
try {
|
|
1379
|
+
m.nak();
|
|
1380
|
+
}
|
|
1381
|
+
catch { /* draining */ }
|
|
1382
|
+
}
|
|
1383
|
+
}
|
|
1384
|
+
})().catch((e) => { if (!this.stopped)
|
|
1385
|
+
this.emit("error", e); });
|
|
1386
|
+
}
|
|
1387
|
+
/** Route ONE chat message to eligible owners' mixed inboxes. `durable` channel → its `durable-active`
|
|
1388
|
+
* members within interval; `live` channel → `@mention` targets authorized to read it (ACL only).
|
|
1389
|
+
* Members KV is scanned FRESH per message (no cache — red-team BLOCKER-1 catch-up correctness). */
|
|
1390
|
+
async fanOutMessage(m) {
|
|
1391
|
+
const parsed = parseSubject(m.subject);
|
|
1392
|
+
if (!parsed || parsed.kind !== "chat") {
|
|
1393
|
+
m.ack();
|
|
1394
|
+
return;
|
|
1395
|
+
}
|
|
1396
|
+
const channel = parsed.rest;
|
|
1397
|
+
let msg;
|
|
1398
|
+
try {
|
|
1399
|
+
msg = m.json();
|
|
1400
|
+
}
|
|
1401
|
+
catch {
|
|
1402
|
+
m.ack();
|
|
1403
|
+
return;
|
|
1404
|
+
}
|
|
1405
|
+
if (!msg.from || msg.from.id !== parsed.sender) {
|
|
1406
|
+
m.ack();
|
|
1407
|
+
return;
|
|
1408
|
+
} // authenticity
|
|
1409
|
+
const seq = m.seq;
|
|
1410
|
+
if ((await this.deliveryClassFresh(channel)) === "durable") {
|
|
1411
|
+
for (const rec of await listMembers(await this.membersRegistry(), { channel })) {
|
|
1412
|
+
if (rec.owner === msg.from.id)
|
|
1413
|
+
continue; // never backstop the sender's own post
|
|
1414
|
+
if (!durableEligible(rec, seq))
|
|
1415
|
+
continue; // routing fast-filter (reader re-checks)
|
|
1416
|
+
await this.publishDinbox(rec.owner, { msg, channel, seq, reason: "durable-channel", generation: rec.generation });
|
|
1417
|
+
}
|
|
1418
|
+
}
|
|
1419
|
+
else {
|
|
1420
|
+
for (const name of msg.mentions ?? []) {
|
|
1421
|
+
const owner = this.resolveOwnerByName(name);
|
|
1422
|
+
if (!owner || owner === msg.from.id)
|
|
1423
|
+
continue;
|
|
1424
|
+
const acl = await this.plane3?.aclFor(owner);
|
|
1425
|
+
if (!acl || !channelInAllow(acl, channel))
|
|
1426
|
+
continue; // @mention can't bypass the read ACL
|
|
1427
|
+
await this.publishDinbox(owner, { msg, channel, seq, reason: "live-mention", generation: 0 });
|
|
1428
|
+
}
|
|
1429
|
+
}
|
|
1430
|
+
m.ack();
|
|
1431
|
+
}
|
|
1432
|
+
/** Trusted-reader loop: bind the single privileged `reader` durable over `dinbox.>` and re-authorize
|
|
1433
|
+
* + transfer each entry. */
|
|
1434
|
+
async runReader() {
|
|
1435
|
+
if (!this.js || !this.jsm)
|
|
1436
|
+
return;
|
|
1437
|
+
try {
|
|
1438
|
+
await this.jsm.consumers.add(inboxStream(this.space), inboxReaderConfig(this.space, { ackWaitMs: this.ackWaitMs }));
|
|
1439
|
+
}
|
|
1440
|
+
catch { /* exists */ }
|
|
1441
|
+
const consumer = await this.js.consumers.get(inboxStream(this.space), INBOX_READER_DURABLE);
|
|
1442
|
+
const msgs = await consumer.consume();
|
|
1443
|
+
this.streamMsgs.push(msgs);
|
|
1444
|
+
void (async () => {
|
|
1445
|
+
for await (const m of msgs) {
|
|
1446
|
+
try {
|
|
1447
|
+
await this.readerHandle(m);
|
|
1448
|
+
}
|
|
1449
|
+
catch (e) {
|
|
1450
|
+
if (!this.stopped)
|
|
1451
|
+
this.emit("error", e);
|
|
1452
|
+
try {
|
|
1453
|
+
m.nak();
|
|
1454
|
+
}
|
|
1455
|
+
catch { /* draining */ }
|
|
1456
|
+
}
|
|
1457
|
+
}
|
|
1458
|
+
})().catch((e) => { if (!this.stopped)
|
|
1459
|
+
this.emit("error", e); });
|
|
1460
|
+
}
|
|
1461
|
+
/** Re-authorize ONE mixed-inbox entry and transfer it to the owner's DELIVER store. Deny (drop) on a
|
|
1462
|
+
* revoked/narrowed ACL or out-of-interval seq; on transfer success, ack the mixed entry (durability
|
|
1463
|
+
* has moved to DLV — an §8 equivalent per-member at-least-once mechanism). The agent acks DLV. */
|
|
1464
|
+
async readerHandle(m) {
|
|
1465
|
+
const owner = parseDinboxOwner(m.subject);
|
|
1466
|
+
if (!owner) {
|
|
1467
|
+
m.ack();
|
|
1468
|
+
return;
|
|
1469
|
+
} // unparseable subject — not a real entry
|
|
1470
|
+
let entry;
|
|
1471
|
+
try {
|
|
1472
|
+
entry = m.json();
|
|
1473
|
+
}
|
|
1474
|
+
catch {
|
|
1475
|
+
m.ack();
|
|
1476
|
+
return;
|
|
1477
|
+
} // undecodable — drop
|
|
1478
|
+
const redeliveries = m.info?.deliveryCount ?? 1; // JsMsg delivery attempts (1 on first delivery)
|
|
1479
|
+
const acl = await this.plane3?.aclFor(owner);
|
|
1480
|
+
if (acl === undefined) {
|
|
1481
|
+
// UNKNOWN owner — the manager has not (re)hydrated this owner's ACL yet (e.g. right after a
|
|
1482
|
+
// manager PROCESS restart). This is NOT a revocation: DEFER (redeliver), never drop — an ack here
|
|
1483
|
+
// would lose at-least-once on restart (impl-review BLOCKER-2). A delayed nak + a redelivery
|
|
1484
|
+
// ceiling stops one perma-unknown owner from head-of-lining the shared reader.
|
|
1485
|
+
// (Follow-up: the manager does not yet rehydrate its managed set across a process restart — until
|
|
1486
|
+
// it does, a long-unknown owner's entries term after the ceiling; tracked, not a silent ack-drop.)
|
|
1487
|
+
if (redeliveries >= READER_MAX_REDELIVERIES) {
|
|
1488
|
+
m.term();
|
|
1489
|
+
this.emit("error", new Error(`plane-3 reader: gave up on entry for unknown owner ${owner} after ${redeliveries} redeliveries`));
|
|
1490
|
+
return;
|
|
1491
|
+
}
|
|
1492
|
+
m.nak(2000);
|
|
1493
|
+
return;
|
|
1494
|
+
}
|
|
1495
|
+
// KNOWN owner whose CURRENT ACL no longer covers the channel — a revocation/narrowing. Drop: the
|
|
1496
|
+
// entry is no longer authorized (SPEC §7 current-ACL gate before surfacing).
|
|
1497
|
+
if (!channelInAllow(acl, entry.channel)) {
|
|
1498
|
+
m.ack();
|
|
1499
|
+
return;
|
|
1500
|
+
}
|
|
1501
|
+
if (entry.reason === "durable-channel") {
|
|
1502
|
+
const rec = await readMember(await this.membersRegistry(), entry.channel, owner);
|
|
1503
|
+
// INTERVAL re-auth (not a current-member boolean): a pre-leave entry (seq ≤ leaveCursor) stays
|
|
1504
|
+
// deliverable; seq > leaveCursor (or after a rejoin's newer joinCursor) is the hard cut.
|
|
1505
|
+
if (!rec || !durableEligible(rec.record, entry.seq)) {
|
|
1506
|
+
m.ack();
|
|
1507
|
+
return;
|
|
1508
|
+
}
|
|
1509
|
+
}
|
|
1510
|
+
try {
|
|
1511
|
+
await this.js.publish(dlvSubject(this.space, owner), JSON.stringify(entry.msg), {
|
|
1512
|
+
msgID: `${entry.msg.id}:${owner}:${entry.generation}`,
|
|
1513
|
+
});
|
|
1514
|
+
}
|
|
1515
|
+
catch {
|
|
1516
|
+
// Transfer failed — keep the entry pending (redeliver), bounded by the same ceiling so a poison
|
|
1517
|
+
// entry can't head-of-line the shared reader forever.
|
|
1518
|
+
if (redeliveries >= READER_MAX_REDELIVERIES) {
|
|
1519
|
+
m.term();
|
|
1520
|
+
this.emit("error", new Error(`plane-3 reader: gave up transferring ${entry.msg.id} for ${owner} after ${redeliveries} redeliveries`));
|
|
1521
|
+
return;
|
|
1522
|
+
}
|
|
1523
|
+
m.nak(2000);
|
|
1524
|
+
return;
|
|
1525
|
+
}
|
|
1526
|
+
m.ack();
|
|
1527
|
+
}
|
|
1528
|
+
/** Agent-side: bind + pump our pre-created Plane-3 DELIVER durable (`dlv_<id>`). Every message here is
|
|
1529
|
+
* delivery-daemon-written (DLV is delivery-write-only, broker-enforced) and is a CHANNEL message by contract
|
|
1530
|
+
* (the backstop never carries DMs), so `kind=channel` is path-derived (SPEC §4) and the body is
|
|
1531
|
+
* trusted (no spoof-guard). `durable:true` — real JetStream ack, coalesced with the core-sub live
|
|
1532
|
+
* copy by `MeshAgent.ingest`. No-op when the durable isn't present (open mode / not provisioned). */
|
|
1533
|
+
async pumpDlv() {
|
|
1534
|
+
if (!this.js)
|
|
1535
|
+
return;
|
|
1536
|
+
let consumer;
|
|
1537
|
+
try {
|
|
1538
|
+
consumer = await this.js.consumers.get(dlvStream(this.space), dlvDurable(this.card.id));
|
|
1539
|
+
}
|
|
1540
|
+
catch {
|
|
1541
|
+
return;
|
|
1542
|
+
} // no DLV durable — Plane-3 not active for us
|
|
1543
|
+
const msgs = await consumer.consume();
|
|
1544
|
+
this.streamMsgs.push(msgs);
|
|
1545
|
+
void (async () => {
|
|
1546
|
+
for await (const m of msgs) {
|
|
1547
|
+
let msg;
|
|
1548
|
+
try {
|
|
1549
|
+
msg = m.json();
|
|
1550
|
+
}
|
|
1551
|
+
catch (e) {
|
|
1552
|
+
this.emit("error", e);
|
|
1553
|
+
try {
|
|
1554
|
+
m.term();
|
|
1555
|
+
}
|
|
1556
|
+
catch { /* draining */ }
|
|
1557
|
+
continue;
|
|
1558
|
+
}
|
|
1559
|
+
if (msg.from?.id === this.card.id) {
|
|
1560
|
+
m.ack();
|
|
1561
|
+
continue;
|
|
1562
|
+
} // own echo (defensive)
|
|
1563
|
+
const delivery = { ack: () => m.ack(), nak: () => m.nak(), durable: true };
|
|
1564
|
+
this.emit("message", msg, delivery, { historical: false, kind: "channel" });
|
|
1565
|
+
}
|
|
1566
|
+
})().catch((e) => { if (!this.stopped)
|
|
1567
|
+
this.emit("error", e); });
|
|
1568
|
+
}
|
|
1569
|
+
/** Agent-side: request a Plane-3 durable backstop for a channel via the server-side delivery daemon (ctl.delivery). Throws
|
|
1570
|
+
* when no privileged writer is present (open / no delivery daemon). 30s timeout — activation catch-up may
|
|
1571
|
+
* run before the reply (the window is small, but a busy channel can take more than the 5s default). */
|
|
1572
|
+
async durableJoinChannel(channel) {
|
|
1573
|
+
const reply = await this.requestDelivery("durableJoin", { channel }, 30_000);
|
|
1574
|
+
if (!reply.ok)
|
|
1575
|
+
throw new Error(reply.error ?? "durable join rejected");
|
|
1576
|
+
return reply.data ?? { durable: false };
|
|
1577
|
+
}
|
|
1578
|
+
/** Agent-side: release a Plane-3 durable backstop (tombstone membership at the leave cursor). Passes
|
|
1579
|
+
* the join generation so a stale leave can't tombstone a newer rejoin (the delivery daemon validates it). */
|
|
1580
|
+
async durableLeaveChannel(channel, generation) {
|
|
1581
|
+
const reply = await this.requestDelivery("durableLeave", { channel, generation });
|
|
1582
|
+
if (!reply.ok)
|
|
1583
|
+
throw new Error(reply.error ?? "durable leave rejected");
|
|
1584
|
+
}
|
|
1585
|
+
/** Fail-closed async cleanup for a channel forced out by a LATE sub.allow refusal (the broker revoked
|
|
1586
|
+
* the live read). The sync sub callback can't await, so this RETRIES the Plane-3 tombstone with capped
|
|
1587
|
+
* backoff UNTIL IT SUCCEEDS (or the endpoint stops) — the §7 boundary always closes once the manager
|
|
1588
|
+
* is reachable, never a silent give-up. While pending, the channel is tracked in
|
|
1589
|
+
* {@link pendingDurableLeave} and surfaced via {@link pendingDurableLeaves} (the connector shows it in
|
|
1590
|
+
* `cotal_channels` as `durable-unclosed`, never ordinary absence). The generation is kept the whole
|
|
1591
|
+
* time. Authoritative closure of a revoked membership is also handled by revocation (rotate creds + tear down). */
|
|
1592
|
+
async closeRefusedMembership(channel, generation) {
|
|
1593
|
+
this.pendingDurableLeave.set(channel, generation);
|
|
1594
|
+
for (let attempt = 0;; attempt++) {
|
|
1595
|
+
if (this.stopped)
|
|
1596
|
+
return;
|
|
1597
|
+
try {
|
|
1598
|
+
await this.durableLeaveChannel(channel, generation);
|
|
1599
|
+
this.plane3Channels.delete(channel);
|
|
1600
|
+
this.pendingDurableLeave.delete(channel);
|
|
1601
|
+
return;
|
|
1602
|
+
}
|
|
1603
|
+
catch (e) {
|
|
1604
|
+
if (attempt === 0)
|
|
1605
|
+
this.emit("error", new Error(`channel "${channel}": Plane-3 durable membership (generation ${generation}) not yet tombstoned after a refused live sub — retrying; §7 boundary may be open until it succeeds (${e.message})`));
|
|
1606
|
+
await new Promise((r) => setTimeout(r, Math.min(30_000, 1000 * 2 ** attempt)));
|
|
1607
|
+
}
|
|
1608
|
+
}
|
|
1609
|
+
}
|
|
1610
|
+
/** Channels with a Plane-3 durable membership whose §7 tombstone is still pending after a refused live
|
|
1611
|
+
* sub (see {@link closeRefusedMembership}) — surfaced by the connector as a `durable-unclosed` state so
|
|
1612
|
+
* it is never presented as ordinary "not subscribed". */
|
|
1613
|
+
pendingDurableLeaves() {
|
|
1614
|
+
return [...this.pendingDurableLeave.keys()];
|
|
1615
|
+
}
|
|
1616
|
+
/** A control request that found NO responder — open / manager-less (no privileged control plane),
|
|
1617
|
+
* distinct from a responder that errored. nats.js surfaces it as NoRespondersError, or a RequestError
|
|
1618
|
+
* whose `isNoResponders()` is true. */
|
|
1619
|
+
isNoResponders(e) {
|
|
1620
|
+
return e instanceof NoRespondersError || (e instanceof RequestError && e.isNoResponders());
|
|
1621
|
+
}
|
|
1622
|
+
/** Agent-side: this session's CURRENT durable memberships (channel + join generation) from the
|
|
1623
|
+
* manager — the agent holds no read on the privileged members KV. `undefined` ⇒ NO control responder
|
|
1624
|
+
* (open / no delivery daemon, so there is no Plane-3 and no memberships). THROWS on a responder-present RPC
|
|
1625
|
+
* failure, so a caller can FAIL-CLOSED rather than mistaking a transient error for "no membership". */
|
|
1626
|
+
async fetchMemberships() {
|
|
1627
|
+
let reply;
|
|
1628
|
+
try {
|
|
1629
|
+
reply = await this.requestDelivery("listMemberships", {}, 5_000);
|
|
1630
|
+
}
|
|
1631
|
+
catch (e) {
|
|
1632
|
+
if (this.isNoResponders(e))
|
|
1633
|
+
return undefined; // no delivery daemon — open / daemon-less, no Plane-3
|
|
1634
|
+
throw e; // responder present but errored — surface it (leaveChannel fails closed)
|
|
1635
|
+
}
|
|
1636
|
+
if (!reply.ok)
|
|
1637
|
+
throw new Error(reply.error ?? "listMemberships failed");
|
|
1638
|
+
return reply.data?.memberships ?? [];
|
|
1639
|
+
}
|
|
1640
|
+
/** Agent-side, first connect (auth): SELF-JOIN this session's durable boot channels via the
|
|
1641
|
+
* server-side delivery daemon — replacing the old manager-written boot membership. Each concrete
|
|
1642
|
+
* `durable`-class boot channel gets a `durableJoin` whose returned generation seeds the leave mirror
|
|
1643
|
+
* + durable-state surface; an already-active membership (a relaunch) is idempotent (no re-catch-up).
|
|
1644
|
+
* If the daemon is down/absent at first connect (or reports a transient `durable:false`), the channel
|
|
1645
|
+
* is handed to {@link reconcileBootJoin} for capped-backoff retry — so the backstop is RESTORED once
|
|
1646
|
+
* the daemon recovers, not left silently live-only. Until a membership exists the channel renders
|
|
1647
|
+
* degraded in `cotal_channels` ({@link hasDurableMembership}). */
|
|
1648
|
+
async armBootDurableMemberships() {
|
|
1649
|
+
for (const channel of this.channels) {
|
|
1650
|
+
if (!isConcreteChannel(channel) || this.plane3Channels.has(channel))
|
|
1651
|
+
continue;
|
|
1652
|
+
let cls;
|
|
1653
|
+
try {
|
|
1654
|
+
cls = await this.deliveryClassFresh(channel);
|
|
1655
|
+
}
|
|
1656
|
+
catch {
|
|
1657
|
+
continue;
|
|
1658
|
+
}
|
|
1659
|
+
if (cls !== "durable")
|
|
1660
|
+
continue;
|
|
1661
|
+
try {
|
|
1662
|
+
const r = await this.durableJoinChannel(channel);
|
|
1663
|
+
if (r.durable)
|
|
1664
|
+
this.plane3Channels.set(channel, r.generation ?? 0);
|
|
1665
|
+
else
|
|
1666
|
+
void this.reconcileBootJoin(channel); // present but not yet durable — reconcile to recovery
|
|
1667
|
+
}
|
|
1668
|
+
catch (e) {
|
|
1669
|
+
if (!this.isNoResponders(e))
|
|
1670
|
+
this.emit("error", e); // no daemon ⇒ retry until it recovers
|
|
1671
|
+
void this.reconcileBootJoin(channel);
|
|
1672
|
+
}
|
|
1673
|
+
}
|
|
1674
|
+
}
|
|
1675
|
+
/** Retry a boot durable self-join with capped backoff until a membership EXISTS (success → seed
|
|
1676
|
+
* `plane3Channels`) or the channel is left / the endpoint stops. Mirrors {@link closeRefusedMembership}:
|
|
1677
|
+
* a one-shot first-connect attempt that swallowed a daemon outage would leave the boot channel live-only
|
|
1678
|
+
* forever after the daemon recovers (and the lease-based health could then read "active" with no owner
|
|
1679
|
+
* membership). This loop is the reconcile that closes that gap. Idempotent — a channel already pending
|
|
1680
|
+
* is not double-driven; survives reconnect (it re-issues `durableJoinChannel` on the current connection). */
|
|
1681
|
+
async reconcileBootJoin(channel) {
|
|
1682
|
+
if (this.pendingBootJoins.has(channel))
|
|
1683
|
+
return; // already reconciling
|
|
1684
|
+
this.pendingBootJoins.add(channel);
|
|
1685
|
+
for (let attempt = 0;; attempt++) {
|
|
1686
|
+
await new Promise((r) => setTimeout(r, Math.min(30_000, 1000 * 2 ** attempt)));
|
|
1687
|
+
if (this.stopped || !this.channels.includes(channel) || this.plane3Channels.has(channel)) {
|
|
1688
|
+
this.pendingBootJoins.delete(channel);
|
|
1689
|
+
return; // stopped, left, or another path established it
|
|
1690
|
+
}
|
|
1691
|
+
try {
|
|
1692
|
+
const r = await this.durableJoinChannel(channel);
|
|
1693
|
+
if (r.durable) {
|
|
1694
|
+
this.plane3Channels.set(channel, r.generation ?? 0);
|
|
1695
|
+
this.pendingBootJoins.delete(channel);
|
|
1696
|
+
return;
|
|
1697
|
+
}
|
|
1698
|
+
// present but durable:false (e.g. catch-up window evicted) — keep retrying; the channel stays
|
|
1699
|
+
// honestly degraded meanwhile, never silently "active".
|
|
1700
|
+
}
|
|
1701
|
+
catch (e) {
|
|
1702
|
+
if (attempt === 0 && !this.isNoResponders(e))
|
|
1703
|
+
this.emit("error", new Error(`channel "${channel}": boot durable self-join not yet established — retrying until the delivery daemon is reachable (${e.message})`));
|
|
1704
|
+
}
|
|
1705
|
+
}
|
|
1706
|
+
}
|
|
1707
|
+
/** True if this session holds an established Plane-3 durable membership for `channel` (in `plane3Channels`).
|
|
1708
|
+
* Drives the membership-aware delivery-health surface: a joined durable channel that is NOT yet a member
|
|
1709
|
+
* (boot self-join pending / daemon down) must render degraded, never "active" off a live lease alone. */
|
|
1710
|
+
hasDurableMembership(channel) {
|
|
1711
|
+
return this.plane3Channels.has(channel);
|
|
1712
|
+
}
|
|
823
1713
|
/** Lazily obtain a JetStream manager — so a non-consuming endpoint (e.g. the supervisor,
|
|
824
1714
|
* consume:false) can still pre-create others' durables. */
|
|
825
1715
|
async manager() {
|
|
@@ -843,64 +1733,37 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
843
1733
|
}));
|
|
844
1734
|
}
|
|
845
1735
|
await this.pump(dmStream(this.space), dmDurable(id));
|
|
846
|
-
//
|
|
847
|
-
//
|
|
848
|
-
//
|
|
1736
|
+
// Plane-3 (SPEC §8): bind + pump our per-member DELIVER durable (`dlv_<id>`) — the re-authorized
|
|
1737
|
+
// durable-backstop channel copies the trusted reader transfers to us. No-op when it isn't present
|
|
1738
|
+
// (open mode / un-provisioned). Auth-only feature; the pump self-guards on the durable's existence.
|
|
1739
|
+
await this.pumpDlv();
|
|
1740
|
+
// Multicast: open a native CORE subscription for each channel (live, manager-free, broker-enforced
|
|
1741
|
+
// by sub.allow) — boot + runtime joins use the SAME path; there is no per-instance chat durable.
|
|
1742
|
+
// The durable backstop (a busy/offline turn) is Plane-3 (auth: membership established by the agent's
|
|
1743
|
+
// self-join, the delivery daemon's fan-out writer + trusted reader deliver via the `dlv_<id>` pump
|
|
1744
|
+
// above; open dev mode is live-only — the durable plane needs the daemon's trusted reader, the
|
|
1745
|
+
// security boundary). Per-
|
|
1746
|
+
// channel history is the explicit replay-gated backfill, on FIRST connect only; a reconnect reopens
|
|
1747
|
+
// the subs without re-backfilling (the durable backstop redelivers any missed window via dlv).
|
|
849
1748
|
if (this.channels.length) {
|
|
850
|
-
|
|
851
|
-
|
|
852
|
-
//
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
const
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
throw new Error(`chat durable ${durable} not pre-created — a launcher must call provisionChatDurable ` +
|
|
861
|
-
`(auth mode binds the durable, it never self-creates)`);
|
|
862
|
-
await this.jsm.consumers.add(chatStream(this.space), chatDurableConfig(this.space, id, this.channels, {
|
|
863
|
-
ackWaitMs: this.ackWaitMs,
|
|
864
|
-
inactiveThresholdMs: this.inactiveThresholdMs,
|
|
865
|
-
}));
|
|
866
|
-
}
|
|
867
|
-
// First bind to this durable (open self-create, or an auth pre-create never consumed) ⇒
|
|
868
|
-
// backfill the full subscribe set. A later reconnect (the consumed cursor has advanced)
|
|
869
|
-
// backfills only channels the config GAINED — un-acked live messages auto-redeliver, so a full
|
|
870
|
-
// re-backfill would double up. With pre-create, `info` always exists under auth, so the
|
|
871
|
-
// consumed cursor — not the durable's existence — is what tells first-bind from reconnect.
|
|
872
|
-
//
|
|
873
|
-
// Caveat (best-effort, by design): `consumer_seq > 0` proves the durable has delivered at
|
|
874
|
-
// least once, NOT that the initial backfill completed. A crash between the first delivery and
|
|
875
|
-
// backfillArmed() makes the next bind take the reconnect path and skip the full pre-bind
|
|
876
|
-
// backfill. This is unchanged from the prior self-create path (which keyed on durable
|
|
877
|
-
// existence and had the same gap — and was actually weaker: a crash before any delivery left
|
|
878
|
-
// the durable existing, so it never re-backfilled; consumer_seq still 0 here re-backfills).
|
|
879
|
-
// Reliable FORWARD delivery is the durable's job (un-acked redelivery); pre-bind history is
|
|
880
|
-
// opportunistic. A backfill-completion marker would make it reliable — a deferred follow-up.
|
|
881
|
-
const consumed = (info?.delivered?.consumer_seq ?? 0) > 0;
|
|
882
|
-
if (!consumed) {
|
|
883
|
-
// Arm the tail-drop watermarks BEFORE pump starts, so the tail can never deliver a
|
|
884
|
-
// just-bound channel's message un-watermarked (which would double-emit: live + backfill).
|
|
885
|
-
const armed = await this.armJoin(this.channels);
|
|
886
|
-
await this.pump(chatStream(this.space), durable);
|
|
1749
|
+
// Arm the per-channel join watermarks BEFORE opening the subs: the backfill reads <= frontier and
|
|
1750
|
+
// the core-sub delivers > frontier, so they never overlap (first connect). On reconnect we reopen
|
|
1751
|
+
// without arming/backfilling.
|
|
1752
|
+
const armed = this.firstConnect ? await this.armJoin(this.channels) : undefined;
|
|
1753
|
+
for (const ch of this.channels)
|
|
1754
|
+
this.subscribeChat(ch);
|
|
1755
|
+
await this.confirmChatSub();
|
|
1756
|
+
for (const ch of this.channels)
|
|
1757
|
+
this.confirmingChatSubs.delete(chatSubject(this.space, "*", ch));
|
|
1758
|
+
if (armed)
|
|
887
1759
|
await this.backfillArmed(armed);
|
|
888
|
-
}
|
|
889
|
-
else {
|
|
890
|
-
// Reconnect: resume the tail, then backfill any channels the config GAINED since.
|
|
891
|
-
await this.pump(chatStream(this.space), durable);
|
|
892
|
-
const haveFilters = info.config.filter_subjects ?? (info.config.filter_subject ? [info.config.filter_subject] : []);
|
|
893
|
-
const gained = this.channels.filter((c) => !haveFilters.some((f) => subjectMatches(f, chatSubject(this.space, "*", c))));
|
|
894
|
-
const armed = gained.length ? await this.armJoin(gained) : undefined;
|
|
895
|
-
// Reconcile the durable's filter to the CURRENT config — OPEN MODE ONLY. Auth mode is
|
|
896
|
-
// bind-only (no UPDATE grant): the durable's filter is authoritative, moved solely by the
|
|
897
|
-
// mediated join/leave control op, so the agent never self-reconciles it.
|
|
898
|
-
if (!this.creds && !sameSet(haveFilters, want))
|
|
899
|
-
await this.jsm.consumers.update(chatStream(this.space), durable, { filter_subjects: want });
|
|
900
|
-
if (armed)
|
|
901
|
-
await this.backfillArmed(armed);
|
|
902
|
-
}
|
|
903
1760
|
}
|
|
1761
|
+
// First connect, auth mode: self-join BOOT durable channels via the server-side delivery daemon
|
|
1762
|
+
// (it owns membership now — there is no manager-written boot membership). Seeds plane3Channels so a
|
|
1763
|
+
// later leave can tombstone the §7 boundary; idempotent on relaunch. Open mode has no Plane-3.
|
|
1764
|
+
if (this.firstConnect && this.creds && this.channels.length)
|
|
1765
|
+
await this.armBootDurableMemberships();
|
|
1766
|
+
this.firstConnect = false;
|
|
904
1767
|
// Anycast: a shared work-queue consumer for our role — one instance grabs each task.
|
|
905
1768
|
// Open mode self-creates; auth mode BINDS the provisioner-pre-created svc_<role>
|
|
906
1769
|
// durable (agents are denied CONSUMER.CREATE on TASK_<space>, since the create-time
|
|
@@ -955,8 +1818,14 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
955
1818
|
m.ack();
|
|
956
1819
|
continue;
|
|
957
1820
|
}
|
|
1821
|
+
// No pre-commit dedup here: the durable is the at-least-once path, so it must NEVER ack a copy
|
|
1822
|
+
// just because an id was "seen" — that would drop an unhandled message (the security/critic
|
|
1823
|
+
// HIGH). Steady state is single-path (coverage-partition: the core-sub drops durable-covered
|
|
1824
|
+
// channels). The only overlap is the brief live-first transition window, and a duplicate there
|
|
1825
|
+
// is coalesced downstream by the receiver's commit-aware id-dedup (MeshAgent.ingest keeps ONE
|
|
1826
|
+
// entry and takes THIS durable ack handle) — so the durable copy is acked only once handled.
|
|
958
1827
|
}
|
|
959
|
-
const delivery = { ack: () => m.ack(), nak: () => m.nak() };
|
|
1828
|
+
const delivery = { ack: () => m.ack(), nak: () => m.nak(), durable: true };
|
|
960
1829
|
this.emit("message", msg, delivery, {
|
|
961
1830
|
historical: false,
|
|
962
1831
|
kind: kindFromParsed(parsed.kind),
|
|
@@ -967,6 +1836,98 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
967
1836
|
this.emit("error", e);
|
|
968
1837
|
});
|
|
969
1838
|
}
|
|
1839
|
+
/** Open a native core subscription to a channel's live feed (the manager-free live read path,
|
|
1840
|
+
* broker-enforced by `sub.allow`). At-most-once — no replay, no ack; it is the live delivery for
|
|
1841
|
+
* every channel (boot + runtime). For a `durable` channel it is also the low-latency wake-hint
|
|
1842
|
+
* alongside the Plane-3 durable copy, coalesced by the receiver's id-dedup. Drops our own echo +
|
|
1843
|
+
* spoofed senders. */
|
|
1844
|
+
subscribeChat(channel) {
|
|
1845
|
+
if (!this.nc || this.chatSubs.has(channel))
|
|
1846
|
+
return;
|
|
1847
|
+
this.chatSubDenied.delete(channel);
|
|
1848
|
+
const subject = chatSubject(this.space, "*", channel);
|
|
1849
|
+
this.confirmingChatSubs.add(subject);
|
|
1850
|
+
const sub = this.nc.subscribe(subject, {
|
|
1851
|
+
callback: (err, m) => {
|
|
1852
|
+
if (err) {
|
|
1853
|
+
// async sub.allow refusal (or sub error): the live feed for this channel is dead — never a
|
|
1854
|
+
// leak (the broker refused it). Drop the channel from local joined state even if it was
|
|
1855
|
+
// already treated as joined — a LATE refusal beyond the confirm window: conformance #13
|
|
1856
|
+
// "drop on late refusal". (During the join's own confirm the channel isn't pushed yet, so
|
|
1857
|
+
// this fires nothing then; joinChannel reads `chatSubDenied` and throws cleanly.)
|
|
1858
|
+
this.chatSubDenied.add(channel);
|
|
1859
|
+
this.chatSubs.delete(channel);
|
|
1860
|
+
// NOTE: do NOT remove `subject` from confirmingChatSubs here — that set gates watchStatus's
|
|
1861
|
+
// suppression of this expected violation, and is cleared by joinChannel after confirm (or by
|
|
1862
|
+
// unsubscribeChat). Removing it in the callback races the watcher and leaks a spurious error.
|
|
1863
|
+
const i = this.channels.indexOf(channel);
|
|
1864
|
+
if (i >= 0) {
|
|
1865
|
+
this.channels.splice(i, 1);
|
|
1866
|
+
this.joinSeq.delete(channel);
|
|
1867
|
+
// A late sub.allow refusal forces this agent out of the channel (the broker revoked its live
|
|
1868
|
+
// read). If it held a Plane-3 durable membership, the §7 boundary must close too. This sub
|
|
1869
|
+
// callback can't await, so a fail-closed async helper RETRIES the tombstone (backoff) UNTIL it
|
|
1870
|
+
// succeeds, clearing the mirror only then; while pending it is surfaced via cotal_channels —
|
|
1871
|
+
// never a silent drop, never lost retry state.
|
|
1872
|
+
const gen = this.plane3Channels.get(channel);
|
|
1873
|
+
if (gen !== undefined)
|
|
1874
|
+
void this.closeRefusedMembership(channel, gen);
|
|
1875
|
+
this.emit("error", new Error(`left channel "${channel}": its live subscription was refused by the broker`));
|
|
1876
|
+
}
|
|
1877
|
+
return;
|
|
1878
|
+
}
|
|
1879
|
+
const parsed = parseSubject(m.subject);
|
|
1880
|
+
if (!parsed || parsed.kind !== "chat")
|
|
1881
|
+
return;
|
|
1882
|
+
let msg;
|
|
1883
|
+
try {
|
|
1884
|
+
msg = m.json();
|
|
1885
|
+
}
|
|
1886
|
+
catch (e) {
|
|
1887
|
+
this.emit("error", e);
|
|
1888
|
+
return;
|
|
1889
|
+
}
|
|
1890
|
+
if (!msg.from || msg.from.id !== parsed.sender)
|
|
1891
|
+
return; // spoof/malformed — drop (at-most-once)
|
|
1892
|
+
if (msg.from.id === this.card.id)
|
|
1893
|
+
return; // our own echo
|
|
1894
|
+
const delivery = { ack: () => { }, nak: () => { }, durable: false }; // live = at-most-once, not acked
|
|
1895
|
+
this.emit("message", msg, delivery, {
|
|
1896
|
+
historical: false,
|
|
1897
|
+
kind: kindFromParsed(parsed.kind),
|
|
1898
|
+
});
|
|
1899
|
+
},
|
|
1900
|
+
});
|
|
1901
|
+
this.chatSubs.set(channel, sub);
|
|
1902
|
+
}
|
|
1903
|
+
/** Close a channel's core subscription (manager-free leave). */
|
|
1904
|
+
unsubscribeChat(channel) {
|
|
1905
|
+
this.confirmingChatSubs.delete(chatSubject(this.space, "*", channel));
|
|
1906
|
+
const sub = this.chatSubs.get(channel);
|
|
1907
|
+
if (sub) {
|
|
1908
|
+
try {
|
|
1909
|
+
sub.unsubscribe();
|
|
1910
|
+
}
|
|
1911
|
+
catch {
|
|
1912
|
+
/* closing with the connection */
|
|
1913
|
+
}
|
|
1914
|
+
this.chatSubs.delete(channel);
|
|
1915
|
+
}
|
|
1916
|
+
this.chatSubDenied.delete(channel);
|
|
1917
|
+
}
|
|
1918
|
+
/** Confirm a just-opened core subscription was accepted by the broker. A `sub.allow` violation is
|
|
1919
|
+
* async in NATS, so flush (round-trips the SUB) then settle briefly to let the refusal land — a
|
|
1920
|
+
* denied subscribe must not read as a successful join (SPEC conformance #13). */
|
|
1921
|
+
async confirmChatSub() {
|
|
1922
|
+
if (!this.nc)
|
|
1923
|
+
throw new Error("connection not established");
|
|
1924
|
+
// flush() is the deterministic boundary: the broker's -ERR for an out-of-ACL SUB arrives BEFORE the
|
|
1925
|
+
// PONG, so once flush resolves the subscribe callback has already recorded any denial. A flush
|
|
1926
|
+
// FAILURE means the connection drained/closed mid-join — we have no confirmation, so let it throw
|
|
1927
|
+
// (joinChannel fails closed) instead of swallowing it and continuing as if confirmed.
|
|
1928
|
+
await this.nc.flush();
|
|
1929
|
+
await new Promise((r) => setTimeout(r, 50));
|
|
1930
|
+
}
|
|
970
1931
|
/** The highest join watermark among the joined subscriptions that cover `concreteChannel`
|
|
971
1932
|
* (a wildcard sub like `team.>` covers `team.backend`), or undefined if none — the tail
|
|
972
1933
|
* drops a chat message with `seq <= ` this. */
|
|
@@ -997,8 +1958,8 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
997
1958
|
return (await this.jsm.streams.info(chatStream(this.space))).state.last_seq;
|
|
998
1959
|
}
|
|
999
1960
|
/** Phase 1 of a join — arm each channel's tail-drop watermark at the current frontier. MUST run
|
|
1000
|
-
* BEFORE the
|
|
1001
|
-
*
|
|
1961
|
+
* BEFORE opening the core subscription so the live tail can never carry a just-joined message
|
|
1962
|
+
* un-watermarked — which would double-emit it (live + backfill).
|
|
1002
1963
|
* Returns the per-channel frontiers for {@link backfillArmed}. */
|
|
1003
1964
|
async armJoin(channels) {
|
|
1004
1965
|
const frontiers = new Map();
|
|
@@ -1123,7 +2084,7 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
1123
2084
|
this.emit("error", e);
|
|
1124
2085
|
return 0;
|
|
1125
2086
|
}
|
|
1126
|
-
const noop = { ack: () => { }, nak: () => { } };
|
|
2087
|
+
const noop = { ack: () => { }, nak: () => { }, durable: false };
|
|
1127
2088
|
let n = 0;
|
|
1128
2089
|
for (const sm of msgs) {
|
|
1129
2090
|
let msg;
|
|
@@ -1238,9 +2199,15 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
1238
2199
|
card: this.card,
|
|
1239
2200
|
status: this.status,
|
|
1240
2201
|
activity: this.activity,
|
|
2202
|
+
attention: this.attentionMode,
|
|
2203
|
+
channelModes: this.channelModes,
|
|
1241
2204
|
ts: Date.now(),
|
|
1242
2205
|
};
|
|
1243
|
-
|
|
2206
|
+
// Wire contract (SPEC §6): an OFFLINE record must not carry the advisory attention fields. Scrub at
|
|
2207
|
+
// the publisher — this covers stop(), setStatus("offline"), and any future offline publish site, so
|
|
2208
|
+
// the raw KV record is compliant, not only the observer-side roster materialization.
|
|
2209
|
+
const record = this.status === "offline" ? this.toOffline(p) : p;
|
|
2210
|
+
await this.kv.put(this.card.id, JSON.stringify(record));
|
|
1244
2211
|
}
|
|
1245
2212
|
async startPresenceWatch() {
|
|
1246
2213
|
if (!this.kv)
|
|
@@ -1305,7 +2272,9 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
1305
2272
|
applyPresence(id, raw) {
|
|
1306
2273
|
const prev = this.roster.get(id);
|
|
1307
2274
|
const stale = Date.now() - raw.ts > this.ttlMs;
|
|
1308
|
-
|
|
2275
|
+
// Any offline materialization (a stale snapshot OR a graceful-leave record) drops the advisory
|
|
2276
|
+
// attention fields — an offline peer must not carry a stale `[focus]`/`locally muted` hint.
|
|
2277
|
+
const p = stale || raw.status === "offline" ? this.toOffline(raw) : raw;
|
|
1309
2278
|
// First time we hear about an already-offline peer (stale snapshot): record quietly.
|
|
1310
2279
|
if (!prev && p.status === "offline") {
|
|
1311
2280
|
this.roster.set(id, p);
|
|
@@ -1318,7 +2287,9 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
1318
2287
|
prev.status !== "offline" &&
|
|
1319
2288
|
p.status !== "offline" &&
|
|
1320
2289
|
prev.status === p.status &&
|
|
1321
|
-
prev.activity === p.activity
|
|
2290
|
+
prev.activity === p.activity &&
|
|
2291
|
+
prev.attention === p.attention &&
|
|
2292
|
+
sameChannelModes(prev.channelModes, p.channelModes)) {
|
|
1322
2293
|
this.roster.set(id, p);
|
|
1323
2294
|
return;
|
|
1324
2295
|
}
|
|
@@ -1331,12 +2302,18 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
1331
2302
|
this.emit("presence", { type, presence: p });
|
|
1332
2303
|
this.emit("roster", this.getRoster());
|
|
1333
2304
|
}
|
|
2305
|
+
/** Materialize an OFFLINE presence record: drop the advisory attention fields. An offline peer must
|
|
2306
|
+
* not show a stale `[focus]` or "locally muted #x" hint — SPEC: attention removed on offline sweep,
|
|
2307
|
+
* channel modes reset on restart. card/activity/ts are kept. */
|
|
2308
|
+
toOffline(p) {
|
|
2309
|
+
return { ...p, status: "offline", attention: undefined, channelModes: undefined };
|
|
2310
|
+
}
|
|
1334
2311
|
/** Mark a known peer offline (on KV delete/purge), keeping it in the roster. */
|
|
1335
2312
|
markOffline(id) {
|
|
1336
2313
|
const prev = this.roster.get(id);
|
|
1337
2314
|
if (!prev || prev.status === "offline")
|
|
1338
2315
|
return;
|
|
1339
|
-
const offline =
|
|
2316
|
+
const offline = this.toOffline(prev);
|
|
1340
2317
|
this.roster.set(id, offline);
|
|
1341
2318
|
this.emit("presence", { type: "offline", presence: offline });
|
|
1342
2319
|
this.emit("roster", this.getRoster());
|
|
@@ -1344,10 +2321,11 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
1344
2321
|
sweep() {
|
|
1345
2322
|
const now = Date.now();
|
|
1346
2323
|
let changed = false;
|
|
1347
|
-
for (const [, p] of this.roster) {
|
|
2324
|
+
for (const [id, p] of this.roster) {
|
|
1348
2325
|
if (p.status !== "offline" && now - p.ts > this.ttlMs) {
|
|
1349
|
-
|
|
1350
|
-
this.
|
|
2326
|
+
const offline = this.toOffline(p);
|
|
2327
|
+
this.roster.set(id, offline);
|
|
2328
|
+
this.emit("presence", { type: "offline", presence: offline });
|
|
1351
2329
|
changed = true;
|
|
1352
2330
|
}
|
|
1353
2331
|
}
|
|
@@ -1355,13 +2333,6 @@ export class CotalEndpoint extends EventEmitter {
|
|
|
1355
2333
|
this.emit("roster", this.getRoster());
|
|
1356
2334
|
}
|
|
1357
2335
|
}
|
|
1358
|
-
/** The id token of a chat-stream durable, or null if it isn't one — the inverse of
|
|
1359
|
-
* `chatDurable` (`chat_<token(id)>`). token() is lossy, so this returns the token, not the
|
|
1360
|
-
* original id; callers match it forward against `token(card.id)`. */
|
|
1361
|
-
function chatDurableToken(durable) {
|
|
1362
|
-
const prefix = "chat_";
|
|
1363
|
-
return durable.startsWith(prefix) ? durable.slice(prefix.length) : null;
|
|
1364
|
-
}
|
|
1365
2336
|
/** Map an authenticated parsed-subject kind to the message class surfaced to "message" listeners.
|
|
1366
2337
|
* Throws on `ctl` (control-plane is request/reply, never a "message") — per repo convention, no
|
|
1367
2338
|
* silent default: an unexpected delivering kind is a bug, not something to swallow. */
|
|
@@ -1377,12 +2348,14 @@ function kindFromParsed(kind) {
|
|
|
1377
2348
|
throw new Error(`cannot derive a message kind from subject kind "${kind}"`);
|
|
1378
2349
|
}
|
|
1379
2350
|
}
|
|
1380
|
-
/**
|
|
1381
|
-
|
|
1382
|
-
|
|
2351
|
+
/** Shallow-equal two per-channel-mode maps (presence dedup): a change must re-emit, so an attention
|
|
2352
|
+
* toggle isn't swallowed as a quiet heartbeat. Absent and empty compare equal. */
|
|
2353
|
+
function sameChannelModes(a, b) {
|
|
2354
|
+
const ak = a ? Object.keys(a) : [];
|
|
2355
|
+
const bk = b ? Object.keys(b) : [];
|
|
2356
|
+
if (ak.length !== bk.length)
|
|
1383
2357
|
return false;
|
|
1384
|
-
|
|
1385
|
-
return b.every((x) => s.has(x));
|
|
2358
|
+
return ak.every((k) => a[k] === b?.[k]);
|
|
1386
2359
|
}
|
|
1387
2360
|
function authOpts(a) {
|
|
1388
2361
|
const tls = a.tls ? {} : undefined;
|