@cotal-ai/core 0.3.2 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/endpoint.js CHANGED
@@ -2,16 +2,21 @@ import { EventEmitter } from "node:events";
2
2
  import { randomUUID } from "node:crypto";
3
3
  import { connect, credsAuthenticator, nanos, AuthorizationError, PermissionViolationError, UserAuthenticationExpiredError, } from "@nats-io/transport-node";
4
4
  import { idFromCreds } from "./identity.js";
5
- import { createSpaceStreams, dmDurableConfig, taskDurableConfig, MAX_MSGS_PER_SUBJECT } from "./streams.js";
5
+ import { assertValidName } from "./resolve.js";
6
+ import { createSpaceStreams, chatDurableConfig, dmDurableConfig, taskDurableConfig, MAX_MSGS_PER_SUBJECT } from "./streams.js";
6
7
  import { jetstream, jetstreamManager, AckPolicy, DeliverPolicy, } from "@nats-io/jetstream";
7
8
  import { Kvm } from "@nats-io/kv";
8
9
  import { openChannelRegistry, effectiveReplay, effectiveReplayWindowMs, readChannelConfig, readChannelDefaults, } from "./channels.js";
9
- import { anycastSubject, CHANNEL_DEFAULTS_KEY, chatStream, chatDurable, chatSubject, collapseFilterSubjects, controlServiceSubject, dmStream, dmDurable, isConcreteChannel, normalizeMentions, parseSubject, presenceBucket, spacePrefix, spaceWildcard, subjectMatches, taskStream, taskDurable, token, unicastSubject, } from "./subjects.js";
10
+ import { anycastSubject, CHANNEL_DEFAULTS_KEY, chatStream, chatDurable, chatHistDurable, chatSubject, collapseFilterSubjects, controlServiceSubject, CONTROL_SELF_SERVICE, dmStream, dmDurable, isConcreteChannel, normalizeMentions, parseSubject, presenceBucket, spacePrefix, spaceWildcard, subjectMatches, taskStream, taskDurable, token, unicastSubject, } from "./subjects.js";
10
11
  export const DEFAULT_SERVER = "nats://127.0.0.1:4222";
11
12
  /** Space joined when none is given on the CLI (the `cotal-<space>` cmux tab, etc.). */
12
13
  export const DEFAULT_SPACE = "main";
13
14
  /**
14
- * Events: "message" (CotalMessage), "presence" (PresenceEvent), "roster" (Presence[]), "error" (Error).
15
+ * Events: "message" (CotalMessage), "presence" (PresenceEvent), "roster" (Presence[]), "error" (Error),
16
+ * "connection" ({ connected: boolean }) — true on every successful (re)bind (initial start, manual
17
+ * reconnect, AND background self-heal), false the moment the connection drops (rebuild null window /
18
+ * terminal close). Lets an in-process agent track connectedness off the endpoint's own (re)binds
19
+ * instead of an imperative flag the self-heal path can't reach.
15
20
  *
16
21
  * Callers MUST attach an "error" listener before `start()`: async faults (incl. NATS
17
22
  * permission denials, surfaced via `watchStatus`) are emitted as "error", and Node throws
@@ -48,6 +53,9 @@ export class CotalEndpoint extends EventEmitter {
48
53
  * a lagging joiner + dedups the backfill overlap). Keyed by the subscription pattern (may be
49
54
  * wildcard), so the drop matches every concrete channel the pattern subsumes. */
50
55
  joinSeq = new Map();
56
+ /** Serializes history reads ({@link collectHistory}): they share the fixed per-instance
57
+ * `chathist_<id>` consumer, so overlapping reads would delete/recreate it under one another. */
58
+ histLock = Promise.resolve();
51
59
  subs = [];
52
60
  streamMsgs = [];
53
61
  heartbeatTimer;
@@ -56,9 +64,27 @@ export class CotalEndpoint extends EventEmitter {
56
64
  status = "idle";
57
65
  activity;
58
66
  stopped = false;
67
+ /** In-flight rebuild (drain+rebind) — serializes manual reconnect, the supervisor's
68
+ * closed(), and reestablishLoop so only ONE rebuild runs at a time (a second trigger
69
+ * coalesces onto the shared promise, never starts a parallel connectAndBind). */
70
+ rebuildPromise;
71
+ /** True only during the null window of a rebuild (this.nc unset) — user-facing ops then
72
+ * throw a "reconnecting" message instead of the misleading "endpoint not started". */
73
+ reconnecting = false;
74
+ /** One reestablishLoop at a time; concurrent triggers coalesce via rebuild(). */
75
+ reestablishing = false;
76
+ /** Interruptible backoff for reestablishLoop — reconnect()/stop() resolves this to retry
77
+ * now instead of awaiting the full retryMs. */
78
+ backoffResolve;
79
+ backoffTimer;
80
+ retryMs = 3000;
59
81
  constructor(opts) {
60
82
  super();
61
83
  this.space = opts.space;
84
+ // A display name is the client-side handle a peer is addressed by; reject the reserved `/`
85
+ // (the future owner/name separator) and surrounding whitespace at the one identity choke
86
+ // point every join/spawn path flows through.
87
+ assertValidName(opts.card.name);
62
88
  // Identity precedence: an explicit card.id, else the creds' identity, else a random
63
89
  // uuid. When both an id and creds are given they MUST name the same nkey — otherwise
64
90
  // the subject sender token wouldn't match the authenticated user and every publish
@@ -87,6 +113,19 @@ export class CotalEndpoint extends EventEmitter {
87
113
  return { id: this.card.id, name: this.card.name, role: this.card.role };
88
114
  }
89
115
  async start() {
116
+ await this.connectAndBind();
117
+ // nats.js auto-reconnects transient drops; when it exhausts its attempts and the
118
+ // connection closes for good, rebuild from scratch so an in-process agent (e.g. the
119
+ // OpenCode plugin) recovers without a host respawn. Armed only after a successful first
120
+ // connect — a first-connect failure throws to the caller's connect-retry loop instead.
121
+ this.superviseConnection();
122
+ }
123
+ /** Open the connection and bind everything that hangs off it: status watch, presence
124
+ * watch + heartbeat, channel registry, and the durable consumers. Re-runnable — a
125
+ * reconnect calls it again after {@link clearConnectionScoped}; every binding is
126
+ * idempotent (durables bind by name, JetStream dedups by msgID, KV opens are idempotent). */
127
+ async connectAndBind() {
128
+ this.clearConnectionScoped();
90
129
  this.nc = await connect({
91
130
  servers: this.servers,
92
131
  name: `cotal:${this.card.name}`,
@@ -134,11 +173,184 @@ export class CotalEndpoint extends EventEmitter {
134
173
  await this.ensureStreams();
135
174
  await this.startConsumers();
136
175
  }
176
+ // Bound and live — covers initial start, manual reconnect, AND background self-heal (every
177
+ // path lands here). The single signal an in-process agent's connected flag tracks.
178
+ this.emit("connection", { connected: true });
179
+ }
180
+ /** Tear down everything {@link connectAndBind} (re)creates, so a rebind can't leak a
181
+ * second heartbeat, double-pump a consumer, or keep stale roster ghosts. Caller-owned
182
+ * subs (tap/serve) are left alone — they aren't rebuilt here. */
183
+ clearConnectionScoped() {
184
+ if (this.heartbeatTimer) {
185
+ clearInterval(this.heartbeatTimer);
186
+ this.heartbeatTimer = undefined;
187
+ }
188
+ if (this.sweepTimer) {
189
+ clearInterval(this.sweepTimer);
190
+ this.sweepTimer = undefined;
191
+ }
192
+ for (const msgs of this.streamMsgs) {
193
+ try {
194
+ msgs.stop();
195
+ }
196
+ catch {
197
+ /* already closed with the connection */
198
+ }
199
+ }
200
+ this.streamMsgs.length = 0;
201
+ this.roster.clear();
202
+ this.joinSeq.clear();
203
+ this.channelConfigs.clear();
204
+ this.channelDefaults = {};
205
+ }
206
+ /** If stop() ran during a rebuild's `await connectAndBind`, the just-bound connection +
207
+ * heartbeat + supervisor would be left live on a stopped endpoint. Tear that fresh
208
+ * connection back down and report it. Reads `this.nc` in its own scope (a bare `this.nc`
209
+ * in doRebuild narrows to `never` via TS inlining connectAndBind's assignment). Returns
210
+ * true iff it tore something down (caller bails out of the rebuild). */
211
+ async tearDownIfStopped() {
212
+ if (!this.stopped)
213
+ return false;
214
+ const nc = this.nc;
215
+ this.clearConnectionScoped();
216
+ try {
217
+ await nc?.drain();
218
+ }
219
+ catch {
220
+ /* already closing */
221
+ }
222
+ this.nc = undefined;
223
+ return true;
224
+ }
225
+ /** Watch for a terminal close (nats.js has exhausted its own reconnect) and rebuild.
226
+ * Our own stop()/drain also resolves closed(), so the `stopped` guard keeps a clean
227
+ * shutdown from re-establishing. The identity guard (`this.nc !== nc`) no-ops a STALE
228
+ * supervisor — one whose connection reconnect()/rebuild already replaced — so only a
229
+ * close of the CURRENT connection triggers a rebuild. The rebuild itself is serialized
230
+ * with the manual path via {@link rebuild}. */
231
+ superviseConnection() {
232
+ const nc = this.nc;
233
+ if (!nc)
234
+ return;
235
+ void nc.closed().then((err) => {
236
+ if (this.stopped)
237
+ return;
238
+ if (this.nc !== nc)
239
+ return; // epoch-stale — a rebuild already swapped this connection
240
+ this.emit("connection", { connected: false }); // dropped — report it before the rebuild kicks in
241
+ this.emit("error", new Error(`mesh connection closed${err ? `: ${err.message}` : ""} — re-establishing`));
242
+ void this.reestablishLoop();
243
+ });
244
+ }
245
+ /** Single serialized rebuild: drain the old connection and rebind via {@link connectAndBind},
246
+ * guarded so concurrent triggers (manual {@link reconnect}, the supervisor's closed(), the
247
+ * retry loop) coalesce onto ONE in-flight rebuild instead of racing two connectAndBinds and
248
+ * leaking a connection. Returns the shared promise; a second caller gets the in-flight one. */
249
+ rebuild() {
250
+ if (this.rebuildPromise)
251
+ return this.rebuildPromise;
252
+ const p = this.doRebuild().finally(() => {
253
+ if (this.rebuildPromise === p)
254
+ this.rebuildPromise = undefined;
255
+ });
256
+ this.rebuildPromise = p;
257
+ return p;
258
+ }
259
+ /** The transition: stop the connection-scoped timers FIRST (so nothing live touches
260
+ * this.nc during the null window), drop the connection refs, drain the old nc, then
261
+ * rebind + re-arm the supervisor on the fresh connection. clearConnectionScoped is
262
+ * idempotent, so connectAndBind's own call here is a noop. */
263
+ async doRebuild() {
264
+ const oldNc = this.nc;
265
+ this.reconnecting = true;
266
+ try {
267
+ this.clearConnectionScoped();
268
+ this.nc = undefined;
269
+ this.js = undefined;
270
+ this.jsm = undefined;
271
+ this.kv = undefined;
272
+ this.channelKv = undefined;
273
+ this.emit("connection", { connected: false }); // null window opened — not live until the rebind below
274
+ try {
275
+ await oldNc?.drain();
276
+ }
277
+ catch {
278
+ /* already closing */
279
+ }
280
+ await this.connectAndBind();
281
+ // stop() may have run during the await — don't leave a live connection + heartbeat +
282
+ // supervisor on a stopped endpoint. (Reads this.nc in its own scope — a bare `this.nc`
283
+ // here in doRebuild narrows to `never` via TS inlining connectAndBind's assignment.)
284
+ if (await this.tearDownIfStopped())
285
+ return;
286
+ this.superviseConnection(); // re-arm on the fresh nc
287
+ }
288
+ finally {
289
+ this.reconnecting = false;
290
+ }
291
+ }
292
+ /** Rebuild with backoff until it sticks or we're stopped. Interruptible: a manual
293
+ * {@link reconnect} kicks the backoff so the next attempt runs immediately instead of
294
+ * awaiting the full retryMs. One loop at a time ({@link reestablishing}); concurrent
295
+ * triggers coalesce via {@link rebuild}. */
296
+ async reestablishLoop() {
297
+ if (this.reestablishing)
298
+ return;
299
+ this.reestablishing = true;
300
+ try {
301
+ while (!this.stopped) {
302
+ try {
303
+ await this.rebuild();
304
+ return; // success — re-armed; the supervisor re-triggers on the next terminal close
305
+ }
306
+ catch (e) {
307
+ if (!this.stopped)
308
+ this.emit("error", e);
309
+ await new Promise((resolve) => {
310
+ this.backoffResolve = resolve;
311
+ this.backoffTimer = setTimeout(resolve, this.retryMs);
312
+ });
313
+ }
314
+ }
315
+ }
316
+ finally {
317
+ this.reestablishing = false;
318
+ }
319
+ }
320
+ /** Cut an in-flight reestablish backoff short so the next attempt runs immediately, and
321
+ * clear its timer so it can't fire later on a stopped/restarted loop. */
322
+ kickBackoff() {
323
+ this.backoffResolve?.();
324
+ if (this.backoffTimer) {
325
+ clearTimeout(this.backoffTimer);
326
+ this.backoffTimer = undefined;
327
+ }
328
+ }
329
+ /** Manual reconnect: tear down the current connection and rebuild, WITHOUT the permanent
330
+ * stop (stopped/stopping stay false). Serialized with the self-heal supervisor via
331
+ * {@link rebuild}, and interruptible — if a backoff is in flight, kick it so the attempt
332
+ * is now, not in retryMs. Throws if stopped. On failure, leaves {@link reestablishLoop}
333
+ * running in the background so the endpoint never stays dead, and rethrows so the caller
334
+ * can report it. */
335
+ async reconnect() {
336
+ if (this.stopped)
337
+ throw new Error("endpoint stopped — cannot reconnect");
338
+ this.kickBackoff();
339
+ try {
340
+ await this.rebuild();
341
+ }
342
+ catch (e) {
343
+ void this.reestablishLoop(); // background retry until success or stop
344
+ throw e;
345
+ }
137
346
  }
138
347
  async stop() {
139
348
  if (this.stopped)
140
349
  return;
141
350
  this.stopped = true;
351
+ // Wake a reestablishLoop sitting in backoff so it sees `stopped` and exits instead of
352
+ // sleeping out retryMs; also clears the timer so it can't fire later.
353
+ this.kickBackoff();
142
354
  if (this.heartbeatTimer)
143
355
  clearInterval(this.heartbeatTimer);
144
356
  if (this.sweepTimer)
@@ -286,7 +498,7 @@ export class CotalEndpoint extends EventEmitter {
286
498
  /** Send a control request to a service and await its reply (client side). */
287
499
  async requestControl(service, req, timeoutMs = 5000) {
288
500
  if (!this.nc)
289
- throw new Error("endpoint not started");
501
+ throw new Error(this.notLiveMsg());
290
502
  const body = { ...req, from: req.from ?? this.ref() };
291
503
  const m = await this.nc.request(controlServiceSubject(this.space, service, this.card.id), JSON.stringify(body), { timeout: timeoutMs });
292
504
  return m.json();
@@ -327,17 +539,20 @@ export class CotalEndpoint extends EventEmitter {
327
539
  */
328
540
  async joinChannel(channel) {
329
541
  if (!this.jsm)
330
- throw new Error("endpoint not started");
542
+ throw new Error(this.notLiveMsg());
331
543
  if (this.channels.includes(channel))
332
544
  return { joined: false, backfilled: 0 };
333
- const next = collapseFilterSubjects([...this.channels, channel].map((ch) => chatSubject(this.space, "*", ch)));
334
545
  // Arm the watermark BEFORE the filter flip (single-delivery: a tail message on the new
335
546
  // channel is then either ≤ frontier → backfill-only or > frontier → tail-only, never both),
336
547
  // and filter BEFORE backfill (gap-safe: backfill-first leaves a window in neither stream).
337
548
  const armed = await this.armJoin([channel]);
338
- await this.jsm.consumers.update(chatStream(this.space), chatDurable(this.card.id), {
339
- filter_subjects: next,
340
- });
549
+ try {
550
+ await this.setChatFilter([...this.channels, channel]);
551
+ }
552
+ catch (e) {
553
+ this.joinSeq.delete(channel); // the flip was rejected (e.g. outside allowSubscribe) — undo the arm
554
+ throw e;
555
+ }
341
556
  this.channels.push(channel);
342
557
  const backfilled = await this.backfillArmed(armed);
343
558
  return { joined: true, backfilled };
@@ -347,26 +562,53 @@ export class CotalEndpoint extends EventEmitter {
347
562
  * leaving). Returns whether anything changed. */
348
563
  async leaveChannel(channel) {
349
564
  if (!this.jsm)
350
- throw new Error("endpoint not started");
565
+ throw new Error(this.notLiveMsg());
351
566
  const i = this.channels.indexOf(channel);
352
567
  if (i < 0)
353
568
  return { left: false };
354
569
  if (this.channels.length === 1)
355
570
  throw new Error(`cannot leave "${channel}" — it is your only channel (an empty filter would subscribe to all)`);
356
571
  const remaining = this.channels.filter((c) => c !== channel);
357
- await this.jsm.consumers.update(chatStream(this.space), chatDurable(this.card.id), {
358
- filter_subjects: collapseFilterSubjects(remaining.map((ch) => chatSubject(this.space, "*", ch))),
359
- });
572
+ await this.setChatFilter(remaining);
360
573
  this.channels.splice(i, 1);
361
574
  this.joinSeq.delete(channel);
362
575
  return { left: true };
363
576
  }
577
+ /** Move the chat live-tail durable to a new channel set. OPEN mode self-serves the
578
+ * `consumers.update` (the agent owns its durable). AUTH mode is bind-only — the agent has no
579
+ * UPDATE grant — so it sends a mediated control request to the manager, which validates the set
580
+ * ⊆ its `allowSubscribe` before moving the filter. Throws clearly when no privileged responder is
581
+ * present: a manager-less standalone auth session is fixed to its boot subscribe set — a
582
+ * documented limitation, not a silent degrade. */
583
+ async setChatFilter(channels) {
584
+ if (!this.jsm)
585
+ throw new Error(this.notLiveMsg());
586
+ if (!this.creds) {
587
+ await this.jsm.consumers.update(chatStream(this.space), chatDurable(this.card.id), {
588
+ filter_subjects: collapseFilterSubjects(channels.map((ch) => chatSubject(this.space, "*", ch))),
589
+ });
590
+ return;
591
+ }
592
+ let reply;
593
+ try {
594
+ reply = await this.requestControl(CONTROL_SELF_SERVICE, { op: "setChannels", args: { channels } });
595
+ }
596
+ catch (e) {
597
+ const msg = e.message;
598
+ if (/no responders/i.test(msg))
599
+ throw new Error("cannot change channels at runtime: no privileged provisioner (manager) is serving the mesh — " +
600
+ "this session is fixed to its boot subscribe set");
601
+ throw e;
602
+ }
603
+ if (!reply.ok)
604
+ throw new Error(reply.error ?? "channel change rejected");
605
+ }
364
606
  /** One coherent channel model for dashboards: every channel that has messages OR a registry
365
607
  * entry (configured-but-empty), each tagged with its {@link ChannelConfig}. Works even on
366
608
  * observer endpoints (no consumers needed). */
367
609
  async listChannels() {
368
610
  if (!this.nc)
369
- throw new Error("endpoint not started");
611
+ throw new Error(this.notLiveMsg());
370
612
  const mgr = await jetstreamManager(this.nc);
371
613
  // Subjects carry the sender (chat.<sender>.<channel>), so collapse across senders: sum
372
614
  // each channel's counts regardless of who published.
@@ -512,9 +754,18 @@ export class CotalEndpoint extends EventEmitter {
512
754
  this.emit("error", e);
513
755
  });
514
756
  }
757
+ /** The error message for a guard that finds the endpoint unbound: "reconnecting" during a
758
+ * rebuild's null window OR an inter-retry backoff (so a concurrent op reports the real
759
+ * reason, not "not started" — `reestablishing` spans the whole retry loop incl. backoff),
760
+ * else "endpoint not started" (genuine pre-start). */
761
+ notLiveMsg() {
762
+ return this.reconnecting || this.reestablishing
763
+ ? "reconnecting — try again shortly"
764
+ : "endpoint not started";
765
+ }
515
766
  async publishMsg(subject, msg) {
516
767
  if (!this.js)
517
- throw new Error("endpoint not started");
768
+ throw new Error(this.notLiveMsg());
518
769
  // msgID = message id → free server-side dedup across JetStream redelivery.
519
770
  await this.js.publish(subject, JSON.stringify(msg), { msgID: msg.id });
520
771
  }
@@ -525,6 +776,29 @@ export class CotalEndpoint extends EventEmitter {
525
776
  throw new Error("endpoint not started");
526
777
  await createSpaceStreams(this.jsm, this.space);
527
778
  }
779
+ /**
780
+ * Privileged: pre-create an agent's bind-only chat live-tail durable (auth mode), filtered to its
781
+ * `subscribe` set, so the agent can BIND it without holding CONSUMER.CREATE/UPDATE on CHAT — its
782
+ * live read can't be self-widened past `allowSubscribe`. The creator sets the filter; the agent
783
+ * never does (mirrors {@link provisionDmInbox}). Idempotent. The caller must be permissive on CHAT.
784
+ */
785
+ async provisionChatDurable(targetId, subscribe) {
786
+ const jsm = await this.manager();
787
+ await jsm.consumers.add(chatStream(this.space), chatDurableConfig(this.space, targetId, subscribe));
788
+ }
789
+ /**
790
+ * Privileged: move an agent's bind-only chat durable to a new channel set — the write half of the
791
+ * mediated join/leave. The manager calls this AFTER validating the set ⊆ the agent's
792
+ * `allowSubscribe`; the agent itself has no UPDATE grant, so this trusted path is the only way its
793
+ * live filter moves. The filter is rebuilt from channel names here (not from agent-supplied
794
+ * subjects) so a caller can't smuggle a hand-built filter.
795
+ */
796
+ async setChatFilterFor(targetId, channels) {
797
+ const jsm = await this.manager();
798
+ await jsm.consumers.update(chatStream(this.space), chatDurable(targetId), {
799
+ filter_subjects: collapseFilterSubjects(channels.map((ch) => chatSubject(this.space, "*", ch))),
800
+ });
801
+ }
528
802
  /**
529
803
  * Privileged: pre-create an agent's DM inbox durable (auth mode), so the agent can BIND
530
804
  * it without holding CONSUMER.CREATE on DM_<space>. The creator sets the filter to
@@ -559,8 +833,6 @@ export class CotalEndpoint extends EventEmitter {
559
833
  if (!this.jsm)
560
834
  throw new Error("endpoint not started");
561
835
  const id = this.card.id;
562
- const ack_wait = nanos(this.ackWaitMs);
563
- const inactive_threshold = nanos(this.inactiveThresholdMs);
564
836
  // Unicast: this instance's private DM inbox. Open mode self-creates; auth mode BINDS a
565
837
  // durable the provisioner pre-created (agents are denied CONSUMER.CREATE on DM_<space>,
566
838
  // since the create-time filter_subject is the attack surface — see provisionDmInbox).
@@ -577,37 +849,53 @@ export class CotalEndpoint extends EventEmitter {
577
849
  if (this.channels.length) {
578
850
  const durable = chatDurable(id);
579
851
  const want = collapseFilterSubjects(this.channels.map((ch) => chatSubject(this.space, "*", ch)));
852
+ // Auth mode: the chat live-tail durable is pre-created BIND-ONLY by the provisioner (the agent
853
+ // is denied CONSUMER.CREATE/UPDATE on CHAT — its filter is the read boundary). Open mode: the
854
+ // agent owns it and self-creates. Either way it is a DeliverPolicy.New tail; per-channel
855
+ // history is the explicit backfill below (the only shape that honors per-channel replay
856
+ // policy given deliver_policy is consumer-wide).
580
857
  const info = await this.consumerInfo(chatStream(this.space), durable);
581
858
  if (!info) {
582
- // Fresh durable: a New tail (history is the explicit backfill below — the only shape
583
- // that honors per-channel policy given deliver_policy is consumer-wide).
584
- await this.jsm.consumers.add(chatStream(this.space), {
585
- durable_name: durable,
586
- filter_subjects: want,
587
- ack_policy: AckPolicy.Explicit,
588
- ack_wait,
589
- deliver_policy: DeliverPolicy.New,
590
- inactive_threshold,
591
- });
859
+ if (this.creds)
860
+ throw new Error(`chat durable ${durable} not pre-created a launcher must call provisionChatDurable ` +
861
+ `(auth mode binds the durable, it never self-creates)`);
862
+ await this.jsm.consumers.add(chatStream(this.space), chatDurableConfig(this.space, id, this.channels, {
863
+ ackWaitMs: this.ackWaitMs,
864
+ inactiveThresholdMs: this.inactiveThresholdMs,
865
+ }));
866
+ }
867
+ // First bind to this durable (open self-create, or an auth pre-create never consumed) ⇒
868
+ // backfill the full subscribe set. A later reconnect (the consumed cursor has advanced)
869
+ // backfills only channels the config GAINED — un-acked live messages auto-redeliver, so a full
870
+ // re-backfill would double up. With pre-create, `info` always exists under auth, so the
871
+ // consumed cursor — not the durable's existence — is what tells first-bind from reconnect.
872
+ //
873
+ // Caveat (best-effort, by design): `consumer_seq > 0` proves the durable has delivered at
874
+ // least once, NOT that the initial backfill completed. A crash between the first delivery and
875
+ // backfillArmed() makes the next bind take the reconnect path and skip the full pre-bind
876
+ // backfill. This is unchanged from the prior self-create path (which keyed on durable
877
+ // existence and had the same gap — and was actually weaker: a crash before any delivery left
878
+ // the durable existing, so it never re-backfilled; consumer_seq still 0 here re-backfills).
879
+ // Reliable FORWARD delivery is the durable's job (un-acked redelivery); pre-bind history is
880
+ // opportunistic. A backfill-completion marker would make it reliable — a deferred follow-up.
881
+ const consumed = (info?.delivered?.consumer_seq ?? 0) > 0;
882
+ if (!consumed) {
592
883
  // Arm the tail-drop watermarks BEFORE pump starts, so the tail can never deliver a
593
- // just-created channel's message un-watermarked (which would double-emit: live + backfill).
884
+ // just-bound channel's message un-watermarked (which would double-emit: live + backfill).
594
885
  const armed = await this.armJoin(this.channels);
595
886
  await this.pump(chatStream(this.space), durable);
596
887
  await this.backfillArmed(armed);
597
888
  }
598
889
  else {
599
- // Rebind: reconcile the durable's filter to the CURRENT config (a config that changed
600
- // between restarts is honored). Channels the config GAINED are backfilled like a fresh
601
- // join; channels it LOST are dropped from the filter. An unchanged config = pure resume,
602
- // empty diff, no re-replay.
890
+ // Reconnect: resume the tail, then backfill any channels the config GAINED since.
603
891
  await this.pump(chatStream(this.space), durable);
604
892
  const haveFilters = info.config.filter_subjects ?? (info.config.filter_subject ? [info.config.filter_subject] : []);
605
- // Channels the config gained = those not already covered by the durable's filters (a
606
- // wildcard already covers its sub-channels). Backfill only those.
607
893
  const gained = this.channels.filter((c) => !haveFilters.some((f) => subjectMatches(f, chatSubject(this.space, "*", c))));
608
- // Arm watermarks for the gained channels BEFORE the filter reconcile flips them on.
609
894
  const armed = gained.length ? await this.armJoin(gained) : undefined;
610
- if (!sameSet(haveFilters, want))
895
+ // Reconcile the durable's filter to the CURRENT config — OPEN MODE ONLY. Auth mode is
896
+ // bind-only (no UPDATE grant): the durable's filter is authoritative, moved solely by the
897
+ // mediated join/leave control op, so the agent never self-reconciles it.
898
+ if (!this.creds && !sameSet(haveFilters, want))
611
899
  await this.jsm.consumers.update(chatStream(this.space), durable, { filter_subjects: want });
612
900
  if (armed)
613
901
  await this.backfillArmed(armed);
@@ -738,73 +1026,122 @@ export class CotalEndpoint extends EventEmitter {
738
1026
  async joinPolicyFresh(channel) {
739
1027
  if (!this.channelKv)
740
1028
  return { replay: effectiveReplay(undefined, undefined) };
1029
+ // A wildcard subscription (`review.>`) has no single registry entry — and `>`/`*` are illegal
1030
+ // KV keys, so a per-channel get would throw. Read only the space defaults for it; concrete
1031
+ // channels still get their per-channel override.
741
1032
  const [cfg, defaults] = await Promise.all([
742
- readChannelConfig(this.channelKv, channel),
1033
+ isConcreteChannel(channel) ? readChannelConfig(this.channelKv, channel) : Promise.resolve(undefined),
743
1034
  readChannelDefaults(this.channelKv),
744
1035
  ]);
745
1036
  return { replay: effectiveReplay(cfg, defaults), windowMs: effectiveReplayWindowMs(cfg, defaults) };
746
1037
  }
747
- /** Read a channel's retained history up to `upToSeq` via JetStream **Direct Get** (a read
748
- * verb — no consumer create, so it rides a read-only grant) and emit each message as a
749
- * `historical` "message" event. `sinceMs` bounds how far back via a native Direct-Get
750
- * `start_time` (now − window); unset ⇒ the full retained window. New messages (`seq > upToSeq`)
751
- * are skipped — the live tail owns them. Pages the batch API; the ack handle is a no-op. */
752
- async backfillChannel(channel, upToSeq, sinceMs) {
753
- if (!this.jsm)
1038
+ /**
1039
+ * Read retained chat history on ONE channel subject through a name-scoped, single-filter
1040
+ * EPHEMERAL pull consumer — the broker-contained replacement for the removed Direct Get. The
1041
+ * create rides `$JS.API.CONSUMER.CREATE.<CHAT>.<chathist_id>.<subject>`, whose trailing filter
1042
+ * token nats-server pins to the request body (JSConsumerCreateFilterSubjectMismatchErr, code
1043
+ * 10131) — so an agent can only ever replay a channel its `allowSubscribe` grants. Single filter
1044
+ * only (plural isn't ACL-constrainable); `AckPolicy.None` + `mem_storage` so it leaves no durable
1045
+ * state, and it is deleted right after. Returns raw messages in stream order from `start`,
1046
+ * stopping once past `untilSeq` (exclusive of it) or after `limit`. The per-instance name means
1047
+ * calls must be serial — every reader here awaits to completion, so they are.
1048
+ */
1049
+ async collectHistory(subject, start, opts = {}) {
1050
+ // Serialize on the per-instance lock: the fixed `chathist_<id>` name means two concurrent reads
1051
+ // (recall + join-backfill + drop-marker can race in-process) would delete/recreate the consumer
1052
+ // under each other and cross-feed results. The chain makes the "serial callers" assumption true.
1053
+ const run = this.histLock.then(() => this.collectHistoryInner(subject, start, opts));
1054
+ this.histLock = run.catch(() => { }); // keep the chain alive on error
1055
+ return run;
1056
+ }
1057
+ async collectHistoryInner(subject, start, opts = {}) {
1058
+ if (!this.jsm || !this.js)
754
1059
  throw new Error("endpoint not started");
755
- const subject = chatSubject(this.space, "*", channel);
756
- const collected = [];
757
- // First page starts by time when a window is set (native), else from seq 1; after that we
758
- // always page by sequence.
759
- const startTime = sinceMs === undefined ? undefined : new Date(Date.now() - sinceMs);
760
- let startSeq = 1;
761
- let first = true;
762
- pages: for (;;) {
763
- let last = 0;
764
- let got = 0;
765
- try {
766
- const query = first && startTime !== undefined
767
- ? { start_time: startTime, next_by_subj: subject, batch: 256 }
768
- : { seq: startSeq, next_by_subj: subject, batch: 256 };
769
- first = false;
770
- const iter = await this.jsm.direct.getBatch(chatStream(this.space), query);
771
- for await (const sm of iter) {
1060
+ const stream = chatStream(this.space);
1061
+ const name = chatHistDurable(this.card.id);
1062
+ const out = [];
1063
+ // Clear any consumer leaked by a crashed prior read before re-creating it with THIS read's
1064
+ // single filter (the read ACL is enforced at create — see the doc above).
1065
+ try {
1066
+ await this.jsm.consumers.delete(stream, name);
1067
+ }
1068
+ catch { /* none — fine */ }
1069
+ await this.jsm.consumers.add(stream, {
1070
+ name,
1071
+ filter_subject: subject,
1072
+ ack_policy: AckPolicy.None,
1073
+ mem_storage: true,
1074
+ inactive_threshold: nanos(30_000),
1075
+ ...("time" in start
1076
+ ? { deliver_policy: DeliverPolicy.StartTime, opt_start_time: start.time.toISOString() }
1077
+ : { deliver_policy: DeliverPolicy.StartSequence, opt_start_seq: start.seq }),
1078
+ });
1079
+ try {
1080
+ const consumer = await this.js.consumers.get(stream, name);
1081
+ let pending = (await consumer.info()).num_pending;
1082
+ while (pending > 0) {
1083
+ const want = Math.min(pending, 256);
1084
+ const iter = await consumer.fetch({ max_messages: want, expires: 5_000 });
1085
+ let got = 0;
1086
+ for await (const m of iter) {
772
1087
  got++;
773
- if (sm.seq > upToSeq)
774
- break pages; // crossed the frontier — the tail owns the rest
775
- last = sm.seq;
776
- let msg;
777
- try {
778
- msg = sm.json();
779
- }
780
- catch {
781
- continue; // skip undecodable
782
- }
783
- // Same authenticity guard as the tail; skip our own echoes in history.
784
- const parsed = parseSubject(sm.subject);
785
- if (!parsed || msg.from?.id !== parsed.sender || msg.from.id === this.card.id)
1088
+ if (opts.untilSeq !== undefined && m.seq > opts.untilSeq)
1089
+ return out; // crossed the frontier
1090
+ // Belt-and-suspenders over the lock: only keep messages on the requested channel subject
1091
+ // (the consumer's filter already bounds this; guards against any stale-consumer edge).
1092
+ if (!subjectMatches(subject, m.subject))
786
1093
  continue;
787
- collected.push({ msg, seq: sm.seq });
1094
+ out.push(m);
1095
+ if (opts.limit !== undefined && out.length >= opts.limit)
1096
+ return out;
788
1097
  }
1098
+ if (got < want)
1099
+ break; // drained early
1100
+ pending -= got;
789
1101
  }
790
- catch (e) {
791
- // Batch Direct Get raises a 404 ("message not found") when no message matches from
792
- // `start` — the normal "no more history" signal (empty channel or last page), not a
793
- // fault. Anything else is real.
794
- if (e.code === 404)
795
- break;
796
- this.emit("error", e);
797
- break;
1102
+ }
1103
+ finally {
1104
+ try {
1105
+ await this.jsm.consumers.delete(stream, name);
798
1106
  }
799
- if (got === 0 || last === 0)
800
- break; // drained
801
- startSeq = last + 1;
1107
+ catch { /* already gone */ }
1108
+ }
1109
+ return out;
1110
+ }
1111
+ /** Read a channel's retained history up to `upToSeq` (the join frontier) and emit each message
1112
+ * as a `historical` "message" event. `sinceMs` bounds how far back via a native consumer
1113
+ * `start_time` (now − window); unset ⇒ the full retained window. New messages (`seq > upToSeq`)
1114
+ * are skipped — the live tail owns them. Reads through the contained {@link collectHistory}. */
1115
+ async backfillChannel(channel, upToSeq, sinceMs) {
1116
+ const subject = chatSubject(this.space, "*", channel);
1117
+ const start = sinceMs === undefined ? { seq: 1 } : { time: new Date(Date.now() - sinceMs) };
1118
+ let msgs;
1119
+ try {
1120
+ msgs = await this.collectHistory(subject, start, { untilSeq: upToSeq });
1121
+ }
1122
+ catch (e) {
1123
+ this.emit("error", e);
1124
+ return 0;
802
1125
  }
803
1126
  const noop = { ack: () => { }, nak: () => { } };
804
- for (const { msg } of collected)
805
- // Backfill only ever pages the chat stream, so the authenticated class is always "channel".
1127
+ let n = 0;
1128
+ for (const sm of msgs) {
1129
+ let msg;
1130
+ try {
1131
+ msg = sm.json();
1132
+ }
1133
+ catch {
1134
+ continue; // skip undecodable
1135
+ }
1136
+ // Same authenticity guard as the tail; skip our own echoes in history.
1137
+ const parsed = parseSubject(sm.subject);
1138
+ if (!parsed || msg.from?.id !== parsed.sender || msg.from.id === this.card.id)
1139
+ continue;
1140
+ // Backfill only ever reads the chat stream, so the authenticated class is always "channel".
806
1141
  this.emit("message", msg, noop, { historical: true, kind: "channel" });
807
- return collected.length;
1142
+ n++;
1143
+ }
1144
+ return n;
808
1145
  }
809
1146
  /**
810
1147
  * Replay-gated pull of a channel's retained ambient from `sinceSeq` (exclusive) forward — the
@@ -815,55 +1152,40 @@ export class CotalEndpoint extends EventEmitter {
815
1152
  *
816
1153
  * Honors the **same** per-channel replay gate as join-backfill ({@link joinPolicyFresh}): a
817
1154
  * `replay=off` channel returns nothing, so `focus` can't become a history bypass for a channel
818
- * that denies replay to everyone else (chat is `allow_direct` with no broker-level ACL, so this
819
- * app gate is the entire boundary).
1155
+ * that denies replay to everyone else (the read ACL bounds *which* channels recall can touch; this
1156
+ * app gate bounds *whether* a permitted channel replays).
820
1157
  */
821
1158
  async recallChannel(channel, sinceSeq) {
822
1159
  if (!this.jsm)
823
- throw new Error("endpoint not started");
1160
+ throw new Error(this.notLiveMsg());
824
1161
  if (!isConcreteChannel(channel))
825
1162
  return { messages: [], dropped: false };
826
1163
  const policy = await this.joinPolicyFresh(channel);
827
1164
  if (!policy.replay)
828
1165
  return { messages: [], dropped: false };
829
1166
  const subject = chatSubject(this.space, "*", channel);
1167
+ let raw;
1168
+ try {
1169
+ raw = await this.collectHistory(subject, { seq: sinceSeq + 1 });
1170
+ }
1171
+ catch (e) {
1172
+ this.emit("error", e);
1173
+ raw = [];
1174
+ }
830
1175
  const collected = [];
831
- let startSeq = sinceSeq + 1;
832
- pages: for (;;) {
833
- let last = 0;
834
- let got = 0;
1176
+ for (const sm of raw) {
1177
+ let msg;
835
1178
  try {
836
- const iter = await this.jsm.direct.getBatch(chatStream(this.space), {
837
- seq: startSeq,
838
- next_by_subj: subject,
839
- batch: 256,
840
- });
841
- for await (const sm of iter) {
842
- got++;
843
- last = sm.seq;
844
- let msg;
845
- try {
846
- msg = sm.json();
847
- }
848
- catch {
849
- continue; // skip undecodable
850
- }
851
- // Same authenticity guard as the tail/backfill; skip our own echoes.
852
- const parsed = parseSubject(sm.subject);
853
- if (!parsed || msg.from?.id !== parsed.sender || msg.from.id === this.card.id)
854
- continue;
855
- collected.push(msg);
856
- }
1179
+ msg = sm.json();
857
1180
  }
858
- catch (e) {
859
- if (e.code === 404)
860
- break; // no more history (empty or last page)
861
- this.emit("error", e);
862
- break;
1181
+ catch {
1182
+ continue; // skip undecodable
863
1183
  }
864
- if (got === 0 || last === 0)
865
- break;
866
- startSeq = last + 1;
1184
+ // Same authenticity guard as the tail/backfill; skip our own echoes.
1185
+ const parsed = parseSubject(sm.subject);
1186
+ if (!parsed || msg.from?.id !== parsed.sender || msg.from.id === this.card.id)
1187
+ continue;
1188
+ collected.push(msg);
867
1189
  }
868
1190
  const dropped = await this.channelDropped(subject, sinceSeq);
869
1191
  return { messages: collected, dropped };
@@ -895,24 +1217,18 @@ export class CotalEndpoint extends EventEmitter {
895
1217
  return oldest !== undefined && oldest > sinceSeq + 1;
896
1218
  }
897
1219
  /** Sequence of the earliest message still retained on a channel subject (any sender), or
898
- * undefined if nothing is retained. One 1-message Direct Get — used for the recall drop marker. */
1220
+ * undefined if nothing is retained. One message through the contained {@link collectHistory} —
1221
+ * used for the recall drop marker. */
899
1222
  async channelOldestSeq(subject) {
900
1223
  if (!this.jsm)
901
1224
  return undefined;
902
1225
  try {
903
- const iter = await this.jsm.direct.getBatch(chatStream(this.space), {
904
- seq: 1,
905
- next_by_subj: subject,
906
- batch: 1,
907
- });
908
- for await (const sm of iter)
909
- return sm.seq;
910
- return undefined;
1226
+ const [first] = await this.collectHistory(subject, { seq: 1 }, { limit: 1 });
1227
+ return first?.seq;
911
1228
  }
912
1229
  catch (e) {
913
- if (e.code !== 404)
914
- this.emit("error", e);
915
- return undefined; // 404 = nothing retained on this subject (normal)
1230
+ this.emit("error", e);
1231
+ return undefined;
916
1232
  }
917
1233
  }
918
1234
  async publishPresence() {
@@ -1088,6 +1404,19 @@ function describeStatusError(err) {
1088
1404
  }
1089
1405
  return err;
1090
1406
  }
1407
+ /** True when a failure is a NATS *permission denial* — the subject is forbidden to this
1408
+ * endpoint's creds — rather than a missing responder or a timeout. The two need opposite
1409
+ * fixes (grant the capability vs. start/await the service), so callers (e.g. a control
1410
+ * request that can't reach the manager) must tell them apart instead of defaulting to
1411
+ * "service down". Unwraps a wrapped `cause` and falls back to the server's error text, since
1412
+ * a denied publish can surface either as the typed error or inside a request rejection. */
1413
+ export function isPermissionDenied(e) {
1414
+ if (e instanceof PermissionViolationError)
1415
+ return true;
1416
+ if (e?.cause instanceof PermissionViolationError)
1417
+ return true;
1418
+ return /permissions?\s+violation/i.test(String(e?.message ?? ""));
1419
+ }
1091
1420
  /** Whether a NATS server is *running* at `servers`. True on a successful connect AND on an
1092
1421
  * auth rejection — an auth error means a server is there, just refusing these creds (so the
1093
1422
  * caller should surface the real auth failure, not a misleading "server down", and `up`