@rivalis/fleet 8.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/module.js ADDED
@@ -0,0 +1,3582 @@
1
// `require` interop shim (looks bundler-generated — TODO confirm). Resolves to the real
// `require` when one is in scope; otherwise to a Proxy around the fallback stub below
// (or the bare stub when `Proxy` is unavailable).
+ var __require = /* @__PURE__ */ ((x) => typeof require !== "undefined" ? require : typeof Proxy !== "undefined" ? new Proxy(x, {
2
// Property reads re-check for `require` at access time before falling back to the stub.
+ get: (a, b) => (typeof require !== "undefined" ? require : a)[b]
3
+ }) : x)(function(x) {
4
// Fallback stub: delegate if `require` exists by call time, otherwise fail with an explicit error.
+ if (typeof require !== "undefined") return require.apply(this, arguments);
5
+ throw Error('Dynamic require of "' + x + '" is not supported');
6
+ });
7
+
8
+ // src/orchestrator/Orchestrator.ts
9
+ import { Broadcast } from "@toolcase/base";
10
+
11
+ // src/util/logger.ts
12
/**
 * Silent logger used whenever a caller supplies none: every level is a
 * method that accepts anything and does nothing.
 */
var NOOP_LOGGER = {
  error() {},
  warning() {},
  info() {},
  debug() {},
  verbose() {},
  log() {}
};
26
+
27
+ // src/orchestrator/Config.ts
28
// Fallbacks applied by resolveConfig when the matching option is omitted.
var DEFAULT_HOST = "0.0.0.0";
var DEFAULT_HEARTBEAT_MS = 5000;
var DEFAULT_COMMAND_TIMEOUT_MS = 10000;

// Key-length thresholds used by enforceSecurityPolicy in production:
// below MIN the policy throws, below WEAK it only logs a warning.
var MIN_KEY_LENGTH = 16;
var WEAK_KEY_LENGTH = 32;
33
/**
 * Normalize a key option (single value or array) into a list of distinct,
 * non-empty strings in first-seen order.
 *
 * @param {unknown} value - A key, an array of keys, or `undefined`.
 * @returns {string[]} Unique non-empty string keys; anything else is dropped.
 */
function normalizeKeys(value) {
  if (value === undefined) {
    return [];
  }
  const candidates = Array.isArray(value) ? value : [value];
  const usable = candidates.filter((key) => typeof key === "string" && key.length > 0);
  // A Set preserves insertion order, so de-duplication keeps the first occurrence.
  return Array.from(new Set(usable));
}
46
/**
 * Validate raw orchestrator options and return the fully-defaulted config.
 *
 * Fixes over the previous version:
 *  - `heartbeatMs` / `commandTimeoutMs` must be finite; `NaN` and `Infinity`
 *    used to slip past the `<= 0` comparison.
 *  - `cors: null` is treated as "not configured" (like every other option,
 *    which falls back through `??`) instead of raising a raw TypeError.
 *
 * @param {object} options - User-supplied options (`port` and `agentKey` are mandatory).
 * @returns {{host: string, port: number, agentKeys: string[], adminKeys: string[],
 *   api: *, heartbeatMs: number, commandTimeoutMs: number,
 *   cors: (false|{origins: Array}), sseQueryAuth: *, trustProxy: *}}
 * @throws {Error} With an `orchestrator config error: …` message on any invalid option.
 */
function resolveConfig(options) {
  if (typeof options !== "object" || options === null) {
    throw new Error("orchestrator config error: options must be an object");
  }
  if (typeof options.port !== "number" || !Number.isInteger(options.port) || options.port < 0 || options.port > 65535) {
    throw new Error(`orchestrator config error: port must be an integer in [0, 65535], got ${String(options.port)}`);
  }

  const agentKeys = normalizeKeys(options.agentKey);
  if (agentKeys.length === 0) {
    throw new Error("orchestrator config error: at least one agentKey is required");
  }

  // The admin key is only mandatory while the HTTP API is enabled (the default).
  const api = options.api ?? true;
  const adminKeys = normalizeKeys(options.adminKey);
  if (api && adminKeys.length === 0) {
    throw new Error("orchestrator config error: adminKey is required when api is enabled");
  }

  const heartbeatMs = options.heartbeatMs ?? DEFAULT_HEARTBEAT_MS;
  // Number.isFinite rejects NaN and ±Infinity, which `<= 0` alone lets through.
  if (typeof heartbeatMs !== "number" || !Number.isFinite(heartbeatMs) || heartbeatMs <= 0) {
    throw new Error("orchestrator config error: heartbeatMs must be a positive number");
  }

  const commandTimeoutMs = options.commandTimeoutMs ?? DEFAULT_COMMAND_TIMEOUT_MS;
  if (typeof commandTimeoutMs !== "number" || !Number.isFinite(commandTimeoutMs) || commandTimeoutMs <= 0) {
    throw new Error("orchestrator config error: commandTimeoutMs must be a positive number");
  }

  // null/undefined/false all mean "CORS disabled"; anything else must carry an origins array.
  const corsOption = options.cors ?? false;
  let cors2 = false;
  if (corsOption !== false) {
    if (!Array.isArray(corsOption.origins)) {
      throw new Error("orchestrator config error: cors.origins must be an array of strings");
    }
    // Copy so later mutation of the caller's array cannot change the resolved config.
    cors2 = { origins: [...corsOption.origins] };
  }

  return {
    host: options.host ?? DEFAULT_HOST,
    port: options.port,
    agentKeys,
    adminKeys,
    api,
    heartbeatMs,
    commandTimeoutMs,
    cors: cors2,
    sseQueryAuth: options.sseQueryAuth ?? false,
    trustProxy: options.trustProxy ?? false
  };
}
90
/**
 * Apply the key-hygiene policy to a resolved config.
 *
 * Outside production the only effect is a warning when an agent key is also an
 * admin key. In production (`context.env === "production"`) that overlap is
 * fatal, any key shorter than MIN_KEY_LENGTH is fatal, and every key shorter
 * than WEAK_KEY_LENGTH produces one warning.
 *
 * @param {{agentKeys: string[], adminKeys: string[]}} config - Resolved config.
 * @param {{env?: string, logger?: {warning: Function}}} [context] - Environment name and logger.
 * @throws {Error} When a production-only rule is violated.
 */
function enforceSecurityPolicy(config, context = {}) {
  const inProduction = context.env === "production";
  const log = context.logger ?? NOOP_LOGGER;

  const adminLookup = new Set(config.adminKeys);
  const sharedKeyExists = config.agentKeys.some((key) => adminLookup.has(key));
  if (sharedKeyExists) {
    const message = "orchestrator security: agentKey and adminKey lists intersect \u2014 one key serving both audiences re-opens the legacy single-token hole (\xA713)";
    if (inProduction) {
      throw new Error(`${message}; refusing to start when NODE_ENV=production`);
    }
    log.warning(message);
  }

  // The length rules below are production-only.
  if (!inProduction) {
    return;
  }

  const everyKey = config.agentKeys.concat(config.adminKeys);

  // Hard floor first, so nothing is logged about weak keys if startup is refused.
  if (everyKey.some((key) => key.length < MIN_KEY_LENGTH)) {
    throw new Error(
      `orchestrator security: a configured key is shorter than ${MIN_KEY_LENGTH} characters \u2014 refusing to start when NODE_ENV=production (\xA713)`
    );
  }

  // One warning per weak key; the key itself is never echoed.
  everyKey
    .filter((key) => key.length < WEAK_KEY_LENGTH)
    .forEach(() => {
      log.warning(
        `orchestrator security: a configured key is shorter than ${WEAK_KEY_LENGTH} characters \u2014 weak; prefer 32+ (\xA713)`
      );
    });
}
122
+
123
+ // src/orchestrator/FleetState.ts
124
+ import { randomBytes } from "crypto";
125
+
126
+ // src/util/canonical.ts
127
+ import { createHash } from "crypto";
128
/**
 * Produce the canonical text form of a value by delegating to `encode`.
 *
 * @param {*} value - Value to serialize.
 * @returns {string} Canonical encoding of `value`.
 */
function canonicalize(value) {
  const canonicalText = encode(value);
  return canonicalText;
}
131
/**
 * Recursively serialize a value to a deterministic JSON-like string.
 *
 * Object keys are emitted in sorted order and members whose value is
 * `undefined`, a function or a symbol are omitted; the same values inside an
 * array become `null`. Non-finite numbers and any unsupported top-level type
 * encode as `null`; bigints encode as their decimal digits.
 *
 * @param {*} value - Value to encode.
 * @returns {string} Deterministic encoding.
 */
function encode(value) {
  if (value === null) {
    return "null";
  }
  switch (typeof value) {
    case "string":
      return JSON.stringify(value);
    case "number":
      return Number.isFinite(value) ? String(value) : "null";
    case "boolean":
      return value ? "true" : "false";
    case "bigint":
      return value.toString();
    case "object":
      break;
    default:
      // undefined, function, symbol
      return "null";
  }

  if (Array.isArray(value)) {
    return "[" + value.map((item) => encodeArrayItem(item)).join(",") + "]";
  }

  const members = [];
  for (const name of Object.keys(value).sort()) {
    const member = value[name];
    if (!isSkippable(member)) {
      members.push(JSON.stringify(name) + ":" + encode(member));
    }
  }
  return "{" + members.join(",") + "}";
}
167
/**
 * Encode one array element. Skippable values keep their slot as `null`, so
 * element positions are preserved.
 *
 * @param {*} item - Array element.
 * @returns {string} `"null"` for skippable values, otherwise `encode(item)`.
 */
function encodeArrayItem(item) {
  if (isSkippable(item)) {
    return "null";
  }
  return encode(item);
}
170
/**
 * Whether a value has no canonical representation: `undefined`, a function,
 * or a symbol.
 *
 * @param {*} value - Value to classify.
 * @returns {boolean} True when the value must be skipped by the encoder.
 */
function isSkippable(value) {
  switch (typeof value) {
    case "undefined":
    case "function":
    case "symbol":
      return true;
    default:
      return false;
  }
}
174
/**
 * 64-bit content hash: the first 8 bytes of the SHA-256 digest of the value's
 * canonical encoding, rendered as 16 lowercase hex characters.
 *
 * @param {*} value - Value to hash.
 * @returns {string} 16-character hex string.
 */
function hash64(value) {
  const hasher = createHash("sha256");
  hasher.update(canonicalize(value));
  // Two hex characters per byte, so 8 bytes are the first 16 characters.
  return hasher.digest("hex").slice(0, 16);
}
178
+
179
+ // src/domain/roomId.ts
180
// Whole-string pattern for a valid room id: 1 to 64 characters drawn from letters, digits, `_` and `-`.
+ var ROOM_ID_PATTERN = /^[A-Za-z0-9_-]{1,64}$/;
181
// Separator namespaceRoomId places between a processUid and an encoded room id.
+ var NAMESPACE_SEPARATOR = "~";
182
// Single-character test for the same charset; encodeRoomId percent-encodes every byte that fails it.
+ var ROOM_ID_CHAR = /[A-Za-z0-9_-]/;
183
/**
 * Check an id against ROOM_ID_PATTERN.
 *
 * @param {string} id - Candidate room id.
 * @returns {boolean} True when the id matches the pattern.
 */
function isValidRoomId(id) {
  const conforms = ROOM_ID_PATTERN.test(id);
  return conforms;
}
186
/**
 * Map an arbitrary room id into the public id alphabet.
 *
 * Ids that already satisfy `isValidRoomId` are returned untouched. Otherwise
 * the id is walked byte-by-byte in UTF-8: bytes inside the room-id charset are
 * kept, every other byte becomes `%XX` (uppercase hex).
 *
 * @param {string} id - Raw room id.
 * @returns {string} The id itself, or its percent-encoded form.
 */
function encodeRoomId(id) {
  if (isValidRoomId(id)) {
    return id;
  }
  const pieces = Array.from(Buffer.from(id, "utf8"), (byte) => {
    const ch = String.fromCharCode(byte);
    if (ROOM_ID_CHAR.test(ch)) {
      return ch;
    }
    return "%" + byte.toString(16).toUpperCase().padStart(2, "0");
  });
  return pieces.join("");
}
197
/**
 * Build a namespaced room id: `<processUid><NAMESPACE_SEPARATOR><encodedRoomId>`.
 *
 * @param {string} processUid - Process uid used as the namespace prefix.
 * @param {string} encodedRoomId - Already-encoded room id.
 * @returns {string} The namespaced id.
 */
function namespaceRoomId(processUid, encodedRoomId) {
  const prefix = processUid + NAMESPACE_SEPARATOR;
  return prefix + encodedRoomId;
}
200
+
201
+ // src/domain/roomCreate.ts
202
// Field rules for a room-create request. The validator that consumes this shape is not
// visible here — TODO confirm the exact semantics of `required`, `min` and `pattern`.
+ var roomCreateSchema = {
203
// Room type: the only field marked required (`min: 1` presumably means non-empty — verify).
+ type: { type: "string", required: true, min: 1 },
204
// Explicit room id, constrained to the source text of ROOM_ID_PATTERN.
+ roomId: { type: "string", pattern: ROOM_ID_PATTERN.source },
205
// Placement options, only constrained to be an object at this level.
+ placement: { type: "object" }
206
+ };
207
+
208
+ // src/domain/errors.ts
209
+ import { EndpointError } from "@toolcase/node";
210
// Fleet error code → HTTP status table; FleetError looks the status up here by code.
+ var CODE_TO_STATUS = {
211
// 4xx: the request itself is invalid, unauthorized, or targets something missing.
+ VALIDATION: 400,
212
+ UNAUTHORIZED: 401,
213
+ INSTANCE_NOT_FOUND: 404,
214
+ ROOM_NOT_FOUND: 404,
215
// 409: the request conflicts with current fleet state.
+ NO_CANDIDATE: 409,
216
+ ROOM_EXISTS: 409,
217
+ INSTANCE_DRAINING: 409,
218
+ PAYLOAD_TOO_LARGE: 413,
219
// 429: a limit or throttle was hit.
+ INSTANCE_BUSY: 429,
220
+ AUTH_THROTTLED: 429,
221
+ SSE_LIMIT: 429,
222
// 502/504: the command did not complete on the target instance.
+ COMMAND_FAILED: 502,
223
+ INSTANCE_DISCONNECTED: 502,
224
+ COMMAND_TIMEOUT: 504
225
+ };
226
/**
 * Coded fleet error. The HTTP status handed to `EndpointError` is looked up
 * from CODE_TO_STATUS by `code`.
 *
 * Fix: a code that is not an own key of CODE_TO_STATUS (a typo from a JS
 * caller, or an inherited name such as "constructor") used to pass
 * `undefined` or a function as the status. Such codes now map to 500 so the
 * error always carries a numeric status.
 *
 * @param {string} code - One of the CODE_TO_STATUS keys.
 * @param {string} message - Human-readable description.
 */
var FleetError = class extends EndpointError {
  constructor(code, message) {
    // Own-property check so inherited Object.prototype members never count as codes.
    const isKnownCode = Object.prototype.hasOwnProperty.call(CODE_TO_STATUS, code);
    super(isKnownCode ? CODE_TO_STATUS[code] : 500, code, message);
    this.name = "FleetError";
  }
};
232
+
233
+ // src/orchestrator/FleetState.ts
234
+ var FleetState = class {
235
+ logger;
236
+ random;
237
+ /** Read model, keyed by connection-scoped `instanceId` (a reconnect is a new key). */
238
+ records = /* @__PURE__ */ new Map();
239
+ /** Live capacity reservations: token → instanceId. */
240
+ reservations = /* @__PURE__ */ new Map();
241
+ /** Reserved room slots per instance, derived from `reservations` for O(1) headroom checks. */
242
+ reservedByInstance = /* @__PURE__ */ new Map();
243
+ reservationSeq = 0;
244
+ /** Room ids reserved by in-flight creates (§11) — held until ack/timeout/rejection. */
245
+ reservedRoomIds = /* @__PURE__ */ new Set();
246
+ /**
247
+ * Room ids whose create has settled (acked OK or timed out) but whose room has
248
+ * not yet appeared in an applied snapshot from the owning instance (task 003).
249
+ * The id reservation is held *past* the command settle: releasing it on ack/timeout
250
+ * would free the id for up to one `heartbeatMs` before the room reconciles into the
251
+ * read model — the window in which the §10 retry-after-504 (or an immediate
252
+ * re-create) re-reserves the id and double-creates on a *different* instance, the
253
+ * exact cross-instance duplicate §11 exists to prevent. Keyed `roomId → owning
254
+ * instanceId`; cleared when the owning instance's next snapshot/poll reconciles
255
+ * (the read model takes over) or it is evicted. Held entries count toward both
256
+ * id-uniqueness ({@link isRoomIdTaken}) and `maxRooms` headroom ({@link hasHeadroom}).
257
+ */
258
+ pendingRoomIds = /* @__PURE__ */ new Map();
259
+ /** Pending-visibility room count per instance, for O(1) `maxRooms` headroom (task 003). */
260
+ pendingByInstance = /* @__PURE__ */ new Map();
261
+ /** Monotonic join counter — assigns each instance its tie-break order (§11). */
262
+ joinCounter = 0;
263
+ /**
264
+ * Instances the orchestrator has marked stale (wedged: connected but silent
265
+ * past 2×heartbeat — §7). Liveness bookkeeping, not snapshot-derived semantic
266
+ * state: it is **excluded from `stateHash`** (like `lastSyncAt`) but **excludes
267
+ * the instance from auto-placement**, so a wedged-yet-least-loaded node cannot
268
+ * keep winning placement until it is evicted at 3×heartbeat.
269
+ */
270
+ staleInstances = /* @__PURE__ */ new Set();
271
+ /**
272
+ * Agent-acked-but-not-yet-snapshotted status, kept for PLACEMENT candidacy only
273
+ * (task 004). On a `drain`/`undrain` ack the agent has already flipped its
274
+ * agent-owned status (§7), but the read-model `status` only catches up at the
275
+ * instance's next poll reply — up to one `heartbeatMs` later. Until then
276
+ * `place()` would still see the stale value and keep selecting a just-drained
277
+ * node (or keep excluding a just-undrained one). Like {@link staleInstances},
278
+ * this is a placement-only override: it is **excluded from `stateHash`** and the
279
+ * read model, and it **never writes the agent-owned `status`** (§7 status
280
+ * ownership stays intact) — it only shifts what {@link place} treats as the
281
+ * instance's effective status. Keyed `instanceId → effective status`; cleared the
282
+ * moment a snapshot/poll reconciles the matching status into the read model (the
283
+ * override has done its job) or the instance is removed.
284
+ */
285
+ pendingStatus = /* @__PURE__ */ new Map();
286
+ /**
287
+ * Memoized id-resolution pass ({@link resolve}) and {@link computeStateHash}
288
+ * result. The resolution is O(rooms) — flatten every room, group by base id,
289
+ * sort the collision buckets, build the public-id index, clone every instance —
290
+ * and was previously rebuilt on **every** read-model query (`stats`/`instances`/
291
+ * `rooms`/`getRoom`/…); one `GET /v1/stats` alone resolves ≥2×. Both are now
292
+ * computed lazily and held until the next SEMANTIC mutation.
293
+ *
294
+ * Invalidated by exactly the two mutations that change semantic state:
295
+ * {@link applySnapshot} (when it actually applies) and {@link removeInstance}.
296
+ * {@link touch} (advances `lastSyncAt`) and {@link setStale} are non-semantic —
297
+ * both are excluded from `stateHash` (§6) and from the resolution — so neither
298
+ * invalidates; `touch` instead keeps the cached `InstanceInfo.lastSyncAt` in step
299
+ * in place (see below). `null` ⇒ dirty, rebuild on next read.
300
+ *
301
+ * Read-only contract: the cached `InstanceInfo` / `RoomInfo` objects are now
302
+ * SHARED across callers and across queries (a query no longer clones afresh).
303
+ * They must be treated as immutable by consumers; the only sanctioned in-place
304
+ * write is `touch`'s `lastSyncAt` refresh, which is liveness bookkeeping outside
305
+ * both the resolution and the hash. The `instances`/`rooms`/`findRooms` getters
306
+ * still hand back a fresh array container so a caller's `sort()`/`push()` cannot
307
+ * corrupt the memo — only the element objects are shared.
308
+ */
309
+ resolvedView = null;
310
+ cachedStateHash = null;
311
+ constructor(options = {}) {
312
+ this.logger = options.logger ?? NOOP_LOGGER;
313
+ this.random = options.random ?? Math.random;
314
+ }
315
+ // -----------------------------------------------------------------------
316
+ // Read model mutation (driven by the fleet room — task 009)
317
+ // -----------------------------------------------------------------------
318
+ /**
319
+ * Apply a validated full `fleet/state` snapshot to the read model. Returns `true`
320
+ * when applied, `false` when dropped as an out-of-order/duplicate frame.
321
+ *
322
+ * `seq` is per-connection monotonic (§7); a frame whose `seq` does not
323
+ * strictly exceed the last applied one is **dropped, never applied** — this
324
+ * turns a hypothetical agent-side send-queue bug into a lost frame instead of
325
+ * read-model corruption (§7, §14). Field validation (§13) happens upstream;
326
+ * this method trusts the payload's shape.
327
+ */
328
+ applySnapshot(instanceId, payload, lastSyncAt) {
329
+ const existing = this.records.get(instanceId);
330
+ if (existing !== void 0 && payload.seq <= existing.lastSeq) {
331
+ this.logger.warning(
332
+ `fleet: dropped out-of-order snapshot from instance=${instanceId} (seq=${payload.seq} <= last=${existing.lastSeq})`
333
+ );
334
+ return false;
335
+ }
336
+ const info = buildInstanceInfo(instanceId, payload, lastSyncAt);
337
+ const joinSeq = existing?.joinSeq ?? ++this.joinCounter;
338
+ this.records.set(instanceId, { info, lastSeq: payload.seq, lastHash: payload.hash, joinSeq });
339
+ this.invalidate();
340
+ this.clearPendingVisibility(instanceId);
341
+ this.reconcilePendingStatus(instanceId);
342
+ return true;
343
+ }
344
+ /**
345
+ * Bump an instance's `lastSyncAt` without touching semantic state (used on
346
+ * a hash-only `fleet/state` reply). Deliberately does **not** affect `stateHash` — liveness
347
+ * bookkeeping is excluded so a quiet fleet still produces ETag 304s (§6, §10).
348
+ */
349
+ touch(instanceId, lastSyncAt) {
350
+ const record = this.records.get(instanceId);
351
+ if (record === void 0) {
352
+ return;
353
+ }
354
+ record.info.lastSyncAt = lastSyncAt;
355
+ this.clearPendingVisibility(instanceId);
356
+ this.reconcilePendingStatus(instanceId);
357
+ const cached = this.resolvedView?.byId.get(instanceId);
358
+ if (cached !== void 0) {
359
+ cached.lastSyncAt = lastSyncAt;
360
+ }
361
+ }
362
+ /** Remove an instance from the read model (socket close or eviction, §7). */
363
+ removeInstance(instanceId) {
364
+ const record = this.records.get(instanceId);
365
+ if (record === void 0) {
366
+ return null;
367
+ }
368
+ this.records.delete(instanceId);
369
+ this.staleInstances.delete(instanceId);
370
+ this.pendingStatus.delete(instanceId);
371
+ this.clearPendingVisibility(instanceId);
372
+ this.invalidate();
373
+ return record.info;
374
+ }
375
+ /**
376
+ * Mark/unmark an instance stale (orchestrator liveness — §7). A stale instance
377
+ * stays in the read model and the `stateHash` (so dashboards keep seeing it
378
+ * until eviction) but is dropped from auto-placement candidacy. Cleared
379
+ * automatically on {@link removeInstance}.
380
+ */
381
+ setStale(instanceId, stale) {
382
+ if (stale) {
383
+ this.staleInstances.add(instanceId);
384
+ } else {
385
+ this.staleInstances.delete(instanceId);
386
+ }
387
+ }
388
+ /**
389
+ * Record an agent-acked-but-not-yet-snapshotted status for PLACEMENT only
390
+ * (task 004) — called on a `drain`/`undrain` ack, where the agent has already
391
+ * flipped its status (§7) but the read model lags by up to one poll. `place()`
392
+ * reads this through {@link effectiveStatus} so candidacy converges at ack time
393
+ * (`drain` excludes the node, `undrain` re-includes it) instead of one poll
394
+ * interval later. Never writes the agent-owned read-model `status` and is absent
395
+ * from `stateHash`, so §7 status ownership and the §10 ETag are untouched. The
396
+ * override clears itself once a snapshot reconciles the matching status (see
397
+ * {@link reconcilePendingStatus}). No-op on an unknown instance — there is nothing
398
+ * to place onto, and a later join starts clean.
399
+ */
400
+ setPendingStatus(instanceId, status) {
401
+ if (!this.records.has(instanceId)) {
402
+ return;
403
+ }
404
+ this.pendingStatus.set(instanceId, status);
405
+ }
406
+ /** Hash of the last applied snapshot for an instance (sent as the poll `knownHash` for dedup, §7). */
407
+ lastHashOf(instanceId) {
408
+ return this.records.get(instanceId)?.lastHash ?? null;
409
+ }
410
+ // -----------------------------------------------------------------------
411
+ // Read model queries (§9)
412
+ // -----------------------------------------------------------------------
413
+ get instances() {
414
+ return [...this.resolve().instances];
415
+ }
416
+ get rooms() {
417
+ const rooms = [];
418
+ for (const instance of this.resolve().instances) {
419
+ rooms.push(...instance.rooms);
420
+ }
421
+ return rooms;
422
+ }
423
+ get stats() {
424
+ const instances = this.resolve().instances;
425
+ let connections = 0;
426
+ let rooms = 0;
427
+ const roomTypes = /* @__PURE__ */ new Set();
428
+ for (const instance of instances) {
429
+ connections += instance.connections;
430
+ rooms += instance.rooms.length;
431
+ for (const type of instance.roomTypes) {
432
+ roomTypes.add(type);
433
+ }
434
+ }
435
+ if (this.cachedStateHash === null) {
436
+ this.cachedStateHash = this.computeStateHash(instances);
437
+ this.logger.debug("fleet: computed semantic state hash");
438
+ }
439
+ return {
440
+ instances: instances.length,
441
+ rooms,
442
+ connections,
443
+ roomTypes: [...roomTypes].sort(),
444
+ stateHash: this.cachedStateHash
445
+ };
446
+ }
447
+ getInstance(id) {
448
+ return this.resolve().byId.get(id) ?? null;
449
+ }
450
+ /**
451
+ * Resolve an instance by its stable `processUid` (§6 pinning) to the **most
452
+ * recent connection** — the record with the highest `joinSeq` (task 011). During
453
+ * a reconnect overlap two records share a `processUid` (the live new connection
454
+ * plus the old wedged one not yet evicted, up to 3 poll intervals); `processUid`
455
+ * is the documented *stable* handle across reconnects, so it must resolve to the
456
+ * live connection. First-match (map insertion order) would pick the OLDEST — the
457
+ * dead connection in exactly the scenario `processUid` pinning exists for.
458
+ */
459
+ getInstanceByProcessUid(processUid) {
460
+ const record = this.latestRecordByProcessUid(processUid);
461
+ return record === null ? null : this.resolve().byId.get(record.info.id) ?? null;
462
+ }
463
+ /** Look up a room by its PUBLIC id (canonical, namespaced, or percent-encoded — §11). */
464
+ getRoom(roomId) {
465
+ return this.resolve().byPublicId.get(roomId)?.room ?? null;
466
+ }
467
+ /**
468
+ * Map a public room id (possibly namespaced or percent-encoded) back to its
469
+ * owning instance and the RAW id the agent knows it by — what a `fleet/cmd`
470
+ * `destroy` must carry, since the agent never sees the public id (§11). Returns
471
+ * null when no room has that public id.
472
+ */
473
+ resolveRoom(roomId) {
474
+ const locator = this.resolve().byPublicId.get(roomId);
475
+ if (locator === void 0) {
476
+ return null;
477
+ }
478
+ return { instanceId: locator.instanceId, rawRoomId: locator.rawRoomId };
479
+ }
480
+ /** Rooms cluster-wide, filtered by type / owning instance / owning-instance labels (§9). */
481
+ findRooms(filter = {}) {
482
+ const result = [];
483
+ for (const instance of this.resolve().instances) {
484
+ if (filter.instanceId !== void 0 && instance.id !== filter.instanceId) {
485
+ continue;
486
+ }
487
+ if (filter.labels !== void 0 && !matchesLabels(instance.labels, filter.labels)) {
488
+ continue;
489
+ }
490
+ for (const room of instance.rooms) {
491
+ if (filter.type !== void 0 && room.type !== filter.type) {
492
+ continue;
493
+ }
494
+ result.push(room);
495
+ }
496
+ }
497
+ return result;
498
+ }
499
+ // -----------------------------------------------------------------------
500
+ // Placement (§9)
501
+ // -----------------------------------------------------------------------
502
+ /**
503
+ * Select an instance for a new room and reserve a capacity slot on it,
504
+ * atomically (§9). Throws a coded {@link FleetError} on validation /
505
+ * no-candidate / draining-pin. The reservation must be released by the
506
+ * caller on ack, timeout, or rejection.
507
+ */
508
+ place(request) {
509
+ if (request.instanceId !== void 0 && request.processUid !== void 0) {
510
+ throw new FleetError("VALIDATION", "specify at most one of placement.instanceId or placement.processUid");
511
+ }
512
+ if (request.instanceId !== void 0 || request.processUid !== void 0) {
513
+ const instance2 = request.instanceId !== void 0 ? this.rawInstanceById(request.instanceId) : this.rawInstanceByProcessUid(request.processUid);
514
+ if (instance2 === null) {
515
+ const which = request.instanceId !== void 0 ? `instanceId=${request.instanceId}` : `processUid=${request.processUid}`;
516
+ throw new FleetError("INSTANCE_NOT_FOUND", `no instance matches ${which}`);
517
+ }
518
+ if (this.effectiveStatus(instance2) === "draining" && request.force !== true) {
519
+ throw new FleetError(
520
+ "INSTANCE_DRAINING",
521
+ `instance ${instance2.id} is draining; pin requires force: true`
522
+ );
523
+ }
524
+ if (this.staleInstances.has(instance2.id) && request.force !== true) {
525
+ throw new FleetError(
526
+ "INSTANCE_DISCONNECTED",
527
+ `instance ${instance2.id} is stale (missed poll replies); pin requires force: true`
528
+ );
529
+ }
530
+ if (!instance2.autoCreate) {
531
+ throw new FleetError("NO_CANDIDATE", `instance ${instance2.id} has autoCreate disabled`);
532
+ }
533
+ return { instance: instance2, reservation: this.reserve(instance2.id) };
534
+ }
535
+ const candidates = this.rawInstances().filter(
536
+ (instance2) => this.effectiveStatus(instance2) === "active" && !this.staleInstances.has(instance2.id) && instance2.autoCreate === true && instance2.roomTypes.includes(request.type) && (request.labels === void 0 || matchesLabels(instance2.labels, request.labels)) && this.hasHeadroom(instance2)
537
+ );
538
+ if (candidates.length === 0) {
539
+ throw new FleetError("NO_CANDIDATE", `no active instance can host room type "${request.type}"`);
540
+ }
541
+ const instance = this.pick(candidates, request.strategy ?? "least-loaded");
542
+ return { instance, reservation: this.reserve(instance.id) };
543
+ }
544
+ /** Release a capacity reservation (on ack, timeout, or rejection — §9). Idempotent. */
545
+ release(reservation) {
546
+ if (!this.reservations.delete(reservation.id)) {
547
+ return;
548
+ }
549
+ const count = this.reservedByInstance.get(reservation.instanceId) ?? 0;
550
+ if (count <= 1) {
551
+ this.reservedByInstance.delete(reservation.instanceId);
552
+ } else {
553
+ this.reservedByInstance.set(reservation.instanceId, count - 1);
554
+ }
555
+ }
556
+ /** Reserved (in-flight) room slots currently held against an instance. */
557
+ reservedRooms(instanceId) {
558
+ return this.reservedByInstance.get(instanceId) ?? 0;
559
+ }
560
+ // -----------------------------------------------------------------------
561
+ // Room-id uniqueness & reservation (§11)
562
+ // -----------------------------------------------------------------------
563
+ /**
564
+ * Validate, uniqueness-check, and reserve a room id for an in-flight create
565
+ * (§11). When `roomId` is omitted a collision-free `r_<id>` within the charset
566
+ * is generated. Throws {@link FleetError} `VALIDATION` (explicit id outside the
567
+ * charset) or `ROOM_EXISTS` (id already in the fleet or already reserved). The
568
+ * reservation closes the race window: two concurrent creates with the same
569
+ * explicit id cannot both pass — exactly one reserves, the rest fail fast. The
570
+ * caller must `releaseRoomId` on ack, timeout, or rejection.
571
+ */
572
+ reserveRoomId(roomId) {
573
+ if (roomId === void 0) {
574
+ const generated = this.generateFreeRoomId();
575
+ this.reservedRoomIds.add(generated);
576
+ return { roomId: generated };
577
+ }
578
+ if (!isValidRoomId(roomId)) {
579
+ throw new FleetError("VALIDATION", `roomId "${roomId}" must match ${ROOM_ID_PATTERN.source}`);
580
+ }
581
+ if (this.isRoomIdTaken(roomId)) {
582
+ throw new FleetError("ROOM_EXISTS", `room id "${roomId}" already exists or is reserved`);
583
+ }
584
+ this.reservedRoomIds.add(roomId);
585
+ return { roomId };
586
+ }
587
+ /** Release a room-id reservation (on ack, timeout, or rejection — §11). Idempotent. */
588
+ releaseRoomId(reservation) {
589
+ this.reservedRoomIds.delete(reservation.roomId);
590
+ }
591
+ /**
592
+ * Transition a create's reservations from *in-flight* to *pending-visibility*
593
+ * (task 003) — called by the command engine when a create **acks OK or times
594
+ * out**, instead of releasing. The room id stays reserved and one `maxRooms` slot
595
+ * stays counted until the owning instance's next snapshot/poll reconciles the room
596
+ * into the read model (or it is evicted). This closes the §11 window where a
597
+ * `504`-then-retry (§10) or an ack-then-immediate re-create would re-reserve the id
598
+ * after the command settled but before the room was visible, and double-create it on
599
+ * another instance. The original capacity reservation token is released and both
600
+ * holds collapse into one pending-visibility entry (still one id, one room slot).
601
+ */
602
+ holdUntilVisible(roomIdReservation, reservation) {
603
+ this.release(reservation);
604
+ this.reservedRoomIds.delete(roomIdReservation.roomId);
605
+ if (this.pendingRoomIds.has(roomIdReservation.roomId)) {
606
+ return;
607
+ }
608
+ this.pendingRoomIds.set(roomIdReservation.roomId, reservation.instanceId);
609
+ this.pendingByInstance.set(
610
+ reservation.instanceId,
611
+ (this.pendingByInstance.get(reservation.instanceId) ?? 0) + 1
612
+ );
613
+ }
614
+ /** Acked-but-not-yet-visible room slots held against an instance (task 003). */
615
+ pendingRooms(instanceId) {
616
+ return this.pendingByInstance.get(instanceId) ?? 0;
617
+ }
618
+ // -----------------------------------------------------------------------
619
+ // Internals
620
+ // -----------------------------------------------------------------------
621
+ /**
622
+ * A public id is taken when an in-flight reservation holds it, a settled-but-not-
623
+ * yet-visible create holds it (task 003), or a live room already uses it.
624
+ */
625
+ isRoomIdTaken(roomId) {
626
+ return this.reservedRoomIds.has(roomId) || this.pendingRoomIds.has(roomId) || this.getRoom(roomId) !== null;
627
+ }
628
+ /**
629
+ * Clear every pending-visibility hold for an instance (task 003) — called when the
630
+ * instance's snapshot/poll reconciles its room set (the read model now holds
631
+ * whatever rooms truly exist) or when it is evicted (its rooms vanish). Either way
632
+ * the in-flight hold has done its job: a present room is taken via the read model,
633
+ * an absent one is genuinely free. Idempotent.
634
+ */
635
+ clearPendingVisibility(instanceId) {
636
+ if (this.pendingByInstance.get(instanceId) === void 0) {
637
+ return;
638
+ }
639
+ for (const [roomId, owner] of [...this.pendingRoomIds]) {
640
+ if (owner === instanceId) {
641
+ this.pendingRoomIds.delete(roomId);
642
+ }
643
+ }
644
+ this.pendingByInstance.delete(instanceId);
645
+ }
646
+ /**
647
+ * The status `place()` should treat the instance as having (task 004): the
648
+ * pending placement override when one is held, else the snapshot-derived
649
+ * read-model `status`. The override exists only between a `drain`/`undrain` ack
650
+ * and the snapshot that confirms it.
651
+ */
652
+ effectiveStatus(instance) {
653
+ return this.pendingStatus.get(instance.id) ?? instance.status;
654
+ }
655
+ /**
656
+ * Drop the placement override once the read model has caught up (task 004) — i.e.
657
+ * the last-applied snapshot's status now equals the pending value. Called on every
658
+ * snapshot apply and hash-only poll reply. Idempotent; no-op when no override is held.
659
+ */
660
+ reconcilePendingStatus(instanceId) {
661
+ const pending = this.pendingStatus.get(instanceId);
662
+ if (pending !== void 0 && this.records.get(instanceId)?.info.status === pending) {
663
+ this.pendingStatus.delete(instanceId);
664
+ }
665
+ }
666
+ /** Generate a `r_<id>` not currently reserved or in use; near-certain on the first try. */
667
+ generateFreeRoomId() {
668
+ for (let attempt = 0; attempt < 1e3; attempt++) {
669
+ const candidate = generateRoomId();
670
+ if (!this.isRoomIdTaken(candidate)) {
671
+ return candidate;
672
+ }
673
+ }
674
+ throw new FleetError("ROOM_EXISTS", "could not generate a unique room id after 1000 attempts");
675
+ }
676
+ /** Raw read-model rows (raw room ids) — the placement candidate source. */
677
+ rawInstances() {
678
+ return [...this.records.values()].map((record) => record.info);
679
+ }
680
+ rawInstanceById(id) {
681
+ return this.records.get(id)?.info ?? null;
682
+ }
683
+ rawInstanceByProcessUid(processUid) {
684
+ return this.latestRecordByProcessUid(processUid)?.info ?? null;
685
+ }
686
+ /**
687
+ * The record for `processUid` with the highest `joinSeq` — the most recent
688
+ * connection (task 011). Shared by {@link getInstanceByProcessUid} (read API)
689
+ * and the pinned {@link place} path so both resolve a reconnect-overlapped
690
+ * `processUid` to the live connection, never the wedged old one.
691
+ */
692
+ latestRecordByProcessUid(processUid) {
693
+ let latest = null;
694
+ for (const record of this.records.values()) {
695
+ if (record.info.processUid === processUid && (latest === null || record.joinSeq > latest.joinSeq)) {
696
+ latest = record;
697
+ }
698
+ }
699
+ return latest;
700
+ }
701
+ /**
702
+ * Resolve raw agent-reported room ids into the fleet-unique PUBLIC id space
703
+ * (§11). Pure function of the current read model — derivable from snapshots
704
+ * alone (§3), so it survives an orchestrator restart. Rules:
705
+ * - Local ids outside the charset are percent-encoded ({@link encodeRoomId}).
706
+ * - When several rooms map to the same base id, exactly one keeps it: a
707
+ * `fleet` room beats a `local` one, then the earliest joiner wins, then the
708
+ * lower instance id (deterministic — never map-iteration order). The losers
709
+ * surface namespaced as `<processUid>~<base>`, flagged `local` per their own
710
+ * origin. Two `fleet` rooms colliding can only happen across a restart, so
711
+ * that case is logged naming both instances (post-restart tie-break, §11).
712
+ */
713
+ resolve() {
714
+ if (this.resolvedView !== null) {
715
+ return this.resolvedView;
716
+ }
717
+ const entries = [];
718
+ for (const record of this.records.values()) {
719
+ for (const room of record.info.rooms) {
720
+ entries.push({
721
+ instanceId: record.info.id,
722
+ processUid: record.info.processUid,
723
+ joinSeq: record.joinSeq,
724
+ rawId: room.id,
725
+ base: encodeRoomId(room.id),
726
+ origin: room.local ? "local" : "fleet",
727
+ room,
728
+ publicId: ""
729
+ });
730
+ }
731
+ }
732
+ const groups = /* @__PURE__ */ new Map();
733
+ for (const entry of entries) {
734
+ const bucket = groups.get(entry.base);
735
+ if (bucket === void 0) {
736
+ groups.set(entry.base, [entry]);
737
+ } else {
738
+ bucket.push(entry);
739
+ }
740
+ }
741
+ for (const [base, bucket] of groups) {
742
+ if (bucket.length === 1) {
743
+ bucket[0].publicId = base;
744
+ continue;
745
+ }
746
+ const ordered = [...bucket].sort(compareForCanonical);
747
+ const keeper = ordered[0];
748
+ keeper.publicId = base;
749
+ for (const entry of ordered) {
750
+ if (entry !== keeper) {
751
+ entry.publicId = namespaceRoomId(entry.processUid, base);
752
+ }
753
+ }
754
+ const fleetDuplicates = ordered.filter((entry) => entry.origin === "fleet");
755
+ if (fleetDuplicates.length > 1) {
756
+ for (const loser of fleetDuplicates) {
757
+ if (loser !== keeper) {
758
+ this.logger.warning(
759
+ `fleet: duplicate room id "${base}" reported by instance ${keeper.instanceId} (joined earliest, keeps the canonical id) and instance ${loser.instanceId} (surfaced as "${loser.publicId}") \u2014 \xA711 post-restart tie-break, no room hidden or destroyed`
760
+ );
761
+ }
762
+ }
763
+ }
764
+ }
765
+ const roomsByInstance = /* @__PURE__ */ new Map();
766
+ const byPublicId = /* @__PURE__ */ new Map();
767
+ for (const entry of entries) {
768
+ const room = { ...entry.room, id: entry.publicId };
769
+ const list = roomsByInstance.get(entry.instanceId);
770
+ if (list === void 0) {
771
+ roomsByInstance.set(entry.instanceId, [room]);
772
+ } else {
773
+ list.push(room);
774
+ }
775
+ byPublicId.set(entry.publicId, { room, instanceId: entry.instanceId, rawRoomId: entry.rawId });
776
+ }
777
+ const instances = [];
778
+ const byId = /* @__PURE__ */ new Map();
779
+ for (const record of this.records.values()) {
780
+ const instance = { ...record.info, rooms: roomsByInstance.get(record.info.id) ?? [] };
781
+ instances.push(instance);
782
+ byId.set(instance.id, instance);
783
+ }
784
+ this.logger.debug("fleet: rebuilt id-resolution view");
785
+ this.resolvedView = { instances, byId, byPublicId };
786
+ return this.resolvedView;
787
+ }
788
+ /**
789
+ * Drop the memoized resolution + state hash. Called by the two SEMANTIC
790
+ * mutations only ({@link applySnapshot}, {@link removeInstance}); the next read
791
+ * rebuilds. Non-semantic mutations ({@link touch}, {@link setStale}) never call
792
+ * this — see {@link resolvedView}.
793
+ */
794
+ invalidate() {
795
+ this.resolvedView = null;
796
+ this.cachedStateHash = null;
797
+ }
798
+ reserve(instanceId) {
799
+ const id = `res_${++this.reservationSeq}`;
800
+ this.reservations.set(id, instanceId);
801
+ this.reservedByInstance.set(instanceId, (this.reservedByInstance.get(instanceId) ?? 0) + 1);
802
+ return { id, instanceId };
803
+ }
804
+ /**
805
+ * Headroom against capacity, counting in-flight reservations as rooms (§9).
806
+ * A pending create occupies a room slot but contributes no connections (the
807
+ * room is empty until clients join), so reservations gate `maxRooms` only;
808
+ * `maxConnections` is gated by the real connection count.
809
+ */
810
+ hasHeadroom(instance) {
811
+ const capacity = instance.capacity;
812
+ if (capacity.maxRooms !== null) {
813
+ const projected = instance.rooms.length + this.reservedRooms(instance.id) + this.pendingRooms(instance.id);
814
+ if (projected >= capacity.maxRooms) {
815
+ return false;
816
+ }
817
+ }
818
+ if (capacity.maxConnections !== null && instance.connections >= capacity.maxConnections) {
819
+ return false;
820
+ }
821
+ return true;
822
+ }
823
+ /**
824
+ * Pick among filtered candidates (§9 steps 2–3). `least-loaded`/`most-loaded`
825
+ * score by `connections / maxConnections` only when *every* candidate
826
+ * declares `maxConnections`; if any leaves it undeclared, all are scored by
827
+ * raw `connections` (a normalized 0.93 and a raw 1500 are not comparable).
828
+ * Ties are broken randomly; `random` ignores load entirely.
829
+ */
830
+ pick(candidates, strategy) {
831
+ if (strategy === "random") {
832
+ return this.choose(candidates);
833
+ }
834
+ const allDeclare = candidates.every((instance) => instance.capacity.maxConnections !== null);
835
+ const scoreOf = (instance) => allDeclare ? instance.connections / instance.capacity.maxConnections : instance.connections;
836
+ let best = scoreOf(candidates[0]);
837
+ for (const instance of candidates) {
838
+ const score = scoreOf(instance);
839
+ best = strategy === "most-loaded" ? Math.max(best, score) : Math.min(best, score);
840
+ }
841
+ const tied = candidates.filter((instance) => scoreOf(instance) === best);
842
+ return this.choose(tied);
843
+ }
844
+ /** Uniform random choice from a non-empty list (placement tie-break / `random` strategy). */
845
+ choose(list) {
846
+ const index = Math.floor(this.random() * list.length);
847
+ return list[Math.min(index, list.length - 1)];
848
+ }
849
+ /**
850
+ * Hash of SEMANTIC fleet state only (§6): instances, rooms, counts, statuses,
851
+ * capacities, versions — explicitly EXCLUDING `lastSyncAt` and all liveness
852
+ * bookkeeping, so the §10 ETag does not churn on every heartbeat. Order-
853
+ * independent: instances are sorted by id before encoding.
854
+ */
855
+ computeStateHash(instances) {
856
+ const projection = instances.map((instance) => ({
857
+ id: instance.id,
858
+ name: instance.name,
859
+ processUid: instance.processUid,
860
+ endpointUrl: instance.endpointUrl,
861
+ labels: instance.labels,
862
+ roomTypes: instance.roomTypes,
863
+ connections: instance.connections,
864
+ capacity: instance.capacity,
865
+ autoCreate: instance.autoCreate,
866
+ status: instance.status,
867
+ agentVersion: instance.agentVersion,
868
+ protocolVersion: instance.protocolVersion,
869
+ rooms: instance.rooms.map((room) => ({
870
+ id: room.id,
871
+ type: room.type,
872
+ connections: room.connections,
873
+ instanceId: room.instanceId,
874
+ endpointUrl: room.endpointUrl,
875
+ local: room.local
876
+ }))
877
+ })).sort((a, b) => a.id < b.id ? -1 : a.id > b.id ? 1 : 0);
878
+ return hash64(projection);
879
+ }
880
+ };
881
/**
 * Build the read-model row for one instance from its snapshot payload.
 *
 * Each reported room is copied with the owning `instanceId` and the instance's
 * `endpointUrl` attached (§6), and `local` is derived solely from the agent's
 * own `origin === "local"` flag. The instance-level `connections` is the sum of
 * its rooms' connections.
 *
 * @param instanceId  id assigned to the instance
 * @param payload     snapshot payload reported by the agent
 * @param lastSyncAt  timestamp to record on the row
 */
function buildInstanceInfo(instanceId, payload, lastSyncAt) {
  const { endpointUrl } = payload;
  const rooms = payload.rooms.map(({ id, type, connections, origin }) => ({
    id,
    type,
    connections,
    instanceId,
    endpointUrl,
    local: origin === "local"
  }));
  let totalConnections = 0;
  for (const room of rooms) {
    totalConnections += room.connections;
  }
  const info = {
    id: instanceId,
    name: payload.name,
    processUid: payload.processUid,
    endpointUrl,
    labels: payload.labels,
    roomTypes: payload.roomTypes,
    rooms,
    connections: totalConnections,
    capacity: payload.capacity,
    autoCreate: payload.autoCreate,
    status: payload.status,
    lastSyncAt,
    agentVersion: payload.agentVersion,
    protocolVersion: payload.protocolVersion
  };
  return info;
}
913
/**
 * True when every key of `required` is present in `instanceLabels` with a
 * strictly equal value. An empty `required` matches anything.
 */
function matchesLabels(instanceLabels, required) {
  return Object.keys(required).every((key) => instanceLabels[key] === required[key]);
}
921
/**
 * Sort comparator deciding which colliding room keeps the canonical id:
 * `fleet` origin before anything else, then the lower `joinSeq`, then the
 * lexicographically lower `instanceId`.
 */
function compareForCanonical(a, b) {
  const originRank = (entry) => (entry.origin === "fleet" ? 0 : 1);
  const byOrigin = originRank(a) - originRank(b);
  if (byOrigin !== 0) {
    return byOrigin;
  }
  if (a.joinSeq !== b.joinSeq) {
    return a.joinSeq - b.joinSeq;
  }
  if (a.instanceId < b.instanceId) {
    return -1;
  }
  return a.instanceId > b.instanceId ? 1 : 0;
}
932
// 64-symbol alphabet: one symbol per 6-bit value, so masking a byte with 63 indexes it directly.
var ROOM_ID_ALPHABET = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_-";
// Number of random symbols following the "r_" prefix.
var ROOM_ID_RANDOM_LENGTH = 21;
/**
 * Generate a room id of the form `r_` followed by `ROOM_ID_RANDOM_LENGTH`
 * symbols, each selected from `ROOM_ID_ALPHABET` by the low six bits of one
 * byte obtained from `randomBytes`.
 */
function generateRoomId() {
  const entropy = randomBytes(ROOM_ID_RANDOM_LENGTH);
  const symbols = Array.from(entropy, (byte) => ROOM_ID_ALPHABET[byte & 63]);
  return `r_${symbols.join("")}`;
}
942
+
943
+ // src/orchestrator/AgentAuthenticator.ts
944
+ import { createHash as createHash2, timingSafeEqual } from "crypto";
945
/**
 * Find which configured key equals `presented`.
 *
 * Both sides are reduced to SHA-256 digests and compared with
 * `timingSafeEqual`. The loop deliberately has no early exit: every key is
 * hashed and compared on each call, and the last matching key is the one
 * returned.
 *
 * @returns the matching key, or `null` when `presented` is not a non-empty
 *          string, `keys` is empty, or nothing matches.
 */
function matchKey(presented, keys) {
  const usable = typeof presented === "string" && presented.length !== 0 && keys.length !== 0;
  if (!usable) {
    return null;
  }
  const sha256 = (text) => createHash2("sha256").update(text).digest();
  const wanted = sha256(presented);
  let hit = null;
  for (const key of keys) {
    const same = timingSafeEqual(wanted, sha256(key));
    if (same) {
      hit = key;
    }
  }
  return hit;
}
959
/** Holds the configured agent keys and answers whether a presented ticket is one of them. */
var AgentAuthenticator = class {
  agentKeys;
  constructor(agentKeys) {
    this.agentKeys = agentKeys;
  }
  /** True when `ticket` equals one of the configured agent keys, as decided by `matchKey` (constant-time, §13). */
  matches(ticket) {
    const matched = matchKey(ticket, this.agentKeys);
    return matched !== null;
  }
};
969
+
970
+ // src/wire/topics.ts
971
// Protocol major spoken by this build.
var PROTOCOL_VERSION = 3;
// WebSocket subprotocol name.
var WS_SUBPROTOCOL = "rivalis-fleet.v1";
// Upper bound on commands in flight per instance.
var MAX_INFLIGHT_COMMANDS = 32;
// Topic names for the five fleet message kinds.
var Topics = {
  /** Orchestrator → agent on join: carries the assigned id and the heartbeat (poll cadence); the first poll follows. */
  hello: "fleet/hello",
  /** Orchestrator → agent state poll: carries `knownHash` (for dedup) and the last recorded `status` (echoed). */
  poll: "fleet/poll",
  /** Agent → orchestrator poll reply: a full snapshot when its hash differs from `knownHash`, otherwise hash-only. */
  state: "fleet/state",
  /** Orchestrator → agent command push. */
  cmd: "fleet/cmd",
  /** Agent → orchestrator command result. */
  ack: "fleet/ack"
};
986
+
987
+ // src/wire/snapshotSchema.ts
988
// Size limits applied to an agent snapshot during validation.
var MAX_ENDPOINT_URL_LENGTH = 512;
var MAX_NAME_LENGTH = 64;
var MAX_LABELS = 32;
var MAX_LABEL_KEY_LENGTH = 64;
var MAX_LABEL_VALUE_LENGTH = 64;
var MAX_ROOM_TYPES = 256;
var MAX_ROOMS = 5e4;
var MAX_ROOM_ID_LENGTH = 256;
var MAX_ROOM_TYPE_LENGTH = 64;
var MAX_ROOM_CONNECTIONS = 1e6;
// URL schemes (as reported by `URL#protocol`, trailing colon included) an endpoint may use.
var ALLOWED_ENDPOINT_SCHEMES = new Set(["ws:", "wss:", "http:", "https:"]);
// Top-level shape of a snapshot payload: per-field type, presence and size rules.
var syncPayloadSchema = {
  endpointUrl: {
    type: "string",
    required: true,
    max: MAX_ENDPOINT_URL_LENGTH
  },
  name: {
    type: "string",
    required: true,
    max: MAX_NAME_LENGTH
  },
  labels: {
    type: "object",
    required: true
  },
  roomTypes: {
    type: "array",
    required: true,
    max: MAX_ROOM_TYPES,
    items: { type: "string", max: MAX_ROOM_TYPE_LENGTH }
  },
  rooms: {
    type: "array",
    required: true,
    max: MAX_ROOMS,
    items: { type: "object" }
  }
};
1006
/**
 * Validate a single value against a single schema rule.
 *
 * Supported `rule.type` values: `"string"` (`max`/`min` length, `pattern`),
 * `"number"` / `"integer"` (`max`/`min` value), `"boolean"`, `"object"`
 * (non-array object) and `"array"` (`max`/`min` entry count, optional `items`
 * rule applied to every entry). Unknown types are accepted as-is.
 *
 * @param field  name used as the prefix of the returned message
 * @param value  value under test
 * @param rule   rule object (`type`, optional `required`, `max`, `min`, `pattern`, `items`)
 * @returns `null` when the value satisfies the rule, otherwise a human-readable reason.
 */
function checkRule(field, value, rule) {
  if (value === undefined || value === null) {
    return rule.required === true ? `${field} is required` : null;
  }
  switch (rule.type) {
    case "string":
      if (typeof value !== "string") {
        return `${field} must be a string`;
      }
      if (rule.max !== undefined && value.length > rule.max) {
        return `${field} exceeds ${rule.max} characters`;
      }
      if (rule.min !== undefined && value.length < rule.min) {
        return `${field} must be at least ${rule.min} characters`;
      }
      if (rule.pattern !== undefined && !new RegExp(rule.pattern).test(value)) {
        return `${field} has an invalid format`;
      }
      return null;
    case "number":
    case "integer":
      // NaN is typeof "number" but fails every range comparison, so without an
      // explicit check it would slip through both `max` and `min`.
      if (typeof value !== "number" || Number.isNaN(value) || (rule.type === "integer" && !Number.isInteger(value))) {
        return `${field} must be a number`;
      }
      if (rule.max !== undefined && value > rule.max) {
        return `${field} exceeds ${rule.max}`;
      }
      if (rule.min !== undefined && value < rule.min) {
        return `${field} is below ${rule.min}`;
      }
      return null;
    case "boolean":
      return typeof value === "boolean" ? null : `${field} must be a boolean`;
    case "object":
      return typeof value === "object" && !Array.isArray(value) ? null : `${field} must be an object`;
    case "array":
      if (!Array.isArray(value)) {
        return `${field} must be an array`;
      }
      if (rule.max !== undefined && value.length > rule.max) {
        return `${field} exceeds ${rule.max} entries`;
      }
      if (rule.min !== undefined && value.length < rule.min) {
        return `${field} must have at least ${rule.min} entries`;
      }
      if (rule.items !== undefined) {
        for (const entry of value) {
          // A null/undefined ENTRY is a malformed element, not an absent optional
          // field: the "not required" early return above must not wave it through.
          // An item rule can still opt in to nullable entries with `required: false`.
          if ((entry === undefined || entry === null) && rule.items.required !== false) {
            return `${field} entry must not be null`;
          }
          const reason = checkRule(`${field} entry`, entry, rule.items);
          if (reason !== null) {
            return reason;
          }
        }
      }
      return null;
    default:
      return null;
  }
}
1064
/**
 * Run `checkRule` for every field declared in `schema` against the matching
 * property of `data`, in schema key order, and stop at the first failure.
 * Schema keys whose rule is `undefined` are ignored.
 *
 * @returns the first failure reason, or `null` when every field passes.
 */
function checkSchema(schema, data) {
  for (const [field, rule] of Object.entries(schema)) {
    if (rule === undefined) {
      continue;
    }
    const problem = checkRule(field, data[field], rule);
    if (problem !== null) {
      return problem;
    }
  }
  return null;
}
1077
/**
 * Validate an agent-reported snapshot payload before it is applied.
 *
 * Checks, in order:
 *  1. the payload is a non-null object and satisfies `syncPayloadSchema`;
 *  2. `endpointUrl` parses as a URL whose scheme is in `ALLOWED_ENDPOINT_SCHEMES`;
 *  3. `labels` has at most `MAX_LABELS` keys, each key and string value within
 *     its length limit;
 *  4. every room is an object with a bounded string `id` and `type`, and a
 *     `connections` count that is a non-negative integer no greater than
 *     `MAX_ROOM_CONNECTIONS`.
 *
 * The function reports problems by return value rather than by throwing.
 *
 * @param payload  untrusted snapshot as decoded off the wire
 * @returns `null` when the snapshot is acceptable, otherwise the first failure reason.
 */
function validateSnapshot(payload) {
  // Guard first: indexing into a null / primitive payload would throw instead
  // of producing a reason.
  if (payload === null || typeof payload !== "object") {
    return "payload must be an object";
  }
  const reason = checkSchema(syncPayloadSchema, payload);
  if (reason !== null) {
    return reason;
  }
  let parsed;
  try {
    parsed = new URL(payload.endpointUrl);
  } catch {
    return "endpointUrl is not a valid URL";
  }
  if (!ALLOWED_ENDPOINT_SCHEMES.has(parsed.protocol)) {
    return "endpointUrl scheme is not allowed";
  }
  const labels = payload.labels;
  const labelKeys = Object.keys(labels);
  if (labelKeys.length > MAX_LABELS) {
    return `labels exceeds ${MAX_LABELS} entries`;
  }
  for (const key of labelKeys) {
    if (key.length > MAX_LABEL_KEY_LENGTH) {
      return `a label key exceeds ${MAX_LABEL_KEY_LENGTH} characters`;
    }
    const value = labels[key];
    if (typeof value !== "string" || value.length > MAX_LABEL_VALUE_LENGTH) {
      return `a label value is not a string of at most ${MAX_LABEL_VALUE_LENGTH} characters`;
    }
  }
  for (const entry of payload.rooms) {
    // The schema's item rule lets a null entry through (null counts as an
    // absent optional value), so reject it here before dereferencing.
    if (entry === null || typeof entry !== "object") {
      return "a room entry must be an object";
    }
    const id = entry.id;
    if (typeof id !== "string" || id.length > MAX_ROOM_ID_LENGTH) {
      return `a room id is not a string of at most ${MAX_ROOM_ID_LENGTH} characters`;
    }
    const type = entry.type;
    if (typeof type !== "string" || type.length > MAX_ROOM_TYPE_LENGTH) {
      return `a room type is not a string of at most ${MAX_ROOM_TYPE_LENGTH} characters`;
    }
    const connections = entry.connections;
    if (typeof connections !== "number" || connections > MAX_ROOM_CONNECTIONS) {
      return `a room connections value exceeds ${MAX_ROOM_CONNECTIONS}`;
    }
    // Connection counts are summed into the instance total, so NaN, negative
    // and fractional values must not be accepted.
    if (!Number.isInteger(connections) || connections < 0) {
      return "a room connections value must be a non-negative integer";
    }
  }
  return null;
}
1123
+
1124
+ // src/wire/serializer.ts
1125
+ import { createRequire } from "module";
1126
// Frame header layout: byte 0 carries the wire major (the protocol version), byte 1 the wire minor.
var WIRE_MAJOR = PROTOCOL_VERSION;
var WIRE_MINOR = 0;
// Number of header bytes that precede the serialized body.
var HEADER_BYTES = 2;
1129
/** Raised when a frame's major version byte differs from this build's protocol major. */
var WireVersionError = class extends Error {
  /** Major byte read off the incompatible frame (123 for a legacy JSON `{...}` frame). */
  theirVersion;
  /** Protocol major of this build. */
  ourVersion;
  constructor(theirVersion) {
    const message = `fleet wire protocol version mismatch: peer speaks major v${theirVersion}, this build speaks v${PROTOCOL_VERSION} \u2014 agents and orchestrator must run the same @rivalis/fleet major (\xA77). A v1 (JSON) peer against a v${PROTOCOL_VERSION} peer is exactly this case; upgrade both halves in lockstep.`;
    super(message);
    this.name = "WireVersionError";
    this.theirVersion = theirVersion;
    this.ourVersion = PROTOCOL_VERSION;
  }
};
1143
// Names of the message types registered with the serializer.
var Type = {
  Label: "Label",
  SyncRoom: "SyncRoom",
  Capacity: "Capacity",
  AckRoom: "AckRoom",
  Hello: "Hello",
  Poll: "Poll",
  State: "State",
  Cmd: "Cmd",
  Ack: "Ack"
};
// Topic → message type used to encode/decode that topic's frames.
var TOPIC_TYPE = {
  [Topics.hello]: Type.Hello,
  [Topics.poll]: Type.Poll,
  [Topics.state]: Type.State,
  [Topics.cmd]: Type.Cmd,
  [Topics.ack]: Type.Ack
};
1161
// Lazily-built serializer instance; `null` until the first `getSerializer()` call.
var serializer = null;
/**
 * Return the shared fleet serializer, building it on first use.
 *
 * `@toolcase/serializer` is loaded through `createRequire(import.meta.url)`
 * when `import.meta.url` is truthy, otherwise through `__require`. Every
 * message type is then registered with its fields in a fixed order; the order
 * of fields within each type is significant and must not be rearranged
 * (new fields are appended last).
 */
function getSerializer() {
  if (serializer !== null) {
    return serializer;
  }
  const metaUrl = import.meta.url;
  const load = metaUrl ? createRequire(metaUrl) : __require;
  const mod = load("@toolcase/serializer");
  const Serializer = mod.Serializer ?? mod.default;
  const F = Serializer.FieldType;
  const s = new Serializer("fleet");

  // Field-descriptor shorthands.
  const optional = (key, type) => ({ key, type, rule: "optional" });
  const repeated = (key, type) => ({ key, type, rule: "repeated" });
  // Optional field that decodes to null when absent from the wire.
  const nullable = (key, type) => ({ key, type, rule: "optional", default: null });

  s.define(Type.Label, [
    optional("key", F.STRING),
    optional("value", F.STRING)
  ]);
  s.define(Type.SyncRoom, [
    optional("id", F.STRING),
    optional("type", F.STRING),
    optional("connections", F.UINT32),
    optional("origin", F.STRING)
  ]);
  s.define(Type.Capacity, [
    // null = unlimited (§6). Absent on the wire ⇒ null; an explicit 0 ⇒ 0.
    nullable("maxConnections", F.INT32),
    nullable("maxRooms", F.INT32)
  ]);
  s.define(Type.AckRoom, [
    optional("id", F.STRING),
    optional("type", F.STRING)
  ]);
  s.define(Type.Hello, [
    optional("instanceId", F.STRING),
    optional("protocolVersion", F.UINT32),
    optional("heartbeatMs", F.UINT32)
  ]);
  s.define(Type.Poll, [
    optional("reqId", F.STRING),
    // Absent ⇒ null (no prior state / forced full, subsumes the old fleet/resync).
    optional("knownHash", F.STRING),
    optional("status", F.STRING)
  ]);
  s.define(Type.State, [
    optional("reqId", F.STRING),
    // full=false is a hash-only liveness reply: the snapshot fields that follow
    // are omitted on the wire (preserving the old sync/ping dedup, orch-initiated).
    optional("full", F.BOOL),
    optional("seq", F.UINT32),
    optional("hash", F.STRING),
    optional("name", F.STRING),
    optional("processUid", F.STRING),
    optional("agentVersion", F.STRING),
    optional("protocolVersion", F.UINT32),
    optional("endpointUrl", F.STRING),
    repeated("labels", Type.Label),
    optional("capacity", Type.Capacity),
    optional("autoCreate", F.BOOL),
    repeated("roomTypes", F.STRING),
    repeated("rooms", Type.SyncRoom),
    optional("status", F.STRING)
  ]);
  s.define(Type.Cmd, [
    optional("cmdId", F.STRING),
    optional("op", F.STRING),
    optional("roomId", F.STRING),
    optional("roomType", F.STRING)
  ]);
  s.define(Type.Ack, [
    optional("cmdId", F.STRING),
    optional("ok", F.BOOL),
    optional("error", F.STRING),
    optional("alreadyGone", F.BOOL),
    optional("room", Type.AckRoom),
    // APPEND-ONLY (task 003): the room-already-exists signal must stay LAST so
    // the tags of the existing fields are unmoved.
    optional("exists", F.BOOL)
  ]);

  serializer = s;
  return s;
}
1240
/** True when `obj` is neither `null` nor `undefined` and has `key` as an OWN property (inherited ones do not count). */
function present(obj, key) {
  if (obj === null || obj === undefined) {
    return false;
  }
  return Object.prototype.hasOwnProperty.call(obj, key);
}
1243
/** Convert a labels map into the wire form: a list of `{ key, value }` pairs. `null`/`undefined` yields an empty list. */
function labelsToList(labels) {
  const source = labels ?? {};
  const pairs = [];
  for (const key of Object.keys(source)) {
    pairs.push({ key, value: source[key] });
  }
  return pairs;
}
1246
/**
 * Convert the wire form (a list of `{ key, value }` pairs) back into a labels
 * map. A missing key or value becomes the empty string; when a key repeats,
 * the later pair wins. `null`/`undefined` yields an empty map.
 */
function labelsFromList(list) {
  const labels = {};
  for (const { key, value } of list ?? []) {
    labels[key ?? ""] = value ?? "";
  }
  return labels;
}
1253
/** Build the wire `Capacity` message; a missing capacity or a missing/nullish limit is sent as `null`. */
function capacityToMessage(capacity) {
  const maxConnections = capacity?.maxConnections ?? null;
  const maxRooms = capacity?.maxRooms ?? null;
  return { maxConnections, maxRooms };
}
1259
/**
 * Read a decoded `Capacity` message. A limit that is not an own property of
 * the message becomes `null` (unlimited, §6); a limit that is present is
 * passed through unchanged, so an explicit 0 stays 0.
 */
function capacityFromMessage(capacity) {
  const limitOrNull = (field) => (present(capacity, field) ? capacity[field] : null);
  return {
    maxConnections: limitOrNull("maxConnections"),
    maxRooms: limitOrNull("maxRooms")
  };
}
1266
/**
 * Build the wire `State` message from a state payload.
 *
 * A non-full payload is reduced to `{ reqId, full: false, seq, hash }`. A full
 * payload additionally carries the snapshot fields, with labels converted by
 * `labelsToList`, capacity by `capacityToMessage`, and each room trimmed to
 * `id`, `type`, `connections`, `origin`. Missing `roomTypes` / `rooms` are
 * sent as empty lists.
 */
function stateToMessage(p) {
  const header = { reqId: p.reqId, full: false, seq: p.seq, hash: p.hash };
  if (!p.full) {
    return header;
  }
  const rooms = (p.rooms ?? []).map(({ id, type, connections, origin }) => ({ id, type, connections, origin }));
  return {
    ...header,
    full: true,
    name: p.name,
    processUid: p.processUid,
    agentVersion: p.agentVersion,
    protocolVersion: p.protocolVersion,
    endpointUrl: p.endpointUrl,
    labels: labelsToList(p.labels),
    capacity: capacityToMessage(p.capacity),
    autoCreate: p.autoCreate,
    roomTypes: p.roomTypes ?? [],
    rooms,
    status: p.status
  };
}
1293
/**
 * Turn a decoded `State` message into a state payload, filling every nullish
 * field with its default: empty string for text fields, 0 for numbers, `false`
 * for flags, empty lists for `roomTypes` / `rooms`, `"local"` for a room's
 * `origin` and `"active"` for `status`. Labels go through `labelsFromList` and
 * capacity through `capacityFromMessage`.
 */
function stateFromMessage(m) {
  const listOf = (value) => (Array.isArray(value) ? value : []);
  const toRoom = (r) => ({
    id: r.id ?? "",
    type: r.type ?? "",
    connections: r.connections ?? 0,
    origin: r.origin ?? "local"
  });
  const payload = {
    reqId: m.reqId ?? "",
    full: m.full ?? false,
    seq: m.seq ?? 0,
    hash: m.hash ?? "",
    name: m.name ?? "",
    processUid: m.processUid ?? "",
    agentVersion: m.agentVersion ?? "",
    protocolVersion: m.protocolVersion ?? 0,
    endpointUrl: m.endpointUrl ?? "",
    labels: labelsFromList(m.labels),
    capacity: capacityFromMessage(m.capacity),
    autoCreate: m.autoCreate ?? false,
    roomTypes: listOf(m.roomTypes),
    rooms: listOf(m.rooms).map((r) => toRoom(r)),
    status: m.status ?? "active"
  };
  return payload;
}
1317
/**
 * Shape an outgoing payload for the serializer.
 *
 *  - `fleet/state`: delegated to `stateToMessage`.
 *  - `fleet/poll`: `reqId` and `status` are always sent; `knownHash` is only
 *    attached when it is neither `null` nor `undefined`.
 *  - any other topic: the payload is returned untouched.
 */
function toMessage(topic, payload) {
  if (topic === Topics.state) {
    return stateToMessage(payload);
  }
  if (topic !== Topics.poll) {
    return payload;
  }
  const { reqId, status, knownHash } = payload;
  const msg = { reqId, status };
  if (knownHash !== null && knownHash !== undefined) {
    msg.knownHash = knownHash;
  }
  return msg;
}
1333
/**
 * Turn a decoded wire message into the payload shape used in-process.
 *
 *  - `fleet/hello`: nullish fields default to `""` / 0.
 *  - `fleet/poll`: `knownHash` is `null` unless it is an own property of the
 *    message; `status` defaults to `"active"`.
 *  - `fleet/state`: delegated to `stateFromMessage`.
 *  - `fleet/cmd`: `roomId` / `roomType` are copied only when present.
 *  - `fleet/ack`: `error`, `alreadyGone`, `exists` and `room` are copied only
 *    when present; `ok` defaults to `false`.
 *  - any other topic: the message is returned untouched.
 */
function fromMessage(topic, m) {
  if (topic === Topics.hello) {
    return {
      instanceId: m.instanceId ?? "",
      protocolVersion: m.protocolVersion ?? 0,
      heartbeatMs: m.heartbeatMs ?? 0
    };
  }
  if (topic === Topics.poll) {
    // Absent knownHash ⇒ null (no prior state / forced full).
    const knownHash = present(m, "knownHash") ? m.knownHash : null;
    return {
      reqId: m.reqId ?? "",
      knownHash,
      status: m.status ?? "active"
    };
  }
  if (topic === Topics.state) {
    return stateFromMessage(m);
  }
  if (topic === Topics.cmd) {
    const cmd = { cmdId: m.cmdId ?? "", op: m.op };
    for (const field of ["roomId", "roomType"]) {
      if (present(m, field)) {
        cmd[field] = m[field];
      }
    }
    return cmd;
  }
  if (topic === Topics.ack) {
    const ack = { cmdId: m.cmdId ?? "", ok: m.ok ?? false };
    for (const field of ["error", "alreadyGone", "exists"]) {
      if (present(m, field)) {
        ack[field] = m[field];
      }
    }
    if (present(m, "room")) {
      ack.room = { id: m.room.id ?? "", type: m.room.type ?? "" };
    }
    return ack;
  }
  return m;
}
1380
/**
 * Encode `payload` for `topic` into a wire frame: the two header bytes
 * (`WIRE_MAJOR`, `WIRE_MINOR`) followed by the serialized body.
 *
 * @throws Error when the topic has no registered message type.
 */
function encodeFrame(topic, payload) {
  const messageType = TOPIC_TYPE[topic];
  if (messageType === undefined) {
    throw new Error(`fleet wire: no message type for topic=${topic}`);
  }
  const message = toMessage(topic, payload);
  const body = getSerializer().encode(messageType, message);
  const frame = new Uint8Array(HEADER_BYTES + body.length);
  frame[0] = WIRE_MAJOR;
  frame[1] = WIRE_MINOR;
  frame.set(body, HEADER_BYTES);
  return frame;
}
1392
/**
 * Decode a wire frame received on `topic` into its in-process payload.
 *
 * The first header byte must equal `WIRE_MAJOR`; the second header byte is not
 * inspected. The remainder of the frame is handed to the serializer and the
 * result is shaped by `fromMessage`.
 *
 * @throws Error when the topic has no registered message type or the frame is
 *         missing / shorter than the header.
 * @throws WireVersionError when the major byte does not match.
 */
function decodeFrame(topic, frame) {
  const messageType = TOPIC_TYPE[topic];
  if (messageType === undefined) {
    throw new Error(`fleet wire: no message type for topic=${topic}`);
  }
  const missing = frame === null || frame === undefined;
  if (missing || frame.length < HEADER_BYTES) {
    throw new Error("fleet wire: truncated frame (shorter than the 2-byte version header)");
  }
  const [major] = frame;
  if (major !== WIRE_MAJOR) {
    throw new WireVersionError(major);
  }
  const decoded = getSerializer().decode(messageType, frame.subarray(HEADER_BYTES));
  return fromMessage(topic, decoded);
}
1408
+
1409
+ // src/util/errors.ts
1410
/** Human-readable text for a caught value: an `Error`'s `message`, otherwise `String(value)`. */
function describe(error) {
  if (error instanceof Error) {
    return error.message;
  }
  return String(error);
}
1413
+
1414
+ // src/orchestrator/CommandEngine.ts
1415
/**
 * Tracks commands pushed to agents and settles each one exactly once — on its
 * ack, on timeout, on a send failure, or when the instance disconnects.
 */
var CommandEngine = class {
  scheduler;
  reservations;
  commandTimeoutMs;
  /** In-flight commands: instance id → (`cmdId` → pending entry). */
  pending = new Map();
  cmdSeq = 0;
  constructor(scheduler, reservations, commandTimeoutMs) {
    this.scheduler = scheduler;
    this.reservations = reservations;
    this.commandTimeoutMs = commandTimeoutMs;
  }
  /** Next command id, `cmd_N`, from a counter shared by all instances of this engine. */
  nextCmdId() {
    const seq = ++this.cmdSeq;
    return `cmd_${seq}`;
  }
  /** Number of commands currently awaiting settlement for `instanceId`. */
  inFlight(instanceId) {
    const inflight = this.pending.get(instanceId);
    return inflight?.size ?? 0;
  }
  /**
   * Push `cmd` on the `fleet/cmd` topic and return a promise for its outcome.
   *
   * The promise resolves with the ack when the agent reports success and
   * rejects with a `FleetError` otherwise: `COMMAND_FAILED` / `ROOM_EXISTS`
   * (negative ack), `COMMAND_TIMEOUT` (no ack within `commandTimeoutMs`),
   * `INSTANCE_DISCONNECTED` (send threw, or the instance went away) or
   * `INSTANCE_BUSY` (the instance already has `MAX_INFLIGHT_COMMANDS` in
   * flight, §7 — nothing is sent in that case).
   *
   * Reservations (create commands only) travel with the pending entry and are
   * disposed of on every settle path.
   */
  send(link, cmd, reservation = null, roomIdReservation = null) {
    const inflight = this.mapFor(link.instanceId);
    if (inflight.size >= MAX_INFLIGHT_COMMANDS) {
      this.releaseReservations({ reservation, roomIdReservation });
      const busy = new FleetError(
        "INSTANCE_BUSY",
        `instance ${link.instanceId} has ${inflight.size} commands in flight (max ${MAX_INFLIGHT_COMMANDS})`
      );
      return Promise.reject(busy);
    }
    return new Promise((resolve, reject) => {
      const onTimeout = () => {
        this.settle(link.instanceId, cmd.cmdId, (pending) => {
          this.holdOrRelease(pending);
          pending.reject(new FleetError(
            "COMMAND_TIMEOUT",
            `command ${cmd.cmdId} (${cmd.op}) timed out after ${this.commandTimeoutMs}ms`
          ));
        });
      };
      const timer = this.scheduler.setTimeout(onTimeout, this.commandTimeoutMs);
      // Register before sending so a settle triggered during the send finds the entry.
      inflight.set(cmd.cmdId, { resolve, reject, timer, reservation, roomIdReservation });
      try {
        link.send(Topics.cmd, cmd);
      } catch (error) {
        this.settle(link.instanceId, cmd.cmdId, (pending) => {
          this.releaseReservations(pending);
          pending.reject(new FleetError(
            "INSTANCE_DISCONNECTED",
            `failed to send command ${cmd.cmdId} (${cmd.op}) to instance ${link.instanceId}: ${describe(error)}`
          ));
        });
      }
    });
  }
  /**
   * Settle the command an inbound `fleet/ack` refers to.
   *
   * @returns `false` when no such command is pending (late ack after a timeout,
   *          or an unknown id) so the caller can log and drop it — the promise
   *          is never settled twice (§14).
   */
  ack(instanceId, ack) {
    const finish = (pending) => {
      if (!ack.ok) {
        this.releaseReservations(pending);
        const failure = ack.exists === true
          ? new FleetError("ROOM_EXISTS", ack.error ?? "room id already exists")
          : new FleetError("COMMAND_FAILED", ack.error ?? "agent reported command failure");
        pending.reject(failure);
        return;
      }
      this.holdOrRelease(pending);
      pending.resolve(ack);
    };
    return this.settle(instanceId, ack.cmdId, finish);
  }
  /**
   * Immediately reject every in-flight command of an instance with
   * `INSTANCE_DISCONNECTED`, releasing their reservations, then drop the
   * instance's entry — callers do not wait out `commandTimeoutMs` (§7).
   */
  rejectAll(instanceId, reason) {
    const inflight = this.pending.get(instanceId);
    if (inflight === undefined) {
      return;
    }
    // Iterate over a copy of the ids: settling removes entries from the map.
    const cmdIds = Array.from(inflight.keys());
    for (const cmdId of cmdIds) {
      this.settle(instanceId, cmdId, (pending) => {
        this.releaseReservations(pending);
        pending.reject(new FleetError("INSTANCE_DISCONNECTED", `instance ${instanceId} disconnected (${reason})`));
      });
    }
    this.pending.delete(instanceId);
  }
  /**
   * Settle one pending command: remove it, clear its timer, then run `action`,
   * which disposes of the reservations and resolves or rejects the promise.
   *
   * @returns `false` when the command is not pending (already settled). This is
   *          the single guard against a double settle from a
   *          timeout-then-late-ack or disconnect-then-ack race (§14).
   */
  settle(instanceId, cmdId, action) {
    const inflight = this.pending.get(instanceId);
    if (inflight === undefined) {
      return false;
    }
    const entry = inflight.get(cmdId);
    if (entry === undefined) {
      return false;
    }
    inflight.delete(cmdId);
    this.scheduler.clearTimeout(entry.timer);
    action(entry);
    return true;
  }
  /**
   * Disposition used on ack-OK and timeout (task 003). When the command holds
   * BOTH a capacity and a room-id reservation (a create), they are handed to
   * `holdUntilVisible`; in every other case whatever is present is released.
   */
  holdOrRelease(pending) {
    const { reservation, roomIdReservation } = pending;
    if (reservation === null || roomIdReservation === null) {
      this.releaseReservations(pending);
      return;
    }
    this.reservations.holdUntilVisible(roomIdReservation, reservation);
  }
  /** Release a command's reservations right away (failure / disconnect / busy); `null` ones are skipped. */
  releaseReservations(pending) {
    const { reservation, roomIdReservation } = pending;
    if (reservation !== null) {
      this.reservations.release(reservation);
    }
    if (roomIdReservation !== null) {
      this.reservations.releaseRoomId(roomIdReservation);
    }
  }
  /** The instance's in-flight map, created and registered on first use. */
  mapFor(instanceId) {
    const existing = this.pending.get(instanceId);
    if (existing !== undefined) {
      return existing;
    }
    const created = new Map();
    this.pending.set(instanceId, created);
    return created;
  }
};
1566
+
1567
+ // src/orchestrator/Poller.ts
1568
/** `sendPoll` receives `forceFull = true` on an instance's poll 0, 12, 24, … */
var FORCE_FULL_EVERY_POLLS = 12;
/**
 * Per-instance request/reply poll loop.
 *
 * For every started instance the poller sends one poll immediately and then
 * re-arms a one-shot timer every `intervalMs`. A tick that finds the previous
 * poll still unanswered counts a miss instead of sending a new poll: the
 * second consecutive miss fires `callbacks.onStale`, the third fires
 * `callbacks.onEvict` and stops re-arming the timer.
 */
var Poller = class {
  scheduler;
  intervalMs;
  callbacks;
  /** instanceId -> { timer, outstandingReqId, missed, pollCount } */
  entries = /* @__PURE__ */ new Map();
  /** Monotonic counter shared by all instances; feeds the `poll_<n>` request ids. */
  reqSeq = 0;
  /**
   * @param scheduler  object exposing `setTimeout(fn, ms)` / `clearTimeout(handle)`
   * @param intervalMs delay between ticks
   * @param callbacks  `{ sendPoll(instanceId, reqId, forceFull), onStale(instanceId), onEvict(instanceId) }`
   */
  constructor(scheduler, intervalMs, callbacks) {
    this.scheduler = scheduler;
    this.intervalMs = intervalMs;
    this.callbacks = callbacks;
  }
  /** True while the instance is being polled (started and not yet forgotten). */
  has(instanceId) {
    return this.entries.has(instanceId);
  }
  /**
   * Begin polling an instance: send the first poll now, then one every `intervalMs`.
   * Calling it again for an instance that is already polled restarts it cleanly.
   */
  start(instanceId) {
    // Cancel any previous timer first. Without this, the replaced entry's timer
    // stays armed and its tick operates on the NEW entry, so the instance ends
    // up with two independent tick chains (double polling / double miss counts).
    this.forget(instanceId);
    this.entries.set(instanceId, { timer: null, outstandingReqId: null, missed: 0, pollCount: 0 });
    this.poll(instanceId);
    this.schedule(instanceId);
  }
  /**
   * Consume the outstanding poll's reply. Returns `true` when `reqId` matches the
   * in-flight poll (and resets the missed counter); `false` when the instance is
   * unknown, nothing is outstanding, or the id does not match.
   */
  reply(instanceId, reqId) {
    const entry = this.entries.get(instanceId);
    if (entry === void 0 || entry.outstandingReqId === null || entry.outstandingReqId !== reqId) {
      return false;
    }
    entry.outstandingReqId = null;
    entry.missed = 0;
    return true;
  }
  /** Stop polling an instance and cancel its timer. No-op for an unknown instance. */
  forget(instanceId) {
    const entry = this.entries.get(instanceId);
    if (entry !== void 0) {
      this.scheduler.clearTimeout(entry.timer);
      this.entries.delete(instanceId);
    }
  }
  /** Arm the next one-shot tick for a known instance. */
  schedule(instanceId) {
    const entry = this.entries.get(instanceId);
    if (entry === void 0) {
      return;
    }
    entry.timer = this.scheduler.setTimeout(() => this.tick(instanceId), this.intervalMs);
  }
  /** Timer body: count a miss while a poll is outstanding, otherwise send the next poll. */
  tick(instanceId) {
    const entry = this.entries.get(instanceId);
    if (entry === void 0) {
      return;
    }
    if (entry.outstandingReqId !== null) {
      entry.missed += 1;
      if (entry.missed === 2) {
        this.callbacks.onStale(instanceId);
      }
      if (entry.missed >= 3) {
        // Evicted: deliberately do not re-arm the timer.
        this.callbacks.onEvict(instanceId);
        return;
      }
      this.schedule(instanceId);
      return;
    }
    this.poll(instanceId);
    this.schedule(instanceId);
  }
  /** Send one poll and remember its request id as the outstanding one. */
  poll(instanceId) {
    const entry = this.entries.get(instanceId);
    if (entry === void 0) {
      return;
    }
    const reqId = `poll_${++this.reqSeq}`;
    const forceFull = entry.pollCount % FORCE_FULL_EVERY_POLLS === 0;
    entry.pollCount += 1;
    entry.outstandingReqId = reqId;
    this.callbacks.sendPoll(instanceId, reqId, forceFull);
  }
};
1652
+
1653
+ // src/orchestrator/EventReconciler.ts
1654
/**
 * Turns successive snapshots of the read model into discrete events by
 * remembering which instance ids and rooms were present on the previous pass.
 */
var EventReconciler = class {
  state;
  emit;
  knownInstanceIds = /* @__PURE__ */ new Set();
  knownRooms = /* @__PURE__ */ new Map();
  lastStatsHash = "";
  /**
   * @param state read model exposing `instances` (each with `id` and `rooms`) and `stats.stateHash`
   * @param emit  `(eventName, data) => void`
   */
  constructor(state, emit) {
    this.state = state;
    this.emit = emit;
  }
  /**
   * Diff the read model against the remembered one and emit, in this order:
   * `instance:join` / `room:create` while walking the instances, `room:destroy`
   * for remembered rooms that are gone, and `sync` when `stats.stateHash`
   * changed. `instance:leave` is emitted by {@link instanceRemoved}, not here.
   */
  reconcile() {
    const liveInstanceIds = /* @__PURE__ */ new Set();
    const liveRoomIds = /* @__PURE__ */ new Set();
    for (const instance of this.state.instances) {
      liveInstanceIds.add(instance.id);
      const isNewInstance = !this.knownInstanceIds.has(instance.id);
      if (isNewInstance) {
        this.knownInstanceIds.add(instance.id);
        this.emit("instance:join", instance);
      }
      for (const room of instance.rooms) {
        liveRoomIds.add(room.id);
        if (this.knownRooms.has(room.id)) {
          continue;
        }
        this.knownRooms.set(room.id, room);
        this.emit("room:create", room);
      }
    }
    // Snapshot the remembered rooms first: entries are deleted while walking.
    const vanishedRooms = Array.from(this.knownRooms).filter(([roomId]) => !liveRoomIds.has(roomId));
    for (const [roomId, room] of vanishedRooms) {
      this.knownRooms.delete(roomId);
      this.emit("room:destroy", room);
    }
    // Silently forget instance ids that are gone; no event is emitted here.
    for (const id of Array.from(this.knownInstanceIds)) {
      if (!liveInstanceIds.has(id)) {
        this.knownInstanceIds.delete(id);
      }
    }
    const stats = this.state.stats;
    if (stats.stateHash === this.lastStatsHash) {
      return;
    }
    this.lastStatsHash = stats.stateHash;
    this.emit("sync", stats);
  }
  /**
   * Forget a removed instance and emit `instance:leave` with it. Rooms are not
   * touched here; they surface as `room:destroy` on the next {@link reconcile}.
   */
  instanceRemoved(removed) {
    this.knownInstanceIds.delete(removed.id);
    this.emit("instance:leave", removed);
  }
};
1715
+
1716
+ // src/orchestrator/FleetControl.ts
1717
/**
 * Command-side facade: validates against the read model, resolves the target
 * instance's link through `getLink`, and pushes commands through `commands`.
 */
var FleetControl = class {
  state;
  commands;
  getLink;
  /**
   * @param state    read model / reservation API
   * @param commands command engine exposing `nextCmdId()` and `send(...)`
   * @param getLink  `(instanceId) => link | undefined`
   */
  constructor(state, commands, getLink) {
    this.state = state;
    this.commands = commands;
    this.getLink = getLink;
  }
  /**
   * Reserve a room id, place the room, and send an awaited `create` command.
   * Reservations taken here are released on the failure paths handled here
   * (placement error, missing link); after `send` they belong to the engine.
   */
  async createRoom(request) {
    const roomHold = this.state.reserveRoomId(request.roomId);
    let placed;
    try {
      const constraints = request.placement ?? {};
      placed = this.state.place({ type: request.type, ...constraints });
    } catch (cause) {
      this.state.releaseRoomId(roomHold);
      throw cause;
    }
    const { instance, reservation } = placed;
    const link = this.getLink(instance.id);
    if (link === undefined) {
      this.state.release(reservation);
      this.state.releaseRoomId(roomHold);
      throw new FleetError("INSTANCE_DISCONNECTED", `instance ${instance.id} is no longer connected`);
    }
    const command = {
      cmdId: this.commands.nextCmdId(),
      op: "create",
      roomId: roomHold.roomId,
      roomType: request.type
    };
    await this.commands.send(link, command, reservation, roomHold);
    return {
      id: roomHold.roomId,
      type: request.type,
      connections: 0,
      instanceId: instance.id,
      endpointUrl: instance.endpointUrl,
      local: false
    };
  }
  /** Resolve the room's owning instance from its public id and send an awaited `destroy`. */
  async destroyRoom(roomId) {
    const located = this.state.resolveRoom(roomId);
    if (located === null) {
      throw new FleetError("ROOM_NOT_FOUND", `room ${roomId} not found`);
    }
    const { instanceId, rawRoomId } = located;
    const link = this.getLink(instanceId);
    if (link === undefined) {
      throw new FleetError("INSTANCE_DISCONNECTED", `instance ${instanceId} is no longer connected`);
    }
    const command = { cmdId: this.commands.nextCmdId(), op: "destroy", roomId: rawRoomId };
    await this.commands.send(link, command);
  }
  /** Send a `drain` command to the instance. */
  drainInstance(instanceId) {
    return this.sendStatusCommand(instanceId, "drain");
  }
  /** Send an `undrain` command to the instance. */
  undrainInstance(instanceId) {
    return this.sendStatusCommand(instanceId, "undrain");
  }
  /** Shared body of drain/undrain; records the pending status only after `send` resolves. */
  async sendStatusCommand(instanceId, op) {
    const known = this.state.getInstance(instanceId) !== null;
    if (!known) {
      throw new FleetError("INSTANCE_NOT_FOUND", `instance ${instanceId} not found`);
    }
    const link = this.getLink(instanceId);
    if (link === undefined) {
      throw new FleetError("INSTANCE_DISCONNECTED", `instance ${instanceId} is no longer connected`);
    }
    await this.commands.send(link, { cmdId: this.commands.nextCmdId(), op });
    const pendingStatus = op === "drain" ? "draining" : "active";
    this.state.setPendingStatus(instanceId, pendingStatus);
  }
};
1790
+
1791
+ // src/orchestrator/transport.ts
1792
+ import { createServer } from "http";
1793
+ import { WSTransport } from "@rivalis/node";
1794
+
1795
+ // src/orchestrator/FleetRoom.ts
1796
/** Topics the fleet room binds a handler for; every other topic is unbound. */
var AGENT_TOPICS = [Topics.state, Topics.ack];
/**
 * Build the room class that bridges agent connections to `controller`.
 * The class extends the `Room` of the supplied `core`, so it is created at
 * runtime rather than declared statically.
 *
 * @param core       loaded core module; only `core.Room` is used here
 * @param controller receives `handleAgentMessage`, `handleAgentJoin`, `handleAgentLeave`
 * @returns the `FleetRoom` class
 */
function createFleetRoomClass(core, controller) {
  return class FleetRoom extends core.Room {
    // Strict request/reply (task 011): an unbound topic is an unsolicited frame
    // → kick. Supersedes the pre-011 'drop' forward-compat stance (§7).
    unknownTopicPolicy = "kick";
    /** Bind one forwarding handler per agent topic. */
    onCreate() {
      AGENT_TOPICS.forEach((topic) => {
        this.bind(topic, (actor, payload) => {
          controller.handleAgentMessage(actor.id, topic, payload);
        });
      });
    }
    /** Hand the controller a link wrapping the joining actor. */
    onJoin(actor) {
      controller.handleAgentJoin(this.linkFor(actor));
    }
    /** Tell the controller which actor id left. */
    onLeave(actor) {
      controller.handleAgentLeave(actor.id);
    }
    /** Wrap an actor as a link object: `send` encodes then sends a frame, `close` kicks the actor. */
    linkFor(actor) {
      const send = (topic, payload) => {
        this.send(actor, topic, encodeFrame(topic, payload));
      };
      const close = () => {
        this.kick(actor);
      };
      return { instanceId: actor.id, send, close };
    }
  };
}
1833
+
1834
+ // src/orchestrator/transport.ts
1835
/** Room type name under which the fleet room class is defined. */
var FLEET_ROOM_TYPE = "@rivalis/fleet";
/** Id of the single fleet room; also the room id handed out on successful auth. */
var FLEET_ROOM_ID = "fleet";
/** Passed as the WebSocket transport's `maxPayload` (4 MiB). */
var MAX_SNAPSHOT_BYTES = 4 * 1024 ** 2;
/** Assigned to the shared HTTP server's `headersTimeout` (10 s). */
var HEADERS_TIMEOUT_MS = 10_000;
/** Assigned to the shared HTTP server's `requestTimeout` (30 s). */
var REQUEST_TIMEOUT_MS = 30_000;
1840
/**
 * Choose the subprotocol to answer a WebSocket upgrade with.
 *
 * @param protocols iterable collection with `has()` holding the client-offered values
 * @returns `WS_SUBPROTOCOL` when offered; otherwise the first offered value;
 *          `false` when nothing was offered
 */
function selectSubprotocol(protocols) {
  if (protocols.has(WS_SUBPROTOCOL)) {
    return WS_SUBPROTOCOL;
  }
  // Fall back to whatever the client listed first, if anything.
  const first = protocols[Symbol.iterator]().next();
  return first.done ? false : first.value;
}
1849
/**
 * Token-bucket settings for the control-plane connection, derived from
 * `MAX_INFLIGHT_COMMANDS + 1`: that value is the refill rate per second and
 * twice that value is the bucket capacity.
 */
function controlPlaneRateLimiterOptions() {
  const refillPerSecond = MAX_INFLIGHT_COMMANDS + 1;
  return {
    capacity: refillPerSecond * 2,
    refillPerSecond
  };
}
1856
/**
 * Create (but do not start) a `node:http` server that forwards every request
 * to `handler` and has the module's header/request timeouts applied.
 *
 * @param handler `(req, res) => void`
 * @returns the configured server
 */
function createSharedHttpServer(handler) {
  const onRequest = (req, res) => handler(req, res);
  return Object.assign(createServer(onRequest), {
    headersTimeout: HEADERS_TIMEOUT_MS,
    requestTimeout: REQUEST_TIMEOUT_MS
  });
}
1862
/**
 * Mount the agent-facing WebSocket control plane on an existing HTTP server.
 *
 * Builds a WebSocket transport over `httpServer`, tries to install a
 * subprotocol selector on the transport's underlying server options, creates
 * the core runtime with ticket authentication and a token-bucket rate limiter,
 * and defines + creates the single fleet room.
 *
 * @param core       loaded core module (`AuthMiddleware`, `Rivalis`, `TokenBucketRateLimiter`, `Room`)
 * @param httpServer server the transport attaches to
 * @param deps       `{ authenticator, logger, controller }`
 * @returns the created `Rivalis` instance
 */
function attachControlPlane(core, httpServer, deps) {
  class FleetAuth extends core.AuthMiddleware {
    /** Accept a ticket the authenticator matches by routing it to the fleet room; otherwise reject. */
    async authenticate(ticket) {
      if (!deps.authenticator.matches(ticket)) {
        return null;
      }
      return { data: null, roomId: FLEET_ROOM_ID };
    }
  }
  const transport = new WSTransport(
    { server: httpServer },
    null,
    { ticketSource: "protocol", maxPayload: MAX_SNAPSHOT_BYTES }
  );
  const wsOptions = transport.ws?.options;
  if (wsOptions === undefined) {
    // The selector cannot be installed without the options object; warn only.
    deps.logger.warning("fleet: could not install WS subprotocol selector \u2014 101 may echo the ticket (\xA713)");
  } else {
    wsOptions.handleProtocols = (protocols) => {
      const selected = selectSubprotocol(protocols);
      const echoedClientValue = selected !== false && selected !== WS_SUBPROTOCOL;
      if (echoedClientValue) {
        deps.logger.warning(
          `fleet: WS 101 fell back to echoing a client-offered subprotocol that is not the '${WS_SUBPROTOCOL}' sentinel \u2014 this round-trips the connection ticket (agent key) into the response headers (\xA713). Upgrade the agent client to offer the sentinel. (value not logged)`
        );
      }
      return selected;
    };
  }
  const rateLimiter = new core.TokenBucketRateLimiter(controlPlaneRateLimiterOptions());
  const rivalis = new core.Rivalis({
    transports: [transport],
    authMiddleware: new FleetAuth(),
    rateLimiter
  });
  rivalis.rooms.define(FLEET_ROOM_TYPE, createFleetRoomClass(core, deps.controller));
  rivalis.rooms.create(FLEET_ROOM_TYPE, FLEET_ROOM_ID);
  return rivalis;
}
1897
+
1898
+ // src/env.ts
1899
/**
 * Read an environment variable, optionally coerced to a number or boolean.
 *
 * @param key          variable name looked up in `process.env`
 * @param defaultValue returned when the variable is unset or fails coercion (default `null`)
 * @param type         `"string"` (default), `"number"` or `"boolean"`
 * @returns the raw string, the parsed integer, the parsed boolean, or `defaultValue`
 * @throws {Error} when `process` is not defined (non-Node runtime)
 */
function env(key, defaultValue = null, type = "string") {
  if (typeof process === "undefined") {
    throw new Error("env works only with NodeJS");
  }
  const value = process.env[key];
  if (type === "number") {
    if (value === void 0) {
      return defaultValue;
    }
    const numberValue = Number.parseInt(value, 10);
    // Strict integers only: the parsed value must print back to exactly the
    // input, which rejects "4x", "08", " 5" and "1.5". NaN needs its own check
    // because the literal string "NaN" would otherwise round-trip and leak NaN.
    if (Number.isNaN(numberValue) || numberValue.toString() !== value) {
      return defaultValue;
    }
    return numberValue;
  }
  if (type === "boolean") {
    // Case-insensitive "true"/"false"; anything else (including unset) -> default.
    const boolValue = `${value}`.toLowerCase();
    if (boolValue === "true") {
      return true;
    }
    if (boolValue === "false") {
      return false;
    }
    return defaultValue;
  }
  return value !== void 0 ? value : defaultValue;
}
1923
/** Value of the `NODE_ENV` environment variable as a string, or `null` when unset. */
function nodeEnv() {
  const variable = "NODE_ENV";
  return env(variable);
}
1926
+
1927
+ // src/routers/index.ts
1928
+ import Fastify from "fastify";
1929
+ import cors from "@fastify/cors";
1930
+ import { Router } from "@toolcase/node";
1931
+
1932
+ // src/routers/shared.ts
1933
+ import { createHash as createHash3 } from "crypto";
1934
+ import { HTTP } from "@toolcase/base";
1935
+ import { errorMeta } from "@toolcase/node";
1936
/** Fastify `bodyLimit` for the REST surface (64 KiB). */
var MAX_BODY_BYTES = 1 << 16;
/** Default interval between SSE `: ping` comments (15 s). */
var SSE_PING_MS = 15_000;
/** Token budget of the per-IP failed-auth throttle. */
var AUTH_FAILURE_LIMIT = 10;
/** Window over which the failed-auth budget refills (60 s). */
var AUTH_FAILURE_WINDOW_MS = 60_000;
/** Default cap on concurrently open SSE streams. */
var MAX_SSE_STREAMS = 100;
/** Default cap on the number of per-IP throttle buckets kept in memory. */
var MAX_THROTTLE_BUCKETS = 4096;
1942
/**
 * Build the state shared by all routers of one HTTP API instance.
 *
 * @param deps injected dependencies; `now`, `ssePingMs` and `maxSseStreams` are optional overrides
 * @returns `{ deps, throttle, streams, pingMs, maxStreams, authInfo }`
 */
function createContext(deps) {
  const clock = deps.now ?? Date.now;
  const throttle = new AuthThrottle(AUTH_FAILURE_LIMIT, AUTH_FAILURE_WINDOW_MS, clock);
  return {
    deps,
    throttle,
    // Open SSE streams; drained on shutdown.
    streams: /* @__PURE__ */ new Set(),
    pingMs: deps.ssePingMs ?? SSE_PING_MS,
    maxStreams: deps.maxSseStreams ?? MAX_SSE_STREAMS,
    // request -> { fingerprint, ip }, written by the auth hook, read by the audit hook.
    authInfo: /* @__PURE__ */ new WeakMap()
  };
}
1953
/**
 * Set the reply's status code and build the success envelope for it.
 *
 * @param reply  reply object exposing `code()`
 * @param data   payload (may be omitted)
 * @param status HTTP status, defaults to `HTTP.Status.OK`
 * @returns a new `HTTP.RESTResponse`
 */
function restOk(reply, data, status = HTTP.Status.OK) {
  reply.code(status);
  const envelope = new HTTP.RESTResponse(status, data);
  return envelope;
}
1957
/**
 * Set the reply's status code and build the error envelope for it.
 *
 * @param reply  reply object exposing `code()`
 * @param status HTTP status
 * @param cause  error code string
 * @returns the `toJSON()` form of a new `HTTP.RESTError`
 */
function restError(reply, status, cause) {
  reply.code(status);
  const failure = new HTTP.RESTError(status, cause);
  return failure.toJSON();
}
1961
/**
 * Install the not-found and error handlers that map failures to REST error envelopes.
 *
 * Error mapping, in order: errors `errorMeta` recognises use their own
 * status/code; a 413 status becomes `PAYLOAD_TOO_LARGE`; any other 4xx status
 * becomes `VALIDATION`; everything else is logged and answered as 500 `INTERNAL`.
 *
 * @param fastify   instance to install the handlers on
 * @param getLogger returns the logger to use at the time an error is handled
 */
function installErrorHandlers(fastify, getLogger) {
  fastify.setNotFoundHandler((req, reply) => restError(reply, HTTP.Status.NOT_FOUND, "NOT_FOUND"));
  fastify.setErrorHandler((error, req, reply) => {
    const meta = errorMeta(error);
    if (meta !== null) {
      return restError(reply, meta.status, meta.code ?? "INTERNAL");
    }
    const { statusCode } = error;
    if (statusCode === HTTP.Status.PAYLOAD_TOO_LARGE) {
      return restError(reply, HTTP.Status.PAYLOAD_TOO_LARGE, "PAYLOAD_TOO_LARGE");
    }
    const isClientError = typeof statusCode === "number" && statusCode >= 400 && statusCode < 500;
    if (isClientError) {
      return restError(reply, statusCode, "VALIDATION");
    }
    // Unknown failure: log the details, return only a generic code.
    getLogger().error(`unhandled error on ${req.method} ${pathOf(req)}: ${describe(error)}`);
    return restError(reply, HTTP.Status.INTERNAL_SERVER_ERROR, "INTERNAL");
  });
}
1979
/**
 * Answer a GET with ETag support: the ETag is derived from the fleet's current
 * `stateHash`, and a matching `If-None-Match` yields 304 with a `null` body.
 *
 * @param req   incoming request
 * @param reply reply object
 * @param deps  dependencies exposing `fleet.stats.stateHash`
 * @param data  payload for the 200 case
 * @returns the success envelope, or `null` for 304
 */
function sendConditional(req, reply, deps, data) {
  const etag = weakEtag(deps.fleet.stats.stateHash);
  reply.header("etag", etag);
  if (!ifNoneMatchMatches(req, etag)) {
    return restOk(reply, data);
  }
  reply.code(HTTP.Status.NOT_MODIFIED);
  return null;
}
1988
/** Wrap a state hash as a weak entity tag: `W/"<hash>"`. */
function weakEtag(stateHash) {
  return 'W/"'.concat(stateHash, '"');
}
1991
/**
 * Evaluate the request's `If-None-Match` header against the current ETag.
 *
 * Uses the weak comparison that HTTP prescribes for `If-None-Match`: two tags
 * match when their opaque parts are identical, regardless of a `W/` prefix on
 * either side. A client that sends the strong form `"h"` therefore matches the
 * server's `W/"h"`.
 *
 * @param req  request whose `headers["if-none-match"]` is inspected
 * @param etag current entity tag (weak or strong form)
 * @returns `true` when the header is `*` or lists a tag matching `etag`;
 *          `false` when the header is absent/non-string or nothing matches
 */
function ifNoneMatchMatches(req, etag) {
  const header = req.headers["if-none-match"];
  if (typeof header !== "string") {
    return false;
  }
  if (header.trim() === "*") {
    return true;
  }
  // Reduce a tag to its opaque part by dropping surrounding whitespace and the
  // (case-sensitive) weakness indicator.
  const opaqueTag = (tag) => {
    const trimmed = tag.trim();
    return trimmed.startsWith("W/") ? trimmed.slice(2) : trimmed;
  };
  const wanted = opaqueTag(etag);
  return header.split(",").some((candidate) => opaqueTag(candidate) === wanted);
}
2001
/**
 * Extract the credential from an `Authorization: Bearer <token>` header.
 * The scheme is matched case-insensitively and surrounding whitespace is ignored.
 *
 * @param req request whose `headers["authorization"]` is inspected
 * @returns the token, or `null` when the header is absent, not a string, or not a Bearer credential
 */
function bearerToken(req) {
  const { authorization } = req.headers;
  if (typeof authorization !== "string") {
    return null;
  }
  const found = authorization.trim().match(/^Bearer\s+(.+)$/i);
  return found === null ? null : found[1];
}
2009
/**
 * Short, non-reversible label for a key, suitable for logs:
 * `key#` followed by the first 8 hex characters of the key's SHA-256 digest.
 */
function fingerprint(key) {
  const digestHex = createHash3("sha256").update(key).digest("hex");
  return `key#${digestHex.slice(0, 8)}`;
}
2012
/** The request's `ip`, or the literal `"unknown"` when it is `null`/`undefined`. */
function remoteIp(req) {
  const { ip } = req;
  return ip === null || ip === undefined ? "unknown" : ip;
}
2015
/** The request URL without its query string (everything before the first `?`). */
function pathOf(req) {
  const [path] = req.url.split("?", 1);
  return path;
}
2020
/** True only for `GET /v1/events` (query string ignored). */
function isEventsPath(req) {
  if (req.method !== "GET") {
    return false;
  }
  return pathOf(req) === "/v1/events";
}
2023
/**
 * True for the state-changing routes: `POST /v1/rooms`, `DELETE /v1/rooms/<id>`,
 * and `POST /v1/instances/<id>/drain|undrain`.
 */
function isMutatingRoute(req) {
  const { method } = req;
  const path = pathOf(req);
  if (method === "POST") {
    return path === "/v1/rooms" || /^\/v1\/instances\/[^/]+\/(drain|undrain)$/.test(path);
  }
  if (method === "DELETE") {
    return /^\/v1\/rooms\/.+$/.test(path);
  }
  return false;
}
2036
/**
 * Request hook enforcing admin-key authentication with per-IP throttling.
 *
 * Order of checks: a throttled IP is rejected before any key is examined; then
 * the Bearer token is matched against the admin keys; for the events route
 * only, and only when `sseQueryAuth` is enabled, a `key` query parameter is
 * accepted as a fallback. A failure charges the IP's throttle budget. On
 * success the key fingerprint and IP are stored in `ctx.authInfo` for the request.
 *
 * @throws {FleetError} `AUTH_THROTTLED` or `UNAUTHORIZED`
 */
async function authHook(ctx, req) {
  const ip = remoteIp(req);
  const route = `${req.method} ${pathOf(req)}`;
  if (ctx.throttle.blocked(ip)) {
    ctx.deps.getLogger().warning(`auth throttled ip=${ip} route=${route}`);
    throw new FleetError("AUTH_THROTTLED", "too many failed authentication attempts");
  }
  let matched = matchKey(bearerToken(req), ctx.deps.config.adminKeys);
  const mayUseQueryKey = matched === null && isEventsPath(req) && ctx.deps.config.sseQueryAuth;
  if (mayUseQueryKey) {
    const queryKey = req.query?.["key"];
    if (typeof queryKey === "string") {
      matched = matchKey(queryKey, ctx.deps.config.adminKeys);
    }
  }
  if (matched === null) {
    ctx.throttle.recordFailure(ip);
    ctx.deps.getLogger().warning(`auth failure ip=${ip} route=${route}`);
    throw new FleetError("UNAUTHORIZED", "missing or invalid admin key");
  }
  // Only the fingerprint is retained, never the key itself.
  ctx.authInfo.set(req, { fingerprint: fingerprint(matched), ip });
}
2057
/**
 * Response hook writing one audit log line per mutating request: route, key
 * fingerprint, IP and resulting status code. Non-mutating routes are ignored.
 */
async function auditHook(ctx, req, reply) {
  if (!isMutatingRoute(req)) {
    return;
  }
  // Absent when authentication never completed for this request.
  const info = ctx.authInfo.get(req);
  const logger = ctx.deps.getLogger();
  logger.info(
    `audit route=${req.method} ${pathOf(req)} key=${info?.fingerprint ?? "unknown"} ip=${info?.ip ?? remoteIp(req)} outcome=${reply.statusCode}`
  );
}
2066
/**
 * Compute the CORS response headers for an SSE stream.
 *
 * @param req   request whose `Origin` header is inspected
 * @param cors2 `false` when CORS is disabled, otherwise `{ origins: string[] }`
 * @returns `{}` when CORS is off, the request has no string `Origin`, or the
 *          origin is not allowed; a wildcard header when `origins` contains
 *          `"*"`; otherwise the echoed origin plus `vary: Origin`
 */
function corsHeadersForSse(req, cors2) {
  if (cors2 === false) {
    return {};
  }
  const { origin } = req.headers;
  if (typeof origin !== "string") {
    return {};
  }
  const allowed = cors2.origins;
  if (allowed.includes("*")) {
    return { "access-control-allow-origin": "*" };
  }
  return allowed.includes(origin) ? { "access-control-allow-origin": origin, vary: "Origin" } : {};
}
2082
/**
 * Per-IP token bucket for failed authentication attempts.
 *
 * Every IP starts with `limit` tokens; each failure costs one; tokens refill
 * linearly at `limit` per `windowMs`. An IP with less than one token is
 * blocked. Memory is bounded by an opportunistic sweep of fully refilled
 * buckets and by a hard cap of `maxBuckets`.
 */
var AuthThrottle = class {
  limit;
  windowMs;
  now;
  maxBuckets;
  /** ip -> { tokens, last } where `last` is the time of the last refill. */
  buckets = /* @__PURE__ */ new Map();
  /** Time of the last sweep; gates pruning to at most once per window. */
  lastPruneAt = -Infinity;
  /**
   * @param limit      bucket size (failures allowed from a full bucket)
   * @param windowMs   time for an empty bucket to refill completely
   * @param now        clock function returning milliseconds
   * @param maxBuckets hard cap on tracked IPs
   */
  constructor(limit, windowMs, now, maxBuckets = MAX_THROTTLE_BUCKETS) {
    this.limit = limit;
    this.windowMs = windowMs;
    this.now = now;
    this.maxBuckets = maxBuckets;
  }
  /** True when the IP has less than one token left. */
  blocked(ip) {
    const { tokens } = this.refill(ip);
    return tokens < 1;
  }
  /** Charge one token for a failed attempt, never going below zero. */
  recordFailure(ip) {
    const bucket = this.refill(ip);
    bucket.tokens = Math.max(0, bucket.tokens - 1);
  }
  /** Number of buckets currently tracked. */
  get size() {
    return this.buckets.size;
  }
  /** Return the IP's bucket, brought up to date (or freshly created full). */
  refill(ip) {
    const now = this.now();
    this.prune(now);
    const known = this.buckets.get(ip);
    if (known !== undefined) {
      const elapsed = now - known.last;
      // A non-positive elapsed time (same tick, or clock moved back) changes nothing.
      if (elapsed > 0) {
        const regained = elapsed / this.windowMs * this.limit;
        known.tokens = Math.min(this.limit, known.tokens + regained);
        known.last = now;
      }
      return known;
    }
    const fresh = { tokens: this.limit, last: now };
    this.buckets.set(ip, fresh);
    this.evictIfOver();
    return fresh;
  }
  /**
   * Sweep (at most once per window): drop every bucket untouched for more than
   * a window whose tokens, computed as they would be after refilling now, are
   * back at the limit. Such a bucket is indistinguishable from a fresh one.
   */
  prune(now) {
    const sinceLastSweep = now - this.lastPruneAt;
    if (sinceLastSweep < this.windowMs) {
      return;
    }
    this.lastPruneAt = now;
    for (const [ip, bucket] of this.buckets) {
      const idleFor = now - bucket.last;
      if (idleFor <= this.windowMs) {
        continue;
      }
      const wouldHave = Math.min(this.limit, bucket.tokens + idleFor / this.windowMs * this.limit);
      if (wouldHave >= this.limit) {
        this.buckets.delete(ip);
      }
    }
  }
  /** Hard cap: when above `maxBuckets`, drop the bucket with the smallest `last`. */
  evictIfOver() {
    if (this.buckets.size <= this.maxBuckets) {
      return;
    }
    let victim = null;
    let earliest = Infinity;
    for (const [ip, { last }] of this.buckets) {
      // Strict `<` keeps the first-inserted bucket among equal timestamps.
      if (last < earliest) {
        earliest = last;
        victim = ip;
      }
    }
    if (victim !== null) {
      this.buckets.delete(victim);
    }
  }
};
2167
+
2168
+ // src/routers/HealthRouter.ts
2169
+ import { RouteHandler } from "@toolcase/node";
2170
+ import { HTTP as HTTP2 } from "@toolcase/base";
2171
/** Liveness and readiness probes. */
var HealthRouter = class extends RouteHandler {
  ctx;
  /** @param ctx shared router context; `ctx.deps.isReady()` drives `/readyz` */
  constructor(ctx) {
    super();
    this.ctx = ctx;
  }
  /** Register `GET /healthz` (always OK) and `GET /readyz` (OK or 503 `NOT_READY`). */
  register(fastify) {
    fastify.get("/healthz", async (_req, reply) => restOk(reply));
    fastify.get("/readyz", async (_req, reply) => {
      const ready = this.ctx.deps.isReady();
      return ready ? restOk(reply) : restError(reply, HTTP2.Status.SERVICE_UNAVAILABLE, "NOT_READY");
    });
  }
};
2187
+
2188
+ // src/routers/StatsRouter.ts
2189
+ import { RouteHandler as RouteHandler2 } from "@toolcase/node";
2190
/** Fleet statistics endpoint. */
var StatsRouter = class extends RouteHandler2 {
  ctx;
  /** @param ctx shared router context */
  constructor(ctx) {
    super();
    this.ctx = ctx;
  }
  /** Register `GET /stats`, answered conditionally (ETag / 304). */
  register(fastify) {
    fastify.get("/stats", async (req, reply) => {
      const { deps } = this.ctx;
      return sendConditional(req, reply, deps, deps.fleet.stats);
    });
  }
};
2200
+
2201
+ // src/routers/InstancesRouter.ts
2202
+ import { RouteHandler as RouteHandler3 } from "@toolcase/node";
2203
/** Instance read endpoints plus drain/undrain. */
var InstancesRouter = class extends RouteHandler3 {
  ctx;
  /** @param ctx shared router context */
  constructor(ctx) {
    super();
    this.ctx = ctx;
  }
  /**
   * Register `GET /instances` (conditional), `GET /instances/:id`,
   * `GET /instances/:id/rooms`, and `POST /instances/:id/drain|undrain`.
   * Lookups of an unknown id throw `FleetError("INSTANCE_NOT_FOUND")`.
   */
  register(fastify) {
    const { deps } = this.ctx;
    // Fetch an instance or fail with the not-found error shared by both read routes.
    const requireInstance = (id) => {
      const instance = deps.fleet.getInstance(id);
      if (instance === null) {
        throw new FleetError("INSTANCE_NOT_FOUND", `instance ${id} not found`);
      }
      return instance;
    };
    fastify.get("/instances", async (req, reply) => sendConditional(req, reply, deps, deps.fleet.instances));
    fastify.get("/instances/:id", async (req, reply) => restOk(reply, requireInstance(paramId(req))));
    fastify.get("/instances/:id/rooms", async (req, reply) => {
      const id = paramId(req);
      requireInstance(id);
      return restOk(reply, deps.fleet.findRooms({ instanceId: id }));
    });
    fastify.post("/instances/:id/drain", async (req, reply) => {
      await deps.fleet.drainInstance(paramId(req));
      return restOk(reply);
    });
    fastify.post("/instances/:id/undrain", async (req, reply) => {
      await deps.fleet.undrainInstance(paramId(req));
      return restOk(reply);
    });
  }
};
2237
/** The `:id` route parameter of a request. */
function paramId(req) {
  const { id } = req.params;
  return id;
}
2240
+
2241
+ // src/routers/RoomsRouter.ts
2242
+ import { RouteHandler as RouteHandler4, deriveJsonSchema } from "@toolcase/node";
2243
+ import { HTTP as HTTP3 } from "@toolcase/base";
2244
/** JSON schema used to validate the `POST /rooms` body, derived from `roomCreateSchema`. */
var roomCreateBodySchema = deriveJsonSchema(roomCreateSchema, "create");
/** Room listing, lookup, creation and destruction endpoints. */
var RoomsRouter = class extends RouteHandler4 {
  ctx;
  /** @param ctx shared router context */
  constructor(ctx) {
    super();
    this.ctx = ctx;
  }
  /**
   * Register `GET /rooms` (filtered, conditional), `POST /rooms` (201 with the
   * created room), `GET /rooms/:roomId` and `DELETE /rooms/:roomId`.
   * An unknown room on GET throws `FleetError("ROOM_NOT_FOUND")`.
   */
  register(fastify) {
    const { fleet } = this.ctx.deps;
    const deps = this.ctx.deps;
    fastify.get("/rooms", async (req, reply) => {
      const rooms = fleet.findRooms(roomFilter(req));
      return sendConditional(req, reply, deps, rooms);
    });
    fastify.post("/rooms", { schema: { body: roomCreateBodySchema } }, async (req, reply) => {
      const created = await fleet.createRoom(req.body);
      return restOk(reply, created, HTTP3.Status.CREATED);
    });
    fastify.get("/rooms/:roomId", async (req, reply) => {
      const roomId = publicRoomId(req);
      const room = fleet.getRoom(roomId);
      if (room === null) {
        throw new FleetError("ROOM_NOT_FOUND", `room ${roomId} not found`);
      }
      return restOk(reply, room);
    });
    fastify.delete("/rooms/:roomId", async (req, reply) => {
      await fleet.destroyRoom(publicRoomId(req));
      return restOk(reply);
    });
  }
};
2274
/**
 * Translate the query string of `GET /rooms` into a room filter.
 *
 * Recognised parameters: `type` and `instanceId` (strings), and `label`, given
 * once or repeatedly as `name:value`. Label entries that are not strings, have
 * no colon, or start with the colon are ignored; when at least one `label`
 * parameter is present the filter always carries a `labels` object (possibly empty).
 *
 * @param req request whose `query` is inspected (may be absent)
 * @returns filter with any of `type`, `instanceId`, `labels`
 */
function roomFilter(req) {
  const { type, instanceId, label } = req.query ?? {};
  const filter = {};
  if (typeof type === "string") {
    filter.type = type;
  }
  if (typeof instanceId === "string") {
    filter.instanceId = instanceId;
  }
  // Normalise the parameter to a list: repeated -> array, single -> one entry.
  let labelParams;
  if (Array.isArray(label)) {
    labelParams = label;
  } else if (label === undefined) {
    labelParams = [];
  } else {
    labelParams = [label];
  }
  if (labelParams.length === 0) {
    return filter;
  }
  const labels = {};
  for (const entry of labelParams) {
    if (typeof entry !== "string") {
      continue;
    }
    // Split on the FIRST colon only, so values may themselves contain colons.
    const separator = entry.indexOf(":");
    if (separator > 0) {
      labels[entry.slice(0, separator)] = entry.slice(separator + 1);
    }
  }
  filter.labels = labels;
  return filter;
}
2300
/**
 * The public room id of a room route: the text after the last `/` of the
 * request path (query string excluded), taken verbatim from the URL.
 */
function publicRoomId(req) {
  const path = pathnameOf(req);
  return path.slice(path.lastIndexOf("/") + 1);
}
2305
/** The request URL up to, but not including, the first `?`. */
function pathnameOf(req) {
  const { url } = req;
  const queryStart = url.indexOf("?");
  if (queryStart === -1) {
    return url;
  }
  return url.substring(0, queryStart);
}
2310
+
2311
+ // src/routers/EventsRouter.ts
2312
+ import { RouteHandler as RouteHandler5 } from "@toolcase/node";
2313
/** Server-Sent Events endpoint streaming fleet events to API consumers. */
var EventsRouter = class extends RouteHandler5 {
  ctx;
  /** @param ctx shared router context (`deps`, `streams`, `maxStreams`, `pingMs`) */
  constructor(ctx) {
    super();
    this.ctx = ctx;
  }
  /** Register `GET /events`. */
  register(fastify) {
    fastify.get("/events", async (req, reply) => this.stream(req, reply));
  }
  /**
   * Open one SSE stream on the raw response.
   *
   * Rejects with `FleetError("SSE_LIMIT")` when the concurrent-stream cap is
   * reached. Otherwise hijacks the reply, writes the SSE headers and an initial
   * comment, forwards every subscribed event as an `event:`/`data:` frame,
   * sends a periodic `: ping` comment, and registers the stream in
   * `ctx.streams` until the request closes or the API shuts down.
   */
  stream(req, reply) {
    const ctx = this.ctx;
    if (ctx.streams.size >= ctx.maxStreams) {
      ctx.deps.getLogger().warning(
        `sse stream cap reached (${ctx.maxStreams}) \u2014 rejecting new stream from ip=${remoteIp(req)}`
      );
      throw new FleetError("SSE_LIMIT", `concurrent SSE stream cap reached (${ctx.maxStreams})`);
    }
    reply.hijack();
    const raw = reply.raw;
    raw.writeHead(200, {
      ...corsHeadersForSse(req, ctx.deps.config.cors),
      "content-type": "text/event-stream; charset=utf-8",
      "cache-control": "no-cache, no-transform",
      connection: "keep-alive",
      // No Last-Event-ID replay (§10): a reconnecting consumer re-GETs stats+instances.
      "x-accel-buffering": "no"
    });
    // Best-effort write: a finished or destroyed response is skipped, and a
    // synchronous write failure is deliberately ignored (the close handler
    // below performs the teardown).
    const write = (chunk) => {
      if (raw.writableEnded || raw.destroyed) {
        return;
      }
      try {
        raw.write(chunk);
      } catch {
      }
    };
    write(": connected\n\n");
    const unsubscribe = ctx.deps.subscribe((event) => {
      write(`event: ${event.type}\ndata: ${JSON.stringify(event.data ?? null)}\n\n`);
    });
    const ping = setInterval(() => write(": ping\n\n"), ctx.pingMs);
    ping.unref?.();
    // Teardown can be requested twice for the same stream: once by the API
    // shutdown path and again by the request's `close` event that follows
    // ending the response. The flag makes the second call a no-op so
    // `unsubscribe` runs exactly once.
    let cleaned = false;
    const stream = {
      end: () => {
        if (!raw.writableEnded) {
          raw.end();
        }
      },
      cleanup: () => {
        if (cleaned) {
          return;
        }
        cleaned = true;
        clearInterval(ping);
        unsubscribe();
        ctx.streams.delete(stream);
      }
    };
    ctx.streams.add(stream);
    req.raw.on("close", () => stream.cleanup());
  }
};
2374
+
2375
+ // src/routers/index.ts
2376
/**
 * Assemble the Fastify-based HTTP API.
 *
 * Health routes are always mounted at the root. When `deps.config.api` is
 * truthy, the authenticated REST/SSE routes are mounted under `/v1` with the
 * auth hook on `onRequest` and the audit hook on `onResponse`. CORS is
 * registered unless `deps.config.cors` is `false`.
 *
 * @param deps    injected dependencies (`config`, `getLogger`, `fleet`, ...)
 * @param options `{ serverFactory? }`, forwarded to Fastify when provided
 * @returns `{ fastify, ready, listen, shutdown, close }`; `shutdown` ends all
 *          open SSE streams, `close` does that and then closes Fastify
 */
function createHttpApi(deps, options = {}) {
  const ctx = createContext(deps);
  const { config } = deps;
  const fastifyOptions = { logger: false, bodyLimit: MAX_BODY_BYTES, trustProxy: config.trustProxy };
  if (options.serverFactory !== undefined) {
    fastifyOptions.serverFactory = options.serverFactory;
  }
  const fastify = Fastify(fastifyOptions);
  installErrorHandlers(fastify, deps.getLogger);
  if (config.cors !== false) {
    const { origins } = config.cors;
    const origin = origins.includes("*") ? "*" : origins;
    void fastify.register(cors, { origin });
  }
  new HealthRouter(ctx).register(fastify);
  if (config.api) {
    const mountV1 = async (v1) => {
      v1.addHook("onRequest", (req) => authHook(ctx, req));
      v1.addHook("onResponse", (req, reply) => auditHook(ctx, req, reply));
      const router = new Router();
      router.add(new StatsRouter(ctx)).add(new InstancesRouter(ctx)).add(new RoomsRouter(ctx)).add(new EventsRouter(ctx));
      router.register(v1);
    };
    void fastify.register(mountV1, { prefix: "/v1" });
  }
  // Iterate over a copy: `cleanup` removes the stream from `ctx.streams`.
  const drainStreams = () => {
    for (const stream of Array.from(ctx.streams)) {
      stream.cleanup();
      stream.end();
    }
  };
  return {
    fastify,
    async ready() {
      await fastify.ready();
    },
    async listen(opts) {
      await fastify.listen(opts);
    },
    shutdown: drainStreams,
    async close() {
      drainStreams();
      await fastify.close();
    }
  };
}
2414
+
2415
+ // src/util/loadCore.ts
2416
+ import { createRequire as createRequire2 } from "module";
2417
/**
 * Load `@rivalis/core` synchronously. Uses a `require` created from this
 * module's URL when `import.meta.url` is available, otherwise the bundle's
 * `__require` shim.
 */
function loadCore() {
  const moduleUrl = import.meta.url;
  const load = moduleUrl ? createRequire2(moduleUrl) : __require;
  return load("@rivalis/core");
}
2422
+
2423
+ // src/util/scheduler.ts
2424
/**
 * Scheduler backed by the global timer functions. Timers it creates are
 * `unref()`ed (when the handle supports it) so they do not keep the process alive.
 */
var defaultScheduler = {
  setTimeout: (fn, ms) => {
    const handle = setTimeout(fn, ms);
    handle.unref?.();
    return handle;
  },
  clearTimeout: (handle) => clearTimeout(handle),
  setInterval: (fn, ms) => {
    const handle = setInterval(fn, ms);
    handle.unref?.();
    return handle;
  },
  clearInterval: (handle) => clearInterval(handle)
};
2438
+
2439
+ // src/orchestrator/Orchestrator.ts
2440
/**
 * Fleet control plane: owns the read model (`FleetState`), the agent links, the
 * poll / command machinery and the REST surface, and re-broadcasts fleet events.
 *
 * Lifecycle: construct → `listen()` → `shutdown()`. The `handleAgent*` methods are
 * the controller interface driven by the agent transport.
 */
var Orchestrator = class extends Broadcast {
  /** Read/control facade over the fleet (getters read through to `state`, mutations go through `control`). */
  fleet;
  config;
  state;
  now;
  logger;
  /** `fleet:http` logger; starts as the constructor logger and is replaced once `listen()` attaches core's logging factory. */
  httpLogger;
  /** Fastify-based REST /v1 surface over the same `node:http` server (§10, task 006). */
  httpApi;
  // Injected collaborators (§15) — each a separately unit-tested concern.
  auth;
  commands;
  poller;
  reconciler;
  control;
  /** Live agent links keyed by connection-scoped instance id. */
  links = /* @__PURE__ */ new Map();
  rivalis = null;
  httpServer = null;
  listening = false;
  transportAttached = false;
  /** Promise of the `listen()` call currently in progress, or `null`; makes concurrent `listen()` calls share one attempt. */
  listenInFlight = null;
  /**
   * @param options Orchestrator options, validated by `resolveConfig`.
   * @param internals Optional overrides read here: `scheduler`, `now`, `logger`, `env`.
   */
  constructor(options, internals = {}) {
    super();
    this.config = resolveConfig(options);
    const scheduler = internals.scheduler ?? defaultScheduler;
    this.now = internals.now ?? Date.now;
    this.logger = internals.logger ?? NOOP_LOGGER;
    this.httpLogger = this.logger;
    const resolvedNodeEnv = internals.env ?? nodeEnv();
    const securityContext = { logger: this.logger };
    // Only set `env` when one was resolved, so the key is absent rather than null/undefined.
    if (resolvedNodeEnv != null) {
      securityContext.env = resolvedNodeEnv;
    }
    enforceSecurityPolicy(this.config, securityContext);
    this.state = new FleetState({ logger: this.logger });
    this.auth = new AgentAuthenticator(this.config.agentKeys);
    this.commands = new CommandEngine(scheduler, this.state, this.config.commandTimeoutMs);
    this.poller = new Poller(scheduler, this.config.heartbeatMs, {
      sendPoll: (id, reqId, forceFull) => this.sendPoll(id, reqId, forceFull),
      onStale: (id) => this.onStale(id),
      onEvict: (id) => this.onEvict(id)
    });
    this.reconciler = new EventReconciler(this.state, (event, data) => this.emitEvent(event, data));
    this.control = new FleetControl(this.state, this.commands, (id) => this.links.get(id));
    // The facade uses accessor properties, whose `this` is the facade object itself,
    // so the orchestrator is captured explicitly.
    const self = this;
    this.fleet = {
      get stats() {
        return self.state.stats;
      },
      get instances() {
        return self.state.instances;
      },
      get rooms() {
        return self.state.rooms;
      },
      getInstance: (id) => self.state.getInstance(id),
      getRoom: (id) => self.state.getRoom(id),
      findRooms: (filter) => self.state.findRooms(filter ?? {}),
      createRoom: (request) => self.control.createRoom(request),
      destroyRoom: (roomId) => self.control.destroyRoom(roomId),
      drainInstance: (instanceId) => self.control.drainInstance(instanceId),
      undrainInstance: (instanceId) => self.control.undrainInstance(instanceId)
    };
    this.httpApi = createHttpApi(
      {
        config: this.config,
        fleet: this.fleet,
        isReady: () => this.ready,
        subscribe: (listener) => this.subscribeFleetEvents(listener),
        getLogger: () => this.httpLogger,
        now: this.now
      },
      {
        // Capture the server the REST layer creates so `listen()` can attach the agent transport to it.
        serverFactory: (handler) => {
          const server = createSharedHttpServer(handler);
          this.httpServer = server;
          return server;
        }
      }
    );
  }
  /**
   * Bridge every {@link FleetEventType} broadcast (§9) into one SSE listener as a
   * {@link FleetEvent} `{ type, data }`; returns an unsubscribe (called on stream close, §10).
   */
  subscribeFleetEvents(listener) {
    const types = [
      "instance:join",
      "instance:leave",
      "instance:stale",
      "room:create",
      "room:destroy",
      "sync"
    ];
    const handlers = types.map((type) => {
      const handler = (data) => listener({ type, data });
      this.on(type, handler);
      return { type, handler };
    });
    return () => {
      for (const { type, handler } of handlers) {
        this.off(type, handler);
      }
    };
  }
  /** True once HTTP is listening and the WS transport is attached (drives `/readyz`, task 010). */
  get ready() {
    return this.listening && this.transportAttached;
  }
  // ---- Lifecycle ----
  /**
   * Start the HTTP/WS server, attach the internal Rivalis room, begin accepting agents (§9).
   *
   * No-op when already listening. Concurrent calls share a single start attempt, so
   * the control plane is never attached twice to the same HTTP server.
   *
   * @returns {Promise<void>} Settles with the outcome of the (shared) start attempt.
   */
  async listen() {
    if (this.listening) {
      return;
    }
    if (this.listenInFlight !== null) {
      return this.listenInFlight;
    }
    const attempt = this.startListening();
    this.listenInFlight = attempt;
    try {
      await attempt;
    } finally {
      this.listenInFlight = null;
    }
  }
  /**
   * One start attempt behind `listen()`: attach the control plane (unless a previous
   * attempt already did and then failed to bind) and start the HTTP listener.
   */
  async startListening() {
    const core = loadCore();
    const httpServer = this.httpServer;
    if (httpServer === null) {
      throw new Error("orchestrator: http server was not created by the REST layer");
    }
    // A previous attempt may have attached the control plane and then failed in
    // `httpApi.listen` (e.g. port in use); reuse that attachment on retry.
    if (this.rivalis === null) {
      const rivalis = attachControlPlane(core, httpServer, { authenticator: this.auth, controller: this, logger: this.logger });
      this.rivalis = rivalis;
      this.logger = rivalis.logging.getLogger("fleet");
      this.httpLogger = rivalis.logging.getLogger("fleet:http");
      this.transportAttached = true;
    }
    await this.httpApi.listen({ host: this.config.host, port: this.config.port });
    this.listening = true;
    this.logger.info(
      `orchestrator listening host=(${this.config.host}) port=(${this.config.port}) api=(${this.config.api ? "/v1" : "off"}) heartbeat=(${this.config.heartbeatMs}ms)`
    );
  }
  /**
   * Gracefully stop: reject in-flight commands, destroy rooms, dispose transport, close HTTP (§9).
   *
   * Per-instance teardown failures are logged and skipped so the transport and HTTP
   * server are still closed. The lifecycle flags are reset even when closing the HTTP
   * layer rejects; that rejection is propagated to the caller.
   */
  async shutdown() {
    this.httpApi.shutdown();
    for (const instanceId of [...this.links.keys()]) {
      this.guard(`shutdown teardown instance=${instanceId}`, () => {
        this.teardownInstance(instanceId, "orchestrator shutdown");
      });
    }
    if (this.rivalis !== null) {
      try {
        await this.rivalis.shutdown();
      } catch (error) {
        this.logger.warning(`rivalis shutdown error: ${describe(error)}`);
      }
      this.rivalis = null;
    }
    try {
      await this.httpApi.close();
    } finally {
      this.httpServer = null;
      this.transportAttached = false;
      this.listening = false;
    }
  }
  // ---- FleetController — driven by the FleetRoom (agent transport, §7) ----
  /** @internal Agent joined: register its link, send `fleet/hello`, start polling (§7, task 011). */
  handleAgentJoin(link) {
    this.guard(`agent join instance=${link.instanceId}`, () => {
      this.links.set(link.instanceId, link);
      link.send(Topics.hello, {
        instanceId: link.instanceId,
        protocolVersion: PROTOCOL_VERSION,
        heartbeatMs: this.config.heartbeatMs
      });
      this.poller.start(link.instanceId);
      this.logger.info(`agent joined instance=${link.instanceId}`);
    });
  }
  /** @internal Agent socket closed: evict instantly, rejecting any in-flight commands (§7). */
  handleAgentLeave(instanceId) {
    this.guard(`agent leave instance=${instanceId}`, () => {
      this.teardownInstance(instanceId, "socket close");
    });
  }
  /**
   * @internal Inbound agent frame (task 011). Every agent frame must be a reply to
   * an outstanding orchestrator request — `fleet/state` to a `fleet/poll`,
   * `fleet/ack` to a `fleet/cmd`. A well-formed frame whose correlation id matches
   * no outstanding request (spontaneous, duplicate, or post-settle) is an
   * unsolicited frame → kick. A malformed / version-incompatible frame is logged
   * and dropped (the lockstep-mismatch path is evicted by missed polls, §7/§8).
   */
  handleAgentMessage(instanceId, topic, payload) {
    this.guard(`agent message instance=${instanceId} topic=${topic}`, () => {
      // Frames from an instance with no registered link are ignored.
      if (!this.links.has(instanceId)) {
        return;
      }
      switch (topic) {
        case Topics.state: {
          const decoded = this.decode(instanceId, Topics.state, payload);
          if (decoded !== null) {
            this.handleState(instanceId, decoded);
          }
          return;
        }
        case Topics.ack: {
          const decoded = this.decode(instanceId, Topics.ack, payload);
          if (decoded !== null) {
            this.handleAck(instanceId, decoded);
          }
          return;
        }
        default:
          this.kick(instanceId, "unexpected topic on the control plane");
      }
    });
  }
  // ---- Poll dispatch + reply ingestion (§7, task 011) ----
  /** Build and send a `fleet/poll`: knownHash drives dedup, status echoes for drain confirmation. */
  sendPoll(instanceId, reqId, forceFull) {
    this.guard(`poll instance=${instanceId}`, () => {
      const link = this.links.get(instanceId);
      if (link === void 0) {
        return;
      }
      // A forced-full poll sends no known hash.
      const knownHash = forceFull ? null : this.state.lastHashOf(instanceId);
      const status = this.state.getInstance(instanceId)?.status ?? "active";
      link.send(Topics.poll, { reqId, knownHash, status });
    });
  }
  /**
   * Ingest a `fleet/state` poll reply (task 011). The reply must match the
   * outstanding poll's `reqId` (consumed via the poller); an unmatched reply is
   * unsolicited → kick. A full reply is bounds-checked (§13) and applied; a
   * hash-only reply just refreshes liveness (the snapshot is unchanged).
   */
  handleState(instanceId, state) {
    if (!this.poller.reply(instanceId, state.reqId)) {
      this.kick(instanceId, "unsolicited or duplicate fleet/state (no matching outstanding poll)");
      return;
    }
    this.state.setStale(instanceId, false);
    if (!state.full) {
      this.state.touch(instanceId, this.now());
      return;
    }
    const reason = validateSnapshot(state);
    if (reason !== null) {
      this.logger.warning(`rejected snapshot from instance=${instanceId}: ${reason} (\xA713)`);
      return;
    }
    // Reconcile only when `applySnapshot` returns truthy.
    if (this.state.applySnapshot(instanceId, state, this.now())) {
      this.reconciler.reconcile();
    }
  }
  /** Settle the command named by a `fleet/ack`; an ack the command engine does not accept → kick. */
  handleAck(instanceId, ack) {
    if (!this.commands.ack(instanceId, ack)) {
      this.kick(instanceId, "ack for unknown or already-settled command");
    }
  }
  /**
   * Kick an agent that broke the request/reply contract (task 011): tear it down
   * (rejecting in-flight commands, removing it from the read model) and close the
   * socket so it reconnects fresh. The log line names the cause and the instance —
   * never the offending payload's contents (§13).
   */
  kick(instanceId, reason) {
    // Grab the link first: teardown removes it from `links`.
    const link = this.links.get(instanceId);
    this.logger.warning(`kicking instance=${instanceId}: ${reason} (request/reply enforcement, \xA77)`);
    this.teardownInstance(instanceId, "protocol violation");
    link?.close();
  }
  // ---- Liveness callbacks (read-model + events); timers owned by the Poller ----
  /** Poller callback: mark the instance stale and emit `instance:stale` if it is still in the read model. */
  onStale(instanceId) {
    this.guard(`stale instance=${instanceId}`, () => {
      this.state.setStale(instanceId, true);
      this.logger.warning(`instance=${instanceId} stale (2 missed poll replies) \u2014 excluded from placement`);
      const info = this.state.getInstance(instanceId);
      if (info !== null) {
        this.emitEvent("instance:stale", info);
      }
    });
  }
  /** Poller callback: tear the instance down and close its socket. */
  onEvict(instanceId) {
    this.guard(`evict instance=${instanceId}`, () => {
      // Grab the link first: teardown removes it from `links`.
      const link = this.links.get(instanceId);
      this.logger.warning(`evicting wedged instance=${instanceId} (3 missed poll replies)`);
      this.teardownInstance(instanceId, "liveness eviction");
      link?.close();
    });
  }
  /**
   * Remove an instance from every table, reject its in-flight commands immediately
   * with `INSTANCE_DISCONNECTED` (§7), and reconcile (its rooms → `room:destroy`, `sync`).
   */
  teardownInstance(instanceId, reason) {
    // Known to neither the link table nor the poller: nothing to tear down.
    if (!this.links.has(instanceId) && !this.poller.has(instanceId)) {
      return;
    }
    this.links.delete(instanceId);
    this.poller.forget(instanceId);
    this.commands.rejectAll(instanceId, reason);
    const removed = this.state.removeInstance(instanceId);
    if (removed !== null) {
      this.reconciler.instanceRemoved(removed);
    }
    this.reconciler.reconcile();
  }
  // ---- Internals ----
  /**
   * Decode a binary agent frame for `topic` (§7). Returns `null` on any failure —
   * never throws into the host (§8): a protocol-incompatible frame (e.g. a legacy
   * JSON agent against this v2 orchestrator) or a malformed/truncated one is logged
   * and dropped, and the read model keeps its last good state.
   */
  decode(instanceId, topic, payload) {
    // String payloads are converted to their UTF-8 bytes before decoding.
    const bytes = typeof payload === "string" ? Buffer.from(payload, "utf-8") : payload;
    try {
      return decodeFrame(topic, bytes);
    } catch (error) {
      if (error instanceof WireVersionError) {
        this.logger.warning(
          `dropped protocol-incompatible frame from instance=${instanceId} topic=${topic} (peer major=${error.theirVersion}, orchestrator=${PROTOCOL_VERSION}) \u2014 agents and orchestrator must run the same @rivalis/fleet major (\xA77)`
        );
      } else {
        this.logger.warning(`failed to decode agent frame topic=${topic} from instance=${instanceId}: ${describe(error)}`);
      }
      return null;
    }
  }
  /** Emit a fleet event; a throw surfacing from `emit` is logged instead of propagating. */
  emitEvent(event, data) {
    try {
      this.emit(event, data);
    } catch (error) {
      this.logger.error(`listener for ${event} threw: ${describe(error)}`);
    }
  }
  /**
   * Run a timer- / transport- / core-dispatch-driven callback, swallowing and
   * logging any throw so it never escapes into a raw `setTimeout` (an
   * `uncaughtException` that would crash the whole control plane) or back into
   * core's room dispatch (§14 failure modes). Mirrors the agent's host-safety
   * `guard` (§8): the orchestrator is the single point of coordination, so one
   * unhandled throw on a poll tick, a snapshot application, or a liveness deadline
   * must degrade to a logged failure on one instance, never an orchestrator-wide
   * outage. Never rethrows.
   */
  guard(label, fn) {
    try {
      fn();
    } catch (error) {
      this.logger.error(`orchestrator ${label} handler error: ${describe(error)}`);
    }
  }
};
2782
+
2783
+ // src/agent/FleetAgent.ts
2784
+ import { Broadcast as Broadcast2 } from "@toolcase/base";
2785
+
2786
+ // src/agent/Snapshot.ts
2787
+ import { randomBytes as randomBytes2 } from "crypto";
2788
+
2789
+ // src/util/packageVersion.ts
2790
+ import { createRequire as createRequire3 } from "module";
2791
/**
 * Read the `version` field of the `package.json` one directory above this module.
 *
 * @returns {string} The version, or `"0.0.0"` when the field is nullish or the
 *   manifest cannot be loaded for any reason.
 */
function packageVersion() {
  const FALLBACK = "0.0.0";
  try {
    const moduleUrl = import.meta.url;
    // Prefer a require anchored at this module; otherwise use the bundler's shim.
    const load = moduleUrl ? createRequire3(moduleUrl) : __require;
    const { version } = load("../package.json");
    return version ?? FALLBACK;
  } catch {
    return FALLBACK;
  }
}
2801
+
2802
+ // src/agent/Snapshot.ts
2803
/** Snapshot size ceiling in bytes (4 MiB), the "transport frame limit" named in the size-guard log lines. */
var MAX_SNAPSHOT_BYTES2 = 4 * 2 ** 20;
/** Fraction of the ceiling at which the size guard logs a warning. */
var WARN_RATIO = 1 / 2;
/** Fraction of the ceiling at which the size guard logs an error. */
var ERROR_RATIO = 9 / 10;
/** Minimum `@rivalis/core` version named in the core-support error message. */
var MIN_CORE_VERSION = "6.1.0";
2807
/**
 * Generate a random process uid: `p_` followed by 12 random bytes rendered as
 * 24 lowercase hex characters.
 *
 * @returns {string} The new uid.
 */
function generateProcessUid() {
  const entropy = randomBytes2(12);
  return `p_${entropy.toString("hex")}`;
}
2810
/**
 * Builds the agent's `fleet/state` replies from live core state: the instance's
 * identity, capacity and status plus one entry per local room.
 */
var Snapshot = class {
  /** Stable per-process id (§6) — constant across reconnects. */
  processUid;
  rivalis;
  logger;
  name;
  endpointUrl;
  labels;
  capacity;
  autoCreate;
  agentVersion;
  protocolVersion;
  /** Room ids created in response to `fleet/cmd` → stamped `origin: 'fleet'`. */
  fleetOrigins = /* @__PURE__ */ new Set();
  /** Agent owns `status` (§7); flipped via `setStatus`. */
  statusValue;
  /** Per-connection monotonic frame counter — defensive hardening only (§7). */
  seq = 0;
  /**
   * @param rivalis Host core instance; must expose `rooms` with `definitions()`, `keys()` and `get()`.
   * @param options Snapshot options. `name` and `endpointUrl` are copied as given; `labels`,
   *   `capacity`, `autoCreate`, `agentVersion`, `protocolVersion`, `processUid` and `status`
   *   fall back to defaults when nullish.
   * @param logger Optional logger; defaults to core's `fleet:agent` logger, then to the no-op logger.
   * @throws {Error} When `rivalis` lacks the required core additions (see `assertCoreSupport`).
   */
  constructor(rivalis, options, logger) {
    this.assertCoreSupport(rivalis);
    this.rivalis = rivalis;
    this.logger = logger ?? rivalis.logging?.getLogger?.("fleet:agent") ?? NOOP_LOGGER;
    this.name = options.name;
    this.endpointUrl = options.endpointUrl;
    this.labels = options.labels ?? {};
    this.capacity = {
      maxConnections: options.capacity?.maxConnections ?? null,
      maxRooms: options.capacity?.maxRooms ?? null
    };
    this.autoCreate = options.autoCreate ?? true;
    this.agentVersion = options.agentVersion ?? packageVersion();
    this.protocolVersion = options.protocolVersion ?? PROTOCOL_VERSION;
    this.processUid = options.processUid ?? generateProcessUid();
    this.statusValue = options.status ?? "active";
  }
  /** Current agent-owned status. */
  get status() {
    return this.statusValue;
  }
  /** Flip the agent-owned status (§7). The next snapshot carries the new value. */
  setStatus(status) {
    this.statusValue = status;
  }
  /** Stamp a room as fleet-created (`origin: 'fleet'`). Called on a `fleet/cmd` create. */
  markFleetOrigin(roomId) {
    this.fleetOrigins.add(roomId);
  }
  /** Drop provenance for a destroyed room so a future id reuse is not mis-stamped. */
  forgetRoom(roomId) {
    this.fleetOrigins.delete(roomId);
  }
  /**
   * New connection (reconnect): reset the `seq` counter. The reconnect assigns a
   * fresh `instanceId`, so the orchestrator holds no prior hash and its first poll
   * carries `knownHash: null` → the next reply is always a full snapshot (§7).
   */
  resetConnection() {
    this.seq = 0;
  }
  /**
   * Rebuild the full semantic snapshot from live core state and hash it. Pure:
   * no `seq`, no size guard, no dedup-state mutation — used for hash inspection
   * and as the basis for {@link pollReply}.
   *
   * @returns {{ content: object, hash: * }} The snapshot content and its `hash64` digest.
   */
  rebuild() {
    const content = this.buildContent();
    return { content, hash: hash64(content) };
  }
  /**
   * Build a `fleet/state` reply to an orchestrator `fleet/poll` (§7, task 011).
   * The orchestrator drives the dedup: a FULL snapshot when the rebuilt hash
   * differs from the poll's `knownHash` (or `knownHash` is null — no prior state /
   * forced full), a hash-only reply otherwise. Always advances `seq`.
   *
   * @param reqId Correlation id of the poll being answered.
   * @param knownHash Hash carried by the poll, or `null`.
   * @returns {{ kind: "state", full: boolean, hash: *, encodedBytes: number, payload: object }}
   *   `encodedBytes` is 0 for a hash-only reply.
   */
  pollReply(reqId, knownHash) {
    const { content, hash } = this.rebuild();
    const seq = this.nextSeq();
    if (knownHash !== null && hash === knownHash) {
      const payload2 = { reqId, full: false, seq, hash, ...content };
      return { kind: "state", full: false, hash, encodedBytes: 0, payload: payload2 };
    }
    const payload = { reqId, full: true, seq, hash, ...content };
    // Encode once here only to measure the frame for the size guard.
    const encodedBytes = encodeFrame(Topics.state, payload).length;
    this.checkSize(encodedBytes, content.rooms.length);
    return { kind: "state", full: true, hash, encodedBytes, payload };
  }
  /** Advance and return the per-connection frame counter (first value after a reset is 1). */
  nextSeq() {
    this.seq += 1;
    return this.seq;
  }
  /**
   * Assemble the snapshot content from live core state: sorted room type names,
   * one entry per room (sorted by id), and the instance's static fields and status.
   *
   * Ids for which the manager returns no room (`null` or `undefined`) are skipped.
   *
   * @throws {Error} When an existing room has no string `type`.
   */
  buildContent() {
    const manager = this.rivalis.rooms;
    const roomTypes = [...manager.definitions()].sort();
    const rooms = [];
    for (const id of manager.keys()) {
      const room = manager.get(id);
      // `== null` also covers `undefined`, so a missing room is skipped instead of
      // crashing on `room.type`.
      if (room == null) {
        continue;
      }
      if (typeof room.type !== "string") {
        throw new Error(this.coreSupportError(`room id=(${id}) has no string \`type\``));
      }
      rooms.push({
        id,
        type: room.type,
        connections: room.actorCount,
        origin: this.fleetOrigins.has(id) ? "fleet" : "local"
      });
    }
    // Sort by id so the content does not depend on the manager's iteration order.
    rooms.sort((a, b) => a.id < b.id ? -1 : a.id > b.id ? 1 : 0);
    return {
      name: this.name,
      processUid: this.processUid,
      agentVersion: this.agentVersion,
      protocolVersion: this.protocolVersion,
      endpointUrl: this.endpointUrl,
      labels: this.labels,
      capacity: this.capacity,
      autoCreate: this.autoCreate,
      roomTypes,
      rooms,
      status: this.statusValue
    };
  }
  /**
   * Size guard: log an error at or above `ERROR_RATIO` of the snapshot ceiling and a
   * warning at or above `WARN_RATIO`. Only logs — never throws or truncates.
   */
  checkSize(bytes, roomCount) {
    const pct = Math.round(bytes / MAX_SNAPSHOT_BYTES2 * 100);
    if (bytes >= MAX_SNAPSHOT_BYTES2 * ERROR_RATIO) {
      this.logger.error(
        `fleet snapshot at ${pct}% of the 4 MiB transport frame limit (${bytes} bytes, ${roomCount} rooms). An oversized snapshot is terminated by the transport, which causes a permanent reconnect loop. Remediation: host fewer rooms per instance, raise the orchestrator's WSTransport.maxPayload, or split the fleet across more instances (chunked sync is roadmap \xA716).`
      );
    } else if (bytes >= MAX_SNAPSHOT_BYTES2 * WARN_RATIO) {
      this.logger.warning(
        `fleet snapshot at ${pct}% of the 4 MiB transport frame limit (${bytes} bytes, ${roomCount} rooms) \u2014 approaching the size guard.`
      );
    }
  }
  /**
   * Feature-detect the §4 core additions and throw an actionable, version-naming
   * error when they are absent — a clean failure at startup instead of
   * `undefined` types in snapshots at runtime. `Room.type` can only be checked
   * against rooms that already exist; with zero rooms the `definitions()` gate
   * is the primary guard (and `buildContent` re-checks each room defensively).
   * Ids for which the manager returns no room (`null` or `undefined`) are skipped.
   */
  assertCoreSupport(rivalis) {
    const manager = rivalis?.rooms;
    if (manager === void 0 || manager === null) {
      throw new Error(this.coreSupportError("rivalis.rooms is not available"));
    }
    if (typeof manager.definitions !== "function") {
      throw new Error(this.coreSupportError("rivalis.rooms.definitions() is not available"));
    }
    if (typeof manager.keys !== "function" || typeof manager.get !== "function") {
      throw new Error(this.coreSupportError("rivalis.rooms.keys()/get() are not available"));
    }
    for (const id of manager.keys()) {
      const room = manager.get(id);
      // `!= null` keeps a missing (`undefined`) room from surfacing as a bare TypeError.
      if (room != null && typeof room.type !== "string") {
        throw new Error(this.coreSupportError(`Room.type is not available (room id=(${id}) has no string \`type\`)`));
      }
    }
  }
  /** Format the version-naming error message for a missing core feature. */
  coreSupportError(detail) {
    return `@rivalis/fleet requires @rivalis/core >= ${MIN_CORE_VERSION}: ${detail}. Upgrade @rivalis/core to >= ${MIN_CORE_VERSION} (the \xA74 additions: Room.type, RoomManager.definitions()).`;
  }
};
2974
+
2975
+ // src/agent/FleetAgent.ts
2976
+ import { WSClient } from "@rivalis/node";
2977
/** Default backoff base (ms), used when `internals.backoff.baseMs` is not supplied. */
var DEFAULT_BACKOFF_BASE_MS = 5e2;
/** Default backoff cap (ms), used when `internals.backoff.capMs` is not supplied. */
var DEFAULT_BACKOFF_CAP_MS = 30_000;
/** Default period (ms) of the `awaitEmpty()` emptiness check, used when `internals.awaitEmptyPollMs` is not supplied. */
var DEFAULT_AWAIT_EMPTY_POLL_MS = 2e2;
2980
/**
 * Default transport factory: build a `WSClient` for `url` with
 * `ticketSource: "protocol"` and the fleet WebSocket subprotocol.
 *
 * @param {string} url Orchestrator URL handed to the client.
 * @returns {WSClient} A new client instance.
 */
function defaultCreateClient(url) {
  const clientOptions = {
    ticketSource: "protocol",
    subprotocols: [WS_SUBPROTOCOL]
  };
  return new WSClient(url, clientOptions);
}
2986
+ var FleetAgent = class extends Broadcast2 {
2987
+ rivalis;
2988
+ logger;
2989
+ snapshot;
2990
+ url;
2991
+ key;
2992
+ autoCreate;
2993
+ maxRooms;
2994
+ connectTimeoutMs;
2995
+ client;
2996
+ scheduler;
2997
+ random;
2998
+ backoffBaseMs;
2999
+ backoffCapMs;
3000
+ awaitEmptyPollMs;
3001
+ installSignalHandlers;
3002
+ lifecycle = "closed";
3003
+ instanceId = null;
3004
+ /** Set once `connect()`/reconnects should stop (intentional `disconnect()` or fatal error). */
3005
+ closed = false;
3006
+ /** Distinguishes an operator-driven close from a transport drop that should reconnect. */
3007
+ intentionalClose = false;
3008
+ reconnectTimer = null;
3009
+ connectDeadline = null;
3010
+ reconnectAttempt = 0;
3011
+ connectResolve = null;
3012
+ connectReject = null;
3013
+ /**
3014
+ * Pending `drain()` / `undrain()` promises (task 011): each waits for a
3015
+ * `fleet/poll` echoing its target status — the orchestrator's acknowledged
3016
+ * confirmation that it recorded the agent-owned status flip. No unsolicited frame.
3017
+ */
3018
+ pendingStatus = [];
3019
+ uninstallSignals = null;
3020
+ /**
3021
+ * Whether the room/transport listeners are currently attached (task 008). The
3022
+ * subscription lifecycle tracks the connection lifecycle: attached on construct
3023
+ * and on every `connect()`, detached on the terminal paths (`disconnect()`,
3024
+ * `failConnect()`) so a discarded/replaced agent stops reacting to room events
3025
+ * and the host can drop it (otherwise `RoomManager`'s broadcast retains it).
3026
+ */
3027
+ listenersAttached = false;
3028
+ /**
3029
+ * Drop provenance when a room is destroyed so a future id reuse is not mis-stamped
3030
+ * (§7). Room create/destroy/define no longer trigger a push — changes surface at
3031
+ * the next orchestrator poll (task 011).
3032
+ */
3033
+ onRoomDestroy = (roomId) => {
3034
+ this.snapshot.forgetRoom(roomId);
3035
+ };
3036
+ constructor(rivalis, options, internals = {}) {
3037
+ super();
3038
+ this.rivalis = rivalis;
3039
+ this.logger = rivalis.logging?.getLogger?.("fleet:agent") ?? NOOP_LOGGER;
3040
+ const snapshotOptions = {
3041
+ name: options.name,
3042
+ endpointUrl: options.endpointUrl,
3043
+ agentVersion: packageVersion()
3044
+ };
3045
+ if (options.labels !== void 0) {
3046
+ snapshotOptions.labels = options.labels;
3047
+ }
3048
+ if (options.capacity !== void 0) {
3049
+ snapshotOptions.capacity = options.capacity;
3050
+ }
3051
+ if (options.autoCreate !== void 0) {
3052
+ snapshotOptions.autoCreate = options.autoCreate;
3053
+ }
3054
+ this.snapshot = new Snapshot(rivalis, snapshotOptions, this.logger);
3055
+ this.url = options.url;
3056
+ this.key = options.key;
3057
+ this.autoCreate = options.autoCreate ?? true;
3058
+ this.maxRooms = options.capacity?.maxRooms ?? null;
3059
+ this.connectTimeoutMs = options.connectTimeoutMs;
3060
+ this.scheduler = internals.scheduler ?? defaultScheduler;
3061
+ this.random = internals.random ?? Math.random;
3062
+ this.backoffBaseMs = internals.backoff?.baseMs ?? DEFAULT_BACKOFF_BASE_MS;
3063
+ this.backoffCapMs = internals.backoff?.capMs ?? DEFAULT_BACKOFF_CAP_MS;
3064
+ this.awaitEmptyPollMs = internals.awaitEmptyPollMs ?? DEFAULT_AWAIT_EMPTY_POLL_MS;
3065
+ this.installSignalHandlers = internals.installSignalHandlers;
3066
+ this.client = (internals.createClient ?? defaultCreateClient)(this.url);
3067
+ this.attachListeners();
3068
+ }
3069
+ /** Lifecycle status (§8): `'connecting' | 'connected' | 'draining' | 'closed'`. */
3070
+ get status() {
3071
+ return this.lifecycle;
3072
+ }
3073
+ /** Stable per-process id (§6), constant across reconnects. */
3074
+ get processUid() {
3075
+ return this.snapshot.processUid;
3076
+ }
3077
+ /**
3078
+ * Connect to the orchestrator; resolves on the first `fleet/hello`. Default:
3079
+ * retries forever (backoff per §7) — the promise stays pending while the
3080
+ * orchestrator is unreachable. With `connectTimeoutMs` set, rejects after the
3081
+ * deadline and transitions to `'closed'` with no background retry loop (§8).
3082
+ */
3083
+ connect() {
3084
+ if (this.lifecycle === "connected" || this.lifecycle === "draining") {
3085
+ return Promise.resolve();
3086
+ }
3087
+ if (this.connectResolve !== null) {
3088
+ return new Promise((resolve, reject) => {
3089
+ const prevResolve = this.connectResolve;
3090
+ const prevReject = this.connectReject;
3091
+ this.connectResolve = () => {
3092
+ prevResolve();
3093
+ resolve();
3094
+ };
3095
+ this.connectReject = (e) => {
3096
+ prevReject(e);
3097
+ reject(e);
3098
+ };
3099
+ });
3100
+ }
3101
+ this.closed = false;
3102
+ this.intentionalClose = false;
3103
+ this.lifecycle = "connecting";
3104
+ this.reconnectAttempt = 0;
3105
+ this.attachListeners();
3106
+ return new Promise((resolve, reject) => {
3107
+ this.connectResolve = resolve;
3108
+ this.connectReject = reject;
3109
+ if (this.connectTimeoutMs !== void 0) {
3110
+ this.connectDeadline = this.scheduler.setTimeout(
3111
+ () => this.failConnect(new Error("fleet:agent connect timeout exceeded")),
3112
+ this.connectTimeoutMs
3113
+ );
3114
+ }
3115
+ this.openConnection();
3116
+ });
3117
+ }
3118
+ /**
3119
+ * Mark this instance draining (§7, task 011): flips the agent-owned status
3120
+ * immediately (so the next `fleet/state` reply carries it) and resolves only when
3121
+ * a subsequent `fleet/poll` echoes `status: 'draining'` — the orchestrator's
3122
+ * acknowledged confirmation that it recorded the flip. No unsolicited frame.
3123
+ */
3124
+ drain() {
3125
+ return this.requestStatus("draining");
3126
+ }
3127
+ /** Reverse of `drain()` — restore the instance to `active`; resolves on the poll echo (§7). */
3128
+ undrain() {
3129
+ return this.requestStatus("active");
3130
+ }
3131
+ /** Resolve once every local room is empty (zero connections), or reject on `timeoutMs` (§8). */
3132
+ awaitEmpty({ timeoutMs } = {}) {
3133
+ const empty = () => {
3134
+ for (const id of this.rivalis.rooms.keys()) {
3135
+ const room = this.rivalis.rooms.get(id);
3136
+ if (room !== null && room.actorCount > 0) {
3137
+ return false;
3138
+ }
3139
+ }
3140
+ return true;
3141
+ };
3142
+ if (empty()) {
3143
+ return Promise.resolve();
3144
+ }
3145
+ return new Promise((resolve, reject) => {
3146
+ let poll = null;
3147
+ let deadline = null;
3148
+ const cleanup = () => {
3149
+ if (poll !== null) {
3150
+ this.scheduler.clearInterval(poll);
3151
+ poll = null;
3152
+ }
3153
+ if (deadline !== null) {
3154
+ this.scheduler.clearTimeout(deadline);
3155
+ deadline = null;
3156
+ }
3157
+ };
3158
+ poll = this.scheduler.setInterval(() => {
3159
+ if (empty()) {
3160
+ cleanup();
3161
+ resolve();
3162
+ }
3163
+ }, this.awaitEmptyPollMs);
3164
+ if (timeoutMs !== void 0) {
3165
+ deadline = this.scheduler.setTimeout(() => {
3166
+ cleanup();
3167
+ reject(new Error("fleet:agent awaitEmpty timeout exceeded"));
3168
+ }, timeoutMs);
3169
+ }
3170
+ });
3171
+ }
3172
+ /** Detach cleanly: stop all timers, close the transport, no further reconnects (§8). */
3173
+ async disconnect() {
3174
+ this.intentionalClose = true;
3175
+ this.closed = true;
3176
+ this.clearAllTimers();
3177
+ this.rejectPendingStatus(new Error("fleet:agent disconnected"));
3178
+ try {
3179
+ this.client.disconnect();
3180
+ } catch (error) {
3181
+ this.logger.warning(`fleet:agent transport disconnect error: ${describe(error)}`);
3182
+ }
3183
+ this.detachListeners();
3184
+ this.lifecycle = "closed";
3185
+ if (this.connectReject !== null) {
3186
+ const reject = this.connectReject;
3187
+ this.connectResolve = null;
3188
+ this.connectReject = null;
3189
+ reject(new Error("fleet:agent disconnected before connect resolved"));
3190
+ }
3191
+ this.emit("disconnect", Buffer.from("closed"));
3192
+ }
3193
+ /**
3194
+ * Wire `SIGTERM`/`SIGINT` to the graceful sequence (§8):
3195
+ * drain → awaitEmpty → disconnect → `rivalis.shutdown()`.
3196
+ */
3197
+ enableGracefulShutdown({ emptyTimeoutMs = 6e4 } = {}) {
3198
+ if (this.uninstallSignals !== null) {
3199
+ this.uninstallSignals();
3200
+ this.uninstallSignals = null;
3201
+ }
3202
+ const handler = () => {
3203
+ void this.gracefulShutdown(emptyTimeoutMs);
3204
+ };
3205
+ if (this.installSignalHandlers !== void 0) {
3206
+ this.uninstallSignals = this.installSignalHandlers(handler);
3207
+ return;
3208
+ }
3209
+ process.once("SIGTERM", handler);
3210
+ process.once("SIGINT", handler);
3211
+ this.uninstallSignals = () => {
3212
+ process.removeListener("SIGTERM", handler);
3213
+ process.removeListener("SIGINT", handler);
3214
+ };
3215
+ }
3216
+ async gracefulShutdown(emptyTimeoutMs) {
3217
+ try {
3218
+ await this.drain();
3219
+ } catch (error) {
3220
+ this.logger.warning(`fleet:agent graceful drain failed: ${describe(error)}`);
3221
+ }
3222
+ try {
3223
+ await this.awaitEmpty({ timeoutMs: emptyTimeoutMs });
3224
+ } catch (error) {
3225
+ this.logger.warning(`fleet:agent graceful awaitEmpty: ${describe(error)}`);
3226
+ }
3227
+ try {
3228
+ await this.disconnect();
3229
+ } catch (error) {
3230
+ this.logger.warning(`fleet:agent graceful disconnect failed: ${describe(error)}`);
3231
+ }
3232
+ try {
3233
+ await this.rivalis.shutdown();
3234
+ } catch (error) {
3235
+ this.logger.warning(`fleet:agent graceful rivalis.shutdown failed: ${describe(error)}`);
3236
+ }
3237
+ }
3238
+ // -----------------------------------------------------------------------
3239
+ // Transport wiring
3240
+ // -----------------------------------------------------------------------
3241
+ /**
3242
+ * Attach the room-provenance and transport listeners (task 008). Idempotent —
3243
+ * re-`connect()` after a `disconnect()` calls this again but it no-ops while
3244
+ * already attached, so listeners are never doubled.
3245
+ */
3246
+ attachListeners() {
3247
+ if (this.listenersAttached) {
3248
+ return;
3249
+ }
3250
+ this.wireClient();
3251
+ this.subscribeRooms();
3252
+ this.listenersAttached = true;
3253
+ }
3254
+ /**
3255
+ * Detach every listener on the terminal paths (task 008): the rooms broadcast
3256
+ * stops retaining this agent (no more `forgetRoom` on room destroy) and the
3257
+ * transport handlers are removed. Without this a discarded agent leaks — the
3258
+ * `RoomManager` broadcast keeps it alive for the host process's lifetime.
3259
+ */
3260
+ detachListeners() {
3261
+ if (!this.listenersAttached) {
3262
+ return;
3263
+ }
3264
+ this.unsubscribeRooms();
3265
+ try {
3266
+ this.client.removeAllListeners();
3267
+ } catch (error) {
3268
+ this.logger.warning(`fleet:agent transport removeAllListeners error: ${describe(error)}`);
3269
+ }
3270
+ this.listenersAttached = false;
3271
+ }
3272
+ wireClient() {
3273
+ this.client.on("client:connect", () => this.guard("transport open", () => this.onTransportOpen()));
3274
+ this.client.on("client:disconnect", (reason) => this.guard("transport close", () => this.onTransportClose(reason)));
3275
+ this.client.on("client:error", (error) => this.guard("transport error", () => this.onTransportError(error)));
3276
+ this.client.on(Topics.hello, (payload) => this.guard("hello", () => this.onHello(payload)));
3277
+ this.client.on(Topics.poll, (payload) => this.guard("poll", () => this.onPoll(payload)));
3278
+ this.client.on(Topics.cmd, (payload) => this.guard("cmd", () => this.onCmd(payload)));
3279
+ }
3280
+ subscribeRooms() {
3281
+ this.rivalis.rooms.on("destroy", this.onRoomDestroy);
3282
+ }
3283
+ unsubscribeRooms() {
3284
+ this.rivalis.rooms.off("destroy", this.onRoomDestroy);
3285
+ }
3286
+ openConnection() {
3287
+ if (this.closed) {
3288
+ return;
3289
+ }
3290
+ try {
3291
+ this.client.connect(this.key);
3292
+ } catch (error) {
3293
+ this.logger.warning(`fleet:agent connect attempt threw: ${describe(error)}`);
3294
+ this.scheduleReconnect();
3295
+ }
3296
+ }
3297
+ onTransportOpen() {
3298
+ this.logger.debug?.("fleet:agent transport open \u2014 awaiting fleet/hello");
3299
+ }
3300
+ onTransportClose(reason) {
3301
+ if (this.closed || this.intentionalClose) {
3302
+ return;
3303
+ }
3304
+ this.rejectPendingStatus(new Error("fleet:agent connection lost"));
3305
+ this.lifecycle = "connecting";
3306
+ this.emit("disconnect", reason);
3307
+ this.scheduleReconnect();
3308
+ }
3309
+ onTransportError(error) {
3310
+ this.logger.warning(`fleet:agent transport error: ${describe(error)}`);
3311
+ this.emit("error", error);
3312
+ }
3313
+ scheduleReconnect() {
3314
+ if (this.closed || this.intentionalClose || this.reconnectTimer !== null) {
3315
+ return;
3316
+ }
3317
+ const delay = this.backoffDelay();
3318
+ this.reconnectAttempt += 1;
3319
+ this.reconnectTimer = this.scheduler.setTimeout(() => {
3320
+ this.reconnectTimer = null;
3321
+ this.openConnection();
3322
+ }, delay);
3323
+ }
3324
+ /** Full-jitter exponential backoff: random in `[0, min(cap, base·2^attempt)]` (§7). */
3325
+ backoffDelay() {
3326
+ const ceiling = Math.min(this.backoffCapMs, this.backoffBaseMs * Math.pow(2, this.reconnectAttempt));
3327
+ return Math.floor(this.random() * ceiling);
3328
+ }
3329
+ // -----------------------------------------------------------------------
3330
+ // Protocol handlers (orch → agent)
3331
+ // -----------------------------------------------------------------------
3332
+ onHello(raw) {
3333
+ let hello;
3334
+ try {
3335
+ hello = decodeFrame(Topics.hello, toBytes(raw));
3336
+ } catch (error) {
3337
+ if (error instanceof WireVersionError) {
3338
+ this.logger.error(error.message);
3339
+ this.emit("error", error);
3340
+ this.failConnect(error);
3341
+ return;
3342
+ }
3343
+ this.logger.warning(`fleet:agent failed to decode fleet/hello: ${describe(error)}`);
3344
+ return;
3345
+ }
3346
+ if (hello.protocolVersion !== PROTOCOL_VERSION) {
3347
+ const error = new Error(
3348
+ `fleet protocol major mismatch: orchestrator=${hello.protocolVersion}, agent=${PROTOCOL_VERSION} \u2014 upgrade so both speak the same major (\xA77)`
3349
+ );
3350
+ this.logger.error(error.message);
3351
+ this.emit("error", error);
3352
+ this.failConnect(error);
3353
+ return;
3354
+ }
3355
+ this.instanceId = hello.instanceId;
3356
+ this.reconnectAttempt = 0;
3357
+ this.clearReconnect();
3358
+ this.clearConnectDeadline();
3359
+ this.snapshot.resetConnection();
3360
+ this.lifecycle = this.snapshot.status === "draining" ? "draining" : "connected";
3361
+ if (this.connectResolve !== null) {
3362
+ const resolve = this.connectResolve;
3363
+ this.connectResolve = null;
3364
+ this.connectReject = null;
3365
+ resolve();
3366
+ }
3367
+ this.emit("connect", { instanceId: this.instanceId, processUid: this.snapshot.processUid });
3368
+ }
3369
+ /**
3370
+ * Answer an orchestrator `fleet/poll` with a `fleet/state` reply (§7, task 011):
3371
+ * full snapshot when our hash differs from the poll's `knownHash`, hash-only
3372
+ * otherwise. A poll echoing a pending `drain()`/`undrain()` target status also
3373
+ * resolves that promise (the acknowledged confirmation, no unsolicited frame).
3374
+ */
3375
+ onPoll(raw) {
3376
+ const poll = this.decodeInbound(Topics.poll, raw);
3377
+ if (poll === null) {
3378
+ return;
3379
+ }
3380
+ this.resolveStatusOnEcho(poll.status);
3381
+ if (this.client.connected) {
3382
+ this.sendState(this.snapshot.pollReply(poll.reqId, poll.knownHash));
3383
+ }
3384
+ }
3385
+ onCmd(raw) {
3386
+ const cmd = this.decodeInbound(Topics.cmd, raw);
3387
+ if (cmd === null) {
3388
+ return;
3389
+ }
3390
+ this.emit("command", cmd);
3391
+ switch (cmd.op) {
3392
+ case "create":
3393
+ return this.execCreate(cmd);
3394
+ case "destroy":
3395
+ return this.execDestroy(cmd);
3396
+ case "drain":
3397
+ return this.execStatusCmd(cmd, "draining");
3398
+ case "undrain":
3399
+ return this.execStatusCmd(cmd, "active");
3400
+ default:
3401
+ this.sendAck({ cmdId: cmd.cmdId, ok: false, error: `unknown op: ${String(cmd.op)}` });
3402
+ }
3403
+ }
3404
+ execCreate(cmd) {
3405
+ if (!this.autoCreate) {
3406
+ this.sendAck({ cmdId: cmd.cmdId, ok: false, error: "autoCreate is disabled on this instance" });
3407
+ return;
3408
+ }
3409
+ if (typeof cmd.roomType !== "string") {
3410
+ this.sendAck({ cmdId: cmd.cmdId, ok: false, error: "create requires roomType" });
3411
+ return;
3412
+ }
3413
+ if (this.maxRooms !== null && this.rivalis.rooms.count >= this.maxRooms) {
3414
+ this.sendAck({ cmdId: cmd.cmdId, ok: false, error: `capacity exhausted: maxRooms=${this.maxRooms}` });
3415
+ return;
3416
+ }
3417
+ if (typeof cmd.roomId === "string" && this.rivalis.rooms.get(cmd.roomId) !== null) {
3418
+ this.sendAck({ cmdId: cmd.cmdId, ok: false, exists: true, error: "room id already exists" });
3419
+ return;
3420
+ }
3421
+ try {
3422
+ const room = this.rivalis.rooms.create(cmd.roomType, cmd.roomId ?? null);
3423
+ this.snapshot.markFleetOrigin(room.id);
3424
+ this.sendAck({ cmdId: cmd.cmdId, ok: true, room: { id: room.id, type: room.type } });
3425
+ } catch (error) {
3426
+ this.sendAck({ cmdId: cmd.cmdId, ok: false, error: describe(error) });
3427
+ }
3428
+ }
3429
+ execDestroy(cmd) {
3430
+ if (typeof cmd.roomId !== "string") {
3431
+ this.sendAck({ cmdId: cmd.cmdId, ok: false, error: "destroy requires roomId" });
3432
+ return;
3433
+ }
3434
+ if (this.rivalis.rooms.get(cmd.roomId) === null) {
3435
+ this.sendAck({ cmdId: cmd.cmdId, ok: true, alreadyGone: true });
3436
+ return;
3437
+ }
3438
+ try {
3439
+ this.rivalis.rooms.destroy(cmd.roomId);
3440
+ this.snapshot.forgetRoom(cmd.roomId);
3441
+ this.sendAck({ cmdId: cmd.cmdId, ok: true });
3442
+ } catch (error) {
3443
+ this.sendAck({ cmdId: cmd.cmdId, ok: false, error: describe(error) });
3444
+ }
3445
+ }
3446
+ execStatusCmd(cmd, status) {
3447
+ this.snapshot.setStatus(status);
3448
+ this.lifecycle = status === "draining" ? "draining" : "connected";
3449
+ this.sendAck({ cmdId: cmd.cmdId, ok: true });
3450
+ }
3451
+ // -----------------------------------------------------------------------
3452
+ // Outbound (agent → orch) — replies only (task 011)
3453
+ // -----------------------------------------------------------------------
3454
+ requestStatus(target) {
3455
+ this.snapshot.setStatus(target);
3456
+ this.lifecycle = target === "draining" ? "draining" : "connected";
3457
+ return new Promise((resolve, reject) => {
3458
+ this.pendingStatus.push({ target, resolve, reject });
3459
+ });
3460
+ }
3461
+ /** Resolve every pending drain()/undrain() whose target matches the poll-echoed status. */
3462
+ resolveStatusOnEcho(echoed) {
3463
+ if (this.pendingStatus.length === 0) {
3464
+ return;
3465
+ }
3466
+ const remaining = [];
3467
+ for (const pending of this.pendingStatus) {
3468
+ if (pending.target === echoed) {
3469
+ pending.resolve();
3470
+ } else {
3471
+ remaining.push(pending);
3472
+ }
3473
+ }
3474
+ this.pendingStatus = remaining;
3475
+ }
3476
+ sendState(frame) {
3477
+ this.send(Topics.state, frame.payload);
3478
+ }
3479
+ sendAck(ack) {
3480
+ this.send(Topics.ack, ack);
3481
+ }
3482
+ send(topic, payload) {
3483
+ try {
3484
+ this.client.send(topic, encodeFrame(topic, payload));
3485
+ } catch (error) {
3486
+ this.logger.warning(`fleet:agent send failed topic=${topic}: ${describe(error)}`);
3487
+ }
3488
+ }
3489
+ // -----------------------------------------------------------------------
3490
+ // Teardown helpers
3491
+ // -----------------------------------------------------------------------
3492
+ /** Fatal connect failure (timeout or protocol mismatch): reject, close, stop retrying (§8). */
3493
+ failConnect(error) {
3494
+ this.closed = true;
3495
+ this.intentionalClose = true;
3496
+ this.clearAllTimers();
3497
+ this.rejectPendingStatus(error);
3498
+ try {
3499
+ this.client.disconnect();
3500
+ } catch (disconnectError) {
3501
+ this.logger.warning(`fleet:agent disconnect during failConnect: ${describe(disconnectError)}`);
3502
+ }
3503
+ this.detachListeners();
3504
+ this.lifecycle = "closed";
3505
+ if (this.connectReject !== null) {
3506
+ const reject = this.connectReject;
3507
+ this.connectResolve = null;
3508
+ this.connectReject = null;
3509
+ reject(error);
3510
+ }
3511
+ }
3512
+ rejectPendingStatus(error) {
3513
+ const pending = this.pendingStatus;
3514
+ this.pendingStatus = [];
3515
+ for (const entry of pending) {
3516
+ entry.reject(error);
3517
+ }
3518
+ }
3519
+ clearReconnect() {
3520
+ if (this.reconnectTimer !== null) {
3521
+ this.scheduler.clearTimeout(this.reconnectTimer);
3522
+ this.reconnectTimer = null;
3523
+ }
3524
+ }
3525
+ clearConnectDeadline() {
3526
+ if (this.connectDeadline !== null) {
3527
+ this.scheduler.clearTimeout(this.connectDeadline);
3528
+ this.connectDeadline = null;
3529
+ }
3530
+ }
3531
+ clearAllTimers() {
3532
+ this.clearReconnect();
3533
+ this.clearConnectDeadline();
3534
+ if (this.uninstallSignals !== null) {
3535
+ this.uninstallSignals();
3536
+ this.uninstallSignals = null;
3537
+ }
3538
+ }
3539
+ /** Run a transport/timer callback, swallowing+logging any throw (§8 host-safety contract). */
3540
+ guard(label, fn) {
3541
+ try {
3542
+ fn();
3543
+ } catch (error) {
3544
+ this.logger.error(`fleet:agent ${label} handler error: ${describe(error)}`);
3545
+ this.emit("error", error instanceof Error ? error : new Error(describe(error)));
3546
+ }
3547
+ }
3548
+ /**
3549
+ * Decode an inbound binary frame for a non-hello topic (§7, task 005). Logs +
3550
+ * returns `null` on any failure — never throws into the host (§8). A
3551
+ * protocol-incompatible frame is logged as a version mismatch; a
3552
+ * malformed/truncated frame is logged and dropped. (`fleet/hello` handles a
3553
+ * version mismatch itself — a loud connect failure — so it does not use this.)
3554
+ */
3555
+ decodeInbound(topic, raw) {
3556
+ try {
3557
+ return decodeFrame(topic, toBytes(raw));
3558
+ } catch (error) {
3559
+ if (error instanceof WireVersionError) {
3560
+ this.logger.warning(`fleet:agent dropped protocol-incompatible ${topic} frame (peer major=${error.theirVersion}, agent=${PROTOCOL_VERSION})`);
3561
+ } else {
3562
+ this.logger.warning(`fleet:agent failed to decode ${topic}: ${describe(error)}`);
3563
+ }
3564
+ return null;
3565
+ }
3566
+ }
3567
+ };
/**
 * Normalise a raw transport payload to bytes for `decodeFrame`.
 *
 * - `Uint8Array` (including Node `Buffer`) is returned as-is.
 * - A string is encoded as UTF-8.
 * - An `ArrayBuffer`, or any other view over one (`DataView`, other typed
 *   arrays), is wrapped without copying. Previously these became an empty
 *   array, which discarded the frame's bytes.
 * - Anything else yields an empty `Uint8Array`.
 *
 * @param {*} raw payload as delivered by the transport.
 * @returns {Uint8Array}
 */
function toBytes(raw) {
  if (raw instanceof Uint8Array) {
    return raw;
  }
  if (typeof raw === "string") {
    return Buffer.from(raw, "utf-8");
  }
  if (raw instanceof ArrayBuffer) {
    return new Uint8Array(raw);
  }
  if (ArrayBuffer.isView(raw)) {
    // Respect the view's window so bytes outside it are not exposed.
    return new Uint8Array(raw.buffer, raw.byteOffset, raw.byteLength);
  }
  return new Uint8Array(0);
}
3577
+ export {
3578
+ FleetAgent,
3579
+ FleetError,
3580
+ Orchestrator,
3581
+ PROTOCOL_VERSION
3582
+ };