@rivalis/fleet 8.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/main.d.ts ADDED
@@ -0,0 +1,592 @@
1
+ import { Broadcast } from '@toolcase/base';
2
+ import { Logger } from '@toolcase/logging';
3
+ import { Rivalis, Client } from '@rivalis/core';
4
+ import { EndpointError } from '@toolcase/node';
5
+
6
+ /**
7
+ * Orchestrator configuration normalization and defaults (§9). Consumed by the
8
+ * Orchestrator (§9) and the CLI (§12).
9
+ *
10
+ * Scope note: this resolves option shapes and defaults and normalizes the
11
+ * `string | string[]` key surface for rotation (§13). The §13 *security* checks
12
+ * — key-strength enforcement, agent/admin audience separation, the production
13
+ * refuse-to-start rules — live in {@link enforceSecurityPolicy} below, called by
14
+ * the Orchestrator constructor: structural normalization and security hardening
15
+ * stay separate, but both have their home in this module.
16
+ */
17
+
18
+ /**
19
+ * Public orchestrator option surface (§9). Lives next to its primary consumer
20
+ * (config resolution) rather than in the pure data model — it is consumer
21
+ * configuration, not a read-model type. Re-exported from `main.ts`.
22
+ */
23
+ interface OrchestratorOptions {
24
+ /** Bind address (default 0.0.0.0). */
25
+ host?: string;
26
+ port: number;
27
+ /** Agent auth key(s) — agents connect with any listed key. */
28
+ agentKey: string | string[];
29
+ /** REST admin key(s) — required when `api: true`. */
30
+ adminKey?: string | string[];
31
+ /** Serve REST /v1 (default true). */
32
+ api?: boolean;
33
+ heartbeatMs?: number;
34
+ commandTimeoutMs?: number;
35
+ /** CORS allow-origins, or false (default) for same-origin only. */
36
+ cors?: false | {
37
+ origins: string[];
38
+ };
39
+ /** Allow `?key=` auth on /v1/events for browser EventSource (§10, §13). */
40
+ sseQueryAuth?: boolean;
41
+ /**
42
+ * Trust `X-Forwarded-For` from a front proxy (default `false`, §13). When a
43
+ * TLS-terminating reverse proxy / service mesh sits in front, enable this so the
44
+ * per-IP failed-auth throttle and audit logs key on the real client IP (`req.ip`,
45
+ * resolved from the forwarded header) instead of collapsing every client into the
46
+ * proxy's single socket address. Leave off for direct exposure — a spoofable
47
+ * header must not be trusted from an untrusted network.
48
+ */
49
+ trustProxy?: boolean;
50
+ }
51
+
52
+ /**
53
+ * Internal Rivalis Room (type `@rivalis/fleet`) that hosts connected agents as
54
+ * actors — the orchestrator dogfoods Rivalis for agent transport (§7). Each
55
+ * agent socket is an actor in this single room; the room binds the agent → orch
56
+ * reply topics (`fleet/state`, `fleet/ack`) and forwards every frame, join, and
57
+ * leave to the {@link FleetController} (the Orchestrator).
58
+ *
59
+ * `unknownTopicPolicy = 'kick'` (task 011): under strict orchestrator-driven
60
+ * request/reply, every agent frame must be a reply to an outstanding request — an
61
+ * unbound topic is an unsolicited frame and the agent is kicked. This supersedes
62
+ * the pre-011 `'drop'` forward-compat stance (a major bump now guards
63
+ * compatibility, §7). A frame on a *bound* reply topic still flows to the
64
+ * controller, which checks the correlation id and kicks if it matches no
65
+ * outstanding request.
66
+ *
67
+ * The class is produced by a **factory** rather than declared statically so that
68
+ * `@rivalis/core` is required lazily (the orchestrator loads core inside
69
+ * `listen()`; importing `@rivalis/fleet` must not eagerly drag core's ESM build
70
+ * in — mirrors `FleetAgent`'s lazy `loadCore`). The factory closes over the
71
+ * controller, so the room needs no per-instance wiring after construction.
72
+ */
73
+ /**
74
+ * A connected agent as seen by the control plane — an abstraction over the
75
+ * core `Actor`. The orchestrator never touches `Actor`/`Room` directly; it sends
76
+ * topic frames and closes wedged sockets through this seam, which also makes the
77
+ * control plane unit-testable without a live WebSocket (§15).
78
+ */
79
+ interface AgentLink {
80
+ /** Connection-scoped instance id — the actor id assigned by core (§6). */
81
+ readonly instanceId: string;
82
+ /** Send a topic frame to this agent; payloads are binary-encoded (§7, task 005). */
83
+ send(topic: string, payload: unknown): void;
84
+ /** Kick the agent's socket (used to evict a wedged-but-connected instance, §7). */
85
+ close(): void;
86
+ }
87
+ /** What the FleetRoom forwards into the Orchestrator. Implemented by the Orchestrator. */
88
+ interface FleetController {
89
+ /** An agent joined the room; `link` is the seam used to send it frames or close its socket. */ handleAgentJoin(link: AgentLink): void;
90
+ /** The agent identified by `instanceId` left the room (its socket closed). */ handleAgentLeave(instanceId: string): void;
91
+ /** Inbound frame from agent `instanceId` on `topic`; `payload` is the still-encoded binary frame, decoded by the controller. */ handleAgentMessage(instanceId: string, topic: string, payload: Uint8Array): void;
92
+ }
93
+
94
+ /**
95
+ * Pure fleet data model (§6) — the read-model rows surfaced over the library and
96
+ * REST APIs and the placement-request shape. No I/O, no wire-format concerns:
97
+ * the agent builds `fleet/state` payloads (see `../wire`) which the orchestrator
98
+ * validates into these `InstanceInfo`/`RoomInfo` rows.
99
+ */
100
+ /** Lifecycle status of an instance (§6). The agent owns this value (§7). */
101
+ type InstanceStatus = 'active' | 'draining';
102
+ /** Resolved capacity declaration; `null` means "unlimited" for that dimension. */
103
+ interface Capacity {
104
+ maxConnections: number | null;
105
+ maxRooms: number | null;
106
+ }
107
+ type PlacementStrategy = 'least-loaded' | 'most-loaded' | 'random';
108
+ interface PlacementRequest {
109
+ /** Pin to a connection-scoped instance id (see §9 pinning caveat). */
110
+ instanceId?: string;
111
+ /** Pin to an instance by its stable process id. */
112
+ processUid?: string;
113
+ strategy?: PlacementStrategy;
114
+ /** Only instances matching all listed labels are candidates. */
115
+ labels?: Record<string, string>;
116
+ /** Pinning to a draining instance requires `force: true`. */
117
+ force?: boolean;
118
+ }
119
+ /** Read-model row for one instance (§6); the orchestrator validates the agent's `fleet/state` payloads into these rows. */ interface InstanceInfo {
120
+ /** Instance id — presumably the connection-scoped id also exposed as `AgentLink.instanceId`; confirm against §6. */ id: string;
121
+ /** Instance name — presumably the agent's human-readable `name` option; confirm. */ name: string;
122
+ /** Stable per-process id; the key `PlacementRequest.processUid` pins against. */ processUid: string;
123
+ /** Endpoint URL — presumably the public URL game clients use to reach this instance (agent `endpointUrl` option); confirm. */ endpointUrl: string;
124
+ /** Instance labels; placement only considers instances matching all labels listed in a `PlacementRequest`. */ labels: Record<string, string>;
125
+ /** Room type names reported for this instance. NOTE(review): whether these are defined or currently live types is not visible here. */ roomTypes: string[];
126
+ /** Rooms reported for this instance. */ rooms: RoomInfo[];
127
+ /** Connection count — presumably the total across this instance's rooms; confirm. */ connections: number;
128
+ /** Resolved capacity; a `null` dimension means unlimited. */ capacity: Capacity;
129
+ /** Presumably mirrors the agent's `autoCreate` option (orchestrator-initiated room creation allowed); confirm. */ autoCreate: boolean;
130
+ /** Agent-owned lifecycle status: `'active'` or `'draining'`. */ status: InstanceStatus;
131
+ /** Time of the last sync, taken from the orchestrator's wall clock (default `Date.now`, so presumably epoch milliseconds). */ lastSyncAt: number;
132
+ /** Agent version string. NOTE(review): format and source are not visible here. */ agentVersion: string;
133
+ /** Wire-protocol major for this instance — presumably compared against `PROTOCOL_VERSION`; confirm. */ protocolVersion: number;
134
+ }
135
+ interface RoomInfo {
136
+ id: string;
137
+ type: string;
138
+ connections: number;
139
+ instanceId: string;
140
+ endpointUrl: string;
141
+ local: boolean;
142
+ }
143
+ interface FleetStats {
144
+ instances: number;
145
+ rooms: number;
146
+ connections: number;
147
+ roomTypes: string[];
148
+ stateHash: string;
149
+ }
150
+ /** Stable, machine-readable error codes surfaced over REST (§10) and control APIs. */
151
+ type FleetErrorCode = 'VALIDATION' | 'UNAUTHORIZED' | 'INSTANCE_NOT_FOUND' | 'ROOM_NOT_FOUND' | 'NO_CANDIDATE' | 'ROOM_EXISTS' | 'INSTANCE_DRAINING' | 'PAYLOAD_TOO_LARGE' | 'INSTANCE_BUSY' | 'AUTH_THROTTLED' | 'SSE_LIMIT' | 'COMMAND_FAILED' | 'INSTANCE_DISCONNECTED' | 'COMMAND_TIMEOUT';
152
+ type FleetEventType = 'instance:join' | 'instance:leave' | 'instance:stale' | 'room:create' | 'room:destroy' | 'sync';
153
+ interface FleetEvent {
154
+ type: FleetEventType;
155
+ data?: unknown;
156
+ }
157
+
158
+ /**
159
+ * The fleet's coded error hierarchy (task 004) — `FleetError extends
160
+ * EndpointError`, with the HTTP status carried **on the error class** instead of
161
+ * a side table in the REST router. The node-service blueprint puts domain errors
162
+ * here in `src/domain/errors.ts` and maps them with `errorMeta(e)`, so the router
163
+ * no longer owns a parallel `code → status` table that could drift from the throw
164
+ * sites (spec §10 codes are unchanged; only the mapping *mechanism* moves).
165
+ *
166
+ * ──────────────────────────────────────────────────────────────────────────────
167
+ * `@toolcase/node` adoption (task 006)
168
+ *
169
+ * `EndpointError`, `errorMeta`, and `isLibError` now come from **`@toolcase/node`**
170
+ * (the local port that stood in until task 006 — while `@toolcase/node` was not
171
+ * loadable here — is deleted). The package is listed in `tsup.config.ts`'s
172
+ * `external`, so the one `EndpointError` class identity is shared across every
173
+ * bundle: a {@link FleetError} thrown in the `FleetState`/`Orchestrator` bundle is
174
+ * recognized by the router bundle's `errorMeta` via `instanceof EndpointError`
175
+ * (the *base* is externalized; `FleetError` itself is still bundled per-entry, but
176
+ * the mapping checks the shared base, never `FleetError`). That is exactly the
177
+ * cross-bundle correctness the earlier structural port was working around.
178
+ * ──────────────────────────────────────────────────────────────────────────────
179
+ */
180
+
181
+ /**
182
+ * Coded error surfaced by placement, the command engine, and REST validation
183
+ * (§9/§10). Extends `@toolcase/node`'s {@link EndpointError}, resolving its HTTP
184
+ * `statusCode` from the §10 table at construction — so `errorMeta` maps it (via
185
+ * `instanceof EndpointError`) without the router knowing the table. The public
186
+ * `code` contract (a {@link FleetErrorCode}) is the documented REST envelope `cause`
187
+ * (spec §10) and is unchanged.
188
+ */
189
+ declare class FleetError extends EndpointError {
190
+ readonly code: FleetErrorCode;
191
+ constructor(code: FleetErrorCode, message: string);
192
+ }
193
+
194
+ /**
195
+ * Shared default timer scheduler (task 002). One definition of the `unref`-ing
196
+ * `setTimeout`/`setInterval` wrapper that the orchestrator and the agent each
197
+ * carried. `unref` so a lingering timer never pins the process.
198
+ *
199
+ * Typed as a structural superset — timeouts *and* intervals — so the single
200
+ * value satisfies both the orchestrator's timeouts-only `OrchestratorScheduler`
201
+ * seam and the agent's `AgentScheduler` (which also drives heartbeat/poll
202
+ * intervals). The injectable scheduler seams in each consumer are unchanged;
203
+ * tests still pass their own fakes.
204
+ */
205
+ /**
206
+ * Timeouts-only timer seam shared by the orchestrator and every collaborator it
207
+ * injects ({@link CommandEngine}, {@link Poller}). One definition so the
208
+ * decomposed pieces (and their unit tests) take the same fake clock the
209
+ * Orchestrator does (§15).
210
+ */
211
+ interface TimerScheduler {
212
+ setTimeout(fn: () => void, ms: number): unknown;
213
+ clearTimeout(handle: unknown): void;
214
+ }
215
+
216
+ /**
217
+ * Wire-protocol constants (§7) — protocol versioning, the in-flight command cap,
218
+ * and the topic name table exchanged between agent and orchestrator. Per-topic
219
+ * JSON payload shapes live in `./payloads`.
220
+ */
221
+ /**
222
+ * Protocol MAJOR spoken by agent and orchestrator — a single integer (§7).
223
+ *
224
+ * Bumped 1 → 2 by task 005: the wire format changed from JSON to binary
225
+ * (`@toolcase/serializer`), a breaking change that required the major bump.
226
+ *
227
+ * Bumped 2 → 3 by task 011: the protocol was inverted to strict
228
+ * orchestrator-driven request/reply. Agent push topics (`fleet/sync`,
229
+ * `fleet/ping`, `fleet/resync`, `fleet/status`, `fleet/status-ack`) are gone;
230
+ * the orchestrator now polls (`fleet/poll`) and the agent replies (`fleet/state`).
231
+ * A v2 (push) peer and a v3 (poll) peer cannot interoperate — both halves must be
232
+ * upgraded in lockstep. The 2-byte version header on every frame
233
+ * (`wire/serializer`) is what makes the mismatch fail loudly at `fleet/hello`.
234
+ */
235
+ declare const PROTOCOL_VERSION = 3;
236
+
237
+ /**
238
+ * Embeddable fleet orchestrator (§9) — the dogfooded control plane (§7). After the
239
+ * task-008 decomposition this is the **facade** wiring its collaborators, each a
240
+ * separately unit-tested concern: {@link AgentAuthenticator} (agent-key match),
241
+ * {@link CommandEngine} (pending commands + cap + settle), {@link Poller}
242
+ * (orchestrator-driven polling + missed-reply stale/evict, task 011),
243
+ * {@link EventReconciler} (read-model diffs → events), {@link FleetControl}
244
+ * (create/destroy/drain), and `transport.ts` (control-plane bootstrap). The
245
+ * Orchestrator retains config resolution, the `FleetApi` facade, poll dispatch +
246
+ * per-agent outstanding-request enforcement, and lifecycle (`listen`/`shutdown`).
247
+ *
248
+ * `listen()` is the only method that touches core / the network; constructing an
249
+ * Orchestrator loads no core, so the control plane is exercised directly in unit
250
+ * tests against the {@link AgentLink} seams and an injectable scheduler (§15).
251
+ */
252
+
253
+ /** Minimal timer surface (timeouts only); tests inject a virtual-time fake (§15). */
254
+ type OrchestratorScheduler = TimerScheduler;
255
+ /** Test/advanced seams kept off the public option surface (§9). */
256
+ interface OrchestratorInternals {
257
+ scheduler?: OrchestratorScheduler;
258
+ /** Wall clock for `lastSyncAt`; default `Date.now`. */
259
+ now?: () => number;
260
+ logger?: Logger;
261
+ /** `NODE_ENV` override for the §13 production refuse-to-start rules; default sourced from `src/env.ts`. */
262
+ env?: string;
263
+ }
264
+ /** Read-model + control surface exposed as `orchestrator.fleet` (§9). */
265
+ interface FleetApi {
266
+ readonly stats: FleetStats;
267
+ readonly instances: InstanceInfo[];
268
+ readonly rooms: RoomInfo[];
269
+ getInstance(id: string): InstanceInfo | null;
270
+ getRoom(roomId: string): RoomInfo | null;
271
+ findRooms(filter?: {
272
+ type?: string;
273
+ instanceId?: string;
274
+ labels?: Record<string, string>;
275
+ }): RoomInfo[];
276
+ createRoom(request: {
277
+ type: string;
278
+ roomId?: string;
279
+ placement?: PlacementRequest;
280
+ }): Promise<RoomInfo>;
281
+ destroyRoom(roomId: string): Promise<void>;
282
+ drainInstance(instanceId: string): Promise<void>;
283
+ undrainInstance(instanceId: string): Promise<void>;
284
+ }
285
+ declare class Orchestrator extends Broadcast implements FleetController {
286
+ readonly fleet: FleetApi;
287
+ private readonly config;
288
+ private readonly state;
289
+ private readonly now;
290
+ private logger;
291
+ /** `fleet:http` logger; NOOP until `listen()` loads core's logging factory. */
292
+ private httpLogger;
293
+ /** Fastify-based REST /v1 surface over the same `node:http` server (§10, task 006). */
294
+ private readonly httpApi;
295
+ private readonly auth;
296
+ private readonly commands;
297
+ private readonly poller;
298
+ private readonly reconciler;
299
+ private readonly control;
300
+ /** Live agent links keyed by connection-scoped instance id. */
301
+ private readonly links;
302
+ private rivalis;
303
+ private httpServer;
304
+ private listening;
305
+ private transportAttached;
306
+ constructor(options: OrchestratorOptions, internals?: OrchestratorInternals);
307
+ /**
308
+ * Bridge every {@link FleetEventType} broadcast (§9) into one SSE listener as a
309
+ * {@link FleetEvent} `{ type, data }`; returns an unsubscribe (called on stream close, §10).
310
+ */
311
+ private subscribeFleetEvents;
312
+ /** True once HTTP is listening and the WS transport is attached (drives `/readyz`, task 010). */
313
+ get ready(): boolean;
314
+ /** Start the HTTP/WS server, attach the internal Rivalis room, begin accepting agents (§9). */
315
+ listen(): Promise<void>;
316
+ /** Gracefully stop: reject in-flight commands, destroy rooms, dispose transport, close HTTP (§9). */
317
+ shutdown(): Promise<void>;
318
+ /** @internal Agent joined: assign id, send `fleet/hello`, start polling (§7, task 011). */
319
+ handleAgentJoin(link: AgentLink): void;
320
+ /** @internal Agent socket closed: evict instantly, rejecting any in-flight commands (§7). */
321
+ handleAgentLeave(instanceId: string): void;
322
+ /**
323
+ * @internal Inbound agent frame (task 011). Every agent frame must be a reply to
324
+ * an outstanding orchestrator request — `fleet/state` to a `fleet/poll`,
325
+ * `fleet/ack` to a `fleet/cmd`. A well-formed frame whose correlation id matches
326
+ * no outstanding request (spontaneous, duplicate, or post-settle) is an
327
+ * unsolicited frame → kick. A malformed / version-incompatible frame is logged
328
+ * and dropped (the lockstep-mismatch path is evicted by missed polls, §7/§8).
329
+ */
330
+ handleAgentMessage(instanceId: string, topic: string, payload: Uint8Array | string): void;
331
+ /** Build and send a `fleet/poll`: knownHash drives dedup, status echoes for drain confirmation. */
332
+ private sendPoll;
333
+ /**
334
+ * Ingest a `fleet/state` poll reply (task 011). The reply must match the
335
+ * outstanding poll's `reqId` (consumed via the poller); an unmatched reply is
336
+ * unsolicited → kick. A full reply is bounds-checked (§13) and applied; a
337
+ * hash-only reply just refreshes liveness (the snapshot is unchanged).
338
+ */
339
+ private handleState;
340
+ private handleAck;
341
+ /**
342
+ * Kick an agent that broke the request/reply contract (task 011): tear it down
343
+ * (rejecting in-flight commands, removing it from the read model) and close the
344
+ * socket so it reconnects fresh. The log line names the cause and the instance —
345
+ * never the offending payload's contents (§13).
346
+ */
347
+ private kick;
348
+ private onStale;
349
+ private onEvict;
350
+ /**
351
+ * Remove an instance from every table, reject its in-flight commands immediately
352
+ * with `INSTANCE_DISCONNECTED` (§7), and reconcile (its rooms → `room:destroy`, `sync`).
353
+ */
354
+ private teardownInstance;
355
+ /**
356
+ * Decode a binary agent frame for `topic` (§7). Returns `null` on any failure —
357
+ * never throws into the host (§8): a protocol-incompatible frame (e.g. a legacy
358
+ * JSON agent against this v3 orchestrator) or a malformed/truncated one is logged
359
+ * and dropped, and the read model keeps its last good state.
360
+ */
361
+ private decode;
362
+ private emitEvent;
363
+ /**
364
+ * Run a timer- / transport- / core-dispatch-driven callback, swallowing and
365
+ * logging any throw so it never escapes into a raw `setTimeout` (an
366
+ * `uncaughtException` that would crash the whole control plane) or back into
367
+ * core's room dispatch (§14 failure modes). Mirrors the agent's host-safety
368
+ * `guard` (§8): the orchestrator is the single point of coordination, so one
369
+ * unhandled throw on a poll tick, a snapshot application, or a liveness deadline
370
+ * must degrade to a logged failure on one instance, never an orchestrator-wide
371
+ * outage. Never rethrows.
372
+ */
373
+ private guard;
374
+ }
375
+
376
+ /**
377
+ * Instance-side fleet client (§8). Attaches a `Rivalis` instance to an
378
+ * orchestrator over core's hardened `WSClient` (task 002, `ticketSource:
379
+ * 'protocol'` so the agent key never lands in a URL — §13): it reports the
380
+ * instance's rooms/connections and executes orchestrator-pushed room commands.
381
+ *
382
+ * Strict orchestrator-driven request/reply (task 011): the agent never pushes
383
+ * state spontaneously. The orchestrator polls (`fleet/poll`) on its own cadence
384
+ * and the agent answers with `fleet/state` — a full snapshot when its hash differs
385
+ * from the poll's `knownHash`, a hash-only reply otherwise. `drain()`/`undrain()`
386
+ * flip the agent-owned status locally and resolve when a subsequent poll echoes the
387
+ * target status (an acknowledged confirmation, no unsolicited frame).
388
+ *
389
+ * The load-bearing contract (§8): **never throws into the host process from
390
+ * network failures**. Every transport callback is wrapped, failures are logged
391
+ * via `rivalis.logging.getLogger('fleet:agent')`, and the agent reconnects with
392
+ * exponential backoff (0.5 s → 30 s cap, full jitter — §7). This is what forces
393
+ * the §4 `WSClient` hardening: the unhardened client crashes the host on the
394
+ * first `ECONNREFUSED`.
395
+ */
396
+
397
+ /**
398
+ * Public agent option surface (§8). Lives next to its sole consumer rather than
399
+ * in the pure data model — it is consumer configuration, not a read-model type.
400
+ * Re-exported from `main.ts`. Note: no `heartbeatMs` — the interval is assigned
401
+ * by the orchestrator in `fleet/hello` (single source of truth, §7).
402
+ */
403
+ interface FleetAgentOptions {
404
+ /** Orchestrator WS endpoint. */
405
+ url: string;
406
+ /** Agent key (sent via WS subprotocol, never query string — §13). */
407
+ key: string;
408
+ /** Public URL game clients use to connect to this instance. */
409
+ endpointUrl: string;
410
+ /** Human-readable instance name. */
411
+ name: string;
412
+ labels?: Record<string, string>;
413
+ capacity?: {
414
+ maxConnections?: number | null;
415
+ maxRooms?: number | null;
416
+ };
417
+ /** Allow orchestrator-initiated `rooms.create` (default true). */
418
+ autoCreate?: boolean;
419
+ /** Reject `connect()` after this deadline instead of retrying forever. */
420
+ connectTimeoutMs?: number;
421
+ }
422
+ /** Lifecycle status surfaced by `agent.status` (§8). Distinct from the snapshot's `active`/`draining`. */
423
+ type AgentLifecycleStatus = 'connecting' | 'connected' | 'draining' | 'closed';
424
+ /** Opaque timer handle — `unknown` so an injected fake scheduler can return anything. */
425
+ type TimerHandle = unknown;
426
+ /** Injectable timer surface so tests drive heartbeat/debounce/backoff deterministically. */
427
+ interface AgentScheduler {
428
+ setTimeout(fn: () => void, ms: number): TimerHandle;
429
+ clearTimeout(handle: TimerHandle): void;
430
+ setInterval(fn: () => void, ms: number): TimerHandle;
431
+ clearInterval(handle: TimerHandle): void;
432
+ }
433
+ /**
434
+ * Test/advanced seams kept off the public `FleetAgentOptions` surface (§8 keeps
435
+ * the documented constructor to `(rivalis, options)`). Mirrors the third-param
436
+ * convention the Snapshot builder uses for its logger.
437
+ */
438
+ interface AgentInternals {
439
+ createClient?: (url: string) => Client;
440
+ scheduler?: AgentScheduler;
441
+ backoff?: {
442
+ baseMs?: number;
443
+ capMs?: number;
444
+ };
445
+ random?: () => number;
446
+ awaitEmptyPollMs?: number;
447
+ /** Wire process-signal handlers for `enableGracefulShutdown`; returns an uninstaller. */
448
+ installSignalHandlers?: (handler: () => void) => () => void;
449
+ }
450
+ declare class FleetAgent extends Broadcast {
451
+ private readonly rivalis;
452
+ private readonly logger;
453
+ private readonly snapshot;
454
+ private readonly url;
455
+ private readonly key;
456
+ private readonly autoCreate;
457
+ private readonly maxRooms;
458
+ private readonly connectTimeoutMs;
459
+ private readonly client;
460
+ private readonly scheduler;
461
+ private readonly random;
462
+ private readonly backoffBaseMs;
463
+ private readonly backoffCapMs;
464
+ private readonly awaitEmptyPollMs;
465
+ private readonly installSignalHandlers;
466
+ private lifecycle;
467
+ private instanceId;
468
+ /** Set once `connect()`/reconnects should stop (intentional `disconnect()` or fatal error). */
469
+ private closed;
470
+ /** Distinguishes an operator-driven close from a transport drop that should reconnect. */
471
+ private intentionalClose;
472
+ private reconnectTimer;
473
+ private connectDeadline;
474
+ private reconnectAttempt;
475
+ private connectResolve;
476
+ private connectReject;
477
+ /**
478
+ * Pending `drain()` / `undrain()` promises (task 011): each waits for a
479
+ * `fleet/poll` echoing its target status — the orchestrator's acknowledged
480
+ * confirmation that it recorded the agent-owned status flip. No unsolicited frame.
481
+ */
482
+ private pendingStatus;
483
+ private uninstallSignals;
484
+ /**
485
+ * Whether the room/transport listeners are currently attached (task 008). The
486
+ * subscription lifecycle tracks the connection lifecycle: attached on construct
487
+ * and on every `connect()`, detached on the terminal paths (`disconnect()`,
488
+ * `failConnect()`) so a discarded/replaced agent stops reacting to room events
489
+ * and the host can drop it (otherwise `RoomManager`'s broadcast retains it).
490
+ */
491
+ private listenersAttached;
492
+ /**
493
+ * Drop provenance when a room is destroyed so a future id reuse is not mis-stamped
494
+ * (§7). Room create/destroy/define no longer trigger a push — changes surface at
495
+ * the next orchestrator poll (task 011).
496
+ */
497
+ private readonly onRoomDestroy;
498
+ constructor(rivalis: Rivalis, options: FleetAgentOptions, internals?: AgentInternals);
499
+ /** Lifecycle status (§8): `'connecting' | 'connected' | 'draining' | 'closed'`. */
500
+ get status(): AgentLifecycleStatus;
501
+ /** Stable per-process id (§6), constant across reconnects. */
502
+ get processUid(): string;
503
+ /**
504
+ * Connect to the orchestrator; resolves on the first `fleet/hello`. Default:
505
+ * retries forever (backoff per §7) — the promise stays pending while the
506
+ * orchestrator is unreachable. With `connectTimeoutMs` set, rejects after the
507
+ * deadline and transitions to `'closed'` with no background retry loop (§8).
508
+ */
509
+ connect(): Promise<void>;
510
+ /**
511
+ * Mark this instance draining (§7, task 011): flips the agent-owned status
512
+ * immediately (so the next `fleet/state` reply carries it) and resolves only when
513
+ * a subsequent `fleet/poll` echoes `status: 'draining'` — the orchestrator's
514
+ * acknowledged confirmation that it recorded the flip. No unsolicited frame.
515
+ */
516
+ drain(): Promise<void>;
517
+ /** Reverse of `drain()` — restore the instance to `active`; resolves on the poll echo (§7). */
518
+ undrain(): Promise<void>;
519
+ /** Resolve once every local room is empty (zero connections), or reject on `timeoutMs` (§8). */
520
+ awaitEmpty({ timeoutMs }?: {
521
+ timeoutMs?: number;
522
+ }): Promise<void>;
523
+ /** Detach cleanly: stop all timers, close the transport, no further reconnects (§8). */
524
+ disconnect(): Promise<void>;
525
+ /**
526
+ * Wire `SIGTERM`/`SIGINT` to the graceful sequence (§8):
527
+ * drain → awaitEmpty → disconnect → `rivalis.shutdown()`.
528
+ */
529
+ enableGracefulShutdown({ emptyTimeoutMs }?: {
530
+ emptyTimeoutMs?: number;
531
+ }): void;
532
+ private gracefulShutdown;
533
+ /**
534
+ * Attach the room-provenance and transport listeners (task 008). Idempotent —
535
+ * re-`connect()` after a `disconnect()` calls this again but it no-ops while
536
+ * already attached, so listeners are never doubled.
537
+ */
538
+ private attachListeners;
539
+ /**
540
+ * Detach every listener on the terminal paths (task 008): the rooms broadcast
541
+ * stops retaining this agent (no more `forgetRoom` on room destroy) and the
542
+ * transport handlers are removed. Without this a discarded agent leaks — the
543
+ * `RoomManager` broadcast keeps it alive for the host process's lifetime.
544
+ */
545
+ private detachListeners;
546
+ private wireClient;
547
+ private subscribeRooms;
548
+ private unsubscribeRooms;
549
+ private openConnection;
550
+ private onTransportOpen;
551
+ private onTransportClose;
552
+ private onTransportError;
553
+ private scheduleReconnect;
554
+ /** Full-jitter exponential backoff: random in `[0, min(cap, base·2^attempt)]` (§7). */
555
+ private backoffDelay;
556
+ private onHello;
557
+ /**
558
+ * Answer an orchestrator `fleet/poll` with a `fleet/state` reply (§7, task 011):
559
+ * full snapshot when our hash differs from the poll's `knownHash`, hash-only
560
+ * otherwise. A poll echoing a pending `drain()`/`undrain()` target status also
561
+ * resolves that promise (the acknowledged confirmation, no unsolicited frame).
562
+ */
563
+ private onPoll;
564
+ private onCmd;
565
+ private execCreate;
566
+ private execDestroy;
567
+ private execStatusCmd;
568
+ private requestStatus;
569
+ /** Resolve every pending drain()/undrain() whose target matches the poll-echoed status. */
570
+ private resolveStatusOnEcho;
571
+ private sendState;
572
+ private sendAck;
573
+ private send;
574
+ /** Fatal connect failure (timeout or protocol mismatch): reject, close, stop retrying (§8). */
575
+ private failConnect;
576
+ private rejectPendingStatus;
577
+ private clearReconnect;
578
+ private clearConnectDeadline;
579
+ private clearAllTimers;
580
+ /** Run a transport/timer callback, swallowing+logging any throw (§8 host-safety contract). */
581
+ private guard;
582
+ /**
583
+ * Decode an inbound binary frame for a non-hello topic (§7, task 005). Logs +
584
+ * returns `null` on any failure — never throws into the host (§8). A
585
+ * protocol-incompatible frame is logged as a version mismatch; a
586
+ * malformed/truncated frame is logged and dropped. (`fleet/hello` handles a
587
+ * version mismatch itself — a loud connect failure — so it does not use this.)
588
+ */
589
+ private decodeInbound;
590
+ }
591
+
592
+ export { FleetAgent, type FleetAgentOptions, type FleetApi, FleetError, type FleetErrorCode, type FleetEvent, type FleetEventType, type FleetStats, type InstanceInfo, Orchestrator, type OrchestratorOptions, PROTOCOL_VERSION, type PlacementRequest, type PlacementStrategy, type RoomInfo };