@rivalis/fleet 8.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +417 -0
- package/bin/rivalis-fleet.js +10 -0
- package/lib/AgentAuthenticator.js +56 -0
- package/lib/CommandEngine.js +258 -0
- package/lib/EventReconciler.js +90 -0
- package/lib/FleetAgent.js +1217 -0
- package/lib/FleetControl.js +139 -0
- package/lib/FleetState.js +865 -0
- package/lib/Orchestrator.js +2834 -0
- package/lib/Poller.js +113 -0
- package/lib/Snapshot.js +471 -0
- package/lib/canonical.js +82 -0
- package/lib/cli.js +3076 -0
- package/lib/domain.js +97 -0
- package/lib/env.js +99 -0
- package/lib/main.d.ts +592 -0
- package/lib/main.js +3618 -0
- package/lib/module.js +3582 -0
- package/lib/routers.js +598 -0
- package/lib/wire.js +507 -0
- package/package.json +78 -0
package/lib/main.d.ts
ADDED
|
@@ -0,0 +1,592 @@
|
|
|
1
|
+
import { Broadcast } from '@toolcase/base';
|
|
2
|
+
import { Logger } from '@toolcase/logging';
|
|
3
|
+
import { Rivalis, Client } from '@rivalis/core';
|
|
4
|
+
import { EndpointError } from '@toolcase/node';
|
|
5
|
+
|
|
6
|
+
/**
|
|
7
|
+
* Orchestrator configuration normalization and defaults (§9). Consumed by the
|
|
8
|
+
* Orchestrator (§9) and the CLI (§12).
|
|
9
|
+
*
|
|
10
|
+
* Scope note: this resolves option shapes and defaults and normalizes the
|
|
11
|
+
* `string | string[]` key surface for rotation (§13). The §13 *security* checks
|
|
12
|
+
* — key-strength enforcement, agent/admin audience separation, the production
|
|
13
|
+
* refuse-to-start rules — live in {@link enforceSecurityPolicy} below, called by
|
|
14
|
+
* the Orchestrator constructor: structural normalization and security hardening
|
|
15
|
+
* stay separate, but both have their home in this module.
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
/**
|
|
19
|
+
* Public orchestrator option surface (§9). Lives next to its primary consumer
|
|
20
|
+
* (config resolution) rather than in the pure data model — it is consumer
|
|
21
|
+
* configuration, not a read-model type. Re-exported from `main.ts`.
|
|
22
|
+
*/
|
|
23
|
+
interface OrchestratorOptions {
|
|
24
|
+
/** Bind address (default 0.0.0.0). */
|
|
25
|
+
host?: string;
|
|
26
|
+
port: number;
|
|
27
|
+
/** Agent auth key(s) — agents connect with any listed key. */
|
|
28
|
+
agentKey: string | string[];
|
|
29
|
+
/** REST admin key(s) — required whenever the REST API is served, i.e. `api: true` or `api` left at its default of `true`. */
|
|
30
|
+
adminKey?: string | string[];
|
|
31
|
+
/** Serve REST /v1 (default true). */
|
|
32
|
+
api?: boolean;
|
|
33
|
+
heartbeatMs?: number;
|
|
34
|
+
commandTimeoutMs?: number;
|
|
35
|
+
/** CORS allow-origins, or false (default) for same-origin only. */
|
|
36
|
+
cors?: false | {
|
|
37
|
+
origins: string[];
|
|
38
|
+
};
|
|
39
|
+
/** Allow `?key=` auth on /v1/events for browser EventSource (§10, §13). */
|
|
40
|
+
sseQueryAuth?: boolean;
|
|
41
|
+
/**
|
|
42
|
+
* Trust `X-Forwarded-For` from a front proxy (default `false`, §13). When a
|
|
43
|
+
* TLS-terminating reverse proxy / service mesh sits in front, enable this so the
|
|
44
|
+
* per-IP failed-auth throttle and audit logs key on the real client IP (`req.ip`,
|
|
45
|
+
* resolved from the forwarded header) instead of collapsing every client into the
|
|
46
|
+
* proxy's single socket address. Leave off for direct exposure — a spoofable
|
|
47
|
+
* header must not be trusted from an untrusted network.
|
|
48
|
+
*/
|
|
49
|
+
trustProxy?: boolean;
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
/**
|
|
53
|
+
* Internal Rivalis Room (type `@rivalis/fleet`) that hosts connected agents as
|
|
54
|
+
* actors — the orchestrator dogfoods Rivalis for agent transport (§7). Each
|
|
55
|
+
* agent socket is an actor in this single room; the room binds the agent → orch
|
|
56
|
+
* reply topics (`fleet/state`, `fleet/ack`) and forwards every frame, join, and
|
|
57
|
+
* leave to the {@link FleetController} (the Orchestrator).
|
|
58
|
+
*
|
|
59
|
+
* `unknownTopicPolicy = 'kick'` (task 011): under strict orchestrator-driven
|
|
60
|
+
* request/reply, every agent frame must be a reply to an outstanding request — an
|
|
61
|
+
* unbound topic is an unsolicited frame and the agent is kicked. This supersedes
|
|
62
|
+
* the pre-011 `'drop'` forward-compat stance (a major bump now guards
|
|
63
|
+
* compatibility, §7). A frame on a *bound* reply topic still flows to the
|
|
64
|
+
* controller, which checks the correlation id and kicks if it matches no
|
|
65
|
+
* outstanding request.
|
|
66
|
+
*
|
|
67
|
+
* The class is produced by a **factory** rather than declared statically so that
|
|
68
|
+
* `@rivalis/core` is required lazily (the orchestrator loads core inside
|
|
69
|
+
* `listen()`; importing `@rivalis/fleet` must not eagerly drag core's ESM build
|
|
70
|
+
* in — mirrors `FleetAgent`'s lazy `loadCore`). The factory closes over the
|
|
71
|
+
* controller, so the room needs no per-instance wiring after construction.
|
|
72
|
+
*/
|
|
73
|
+
/**
|
|
74
|
+
* A connected agent as seen by the control plane — an abstraction over the
|
|
75
|
+
* core `Actor`. The orchestrator never touches `Actor`/`Room` directly; it sends
|
|
76
|
+
* binary-encoded topic frames and closes wedged sockets through this seam, which also makes the
|
|
77
|
+
* control plane unit-testable without a live WebSocket (§15).
|
|
78
|
+
*/
|
|
79
|
+
interface AgentLink {
|
|
80
|
+
/** Connection-scoped instance id — the actor id assigned by core (§6). */
|
|
81
|
+
readonly instanceId: string;
|
|
82
|
+
/** Send a topic frame to this agent; payloads are binary-encoded (§7, task 005). */
|
|
83
|
+
send(topic: string, payload: unknown): void;
|
|
84
|
+
/** Kick the agent's socket (used to evict a wedged-but-connected instance, §7). */
|
|
85
|
+
close(): void;
|
|
86
|
+
}
|
|
87
|
+
/** What the FleetRoom forwards into the Orchestrator. Implemented by the Orchestrator. */
|
|
88
|
+
interface FleetController {
|
|
89
|
+
handleAgentJoin(link: AgentLink): void;
|
|
90
|
+
handleAgentLeave(instanceId: string): void;
|
|
91
|
+
handleAgentMessage(instanceId: string, topic: string, payload: Uint8Array): void;
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
/**
|
|
95
|
+
* Pure fleet data model (§6) — the read-model rows surfaced over the library and
|
|
96
|
+
* REST APIs and the placement-request shape. No I/O, no wire-format concerns:
|
|
97
|
+
* the agent builds `fleet/state` payloads (see `../wire`) which the orchestrator
|
|
98
|
+
* validates into these `InstanceInfo`/`RoomInfo` rows.
|
|
99
|
+
*/
|
|
100
|
+
/** Lifecycle status of an instance (§6). The agent owns this value (§7). */
|
|
101
|
+
type InstanceStatus = 'active' | 'draining';
|
|
102
|
+
/** Resolved capacity declaration; `null` means "unlimited" for that dimension. */
|
|
103
|
+
interface Capacity {
|
|
104
|
+
maxConnections: number | null;
|
|
105
|
+
maxRooms: number | null;
|
|
106
|
+
}
|
|
107
|
+
type PlacementStrategy = 'least-loaded' | 'most-loaded' | 'random';
|
|
108
|
+
interface PlacementRequest {
|
|
109
|
+
/** Pin to a connection-scoped instance id (see §9 pinning caveat). */
|
|
110
|
+
instanceId?: string;
|
|
111
|
+
/** Pin to an instance by its stable process id. */
|
|
112
|
+
processUid?: string;
|
|
113
|
+
strategy?: PlacementStrategy;
|
|
114
|
+
/** Only instances matching all listed labels are candidates. */
|
|
115
|
+
labels?: Record<string, string>;
|
|
116
|
+
/** Pinning to a draining instance requires `force: true`. */
|
|
117
|
+
force?: boolean;
|
|
118
|
+
}
|
|
119
|
+
interface InstanceInfo {
|
|
120
|
+
id: string;
|
|
121
|
+
name: string;
|
|
122
|
+
processUid: string;
|
|
123
|
+
endpointUrl: string;
|
|
124
|
+
labels: Record<string, string>;
|
|
125
|
+
roomTypes: string[];
|
|
126
|
+
rooms: RoomInfo[];
|
|
127
|
+
connections: number;
|
|
128
|
+
capacity: Capacity;
|
|
129
|
+
autoCreate: boolean;
|
|
130
|
+
status: InstanceStatus;
|
|
131
|
+
lastSyncAt: number;
|
|
132
|
+
agentVersion: string;
|
|
133
|
+
protocolVersion: number;
|
|
134
|
+
}
|
|
135
|
+
interface RoomInfo {
|
|
136
|
+
id: string;
|
|
137
|
+
type: string;
|
|
138
|
+
connections: number;
|
|
139
|
+
instanceId: string;
|
|
140
|
+
endpointUrl: string;
|
|
141
|
+
local: boolean;
|
|
142
|
+
}
|
|
143
|
+
interface FleetStats {
|
|
144
|
+
instances: number;
|
|
145
|
+
rooms: number;
|
|
146
|
+
connections: number;
|
|
147
|
+
roomTypes: string[];
|
|
148
|
+
stateHash: string;
|
|
149
|
+
}
|
|
150
|
+
/** Stable, machine-readable error codes surfaced over REST (§10) and control APIs. */
|
|
151
|
+
type FleetErrorCode = 'VALIDATION' | 'UNAUTHORIZED' | 'INSTANCE_NOT_FOUND' | 'ROOM_NOT_FOUND' | 'NO_CANDIDATE' | 'ROOM_EXISTS' | 'INSTANCE_DRAINING' | 'PAYLOAD_TOO_LARGE' | 'INSTANCE_BUSY' | 'AUTH_THROTTLED' | 'SSE_LIMIT' | 'COMMAND_FAILED' | 'INSTANCE_DISCONNECTED' | 'COMMAND_TIMEOUT';
|
|
152
|
+
type FleetEventType = 'instance:join' | 'instance:leave' | 'instance:stale' | 'room:create' | 'room:destroy' | 'sync';
|
|
153
|
+
interface FleetEvent {
|
|
154
|
+
type: FleetEventType;
|
|
155
|
+
data?: unknown;
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
/**
|
|
159
|
+
* The fleet's coded error hierarchy (task 004) — `FleetError extends
|
|
160
|
+
* EndpointError`, with the HTTP status carried **on the error class** instead of
|
|
161
|
+
* a side table in the REST router. The node-service blueprint puts domain errors
|
|
162
|
+
* here in `src/domain/errors.ts` and maps them with `errorMeta(e)`, so the router
|
|
163
|
+
* no longer owns a parallel `code → status` table that could drift from the throw
|
|
164
|
+
* sites (spec §10 codes are unchanged; only the mapping *mechanism* moves).
|
|
165
|
+
*
|
|
166
|
+
* ──────────────────────────────────────────────────────────────────────────────
|
|
167
|
+
* `@toolcase/node` adoption (task 006)
|
|
168
|
+
*
|
|
169
|
+
* `EndpointError`, `errorMeta`, and `isLibError` now come from **`@toolcase/node`**
|
|
170
|
+
* (the local port that stood in until task 006 — while `@toolcase/node` was not
|
|
171
|
+
* loadable here — is deleted). The package is listed in `tsup.config.ts`'s
|
|
172
|
+
* `external`, so the one `EndpointError` class identity is shared across every
|
|
173
|
+
* bundle: a {@link FleetError} thrown in the `FleetState`/`Orchestrator` bundle is
|
|
174
|
+
* recognized by the router bundle's `errorMeta` via `instanceof EndpointError`
|
|
175
|
+
* (the *base* is externalized; `FleetError` itself is still bundled per-entry, but
|
|
176
|
+
* the mapping checks the shared base, never `FleetError`). That is exactly the
|
|
177
|
+
* cross-bundle correctness the earlier structural port was working around.
|
|
178
|
+
* ──────────────────────────────────────────────────────────────────────────────
|
|
179
|
+
*/
|
|
180
|
+
|
|
181
|
+
/**
|
|
182
|
+
* Coded error surfaced by placement, the command engine, and REST validation
|
|
183
|
+
* (§9/§10). Extends `@toolcase/node`'s {@link EndpointError}, resolving its HTTP
|
|
184
|
+
* `statusCode` from the §10 table at construction — so `errorMeta` maps it (via
|
|
185
|
+
* `instanceof EndpointError`) without the router knowing the table. The public
|
|
186
|
+
* `code` contract (a {@link FleetErrorCode}) is the documented REST envelope `cause`
|
|
187
|
+
* (spec §10) and is unchanged.
|
|
188
|
+
*/
|
|
189
|
+
declare class FleetError extends EndpointError {
|
|
190
|
+
readonly code: FleetErrorCode;
|
|
191
|
+
constructor(code: FleetErrorCode, message: string);
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
/**
|
|
195
|
+
* Shared default timer scheduler (task 002). One definition of the `unref`-ing
|
|
196
|
+
* `setTimeout`/`setInterval` wrapper that the orchestrator and the agent each
|
|
197
|
+
* carried. `unref` so a lingering timer never pins the process.
|
|
198
|
+
*
|
|
199
|
+
* Typed as a structural superset — timeouts *and* intervals — so the single
|
|
200
|
+
* value satisfies both the orchestrator's timeouts-only `OrchestratorScheduler`
|
|
201
|
+
* seam and the agent's `AgentScheduler` (which also drives heartbeat/poll
|
|
202
|
+
* intervals). The injectable scheduler seams in each consumer are unchanged;
|
|
203
|
+
* tests still pass their own fakes.
|
|
204
|
+
*/
|
|
205
|
+
/**
|
|
206
|
+
* Timeouts-only timer seam shared by the orchestrator and every collaborator it
|
|
207
|
+
* injects ({@link CommandEngine}, {@link Poller}). One definition so the
|
|
208
|
+
* decomposed pieces (and their unit tests) take the same fake clock the
|
|
209
|
+
* Orchestrator does (§15).
|
|
210
|
+
*/
|
|
211
|
+
interface TimerScheduler {
|
|
212
|
+
setTimeout(fn: () => void, ms: number): unknown;
|
|
213
|
+
clearTimeout(handle: unknown): void;
|
|
214
|
+
}
|
|
215
|
+
|
|
216
|
+
/**
|
|
217
|
+
* Wire-protocol constants (§7) — protocol versioning, the in-flight command cap,
|
|
218
|
+
* and the topic name table exchanged between agent and orchestrator. Per-topic
|
|
219
|
+
* payload shapes (binary-encoded on the wire since protocol v2) live in `./payloads`.
|
|
220
|
+
*/
|
|
221
|
+
/**
|
|
222
|
+
* Protocol MAJOR spoken by agent and orchestrator — a single integer (§7).
|
|
223
|
+
*
|
|
224
|
+
* Bumped 1 → 2 by task 005: the wire format changed from JSON to binary
|
|
225
|
+
* (`@toolcase/serializer`), a breaking change within the major.
|
|
226
|
+
*
|
|
227
|
+
* Bumped 2 → 3 by task 011: the protocol was inverted to strict
|
|
228
|
+
* orchestrator-driven request/reply. Agent push topics (`fleet/sync`,
|
|
229
|
+
* `fleet/ping`, `fleet/resync`, `fleet/status`, `fleet/status-ack`) are gone;
|
|
230
|
+
* the orchestrator now polls (`fleet/poll`) and the agent replies (`fleet/state`).
|
|
231
|
+
* A v2 (push) peer and a v3 (poll) peer cannot interoperate — both halves must be
|
|
232
|
+
* upgraded in lockstep. The 2-byte version header on every frame
|
|
233
|
+
* (`wire/serializer`) is what makes the mismatch fail loudly at `fleet/hello`.
|
|
234
|
+
*/
|
|
235
|
+
declare const PROTOCOL_VERSION = 3;
|
|
236
|
+
|
|
237
|
+
/**
|
|
238
|
+
* Embeddable fleet orchestrator (§9) — the dogfooded control plane (§7). After the
|
|
239
|
+
* task-008 decomposition this is the **facade** wiring its collaborators, each a
|
|
240
|
+
* separately unit-tested concern: {@link AgentAuthenticator} (agent-key match),
|
|
241
|
+
* {@link CommandEngine} (pending commands + cap + settle), {@link Poller}
|
|
242
|
+
* (orchestrator-driven polling + missed-reply stale/evict, task 011),
|
|
243
|
+
* {@link EventReconciler} (read-model diffs → events), {@link FleetControl}
|
|
244
|
+
* (create/destroy/drain), and `transport.ts` (control-plane bootstrap). The
|
|
245
|
+
* Orchestrator retains config resolution, the `FleetApi` facade, poll dispatch +
|
|
246
|
+
* per-agent outstanding-request enforcement, and lifecycle (`listen`/`shutdown`).
|
|
247
|
+
*
|
|
248
|
+
* `listen()` is the only method that touches core / the network; constructing an
|
|
249
|
+
* Orchestrator loads no core, so the control plane is exercised directly in unit
|
|
250
|
+
* tests against the {@link AgentLink} seams and an injectable scheduler (§15).
|
|
251
|
+
*/
|
|
252
|
+
|
|
253
|
+
/** Minimal timer surface (timeouts only); tests inject a virtual-time fake (§15). */
|
|
254
|
+
type OrchestratorScheduler = TimerScheduler;
|
|
255
|
+
/** Test/advanced seams kept off the public option surface (§9). */
|
|
256
|
+
interface OrchestratorInternals {
|
|
257
|
+
scheduler?: OrchestratorScheduler;
|
|
258
|
+
/** Wall clock for `lastSyncAt`; default `Date.now`. */
|
|
259
|
+
now?: () => number;
|
|
260
|
+
logger?: Logger;
|
|
261
|
+
/** `NODE_ENV` override for the §13 production refuse-to-start rules; default sourced from `src/env.ts`. */
|
|
262
|
+
env?: string;
|
|
263
|
+
}
|
|
264
|
+
/** Read-model + control surface exposed as `orchestrator.fleet` (§9). */
|
|
265
|
+
interface FleetApi {
|
|
266
|
+
readonly stats: FleetStats;
|
|
267
|
+
readonly instances: InstanceInfo[];
|
|
268
|
+
readonly rooms: RoomInfo[];
|
|
269
|
+
getInstance(id: string): InstanceInfo | null;
|
|
270
|
+
getRoom(roomId: string): RoomInfo | null;
|
|
271
|
+
findRooms(filter?: {
|
|
272
|
+
type?: string;
|
|
273
|
+
instanceId?: string;
|
|
274
|
+
labels?: Record<string, string>;
|
|
275
|
+
}): RoomInfo[];
|
|
276
|
+
createRoom(request: {
|
|
277
|
+
type: string;
|
|
278
|
+
roomId?: string;
|
|
279
|
+
placement?: PlacementRequest;
|
|
280
|
+
}): Promise<RoomInfo>;
|
|
281
|
+
destroyRoom(roomId: string): Promise<void>;
|
|
282
|
+
drainInstance(instanceId: string): Promise<void>;
|
|
283
|
+
undrainInstance(instanceId: string): Promise<void>;
|
|
284
|
+
}
|
|
285
|
+
declare class Orchestrator extends Broadcast implements FleetController {
|
|
286
|
+
readonly fleet: FleetApi;
|
|
287
|
+
private readonly config;
|
|
288
|
+
private readonly state;
|
|
289
|
+
private readonly now;
|
|
290
|
+
private logger;
|
|
291
|
+
/** `fleet:http` logger; NOOP until `listen()` loads core's logging factory. */
|
|
292
|
+
private httpLogger;
|
|
293
|
+
/** Fastify-based REST /v1 surface over the same `node:http` server (§10, task 006). */
|
|
294
|
+
private readonly httpApi;
|
|
295
|
+
private readonly auth;
|
|
296
|
+
private readonly commands;
|
|
297
|
+
private readonly poller;
|
|
298
|
+
private readonly reconciler;
|
|
299
|
+
private readonly control;
|
|
300
|
+
/** Live agent links keyed by connection-scoped instance id. */
|
|
301
|
+
private readonly links;
|
|
302
|
+
private rivalis;
|
|
303
|
+
private httpServer;
|
|
304
|
+
private listening;
|
|
305
|
+
private transportAttached;
|
|
306
|
+
constructor(options: OrchestratorOptions, internals?: OrchestratorInternals);
|
|
307
|
+
/**
|
|
308
|
+
* Bridge every {@link FleetEventType} broadcast (§9) into one SSE listener as a
|
|
309
|
+
* {@link FleetEvent} `{ type, data }`; returns an unsubscribe (called on stream close, §10).
|
|
310
|
+
*/
|
|
311
|
+
private subscribeFleetEvents;
|
|
312
|
+
/** True once HTTP is listening and the WS transport is attached (drives `/readyz`, task 010). */
|
|
313
|
+
get ready(): boolean;
|
|
314
|
+
/** Start the HTTP/WS server, attach the internal Rivalis room, begin accepting agents (§9). */
|
|
315
|
+
listen(): Promise<void>;
|
|
316
|
+
/** Gracefully stop: reject in-flight commands, destroy rooms, dispose transport, close HTTP (§9). */
|
|
317
|
+
shutdown(): Promise<void>;
|
|
318
|
+
/** @internal Agent joined: assign id, send `fleet/hello`, start polling (§7, task 011). */
|
|
319
|
+
handleAgentJoin(link: AgentLink): void;
|
|
320
|
+
/** @internal Agent socket closed: evict instantly, rejecting any in-flight commands (§7). */
|
|
321
|
+
handleAgentLeave(instanceId: string): void;
|
|
322
|
+
/**
|
|
323
|
+
* @internal Inbound agent frame (task 011). Every agent frame must be a reply to
|
|
324
|
+
* an outstanding orchestrator request — `fleet/state` to a `fleet/poll`,
|
|
325
|
+
* `fleet/ack` to a `fleet/cmd`. A well-formed frame whose correlation id matches
|
|
326
|
+
* no outstanding request (spontaneous, duplicate, or post-settle) is an
|
|
327
|
+
* unsolicited frame → kick. A malformed / version-incompatible frame is logged
|
|
328
|
+
* and dropped (the lockstep-mismatch path is evicted by missed polls, §7/§8).
|
|
329
|
+
*/
|
|
330
|
+
handleAgentMessage(instanceId: string, topic: string, payload: Uint8Array | string): void;
|
|
331
|
+
/** Build and send a `fleet/poll`: knownHash drives dedup, status echoes for drain confirmation. */
|
|
332
|
+
private sendPoll;
|
|
333
|
+
/**
|
|
334
|
+
* Ingest a `fleet/state` poll reply (task 011). The reply must match the
|
|
335
|
+
* outstanding poll's `reqId` (consumed via the poller); an unmatched reply is
|
|
336
|
+
* unsolicited → kick. A full reply is bounds-checked (§13) and applied; a
|
|
337
|
+
* hash-only reply just refreshes liveness (the snapshot is unchanged).
|
|
338
|
+
*/
|
|
339
|
+
private handleState;
|
|
340
|
+
private handleAck;
|
|
341
|
+
/**
|
|
342
|
+
* Kick an agent that broke the request/reply contract (task 011): tear it down
|
|
343
|
+
* (rejecting in-flight commands, removing it from the read model) and close the
|
|
344
|
+
* socket so it reconnects fresh. The log line names the cause and the instance —
|
|
345
|
+
* never the offending payload's contents (§13).
|
|
346
|
+
*/
|
|
347
|
+
private kick;
|
|
348
|
+
private onStale;
|
|
349
|
+
private onEvict;
|
|
350
|
+
/**
|
|
351
|
+
* Remove an instance from every table, reject its in-flight commands immediately
|
|
352
|
+
* with `INSTANCE_DISCONNECTED` (§7), and reconcile (its rooms → `room:destroy`, `sync`).
|
|
353
|
+
*/
|
|
354
|
+
private teardownInstance;
|
|
355
|
+
/**
|
|
356
|
+
* Decode a binary agent frame for `topic` (§7). Returns `null` on any failure —
|
|
357
|
+
* never throws into the host (§8): a protocol-incompatible frame (e.g. a legacy
|
|
358
|
+
* JSON agent against this v3 binary-protocol orchestrator) or a malformed/truncated one is logged
|
|
359
|
+
* and dropped, and the read model keeps its last good state.
|
|
360
|
+
*/
|
|
361
|
+
private decode;
|
|
362
|
+
private emitEvent;
|
|
363
|
+
/**
|
|
364
|
+
* Run a timer- / transport- / core-dispatch-driven callback, swallowing and
|
|
365
|
+
* logging any throw so it never escapes into a raw `setTimeout` (an
|
|
366
|
+
* `uncaughtException` that would crash the whole control plane) or back into
|
|
367
|
+
* core's room dispatch (§14 failure modes). Mirrors the agent's host-safety
|
|
368
|
+
* `guard` (§8): the orchestrator is the single point of coordination, so one
|
|
369
|
+
* unhandled throw on a poll tick, a snapshot application, or a liveness deadline
|
|
370
|
+
* must degrade to a logged failure on one instance, never an orchestrator-wide
|
|
371
|
+
* outage. Never rethrows.
|
|
372
|
+
*/
|
|
373
|
+
private guard;
|
|
374
|
+
}
|
|
375
|
+
|
|
376
|
+
/**
|
|
377
|
+
* Instance-side fleet client (§8). Attaches a `Rivalis` instance to an
|
|
378
|
+
* orchestrator over core's hardened `WSClient` (task 002, `ticketSource:
|
|
379
|
+
* 'protocol'` so the agent key never lands in a URL — §13): it reports the
|
|
380
|
+
* instance's rooms/connections and executes orchestrator-pushed room commands.
|
|
381
|
+
*
|
|
382
|
+
* Strict orchestrator-driven request/reply (task 011): the agent never pushes
|
|
383
|
+
* state spontaneously. The orchestrator polls (`fleet/poll`) on its own cadence
|
|
384
|
+
* and the agent answers with `fleet/state` — a full snapshot when its hash differs
|
|
385
|
+
* from the poll's `knownHash`, a hash-only reply otherwise. `drain()`/`undrain()`
|
|
386
|
+
* flip the agent-owned status locally and resolve when a subsequent poll echoes the
|
|
387
|
+
* target status (an acknowledged confirmation, no unsolicited frame).
|
|
388
|
+
*
|
|
389
|
+
* The load-bearing contract (§8): **never throws into the host process from
|
|
390
|
+
* network failures**. Every transport callback is wrapped, failures are logged
|
|
391
|
+
* via `rivalis.logging.getLogger('fleet:agent')`, and the agent reconnects with
|
|
392
|
+
* exponential backoff (0.5 s → 30 s cap, full jitter — §7). This is what forces
|
|
393
|
+
* the §4 `WSClient` hardening: the unhardened client crashes the host on the
|
|
394
|
+
* first `ECONNREFUSED`.
|
|
395
|
+
*/
|
|
396
|
+
|
|
397
|
+
/**
|
|
398
|
+
* Public agent option surface (§8). Lives next to its sole consumer rather than
|
|
399
|
+
* in the pure data model — it is consumer configuration, not a read-model type.
|
|
400
|
+
* Re-exported from `main.ts`. Note: no `heartbeatMs` — the interval is assigned
|
|
401
|
+
* by the orchestrator in `fleet/hello` (single source of truth, §7).
|
|
402
|
+
*/
|
|
403
|
+
interface FleetAgentOptions {
|
|
404
|
+
/** Orchestrator WS endpoint. */
|
|
405
|
+
url: string;
|
|
406
|
+
/** Agent key (sent via WS subprotocol, never query string — §13). */
|
|
407
|
+
key: string;
|
|
408
|
+
/** Public URL game clients use to connect to this instance. */
|
|
409
|
+
endpointUrl: string;
|
|
410
|
+
/** Human-readable instance name. */
|
|
411
|
+
name: string;
|
|
412
|
+
labels?: Record<string, string>;
|
|
413
|
+
capacity?: {
|
|
414
|
+
maxConnections?: number | null;
|
|
415
|
+
maxRooms?: number | null;
|
|
416
|
+
};
|
|
417
|
+
/** Allow orchestrator-initiated `rooms.create` (default true). */
|
|
418
|
+
autoCreate?: boolean;
|
|
419
|
+
/** Reject `connect()` after this deadline instead of retrying forever. */
|
|
420
|
+
connectTimeoutMs?: number;
|
|
421
|
+
}
|
|
422
|
+
/** Lifecycle status surfaced by `agent.status` (§8). Distinct from the snapshot's `active`/`draining`. */
|
|
423
|
+
type AgentLifecycleStatus = 'connecting' | 'connected' | 'draining' | 'closed';
|
|
424
|
+
/** Opaque timer handle — `unknown` so an injected fake scheduler can return anything. */
|
|
425
|
+
type TimerHandle = unknown;
|
|
426
|
+
/** Injectable timer surface so tests drive heartbeat/debounce/backoff deterministically. */
|
|
427
|
+
interface AgentScheduler {
|
|
428
|
+
setTimeout(fn: () => void, ms: number): TimerHandle;
|
|
429
|
+
clearTimeout(handle: TimerHandle): void;
|
|
430
|
+
setInterval(fn: () => void, ms: number): TimerHandle;
|
|
431
|
+
clearInterval(handle: TimerHandle): void;
|
|
432
|
+
}
|
|
433
|
+
/**
|
|
434
|
+
* Test/advanced seams kept off the public `FleetAgentOptions` surface (§8 keeps
|
|
435
|
+
* the documented constructor to `(rivalis, options)`). Mirrors the third-param
|
|
436
|
+
* convention the Snapshot builder uses for its logger.
|
|
437
|
+
*/
|
|
438
|
+
interface AgentInternals {
|
|
439
|
+
createClient?: (url: string) => Client;
|
|
440
|
+
scheduler?: AgentScheduler;
|
|
441
|
+
backoff?: {
|
|
442
|
+
baseMs?: number;
|
|
443
|
+
capMs?: number;
|
|
444
|
+
};
|
|
445
|
+
random?: () => number;
|
|
446
|
+
awaitEmptyPollMs?: number;
|
|
447
|
+
/** Wire process-signal handlers for `enableGracefulShutdown`; returns an uninstaller. */
|
|
448
|
+
installSignalHandlers?: (handler: () => void) => () => void;
|
|
449
|
+
}
|
|
450
|
+
declare class FleetAgent extends Broadcast {
|
|
451
|
+
private readonly rivalis;
|
|
452
|
+
private readonly logger;
|
|
453
|
+
private readonly snapshot;
|
|
454
|
+
private readonly url;
|
|
455
|
+
private readonly key;
|
|
456
|
+
private readonly autoCreate;
|
|
457
|
+
private readonly maxRooms;
|
|
458
|
+
private readonly connectTimeoutMs;
|
|
459
|
+
private readonly client;
|
|
460
|
+
private readonly scheduler;
|
|
461
|
+
private readonly random;
|
|
462
|
+
private readonly backoffBaseMs;
|
|
463
|
+
private readonly backoffCapMs;
|
|
464
|
+
private readonly awaitEmptyPollMs;
|
|
465
|
+
private readonly installSignalHandlers;
|
|
466
|
+
private lifecycle;
|
|
467
|
+
private instanceId;
|
|
468
|
+
/** Set once `connect()`/reconnects should stop (intentional `disconnect()` or fatal error). */
|
|
469
|
+
private closed;
|
|
470
|
+
/** Distinguishes an operator-driven close from a transport drop that should reconnect. */
|
|
471
|
+
private intentionalClose;
|
|
472
|
+
private reconnectTimer;
|
|
473
|
+
private connectDeadline;
|
|
474
|
+
private reconnectAttempt;
|
|
475
|
+
private connectResolve;
|
|
476
|
+
private connectReject;
|
|
477
|
+
/**
|
|
478
|
+
* Pending `drain()` / `undrain()` promises (task 011): each waits for a
|
|
479
|
+
* `fleet/poll` echoing its target status — the orchestrator's acknowledged
|
|
480
|
+
* confirmation that it recorded the agent-owned status flip. No unsolicited frame.
|
|
481
|
+
*/
|
|
482
|
+
private pendingStatus;
|
|
483
|
+
private uninstallSignals;
|
|
484
|
+
/**
|
|
485
|
+
* Whether the room/transport listeners are currently attached (task 008). The
|
|
486
|
+
* subscription lifecycle tracks the connection lifecycle: attached on construct
|
|
487
|
+
* and on every `connect()`, detached on the terminal paths (`disconnect()`,
|
|
488
|
+
* `failConnect()`) so a discarded/replaced agent stops reacting to room events
|
|
489
|
+
* and the host can drop it (otherwise `RoomManager`'s broadcast retains it).
|
|
490
|
+
*/
|
|
491
|
+
private listenersAttached;
|
|
492
|
+
/**
|
|
493
|
+
* Drop provenance when a room is destroyed so a future id reuse is not mis-stamped
|
|
494
|
+
* (§7). Room create/destroy/define no longer trigger a push — changes surface at
|
|
495
|
+
* the next orchestrator poll (task 011).
|
|
496
|
+
*/
|
|
497
|
+
private readonly onRoomDestroy;
|
|
498
|
+
constructor(rivalis: Rivalis, options: FleetAgentOptions, internals?: AgentInternals);
|
|
499
|
+
/** Lifecycle status (§8): `'connecting' | 'connected' | 'draining' | 'closed'`. */
|
|
500
|
+
get status(): AgentLifecycleStatus;
|
|
501
|
+
/** Stable per-process id (§6), constant across reconnects. */
|
|
502
|
+
get processUid(): string;
|
|
503
|
+
/**
|
|
504
|
+
* Connect to the orchestrator; resolves on the first `fleet/hello`. Default:
|
|
505
|
+
* retries forever (backoff per §7) — the promise stays pending while the
|
|
506
|
+
* orchestrator is unreachable. With `connectTimeoutMs` set, rejects after the
|
|
507
|
+
* deadline and transitions to `'closed'` with no background retry loop (§8).
|
|
508
|
+
*/
|
|
509
|
+
connect(): Promise<void>;
|
|
510
|
+
/**
|
|
511
|
+
* Mark this instance draining (§7, task 011): flips the agent-owned status
|
|
512
|
+
* immediately (so the next `fleet/state` reply carries it) and resolves only when
|
|
513
|
+
* a subsequent `fleet/poll` echoes `status: 'draining'` — the orchestrator's
|
|
514
|
+
* acknowledged confirmation that it recorded the flip. No unsolicited frame.
|
|
515
|
+
*/
|
|
516
|
+
drain(): Promise<void>;
|
|
517
|
+
/** Reverse of `drain()` — restore the instance to `active`; resolves on the poll echo (§7). */
|
|
518
|
+
undrain(): Promise<void>;
|
|
519
|
+
/** Resolve once every local room is empty (zero connections), or reject on `timeoutMs` (§8). */
|
|
520
|
+
awaitEmpty({ timeoutMs }?: {
|
|
521
|
+
timeoutMs?: number;
|
|
522
|
+
}): Promise<void>;
|
|
523
|
+
/** Detach cleanly: stop all timers, close the transport, no further reconnects (§8). */
|
|
524
|
+
disconnect(): Promise<void>;
|
|
525
|
+
/**
|
|
526
|
+
* Wire `SIGTERM`/`SIGINT` to the graceful sequence (§8):
|
|
527
|
+
* drain → awaitEmpty → disconnect → `rivalis.shutdown()`.
|
|
528
|
+
*/
|
|
529
|
+
enableGracefulShutdown({ emptyTimeoutMs }?: {
|
|
530
|
+
emptyTimeoutMs?: number;
|
|
531
|
+
}): void;
|
|
532
|
+
private gracefulShutdown;
|
|
533
|
+
/**
|
|
534
|
+
* Attach the room-provenance and transport listeners (task 008). Idempotent —
|
|
535
|
+
* re-`connect()` after a `disconnect()` calls this again but it no-ops while
|
|
536
|
+
* already attached, so listeners are never doubled.
|
|
537
|
+
*/
|
|
538
|
+
private attachListeners;
|
|
539
|
+
/**
|
|
540
|
+
* Detach every listener on the terminal paths (task 008): the rooms broadcast
|
|
541
|
+
* stops retaining this agent (no more `forgetRoom` on room destroy) and the
|
|
542
|
+
* transport handlers are removed. Without this a discarded agent leaks — the
|
|
543
|
+
* `RoomManager` broadcast keeps it alive for the host process's lifetime.
|
|
544
|
+
*/
|
|
545
|
+
private detachListeners;
|
|
546
|
+
private wireClient;
|
|
547
|
+
private subscribeRooms;
|
|
548
|
+
private unsubscribeRooms;
|
|
549
|
+
private openConnection;
|
|
550
|
+
private onTransportOpen;
|
|
551
|
+
private onTransportClose;
|
|
552
|
+
private onTransportError;
|
|
553
|
+
private scheduleReconnect;
|
|
554
|
+
/** Full-jitter exponential backoff: random in `[0, min(cap, base·2^attempt)]` (§7). */
|
|
555
|
+
private backoffDelay;
|
|
556
|
+
private onHello;
|
|
557
|
+
/**
|
|
558
|
+
* Answer an orchestrator `fleet/poll` with a `fleet/state` reply (§7, task 011):
|
|
559
|
+
* full snapshot when our hash differs from the poll's `knownHash`, hash-only
|
|
560
|
+
* otherwise. A poll echoing a pending `drain()`/`undrain()` target status also
|
|
561
|
+
* resolves that promise (the acknowledged confirmation, no unsolicited frame).
|
|
562
|
+
*/
|
|
563
|
+
private onPoll;
|
|
564
|
+
private onCmd;
|
|
565
|
+
private execCreate;
|
|
566
|
+
private execDestroy;
|
|
567
|
+
private execStatusCmd;
|
|
568
|
+
private requestStatus;
|
|
569
|
+
/** Resolve every pending drain()/undrain() whose target matches the poll-echoed status. */
|
|
570
|
+
private resolveStatusOnEcho;
|
|
571
|
+
private sendState;
|
|
572
|
+
private sendAck;
|
|
573
|
+
private send;
|
|
574
|
+
/** Fatal connect failure (timeout or protocol mismatch): reject, close, stop retrying (§8). */
|
|
575
|
+
private failConnect;
|
|
576
|
+
private rejectPendingStatus;
|
|
577
|
+
private clearReconnect;
|
|
578
|
+
private clearConnectDeadline;
|
|
579
|
+
private clearAllTimers;
|
|
580
|
+
/** Run a transport/timer callback, swallowing+logging any throw (§8 host-safety contract). */
|
|
581
|
+
private guard;
|
|
582
|
+
/**
|
|
583
|
+
* Decode an inbound binary frame for a non-hello topic (§7, task 005). Logs +
|
|
584
|
+
* returns `null` on any failure — never throws into the host (§8). A
|
|
585
|
+
* protocol-incompatible frame is logged as a version mismatch; a
|
|
586
|
+
* malformed/truncated frame is logged and dropped. (`fleet/hello` handles a
|
|
587
|
+
* version mismatch itself — a loud connect failure — so it does not use this.)
|
|
588
|
+
*/
|
|
589
|
+
private decodeInbound;
|
|
590
|
+
}
|
|
591
|
+
|
|
592
|
+
export { FleetAgent, type FleetAgentOptions, type FleetApi, FleetError, type FleetErrorCode, type FleetEvent, type FleetEventType, type FleetStats, type InstanceInfo, Orchestrator, type OrchestratorOptions, PROTOCOL_VERSION, type PlacementRequest, type PlacementStrategy, type RoomInfo };
|