@rivalis/fleet 8.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +417 -0
- package/bin/rivalis-fleet.js +10 -0
- package/lib/AgentAuthenticator.js +56 -0
- package/lib/CommandEngine.js +258 -0
- package/lib/EventReconciler.js +90 -0
- package/lib/FleetAgent.js +1217 -0
- package/lib/FleetControl.js +139 -0
- package/lib/FleetState.js +865 -0
- package/lib/Orchestrator.js +2834 -0
- package/lib/Poller.js +113 -0
- package/lib/Snapshot.js +471 -0
- package/lib/canonical.js +82 -0
- package/lib/cli.js +3076 -0
- package/lib/domain.js +97 -0
- package/lib/env.js +99 -0
- package/lib/main.d.ts +592 -0
- package/lib/main.js +3618 -0
- package/lib/module.js +3582 -0
- package/lib/routers.js +598 -0
- package/lib/wire.js +507 -0
- package/package.json +78 -0
package/README.md
ADDED
|
@@ -0,0 +1,417 @@
|
|
|
1
|
+
# `@rivalis/fleet`
|
|
2
|
+
|
|
3
|
+
> Fleet orchestration for Rivalis: instance discovery, room placement, cluster control.
|
|
4
|
+
|
|
5
|
+
A **fleet** is a set of game-server instances and the rooms running on them. This package
|
|
6
|
+
gives you a central **`Orchestrator`** (which instances exist, what room types they host,
|
|
7
|
+
where clients should connect, and remote room create/destroy with acknowledged commands)
|
|
8
|
+
and a **`FleetAgent`** that embeds in each `@rivalis/core` instance to report its state and
|
|
9
|
+
execute orchestrator-pushed commands.
|
|
10
|
+
|
|
11
|
+
> **Strict orchestrator-driven request/reply (protocol v3).** The orchestrator drives the whole
|
|
12
|
+
> conversation: it **polls** each agent (`fleet/poll`) on its own cadence and the agent **replies**
|
|
13
|
+
> (`fleet/state` — a full snapshot or a hash-only liveness reply), and it pushes commands
|
|
14
|
+
> (`fleet/cmd`) the agent acks (`fleet/ack`). **Every agent frame must answer an outstanding
|
|
15
|
+
> request** (matched by correlation id); an unsolicited, duplicate, or unknown-topic frame gets the
|
|
16
|
+
> agent kicked. This shrinks what a compromised agent key can do unsolicited (it cannot spam
|
|
17
|
+
> snapshots or flood acks) and replaces the pre-v3 agent-push model.
|
|
18
|
+
|
|
19
|
+
State is **in-memory and rebuilt from agent poll replies** — no database, restart-safe. The
|
|
20
|
+
orchestrator only orchestrates rooms *within* already-running instances; spawning processes/VMs
|
|
21
|
+
is the job of k8s / Agones / autoscalers, and matchmaking logic is something you build *on top
|
|
22
|
+
of* the fleet API.
|
|
23
|
+
|
|
24
|
+
```
|
|
25
|
+
┌──────────────────────────┐ ┌──────────────────────────────┐
|
|
26
|
+
│ Game server process │ WS (agent key) │ Orchestrator │
|
|
27
|
+
│ Rivalis (@rivalis/core) │ ────────────────────► │ · embedded via Orchestrator │
|
|
28
|
+
│ + FleetAgent ──────────┼ ◄──────────────────── │ · or standalone bin │
|
|
29
|
+
│ │ commands + acks │ REST /v1/* (admin key) │
|
|
30
|
+
└──────────────────────────┘ └──────────────────────────────┘
|
|
31
|
+
× N instances matchmaker / ops / dashboards
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
> ⚠️ **Designed for private networks.** TLS termination is **out of scope** — see
|
|
35
|
+
> [Security](#️-security) before exposing this anywhere.
|
|
36
|
+
|
|
37
|
+
## 📦 Install
|
|
38
|
+
|
|
39
|
+
```sh
|
|
40
|
+
npm install @rivalis/fleet
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
`@rivalis/fleet` adopts the `@toolcase/*` node-service blueprint, so it carries a small set of
|
|
44
|
+
externalized runtime dependencies (the original zero-dependency goal was relaxed in spec §5):
|
|
45
|
+
|
|
46
|
+
```jsonc
|
|
47
|
+
"dependencies": {
|
|
48
|
+
"@toolcase/node": "^4.0.0", // typed env() loader, EndpointError, FieldSchema, Router/RouteHandler
|
|
49
|
+
"@toolcase/serializer": "3.x", // runtime-defined protobuf for the binary agent ↔ orch WS frames
|
|
50
|
+
"@fastify/cors": "^11.2.0", // CORS for the /v1 REST surface + SSE
|
|
51
|
+
"fastify": "^5.8.5", // HTTP server shared with the agent WS transport
|
|
52
|
+
"commander": "^12.1.0", // rivalis-fleet binary flag parser (§12)
|
|
53
|
+
"redis": "^5.12.1" // NOT used by the fleet — see note below
|
|
54
|
+
},
|
|
55
|
+
"peerDependencies": {
|
|
56
|
+
"@rivalis/core": ">=8 <9",
|
|
57
|
+
"@toolcase/base": "3.x",
|
|
58
|
+
"@toolcase/logging": "3.x",
|
|
59
|
+
"ws": "8.x"
|
|
60
|
+
}
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
**Why `redis` is here even though the fleet never opens a Redis connection.** `@toolcase/node@4`
|
|
64
|
+
is a monolithic backend bundle that eager-`require`s `redis` at module top (it ships the KV/leaderboard
|
|
65
|
+
helpers the fleet doesn't use). `redis` is an *optional* peer of `@toolcase/node`, but the eager require
|
|
66
|
+
means `require('@rivalis/fleet')` would throw `MODULE_NOT_FOUND: redis` without it. So it is kept **only
|
|
67
|
+
to satisfy `@toolcase/node@4`'s eager require** — do not remove it while that eager-load persists (a
|
|
68
|
+
smoke test guards this). The fix is upstream: once `@toolcase/node` lazy-loads `redis`, this line drops.
|
|
69
|
+
|
|
70
|
+
The supported `@rivalis/core` peer range is **`>=8 <9`** (any 8.x release).
|
|
71
|
+
|
|
72
|
+
## 🧩 Three consumption modes
|
|
73
|
+
|
|
74
|
+
1. **Library — agent side.** A `FleetAgent` a Rivalis app instantiates to attach itself to an
|
|
75
|
+
orchestrator: it reports the instance's rooms/connections and runs room create/destroy commands.
|
|
76
|
+
2. **Library — orchestrator side.** An `Orchestrator` embeddable in any Node process (custom
|
|
77
|
+
matchmaker, monolith, tests). Full fleet state + control API, plus an optional REST API.
|
|
78
|
+
3. **Binary.** `rivalis-fleet` runs a standalone orchestrator configured by env vars / CLI flags —
|
|
79
|
+
zero code needed to operate a cluster.
|
|
80
|
+
|
|
81
|
+
## 🛰️ Agent side (in each game server)
|
|
82
|
+
|
|
83
|
+
```ts
|
|
84
|
+
import { Rivalis } from '@rivalis/core'
|
|
85
|
+
import { FleetAgent } from '@rivalis/fleet'
|
|
86
|
+
|
|
87
|
+
const rivalis = new Rivalis({ /* ... */ })
|
|
88
|
+
rivalis.rooms.define('match', MatchRoom)
|
|
89
|
+
rivalis.rooms.define('lobby', LobbyRoom)
|
|
90
|
+
|
|
91
|
+
const agent = new FleetAgent(rivalis, {
|
|
92
|
+
url: 'ws://orchestrator.internal:7350', // orchestrator WS endpoint
|
|
93
|
+
key: process.env.FLEET_AGENT_KEY!, // agent key (sent via WS subprotocol, never a URL query)
|
|
94
|
+
endpointUrl: 'wss://eu1.game.example.com', // what game clients should be handed
|
|
95
|
+
name: 'eu1',
|
|
96
|
+
labels: { region: 'eu' },
|
|
97
|
+
capacity: { maxConnections: 2000, maxRooms: 100 },
|
|
98
|
+
// optional:
|
|
99
|
+
autoCreate: true // allow orchestrator-initiated rooms.create (default true)
|
|
100
|
+
// NOTE: no heartbeatMs option — the orchestrator owns the poll cadence (sent in fleet/hello)
|
|
101
|
+
})
|
|
102
|
+
|
|
103
|
+
await agent.connect() // resolves on the first successful fleet/hello.
|
|
104
|
+
// Default: retries forever (exponential backoff) — the promise
|
|
105
|
+
// stays pending while the orchestrator is unreachable. This is
|
|
106
|
+
// documented steady-state behavior, not a hang. Pass
|
|
107
|
+
// connectTimeoutMs to reject after a deadline instead.
|
|
108
|
+
|
|
109
|
+
agent.status // 'connecting' | 'connected' | 'draining' | 'closed'
|
|
110
|
+
|
|
111
|
+
await agent.drain() // stop receiving placements; flips the agent-owned status and
|
|
112
|
+
// resolves when a poll echoes 'draining' (the orchestrator recorded it)
|
|
113
|
+
await agent.awaitEmpty({ timeoutMs: 60_000 }) // resolves when all local rooms are empty
|
|
114
|
+
await agent.disconnect() // detach cleanly
|
|
115
|
+
|
|
116
|
+
// or wire SIGTERM/SIGINT to: drain → awaitEmpty → disconnect → rivalis.shutdown()
|
|
117
|
+
agent.enableGracefulShutdown({ emptyTimeoutMs: 60_000 })
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
The agent **never throws into the host process from network failures** — it logs via
|
|
121
|
+
`rivalis.logging.getLogger('fleet')` and retries with backoff. It tracks **room provenance**:
|
|
122
|
+
rooms it created in response to an orchestrator command are reported `origin: 'fleet'`,
|
|
123
|
+
everything else `origin: 'local'` — and this is the only source of the `RoomInfo.local` flag,
|
|
124
|
+
which survives orchestrator restarts because it lives in the process that owns the rooms.
|
|
125
|
+
|
|
126
|
+
## 🎛️ Orchestrator side (embedded)
|
|
127
|
+
|
|
128
|
+
```ts
|
|
129
|
+
import { Orchestrator } from '@rivalis/fleet'
|
|
130
|
+
|
|
131
|
+
const orchestrator = new Orchestrator({
|
|
132
|
+
host: '0.0.0.0', // bind address (default 0.0.0.0)
|
|
133
|
+
port: 7350,
|
|
134
|
+
agentKey: process.env.FLEET_AGENT_KEY!, // string | string[] — agents connect with any listed key
|
|
135
|
+
adminKey: process.env.FLEET_ADMIN_KEY!, // string | string[] — required when api: true
|
|
136
|
+
api: true, // serve REST /v1 (default true)
|
|
137
|
+
heartbeatMs: 5000,
|
|
138
|
+
commandTimeoutMs: 10000,
|
|
139
|
+
cors: false, // false (default) | { origins: string[] }
|
|
140
|
+
sseQueryAuth: false // allow ?key= auth on /v1/events for EventSource
|
|
141
|
+
})
|
|
142
|
+
|
|
143
|
+
await orchestrator.listen()
|
|
144
|
+
|
|
145
|
+
// ---- read model ----
|
|
146
|
+
orchestrator.fleet.stats // FleetStats
|
|
147
|
+
orchestrator.fleet.instances // InstanceInfo[]
|
|
148
|
+
orchestrator.fleet.rooms // RoomInfo[]
|
|
149
|
+
orchestrator.fleet.getInstance(id) // InstanceInfo | null
|
|
150
|
+
orchestrator.fleet.getRoom(roomId) // RoomInfo | null
|
|
151
|
+
orchestrator.fleet.findRooms({ type: 'match', labels: { region: 'eu' } })
|
|
152
|
+
|
|
153
|
+
// ---- control (all return Promises resolved on agent ack) ----
|
|
154
|
+
const room = await orchestrator.fleet.createRoom({
|
|
155
|
+
type: 'match',
|
|
156
|
+
roomId: 'match-42', // optional — generated if omitted (charset: ^[A-Za-z0-9_-]{1,64}$)
|
|
157
|
+
placement: { // optional — defaults to least-loaded
|
|
158
|
+
// instanceId: 'i_abc', // pin to a connection-scoped instance id (see caveat), OR:
|
|
159
|
+
// processUid: 'p_9f3…', // pin by stable process id, OR:
|
|
160
|
+
strategy: 'least-loaded', // 'least-loaded' | 'most-loaded' | 'random'
|
|
161
|
+
labels: { region: 'eu' }, // only instances matching all labels
|
|
162
|
+
force: false // pinning to a draining instance requires force: true
|
|
163
|
+
}
|
|
164
|
+
}) // → RoomInfo (includes endpointUrl for handing to clients)
|
|
165
|
+
|
|
166
|
+
await orchestrator.fleet.destroyRoom('match-42') // roomId is fleet-unique
|
|
167
|
+
await orchestrator.fleet.drainInstance('i_abc')
|
|
168
|
+
await orchestrator.fleet.undrainInstance('i_abc')
|
|
169
|
+
|
|
170
|
+
// ---- events ----
|
|
171
|
+
orchestrator.on('instance:join', (instance) => {})
|
|
172
|
+
orchestrator.on('instance:leave', (instance) => {})
|
|
173
|
+
orchestrator.on('instance:stale', (instance) => {})
|
|
174
|
+
orchestrator.on('room:create', (room) => {})
|
|
175
|
+
orchestrator.on('room:destroy', (room) => {})
|
|
176
|
+
orchestrator.on('sync', (stats) => {}) // any state change
|
|
177
|
+
|
|
178
|
+
await orchestrator.shutdown()
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
### Pinning caveat (`placement.instanceId`)
|
|
182
|
+
|
|
183
|
+
Instance ids are **connection-scoped** — any reconnect invalidates them, and a matchmaker that
|
|
184
|
+
cached one gets `404 INSTANCE_NOT_FOUND`. The contract is *look up, then pin immediately; treat
|
|
185
|
+
a 404 as "re-lookup, retry once"*. For a **stable handle across reconnects, pin by `processUid`**
|
|
186
|
+
instead — it identifies the process, not the connection. Specifying both is a `400 VALIDATION`.
|
|
187
|
+
|
|
188
|
+
## 🖥️ Binary — `rivalis-fleet`
|
|
189
|
+
|
|
190
|
+
The flag surface is parsed by [`commander`](https://github.com/tj/commander.js) (the help
|
|
191
|
+
screen and validation are generated, not hand-maintained):
|
|
192
|
+
|
|
193
|
+
```
|
|
194
|
+
$ rivalis-fleet --help
|
|
195
|
+
|
|
196
|
+
Usage: rivalis-fleet [options]
|
|
197
|
+
|
|
198
|
+
Options:
|
|
199
|
+
-H, --host <addr> bind address (env FLEET_HOST, default 0.0.0.0)
|
|
200
|
+
-p, --port <n> HTTP/WS port (env FLEET_PORT, default 7350)
|
|
201
|
+
--agent-key <key> agent auth key, repeatable (env FLEET_AGENT_KEY, required*)
|
|
202
|
+
--admin-key <key> REST admin key, repeatable (env FLEET_ADMIN_KEY, required* when --api)
|
|
203
|
+
--no-api disable REST API
|
|
204
|
+
--cors <origin> CORS allow-origin, repeatable (env FLEET_CORS_ORIGINS, default off)
|
|
205
|
+
--sse-query-auth allow ?key= on /v1/events (env FLEET_SSE_QUERY_AUTH, default off)
|
|
206
|
+
--heartbeat <ms> agent heartbeat interval (env FLEET_HEARTBEAT_MS, default 5000)
|
|
207
|
+
--command-timeout <ms> command ack timeout (env FLEET_COMMAND_TIMEOUT_MS, default 10000)
|
|
208
|
+
--log-level <level> trace|debug|info|warn|error (env FLEET_LOG_LEVEL, default info)
|
|
209
|
+
-v, --version output the version number
|
|
210
|
+
-h, --help display help for command
|
|
211
|
+
|
|
212
|
+
* If omitted, a random key (32 bytes from crypto.randomBytes, base64url-encoded) is
|
|
213
|
+
generated and printed once at startup (dev convenience; refused when NODE_ENV=production).
|
|
214
|
+
Supplied keys are checked against the §13 strength rule at startup. Env vars accept
|
|
215
|
+
comma-separated lists for key rotation.
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
```
|
|
219
|
+
$ FLEET_AGENT_KEY=s3cret FLEET_ADMIN_KEY=adm1n rivalis-fleet -p 7350
|
|
220
|
+
[INFO] fleet ▸ orchestrator listening host=(0.0.0.0) port=(7350) api=(/v1) heartbeat=(5000ms)
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
- **Dev-key behavior:** if no `--agent-key` / `--admin-key` (or env var) is supplied, a random
|
|
224
|
+
32-byte key is generated and printed once at startup so you can get going with zero config.
|
|
225
|
+
- **Production refusals:** when `NODE_ENV=production`, the binary **refuses to auto-generate** a
|
|
226
|
+
missing key. The orchestrator also refuses to start with a key shorter than 16 characters, or
|
|
227
|
+
when the agent-key and admin-key lists **intersect** (one key serving both audiences re-opens
|
|
228
|
+
the legacy single-token hole). Keys 16–31 chars long start with a "weak" warning.
|
|
229
|
+
- Comma-separated env values are accepted for **key rotation** (add new → roll callers → remove old).
|
|
230
|
+
|
|
231
|
+
## 🌐 REST API (`/v1`)
|
|
232
|
+
|
|
233
|
+
Served when `api: true` — built on **Fastify** + `@toolcase/node`'s `RouteHandler`/`Router`, sharing
|
|
234
|
+
the same `node:http` server as the agent WebSocket transport (one port for both). Auth is
|
|
235
|
+
`Authorization: Bearer <adminKey>` on everything except `/healthz` and `/readyz`. Request bodies are
|
|
236
|
+
capped at **64 KiB** before any parse (`413 PAYLOAD_TOO_LARGE`).
|
|
237
|
+
|
|
238
|
+
**Response envelope** — `@toolcase/base` `HTTP.RESTResponse` / `HTTP.RESTError`:
|
|
239
|
+
|
|
240
|
+
- **Success:** `{ "status": "OK", "data": … }` (a list response may also carry `"count"`).
|
|
241
|
+
- **Failure:** `{ "status": "rejected", "cause": "<CODE>" }` — `cause` is the stable, machine-readable
|
|
242
|
+
error code (the `FleetErrorCode`s in the table below).
|
|
243
|
+
|
|
244
|
+
> **Breaking change (was `@rivalis/registry` / pre-006):** the legacy envelope
|
|
245
|
+
> `{ message: 'OK' | 'FAIL', data?, code?, cause? }` is replaced. Map old → new:
|
|
246
|
+
> `message: 'OK'` → `status: 'OK'`, `message: 'FAIL'` → `status: 'rejected'`, and the machine-readable
|
|
247
|
+
> `code` now travels in **`cause`**. The HTTP status codes and the `FleetErrorCode` strings are
|
|
248
|
+
> unchanged — only the JSON shape and the field carrying the code moved.
|
|
249
|
+
|
|
250
|
+
| Method | Route | Purpose |
|
|
251
|
+
|--------|-------|---------|
|
|
252
|
+
| GET | `/healthz` | liveness, no auth |
|
|
253
|
+
| GET | `/readyz` | readiness (HTTP listening **and** WS transport attached), no auth |
|
|
254
|
+
| GET | `/v1/stats` | `FleetStats` |
|
|
255
|
+
| GET | `/v1/instances` | all instances |
|
|
256
|
+
| GET | `/v1/instances/:id` | one instance (404 if absent) |
|
|
257
|
+
| GET | `/v1/instances/:id/rooms` | rooms on one instance |
|
|
258
|
+
| POST | `/v1/instances/:id/drain` | mark draining |
|
|
259
|
+
| POST | `/v1/instances/:id/undrain` | restore to active |
|
|
260
|
+
| GET | `/v1/rooms?type=&instanceId=&label=k:v` | rooms cluster-wide; `label` repeatable, all must match |
|
|
261
|
+
| GET | `/v1/rooms/:roomId` | one room (404 if absent) |
|
|
262
|
+
| POST | `/v1/rooms` | **create with placement** — body `{ type, roomId?, placement? }` → `201` `RoomInfo` |
|
|
263
|
+
| DELETE | `/v1/rooms/:roomId` | destroy (orchestrator resolves the owning instance) |
|
|
264
|
+
| GET | `/v1/events` | **Server-Sent Events** stream of fleet events for dashboards |
|
|
265
|
+
|
|
266
|
+
**Conditional requests:** `GET /v1/stats`, `/v1/instances`, and `/v1/rooms` return
|
|
267
|
+
`ETag: W/"<stateHash>"` and honor `If-None-Match` → `304`. The hash covers semantic state only
|
|
268
|
+
(instances, rooms, counts, statuses, capacities) — heartbeat bookkeeping (`lastSyncAt`) is
|
|
269
|
+
excluded, so a quiet fleet actually produces `304`s. The ETag is weak because two bodies with
|
|
270
|
+
equal hashes may still differ in `lastSyncAt`. Polling for changes therefore needs nothing beyond standard HTTP conditional requests.
|
|
271
|
+
|
|
272
|
+
**SSE (`/v1/events`):** a `: ping` comment frame is emitted every 15 s so idle proxies don't kill
|
|
273
|
+
the stream. No event replay — `Last-Event-ID` is not supported; a reconnecting consumer re-`GET`s
|
|
274
|
+
`/v1/stats` + `/v1/instances` to resync, then resumes the stream. For browser `EventSource`
|
|
275
|
+
(which cannot set headers), `?key=<adminKey>` is accepted **only when `sseQueryAuth: true`** —
|
|
276
|
+
see the [security caveat](#️-security).
|
|
277
|
+
|
|
278
|
+
### ✅ Safe retries on `POST /v1/rooms` — the day-one matchmaker contract
|
|
279
|
+
|
|
280
|
+
A `504 COMMAND_TIMEOUT` does **not** mean the room wasn't created — the agent may ack late and the
|
|
281
|
+
next snapshot will surface it. Make retries idempotent by **always passing a client-supplied
|
|
282
|
+
`roomId`**:
|
|
283
|
+
|
|
284
|
+
1. `POST /v1/rooms` with `{ type, roomId }`.
|
|
285
|
+
2. On `504` (or a network error), **retry the same request**.
|
|
286
|
+
3. The retry either **succeeds**, or returns **`409 ROOM_EXISTS`** — treat that as success and
|
|
287
|
+
`GET /v1/rooms/:roomId` to fetch the `RoomInfo`.
|
|
288
|
+
|
|
289
|
+
The same `roomId` is reserved while a create is in flight, so two concurrent creates of one id can
|
|
290
|
+
never both land — exactly one wins, the rest get `409 ROOM_EXISTS`. Adopt this pattern from day
|
|
291
|
+
one; it is the documented contract, not a workaround.
|
|
292
|
+
|
|
293
|
+
### Error codes
|
|
294
|
+
|
|
295
|
+
Each code below is returned in the failure envelope's **`cause`** field (`{ status: 'rejected', cause }`).
|
|
296
|
+
|
|
297
|
+
| HTTP | `cause` | When |
|
|
298
|
+
|------|--------|------|
|
|
299
|
+
| 400 | `VALIDATION` | malformed body/params / `roomId` outside `^[A-Za-z0-9_-]{1,64}$` / both `instanceId` and `processUid` pins |
|
|
300
|
+
| 401 | `UNAUTHORIZED` | missing, unknown, or wrong-audience key — one uniform response for all three |
|
|
301
|
+
| 404 | `INSTANCE_NOT_FOUND`, `ROOM_NOT_FOUND` | unknown id |
|
|
302
|
+
| 409 | `NO_CANDIDATE` | no instance passes the placement filter |
|
|
303
|
+
| 409 | `ROOM_EXISTS` | explicit `roomId` already exists **or is reserved by an in-flight create** |
|
|
304
|
+
| 409 | `INSTANCE_DRAINING` | pinned placement to a draining instance without `force` |
|
|
305
|
+
| 413 | `PAYLOAD_TOO_LARGE` | request body over 64 KiB |
|
|
306
|
+
| 429 | `INSTANCE_BUSY` | per-instance in-flight command cap (32) reached |
|
|
307
|
+
| 429 | `AUTH_THROTTLED` | per-IP failed-auth limit exceeded |
|
|
308
|
+
| 429 | `SSE_LIMIT` | concurrent SSE stream cap (default 100) reached |
|
|
309
|
+
| 502 | `COMMAND_FAILED` | agent acked `ok: false` |
|
|
310
|
+
| 502 | `INSTANCE_DISCONNECTED` | agent dropped with the command in flight (immediate, no timeout wait) |
|
|
311
|
+
| 504 | `COMMAND_TIMEOUT` | no ack within `commandTimeoutMs` |
|
|
312
|
+
|
|
313
|
+
### Migrating from `@rivalis/registry`
|
|
314
|
+
|
|
315
|
+
| Legacy (`@rivalis/registry`) | New |
|
|
316
|
+
|---|---|
|
|
317
|
+
| `GET /api/stats` | `GET /v1/stats` |
|
|
318
|
+
| `GET /api/instances[...]` | `GET /v1/instances[...]` |
|
|
319
|
+
| `POST /api/instances/:id/rooms` | `POST /v1/rooms` with `placement.instanceId` |
|
|
320
|
+
| `DELETE /api/instances/:id/rooms/:roomId` | `DELETE /v1/rooms/:roomId` |
|
|
321
|
+
| raw token in `Authorization` | `Bearer` scheme, separate agent/admin keys |
|
|
322
|
+
|
|
323
|
+
`roomId` is now **fleet-unique** (legacy allowed the same id on different instances and returned an
|
|
324
|
+
array); `GET`/`DELETE /v1/rooms/:roomId` are unambiguous, and `DELETE` no longer needs the instance id.
|
|
325
|
+
|
|
326
|
+
## 🛡️ Security
|
|
327
|
+
|
|
328
|
+
> ⚠️ **Designed for private networks. TLS termination is OUT OF SCOPE.** Front the orchestrator
|
|
329
|
+
> with a reverse proxy or service mesh that terminates TLS. Bind to an internal interface with
|
|
330
|
+
> `host` / `--host` (e.g. `127.0.0.1` or a private NIC) where possible. The private-network
|
|
331
|
+
> assumption covers **transport**, not authentication — keys are still enforced.
|
|
332
|
+
|
|
333
|
+
- **Two keys, two audiences.** `agentKey` authenticates instances (each can only affect its own
|
|
334
|
+
state); `adminKey` authenticates the REST API (full control). **Audience separation is enforced:**
|
|
335
|
+
presenting an agent key to `/v1/*` is a plain `401`, never a downgraded read-only view. If the
|
|
336
|
+
configured agent/admin key lists intersect, the orchestrator warns — and **refuses to start when
|
|
337
|
+
`NODE_ENV=production`**.
|
|
338
|
+
- **Key strength enforced at startup** (production): keys shorter than 16 chars are refused,
|
|
339
|
+
shorter than 32 are warned. The auto-generated dev key is 32 bytes from `crypto.randomBytes`.
|
|
340
|
+
- **Key rotation without downtime.** Both options accept `string | string[]` (env vars accept a
|
|
341
|
+
comma-separated list). Procedure: **add the new key → roll agents/callers to it → remove the old
|
|
342
|
+
key.** No simultaneous fleet-wide restart.
|
|
343
|
+
- **No secrets in URLs.** The agent key travels in the `Sec-WebSocket-Protocol` header
|
|
344
|
+
(`ticketSource: 'protocol'`), never as a `?ticket=` query parameter. The `101` handshake echoes a
|
|
345
|
+
fixed sentinel subprotocol (`rivalis-fleet.v1`), never the key.
|
|
346
|
+
- **`sseQueryAuth` caveat (off by default).** The lone exception to the no-secrets-in-URLs rule is
|
|
347
|
+
the SSE `?key=` fallback for browser `EventSource`, which cannot set headers. It is an explicit
|
|
348
|
+
operator opt-in (`sseQueryAuth: true` / `--sse-query-auth`). ⚠️ **Query strings land in
|
|
349
|
+
proxy/access logs — prefer the `Authorization: Bearer` header form.** A short-lived derived-token
|
|
350
|
+
endpoint is on the roadmap.
|
|
351
|
+
- **Agent data is authenticated, not trusted.** Snapshots are bounds-checked before they touch the
|
|
352
|
+
read model (`endpointUrl` must be a `ws:`/`wss:`/`http:`/`https:` URL ≤ 512 chars; `name` ≤ 64;
|
|
353
|
+
≤ 32 labels; `roomTypes` ≤ 256). A failing snapshot is rejected with a logged warning and the
|
|
354
|
+
read model keeps its last good state.
|
|
355
|
+
- **Uniform, throttled auth failures.** Every failure (missing / unknown / wrong-audience; REST, WS,
|
|
356
|
+
or SSE) returns the identical `401 { status: 'rejected', cause: 'UNAUTHORIZED' }`. Failures are
|
|
357
|
+
rate-limited per source IP (`429 AUTH_THROTTLED`) and logged with IP and route — **never the
|
|
358
|
+
presented credential**. The failed-auth bucket map is bounded (fully-refilled buckets are pruned,
|
|
359
|
+
with a hard cap), so it cannot grow without limit under spoofed-IP churn.
|
|
360
|
+
- **`trustProxy` (off by default).** The per-IP throttle and audit log key on `req.ip`. ⚠️ **Without
|
|
361
|
+
`trustProxy`, that is the direct socket address — so behind a reverse proxy every client collapses
|
|
362
|
+
into the one proxy IP: the throttle is per-proxy, not per-client (10 failed auths from anyone can
|
|
363
|
+
`429` every dashboard/matchmaker), and audit lines all show the proxy.** When you front the
|
|
364
|
+
orchestrator with a *trusted* TLS-terminating proxy/mesh, set `trustProxy: true` (`--trust-proxy` /
|
|
365
|
+
`FLEET_TRUST_PROXY`) so Fastify resolves the real client IP from `X-Forwarded-For`. Leave it off for
|
|
366
|
+
direct exposure — a spoofable header from an untrusted network must not be believed.
|
|
367
|
+
- **Keys never logged.** Logs identify *which* configured key authenticated by an 8-hex-char
|
|
368
|
+
truncated-SHA-256 fingerprint (`key#a1b2c3d4`), so rotation stays observable without printing key
|
|
369
|
+
material. The mutating routes (`POST /v1/rooms`, `DELETE /v1/rooms/:roomId`, and the instance drain/undrain pair) are
|
|
370
|
+
audit-logged: route, key fingerprint, source IP, outcome.
|
|
371
|
+
|
|
372
|
+
- **The orchestrator controls the conversation (protocol v3).** Every agent frame must be a reply to
|
|
373
|
+
an outstanding request (a `fleet/poll` reqId, or a `fleet/cmd` cmdId). An unsolicited, duplicate, or
|
|
374
|
+
unknown-topic frame is kicked — so a compromised agent key **cannot spam snapshots, flood acks, or
|
|
375
|
+
push state on its own schedule**. Defenses are now protocol structure, not just rate limiting.
|
|
376
|
+
|
|
377
|
+
### Residual risk (stated honestly)
|
|
378
|
+
|
|
379
|
+
The agent key is **shared across instances**, so a compromised game node can still register phantom
|
|
380
|
+
instances, advertise fake capacity to skew placement, or accept-and-blackhole placements *when the
|
|
381
|
+
orchestrator asks it to*. It can no longer flood the orchestrator with unsolicited frames (those are
|
|
382
|
+
kicked, see above). On the private networks this is designed for, the remaining surface is an accepted
|
|
383
|
+
risk — documented rather than hidden. The snapshot field caps above limit its blast radius, and
|
|
384
|
+
**per-instance registration tokens** are on the roadmap for deployments that need the stronger story.
|
|
385
|
+
|
|
386
|
+
## 🧯 Failure modes & guarantees
|
|
387
|
+
|
|
388
|
+
| Scenario | Behavior |
|
|
389
|
+
|----------|----------|
|
|
390
|
+
| Orchestrator restarts | Agents reconnect with backoff; full state rebuilt within ~1 poll interval (the orchestrator polls each reconnected agent with `knownHash: null`). Room provenance survives (agents report `origin`); duplicate ids are tie-broken deterministically (earliest joiner keeps the canonical id). |
|
|
391
|
+
| Agent socket drops | Instance evicted instantly; its rooms vanish from the read model (they keep running on the node — this is discovery only). Rejoin restores them; `processUid` correlates the leave/join pair. |
|
|
392
|
+
| Agent drops with commands in flight | All pending commands rejected immediately with `502 INSTANCE_DISCONNECTED` — no waiting out `commandTimeoutMs`. |
|
|
393
|
+
| Agent wedged (connected, silent) | Missed poll replies accrue: marked `stale` at **2 missed polls** (excluded from placement), evicted at **3 missed polls** (≈ 2×/3× the poll interval). In-flight commands are rejected `INSTANCE_DISCONNECTED` on evict. |
|
|
394
|
+
| Agent sends an unsolicited / duplicate / unknown-topic frame | **Kicked and evicted** — every agent frame must answer an outstanding request (matched by correlation id); the kick log names the cause + instance, never the payload. It reconnects fresh. |
|
|
395
|
+
| Local room create/destroy | Surfaces in the read model at the **next poll** (bounded by the poll interval) — local changes no longer push. Orchestrator-initiated creates are read-your-write via the cmd ack. |
|
|
396
|
+
| Command lost / agent slow | Ack timeout → `504`; the next poll reconciles actual state. Retry safely with an explicit `roomId`. (A late ack arriving after the timeout matches no pending command → the agent is kicked and reconnects.) |
|
|
397
|
+
| Destroy races the room's natural end | Agent acks `ok: true, alreadyGone: true` — idempotent, no spurious `502`. |
|
|
398
|
+
| Snapshot approaches the 4 MiB transport frame limit | Agent logs a warning at 50% and an error at 90% — degradation is observable before the hard failure. |
|
|
399
|
+
| Two agents, same `name` | Allowed (names are labels, ids are identity) — logged with a warning since it usually signals a config copy-paste. |
|
|
400
|
+
| Hostile/buggy agent sends malformed/oversized snapshot fields | Snapshot rejected with a logged warning; read model keeps its last good state. |
|
|
401
|
+
|
|
402
|
+
**Consistency stance:** the read model is **eventually consistent with agent truth, bounded by one
|
|
403
|
+
poll interval**. Command acks give read-your-write on the happy path. This matches what matchmaking
|
|
404
|
+
needs; nothing here pretends to be a database.
|
|
405
|
+
|
|
406
|
+
## 🗺️ Roadmap
|
|
407
|
+
|
|
408
|
+
Post-v1, explicitly out of scope today: create-time room `options` passed through to `onCreate`;
|
|
409
|
+
per-room metadata passthrough for richer matchmaking queries; optional Prometheus `/metrics`;
|
|
410
|
+
short-lived derived tokens for SSE/dashboard auth (replacing the `?key=` caveat); per-instance
|
|
411
|
+
registration tokens (closing the shared-agent-key residual risk); chunked `fleet/state` for fleets
|
|
412
|
+
near the 4 MiB frame ceiling; `url: string | string[]` on `FleetAgent` for failover-by-DNS; and
|
|
413
|
+
orchestrator HA (v2) if a single node ever becomes the bottleneck.
|
|
414
|
+
|
|
415
|
+
## License
|
|
416
|
+
|
|
417
|
+
MIT
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
'use strict'
|
|
3
|
+
|
|
4
|
+
// Thin shim → lib/cli.js (§5). All logic lives in the library so it stays testable;
|
|
5
|
+
// this only invokes main() and turns an escaping failure (e.g. listen() EADDRINUSE)
|
|
6
|
+
// into a non-zero exit instead of an unhandled rejection.
|
|
7
|
+
/**
 * Last-resort failure handler for the CLI entry point.
 *
 * Writes the rejection value to stderr (its stack when it carries one,
 * otherwise its string form) and then ends the process with exit code 1.
 */
function reportFatal(error) {
  const hasStack = error && error.stack
  console.error(hasStack ? error.stack : String(error))
  process.exit(1)
}

// Run the CLI; any rejection escaping main() is routed to reportFatal.
require('../lib/cli.js').main().catch(reportFatal)
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// Cached references to the Object built-ins used by the interop helpers below.
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __hasOwnProp = Object.prototype.hasOwnProperty;

/**
 * Defines one enumerable accessor on `target` for every enumerable key of
 * `all`; the value stored under each key is installed as the getter itself.
 */
var __export = (target, all) => {
  for (var name in all) {
    __defProp(target, name, { get: all[name], enumerable: true });
  }
};

/**
 * Mirrors the own properties of `from` onto `to` as live getters.
 *
 * Keys that `to` already owns, and the key equal to `except`, are skipped.
 * Each copied key keeps the enumerability of the source property.
 * Nothing is copied unless `from` is a non-null object or a function.
 * Returns `to`.
 */
var __copyProps = (to, from, except, desc) => {
  const isCopyable = (from && typeof from === "object") || typeof from === "function";
  if (!isCopyable) {
    return to;
  }
  for (const key of __getOwnPropNames(from)) {
    if (__hasOwnProp.call(to, key) || key === except) {
      continue;
    }
    desc = __getOwnPropDesc(from, key);
    // Getter rather than a value copy, so later changes on `from` stay visible.
    __defProp(to, key, { get: () => from[key], enumerable: !desc || desc.enumerable });
  }
  return to;
};

/**
 * Builds the CommonJS export object for `mod`: a fresh object carrying a
 * (non-enumerable) `__esModule: true` marker plus live getters for every own
 * property of `mod`.
 */
var __toCommonJS = (mod) => {
  const marked = __defProp({}, "__esModule", { value: true });
  return __copyProps(marked, mod);
};
|
|
19
|
+
|
|
20
|
+
// src/orchestrator/AgentAuthenticator.ts
|
|
21
|
+
var AgentAuthenticator_exports = {};
|
|
22
|
+
__export(AgentAuthenticator_exports, {
|
|
23
|
+
AgentAuthenticator: () => AgentAuthenticator,
|
|
24
|
+
matchKey: () => matchKey
|
|
25
|
+
});
|
|
26
|
+
module.exports = __toCommonJS(AgentAuthenticator_exports);
|
|
27
|
+
var import_node_crypto = require("crypto");
|
|
28
|
+
/**
 * Looks up `presented` among the configured `keys`.
 *
 * Returns the matching configured key, or `null` when `presented` is not a
 * non-empty string, when `keys` is empty, or when nothing matches.
 *
 * Both sides are reduced to SHA-256 digests before being compared with
 * `timingSafeEqual`. The loop has no early exit: every configured key is
 * hashed and compared on each call, whether or not an earlier one matched.
 */
function matchKey(presented, keys) {
  const usable = typeof presented === "string" && presented.length !== 0 && keys.length !== 0;
  if (!usable) {
    return null;
  }
  const sha256 = (text) => (0, import_node_crypto.createHash)("sha256").update(text).digest();
  const presentedDigest = sha256(presented);
  let matched = null;
  for (const key of keys) {
    const sameDigest = (0, import_node_crypto.timingSafeEqual)(presentedDigest, sha256(key));
    // Keep scanning after a hit; only the remembered key changes.
    matched = sameDigest ? key : matched;
  }
  return matched;
}
|
|
42
|
+
/**
 * Answers whether a presented ticket is one of the configured agent keys.
 * The actual comparison is delegated to `matchKey`.
 */
var AgentAuthenticator = class {
  /** The key list exactly as it was passed to the constructor (same reference). */
  agentKeys;

  /** Stores `agentKeys` for later `matches` calls. */
  constructor(agentKeys) {
    this.agentKeys = agentKeys;
  }

  /** True when `ticket` is one of the configured agent keys (constant-time, §13). */
  matches(ticket) {
    const matchedKey = matchKey(ticket, this.agentKeys);
    return matchedKey !== null;
  }
};
|
|
52
|
+
// Annotate the CommonJS export names for ESM import in node:
|
|
53
|
+
0 && (module.exports = {
|
|
54
|
+
AgentAuthenticator,
|
|
55
|
+
matchKey
|
|
56
|
+
});
|