embark-ai 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +351 -0
- package/botSupervisor.js +237 -0
- package/mc-server/bot/bot.js +1415 -0
- package/mc-server/bot/damagePipeline.js +402 -0
- package/mc-server/bot/engine.js +212 -0
- package/mc-server/bot/entityLiveness.js +121 -0
- package/mc-server/bot/env.js +38 -0
- package/mc-server/bot/environmentPerception.js +384 -0
- package/mc-server/bot/fatalDesyncRecovery.js +42 -0
- package/mc-server/bot/goalRegistry.js +49 -0
- package/mc-server/bot/healthIntegrityWatchdog.js +59 -0
- package/mc-server/bot/llm.js +232 -0
- package/mc-server/bot/locomotionRecovery.js +190 -0
- package/mc-server/bot/logger.js +63 -0
- package/mc-server/bot/memory.js +59 -0
- package/mc-server/bot/movementController.js +110 -0
- package/mc-server/bot/package.json +14 -0
- package/mc-server/bot/positionGuard.js +75 -0
- package/mc-server/bot/recoveryEngine.js +315 -0
- package/mc-server/bot/safeMineflayer.js +129 -0
- package/mc-server/bot/state.js +105 -0
- package/mc-server/bot/tasks.js +939 -0
- package/mc-server/server.properties +74 -0
- package/package.json +44 -0
- package/tui.js +1099 -0
package/README.md
ADDED
|
@@ -0,0 +1,351 @@
|
|
|
1
|
+
# embark-ai — Autonomous Minecraft Agent
|
|
2
|
+
|
|
3
|
+
A Minecraft bot that thinks, survives, defends itself, and completes multi-step tasks.
|
|
4
|
+
Built on **Mineflayer** + **Featherless AI** (cloud) or **Ollama** (local LLM). No TypeScript, no databases.
|
|
5
|
+
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
## What Ember does
|
|
9
|
+
|
|
10
|
+
- **Talks naturally** — LLM interprets intent: "build me a house" chains wood-gathering → plank-crafting → construction automatically
|
|
11
|
+
- **Has a real personality** — stubborn, witty, refuses nonsense, holds grudges
|
|
12
|
+
- **Defends itself** — attacks back when hit, escalates with insults, equips best weapon
|
|
13
|
+
- **Survives autonomously** — manages hunger, energy, hazard avoidance, water escape, falling
|
|
14
|
+
- **Multi-step planning** — single requests like `build a house` or `mine 5 iron ore` chain multiple tasks internally
|
|
15
|
+
- **Grounded perception** — inventory, nearby blocks, mobs, drops read directly from Mineflayer APIs (zero hallucination)
|
|
16
|
+
- **Resilience architecture** — position-guard, desync recovery, task watchdogs, supervised restart with backoff
|
|
17
|
+
- **Reports everything in chat** — if a task fails, you see the reason in-game
|
|
18
|
+
|
|
19
|
+
```
|
|
20
|
+
<player> build me a house
|
|
21
|
+
<Ember> Starting house. Will gather and craft if needed.
|
|
22
|
+
<Ember> Got wood. Making planks. Have 32 planks. Building now.
|
|
23
|
+
<Ember> House done!
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
<player> mine 5 iron_ore
|
|
27
|
+
<Ember> On it.
|
|
28
|
+
<Ember> Mined 5 iron_ore.
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
---
|
|
32
|
+
|
|
33
|
+
## Architecture
|
|
34
|
+
|
|
35
|
+
### Request pipeline
|
|
36
|
+
|
|
37
|
+
```
|
|
38
|
+
chat message
|
|
39
|
+
│
|
|
40
|
+
├─► detectInsult() anger++ / attack at threshold
|
|
41
|
+
│
|
|
42
|
+
▼
|
|
43
|
+
classifyIntent() regex pre-filter (follow / build / attack / ...)
|
|
44
|
+
│
|
|
45
|
+
▼
|
|
46
|
+
evaluateSurvival() short-circuit if critically exhausted
|
|
47
|
+
│
|
|
48
|
+
▼
|
|
49
|
+
buildGroundedState() real inventory + blocks + mobs + drops + anger
|
|
50
|
+
│ only verified Mineflayer sensor data
|
|
51
|
+
▼
|
|
52
|
+
queryLLM() grounded prompt → strict JSON output
|
|
53
|
+
│ Featherless AI or Ollama (same code path)
|
|
54
|
+
▼
|
|
55
|
+
validateLLMOutput() schema check → safeDefault() on failure
|
|
56
|
+
│
|
|
57
|
+
▼
|
|
58
|
+
executeAction() LLM never controls movement directly
|
|
59
|
+
│ routes to runTask() → pathfinder / pvp / craft
|
|
60
|
+
▼
|
|
61
|
+
errors → bot.chat() every failure surfaced in-game
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
### Background loops (always running)
|
|
65
|
+
|
|
66
|
+
| Loop | Interval | What it does |
|
|
67
|
+
|------|----------|-------------|
|
|
68
|
+
| `entityHurt` listener | event | identifies attacker, fights back |
|
|
69
|
+
| Threat loop | 2.5 s | auto-engages hostile mobs within 10 blocks |
|
|
70
|
+
| Anger decay | 1 s | insults +1, hits +4, decays 0.05/s; attacks at level 5 |
|
|
71
|
+
| State loop | 1 s | energy/hunger drain, autonomous goal selection |
|
|
72
|
+
| Agent loop | 250 ms | look-at player, follow, water survival |
|
|
73
|
+
| Environment perception | 3 s | scans 6-block radius for hazards, cliff edges, enclosures |
|
|
74
|
+
| physicsTick guard | ~50 ms | restores NaN position from cache if a hit corrupts it |
|
|
75
|
+
|
|
76
|
+
### Resilience system
|
|
77
|
+
|
|
78
|
+
Ember has a layered resilience architecture that keeps her running under game physics edge cases, server desync, and combat:
|
|
79
|
+
|
|
80
|
+
| Component | File | Role |
|
|
81
|
+
|-----------|------|------|
|
|
82
|
+
| `positionGuard` | `positionGuard.js` | Detects NaN position caused by knockback, restores from cache on every physics tick (~50 ms). Prevents the ~6 s freeze-on-hit. |
|
|
83
|
+
| `entityLiveness` | `entityLiveness.js` | 5-state monitor: `LIVE_VALID → TRANSIENT_INVALID → STALE_USING_CACHE → RECOVERING → FATAL`. Declares desync in ~5–8 s. |
|
|
84
|
+
| `recoveryEngine` | `recoveryEngine.js` | Single arbiter for all recovery. Maps symptoms (DESYNC, STUCK, CRITICAL_HP, …) to escalation classes. Sole owner of `bot.quit()`. |
|
|
85
|
+
| `fatalDesyncRecovery` | `fatalDesyncRecovery.js` | Independent 2 s poller; reports `DESYNC` to `recoveryEngine` when entity is fatally stale. |
|
|
86
|
+
| `healthIntegrityWatchdog` | `healthIntegrityWatchdog.js` | Detects HP drops not caught by the event stream. |
|
|
87
|
+
| `movementController` | `movementController.js` | Sole writer to `bot.pathfinder`. Token-based ownership prevents multiple tasks from racing on movement. |
|
|
88
|
+
| `goalRegistry` | `goalRegistry.js` | Sole mutator of `state.goal`. All goal changes go through `setGoal()`. |
|
|
89
|
+
| `damagePipeline` | `damagePipeline.js` | Single damage-reaction authority. Classifies and routes player hits, mob attacks, environmental damage. Staging flag prevents health-handler / damage-window races. |
|
|
90
|
+
| `locomotionRecovery` | `locomotionRecovery.js` | Raw-control-state escape sequences for physically stuck situations. Separate from desync recovery (which only reconnects). |
|
|
91
|
+
| `environmentPerception` | `environmentPerception.js` | Scans surrounding blocks for lava, water, cliff edges, enclosed spaces, traversability. Returns `{ valid: false }` on NaN position instead of silently reporting "all clear". |
|
|
92
|
+
| `botSupervisor` | `botSupervisor.js` | Wraps the bot process with exit-reason classification, per-class exponential backoff, restart storm protection (≥5 restarts in 10 min → halt). |
|
|
93
|
+
|
|
94
|
+
### LLM integration
|
|
95
|
+
|
|
96
|
+
Both backends use the same `llm.js` code path. Ollama mode overrides `FEATHERLESS_URL` to point at Ollama's OpenAI-compatible endpoint.
|
|
97
|
+
|
|
98
|
+
```
|
|
99
|
+
Featherless AI → api.featherless.ai/v1/chat/completions (cloud, 3000+ models)
|
|
100
|
+
Ollama → localhost:11434/v1/chat/completions (local, private)
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
**LLM output schema:**
|
|
104
|
+
```json
|
|
105
|
+
{
|
|
106
|
+
"decision": "accept | reject | delay",
|
|
107
|
+
"reason": "...",
|
|
108
|
+
"action": "follow | stop | explore | gather_wood | craft_planks | go_to | remember_here |
|
|
109
|
+
attack_mobs | attack_player | collect_items | craft | place_block | mine_block |
|
|
110
|
+
eat_food | flee | escape | build_house_smart | none",
|
|
111
|
+
"say": "...",
|
|
112
|
+
"target": "..."
|
|
113
|
+
}
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
Invalid or missing output falls back to `safeDefault(intent)`. The LLM picks an action label — it never touches movement or pathfinder directly.
|
|
117
|
+
|
|
118
|
+
---
|
|
119
|
+
|
|
120
|
+
## Stack
|
|
121
|
+
|
|
122
|
+
| Layer | Technology |
|
|
123
|
+
|-------|-----------|
|
|
124
|
+
| Bot framework | [Mineflayer](https://github.com/PrismarineJS/mineflayer) |
|
|
125
|
+
| Pathfinding | [mineflayer-pathfinder](https://github.com/PrismarineJS/mineflayer-pathfinder) |
|
|
126
|
+
| Combat | [mineflayer-pvp](https://github.com/PrismarineJS/mineflayer-pvp) |
|
|
127
|
+
| Cloud LLM | [Featherless AI](https://featherless.ai) — OpenAI-compatible API, 3,000+ models |
|
|
128
|
+
| Local LLM | [Ollama](https://ollama.com) — runs models locally, no GPU required for small models |
|
|
129
|
+
| TUI | [blessed](https://github.com/chjj/blessed) |
|
|
130
|
+
| Minecraft server | Vanilla Java Edition 1.21.4 |
|
|
131
|
+
| Runtime | Node.js 18+ |
|
|
132
|
+
| Language | CommonJS JavaScript — no TypeScript, no databases |
|
|
133
|
+
|
|
134
|
+
---
|
|
135
|
+
|
|
136
|
+
## System Requirements
|
|
137
|
+
|
|
138
|
+
| Requirement | Minimum |
|
|
139
|
+
|-------------|---------|
|
|
140
|
+
| Node.js | 18+ |
|
|
141
|
+
| Java | 21+ (for the Minecraft server) |
|
|
142
|
+
| RAM | 4 GB (bot + server); 8 GB recommended |
|
|
143
|
+
| OS | macOS, Linux (Windows via WSL) |
|
|
144
|
+
| Disk | ~1 GB for server + world data |
|
|
145
|
+
|
|
146
|
+
**For Ollama (local LLM):**
|
|
147
|
+
|
|
148
|
+
| RAM available | Recommended model | Download size |
|
|
149
|
+
|---------------|------------------|---------------|
|
|
150
|
+
| < 4 GB | `llama3.2:1b` | 1.3 GB |
|
|
151
|
+
| 4–8 GB | `llama3.2` | 2.0 GB |
|
|
152
|
+
| 16 GB | `qwen2.5-coder:14b` | 9 GB |
|
|
153
|
+
| 32 GB+ | `qwen2.5:32b` | 20 GB |
|
|
154
|
+
|
|
155
|
+
**For Featherless AI (cloud):**
|
|
156
|
+
- API key from [featherless.ai](https://featherless.ai) (format: `rc_xxx`)
|
|
157
|
+
- No GPU, no local model download required
|
|
158
|
+
|
|
159
|
+
---
|
|
160
|
+
|
|
161
|
+
## Setup
|
|
162
|
+
|
|
163
|
+
### 1. Install dependencies
|
|
164
|
+
|
|
165
|
+
```bash
|
|
166
|
+
npm install # TUI (blessed)
|
|
167
|
+
cd mc-server/bot && npm install # Bot (mineflayer, pathfinder, pvp)
|
|
168
|
+
cd ../..
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
### 2. Java 21+
|
|
172
|
+
|
|
173
|
+
```bash
|
|
174
|
+
# macOS
|
|
175
|
+
brew install openjdk@21
|
|
176
|
+
|
|
177
|
+
# Linux (Debian/Ubuntu)
|
|
178
|
+
sudo apt install openjdk-21-jdk
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
### 3. Minecraft server jar
|
|
182
|
+
|
|
183
|
+
Download the Minecraft Java Edition 1.21.4 server jar from `minecraft.net/download/server` and place it at:
|
|
184
|
+
|
|
185
|
+
```
|
|
186
|
+
mc-server/server.jar
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
### 4. Run
|
|
190
|
+
|
|
191
|
+
```bash
|
|
192
|
+
npm start
|
|
193
|
+
# or: node tui.js
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
On first launch the **onboarding wizard** opens automatically:
|
|
197
|
+
|
|
198
|
+
1. Choose **Ollama** (local) or **Featherless API** (cloud)
|
|
199
|
+
2. **Ollama path** — detects installation; if missing, shows the install command; reads your RAM and recommends a model; walks through `ollama pull <model>`
|
|
200
|
+
3. **Featherless path** — prompts for API key, then shows a model picker
|
|
201
|
+
4. **Server check** — verifies `server.jar` is present
|
|
202
|
+
|
|
203
|
+
Config is saved to `.ember-config.json` (gitignored). Press `[8]` in the TUI at any time to change backend, model, or API key.
|
|
204
|
+
|
|
205
|
+
---
|
|
206
|
+
|
|
207
|
+
## TUI
|
|
208
|
+
|
|
209
|
+
```
|
|
210
|
+
┌─ embark-ai ─ ● Server localhost:25565 ─ Bots: ● Ember (llama3.2, 142s) ────┐
|
|
211
|
+
├─ Bot Status ────────────────────┬─ Actions ──────────────────────────────────┤
|
|
212
|
+
│ Ember │ AI: Ollama (llama3.2) │
|
|
213
|
+
│ │ │
|
|
214
|
+
│ HP: ████████████░░ 18.0/20 │ [1] Start server │
|
|
215
|
+
│ Food: ████████████████ 20/20 │ [2] Stop server │
|
|
216
|
+
│ Goal: following │ [3] Spawn bot │
|
|
217
|
+
│ Pos: (12, 64, -8) │ [4] Stop bot │
|
|
218
|
+
│ Anger: none │ [5] Restart bot │
|
|
219
|
+
│ │ [6] World settings │
|
|
220
|
+
│ Inventory: │ [7] List models │
|
|
221
|
+
│ oak_log x3, stick x4 │ [8] Reconfigure │
|
|
222
|
+
│ │ [s] Switch active bot │
|
|
223
|
+
├─ Live Log ──────────────────────┤ │
|
|
224
|
+
│ 14:23:01 [info] spawn_ok │ ↑↓ scroll log │
|
|
225
|
+
│ 14:23:05 [info] task_start │ Tab switch focus │
|
|
226
|
+
│ action=follow │ r refresh │
|
|
227
|
+
│ ... │ q quit │
|
|
228
|
+
└─────────────────────────────────┴────────────────────────────────────────────┘
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
| Key | Action |
|
|
232
|
+
|-----|--------|
|
|
233
|
+
| `1` | Start Minecraft server |
|
|
234
|
+
| `2` | Stop Minecraft server |
|
|
235
|
+
| `3` | Spawn bot (choose name + model) |
|
|
236
|
+
| `4` | Stop a bot |
|
|
237
|
+
| `5` | Restart a bot (clean chain) |
|
|
238
|
+
| `6` | World settings — gamemode, difficulty, pvp, seed, view distance, … |
|
|
239
|
+
| `7` | List models (installed Ollama models or Featherless catalog) |
|
|
240
|
+
| `8` | Re-run onboarding (change AI backend, model, or API key) |
|
|
241
|
+
| `s` | Switch which bot the status panel shows |
|
|
242
|
+
| `↑↓` / `PgUp/Dn` | Scroll the live log |
|
|
243
|
+
| `Tab` | Cycle panel focus |
|
|
244
|
+
| `r` | Manual refresh |
|
|
245
|
+
| `q` / `Ctrl+C` | Quit — stops all managed processes cleanly |
|
|
246
|
+
|
|
247
|
+
Multiple bots with different names and models can run simultaneously.
|
|
248
|
+
|
|
249
|
+
---
|
|
250
|
+
|
|
251
|
+
## In-Game Commands
|
|
252
|
+
|
|
253
|
+
Speak naturally — the LLM interprets intent. Direct commands work too:
|
|
254
|
+
|
|
255
|
+
| Say | What happens |
|
|
256
|
+
|-----|-------------|
|
|
257
|
+
| `follow me` | Pathfinds to you, follows continuously |
|
|
258
|
+
| `stop` | Stops all movement and tasks |
|
|
259
|
+
| `explore` | Walks around autonomously |
|
|
260
|
+
| `build me a house` | Chains gather wood → craft planks → place 70 blocks |
|
|
261
|
+
| `get some wood` | Finds and chops the nearest log |
|
|
262
|
+
| `make planks` | Converts logs in inventory to planks |
|
|
263
|
+
| `craft a wooden_pickaxe` | Looks up recipe, crafts it (auto-makes sticks if needed) |
|
|
264
|
+
| `mine 5 iron_ore` | Mines that block type N times, auto-equips best tool |
|
|
265
|
+
| `place a crafting_table` | Places named block from inventory |
|
|
266
|
+
| `eat something` | Eats food from inventory |
|
|
267
|
+
| `kill that zombie` | Equips best weapon, engages nearest hostile mob |
|
|
268
|
+
| `pick up those items` | Walks to and collects nearby dropped items |
|
|
269
|
+
| `go to spawn` | Navigates to a saved location by name |
|
|
270
|
+
| `remember this place as base` | Saves current coordinates |
|
|
271
|
+
| `where are you` | Reports real coordinates |
|
|
272
|
+
| `what do you have` | Reports real inventory |
|
|
273
|
+
| `flee` | Runs away from the nearest hostile mob |
|
|
274
|
+
| `escape` / `you're stuck` | Pillars up or digs to surface |
|
|
275
|
+
| **(hit Ember)** | Anger +4; retaliates at threshold |
|
|
276
|
+
| **(insult Ember)** | Anger +1; warns at 3, attacks at 5 |
|
|
277
|
+
|
|
278
|
+
---
|
|
279
|
+
|
|
280
|
+
## File Structure
|
|
281
|
+
|
|
282
|
+
```
|
|
283
|
+
embark-ai/
|
|
284
|
+
├── tui.js TUI control panel — server + bot management + onboarding
|
|
285
|
+
├── botSupervisor.js Supervised bot lifecycle: backoff, storm protection
|
|
286
|
+
├── package.json
|
|
287
|
+
└── mc-server/
|
|
288
|
+
├── server.jar (gitignored — download separately)
|
|
289
|
+
├── server.properties
|
|
290
|
+
└── bot/
|
|
291
|
+
├── bot.js Main loop, action execution, agent/state/threat loops
|
|
292
|
+
├── engine.js Intent classifier, LLM validation, anger/insult detection
|
|
293
|
+
├── state.js buildGroundedState() — Mineflayer sensor data only
|
|
294
|
+
├── llm.js Featherless AI / Ollama integration (same code path)
|
|
295
|
+
├── memory.js Persistent memory — locations, events, knowledge
|
|
296
|
+
├── logger.js Structured JSONL logger → events.jsonl
|
|
297
|
+
├── env.js Environment variable loader
|
|
298
|
+
├── safeMineflayer.js Safe wrappers: safeDig, safeCraft, safeAttack, …
|
|
299
|
+
├── movementController.js Token-owned pathfinder writes
|
|
300
|
+
├── goalRegistry.js Sole mutator of state.goal
|
|
301
|
+
├── entityLiveness.js 5-state entity validity monitor
|
|
302
|
+
├── positionGuard.js physicsTick NaN position repair
|
|
303
|
+
├── recoveryEngine.js Single recovery arbiter; sole bot.quit() owner
|
|
304
|
+
├── fatalDesyncRecovery.js Desync reporter (independent 2 s poll)
|
|
305
|
+
├── healthIntegrityWatchdog.js HP integrity watchdog
|
|
306
|
+
├── damagePipeline.js Unified damage classification and reaction
|
|
307
|
+
├── tasks.js All 16 task implementations
|
|
308
|
+
├── environmentPerception.js Spatial scan — hazards, traversability
|
|
309
|
+
├── locomotionRecovery.js Physical escape sequences
|
|
310
|
+
└── package.json
|
|
311
|
+
```
|
|
312
|
+
|
|
313
|
+
---
|
|
314
|
+
|
|
315
|
+
## Design Principles
|
|
316
|
+
|
|
317
|
+
**Anti-hallucination** — `buildGroundedState()` returns only what `bot.inventory.items()`, `bot.findBlock()`, and `bot.entities` actually report. The LLM is explicitly told not to invent facts.
|
|
318
|
+
|
|
319
|
+
**LLM picks labels, code executes** — The LLM output is an action label. `executeAction()` is the only path to pathfinder, pvp, or crafting. Invalid LLM output falls back to `safeDefault(intent)`.
|
|
320
|
+
|
|
321
|
+
**Single authority per concern** — One module owns each resource: `movementController` for pathfinder, `goalRegistry` for `state.goal`, `recoveryEngine` for `bot.quit()`, `damagePipeline` for damage reactions.
|
|
322
|
+
|
|
323
|
+
**Token discipline** — Long-running async tasks mint a Symbol ownership token before starting. Every deferred callback checks `isOwner(myToken)` before acting, so a preempted task's callbacks silently no-op instead of racing with the new task.
|
|
324
|
+
|
|
325
|
+
**Survival is hardcoded** — Energy drain, hunger, water survival, threat reaction, and damage recovery live entirely outside the LLM path. The LLM can be wrong; the code enforces physical reality.
|
|
326
|
+
|
|
327
|
+
**Structured logging** — All events are written as JSONL to `events.jsonl` with correlation IDs. The TUI tails this file live at 500 ms intervals.
|
|
328
|
+
|
|
329
|
+
**Errors surface to the player** — Every task body is wrapped in try/catch; when a task fails, the handler calls `bot.chat("Error: ...")` so the reason is visible in-game.
|
|
330
|
+
|
|
331
|
+
---
|
|
332
|
+
|
|
333
|
+
## Environment Variables
|
|
334
|
+
|
|
335
|
+
Config is normally handled through TUI onboarding (saved to `.ember-config.json`). These env vars override it:
|
|
336
|
+
|
|
337
|
+
| Variable | Default | Description |
|
|
338
|
+
|----------|---------|-------------|
|
|
339
|
+
| `BOT_NAME` | `Ember` | In-game bot name |
|
|
340
|
+
| `SERVER_HOST` | `localhost` | Minecraft server host |
|
|
341
|
+
| `SERVER_PORT` | `25565` | Minecraft server port |
|
|
342
|
+
| `MC_VERSION` | `1.21.4` | Minecraft protocol version |
|
|
343
|
+
| `FEATHERLESS_API_KEY` | — | Featherless key (`rc_xxx`) or `ollama` for local mode |
|
|
344
|
+
| `FEATHERLESS_MODEL` | — | Model ID |
|
|
345
|
+
| `FEATHERLESS_URL` | Featherless endpoint | Set to `http://localhost:11434/v1/chat/completions` for Ollama |
|
|
346
|
+
|
|
347
|
+
---
|
|
348
|
+
|
|
349
|
+
## License
|
|
350
|
+
|
|
351
|
+
ISC
|
package/botSupervisor.js
ADDED
|
@@ -0,0 +1,237 @@
|
|
|
1
|
+
// botSupervisor.js — Supervised bot lifecycle manager.
|
|
2
|
+
//
|
|
3
|
+
// Wraps bot process spawning with:
|
|
4
|
+
// - Exit reason classification from exit_reason.json
|
|
5
|
+
// - Per-class base backoff with exponential growth and a 5-minute cap
|
|
6
|
+
// - Restart storm prevention: ≥5 restarts in 10 minutes → halt with alert
|
|
7
|
+
//   - Recovery chain IDs: each restart chain gets a short random ID (8 hex characters)
|
|
8
|
+
// - onLog / onRespawn callbacks for integration with cli.js and tui.js
|
|
9
|
+
//
|
|
10
|
+
// Storm prevention intentionally halts rather than continuing — if 5 restarts
|
|
11
|
+
// in 10 minutes haven't fixed it, additional restarts make the problem worse.
|
|
12
|
+
// The operator must inspect and manually restart.
|
|
13
|
+
|
|
14
|
+
'use strict'
|
|
15
|
+
|
|
16
|
+
const { spawn } = require('child_process')
|
|
17
|
+
const fs = require('fs')
|
|
18
|
+
const path = require('path')
|
|
19
|
+
const crypto = require('crypto')
|
|
20
|
+
|
|
21
|
+
// Name of the file read (and consumed) from the bot directory to classify an exit.
const EXIT_REASON_FILE = 'exit_reason.json'

// Base backoff (ms) per exit reason class.
// entity_desync: short — just reconnect after a brief pause
// kicked: moderate — give the server time to allow reconnect
// crash: longer — likely a bug; give things time to settle
// server_disconnect: long — server may still be restarting
// unknown_clean: minimal — was working fine, probably transient
// unknown_dirty: moderate — some non-zero exit; be cautious
//
// Frozen because this is a shared lookup table: nothing should adjust a
// class's base delay at runtime.
const BASE_BACKOFF_MS = Object.freeze({
  entity_desync: 3_000,
  kicked: 20_000,
  crash: 30_000,
  server_disconnect: 45_000,
  unknown_clean: 5_000,
  unknown_dirty: 30_000,
})

// Each consecutive restart doubles the previous delay, up to the cap.
const BACKOFF_MULTIPLIER = 2
const BACKOFF_CAP_MS = 5 * 60 * 1000 // 5 minutes maximum

const STORM_WINDOW_MS = 10 * 60 * 1000 // 10-minute window
const STORM_MAX = 5 // halt after this many restarts in the window
|
|
44
|
+
|
|
45
|
+
/**
 * Produce a short random identifier for a restart chain.
 *
 * @returns {string} 8 lowercase hex characters (4 random bytes).
 */
function generateChainId() {
  const randomPart = crypto.randomBytes(4)
  return randomPart.toString('hex')
}
|
|
48
|
+
|
|
49
|
+
/**
 * Read and consume the exit-reason file from a bot directory.
 *
 * The file is removed as soon as it has been read, before parsing, so that a
 * corrupt or partially written file is consumed too instead of lingering.
 *
 * @param {string} botDir - Directory expected to contain the exit-reason file.
 * @returns {object|null} The parsed JSON object, or null when the file is
 *   missing, unreadable, cannot be removed, is not valid JSON, or does not
 *   contain an object.
 */
function readExitReason(botDir) {
  const p = path.join(botDir, EXIT_REASON_FILE)

  let raw
  try {
    raw = fs.readFileSync(p, 'utf8')
    fs.unlinkSync(p) // consume so stale file doesn't mislead next exit
  } catch {
    // Missing or unreadable file, or one that cannot be consumed: report nothing.
    return null
  }

  try {
    const obj = JSON.parse(raw)
    // Only an object can carry a `reason`; treat bare scalars as "no reason".
    return obj !== null && typeof obj === 'object' ? obj : null
  } catch {
    return null
  }
}
|
|
60
|
+
|
|
61
|
+
/**
 * Map a bot exit to a reason class used to pick the restart backoff.
 *
 * @param {object|null} exitReason - Parsed exit-reason file, if one was found.
 * @param {number|null} code - Process exit code (null when killed by a signal).
 * @param {number} uptimeMs - How long the process ran. Kept for interface
 *   compatibility; a clean exit is classified the same regardless of uptime.
 * @returns {string} The reported reason when the file carries a non-empty
 *   string `reason`; otherwise 'unknown_clean' for exit code 0 and
 *   'unknown_dirty' for anything else.
 */
function classifyExit(exitReason, code, uptimeMs) {
  const reported = exitReason?.reason
  // A reason file without a usable `reason` tells us nothing; fall back to
  // the exit code rather than returning undefined as the class.
  if (typeof reported === 'string' && reported !== '') return reported
  return code === 0 ? 'unknown_clean' : 'unknown_dirty'
}
|
|
67
|
+
|
|
68
|
+
// ─────────────────────────────────────────────────────────────────────────────
|
|
69
|
+
|
|
70
|
+
module.exports = function createBotSupervisor({ botDir, logDir, onLog, onRespawn, onStorm }) {
|
|
71
|
+
// bots: name → { proc, model, logPath, startedAt, chainId, restartCount,
|
|
72
|
+
// backoffMs, restartTimestamps, manualStopping, timer }
|
|
73
|
+
const bots = new Map()
|
|
74
|
+
|
|
75
|
+
function _log(level, msg, meta = {}) {
|
|
76
|
+
const entry = { t: new Date().toISOString(), level, msg, ...meta }
|
|
77
|
+
if (onLog) onLog(entry)
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
function launch(name, model, logPath, { fresh = true, chainId = null } = {}) {
|
|
81
|
+
if (bots.has(name)) {
|
|
82
|
+
const existing = bots.get(name)
|
|
83
|
+
if (existing.proc) return // already running
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
const myChainId = chainId || generateChainId()
|
|
87
|
+
const isResume = bots.has(name)
|
|
88
|
+
const prev = bots.get(name)
|
|
89
|
+
|
|
90
|
+
const entry = {
|
|
91
|
+
proc: null,
|
|
92
|
+
model,
|
|
93
|
+
logPath,
|
|
94
|
+
startedAt: Date.now(),
|
|
95
|
+
chainId: myChainId,
|
|
96
|
+
restartCount: isResume ? prev.restartCount : 0,
|
|
97
|
+
backoffMs: isResume ? prev.backoffMs : BASE_BACKOFF_MS.unknown_clean,
|
|
98
|
+
restartTimestamps: isResume ? prev.restartTimestamps : [],
|
|
99
|
+
manualStopping: false,
|
|
100
|
+
timer: null,
|
|
101
|
+
}
|
|
102
|
+
bots.set(name, entry)
|
|
103
|
+
|
|
104
|
+
_log('info', 'supervisor_launch', {
|
|
105
|
+
name, model, chainId: myChainId,
|
|
106
|
+
restartCount: entry.restartCount,
|
|
107
|
+
backoffMs: isResume ? entry.backoffMs : 0,
|
|
108
|
+
})
|
|
109
|
+
|
|
110
|
+
const logFd = fs.openSync(logPath, fresh ? 'w' : 'a')
|
|
111
|
+
const env = {
|
|
112
|
+
...process.env,
|
|
113
|
+
BOT_NAME: name,
|
|
114
|
+
FEATHERLESS_MODEL: model,
|
|
115
|
+
RECOVERY_CHAIN_ID: myChainId,
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
const proc = spawn('node', ['bot.js'], {
|
|
119
|
+
cwd: botDir,
|
|
120
|
+
env,
|
|
121
|
+
stdio: ['ignore', logFd, logFd],
|
|
122
|
+
})
|
|
123
|
+
|
|
124
|
+
entry.proc = proc
|
|
125
|
+
|
|
126
|
+
proc.on('error', (err) => {
|
|
127
|
+
_log('error', 'supervisor_spawn_error', { name, message: err.message, chainId: myChainId })
|
|
128
|
+
entry.proc = null
|
|
129
|
+
})
|
|
130
|
+
|
|
131
|
+
proc.on('exit', (code, signal) => {
|
|
132
|
+
const uptimeMs = Date.now() - entry.startedAt
|
|
133
|
+
entry.proc = null
|
|
134
|
+
|
|
135
|
+
if (entry.manualStopping) {
|
|
136
|
+
_log('info', 'supervisor_manual_stop', { name, code, uptimeMs, chainId: myChainId })
|
|
137
|
+
bots.delete(name)
|
|
138
|
+
return
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
_onExit(name, code, signal, uptimeMs)
|
|
142
|
+
})
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
function _onExit(name, code, signal, uptimeMs) {
|
|
146
|
+
const entry = bots.get(name)
|
|
147
|
+
if (!entry) return
|
|
148
|
+
|
|
149
|
+
const exitReason = readExitReason(botDir)
|
|
150
|
+
const reasonClass = classifyExit(exitReason, code, uptimeMs)
|
|
151
|
+
const base = BASE_BACKOFF_MS[reasonClass] ?? BASE_BACKOFF_MS.unknown_dirty
|
|
152
|
+
|
|
153
|
+
// Escalate backoff: cap at BACKOFF_CAP_MS
|
|
154
|
+
const newBackoff = Math.min(entry.backoffMs * BACKOFF_MULTIPLIER, BACKOFF_CAP_MS)
|
|
155
|
+
const delayMs = reasonClass === 'unknown_clean' && uptimeMs > 60_000
|
|
156
|
+
? base
|
|
157
|
+
: Math.max(base, newBackoff)
|
|
158
|
+
|
|
159
|
+
entry.backoffMs = delayMs
|
|
160
|
+
entry.restartCount++
|
|
161
|
+
entry.restartTimestamps.push(Date.now())
|
|
162
|
+
|
|
163
|
+
// Prune timestamps outside storm window
|
|
164
|
+
const cutoff = Date.now() - STORM_WINDOW_MS
|
|
165
|
+
entry.restartTimestamps = entry.restartTimestamps.filter(t => t > cutoff)
|
|
166
|
+
|
|
167
|
+
_log('warn', 'supervisor_exit', {
|
|
168
|
+
name, code, signal, uptimeMs, reasonClass,
|
|
169
|
+
restartCount: entry.restartCount,
|
|
170
|
+
delayMs,
|
|
171
|
+
chainId: entry.chainId,
|
|
172
|
+
restartsInWindow: entry.restartTimestamps.length,
|
|
173
|
+
...(exitReason && { exitDetail: exitReason }),
|
|
174
|
+
})
|
|
175
|
+
|
|
176
|
+
// Storm check
|
|
177
|
+
if (entry.restartTimestamps.length >= STORM_MAX) {
|
|
178
|
+
_log('error', 'supervisor_restart_storm', {
|
|
179
|
+
name, restartsInWindow: entry.restartTimestamps.length,
|
|
180
|
+
windowMs: STORM_WINDOW_MS, chainId: entry.chainId,
|
|
181
|
+
})
|
|
182
|
+
if (onStorm) onStorm({ name, chainId: entry.chainId, restartCount: entry.restartCount })
|
|
183
|
+
bots.delete(name)
|
|
184
|
+
return
|
|
185
|
+
}
|
|
186
|
+
|
|
187
|
+
_log('info', 'supervisor_schedule_respawn', {
|
|
188
|
+
name, delayMs, chainId: entry.chainId, reasonClass,
|
|
189
|
+
})
|
|
190
|
+
|
|
191
|
+
entry.timer = setTimeout(() => {
|
|
192
|
+
entry.timer = null
|
|
193
|
+
const current = bots.get(name)
|
|
194
|
+
if (!current || current.manualStopping) return
|
|
195
|
+
|
|
196
|
+
_log('info', 'supervisor_respawn', { name, chainId: entry.chainId, restartCount: entry.restartCount })
|
|
197
|
+
if (onRespawn) onRespawn({ name, model: entry.model, chainId: entry.chainId, restartCount: entry.restartCount })
|
|
198
|
+
|
|
199
|
+
launch(name, entry.model, entry.logPath, { fresh: false, chainId: entry.chainId })
|
|
200
|
+
}, delayMs)
|
|
201
|
+
}
|
|
202
|
+
|
|
203
|
+
function stop(name) {
|
|
204
|
+
const entry = bots.get(name)
|
|
205
|
+
if (!entry) return false
|
|
206
|
+
entry.manualStopping = true
|
|
207
|
+
if (entry.timer) { clearTimeout(entry.timer); entry.timer = null }
|
|
208
|
+
if (entry.proc) {
|
|
209
|
+
try { entry.proc.kill('SIGTERM') } catch {}
|
|
210
|
+
} else {
|
|
211
|
+
bots.delete(name)
|
|
212
|
+
}
|
|
213
|
+
return true
|
|
214
|
+
}
|
|
215
|
+
|
|
216
|
+
function getState() {
|
|
217
|
+
const out = {}
|
|
218
|
+
for (const [name, e] of bots) {
|
|
219
|
+
out[name] = {
|
|
220
|
+
running: !!e.proc,
|
|
221
|
+
model: e.model,
|
|
222
|
+
logPath: e.logPath,
|
|
223
|
+
chainId: e.chainId,
|
|
224
|
+
restartCount: e.restartCount,
|
|
225
|
+
backoffMs: e.backoffMs,
|
|
226
|
+
restartsInWindow: e.restartTimestamps.filter(t => t > Date.now() - STORM_WINDOW_MS).length,
|
|
227
|
+
uptimeMs: e.startedAt ? Date.now() - e.startedAt : null,
|
|
228
|
+
awaitingRespawn: !!e.timer,
|
|
229
|
+
}
|
|
230
|
+
}
|
|
231
|
+
return out
|
|
232
|
+
}
|
|
233
|
+
|
|
234
|
+
function has(name) { return bots.has(name) }
|
|
235
|
+
|
|
236
|
+
return { launch, stop, getState, has }
|
|
237
|
+
}
|