@elvatis_com/openclaw-self-healing-elvatis 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/index.ts ADDED
@@ -0,0 +1,834 @@
1
+ import fs from "node:fs";
2
+ import path from "node:path";
3
+ import os from "node:os";
4
+
5
/**
 * Expand a leading `~` in a path to the current user's home directory.
 *
 * Only the bare `~` and the `~/...` forms are expanded. Every other input,
 * including the empty string and `~user/...`, is returned unchanged.
 *
 * @param p Path that may start with `~`.
 * @returns The expanded path, or `p` itself when no expansion applies.
 */
export function expandHome(p: string): string {
  if (p === "~") {
    return os.homedir();
  }
  const isHomeRelative = Boolean(p) && p.startsWith("~/");
  return isHomeRelative ? path.join(os.homedir(), p.slice(2)) : p;
}
11
+
12
/**
 * Persisted self-healing state, stored as JSON in the configured state file.
 * All timestamps in this file's code are produced by `nowSec()` (epoch seconds).
 */
export type State = {
  /** Cooldown records keyed by model id. */
  limited: Record<
    string,
    {
      lastHitAt: number;
      nextAvailableAt: number;
      reason?: string;
      lastProbeAt?: number;
    }
  >;
  /** Config backups awaiting deletion, keyed by backup file path. */
  pendingBackups?: Record<string, { createdAt: number; reason: string }>;
  /** WhatsApp channel connectivity tracking. */
  whatsapp?: {
    lastSeenConnectedAt?: number;
    lastRestartAt?: number;
    disconnectStreak?: number;
  };
  /** Cron job failure tracking. */
  cron?: {
    /** Consecutive failures keyed by job id. */
    failCounts?: Record<string, number>;
    /** Timestamp of the last issue created, keyed by job id. */
    lastIssueCreatedAt?: Record<string, number>;
  };
  /** Plugin failure tracking. */
  plugins?: {
    /** Timestamp of the last disable, keyed by plugin id. */
    lastDisableAt?: Record<string, number>;
  };
};
28
+
29
/**
 * Point-in-time health report built by `buildStatusSnapshot`, emitted as the
 * `self-heal:status` event and written to the status file.
 */
export type StatusSnapshot = {
  /** Overall model health derived from how many models are in cooldown. */
  health: "healthy" | "degraded" | "healing";
  /** Model currently selected by the fallback picker. */
  activeModel: string;
  /** One entry per configured model, in configured order. */
  models: Array<{
    id: string;
    status: "available" | "cooldown";
    cooldownReason?: string;
    cooldownRemainingSec?: number;
    nextAvailableAt?: number;
    lastProbeAt?: number;
  }>;
  /** WhatsApp channel view; timestamps are `null` when never recorded. */
  whatsapp: {
    status: "connected" | "disconnected" | "unknown";
    disconnectStreak: number;
    lastRestartAt: number | null;
    lastSeenConnectedAt: number | null;
  };
  /** Cron tracking summary. */
  cron: {
    trackedJobs: number;
    failingJobs: Array<{ id: string; consecutiveFailures: number }>;
  };
  /** Subset of the active plugin configuration. */
  config: {
    dryRun: boolean;
    probeEnabled: boolean;
    cooldownMinutes: number;
    modelOrder: string[];
  };
  /** When the snapshot was generated (epoch seconds). */
  generatedAt: number;
};
58
+
59
/**
 * Build a status report from the persisted state and the active configuration.
 *
 * @param state  Persisted self-healing state.
 * @param config Active plugin configuration.
 * @returns A snapshot stamped with the current time in epoch seconds.
 */
export function buildStatusSnapshot(state: State, config: PluginConfig): StatusSnapshot {
  const now = nowSec();

  // One entry per configured model; cooldown details only while cooling down.
  const models = config.modelOrder.map((id) => {
    const entry = state.limited[id];
    if (entry == null || !(entry.nextAvailableAt > now)) {
      return { id, status: "available" as "available" | "cooldown" };
    }
    return {
      id,
      status: "cooldown" as "available" | "cooldown",
      cooldownReason: entry.reason,
      cooldownRemainingSec: entry.nextAvailableAt - now,
      nextAvailableAt: entry.nextAvailableAt,
      lastProbeAt: entry.lastProbeAt,
    };
  });

  // The active model is whatever the fallback picker selects.
  const activeModel = pickFallback(config.modelOrder, state);

  // healthy: nothing cooling down; healing: everything is; degraded: in between.
  let coolingDown = 0;
  for (const m of models) {
    if (m.status === "cooldown") coolingDown += 1;
  }
  let health: StatusSnapshot["health"];
  if (coolingDown === 0) {
    health = "healthy";
  } else if (coolingDown < config.modelOrder.length) {
    health = "degraded";
  } else {
    health = "healing";
  }

  // WhatsApp: connected needs a recorded sighting and a streak of exactly 0.
  const wa = state.whatsapp ?? {};
  let waStatus: StatusSnapshot["whatsapp"]["status"] = "unknown";
  if (wa.lastSeenConnectedAt != null && wa.disconnectStreak === 0) {
    waStatus = "connected";
  } else if ((wa.disconnectStreak ?? 0) > 0) {
    waStatus = "disconnected";
  }

  // Cron: report only jobs with at least one consecutive failure.
  const failCounts = state.cron?.failCounts ?? {};
  const failingJobs: { id: string; consecutiveFailures: number }[] = [];
  for (const [id, consecutiveFailures] of Object.entries(failCounts)) {
    if (consecutiveFailures > 0) failingJobs.push({ id, consecutiveFailures });
  }

  return {
    health,
    activeModel,
    models,
    whatsapp: {
      status: waStatus,
      disconnectStreak: wa.disconnectStreak ?? 0,
      lastRestartAt: wa.lastRestartAt ?? null,
      lastSeenConnectedAt: wa.lastSeenConnectedAt ?? null,
    },
    cron: {
      trackedJobs: Object.keys(failCounts).length,
      failingJobs,
    },
    config: {
      dryRun: config.dryRun,
      probeEnabled: config.probeEnabled,
      cooldownMinutes: config.cooldownMinutes,
      // Copy so snapshot consumers cannot mutate the live config.
      modelOrder: [...config.modelOrder],
    },
    generatedAt: now,
  };
}
126
+
127
/** Fully resolved plugin configuration, as produced by `parseConfig`. */
export type PluginConfig = {
  // --- Model fallback ---
  /** Model ids in order of preference. */
  modelOrder: string[];
  /** Minutes a model stays in cooldown after a rate-limit/auth failure. */
  cooldownMinutes: number;

  // --- Files ---
  /** JSON file holding the persisted `State`. */
  stateFile: string;
  /** JSON file the status snapshot is written to. */
  statusFile: string;
  /** Sessions JSON file whose per-session `model` may be patched. */
  sessionsFile: string;
  /** Gateway config file that is validated and backed up. */
  configFile: string;
  /** Directory receiving config backups. */
  configBackupsDir: string;

  // --- Auto-fix switches and thresholds ---
  /** Rewrite a session's pinned model to the fallback after a failure. */
  patchPins: boolean;
  /** Disable cron jobs that keep failing. */
  disableFailingCrons: boolean;
  /** Enable the failing-plugin check. */
  disableFailingPlugins: boolean;
  /** Restart the gateway when WhatsApp stays disconnected. */
  whatsappRestartEnabled: boolean;
  /** Consecutive disconnected polls required before a restart. */
  whatsappDisconnectThreshold: number;
  /** Minimum seconds between two gateway restarts. */
  whatsappMinRestartIntervalSec: number;
  /** Consecutive cron failures required before acting. */
  cronFailThreshold: number;
  /** Minimum seconds between two issues for the same cron job. */
  issueCooldownSec: number;
  /** `owner/name` slug of the repository issues are filed against. */
  issueRepo: string;
  /** Cooldown for plugin disabling, in seconds. */
  pluginDisableCooldownSec: number;

  // --- Recovery probing ---
  /** Probe models in cooldown to detect early recovery. */
  probeEnabled: boolean;
  /** Minimum seconds between two probes of the same model. */
  probeIntervalSec: number;

  /** Log intended actions instead of performing them. */
  dryRun: boolean;
};
149
+
150
/**
 * Wrap a value in single quotes for use as one shell word.
 * Embedded single quotes are emitted as `'"'"'` (close, quoted quote, reopen).
 */
function shellQuote(value: string): string {
  const escaped = String(value).split("'").join(`'"'"'`);
  return "'" + escaped + "'";
}
153
+
154
/** Shape of an `owner/name` slug: two segments of letters, digits, `_`, `.` or `-`. */
const ISSUE_REPO_SLUG_RE = /^[A-Za-z0-9_.-]+\/[A-Za-z0-9_.-]+$/;

/**
 * Check whether a string is an `owner/name` repository slug.
 * Surrounding whitespace is ignored.
 */
export function isValidIssueRepoSlug(value: string): boolean {
  const candidate = value.trim();
  return ISSUE_REPO_SLUG_RE.test(candidate);
}
159
+
160
/**
 * Choose the repository slug that issues are filed against.
 *
 * Precedence: the config value, then the environment value, then the built-in
 * default. A candidate is used only if it is a string that, once trimmed, is
 * a valid `owner/name` slug.
 *
 * @returns The first acceptable trimmed slug, or the default repository.
 */
export function resolveIssueRepo(configValue: unknown, envValue: unknown): string {
  const defaultRepo = "elvatis/openclaw-self-healing-homeofe";
  const firstValid = [configValue, envValue]
    .filter((value): value is string => typeof value === "string")
    .map((value) => value.trim())
    .find((slug) => slug !== "" && isValidIssueRepoSlug(slug));
  return firstValid ?? defaultRepo;
}
172
+
173
/**
 * Build the `gh issue create` command line for a new issue.
 *
 * Every argument is single-quoted with `shellQuote`. Labels are trimmed,
 * empty ones are dropped, and the rest are passed as one comma-separated
 * `--label` value. With no labels the flag is omitted.
 *
 * @throws Error when `args.repo` (trimmed) is not a valid `owner/name` slug.
 */
export function buildGhIssueCreateCommand(args: {
  repo: string;
  title: string;
  body: string;
  labels?: string[];
}): string {
  const repo = args.repo.trim();
  if (!isValidIssueRepoSlug(repo)) {
    throw new Error(`Invalid issue repository slug: ${args.repo}`);
  }

  const cleanedLabels: string[] = [];
  for (const rawLabel of args.labels ?? []) {
    const label = rawLabel.trim();
    if (label) cleanedLabels.push(label);
  }

  let command =
    "gh issue create" +
    ` -R ${shellQuote(repo)}` +
    ` --title ${shellQuote(args.title)}` +
    ` --body ${shellQuote(args.body)}`;

  if (cleanedLabels.length > 0) {
    command += ` --label ${shellQuote(cleanedLabels.join(","))}`;
  }

  return command;
}
198
+
199
/** Model preference order used when the configuration supplies none. */
const DEFAULT_MODEL_ORDER: readonly string[] = [
  "anthropic/claude-opus-4-6",
  "openai-codex/gpt-5.2",
  "google-gemini-cli/gemini-2.5-flash",
];
204
+
205
/**
 * Turn the raw plugin configuration into a fully populated `PluginConfig`.
 *
 * Missing values fall back to defaults, and file paths have a leading `~`
 * expanded. Flags that default to on use `!== false`; flags that default to
 * off use `=== true`, so only an explicit boolean flips them.
 *
 * Malformed input is handled as follows:
 * - `modelOrder` is used only when it is an array. Entries that are not
 *   non-empty strings are dropped. If nothing usable remains, the default
 *   order applies. Previously a plain string was spread into characters.
 * - Path settings that are not strings fall back to their defaults instead
 *   of crashing inside `expandHome`.
 *
 * @param raw Raw plugin configuration; may be `null` or `undefined`.
 */
export function parseConfig(raw: any): PluginConfig {
  const cfg = raw ?? {};
  const autoFix = cfg.autoFix ?? {};
  const issueRepo = resolveIssueRepo(autoFix.issueRepo, process.env.GITHUB_REPOSITORY);

  const requestedOrder: string[] = Array.isArray(cfg.modelOrder)
    ? cfg.modelOrder.filter((m: unknown): m is string => typeof m === "string" && m.trim() !== "")
    : [];

  const resolvePath = (value: unknown, fallback: string): string =>
    expandHome(typeof value === "string" ? value : fallback);

  return {
    modelOrder: requestedOrder.length > 0 ? requestedOrder : [...DEFAULT_MODEL_ORDER],
    cooldownMinutes: cfg.cooldownMinutes ?? 300,
    stateFile: resolvePath(cfg.stateFile, "~/.openclaw/workspace/memory/self-heal-state.json"),
    statusFile: resolvePath(cfg.statusFile, "~/.openclaw/workspace/memory/self-heal-status.json"),
    sessionsFile: resolvePath(cfg.sessionsFile, "~/.openclaw/agents/main/sessions/sessions.json"),
    configFile: resolvePath(cfg.configFile, "~/.openclaw/openclaw.json"),
    configBackupsDir: resolvePath(cfg.configBackupsDir, "~/.openclaw/backups/openclaw.json"),
    patchPins: autoFix.patchSessionPins !== false,
    disableFailingCrons: autoFix.disableFailingCrons === true,
    disableFailingPlugins: autoFix.disableFailingPlugins === true,
    whatsappRestartEnabled: autoFix.restartWhatsappOnDisconnect !== false,
    whatsappDisconnectThreshold: autoFix.whatsappDisconnectThreshold ?? 2,
    whatsappMinRestartIntervalSec: autoFix.whatsappMinRestartIntervalSec ?? 300,
    cronFailThreshold: autoFix.cronFailThreshold ?? 3,
    issueCooldownSec: autoFix.issueCooldownSec ?? 6 * 3600,
    issueRepo,
    pluginDisableCooldownSec: autoFix.pluginDisableCooldownSec ?? 3600,
    probeEnabled: cfg.probeEnabled !== false,
    probeIntervalSec: cfg.probeIntervalSec ?? 300,
    dryRun: cfg.dryRun === true,
  };
}
232
+
233
/** Outcome of `validateConfig`. */
export type ConfigValidationResult = {
  /** `true` exactly when `errors` is empty. */
  valid: boolean;
  /** One human-readable message per failed check. */
  errors: string[];
};
237
+
238
/**
 * Check a parsed configuration for values the plugin cannot run with.
 *
 * Numeric settings must be finite numbers. `NaN` satisfies neither `< min`
 * nor `> max`, so it slipped through the earlier range comparisons, and
 * `Infinity` passed the lower-bound-only checks. `modelOrder` must be a
 * non-empty array of non-empty strings.
 *
 * Side effect: creates the state file's directory if it does not exist, as
 * part of the best-effort writability check.
 *
 * @returns `valid` plus one message per failed check.
 */
export function validateConfig(config: PluginConfig): ConfigValidationResult {
  const errors: string[] = [];
  const isFiniteNumber = (v: unknown): v is number => typeof v === "number" && Number.isFinite(v);

  if (!Array.isArray(config.modelOrder) || config.modelOrder.length === 0) {
    errors.push("modelOrder must have at least one entry");
  } else if (config.modelOrder.some((m) => typeof m !== "string" || m.trim() === "")) {
    errors.push("modelOrder entries must be non-empty strings");
  }

  if (!isFiniteNumber(config.cooldownMinutes) || config.cooldownMinutes < 1 || config.cooldownMinutes > 10080) {
    errors.push("cooldownMinutes must be between 1 and 10080 (1 week)");
  }

  if (!isFiniteNumber(config.probeIntervalSec) || config.probeIntervalSec < 60) {
    errors.push("probeIntervalSec must be >= 60");
  }

  if (!isFiniteNumber(config.whatsappMinRestartIntervalSec) || config.whatsappMinRestartIntervalSec < 60) {
    errors.push("whatsappMinRestartIntervalSec must be >= 60");
  }

  // Best-effort: check that the state file directory is writable.
  const stateDir = path.dirname(config.stateFile);
  try {
    fs.mkdirSync(stateDir, { recursive: true });
    fs.accessSync(stateDir, fs.constants.W_OK);
  } catch {
    errors.push(`stateFile directory is not writable: ${stateDir}`);
  }

  return { valid: errors.length === 0, errors };
}
268
+
269
/**
 * List the configuration keys whose values differ between `a` and `b`.
 *
 * Only the keys of `a` are inspected. When both values are arrays they are
 * compared by their JSON serialization; otherwise by strict inequality.
 *
 * @returns Changed keys, in the key order of `a`.
 */
export function configDiff(a: PluginConfig, b: PluginConfig): string[] {
  const keys = Object.keys(a) as (keyof PluginConfig)[];
  return keys.filter((key) => {
    const before = a[key];
    const after = b[key];
    const bothArrays = Array.isArray(before) && Array.isArray(after);
    return bothArrays ? JSON.stringify(before) !== JSON.stringify(after) : before !== after;
  });
}
282
+
283
/** Current wall-clock time as whole seconds since the Unix epoch. */
export function nowSec(): number {
  const millis = Date.now();
  return Math.floor(millis / 1000);
}
286
+
287
/**
 * Read the persisted state, filling in every section other code relies on.
 *
 * Best-effort by design: a missing, unreadable or malformed file yields a
 * fresh empty state rather than an error.
 *
 * The JSON root and each section must be plain objects. Anything else (for
 * example an array root, or a string where a section should be) is replaced
 * by an empty section. Previously such values were returned as the state,
 * and their fields were silently lost on the next save.
 *
 * @param p Path of the state file.
 */
export function loadState(p: string): State {
  const isPlainObject = (v: unknown): v is Record<string, any> =>
    typeof v === "object" && v !== null && !Array.isArray(v);

  let parsed: unknown;
  try {
    parsed = JSON.parse(fs.readFileSync(p, "utf-8"));
  } catch {
    parsed = undefined;
  }

  // Keep unknown top-level keys from a valid file; only normalize our sections.
  const d: Record<string, any> = isPlainObject(parsed) ? parsed : {};
  if (!isPlainObject(d.limited)) d.limited = {};
  if (!isPlainObject(d.pendingBackups)) d.pendingBackups = {};
  if (!isPlainObject(d.whatsapp)) d.whatsapp = {};
  if (!isPlainObject(d.cron)) d.cron = {};
  if (!isPlainObject(d.cron.failCounts)) d.cron.failCounts = {};
  if (!isPlainObject(d.cron.lastIssueCreatedAt)) d.cron.lastIssueCreatedAt = {};
  if (!isPlainObject(d.plugins)) d.plugins = {};
  if (!isPlainObject(d.plugins.lastDisableAt)) d.plugins.lastDisableAt = {};
  return d as State;
}
304
+
305
/**
 * Persist the state as pretty-printed JSON, creating the directory if needed.
 *
 * The file is written to `<p>.tmp` and then renamed into place, so a crash
 * mid-write never leaves a truncated state file. `loadState` would silently
 * treat a truncated file as empty state.
 *
 * @param p Path of the state file.
 * @param s State to persist.
 * @throws Whatever the underlying `fs` call throws; the staging file is
 *         removed first on a best-effort basis.
 */
export function saveState(p: string, s: State): void {
  fs.mkdirSync(path.dirname(p), { recursive: true });
  const tmp = `${p}.tmp`;
  try {
    fs.writeFileSync(tmp, JSON.stringify(s, null, 2));
    fs.renameSync(tmp, p);
  } catch (e) {
    try {
      fs.unlinkSync(tmp);
    } catch {
      // Nothing was staged, or it is already gone.
    }
    throw e;
  }
}
309
+
310
/**
 * Write a status snapshot as pretty-printed JSON.
 *
 * The content is staged in `<filePath>.tmp` and then renamed over the target.
 * The parent directory is created when missing.
 */
export function writeStatusFile(filePath: string, snapshot: StatusSnapshot): void {
  fs.mkdirSync(path.dirname(filePath), { recursive: true });
  const staging = `${filePath}.tmp`;
  const payload = JSON.stringify(snapshot, null, 2);
  fs.writeFileSync(staging, payload);
  fs.renameSync(staging, filePath);
}
317
+
318
/**
 * Heuristically decide whether an error message describes a rate limit or
 * exhausted quota.
 *
 * Matching is case-insensitive on "rate limit", "quota",
 * "resource_exhausted", or the status code 429. The code must not be part
 * of a longer digit run, so "14290 tokens" no longer counts as a rate limit.
 *
 * @param err Error text; `undefined` or empty yields `false`.
 */
export function isRateLimitLike(err?: string): boolean {
  if (!err) return false;
  const s = err.toLowerCase();
  return (
    s.includes("rate limit") ||
    s.includes("quota") ||
    s.includes("resource_exhausted") ||
    /(?<!\d)429(?!\d)/.test(s)
  );
}
323
+
324
/**
 * Heuristically decide whether an error message describes an authentication
 * or permission-scope failure (case-insensitive substring match).
 *
 * @param err Error text; `undefined` or empty yields `false`.
 */
export function isAuthScopeLike(err?: string): boolean {
  if (!err) return false;
  const lowered = err.toLowerCase();
  const markers = [
    "http 401",
    "insufficient permissions",
    "missing scopes",
    "api.responses.write",
    "unauthorized",
  ];
  return markers.some((marker) => lowered.includes(marker));
}
335
+
336
/**
 * Pick the model to use right now.
 *
 * Returns the first model in `modelOrder` that has no cooldown record or
 * whose cooldown has expired. If every model is still cooling down, the last
 * entry of `modelOrder` is returned.
 */
export function pickFallback(modelOrder: string[], state: State): string {
  const now = nowSec();
  const firstUsable = modelOrder.findIndex((candidate) => {
    const entry = state.limited[candidate];
    return !entry || entry.nextAvailableAt <= now;
  });
  const chosen = firstUsable >= 0 ? firstUsable : modelOrder.length - 1;
  return modelOrder[chosen];
}
345
+
346
/**
 * Rewrite the `model` of one session in the sessions JSON file.
 *
 * The session is looked up as an own property of the parsed file, and it
 * must be an object. A plain `data[sessionKey]` lookup also resolves
 * inherited keys such as `constructor`. That would write `.model` onto a
 * shared built-in and report success without changing the file.
 *
 * @param sessionsFile Path of the sessions JSON file.
 * @param sessionKey   Key of the session to patch.
 * @param model        Model id to pin.
 * @param logger       Optional logger; `warn` and `error` are used if present.
 * @returns `true` when the file was rewritten. `false` when the session does
 *          not exist or any read/parse/write step failed (failures are logged).
 */
export function patchSessionModel(sessionsFile: string, sessionKey: string, model: string, logger: any): boolean {
  try {
    const raw = fs.readFileSync(sessionsFile, "utf-8");
    const data = JSON.parse(raw);

    const hasSession =
      data !== null &&
      typeof data === "object" &&
      Object.prototype.hasOwnProperty.call(data, sessionKey);
    const session = hasSession ? data[sessionKey] : undefined;
    if (session === null || typeof session !== "object") return false;

    const prev = session.model;
    session.model = model;
    fs.writeFileSync(sessionsFile, JSON.stringify(data, null, 0));
    logger?.warn?.(`[self-heal] patched session model: ${sessionKey} ${prev} -> ${model}`);
    return true;
  } catch (e: any) {
    logger?.error?.(`[self-heal] failed to patch session model: ${e?.message ?? String(e)}`);
    return false;
  }
}
361
+
362
/**
 * Run a shell command through the host's `runCommandWithTimeout`, wrapped in
 * `bash -lc`.
 *
 * Never rejects: any failure to launch or read the result is reported as
 * `ok: false` with the error text in `stderr`.
 *
 * @param api       Host API exposing `runtime.system.runCommandWithTimeout`.
 * @param cmd       Command line handed to `bash -lc`.
 * @param timeoutMs Timeout passed through to the host (default 15000).
 * @returns `ok` is `true` only for exit code 0. `code` is present only when
 *          the host returned a result.
 */
async function runCmd(api: any, cmd: string, timeoutMs = 15000): Promise<{ ok: boolean; stdout: string; stderr: string; code?: number }> {
  try {
    const request = { command: ["bash", "-lc", cmd], timeoutMs };
    const res = await api.runtime.system.runCommandWithTimeout(request);
    const exitCode = res.exitCode;
    const stdout = String(res.stdout ?? "");
    const stderr = String(res.stderr ?? "");
    return { ok: exitCode === 0, stdout, stderr, code: exitCode };
  } catch (e: any) {
    return { ok: false, stdout: "", stderr: e?.message ?? String(e) };
  }
}
378
+
379
/**
 * Parse JSON without throwing.
 *
 * @returns The parsed value cast to `T` (not validated), or `undefined` when
 *          the text is not valid JSON.
 */
export function safeJsonParse<T>(s: string): T | undefined {
  let parsed: T | undefined;
  try {
    parsed = JSON.parse(s) as T;
  } catch {
    parsed = undefined;
  }
  return parsed;
}
386
+
387
/**
 * Plugin entry point: wires the self-healing behaviour into the host `api`.
 *
 * Does nothing when the plugin config has `enabled: false` or fails
 * validation. Otherwise it:
 * - on `agent_end` failures that look like rate-limit/auth errors, puts the
 *   session's pinned model (or the preferred model) into cooldown and, if
 *   enabled, patches the session to the fallback model;
 * - on `message_sent` content that looks like such an error, puts the
 *   preferred model into cooldown and patches the session likewise;
 * - registers the `self-heal-monitor` service, which every 60 s reloads the
 *   config, checks WhatsApp connectivity, failing crons and model recovery,
 *   then saves state and publishes a status snapshot.
 *
 * Changes from the earlier revision:
 * - ids interpolated into shell commands go through `shellQuote`;
 * - a tick is skipped while the previous one is still running;
 * - before the end-of-tick save, `pendingBackups` and cooldowns recorded on
 *   disk during the tick are merged back in, so they are not overwritten by
 *   the tick's older in-memory copy;
 * - hot-reloaded config must pass `validateConfig` before it replaces the
 *   active one;
 * - a non-string `event.error` is converted to text instead of throwing;
 * - cron jobs without an id are skipped;
 * - the plugin check no longer spawns a command whose output was unused.
 *
 * @param api Host plugin API. Uses `pluginConfig`, `logger`, `on`, `emit`,
 *            `registerService` and `runtime.system.runCommandWithTimeout`.
 */
export default function register(api: any) {
  const raw = (api.pluginConfig ?? {}) as any;
  if (raw.enabled === false) return;

  let config = parseConfig(raw);

  // Validate configuration - fail fast on invalid config
  const validation = validateConfig(config);
  if (!validation.valid) {
    for (const err of validation.errors) {
      api.logger?.error?.(`[self-heal] config validation failed: ${err}`);
    }
    api.logger?.error?.(`[self-heal] plugin not started due to ${validation.errors.length} config error(s)`);
    return;
  }

  api.logger?.info?.(`[self-heal] enabled.${config.dryRun ? " DRY-RUN MODE." : ""} order=${config.modelOrder.join(" -> ")}`);

  // Monitor bookkeeping shared by the service callbacks below.
  let monitorTimer: ReturnType<typeof setInterval> | undefined;
  let tickInFlight = false;
  // Serialized form of the last rejected reload, so it is only logged once.
  let lastRejectedReload: string | undefined;

  // If the gateway booted and config is valid, remove any pending backups from previous runs.
  if (!config.dryRun) {
    cleanupPendingBackups("startup").catch(() => undefined);
  }

  /** Best-effort text for anything caught in a `catch`. */
  function describeError(e: unknown): string {
    const message = (e as { message?: unknown } | null | undefined)?.message;
    return String(message ?? e);
  }

  /** Check that the gateway config file exists and parses as JSON. */
  function isConfigValid(): { ok: boolean; error?: string } {
    try {
      JSON.parse(fs.readFileSync(config.configFile, "utf-8"));
      return { ok: true };
    } catch (e: unknown) {
      return { ok: false, error: describeError(e) };
    }
  }

  /**
   * Copy the gateway config into the backups directory and record the copy as
   * pending cleanup. Returns the backup path, or `undefined` on failure.
   */
  function backupConfig(reason: string): string | undefined {
    try {
      fs.mkdirSync(config.configBackupsDir, { recursive: true });
      const ts = new Date().toISOString().replace(/[:.]/g, "-");
      const out = path.join(config.configBackupsDir, `openclaw.json.${ts}.bak`);
      fs.copyFileSync(config.configFile, out);

      // Mark as pending so we can delete it after we have evidence the gateway still boots.
      const st = loadState(config.stateFile);
      st.pendingBackups = st.pendingBackups || {};
      st.pendingBackups[out] = { createdAt: nowSec(), reason };
      saveState(config.stateFile, st);

      api.logger?.info?.(`[self-heal] backed up openclaw.json (${reason}) -> ${out} (pending cleanup)`);
      return out;
    } catch (e: unknown) {
      api.logger?.warn?.(`[self-heal] failed to backup openclaw.json: ${describeError(e)}`);
      return undefined;
    }
  }

  /**
   * Delete pending backups, but only when the config still parses and the
   * gateway answers a status call.
   */
  async function cleanupPendingBackups(where: string) {
    const v = isConfigValid();
    if (!v.ok) {
      api.logger?.warn?.(`[self-heal] not cleaning backups (${where}): openclaw.json invalid: ${v.error}`);
      return;
    }

    // Best-effort: ensure gateway responds to a status call.
    const gw = await runCmd(api, "openclaw gateway status", 15000);
    if (!gw.ok) {
      api.logger?.warn?.(`[self-heal] not cleaning backups (${where}): gateway status check failed`);
      return;
    }

    const st = loadState(config.stateFile);
    const pending = st.pendingBackups || {};
    const paths = Object.keys(pending);
    if (paths.length === 0) return;

    let deleted = 0;
    for (const p of paths) {
      try {
        if (fs.existsSync(p)) {
          fs.unlinkSync(p);
          deleted++;
        }
      } catch {
        // keep it in pending if we couldn't delete
        continue;
      }
      delete pending[p];
    }

    st.pendingBackups = pending;
    saveState(config.stateFile, st);
    api.logger?.info?.(`[self-heal] cleaned ${deleted} pending openclaw.json backups (${where})`);
  }

  /**
   * Re-read `api.pluginConfig` and swap it in when it changed and is valid.
   * Returns `true` only when the active config was replaced.
   */
  function reloadConfig(): boolean {
    try {
      const newRaw = (api.pluginConfig ?? {}) as any;
      if (newRaw.enabled === false) {
        api.logger?.warn?.("[self-heal] config reload: plugin disabled in new config, ignoring");
        return false;
      }
      const newConfig = parseConfig(newRaw);
      const changes = configDiff(config, newConfig);
      if (changes.length === 0) return false;

      // Apply the same validation as at startup before swapping configs.
      const check = validateConfig(newConfig);
      if (!check.valid) {
        const fingerprint = JSON.stringify(newConfig);
        if (fingerprint !== lastRejectedReload) {
          lastRejectedReload = fingerprint;
          api.logger?.warn?.(`[self-heal] config reload rejected, keeping current: ${check.errors.join("; ")}`);
        }
        return false;
      }
      lastRejectedReload = undefined;

      api.logger?.info?.(`[self-heal] config reloaded: changed ${changes.join(", ")}`);
      config = newConfig;
      return true;
    } catch (e: unknown) {
      api.logger?.warn?.(`[self-heal] config reload failed, keeping current: ${describeError(e)}`);
      return false;
    }
  }

  // Heal after an LLM failure.
  api.on("agent_end", (event: any, ctx: any) => {
    if (event?.success !== false) return;

    // The error may arrive as a string or as an Error-like object.
    const rawErr = event?.error;
    const err: string | undefined =
      rawErr == null ? undefined : typeof rawErr === "string" ? rawErr : describeError(rawErr);
    const rate = isRateLimitLike(err);
    const auth = isAuthScopeLike(err);
    if (!rate && !auth) return;

    const state = loadState(config.stateFile);
    const hitAt = nowSec();
    // Auth/scope failures get 12 extra hours (expressed in minutes).
    const extra = auth ? 12 * 60 : 0;
    const nextAvail = hitAt + (config.cooldownMinutes + extra) * 60;

    // Best effort: mark the pinned model as limited if we can read it.
    let pinnedModel: string | undefined;
    try {
      const data = JSON.parse(fs.readFileSync(config.sessionsFile, "utf-8"));
      pinnedModel = ctx?.sessionKey ? data?.[ctx.sessionKey]?.model : undefined;
    } catch {
      pinnedModel = undefined;
    }

    const key = pinnedModel || config.modelOrder[0];
    state.limited[key] = { lastHitAt: hitAt, nextAvailableAt: nextAvail, reason: err?.slice(0, 160) };
    saveState(config.stateFile, state);

    api.emit?.("self-heal:model-cooldown", {
      model: key,
      reason: err?.slice(0, 160),
      cooldownSec: nextAvail - hitAt,
      nextAvailableAt: nextAvail,
      trigger: "agent_end",
      dryRun: config.dryRun,
    });

    const fallback = pickFallback(config.modelOrder, state);

    if (config.patchPins && ctx?.sessionKey && fallback && fallback !== pinnedModel) {
      if (config.dryRun) {
        api.logger?.info?.(`[self-heal] [dry-run] would patch session ${ctx.sessionKey} model: ${pinnedModel} -> ${fallback}`);
      } else {
        patchSessionModel(config.sessionsFile, ctx.sessionKey, fallback, api.logger);
      }
      api.emit?.("self-heal:session-patched", {
        sessionKey: ctx.sessionKey,
        oldModel: pinnedModel ?? key,
        newModel: fallback,
        trigger: "agent_end",
        dryRun: config.dryRun,
      });
    }
  });

  // If the system ever emits a raw rate-limit message, self-heal future turns.
  api.on("message_sent", (event: any, ctx: any) => {
    const content = String(event?.content ?? "");
    if (!content) return;
    if (!isRateLimitLike(content) && !isAuthScopeLike(content)) return;

    const state = loadState(config.stateFile);
    const hitAt = nowSec();
    const nextAvail = hitAt + config.cooldownMinutes * 60;
    state.limited[config.modelOrder[0]] = {
      lastHitAt: hitAt,
      nextAvailableAt: nextAvail,
      reason: "outbound error observed",
    };
    saveState(config.stateFile, state);

    api.emit?.("self-heal:model-cooldown", {
      model: config.modelOrder[0],
      reason: "outbound error observed",
      cooldownSec: config.cooldownMinutes * 60,
      nextAvailableAt: nextAvail,
      trigger: "message_sent",
      dryRun: config.dryRun,
    });

    const fallback = pickFallback(config.modelOrder, state);
    if (config.patchPins && ctx?.sessionKey) {
      if (config.dryRun) {
        api.logger?.info?.(`[self-heal] [dry-run] would patch session ${ctx.sessionKey} model -> ${fallback}`);
      } else {
        patchSessionModel(config.sessionsFile, ctx.sessionKey, fallback, api.logger);
      }
      api.emit?.("self-heal:session-patched", {
        sessionKey: ctx.sessionKey,
        oldModel: config.modelOrder[0],
        newModel: fallback,
        trigger: "message_sent",
        dryRun: config.dryRun,
      });
    }
  });

  // Background monitor: WhatsApp disconnects, failing crons, failing plugins.
  api.registerService({
    id: "self-heal-monitor",
    start: async () => {
      const tick = async () => {
        // Hot-reload: re-read api.pluginConfig to pick up changes
        reloadConfig();

        const tickStartedAt = nowSec();
        const state = loadState(config.stateFile);

        // --- WhatsApp disconnect self-heal ---
        if (config.whatsappRestartEnabled) {
          const st = await runCmd(api, "openclaw channels status --json", 15000);
          if (st.ok) {
            const parsed = safeJsonParse<any>(st.stdout);
            const wa = parsed?.channels?.whatsapp;
            const connected = wa?.status === "connected" || wa?.connected === true;
            const waState = state.whatsapp ?? (state.whatsapp = {});

            if (connected) {
              waState.lastSeenConnectedAt = nowSec();
              waState.disconnectStreak = 0;
            } else {
              const streak = (waState.disconnectStreak ?? 0) + 1;
              waState.disconnectStreak = streak;

              const since = nowSec() - (waState.lastRestartAt ?? 0);
              const shouldRestart =
                streak >= config.whatsappDisconnectThreshold &&
                since >= config.whatsappMinRestartIntervalSec;

              if (shouldRestart) {
                if (config.dryRun) {
                  api.logger?.info?.(
                    `[self-heal] [dry-run] would restart gateway (WhatsApp disconnect streak=${streak})`
                  );
                  waState.lastRestartAt = nowSec();
                  waState.disconnectStreak = 0;
                } else {
                  api.logger?.warn?.(
                    `[self-heal] WhatsApp appears disconnected (streak=${streak}). Restarting gateway.`
                  );
                  // Guardrail: never restart if openclaw.json is invalid
                  const v = isConfigValid();
                  if (!v.ok) {
                    api.logger?.error?.(`[self-heal] NOT restarting gateway: openclaw.json invalid: ${v.error}`);
                  } else {
                    backupConfig("pre-gateway-restart");
                    await runCmd(api, "openclaw gateway restart", 60000);
                    // If we are still alive after restart, attempt cleanup.
                    await cleanupPendingBackups("post-gateway-restart");
                    waState.lastRestartAt = nowSec();
                    waState.disconnectStreak = 0;
                  }
                }
                api.emit?.("self-heal:whatsapp-restart", {
                  disconnectStreak: streak,
                  dryRun: config.dryRun,
                });
              }
            }
          }
        }

        // --- Cron failure self-heal ---
        if (config.disableFailingCrons) {
          const res = await runCmd(api, "openclaw cron list --json", 15000);
          if (res.ok) {
            const parsed = safeJsonParse<any>(res.stdout);
            const jobs: any[] = Array.isArray(parsed?.jobs) ? parsed.jobs : [];
            const cron = state.cron ?? (state.cron = {});
            const failCounts = cron.failCounts ?? (cron.failCounts = {});
            const lastIssueCreatedAt = cron.lastIssueCreatedAt ?? (cron.lastIssueCreatedAt = {});

            for (const job of jobs) {
              const id = job?.id;
              // Without an id the job can be neither tracked nor addressed.
              if (id == null || id === "") continue;
              const name = job.name;
              const lastStatus = job?.state?.lastStatus;
              const lastError = String(job?.state?.lastError ?? "");

              const isFail = lastStatus === "error";
              const prev = failCounts[id] ?? 0;
              failCounts[id] = isFail ? prev + 1 : 0;

              if (isFail && failCounts[id] >= config.cronFailThreshold) {
                const failCount = failCounts[id];
                if (config.dryRun) {
                  api.logger?.info?.(
                    `[self-heal] [dry-run] would disable cron ${name} (${id}), failures=${failCount}`
                  );
                  const lastIssueAt = lastIssueCreatedAt[id] ?? 0;
                  if (nowSec() - lastIssueAt >= config.issueCooldownSec) {
                    api.logger?.info?.(
                      `[self-heal] [dry-run] would create GitHub issue for cron ${name} (${id})`
                    );
                    lastIssueCreatedAt[id] = nowSec();
                  }
                } else {
                  // Guardrail: do not touch crons if config is invalid
                  const v = isConfigValid();
                  if (!v.ok) {
                    api.logger?.error?.(`[self-heal] NOT disabling cron: openclaw.json invalid: ${v.error}`);
                  } else {
                    // Disable the cron. The id comes from command output, so quote it.
                    api.logger?.warn?.(`[self-heal] Disabling failing cron ${name} (${id}).`);
                    backupConfig("pre-cron-disable");
                    await runCmd(api, `openclaw cron edit ${shellQuote(String(id))} --disable`, 15000);
                    await cleanupPendingBackups("post-cron-disable");
                  }

                  // Create issue, but rate limit issue creation
                  const lastIssueAt = lastIssueCreatedAt[id] ?? 0;
                  if (nowSec() - lastIssueAt >= config.issueCooldownSec) {
                    const body = [
                      `Cron job failed repeatedly and was disabled by openclaw-self-healing.`,
                      ``,
                      `Name: ${name}`,
                      `ID: ${id}`,
                      `Consecutive failures: ${failCount}`,
                      `Last error:`,
                      "```",
                      lastError.slice(0, 1200),
                      "```",
                    ].join("\n");

                    // Issue goes to configured repo (or default)
                    const issueCommand = buildGhIssueCreateCommand({
                      repo: config.issueRepo,
                      title: `Cron disabled: ${name}`,
                      body,
                      labels: ["security"],
                    });
                    await runCmd(api, issueCommand, 20000);
                    lastIssueCreatedAt[id] = nowSec();
                  }
                }

                api.emit?.("self-heal:cron-disabled", {
                  cronId: id,
                  cronName: name,
                  consecutiveFailures: failCount,
                  lastError: lastError.slice(0, 160),
                  dryRun: config.dryRun,
                });

                failCounts[id] = 0;
              }
            }
          }
        }

        // --- Plugin error rollback (disable plugin) ---
        if (config.disableFailingPlugins) {
          // Plain `openclaw plugins list` output cannot be parsed reliably,
          // so nothing is disabled yet.
          // TODO: when openclaw provides plugins list --json, parse and disable any status=error.
        }

        // --- Active model recovery probing ---
        if (config.probeEnabled) {
          const t = nowSec();
          for (const model of Object.keys(state.limited)) {
            const info = state.limited[model];
            // Only probe models still in cooldown
            if (info.nextAvailableAt <= t) continue;

            // Respect probe interval
            const lastProbe = info.lastProbeAt ?? info.lastHitAt;
            if (t - lastProbe < config.probeIntervalSec) continue;

            if (config.dryRun) {
              api.logger?.info?.(`[self-heal] [dry-run] would probe model ${model}`);
              continue;
            }

            // Probe the model. Model ids are quoted before reaching the shell.
            const res = await runCmd(api, `openclaw model probe ${shellQuote(model)}`, 15000);
            info.lastProbeAt = t;
            if (!res.ok) continue;

            api.logger?.info?.(
              `[self-heal] model ${model} recovered early via probe, removing from cooldown`
            );
            delete state.limited[model];

            api.emit?.("self-heal:model-recovered", {
              model,
              isPreferred: model === config.modelOrder[0],
            });

            if (model === config.modelOrder[0]) {
              api.logger?.info?.(
                `[self-heal] preferred model ${model} recovered, will be used for new requests`
              );
            }
          }
        }

        // While this tick awaited shell commands, other code may have written
        // the state file. Merge those writes so this save does not undo them.
        const onDisk = loadState(config.stateFile);
        // Backups are only ever changed on disk, so disk is authoritative.
        state.pendingBackups = onDisk.pendingBackups;
        // Keep cooldowns recorded since this tick started.
        for (const [model, entry] of Object.entries(onDisk.limited)) {
          const mine = state.limited[model];
          if (entry.lastHitAt >= tickStartedAt && (!mine || entry.lastHitAt > mine.lastHitAt)) {
            state.limited[model] = entry;
          }
        }

        saveState(config.stateFile, state);

        // Emit status snapshot for external monitoring
        const snapshot = buildStatusSnapshot(state, config);
        api.emit?.("self-heal:status", snapshot);

        // Write status file for external tools / dashboards
        try {
          writeStatusFile(config.statusFile, snapshot);
        } catch (e: unknown) {
          api.logger?.warn?.(`[self-heal] failed to write status file: ${describeError(e)}`);
        }
      };

      // A tick can outlast the 60 s interval (the restart alone may take 60 s).
      // Skip a run rather than let two ticks update the same state at once.
      const runTick = (label: string) => {
        if (tickInFlight) return;
        tickInFlight = true;
        tick()
          .catch((e) => api.logger?.error?.(`[self-heal] ${label} failed: ${describeError(e)}`))
          .finally(() => {
            tickInFlight = false;
          });
      };

      // Starting twice must not leave a second interval running.
      if (monitorTimer) clearInterval(monitorTimer);

      // tick every 60s
      monitorTimer = setInterval(() => runTick("monitor tick"), 60_000);

      // run once immediately
      runTick("monitor start tick");

      // Also exposed on the api object, as before.
      (api as any).__selfHealTimer = monitorTimer;
    },
    stop: async () => {
      const t: ReturnType<typeof setInterval> | undefined = monitorTimer ?? (api as any).__selfHealTimer;
      if (t) clearInterval(t);
      monitorTimer = undefined;
    },
  });
}