@geravant/sinain 1.0.17 → 1.0.19

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
package/index.ts CHANGED
@@ -8,49 +8,20 @@
8
8
  * - Strips <private> tags from tool results before persistence
9
9
  */
10
10
 
11
- import { readFileSync, writeFileSync, mkdirSync, existsSync, readdirSync, statSync, chmodSync, copyFileSync, renameSync } from "node:fs";
12
- import { join, dirname, extname } from "node:path";
11
+ import { readFileSync, writeFileSync, mkdirSync, existsSync, statSync, chmodSync, copyFileSync } from "node:fs";
12
+ import { join, dirname } from "node:path";
13
13
  import type { OpenClawPluginApi } from "openclaw/plugin-sdk";
14
14
 
15
- // ============================================================================
16
- // Types
17
- // ============================================================================
18
-
19
- type PluginConfig = {
20
- heartbeatPath?: string;
21
- skillPath?: string;
22
- memoryPath?: string;
23
- modulesPath?: string;
24
- sessionKey?: string;
25
- userTimezone?: string;
26
- };
27
-
28
- type ModuleRegistryEntry = {
29
- status: "active" | "suspended" | "disabled";
30
- priority: number;
31
- activatedAt: string | null;
32
- lastTriggered: string | null;
33
- locked: boolean;
34
- };
35
-
36
- type ModuleRegistry = {
37
- version: number;
38
- modules: Record<string, ModuleRegistryEntry>;
39
- };
40
-
41
- type ToolUsageEntry = {
42
- toolName: string;
43
- ts: number;
44
- durationMs?: number;
45
- error?: string;
46
- };
47
-
48
- type SessionState = {
49
- startedAt: number;
50
- toolUsage: ToolUsageEntry[];
51
- workspaceDir?: string;
52
- heartbeatToolCalled?: boolean;
53
- };
15
+ import type {
16
+ PluginConfig,
17
+ SessionState,
18
+ ParentContextCache,
19
+ } from "./sinain-knowledge/data/schema.js";
20
+ import { KnowledgeStore } from "./sinain-knowledge/data/store.js";
21
+ import { ResilienceManager, HealthWatchdog, OVERFLOW_CONSECUTIVE_THRESHOLD, SHORT_FAILURE_THRESHOLD_MS, ERROR_WINDOW_MS, SESSION_HYGIENE_SIZE_BYTES, SESSION_HYGIENE_AGE_MS, ALERT_COOLDOWN_MS } from "./sinain-knowledge/curation/resilience.js";
22
+ import type { ResilienceBackend } from "./sinain-knowledge/curation/resilience.js";
23
+ import { CurationEngine } from "./sinain-knowledge/curation/engine.js";
24
+ import { GitSnapshotStore } from "./sinain-knowledge/data/git-store.js";
54
25
 
55
26
  // ============================================================================
56
27
  // Privacy helpers
@@ -58,35 +29,8 @@ type SessionState = {
58
29
 
59
30
  const PRIVATE_TAG_RE = /<private>[\s\S]*?<\/private>/g;
60
31
 
61
- // ============================================================================
62
- // Retry storm resilience constants
63
- // ============================================================================
64
-
65
- const ERROR_WINDOW_MS = 5 * 60_000; // 5-min sliding window for error rate
66
- const OUTAGE_ERROR_RATE_THRESHOLD = 0.8; // 80% failure → outage detected
67
- const OUTAGE_MIN_SAMPLES = 3; // need ≥3 samples before threshold applies
68
- const FILE_SYNC_DEBOUNCE_MS = 3 * 60_000; // skip file sync if done <3 min ago
69
- const PLAYBOOK_GEN_DEBOUNCE_MS = 5 * 60_000; // skip playbook gen if done <5 min ago
70
- const SHORT_FAILURE_THRESHOLD_MS = 10_000; // fails in <10s = likely API error
71
- const LONG_FAILURE_THRESHOLD_MS = 3 * 60_000; // >3min failure = likely stuck retry loop
72
-
73
- // Context overflow watchdog constants
74
- const OVERFLOW_CONSECUTIVE_THRESHOLD = 5; // N consecutive overload errors → trigger reset
75
- const OVERFLOW_TRANSCRIPT_MIN_BYTES = 1_000_000; // 1MB guard — skip reset if transcript is small (transient outage)
76
- const OVERFLOW_ERROR_PATTERN = /overloaded|context.*too.*long|token.*limit|extra usage is required/i;
77
-
78
- // Proactive session hygiene constants
79
- const SESSION_HYGIENE_SIZE_BYTES = 2_000_000; // 2MB — proactive archive+truncate threshold
80
- const SESSION_HYGIENE_AGE_MS = 24 * 60 * 60 * 1000; // 24h — max session age before proactive reset
81
-
82
- // Health watchdog constants
83
- const WATCHDOG_INTERVAL_MS = 5 * 60_000; // 5 min — independent of curation timer
84
- const ALERT_COOLDOWN_MS = 15 * 60_000; // 15 min per alert type
85
- const STALENESS_WARNING_MS = 10 * 60_000; // 10 min no success → warning
86
- const STALENESS_CRITICAL_MS = 15 * 60_000; // 15 min no success after reset → emergency restart
87
- const SESSION_SIZE_WARNING_BYTES = 1_500_000; // 1.5MB → proactive reset
88
- const SESSION_SIZE_RESTART_BYTES = 2_000_000; // 2MB → forced reset
89
- const AUTO_RESTART_COOLDOWN_MS = 60 * 60_000; // max 1 auto-restart per hour
32
+ // Resilience constants — only import what index.ts still uses directly
33
+ // (ResilienceManager, HealthWatchdog, CurationEngine own the rest)
90
34
 
91
35
  // ============================================================================
92
36
  // Parent context injection (subagent support)
@@ -95,11 +39,6 @@ const AUTO_RESTART_COOLDOWN_MS = 60 * 60_000; // max 1 auto-restart per hour
95
39
  const PARENT_CONTEXT_MAX_CHARS = 4000;
96
40
  const PARENT_CONTEXT_TTL_MS = 10 * 60_000; // 10 minutes — stale cache won't be injected
97
41
 
98
- type ParentContextCache = {
99
- sessionKey: string;
100
- capturedAt: number;
101
- contextText: string;
102
- };
103
42
 
104
43
  function isSubagentSession(sessionKey: string): boolean {
105
44
  return sessionKey.includes(":subagent:") || sessionKey.startsWith("subagent:");
@@ -211,313 +150,6 @@ async function sendTelegramAlert(
211
150
  });
212
151
  }
213
152
 
214
- // ============================================================================
215
- // File sync helpers
216
- // ============================================================================
217
-
218
- function syncFileToWorkspace(
219
- sourcePath: string | undefined,
220
- workspaceDir: string,
221
- targetName: string,
222
- logger: OpenClawPluginApi["logger"],
223
- ): boolean {
224
- if (!sourcePath) return false;
225
-
226
- try {
227
- const content = readFileSync(sourcePath, "utf-8");
228
- const targetPath = join(workspaceDir, targetName);
229
- const targetDir = dirname(targetPath);
230
-
231
- if (!existsSync(targetDir)) {
232
- mkdirSync(targetDir, { recursive: true });
233
- }
234
-
235
- // Only write if content changed (avoid unnecessary git diffs)
236
- let existing = "";
237
- try {
238
- existing = readFileSync(targetPath, "utf-8");
239
- } catch {
240
- // File doesn't exist yet
241
- }
242
-
243
- if (existing !== content) {
244
- writeFileSync(targetPath, content, "utf-8");
245
- logger.info(`sinain-hud: synced ${targetName} to workspace`);
246
- return true;
247
- }
248
- return false;
249
- } catch (err) {
250
- logger.warn(`sinain-hud: failed to sync ${targetName}: ${String(err)}`);
251
- return false;
252
- }
253
- }
254
-
255
- /**
256
- * Recursively sync a source directory to the workspace with selective overwrite policy:
257
- * - .json, .sh, .txt, .jsonl — always overwritten (infra/config files we control)
258
- * - .py and others — deploy-once only (skip if already exists; bot owns these after first deploy)
259
- * Skips __pycache__ and hidden directories.
260
- */
261
- function syncDirToWorkspace(
262
- sourceDir: string,
263
- workspaceDir: string,
264
- targetDirName: string,
265
- logger: OpenClawPluginApi["logger"],
266
- ): number {
267
- if (!existsSync(sourceDir)) return 0;
268
- const targetDir = join(workspaceDir, targetDirName);
269
- if (!existsSync(targetDir)) mkdirSync(targetDir, { recursive: true });
270
-
271
- const ALWAYS_OVERWRITE = new Set([".json", ".sh", ".txt", ".jsonl", ".py"]);
272
- let synced = 0;
273
-
274
- function syncRecursive(srcDir: string, dstDir: string): void {
275
- if (!existsSync(dstDir)) mkdirSync(dstDir, { recursive: true });
276
- for (const entry of readdirSync(srcDir)) {
277
- const srcPath = join(srcDir, entry);
278
- const dstPath = join(dstDir, entry);
279
- const stat = statSync(srcPath);
280
- if (stat.isDirectory()) {
281
- if (entry.startsWith("__") || entry.startsWith(".")) continue;
282
- syncRecursive(srcPath, dstPath);
283
- continue;
284
- }
285
- if (!stat.isFile()) continue;
286
- const ext = extname(entry).toLowerCase();
287
- if (!ALWAYS_OVERWRITE.has(ext) && existsSync(dstPath)) continue;
288
- const content = readFileSync(srcPath, "utf-8");
289
- let existing = "";
290
- try { existing = readFileSync(dstPath, "utf-8"); } catch {}
291
- if (existing !== content) {
292
- writeFileSync(dstPath, content, "utf-8");
293
- synced++;
294
- }
295
- }
296
- }
297
-
298
- syncRecursive(sourceDir, targetDir);
299
- if (synced > 0) logger.info(`sinain-hud: synced ${synced} files to ${targetDirName}/`);
300
- return synced;
301
- }
302
-
303
- /**
304
- * Recursively sync a modules/ source directory to workspace with selective deploy policy:
305
- * - module-registry.json → deploy-once (agent manages via module_manager.py)
306
- * - manifest.json → always overwrite (plugin controls schema)
307
- * - patterns.md → deploy-once (agent/extract may have modified)
308
- * - context/*.json → always overwrite
309
- */
310
- function syncModulesToWorkspace(
311
- sourceDir: string,
312
- workspaceDir: string,
313
- logger: OpenClawPluginApi["logger"],
314
- ): number {
315
- if (!existsSync(sourceDir)) return 0;
316
- const targetDir = join(workspaceDir, "modules");
317
- if (!existsSync(targetDir)) mkdirSync(targetDir, { recursive: true });
318
-
319
- const ALWAYS_OVERWRITE = new Set(["manifest.json"]);
320
- const DEPLOY_ONCE = new Set(["module-registry.json", "patterns.md", "guidance.md"]);
321
- let synced = 0;
322
-
323
- function syncRecursive(srcDir: string, dstDir: string): void {
324
- if (!existsSync(dstDir)) mkdirSync(dstDir, { recursive: true });
325
-
326
- for (const entry of readdirSync(srcDir)) {
327
- const srcPath = join(srcDir, entry);
328
- const dstPath = join(dstDir, entry);
329
- const stat = statSync(srcPath);
330
-
331
- if (stat.isDirectory()) {
332
- syncRecursive(srcPath, dstPath);
333
- continue;
334
- }
335
-
336
- if (!stat.isFile()) continue;
337
-
338
- const fileName = entry;
339
- const isAlwaysOverwrite = ALWAYS_OVERWRITE.has(fileName) || fileName.startsWith("context/");
340
- const isDeployOnce = DEPLOY_ONCE.has(fileName);
341
-
342
- // Deploy-once: skip if already in workspace
343
- if (isDeployOnce && existsSync(dstPath)) continue;
344
-
345
- // Default for unknown files: deploy-once
346
- if (!isAlwaysOverwrite && !isDeployOnce && existsSync(dstPath)) continue;
347
-
348
- const content = readFileSync(srcPath, "utf-8");
349
- let existing = "";
350
- try { existing = readFileSync(dstPath, "utf-8"); } catch {}
351
- if (existing !== content) {
352
- writeFileSync(dstPath, content, "utf-8");
353
- synced++;
354
- }
355
- }
356
- }
357
-
358
- syncRecursive(sourceDir, targetDir);
359
- if (synced > 0) logger.info(`sinain-hud: synced ${synced} module files to modules/`);
360
- return synced;
361
- }
362
-
363
- /**
364
- * Collect behavioral guidance from all active modules for prependContext injection.
365
- *
366
- * Reads module-registry.json, collects guidance.md from each active module
367
- * (sorted by priority desc). Imported modules get a [transferred] label.
368
- * Returns a formatted [MODULE GUIDANCE] block or empty string.
369
- */
370
- function collectModuleGuidance(
371
- workspaceDir: string,
372
- logger: OpenClawPluginApi["logger"],
373
- ): string {
374
- const registryPath = join(workspaceDir, "modules", "module-registry.json");
375
- if (!existsSync(registryPath)) return "";
376
-
377
- let registry: ModuleRegistry;
378
- try {
379
- registry = JSON.parse(readFileSync(registryPath, "utf-8")) as ModuleRegistry;
380
- } catch {
381
- return "";
382
- }
383
-
384
- // Active modules sorted by priority desc
385
- const activeModules: Array<{ id: string; priority: number }> = [];
386
- for (const [id, entry] of Object.entries(registry.modules)) {
387
- if (entry.status === "active") {
388
- activeModules.push({ id, priority: entry.priority });
389
- }
390
- }
391
- activeModules.sort((a, b) => b.priority - a.priority);
392
-
393
- const guidanceSections: string[] = [];
394
- let moduleCount = 0;
395
-
396
- for (const mod of activeModules) {
397
- const guidancePath = join(workspaceDir, "modules", mod.id, "guidance.md");
398
- if (!existsSync(guidancePath)) continue;
399
-
400
- try {
401
- const content = readFileSync(guidancePath, "utf-8").trim();
402
- if (!content) continue;
403
-
404
- // Check if module was imported (transferred)
405
- let label = mod.id;
406
- const manifestPath = join(workspaceDir, "modules", mod.id, "manifest.json");
407
- if (existsSync(manifestPath)) {
408
- try {
409
- const manifest = JSON.parse(readFileSync(manifestPath, "utf-8"));
410
- if (manifest.importedAt) {
411
- label = `${manifest.name || mod.id} [transferred]`;
412
- }
413
- } catch { /* skip */ }
414
- }
415
-
416
- guidanceSections.push(`### ${label}\n${content}`);
417
- moduleCount++;
418
- } catch {
419
- // Skip unreadable guidance
420
- }
421
- }
422
-
423
- if (guidanceSections.length === 0) return "";
424
-
425
- logger.info(`sinain-hud: injecting guidance from ${moduleCount} module(s)`);
426
- return `[MODULE GUIDANCE]\n${guidanceSections.join("\n\n")}`;
427
- }
428
-
429
- /**
430
- * Generate the merged effective playbook from active modules + base playbook.
431
- *
432
- * Reads module-registry.json, collects patterns.md from each active module
433
- * (sorted by priority desc), reads the base sinain-playbook.md, and writes
434
- * the merged result to memory/sinain-playbook-effective.md.
435
- */
436
- function generateEffectivePlaybook(
437
- workspaceDir: string,
438
- logger: OpenClawPluginApi["logger"],
439
- ): boolean {
440
- const registryPath = join(workspaceDir, "modules", "module-registry.json");
441
- if (!existsSync(registryPath)) {
442
- logger.info("sinain-hud: no module-registry.json found, skipping effective playbook generation");
443
- return false;
444
- }
445
-
446
- let registry: ModuleRegistry;
447
- try {
448
- registry = JSON.parse(readFileSync(registryPath, "utf-8")) as ModuleRegistry;
449
- } catch (err) {
450
- logger.warn(`sinain-hud: failed to parse module-registry.json: ${String(err)}`);
451
- return false;
452
- }
453
-
454
- // Collect active modules sorted by priority desc
455
- const activeModules: Array<{ id: string; priority: number }> = [];
456
- for (const [id, entry] of Object.entries(registry.modules)) {
457
- if (entry.status === "active") {
458
- activeModules.push({ id, priority: entry.priority });
459
- }
460
- }
461
- activeModules.sort((a, b) => b.priority - a.priority);
462
-
463
- // Build module stack header
464
- const stackLabel = activeModules.map((m) => `${m.id}(${m.priority})`).join(", ");
465
-
466
- // Collect patterns from each active module
467
- const sections: string[] = [];
468
- sections.push(`<!-- module-stack: ${stackLabel} -->`);
469
- sections.push("");
470
-
471
- for (const mod of activeModules) {
472
- const patternsPath = join(workspaceDir, "modules", mod.id, "patterns.md");
473
- if (!existsSync(patternsPath)) continue;
474
- try {
475
- const patterns = readFileSync(patternsPath, "utf-8").trim();
476
- if (patterns) {
477
- sections.push(`<!-- module: ${mod.id} (priority ${mod.priority}) -->`);
478
- // Attribution for transferred (imported) modules
479
- const manifestPath = join(workspaceDir, "modules", mod.id, "manifest.json");
480
- if (existsSync(manifestPath)) {
481
- try {
482
- const manifest = JSON.parse(readFileSync(manifestPath, "utf-8"));
483
- if (manifest.importedAt) {
484
- sections.push(`> *[Transferred knowledge: ${manifest.name || mod.id}]*`);
485
- }
486
- } catch { /* skip if manifest unreadable */ }
487
- }
488
- sections.push(patterns);
489
- sections.push("");
490
- }
491
- } catch {
492
- // Skip unreadable patterns
493
- }
494
- }
495
-
496
- // Append base playbook
497
- const basePlaybookPath = join(workspaceDir, "memory", "sinain-playbook.md");
498
- if (existsSync(basePlaybookPath)) {
499
- try {
500
- const base = readFileSync(basePlaybookPath, "utf-8").trim();
501
- if (base) {
502
- sections.push("<!-- base-playbook -->");
503
- sections.push(base);
504
- sections.push("");
505
- }
506
- } catch {
507
- // Skip if unreadable
508
- }
509
- }
510
-
511
- // Write effective playbook (always overwrite)
512
- const effectivePath = join(workspaceDir, "memory", "sinain-playbook-effective.md");
513
- const effectiveDir = dirname(effectivePath);
514
- if (!existsSync(effectiveDir)) mkdirSync(effectiveDir, { recursive: true });
515
-
516
- const content = sections.join("\n");
517
- writeFileSync(effectivePath, content, "utf-8");
518
- logger.info(`sinain-hud: generated effective playbook (${activeModules.length} active modules)`);
519
- return true;
520
- }
521
153
 
522
154
  // ============================================================================
523
155
  // Plugin Definition
@@ -526,7 +158,6 @@ function generateEffectivePlaybook(
526
158
  export default function sinainHudPlugin(api: OpenClawPluginApi): void {
527
159
  const cfg = (api.pluginConfig ?? {}) as PluginConfig;
528
160
  const sessionStates = new Map<string, SessionState>();
529
- let curationInterval: ReturnType<typeof setInterval> | null = null;
530
161
  let lastWorkspaceDir: string | null = null;
531
162
 
532
163
  // Pre-initialize from config so situation.update works immediately after gateway restart,
@@ -537,53 +168,16 @@ export default function sinainHudPlugin(api: OpenClawPluginApi): void {
537
168
  api.logger.info(`sinain-hud: workspace pre-initialized from config: ${lastWorkspaceDir}`);
538
169
  }
539
170
 
540
- let consecutiveHeartbeatSkips = 0;
541
- let lastEvalReportDate: string | null = null;
171
+ // KnowledgeStore wraps all file I/O for workspace, playbooks, modules, eval
172
+ const store = new KnowledgeStore(lastWorkspaceDir ?? "/tmp/sinain-placeholder", api.logger);
542
173
 
543
- // Retry storm resilience state
544
- const recentOutcomes: Array<{ ts: number; success: boolean; error?: string }> = [];
545
- let lastSuccessTs = 0;
546
- let lastPlaybookGenTs = 0;
547
- let lastFileSyncTs = 0;
548
- let outageDetected = false;
549
- let consecutiveFailures = 0;
550
- let outageStartTs = 0;
551
- let consecutiveOverflowErrors = 0;
174
+ // Resilience layer
175
+ const resilience = new ResilienceManager();
552
176
 
553
177
  // Parent context cache for subagent injection
554
178
  let parentContextCache: ParentContextCache | null = null;
555
179
 
556
- // Health watchdog state
557
- let watchdogInterval: ReturnType<typeof setInterval> | null = null;
558
- let lastResetTs = 0;
559
- let lastAutoRestartTs = 0;
560
-
561
- function appendToContextCache(line: string): void {
562
- if (!parentContextCache) return;
563
- parentContextCache.contextText += "\n" + line;
564
- parentContextCache.capturedAt = Date.now();
565
- // Trim from front if over budget (keep most recent context)
566
- if (parentContextCache.contextText.length > PARENT_CONTEXT_MAX_CHARS) {
567
- const excess = parentContextCache.contextText.length - PARENT_CONTEXT_MAX_CHARS;
568
- const newStart = parentContextCache.contextText.indexOf("\n", excess);
569
- parentContextCache.contextText = newStart >= 0
570
- ? parentContextCache.contextText.slice(newStart + 1)
571
- : parentContextCache.contextText.slice(excess);
572
- }
573
- }
574
-
575
- function computeErrorRate(): { rate: number; total: number; failures: number } {
576
- const cutoff = Date.now() - ERROR_WINDOW_MS;
577
- // Prune entries older than the window
578
- while (recentOutcomes.length > 0 && recentOutcomes[0].ts < cutoff) {
579
- recentOutcomes.shift();
580
- }
581
- const total = recentOutcomes.length;
582
- if (total === 0) return { rate: 0, total: 0, failures: 0 };
583
- const failures = recentOutcomes.filter((o) => !o.success).length;
584
- return { rate: failures / total, total, failures };
585
- }
586
-
180
+ // ── Backend adapter for resilience (OpenClaw-specific) ──────────────────
587
181
  function getSessionsJsonPath(): string | null {
588
182
  if (!lastWorkspaceDir) return null;
589
183
  const sessionsDir = join(dirname(lastWorkspaceDir), "agents", "main", "sessions");
@@ -591,20 +185,31 @@ export default function sinainHudPlugin(api: OpenClawPluginApi): void {
591
185
  return existsSync(p) ? p : null;
592
186
  }
593
187
 
188
+ function getTranscriptSize(): { path: string; bytes: number } | null {
189
+ const sessionsJsonPath = getSessionsJsonPath();
190
+ if (!sessionsJsonPath || !cfg.sessionKey) return null;
191
+ try {
192
+ const sessionsData = JSON.parse(readFileSync(sessionsJsonPath, "utf-8"));
193
+ const session = sessionsData[cfg.sessionKey];
194
+ const transcriptPath = session?.sessionFile as string | undefined;
195
+ if (!transcriptPath || !existsSync(transcriptPath)) return null;
196
+ return { path: transcriptPath, bytes: statSync(transcriptPath).size };
197
+ } catch {
198
+ return null;
199
+ }
200
+ }
201
+
594
202
  function performOverflowReset(): boolean {
595
203
  const targetSessionKey = cfg.sessionKey;
596
204
  if (!targetSessionKey || !lastWorkspaceDir) {
597
205
  api.logger.warn("sinain-hud: overflow reset aborted — no sessionKey or workspace dir");
598
206
  return false;
599
207
  }
600
-
601
208
  const sessionsJsonPath = getSessionsJsonPath();
602
-
603
209
  if (!sessionsJsonPath) {
604
210
  api.logger.warn(`sinain-hud: overflow reset aborted — sessions.json not found`);
605
211
  return false;
606
212
  }
607
-
608
213
  let sessionsData: Record<string, Record<string, unknown>>;
609
214
  try {
610
215
  sessionsData = JSON.parse(readFileSync(sessionsJsonPath, "utf-8"));
@@ -612,15 +217,13 @@ export default function sinainHudPlugin(api: OpenClawPluginApi): void {
612
217
  api.logger.warn(`sinain-hud: overflow reset aborted — cannot parse sessions.json: ${err}`);
613
218
  return false;
614
219
  }
615
-
616
220
  const session = sessionsData[targetSessionKey];
617
221
  const transcriptPath = session?.sessionFile as string | undefined;
618
222
  if (!transcriptPath || !existsSync(transcriptPath)) {
619
223
  api.logger.warn(`sinain-hud: overflow reset aborted — transcript not found: ${transcriptPath}`);
620
224
  return false;
621
225
  }
622
-
623
- // Guard: only reset if transcript is actually large
226
+ const OVERFLOW_TRANSCRIPT_MIN_BYTES = 1_000_000;
624
227
  const size = statSync(transcriptPath).size;
625
228
  if (size < OVERFLOW_TRANSCRIPT_MIN_BYTES) {
626
229
  api.logger.info(
@@ -628,31 +231,59 @@ export default function sinainHudPlugin(api: OpenClawPluginApi): void {
628
231
  );
629
232
  return false;
630
233
  }
631
-
632
- // Archive → truncate → reset metadata
633
234
  const archivePath = transcriptPath.replace(/\.jsonl$/, `.archived.${Date.now()}.jsonl`);
634
- try {
635
- copyFileSync(transcriptPath, archivePath);
636
- } catch (err) {
235
+ try { copyFileSync(transcriptPath, archivePath); } catch (err) {
637
236
  api.logger.warn(`sinain-hud: overflow reset aborted — archive failed: ${err}`);
638
237
  return false;
639
238
  }
640
-
641
239
  writeFileSync(transcriptPath, "", "utf-8");
642
-
643
240
  try {
644
241
  session.contextTokens = 0;
645
242
  writeFileSync(sessionsJsonPath, JSON.stringify(sessionsData, null, 2), "utf-8");
646
- } catch {
647
- // Non-fatal — gateway recomputes tokens from transcript content
648
- }
649
-
243
+ } catch {}
650
244
  api.logger.info(
651
245
  `sinain-hud: === OVERFLOW RESET === Transcript truncated (was ${Math.round(size / 1024)}KB). Archive: ${archivePath}`,
652
246
  );
653
247
  return true;
654
248
  }
655
249
 
250
+ function getStateDir(): string | null {
251
+ if (!lastWorkspaceDir) return null;
252
+ return dirname(lastWorkspaceDir);
253
+ }
254
+
255
+ const resilienceBackend: ResilienceBackend = {
256
+ getTranscriptSize,
257
+ performOverflowReset,
258
+ async sendAlert(alertType: string, title: string, body: string): Promise<void> {
259
+ const sd = getStateDir();
260
+ if (sd) sendTelegramAlert(alertType, title, body, sd);
261
+ },
262
+ };
263
+
264
+ // CurationEngine + HealthWatchdog
265
+ const scriptRunner = (args: string[], opts: { timeoutMs: number; cwd: string }) =>
266
+ api.runtime.system.runCommandWithTimeout(args, opts);
267
+ const engine = new CurationEngine(store, scriptRunner, resilience, { userTimezone: cfg.userTimezone ?? "Europe/Berlin" }, api.logger);
268
+ if (cfg.snapshotRepoPath) {
269
+ engine.setGitSnapshotStore(new GitSnapshotStore(cfg.snapshotRepoPath, api.logger));
270
+ api.logger.info(`sinain-hud: git snapshot store configured at ${cfg.snapshotRepoPath}`);
271
+ }
272
+ const watchdog = new HealthWatchdog(resilience, resilienceBackend, api.logger);
273
+
274
+ function appendToContextCache(line: string): void {
275
+ if (!parentContextCache) return;
276
+ parentContextCache.contextText += "\n" + line;
277
+ parentContextCache.capturedAt = Date.now();
278
+ if (parentContextCache.contextText.length > PARENT_CONTEXT_MAX_CHARS) {
279
+ const excess = parentContextCache.contextText.length - PARENT_CONTEXT_MAX_CHARS;
280
+ const newStart = parentContextCache.contextText.indexOf("\n", excess);
281
+ parentContextCache.contextText = newStart >= 0
282
+ ? parentContextCache.contextText.slice(newStart + 1)
283
+ : parentContextCache.contextText.slice(excess);
284
+ }
285
+ }
286
+
656
287
  api.logger.info("sinain-hud: plugin registered");
657
288
 
658
289
  // ==========================================================================
@@ -669,13 +300,10 @@ export default function sinainHudPlugin(api: OpenClawPluginApi): void {
669
300
  respond(false, null, { code: "not_ready", message: "workspace not initialized" });
670
301
  return;
671
302
  }
672
- const situationPath = join(lastWorkspaceDir, "SITUATION.md");
673
- const tmpPath = situationPath + ".rpc.tmp";
674
303
  try {
675
- writeFileSync(tmpPath, content, "utf-8");
676
- renameSync(tmpPath, situationPath);
677
- respond(true, { ok: true, bytes: content.length });
678
- api.logger.info(`sinain-hud: SITUATION.md updated via RPC (${content.length} chars)`);
304
+ store.writeSituation(content as string);
305
+ respond(true, { ok: true, bytes: (content as string).length });
306
+ api.logger.info(`sinain-hud: SITUATION.md updated via RPC (${(content as string).length} chars)`);
679
307
  } catch (err: any) {
680
308
  respond(false, null, { code: "write_error", message: err.message });
681
309
  }
@@ -702,8 +330,9 @@ export default function sinainHudPlugin(api: OpenClawPluginApi): void {
702
330
  const workspaceDir = ctx.workspaceDir;
703
331
  if (!workspaceDir) return;
704
332
 
705
- // Track workspace dir in session state and for curation timer
333
+ // Track workspace dir in session state, store, and for curation timer
706
334
  lastWorkspaceDir = workspaceDir;
335
+ store.setWorkspaceDir(workspaceDir);
707
336
  const sessionKey = ctx.sessionKey;
708
337
  if (sessionKey) {
709
338
  const state = sessionStates.get(sessionKey);
@@ -714,9 +343,8 @@ export default function sinainHudPlugin(api: OpenClawPluginApi): void {
714
343
 
715
344
  const now = Date.now();
716
345
 
717
- // ── Debounced file sync (skip if done <3 min ago) ───────────────────
718
- const fileSyncDue = lastFileSyncTs === 0 || (now - lastFileSyncTs) >= FILE_SYNC_DEBOUNCE_MS;
719
- if (fileSyncDue) {
346
+ // ── Debounced file sync ──────────────────────────────────────────────
347
+ if (resilience.isFileSyncDue()) {
720
348
  const heartbeatSource = cfg.heartbeatPath
721
349
  ? api.resolvePath(cfg.heartbeatPath)
722
350
  : undefined;
@@ -724,41 +352,39 @@ export default function sinainHudPlugin(api: OpenClawPluginApi): void {
724
352
  ? api.resolvePath(cfg.skillPath)
725
353
  : undefined;
726
354
 
727
- syncFileToWorkspace(heartbeatSource, workspaceDir, "HEARTBEAT.md", api.logger);
728
- syncFileToWorkspace(skillSource, workspaceDir, "SKILL.md", api.logger);
355
+ store.deployFile(heartbeatSource, "HEARTBEAT.md");
356
+ store.deployFile(skillSource, "SKILL.md");
729
357
 
730
358
  const memorySource = cfg.memoryPath ? api.resolvePath(cfg.memoryPath) : undefined;
731
359
  if (memorySource) {
732
- syncDirToWorkspace(memorySource, workspaceDir, "sinain-memory", api.logger);
360
+ store.deployDir(memorySource, "sinain-memory");
733
361
  const gbPath = join(workspaceDir, "sinain-memory", "git_backup.sh");
734
362
  if (existsSync(gbPath)) try { chmodSync(gbPath, 0o755); } catch {}
735
363
  }
736
364
 
737
365
  const modulesSource = cfg.modulesPath ? api.resolvePath(cfg.modulesPath) : undefined;
738
366
  if (modulesSource && existsSync(modulesSource)) {
739
- syncModulesToWorkspace(modulesSource, workspaceDir, api.logger);
367
+ store.deployModules(modulesSource);
740
368
  }
741
369
 
742
- lastFileSyncTs = now;
370
+ resilience.markFileSynced();
743
371
  }
744
372
 
745
- // ── Debounced playbook generation (skip if done <5 min ago) ─────────
746
- const playbookGenDue = lastPlaybookGenTs === 0 || (now - lastPlaybookGenTs) >= PLAYBOOK_GEN_DEBOUNCE_MS;
747
- if (playbookGenDue) {
373
+ // ── Debounced playbook generation ────────────────────────────────────
374
+ if (resilience.isPlaybookGenDue()) {
748
375
  const modulesSource = cfg.modulesPath ? api.resolvePath(cfg.modulesPath) : undefined;
749
376
  if (modulesSource && existsSync(modulesSource)) {
750
- generateEffectivePlaybook(workspaceDir, api.logger);
751
- lastPlaybookGenTs = now;
377
+ store.generateEffectivePlaybook();
378
+ resilience.markPlaybookGenerated();
752
379
  }
753
380
  }
754
381
 
755
382
  // ── Fire-and-forget: ingest active module patterns into triple store
756
383
  try {
757
- const regPath = join(workspaceDir, "modules", "module-registry.json");
758
- if (existsSync(regPath)) {
759
- const reg = JSON.parse(readFileSync(regPath, "utf-8"));
760
- for (const [id, entry] of Object.entries(reg.modules || {})) {
761
- if ((entry as Record<string, unknown>).status === "active") {
384
+ const registry = store.readModuleRegistry();
385
+ if (registry) {
386
+ for (const [id, entry] of Object.entries(registry.modules)) {
387
+ if (entry.status === "active") {
762
388
  api.runtime.system.runCommandWithTimeout(
763
389
  ["uv", "run", "--with", "requests", "python3",
764
390
  "sinain-memory/triple_ingest.py",
@@ -773,21 +399,12 @@ export default function sinainHudPlugin(api: OpenClawPluginApi): void {
773
399
  } catch {}
774
400
 
775
401
  // ── Memory dirs — always run (cheap, idempotent) ────────────────────
776
- for (const dir of ["memory", "memory/playbook-archive", "memory/playbook-logs",
777
- "memory/eval-logs", "memory/eval-reports"]) {
778
- const fullPath = join(workspaceDir, dir);
779
- if (!existsSync(fullPath)) {
780
- mkdirSync(fullPath, { recursive: true });
781
- }
782
- // Ensure directory is writable even if created by another process (e.g. root)
783
- try { chmodSync(fullPath, 0o755); } catch {}
784
- }
402
+ store.ensureMemoryDirs();
785
403
 
786
404
  // ── Context capture + subagent injection ────────────────────────────
787
405
  const isSubagent = sessionKey ? isSubagentSession(sessionKey) : false;
788
406
 
789
407
  if (!isSubagent) {
790
- // Main session: capture recent conversation context for future subagents
791
408
  const messages = (event as Record<string, unknown>).messages as unknown[] | undefined;
792
409
  const prompt = (event as Record<string, unknown>).prompt as string | undefined;
793
410
  if (messages && Array.isArray(messages) && messages.length > 0) {
@@ -805,110 +422,15 @@ export default function sinainHudPlugin(api: OpenClawPluginApi): void {
805
422
  }
806
423
  }
807
424
 
808
- // ── Accumulate context parts (time + outage recovery + subagent injection)
809
- const contextParts: string[] = [];
810
-
811
- // Time awareness — always inject current local time
812
- const userTz = cfg.userTimezone ?? "Europe/Berlin";
813
- const nowLocal = new Date().toLocaleString("en-GB", {
814
- timeZone: userTz,
815
- weekday: "long",
816
- year: "numeric",
817
- month: "long",
818
- day: "numeric",
819
- hour: "2-digit",
820
- minute: "2-digit",
821
- hour12: false,
425
+ // ── Context assembly (delegated to CurationEngine) ────────────────
426
+ const contextParts = await engine.assembleContext({
427
+ isSubagent,
428
+ parentContextText: parentContextCache?.contextText ?? null,
429
+ parentContextAgeMs: parentContextCache ? now - parentContextCache.capturedAt : undefined,
430
+ parentContextTtlMs: PARENT_CONTEXT_TTL_MS,
431
+ heartbeatConfigured: !!cfg.heartbeatPath,
432
+ heartbeatTargetExists: existsSync(join(workspaceDir, "HEARTBEAT.md")),
822
433
  });
823
- contextParts.push(`[CURRENT TIME] ${nowLocal} (${userTz})`);
824
-
825
- // Recovery context injection after outage
826
- if (outageStartTs > 0 && !outageDetected && lastSuccessTs > outageStartTs) {
827
- const outageDurationMin = Math.round((lastSuccessTs - outageStartTs) / 60_000);
828
- outageStartTs = 0; // one-shot: only inject once
829
- api.logger.info(`sinain-hud: injecting recovery context (outage lasted ~${outageDurationMin}min)`);
830
- contextParts.push(
831
- `[SYSTEM] The upstream API was unavailable for ~${outageDurationMin} minutes. ` +
832
- `Multiple queued messages may have accumulated. Prioritize the current task, skip catch-up on stale items, and keep responses concise.`,
833
- );
834
- }
835
-
836
- // Subagent: inject cached parent context
837
- if (isSubagent && parentContextCache) {
838
- const cacheAgeMs = now - parentContextCache.capturedAt;
839
- if (cacheAgeMs < PARENT_CONTEXT_TTL_MS) {
840
- const cacheAgeSec = Math.round(cacheAgeMs / 1000);
841
- api.logger.info(
842
- `sinain-hud: injected parent context for subagent (${parentContextCache.contextText.length} chars, ${cacheAgeSec}s old)`,
843
- );
844
- contextParts.push(
845
- `[PARENT SESSION CONTEXT] The following is a summary of the recent conversation from the parent session that spawned you. Use it to understand references to code, files, or decisions discussed earlier:\n\n${parentContextCache.contextText}`,
846
- );
847
- } else {
848
- api.logger.info(
849
- `sinain-hud: skipped stale parent context for subagent (${Math.round(cacheAgeMs / 1000)}s old, TTL=${PARENT_CONTEXT_TTL_MS / 1000}s)`,
850
- );
851
- }
852
- }
853
-
854
- // Heartbeat enforcement (replaces fork's system-prompt.ts logic)
855
- if (cfg.heartbeatPath) {
856
- const hbTarget = join(workspaceDir, "HEARTBEAT.md");
857
- if (existsSync(hbTarget)) {
858
- contextParts.push(
859
- "[HEARTBEAT PROTOCOL] HEARTBEAT.md is loaded in your project context. " +
860
- "On every heartbeat poll, you MUST execute the full protocol defined in " +
861
- "HEARTBEAT.md — all phases, all steps, in order. " +
862
- "Only reply HEARTBEAT_OK if HEARTBEAT.md explicitly permits it " +
863
- "after you have completed all mandatory steps."
864
- );
865
- }
866
- }
867
-
868
- // SITUATION.md bootstrap (replaces fork's workspace.ts logic)
869
- const situationPath = join(workspaceDir, "SITUATION.md");
870
- if (existsSync(situationPath)) {
871
- try {
872
- const content = readFileSync(situationPath, "utf-8").trim();
873
- if (content) contextParts.push(`[SITUATION]\n${content}`);
874
- } catch {}
875
- }
876
-
877
- // Knowledge transfer attribution — if effective playbook contains imported modules
878
- const effectivePlaybookPath = join(workspaceDir, "memory", "sinain-playbook-effective.md");
879
- if (existsSync(effectivePlaybookPath)) {
880
- try {
881
- const effectiveContent = readFileSync(effectivePlaybookPath, "utf-8");
882
- if (effectiveContent.includes("[Transferred knowledge:")) {
883
- contextParts.push(
884
- "[KNOWLEDGE TRANSFER] Some patterns in your playbook were transferred from " +
885
- "another sinain instance. When surfacing these, briefly cite their origin."
886
- );
887
- }
888
- } catch { /* skip if unreadable */ }
889
- }
890
-
891
- // Module guidance injection — behavioral instructions from active modules
892
- const moduleGuidance = collectModuleGuidance(workspaceDir, api.logger);
893
- if (moduleGuidance) contextParts.push(moduleGuidance);
894
-
895
- // Synchronous: knowledge graph context (10s timeout, skipped on failure)
896
- try {
897
- const ragResult = await api.runtime.system.runCommandWithTimeout(
898
- ["uv", "run", "--with", "requests", "python3",
899
- "sinain-memory/triple_query.py",
900
- "--memory-dir", join(workspaceDir, "memory"),
901
- "--context", "current session",
902
- "--max-chars", "1500"],
903
- { timeoutMs: 10_000, cwd: workspaceDir },
904
- );
905
- if (ragResult.code === 0) {
906
- const parsed = JSON.parse(ragResult.stdout.trim());
907
- if (parsed.context && parsed.context.length > 50) {
908
- contextParts.push(`[KNOWLEDGE GRAPH CONTEXT]\n${parsed.context}`);
909
- }
910
- }
911
- } catch {}
912
434
 
913
435
  if (contextParts.length > 0) {
914
436
  return { prependContext: contextParts.join("\n\n") };
@@ -993,101 +515,22 @@ export default function sinainHudPlugin(api: OpenClawPluginApi): void {
993
515
  const isSuccess = event.success === true;
994
516
  const isShortFailure = !isSuccess && durationMs < SHORT_FAILURE_THRESHOLD_MS;
995
517
 
996
- // ── Retry storm: track outcome ──────────────────────────────────────
997
- recentOutcomes.push({
518
+ // ── Retry storm: track outcome via ResilienceManager ────────────────
519
+ resilience.recentOutcomes.push({
998
520
  ts: Date.now(),
999
521
  success: isSuccess,
1000
522
  error: isSuccess ? undefined : String(event.error ?? "unknown"),
1001
523
  });
1002
524
 
1003
525
  if (isSuccess) {
1004
- const wasOutage = outageDetected;
1005
- const outageDurationMs = outageStartTs > 0 ? Date.now() - outageStartTs : 0;
1006
- consecutiveFailures = 0;
1007
- outageDetected = false;
1008
- lastSuccessTs = Date.now();
1009
- if (wasOutage) {
1010
- api.logger.info(
1011
- `sinain-hud: OUTAGE RECOVERED — resumed after ${Math.round(outageDurationMs / 1000)}s`,
1012
- );
1013
- // outageStartTs is NOT reset here — before_agent_start uses it to
1014
- // inject recovery context on the next run, then resets it itself.
1015
-
1016
- // Send recovery alert via Telegram
1017
- const sd = getStateDir();
1018
- if (sd) {
1019
- sendTelegramAlert("recovery", "✅ *sinain-hud* recovered",
1020
- `• Gateway up, first run succeeded\n• Downtime: ~${Math.round(outageDurationMs / 60_000)}min`,
1021
- sd);
1022
- }
1023
- }
526
+ resilience.recordSuccess(resilienceBackend, api.logger);
1024
527
  } else if (isShortFailure) {
1025
- consecutiveFailures++;
1026
- const { rate, total } = computeErrorRate();
1027
- if (!outageDetected && total >= OUTAGE_MIN_SAMPLES && rate >= OUTAGE_ERROR_RATE_THRESHOLD) {
1028
- outageDetected = true;
1029
- outageStartTs = Date.now();
1030
- api.logger.warn(
1031
- `sinain-hud: OUTAGE DETECTED — ${Math.round(rate * 100)}% error rate over ${total} samples, ${consecutiveFailures} consecutive failures`,
1032
- );
1033
- const sd = getStateDir();
1034
- if (sd) {
1035
- sendTelegramAlert("outage", "🔴 *sinain-hud* OUTAGE DETECTED",
1036
- `• ${Math.round(rate * 100)}% error rate over ${total} samples\n• ${consecutiveFailures} consecutive failures`,
1037
- sd);
1038
- }
1039
- }
528
+ resilience.recordShortFailure(resilienceBackend, api.logger);
1040
529
  }
1041
530
 
1042
531
  // ── Context overflow watchdog ──────────────────────────────────────
1043
532
  if (sessionKey === cfg.sessionKey) {
1044
- if (!isSuccess && OVERFLOW_ERROR_PATTERN.test(String(event.error ?? ""))) {
1045
- consecutiveOverflowErrors++;
1046
- api.logger.warn(
1047
- `sinain-hud: overflow watchdog — error #${consecutiveOverflowErrors}/${OVERFLOW_CONSECUTIVE_THRESHOLD}`,
1048
- );
1049
- if (consecutiveOverflowErrors >= OVERFLOW_CONSECUTIVE_THRESHOLD) {
1050
- api.logger.warn("sinain-hud: OVERFLOW THRESHOLD REACHED — attempting transcript reset");
1051
- if (performOverflowReset()) {
1052
- lastResetTs = Date.now();
1053
- consecutiveOverflowErrors = 0;
1054
- outageDetected = false;
1055
- consecutiveFailures = 0;
1056
- outageStartTs = 0;
1057
- const sd = getStateDir();
1058
- if (sd) {
1059
- sendTelegramAlert("overflow_reset", "⚠️ *sinain-hud* overflow reset triggered",
1060
- `• ${OVERFLOW_CONSECUTIVE_THRESHOLD} consecutive overflow errors\n• Transcript truncated`,
1061
- sd);
1062
- }
1063
- }
1064
- }
1065
- } else if (isSuccess) {
1066
- consecutiveOverflowErrors = 0;
1067
- }
1068
-
1069
- // Duration-gated overflow reset: long failure + overflow error pattern = stuck retry loop.
1070
- // The core misclassifies "extra usage is required" as rate_limit → infinite retry.
1071
- // After the run times out (>3min), we detect it and reset for the next cycle.
1072
- const isLongFailure = !isSuccess && durationMs > LONG_FAILURE_THRESHOLD_MS;
1073
- if (isLongFailure && OVERFLOW_ERROR_PATTERN.test(String(event.error ?? ""))) {
1074
- api.logger.warn(
1075
- `sinain-hud: long failure (${Math.round(durationMs / 1000)}s) with overflow error — immediate reset`,
1076
- );
1077
- if (performOverflowReset()) {
1078
- lastResetTs = Date.now();
1079
- consecutiveOverflowErrors = 0;
1080
- outageDetected = false;
1081
- consecutiveFailures = 0;
1082
- outageStartTs = 0;
1083
- const sd = getStateDir();
1084
- if (sd) {
1085
- sendTelegramAlert("overflow_reset", "⚠️ *sinain-hud* overflow reset (stuck retry)",
1086
- `• ${Math.round(durationMs / 1000)}s failed run with overflow error\n• Transcript truncated, next heartbeat should recover`,
1087
- sd);
1088
- }
1089
- }
1090
- }
533
+ resilience.checkOverflow(isSuccess, event.error ? String(event.error) : undefined, durationMs, resilienceBackend, api.logger);
1091
534
  }
1092
535
 
1093
536
  // ── Count tool usage by name ────────────────────────────────────────
@@ -1097,14 +540,8 @@ export default function sinainHudPlugin(api: OpenClawPluginApi): void {
1097
540
  }
1098
541
 
1099
542
  // ── Write session summary (skip during outage — noise reduction) ───
1100
- const skipSummary = outageDetected && isShortFailure;
543
+ const skipSummary = resilience.outageDetected && isShortFailure;
1101
544
  if (state.workspaceDir && !skipSummary) {
1102
- const summaryPath = join(
1103
- state.workspaceDir,
1104
- "memory",
1105
- "session-summaries.jsonl",
1106
- );
1107
-
1108
545
  const summary = {
1109
546
  ts: new Date().toISOString(),
1110
547
  sessionKey,
@@ -1118,13 +555,7 @@ export default function sinainHudPlugin(api: OpenClawPluginApi): void {
1118
555
  };
1119
556
 
1120
557
  try {
1121
- const dir = dirname(summaryPath);
1122
- if (!existsSync(dir)) {
1123
- mkdirSync(dir, { recursive: true });
1124
- }
1125
- writeFileSync(summaryPath, JSON.stringify(summary) + "\n", {
1126
- flag: "a",
1127
- });
558
+ store.appendSessionSummary(summary);
1128
559
  api.logger.info(
1129
560
  `sinain-hud: session summary written (${toolCount} tools, ${Math.round(durationMs / 1000)}s)`,
1130
561
  );
@@ -1149,23 +580,22 @@ export default function sinainHudPlugin(api: OpenClawPluginApi): void {
1149
580
 
1150
581
  // ── Heartbeat compliance (exempt during outage) ─────────────────────
1151
582
  if ((ctx as Record<string, unknown>).messageProvider === "heartbeat") {
1152
- if (outageDetected && isShortFailure) {
1153
- // Agent couldn't even process the prompt — don't count as a skip
583
+ if (resilience.outageDetected && isShortFailure) {
1154
584
  api.logger.info(
1155
585
  `sinain-hud: heartbeat compliance exempted (outage active, ${Math.round(durationMs / 1000)}s run)`,
1156
586
  );
1157
587
  } else if (!state.heartbeatToolCalled) {
1158
- consecutiveHeartbeatSkips++;
588
+ resilience.consecutiveHeartbeatSkips++;
1159
589
  api.logger.warn(
1160
- `sinain-hud: heartbeat compliance violation — tool not called (consecutive: ${consecutiveHeartbeatSkips})`,
590
+ `sinain-hud: heartbeat compliance violation — tool not called (consecutive: ${resilience.consecutiveHeartbeatSkips})`,
1161
591
  );
1162
- if (consecutiveHeartbeatSkips >= 3) {
592
+ if (resilience.consecutiveHeartbeatSkips >= 3) {
1163
593
  api.logger.warn(
1164
- `sinain-hud: ESCALATION — ${consecutiveHeartbeatSkips} consecutive heartbeat skips`,
594
+ `sinain-hud: ESCALATION — ${resilience.consecutiveHeartbeatSkips} consecutive heartbeat skips`,
1165
595
  );
1166
596
  }
1167
597
  } else {
1168
- consecutiveHeartbeatSkips = 0;
598
+ resilience.consecutiveHeartbeatSkips = 0;
1169
599
  }
1170
600
  }
1171
601
 
@@ -1230,21 +660,10 @@ export default function sinainHudPlugin(api: OpenClawPluginApi): void {
1230
660
 
1231
661
  api.on("gateway_start", async () => {
1232
662
  sessionStates.clear();
1233
- // Reset all resilience state — clean slate on restart
1234
- recentOutcomes.length = 0;
1235
- lastSuccessTs = 0;
1236
- lastPlaybookGenTs = 0;
1237
- lastFileSyncTs = 0;
1238
- outageDetected = false;
1239
- consecutiveFailures = 0;
1240
- outageStartTs = 0;
1241
- consecutiveHeartbeatSkips = 0;
1242
- consecutiveOverflowErrors = 0;
663
+ resilience.resetAll();
1243
664
  parentContextCache = null;
1244
- // Reset watchdog alert state
1245
- lastResetTs = 0;
1246
665
  _alertCooldowns.clear();
1247
- _cachedBotToken = undefined; // re-read on next alert
666
+ _cachedBotToken = undefined;
1248
667
  _alertMissingConfigLogged = false;
1249
668
  api.logger.info("sinain-hud: gateway started, session + resilience + watchdog tracking reset");
1250
669
  });
@@ -1286,13 +705,13 @@ export default function sinainHudPlugin(api: OpenClawPluginApi): void {
1286
705
  }
1287
706
 
1288
707
  // Resilience info
1289
- const { rate, total, failures } = computeErrorRate();
708
+ const { rate, total, failures } = resilience.computeErrorRate();
1290
709
  lines.push("\n**Resilience**");
1291
- lines.push(`- Outage: ${outageDetected ? `ACTIVE (${Math.round((Date.now() - outageStartTs) / 1000)}s, ${consecutiveFailures} consecutive failures)` : "clear"}`);
710
+ lines.push(`- Outage: ${resilience.outageDetected ? `ACTIVE (${Math.round((Date.now() - resilience.outageStartTs) / 1000)}s, ${resilience.consecutiveFailures} consecutive failures)` : "clear"}`);
1292
711
  lines.push(`- Error rate: ${Math.round(rate * 100)}% (${failures}/${total} in ${ERROR_WINDOW_MS / 60_000}min window)`);
1293
- lines.push(`- Last success: ${lastSuccessTs > 0 ? `${Math.round((Date.now() - lastSuccessTs) / 1000)}s ago` : "never"}`);
1294
- lines.push(`- Heartbeat skips: ${consecutiveHeartbeatSkips}`);
1295
- lines.push(`- Overflow watchdog: ${consecutiveOverflowErrors}/${OVERFLOW_CONSECUTIVE_THRESHOLD}`);
712
+ lines.push(`- Last success: ${resilience.lastSuccessTs > 0 ? `${Math.round((Date.now() - resilience.lastSuccessTs) / 1000)}s ago` : "never"}`);
713
+ lines.push(`- Heartbeat skips: ${resilience.consecutiveHeartbeatSkips}`);
714
+ lines.push(`- Overflow watchdog: ${resilience.consecutiveOverflowErrors}/${OVERFLOW_CONSECUTIVE_THRESHOLD}`);
1296
715
  lines.push(`- Parent context cache: ${parentContextCache ? `${parentContextCache.contextText.length} chars, ${Math.round((Date.now() - parentContextCache.capturedAt) / 1000)}s old` : "empty"}`);
1297
716
 
1298
717
  return { text: lines.join("\n") };
@@ -1316,18 +735,11 @@ export default function sinainHudPlugin(api: OpenClawPluginApi): void {
1316
735
  return { text: "No workspace directory available (no active session)." };
1317
736
  }
1318
737
 
1319
- const registryPath = join(workspaceDir, "modules", "module-registry.json");
1320
- if (!existsSync(registryPath)) {
738
+ const registry = store.readModuleRegistry();
739
+ if (!registry) {
1321
740
  return { text: "Module system not initialized (no module-registry.json found)." };
1322
741
  }
1323
742
 
1324
- let registry: ModuleRegistry;
1325
- try {
1326
- registry = JSON.parse(readFileSync(registryPath, "utf-8")) as ModuleRegistry;
1327
- } catch {
1328
- return { text: "Failed to parse module-registry.json." };
1329
- }
1330
-
1331
743
  const active: Array<{ id: string; priority: number; locked: boolean }> = [];
1332
744
  const suspended: string[] = [];
1333
745
  const disabled: string[] = [];
@@ -1383,53 +795,27 @@ export default function sinainHudPlugin(api: OpenClawPluginApi): void {
1383
795
  return { text: "No workspace directory available (no active session)." };
1384
796
  }
1385
797
 
1386
- const reportsDir = join(workspaceDir, "memory", "eval-reports");
1387
- const logsDir = join(workspaceDir, "memory", "eval-logs");
1388
798
  const lines: string[] = ["**Evaluation Report**\n"];
1389
799
 
1390
800
  // Find latest report
1391
- let latestReport = "";
1392
- if (existsSync(reportsDir)) {
1393
- const reports = readdirSync(reportsDir)
1394
- .filter((f: string) => f.endsWith(".md"))
1395
- .sort()
1396
- .reverse();
1397
- if (reports.length > 0) {
1398
- try {
1399
- latestReport = readFileSync(join(reportsDir, reports[0]), "utf-8");
1400
- lines.push(latestReport.trim());
1401
- } catch {
1402
- lines.push("Failed to read latest report.");
1403
- }
1404
- }
1405
- }
1406
-
1407
- if (!latestReport) {
801
+ const latestReport = store.readLatestEvalReport();
802
+ if (latestReport) {
803
+ lines.push(latestReport.trim());
804
+ } else {
1408
805
  lines.push("No eval reports generated yet.\n");
1409
806
  }
1410
807
 
1411
808
  // Show latest eval-log entries
1412
- if (existsSync(logsDir)) {
1413
- const logFiles = readdirSync(logsDir)
1414
- .filter((f: string) => f.endsWith(".jsonl"))
1415
- .sort()
1416
- .reverse();
1417
- if (logFiles.length > 0) {
809
+ const recentLogs = store.readRecentEvalLogs(5);
810
+ if (recentLogs.length > 0) {
811
+ lines.push("\n**Recent Tick Evaluations** (last 5):");
812
+ for (const line of recentLogs) {
1418
813
  try {
1419
- const content = readFileSync(join(logsDir, logFiles[0]), "utf-8");
1420
- const entries = content.trim().split("\n").slice(-5);
1421
- lines.push("\n**Recent Tick Evaluations** (last 5):");
1422
- for (const line of entries) {
1423
- try {
1424
- const e = JSON.parse(line) as Record<string, unknown>;
1425
- const judges = e.judges ? ` judgeAvg=${e.judgeAvg ?? "?"}` : "";
1426
- lines.push(` ${e.tickTs} — passRate=${e.passRate}${judges}`);
1427
- } catch {
1428
- // skip malformed line
1429
- }
1430
- }
814
+ const e = JSON.parse(line) as Record<string, unknown>;
815
+ const judges = e.judges ? ` judgeAvg=${e.judgeAvg ?? "?"}` : "";
816
+ lines.push(` ${e.tickTs} passRate=${e.passRate}${judges}`);
1431
817
  } catch {
1432
- // skip if unreadable
818
+ // skip malformed line
1433
819
  }
1434
820
  }
1435
821
  }
@@ -1484,15 +870,15 @@ export default function sinainHudPlugin(api: OpenClawPluginApi): void {
1484
870
  name: "sinain_health",
1485
871
  description: "Run health watchdog checks on-demand and show results",
1486
872
  handler: () => {
1487
- const checks = runHealthChecks();
873
+ const checks = watchdog.runChecks();
1488
874
  const lines: string[] = ["**Health Watchdog Report**\n"];
1489
875
 
1490
876
  lines.push(`Transcript: ${checks.transcriptMB !== null ? `${checks.transcriptMB}MB` : "unknown"}`);
1491
- lines.push(`Last success: ${checks.staleSec > 0 ? `${checks.staleSec}s ago` : lastSuccessTs > 0 ? "just now" : "never"}`);
877
+ lines.push(`Last success: ${checks.staleSec > 0 ? `${checks.staleSec}s ago` : resilience.lastSuccessTs > 0 ? "just now" : "never"}`);
1492
878
  lines.push(`Error rate: ${Math.round(checks.errorRate * 100)}% (${checks.errorTotal} samples)`);
1493
879
  lines.push(`Overflow counter: ${checks.overflowCount}/${OVERFLOW_CONSECUTIVE_THRESHOLD}`);
1494
- lines.push(`Last reset: ${lastResetTs > 0 ? `${Math.round((Date.now() - lastResetTs) / 1000)}s ago` : "never"}`);
1495
- lines.push(`Last auto-restart: ${lastAutoRestartTs > 0 ? `${Math.round((Date.now() - lastAutoRestartTs) / 1000)}s ago` : "never"}`);
880
+ lines.push(`Last reset: ${resilience.lastResetTs > 0 ? `${Math.round((Date.now() - resilience.lastResetTs) / 1000)}s ago` : "never"}`);
881
+ lines.push(`Last auto-restart: ${resilience.lastAutoRestartTs > 0 ? `${Math.round((Date.now() - resilience.lastAutoRestartTs) / 1000)}s ago` : "never"}`);
1496
882
  lines.push(`Alerts configured: ${process.env.SINAIN_ALERT_CHAT_ID ? "yes" : "no (SINAIN_ALERT_CHAT_ID not set)"}`);
1497
883
 
1498
884
  if (checks.issues.length > 0) {
@@ -1541,155 +927,8 @@ export default function sinainHudPlugin(api: OpenClawPluginApi): void {
1541
927
  _toolCallId: string,
1542
928
  params: { sessionSummary: string; idle: boolean },
1543
929
  ) {
1544
- const result: Record<string, unknown> = {
1545
- status: "ok",
1546
- gitBackup: null,
1547
- signals: [],
1548
- recommendedAction: { action: "skip", task: null, confidence: 0 },
1549
- output: null,
1550
- skipped: false,
1551
- skipReason: null,
1552
- logWritten: false,
1553
- };
1554
-
1555
- // Helper: run a python script and parse JSON stdout
1556
- const runScript = async (
1557
- args: string[],
1558
- timeoutMs = 60_000,
1559
- ): Promise<Record<string, unknown> | null> => {
1560
- try {
1561
- const out = await api.runtime.system.runCommandWithTimeout(
1562
- ["uv", "run", "--with", "requests", "python3", ...args],
1563
- { timeoutMs, cwd: workspaceDir },
1564
- );
1565
- if (out.code !== 0) {
1566
- api.logger.warn(
1567
- `sinain-hud: heartbeat script failed: ${args[0]} (code ${out.code})\n${out.stderr}`,
1568
- );
1569
- return null;
1570
- }
1571
- return JSON.parse(out.stdout.trim());
1572
- } catch (err) {
1573
- api.logger.warn(
1574
- `sinain-hud: heartbeat script error: ${args[0]}: ${String(err)}`,
1575
- );
1576
- return null;
1577
- }
1578
- };
1579
-
1580
- // Latency tracking helper
1581
- const latencyMs: Record<string, number> = {};
1582
- const heartbeatStart = Date.now();
1583
-
1584
- // 1. Git backup (30s timeout)
1585
- try {
1586
- const t0 = Date.now();
1587
- const gitOut = await api.runtime.system.runCommandWithTimeout(
1588
- ["bash", "sinain-memory/git_backup.sh"],
1589
- { timeoutMs: 30_000, cwd: workspaceDir },
1590
- );
1591
- latencyMs.gitBackup = Date.now() - t0;
1592
- result.gitBackup = gitOut.stdout.trim() || "nothing to commit";
1593
- } catch (err) {
1594
- api.logger.warn(`sinain-hud: git backup error: ${String(err)}`);
1595
- result.gitBackup = `error: ${String(err)}`;
1596
- }
1597
-
1598
- // Current time string for memory scripts
1599
- const hbTz = cfg.userTimezone ?? "Europe/Berlin";
1600
- const currentTimeStr = new Date().toLocaleString("en-GB", {
1601
- timeZone: hbTz, weekday: "long", year: "numeric", month: "long",
1602
- day: "numeric", hour: "2-digit", minute: "2-digit", hour12: false,
1603
- }) + ` (${hbTz})`;
1604
-
1605
- // 2. Signal analysis (60s timeout)
1606
- const signalArgs = [
1607
- "sinain-memory/signal_analyzer.py",
1608
- "--memory-dir", "memory/",
1609
- "--session-summary", params.sessionSummary,
1610
- "--current-time", currentTimeStr,
1611
- ];
1612
- if (params.idle) signalArgs.push("--idle");
1613
-
1614
- const signalT0 = Date.now();
1615
- const signalResult = await runScript(signalArgs, 60_000);
1616
- latencyMs.signalAnalysis = Date.now() - signalT0;
1617
- if (signalResult) {
1618
- result.signals = signalResult.signals ?? [];
1619
- result.recommendedAction = signalResult.recommendedAction ?? {
1620
- action: "skip",
1621
- task: null,
1622
- confidence: 0,
1623
- };
1624
-
1625
- // Fire-and-forget: ingest signal into triple store
1626
- const tickTs = new Date().toISOString();
1627
- runScript([
1628
- "sinain-memory/triple_ingest.py",
1629
- "--memory-dir", "memory/",
1630
- "--tick-ts", tickTs,
1631
- "--signal-result", JSON.stringify(signalResult),
1632
- "--embed",
1633
- ], 15_000).catch(() => {});
1634
- }
1635
-
1636
- // 3. Insight synthesis (60s timeout)
1637
- const synthArgs = [
1638
- "sinain-memory/insight_synthesizer.py",
1639
- "--memory-dir", "memory/",
1640
- "--session-summary", params.sessionSummary,
1641
- "--current-time", currentTimeStr,
1642
- ];
1643
- if (params.idle) synthArgs.push("--idle");
1644
-
1645
- const synthT0 = Date.now();
1646
- const synthResult = await runScript(synthArgs, 60_000);
1647
- latencyMs.insightSynthesis = Date.now() - synthT0;
1648
- if (synthResult) {
1649
- if (synthResult.skip === false) {
1650
- result.output = {
1651
- suggestion: synthResult.suggestion ?? null,
1652
- insight: synthResult.insight ?? null,
1653
- };
1654
- } else {
1655
- result.skipped = true;
1656
- result.skipReason = synthResult.skipReason ?? "synthesizer skipped";
1657
- }
1658
- }
1659
-
1660
- // 4. Write log entry to memory/playbook-logs/YYYY-MM-DD.jsonl
1661
- try {
1662
- const now = new Date();
1663
- const dateStr = now.toISOString().slice(0, 10);
1664
- const logDir = join(workspaceDir, "memory", "playbook-logs");
1665
- if (!existsSync(logDir)) mkdirSync(logDir, { recursive: true });
1666
-
1667
- const totalLatencyMs = Date.now() - heartbeatStart;
1668
- const logEntry = {
1669
- ts: now.toISOString(),
1670
- idle: params.idle,
1671
- sessionHistorySummary: params.sessionSummary,
1672
- signals: result.signals,
1673
- recommendedAction: result.recommendedAction,
1674
- output: result.output,
1675
- skipped: result.skipped,
1676
- skipReason: result.skipReason,
1677
- gitBackup: result.gitBackup,
1678
- latencyMs,
1679
- totalLatencyMs,
1680
- };
1681
-
1682
- writeFileSync(
1683
- join(logDir, `${dateStr}.jsonl`),
1684
- JSON.stringify(logEntry) + "\n",
1685
- { flag: "a" },
1686
- );
1687
- result.logWritten = true;
1688
- } catch (err) {
1689
- api.logger.warn(
1690
- `sinain-hud: failed to write heartbeat log: ${String(err)}`,
1691
- );
1692
- }
930
+ store.setWorkspaceDir(workspaceDir);
931
+ const result = await engine.executeHeartbeatTick(params);
1693
932
 
1694
933
  return {
1695
934
  content: [
@@ -1703,315 +942,6 @@ export default function sinainHudPlugin(api: OpenClawPluginApi): void {
1703
942
  { name: "sinain_heartbeat_tick" },
1704
943
  );
1705
944
 
1706
- // ==========================================================================
1707
- // Effectiveness footer update
1708
- // ==========================================================================
1709
-
1710
- function updateEffectivenessFooter(
1711
- workspaceDir: string,
1712
- effectiveness: Record<string, unknown>,
1713
- ): void {
1714
- const playbookPath = join(workspaceDir, "memory", "sinain-playbook.md");
1715
- if (!existsSync(playbookPath)) return;
1716
- let content = readFileSync(playbookPath, "utf-8");
1717
- const today = new Date().toISOString().slice(0, 10);
1718
- const newFooter = `<!-- effectiveness: outputs=${effectiveness.outputs ?? 0}, positive=${effectiveness.positive ?? 0}, negative=${effectiveness.negative ?? 0}, neutral=${effectiveness.neutral ?? 0}, rate=${effectiveness.rate ?? 0}, updated=${today} -->`;
1719
- const footerRe = /<!--\s*effectiveness:[^>]+-->/;
1720
- if (footerRe.test(content)) {
1721
- content = content.replace(footerRe, newFooter);
1722
- } else {
1723
- content = content.trimEnd() + "\n\n" + newFooter + "\n";
1724
- }
1725
- writeFileSync(playbookPath, content, "utf-8");
1726
- }
1727
-
1728
- // ==========================================================================
1729
- // Curation pipeline (runs on 30-min timer)
1730
- // ==========================================================================
1731
-
1732
- async function runCurationPipeline(workspaceDir: string): Promise<void> {
1733
- const runScript = async (
1734
- args: string[],
1735
- timeoutMs = 90_000,
1736
- ): Promise<Record<string, unknown> | null> => {
1737
- try {
1738
- const result = await api.runtime.system.runCommandWithTimeout(
1739
- ["uv", "run", "--with", "requests", "python3", ...args],
1740
- { timeoutMs, cwd: workspaceDir },
1741
- );
1742
- if (result.code !== 0) {
1743
- api.logger.warn(
1744
- `sinain-hud: curation script failed: ${args[0]} (code ${result.code})\n${result.stderr}`,
1745
- );
1746
- return null;
1747
- }
1748
- return JSON.parse(result.stdout.trim());
1749
- } catch (err) {
1750
- api.logger.warn(
1751
- `sinain-hud: curation script error: ${args[0]}: ${String(err)}`,
1752
- );
1753
- return null;
1754
- }
1755
- };
1756
-
1757
- api.logger.info("sinain-hud: curation pipeline starting");
1758
- const curationLatency: Record<string, number> = {};
1759
-
1760
- // Step 1: Feedback analysis
1761
- const feedbackT0 = Date.now();
1762
- const feedback = await runScript([
1763
- "sinain-memory/feedback_analyzer.py",
1764
- "--memory-dir", "memory/",
1765
- "--session-summary", "periodic curation (plugin timer)",
1766
- ]);
1767
- curationLatency.feedback = Date.now() - feedbackT0;
1768
- const directive = (feedback as Record<string, unknown> | null)?.curateDirective as string ?? "stability";
1769
-
1770
- // Step 2: Memory mining (background task — mines unread daily files)
1771
- const miningT0 = Date.now();
1772
- const mining = await runScript([
1773
- "sinain-memory/memory_miner.py",
1774
- "--memory-dir", "memory/",
1775
- ]);
1776
- curationLatency.mining = Date.now() - miningT0;
1777
- const findings = mining?.findings ? JSON.stringify(mining.findings) : null;
1778
-
1779
- // Fire-and-forget: ingest mining results into triple store
1780
- if (mining) {
1781
- runScript([
1782
- "sinain-memory/triple_ingest.py",
1783
- "--memory-dir", "memory/",
1784
- "--ingest-mining", JSON.stringify(mining),
1785
- "--embed",
1786
- ], 15_000).catch(() => {});
1787
- }
1788
-
1789
- // Step 3: Playbook curation
1790
- const curatorArgs = [
1791
- "sinain-memory/playbook_curator.py",
1792
- "--memory-dir", "memory/",
1793
- "--session-summary", "periodic curation (plugin timer)",
1794
- "--curate-directive", directive,
1795
- ];
1796
- if (findings) {
1797
- curatorArgs.push("--mining-findings", findings);
1798
- }
1799
- const curatorT0 = Date.now();
1800
- const curator = await runScript(curatorArgs);
1801
- curationLatency.curation = Date.now() - curatorT0;
1802
-
1803
- // Fire-and-forget: ingest playbook patterns into triple store
1804
- runScript([
1805
- "sinain-memory/triple_ingest.py",
1806
- "--memory-dir", "memory/",
1807
- "--ingest-playbook",
1808
- "--embed",
1809
- ], 15_000).catch(() => {});
1810
-
1811
- // Step 4: Update effectiveness footer with fresh metrics
1812
- const effectiveness = (feedback as Record<string, unknown> | null)?.effectiveness;
1813
- if (effectiveness && typeof effectiveness === "object") {
1814
- try {
1815
- updateEffectivenessFooter(workspaceDir, effectiveness as Record<string, unknown>);
1816
- } catch (err) {
1817
- api.logger.warn(`sinain-hud: effectiveness footer update failed: ${String(err)}`);
1818
- }
1819
- }
1820
-
1821
- // Step 5: Regenerate effective playbook after curation
1822
- generateEffectivePlaybook(workspaceDir, api.logger);
1823
-
1824
- // Step 6: Tick evaluation (runs mechanical + sampled judges)
1825
- await runScript([
1826
- "sinain-memory/tick_evaluator.py",
1827
- "--memory-dir", "memory/",
1828
- ], 120_000);
1829
-
1830
- // Step 7: Daily eval report (run once per day after 03:00 UTC)
1831
- const nowUTC = new Date();
1832
- const todayStr = nowUTC.toISOString().slice(0, 10);
1833
- if (nowUTC.getUTCHours() >= 3 && lastEvalReportDate !== todayStr) {
1834
- await runScript([
1835
- "sinain-memory/eval_reporter.py",
1836
- "--memory-dir", "memory/",
1837
- ], 120_000);
1838
- lastEvalReportDate = todayStr;
1839
- }
1840
-
1841
- // Log result with curation latency
1842
- const changes = (curator as Record<string, unknown> | null)?.changes ?? "unknown";
1843
- api.logger.info(
1844
- `sinain-hud: curation pipeline complete (directive=${directive}, changes=${JSON.stringify(changes)}, latency=${JSON.stringify(curationLatency)})`,
1845
- );
1846
-
1847
- // Write curation result to playbook-logs so eval_reporter can track churn
1848
- if (curator) {
1849
- try {
1850
- const dateStr = new Date().toISOString().slice(0, 10);
1851
- const logDir = join(workspaceDir, "memory", "playbook-logs");
1852
- const curatorChanges = (curator as Record<string, unknown>).changes as Record<string, string[]> | undefined;
1853
- const curationEntry = {
1854
- _type: "curation",
1855
- ts: new Date().toISOString(),
1856
- directive,
1857
- playbookChanges: {
1858
- added: curatorChanges?.added ?? [],
1859
- pruned: curatorChanges?.pruned ?? [],
1860
- promoted: curatorChanges?.promoted ?? [],
1861
- playbookLines: (curator as Record<string, unknown>).playbookLines ?? 0,
1862
- },
1863
- latencyMs: curationLatency,
1864
- };
1865
- writeFileSync(
1866
- join(logDir, `${dateStr}.jsonl`),
1867
- JSON.stringify(curationEntry) + "\n",
1868
- { flag: "a" },
1869
- );
1870
- } catch (err) {
1871
- api.logger.warn(`sinain-hud: failed to write curation log entry: ${String(err)}`);
1872
- }
1873
- }
1874
- }
1875
-
1876
- // ==========================================================================
1877
- // Health watchdog helpers
1878
- // ==========================================================================
1879
-
1880
- function getStateDir(): string | null {
1881
- // State dir is the parent of the workspace dir (e.g. /home/node/.openclaw)
1882
- if (!lastWorkspaceDir) return null;
1883
- return dirname(lastWorkspaceDir);
1884
- }
1885
-
1886
- function getTranscriptSize(): { path: string; bytes: number } | null {
1887
- const sessionsJsonPath = getSessionsJsonPath();
1888
- if (!sessionsJsonPath || !cfg.sessionKey) return null;
1889
- try {
1890
- const sessionsData = JSON.parse(readFileSync(sessionsJsonPath, "utf-8"));
1891
- const session = sessionsData[cfg.sessionKey];
1892
- const transcriptPath = session?.sessionFile as string | undefined;
1893
- if (!transcriptPath || !existsSync(transcriptPath)) return null;
1894
- return { path: transcriptPath, bytes: statSync(transcriptPath).size };
1895
- } catch {
1896
- return null;
1897
- }
1898
- }
1899
-
1900
- function runHealthChecks(): {
1901
- transcriptMB: number | null;
1902
- staleSec: number;
1903
- errorRate: number;
1904
- errorTotal: number;
1905
- overflowCount: number;
1906
- resetRecently: boolean;
1907
- issues: string[];
1908
- } {
1909
- const transcript = getTranscriptSize();
1910
- const transcriptMB = transcript ? +(transcript.bytes / 1_000_000).toFixed(2) : null;
1911
- const staleSec = lastSuccessTs > 0 ? Math.round((Date.now() - lastSuccessTs) / 1000) : 0;
1912
- const { rate, total } = computeErrorRate();
1913
- const resetRecently = lastResetTs > 0 && (Date.now() - lastResetTs) < STALENESS_CRITICAL_MS * 2;
1914
-
1915
- const issues: string[] = [];
1916
- if (transcriptMB !== null && transcript!.bytes >= SESSION_SIZE_WARNING_BYTES) {
1917
- issues.push(`transcript ${transcriptMB}MB (threshold ${(SESSION_SIZE_WARNING_BYTES / 1_000_000).toFixed(1)}MB)`);
1918
- }
1919
- if (lastSuccessTs > 0 && (Date.now() - lastSuccessTs) >= STALENESS_WARNING_MS && recentOutcomes.length >= 3) {
1920
- issues.push(`stale ${staleSec}s since last success`);
1921
- }
1922
- if (total >= 5 && rate > 0.5) {
1923
- issues.push(`error rate ${Math.round(rate * 100)}% (${total} samples)`);
1924
- }
1925
- if (consecutiveOverflowErrors >= 3) {
1926
- issues.push(`overflow errors ${consecutiveOverflowErrors}/${OVERFLOW_CONSECUTIVE_THRESHOLD}`);
1927
- }
1928
- if (resetRecently && lastSuccessTs > 0 && lastSuccessTs < lastResetTs) {
1929
- issues.push("post-reset stall (no success since reset)");
1930
- }
1931
-
1932
- return { transcriptMB, staleSec, errorRate: rate, errorTotal: total, overflowCount: consecutiveOverflowErrors, resetRecently, issues };
1933
- }
1934
-
1935
- async function runHealthWatchdog(): Promise<void> {
1936
- const stateDir = getStateDir();
1937
- if (!stateDir) return;
1938
-
1939
- const transcript = getTranscriptSize();
1940
- const now = Date.now();
1941
-
1942
- // ── Layer 1: Proactive session size check ────────────────────────────
1943
- if (transcript && transcript.bytes >= SESSION_SIZE_WARNING_BYTES) {
1944
- const sizeMB = (transcript.bytes / 1_000_000).toFixed(1);
1945
-
1946
- if (transcript.bytes >= SESSION_SIZE_RESTART_BYTES) {
1947
- // Critical — force reset
1948
- api.logger.warn(`sinain-hud: watchdog — transcript ${sizeMB}MB, forcing overflow reset`);
1949
- if (performOverflowReset()) {
1950
- lastResetTs = now;
1951
- consecutiveOverflowErrors = 0;
1952
- sendTelegramAlert("proactive_reset", "⚠️ *sinain-hud* proactive session reset", `• Transcript was ${sizeMB}MB → truncated\n• No downtime expected`, stateDir);
1953
- }
1954
- } else {
1955
- // Warning — proactive reset at 1.5MB
1956
- api.logger.info(`sinain-hud: watchdog — transcript ${sizeMB}MB, proactive reset`);
1957
- if (performOverflowReset()) {
1958
- lastResetTs = now;
1959
- consecutiveOverflowErrors = 0;
1960
- sendTelegramAlert("proactive_reset", "⚠️ *sinain-hud* proactive session reset", `• Transcript was ${sizeMB}MB → truncated\n• No downtime expected`, stateDir);
1961
- }
1962
- }
1963
- }
1964
-
1965
- // ── Staleness check ──────────────────────────────────────────────────
1966
- if (lastSuccessTs > 0 && recentOutcomes.length >= 3) {
1967
- const staleMs = now - lastSuccessTs;
1968
-
1969
- if (staleMs >= STALENESS_WARNING_MS && staleMs < STALENESS_CRITICAL_MS) {
1970
- const staleMin = Math.round(staleMs / 60_000);
1971
- sendTelegramAlert("staleness_warning", "⚠️ *sinain-hud* response stale",
1972
- `• No successful run in ${staleMin}min\n• Error rate: ${Math.round(computeErrorRate().rate * 100)}%`,
1973
- stateDir);
1974
- }
1975
- }
1976
-
1977
- // ── Layer 2: Emergency restart — reset didn't recover ────────────────
1978
- if (lastResetTs > 0 && lastSuccessTs > 0 && lastSuccessTs < lastResetTs) {
1979
- const sinceResetMs = now - lastResetTs;
1980
- if (sinceResetMs >= STALENESS_CRITICAL_MS) {
1981
- // Reset was performed but no success since → queue is jammed
1982
- const canRestart = (now - lastAutoRestartTs) >= AUTO_RESTART_COOLDOWN_MS;
1983
- if (canRestart) {
1984
- const staleMin = Math.round((now - lastSuccessTs) / 60_000);
1985
- api.logger.warn(`sinain-hud: EMERGENCY RESTART — reset ${Math.round(sinceResetMs / 60_000)}min ago, no recovery`);
1986
- // Send alert BEFORE exit so user sees it
1987
- await sendTelegramAlert("emergency_restart", "🔴 *sinain-hud* EMERGENCY RESTART",
1988
- `• Queue jammed — reset didn't recover in ${Math.round(sinceResetMs / 60_000)}min\n• Last success: ${staleMin}min ago\n• Gateway restarting now (~5s)`,
1989
- stateDir);
1990
- lastAutoRestartTs = now;
1991
- // Give Telegram a moment to deliver
1992
- await new Promise((r) => setTimeout(r, 1000));
1993
- process.exit(1);
1994
- } else {
1995
- api.logger.warn("sinain-hud: watchdog — would restart but cooldown active (max 1/hour)");
1996
- }
1997
- }
1998
- }
1999
-
2000
- // ── Error rate alert ─────────────────────────────────────────────────
2001
- const { rate, total } = computeErrorRate();
2002
- if (total >= 5 && rate > 0.5) {
2003
- sendTelegramAlert("high_error_rate", "⚠️ *sinain-hud* high error rate",
2004
- `• ${Math.round(rate * 100)}% failures over ${total} samples\n• Consecutive overflow errors: ${consecutiveOverflowErrors}/${OVERFLOW_CONSECUTIVE_THRESHOLD}`,
2005
- stateDir);
2006
- }
2007
-
2008
- // ── Overflow approaching threshold ───────────────────────────────────
2009
- if (consecutiveOverflowErrors >= 3 && consecutiveOverflowErrors < OVERFLOW_CONSECUTIVE_THRESHOLD) {
2010
- sendTelegramAlert("overflow_warning", "⚠️ *sinain-hud* overflow errors accumulating",
2011
- `• ${consecutiveOverflowErrors}/${OVERFLOW_CONSECUTIVE_THRESHOLD} consecutive overflow errors\n• Auto-reset will trigger at ${OVERFLOW_CONSECUTIVE_THRESHOLD}`,
2012
- stateDir);
2013
- }
2014
- }
2015
945
 
2016
946
  // ==========================================================================
2017
947
  // Service registration
@@ -2024,42 +954,24 @@ export default function sinainHudPlugin(api: OpenClawPluginApi): void {
2024
954
  `sinain-hud: service started (heartbeat: ${cfg.heartbeatPath ?? "not configured"})`,
2025
955
  );
2026
956
 
2027
- // Start health watchdog — runs every 5 minutes, independent of curation
2028
- watchdogInterval = setInterval(() => {
2029
- runHealthWatchdog().catch((err) => {
2030
- api.logger.warn(`sinain-hud: watchdog error: ${String(err)}`);
2031
- });
2032
- }, WATCHDOG_INTERVAL_MS);
2033
- api.logger.info("sinain-hud: health watchdog started (5-min interval)");
957
+ // Start health watchdog — runs every 5 minutes
958
+ watchdog.start();
2034
959
 
2035
960
  // Start curation timer — runs every 30 minutes
2036
- curationInterval = setInterval(async () => {
2037
- // Skip curation during outage — scripts would work (OpenRouter) but
2038
- // results are wasted when no agent runs succeed
2039
- if (outageDetected) {
2040
- api.logger.info("sinain-hud: curation skipped — outage active");
2041
- return;
2042
- }
2043
-
2044
- // Find workspace dir from active sessions or last known
2045
- let workspaceDir: string | undefined;
961
+ const resolveWorkspaceDir = (): string | null => {
2046
962
  for (const state of sessionStates.values()) {
2047
- if (state.workspaceDir) { workspaceDir = state.workspaceDir; break; }
2048
- }
2049
- workspaceDir ??= lastWorkspaceDir ?? undefined;
2050
- if (!workspaceDir) {
2051
- api.logger.info("sinain-hud: curation skipped — no workspace dir");
2052
- return;
2053
- }
2054
- try {
2055
- await runCurationPipeline(workspaceDir);
2056
- } catch (err) {
2057
- api.logger.warn(`sinain-hud: curation pipeline error: ${String(err)}`);
963
+ if (state.workspaceDir) return state.workspaceDir;
2058
964
  }
965
+ return lastWorkspaceDir;
966
+ };
967
+ engine.startCurationTimer(
968
+ () => resilience.outageDetected,
969
+ resolveWorkspaceDir,
970
+ );
2059
971
 
2060
- // ── Proactive session hygiene ──────────────────────────────────
2061
- // Check sinain session size/age and archive+truncate if needed.
2062
- // This prevents context bloat from causing cascading RPC timeouts.
972
+ // Proactive session hygiene on a 30-min curation cycle
973
+ // (piggybacks on the curation timer checked after each pipeline run)
974
+ setInterval(() => {
2063
975
  try {
2064
976
  const sessionsJsonPath = getSessionsJsonPath();
2065
977
  if (sessionsJsonPath && cfg.sessionKey) {
@@ -2073,13 +985,13 @@ export default function sinainHudPlugin(api: OpenClawPluginApi): void {
2073
985
  const ageMs = Date.now() - createdAt;
2074
986
  if (size > SESSION_HYGIENE_SIZE_BYTES || ageMs > SESSION_HYGIENE_AGE_MS) {
2075
987
  api.logger.info(
2076
- `sinain-hud: proactive session hygiene \u2014 size=${Math.round(size / 1024)}KB, age=${Math.round(ageMs / 3600000)}h`,
988
+ `sinain-hud: proactive session hygiene size=${Math.round(size / 1024)}KB, age=${Math.round(ageMs / 3600000)}h`,
2077
989
  );
2078
990
  if (performOverflowReset()) {
2079
- consecutiveOverflowErrors = 0;
2080
- outageDetected = false;
2081
- consecutiveFailures = 0;
2082
- outageStartTs = 0;
991
+ resilience.consecutiveOverflowErrors = 0;
992
+ resilience.outageDetected = false;
993
+ resilience.consecutiveFailures = 0;
994
+ resilience.outageStartTs = 0;
2083
995
  }
2084
996
  }
2085
997
  }
@@ -2087,17 +999,11 @@ export default function sinainHudPlugin(api: OpenClawPluginApi): void {
2087
999
  } catch (err) {
2088
1000
  api.logger.warn(`sinain-hud: session hygiene check error: ${String(err)}`);
2089
1001
  }
2090
- }, 30 * 60 * 1000); // 30 minutes
1002
+ }, 30 * 60 * 1000);
2091
1003
  },
2092
1004
  stop: () => {
2093
- if (curationInterval) {
2094
- clearInterval(curationInterval);
2095
- curationInterval = null;
2096
- }
2097
- if (watchdogInterval) {
2098
- clearInterval(watchdogInterval);
2099
- watchdogInterval = null;
2100
- }
1005
+ engine.stopCurationTimer();
1006
+ watchdog.stop();
2101
1007
  api.logger.info("sinain-hud: service stopped");
2102
1008
  sessionStates.clear();
2103
1009
  },