@virtengine/openfleet 0.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120) hide show
  1. package/.env.example +914 -0
  2. package/LICENSE +190 -0
  3. package/README.md +500 -0
  4. package/agent-endpoint.mjs +918 -0
  5. package/agent-hook-bridge.mjs +230 -0
  6. package/agent-hooks.mjs +1188 -0
  7. package/agent-pool.mjs +2403 -0
  8. package/agent-prompts.mjs +689 -0
  9. package/agent-sdk.mjs +141 -0
  10. package/anomaly-detector.mjs +1195 -0
  11. package/autofix.mjs +1294 -0
  12. package/claude-shell.mjs +708 -0
  13. package/cli.mjs +906 -0
  14. package/codex-config.mjs +1274 -0
  15. package/codex-model-profiles.mjs +135 -0
  16. package/codex-shell.mjs +762 -0
  17. package/config-doctor.mjs +613 -0
  18. package/config.mjs +1720 -0
  19. package/conflict-resolver.mjs +248 -0
  20. package/container-runner.mjs +450 -0
  21. package/copilot-shell.mjs +827 -0
  22. package/daemon-restart-policy.mjs +56 -0
  23. package/diff-stats.mjs +282 -0
  24. package/error-detector.mjs +829 -0
  25. package/fetch-runtime.mjs +34 -0
  26. package/fleet-coordinator.mjs +838 -0
  27. package/get-telegram-chat-id.mjs +71 -0
  28. package/git-safety.mjs +170 -0
  29. package/github-reconciler.mjs +403 -0
  30. package/hook-profiles.mjs +651 -0
  31. package/kanban-adapter.mjs +4491 -0
  32. package/lib/logger.mjs +645 -0
  33. package/maintenance.mjs +828 -0
  34. package/merge-strategy.mjs +1171 -0
  35. package/monitor.mjs +12207 -0
  36. package/openfleet.config.example.json +115 -0
  37. package/openfleet.schema.json +465 -0
  38. package/package.json +203 -0
  39. package/postinstall.mjs +187 -0
  40. package/pr-cleanup-daemon.mjs +978 -0
  41. package/preflight.mjs +408 -0
  42. package/prepublish-check.mjs +90 -0
  43. package/presence.mjs +328 -0
  44. package/primary-agent.mjs +282 -0
  45. package/publish.mjs +151 -0
  46. package/repo-root.mjs +29 -0
  47. package/restart-controller.mjs +100 -0
  48. package/review-agent.mjs +557 -0
  49. package/rotate-agent-logs.sh +133 -0
  50. package/sdk-conflict-resolver.mjs +973 -0
  51. package/session-tracker.mjs +880 -0
  52. package/setup.mjs +3937 -0
  53. package/shared-knowledge.mjs +410 -0
  54. package/shared-state-manager.mjs +841 -0
  55. package/shared-workspace-cli.mjs +199 -0
  56. package/shared-workspace-registry.mjs +537 -0
  57. package/shared-workspaces.json +18 -0
  58. package/startup-service.mjs +1070 -0
  59. package/sync-engine.mjs +1063 -0
  60. package/task-archiver.mjs +801 -0
  61. package/task-assessment.mjs +550 -0
  62. package/task-claims.mjs +924 -0
  63. package/task-complexity.mjs +581 -0
  64. package/task-executor.mjs +5111 -0
  65. package/task-store.mjs +753 -0
  66. package/telegram-bot.mjs +9281 -0
  67. package/telegram-sentinel.mjs +2010 -0
  68. package/ui/app.js +867 -0
  69. package/ui/app.legacy.js +1464 -0
  70. package/ui/app.monolith.js +2488 -0
  71. package/ui/components/charts.js +226 -0
  72. package/ui/components/chat-view.js +567 -0
  73. package/ui/components/command-palette.js +587 -0
  74. package/ui/components/diff-viewer.js +190 -0
  75. package/ui/components/forms.js +327 -0
  76. package/ui/components/kanban-board.js +451 -0
  77. package/ui/components/session-list.js +305 -0
  78. package/ui/components/shared.js +473 -0
  79. package/ui/index.html +70 -0
  80. package/ui/modules/api.js +297 -0
  81. package/ui/modules/icons.js +461 -0
  82. package/ui/modules/router.js +81 -0
  83. package/ui/modules/settings-schema.js +261 -0
  84. package/ui/modules/state.js +679 -0
  85. package/ui/modules/telegram.js +331 -0
  86. package/ui/modules/utils.js +270 -0
  87. package/ui/styles/animations.css +140 -0
  88. package/ui/styles/base.css +98 -0
  89. package/ui/styles/components.css +1915 -0
  90. package/ui/styles/kanban.css +286 -0
  91. package/ui/styles/layout.css +809 -0
  92. package/ui/styles/sessions.css +827 -0
  93. package/ui/styles/variables.css +188 -0
  94. package/ui/styles.css +141 -0
  95. package/ui/styles.monolith.css +1046 -0
  96. package/ui/tabs/agents.js +1417 -0
  97. package/ui/tabs/chat.js +74 -0
  98. package/ui/tabs/control.js +887 -0
  99. package/ui/tabs/dashboard.js +515 -0
  100. package/ui/tabs/infra.js +537 -0
  101. package/ui/tabs/logs.js +783 -0
  102. package/ui/tabs/settings.js +1487 -0
  103. package/ui/tabs/tasks.js +1385 -0
  104. package/ui-server.mjs +4073 -0
  105. package/update-check.mjs +465 -0
  106. package/utils.mjs +172 -0
  107. package/ve-kanban.mjs +654 -0
  108. package/ve-kanban.ps1 +1365 -0
  109. package/ve-kanban.sh +18 -0
  110. package/ve-orchestrator.mjs +340 -0
  111. package/ve-orchestrator.ps1 +6546 -0
  112. package/ve-orchestrator.sh +18 -0
  113. package/vibe-kanban-wrapper.mjs +41 -0
  114. package/vk-error-resolver.mjs +470 -0
  115. package/vk-log-stream.mjs +914 -0
  116. package/whatsapp-channel.mjs +520 -0
  117. package/workspace-monitor.mjs +581 -0
  118. package/workspace-reaper.mjs +405 -0
  119. package/workspace-registry.mjs +238 -0
  120. package/worktree-manager.mjs +1266 -0
@@ -0,0 +1,1195 @@
1
+ /**
2
+ * anomaly-detector.mjs — Plaintext real-time anomaly detection for VK agent sessions.
3
+ *
4
+ * Detects death loops, stalls, token overflows, rebase spirals, and other
5
+ * wasteful agent behaviors by pattern-matching raw log lines. No AI inference —
6
+ * purely regex/string-based detection for low latency.
7
+ *
8
+ * Integration:
9
+ * Wired into VkLogStream.onLine callback in monitor.mjs.
10
+ * Each log line is fed to processLine(line, meta) which maintains per-process
11
+ * state and emits anomaly events via the onAnomaly callback.
12
+ *
13
+ * Architecture:
14
+ * - Per-process tracking via ProcessState objects
15
+ * - Sliding window counters for rate-based detection
16
+ * - Fingerprinted dedup to avoid alert spam
17
+ * - Severity levels: CRITICAL (kill), HIGH (kill at threshold/warn), MEDIUM (warn), LOW (info)
18
+ * - KILL action triggers at kill thresholds for all anomaly types (not just TOKEN_OVERFLOW)
19
+ * - Active process monitoring only (completed processes archived for analysis)
20
+ *
21
+ * Pattern catalog: See VK_FAILURE_PATTERN_CATALOG.md
22
+ */
23
+
24
+ import { normalizeDedupKey, stripAnsi, escapeHtml } from "./utils.mjs";
25
+
26
+ // ── Severity levels ─────────────────────────────────────────────────────────
27
/**
 * Severity labels attached to every emitted anomaly.
 * Frozen so that consumers importing the enum cannot mutate it by accident.
 */
export const Severity = Object.freeze(
  /** @type {const} */ ({
    CRITICAL: "CRITICAL", // Unrecoverable; used by TOKEN_OVERFLOW together with action "kill"
    HIGH: "HIGH", // Serious; emitted with "kill" once a kill threshold is reached, otherwise "restart"/"warn"
    MEDIUM: "MEDIUM", // Should warn, may need intervention
    LOW: "LOW", // Informational
  }),
);
33
+
34
+ // ── Anomaly types ───────────────────────────────────────────────────────────
35
/**
 * Identifiers for every anomaly category the detector can report.
 * Each key maps to an identical string value; the object is frozen so the
 * shared enum cannot be altered by importing modules.
 */
export const AnomalyType = Object.freeze(
  /** @type {const} */ ({
    TOKEN_OVERFLOW: "TOKEN_OVERFLOW",
    MODEL_NOT_SUPPORTED: "MODEL_NOT_SUPPORTED",
    STREAM_DEATH: "STREAM_DEATH",
    TOOL_CALL_LOOP: "TOOL_CALL_LOOP",
    REBASE_SPIRAL: "REBASE_SPIRAL",
    GIT_PUSH_LOOP: "GIT_PUSH_LOOP",
    SUBAGENT_WASTE: "SUBAGENT_WASTE",
    COMMAND_FAILURE_RATE: "COMMAND_FAILURE_RATE",
    TOOL_FAILURE_CASCADE: "TOOL_FAILURE_CASCADE",
    THOUGHT_SPINNING: "THOUGHT_SPINNING",
    SELF_DEBUG_LOOP: "SELF_DEBUG_LOOP",
    REPEATED_ERROR: "REPEATED_ERROR",
    IDLE_STALL: "IDLE_STALL",
  }),
);
50
+
51
+ // ── Default thresholds (configurable) ───────────────────────────────────────
52
/**
 * Baseline limits used when the caller does not override them.
 * "Warn" values trigger a warning anomaly, "Kill" values trigger a kill action.
 */
const DEFAULT_THRESHOLDS = {
  // Consecutive identical tool calls
  toolCallLoopWarn: 6,
  toolCallLoopKill: 12,

  // Number of `git rebase --continue` commands seen
  rebaseWarn: 10,
  rebaseKill: 25,

  // Number of git push attempts in one session
  gitPushWarn: 4,
  gitPushKill: 8,

  // Number of subagent spawns in one session
  subagentWarn: 10,
  subagentKill: 20,

  // Number of failed tool updates in one session
  toolFailureWarn: 10,
  toolFailureKill: 30,

  // Command failure rate, expressed as a percentage
  commandFailureRateWarn: 25,

  // Occurrences of the same thought text
  thoughtSpinWarn: 25,
  thoughtSpinKill: 50,

  // "Model not supported" failures tolerated before a kill (external issue)
  modelFailureKill: 5,

  // Occurrences of the same error fingerprint
  repeatedErrorWarn: 5,
  repeatedErrorKill: 10,

  // Seconds without any log line
  idleStallWarnSec: 300, // 5 minutes
  idleStallKillSec: 600, // 10 minutes

  // Minimum gap in ms before the same anomaly is alerted again (5 minutes)
  alertDedupWindowMs: 300_000,

  // Inactivity in ms after which process tracking is dropped (30 minutes)
  processCleanupMs: 1_800_000,
};
97
+
98
// Status-style thoughts that agents repeat while a long operation (tests,
// builds, installs) is in flight. They indicate progress rather than a loop,
// so they are excluded from thought-spinning detection.
const THOUGHT_SPINNING_EXCLUSIONS = [
  /^running\s+\w*\s*tests?$/i, // e.g. "Running integration tests", "Running unit tests"
  /^running\s+\w+$/i, // e.g. "Running prettier", "Running eslint"
  /^waiting\s+for\s+/i, // e.g. "Waiting for tests to complete"
  /^installing\s+/i, // e.g. "Installing dependencies"
  /^building\s+/i, // e.g. "Building the project"
  /^compiling\s+/i, // e.g. "Compiling TypeScript"
  /^testing\s+/i, // e.g. "Testing the implementation"
  /^executing\s+/i, // e.g. "Executing the command"
  /^checking\s+/i, // e.g. "Checking test results"
  /^analyzing\s+/i, // e.g. "Analyzing test output"
];

/**
 * Decide whether a thought is an ordinary operational status message,
 * i.e. one matching any entry of THOUGHT_SPINNING_EXCLUSIONS.
 * @param {string} normalized - Lowercase, trimmed thought text
 * @returns {boolean} true when at least one exclusion pattern matches
 */
function isOperationalThought(normalized) {
  for (const pattern of THOUGHT_SPINNING_EXCLUSIONS) {
    if (pattern.test(normalized)) {
      return true;
    }
  }
  return false;
}
123
+
124
+ // ── Per-process state ───────────────────────────────────────────────────────
125
+
126
+ /**
127
+ * @typedef {Object} ProcessState
128
+ * @property {string} processId
129
+ * @property {string} shortId
130
+ * @property {number} lineCount
131
+ * @property {number} firstLineAt
132
+ * @property {number} lastLineAt
133
+ * @property {string|null} lastToolTitle - Last ToolCall title seen
134
+ * @property {number} lastToolCallFingerprint - DJB2 hash of last tool call (minus toolCallId)
135
+ * @property {number} consecutiveSameToolCount - How many times in a row
136
+ * @property {number} rebaseCount - git rebase --continue count
137
+ * @property {number} rebaseAbortCount
138
+ * @property {number} gitPushCount
139
+ * @property {number} subagentCount
140
+ * @property {number} toolFailureCount
141
+ * @property {number} commandCount - Total command executions
142
+ * @property {number} commandFailureCount - Failed command executions
143
+ * @property {Map<string, number>} thoughtCounts - Normalized thought text → count
144
+ * @property {Map<string, number>} errorFingerprints - Error fingerprint → count
145
+ * @property {number} modelFailureCount
146
+ * @property {boolean} isDead - Process known to be dead/finished
147
+ * @property {string|null} taskTitle
148
+ * @property {string|null} branch
149
+ * @property {Set<string>} alertsSent - Dedup keys for alerts already sent
150
+ * @property {Map<string, number>} alertTimestamps - Dedup key → last alert time
151
+ * @property {Map<string, number>} alertEmitCounts - type → total emit count (for escalation)
152
+ */
153
+
154
/**
 * Build the initial tracking record for a newly observed process.
 * All counters start at zero, all collections start empty, and both
 * timestamps are set to the moment of creation.
 * @param {string} processId
 * @returns {ProcessState}
 */
function createProcessState(processId) {
  const createdAt = Date.now();
  const shortId = processId.slice(0, 8); // first 8 characters, used for display

  return {
    // Identity and timing
    processId,
    shortId,
    lineCount: 0,
    firstLineAt: createdAt,
    lastLineAt: createdAt,

    // Tool-call loop tracking
    lastToolTitle: null,
    lastToolCallFingerprint: 0,
    consecutiveSameToolCount: 0,

    // Per-session counters
    rebaseCount: 0,
    rebaseAbortCount: 0,
    gitPushCount: 0,
    subagentCount: 0,
    toolFailureCount: 0,
    commandCount: 0,
    commandFailureCount: 0,

    // Repetition tracking
    thoughtCounts: new Map(),
    errorFingerprints: new Map(),
    modelFailureCount: 0,

    // Lifecycle and descriptive metadata
    isDead: false,
    taskTitle: null,
    branch: null,

    // Alert bookkeeping
    alertsSent: new Set(),
    alertTimestamps: new Map(),
    alertEmitCounts: new Map(),
  };
}
188
+
189
+ // ── Compiled patterns (computed once) ───────────────────────────────────────
190
+
191
// P0 — token overflow; captures the actual token count and the limit
const RE_TOKEN_OVERFLOW = /CAPIError: 400 prompt token count of (\d+) exceeds the limit of (\d+)/;

// P0 — requested model is not supported
const STR_MODEL_NOT_SUPPORTED = "CAPIError: 400 The requested model is not supported";

// P1 — stream ended without a completion event
const STR_STREAM_DEATH = "Stream completed without a response.completed event";

// P1 — rebase spiral (continue vs. abort commands)
const RE_REBASE_CONTINUE = /git rebase --continue/;
const RE_REBASE_ABORT = /git rebase --abort/;

// P2 — tool call (Copilot format); captures the call's title
const RE_TOOL_CALL_TITLE = /"ToolCall"\s*:\s*\{[^}]*"title"\s*:\s*"([^"]+)"/;

// P2 — matches the toolCallId property so it can be removed before
// fingerprinting; the id differs on every call while the content may not
const RE_TOOL_CALL_ID = /"toolCallId"\s*:\s*"[^"]*"\s*,?\s*/g;
212
+
213
// Title prefixes of tools that agents legitimately invoke many times in a
// row during normal edit→test→edit work. Tool calls whose title starts with
// one of these get multiplied loop thresholds to avoid false-positive kills.
const ITERATIVE_TOOL_PREFIXES = [
  "Editing ", // replace_string_in_file, multi_replace_string_in_file
  "Reading ", // read_file
  "Searching ", // grep_search, file_search, semantic_search
  "Listing ", // list_dir, list_code_usages
];
222
+
223
/**
 * DJB2 string hash used to fingerprint tool call lines.
 * Fast and non-cryptographic; the result is kept within signed 32-bit range.
 * @param {string} str
 * @returns {number} signed 32-bit hash (5381 for the empty string)
 */
function djb2Hash(str) {
  let acc = 5381;
  let index = 0;
  while (index < str.length) {
    // acc * 33 + code unit, wrapped to 32 bits
    acc = (Math.imul(acc, 33) + str.charCodeAt(index)) | 0;
    index += 1;
  }
  return acc;
}
236
+
237
/**
 * Report whether a tool title begins with one of ITERATIVE_TOOL_PREFIXES.
 * @param {string} title
 * @returns {boolean}
 */
function isIterativeTool(title) {
  for (const prefix of ITERATIVE_TOOL_PREFIXES) {
    if (title.startsWith(prefix)) {
      return true;
    }
  }
  return false;
}
245
+
246
// P2 — tool failure (Copilot format)
const STR_TOOL_FAILED = '"status":"failed"';
const RE_TOOL_UPDATE_FAILED = /"ToolUpdate"\s*:\s*\{[^}]*"status"\s*:\s*"failed"/;

// P2 — git push command
const RE_GIT_PUSH = /git push(?:\s|$)/;

// P2 — subagent spawn (Copilot format: a ToolCall whose rawInput has a "prompt")
const RE_SUBAGENT_SPAWN = /"ToolCall"\s*:\s*\{[^}]*"rawInput"\s*:\s*\{[^}]*"prompt"\s*:/;

// P3 — command execution outcome (Codex format)
const RE_CMD_FAILED_CODEX = /"type"\s*:\s*"commandExecution"[^}]*"status"\s*:\s*"failed"/;
const RE_CMD_COMPLETED_CODEX = /"type"\s*:\s*"commandExecution"[^}]*"status"\s*:\s*"completed"/;

// P3 — thought text (Copilot format); captures the text
const RE_THOUGHT_TEXT = /"Thought"\s*:\s*\{\s*"type"\s*:\s*"text"\s*,\s*"text"\s*:\s*"([^"]+)"/;

// P3 — reasoning summary (Codex format); captures the first summary entry
const RE_REASONING_SUMMARY = /"type"\s*:\s*"reasoning"[^}]*"summary"\s*:\s*\["([^"]+)"/;

// Keywords in reasoning text that suggest the agent is debugging itself
const SELF_DEBUG_KEYWORDS = [
  "troubleshooting",
  "debugging",
  "analyzing grep",
  "figuring out",
  "retrying",
  "diagnosing",
];

// Patterns that mark a line as an error line
const RE_ERROR_PATTERNS = [
  /\bError:\s/i,
  /\bFailed\b.*\b(?:to|with)\b/i,
  /\bfatal\b/i,
  /\bpanic\b/i,
  /\bCAPIError\b/,
];

// Patterns for lines that mention errors but should not be counted as such
const RE_ERROR_NOISE = [
  /error=0/i,
  /errors: 0/i,
  /no errors/i,
  /\berror handling\b/i,
  /error_count.*:\s*0/i,
  /"status":"completed"/,
  /PASSED/,
];

// Markers that a session has finished
const RE_SESSION_DONE = /"Done"\s*:\s*"/;
const STR_TASK_COMPLETE = "task_complete";
305
+
306
+ // ── Main Detector Class ─────────────────────────────────────────────────────
307
+
308
+ export class AnomalyDetector {
309
+ /** @type {Map<string, ProcessState>} Per-process state (active only) */
310
+ #processes = new Map();
311
+
312
+ /** @type {Map<string, ProcessState>} Completed processes (archived for analysis) */
313
+ #completedProcesses = new Map();
314
+
315
+ /** @type {(anomaly: Anomaly) => void} */
316
+ #onAnomaly;
317
+
318
+ /** @type {(text: string, options?: object) => void} */
319
+ #notify;
320
+
321
+ /** @type {typeof DEFAULT_THRESHOLDS} */
322
+ #thresholds;
323
+
324
+ /** @type {NodeJS.Timeout|null} */
325
+ #cleanupInterval = null;
326
+
327
+ /** @type {NodeJS.Timeout|null} */
328
+ #stallCheckInterval = null;
329
+
330
+ /** @type {Map<string, number>} Global anomaly counters by type */
331
+ #globalCounts = new Map();
332
+
333
+ /** @type {number} Total lines processed */
334
+ #totalLines = 0;
335
+
336
+ /** @type {number} Detector start time */
337
+ #startedAt = Date.now();
338
+
339
+ /**
340
+ * @param {object} options
341
+ * @param {(anomaly: Anomaly) => void} options.onAnomaly - Called when anomaly detected
342
+ * @param {(text: string, options?: object) => void} [options.notify] - Notification function (Telegram)
343
+ * @param {Partial<typeof DEFAULT_THRESHOLDS>} [options.thresholds] - Override defaults
344
+ */
345
+ constructor(options) {
346
+ this.#onAnomaly = options.onAnomaly || (() => {});
347
+ this.#notify = options.notify || (() => {});
348
+ this.#thresholds = { ...DEFAULT_THRESHOLDS, ...options.thresholds };
349
+ }
350
+
351
+ /**
352
+ * Start background timers (stall detection, cleanup).
353
+ * Call once after construction.
354
+ */
355
+ start() {
356
+ // Check for idle stalls every 30 seconds
357
+ this.#stallCheckInterval = setInterval(() => {
358
+ this.#checkStalls();
359
+ }, 30_000);
360
+ this.#stallCheckInterval.unref?.();
361
+
362
+ // Clean up old process state every 10 minutes
363
+ this.#cleanupInterval = setInterval(() => {
364
+ this.#cleanupOldProcesses();
365
+ }, 10 * 60_000);
366
+ this.#cleanupInterval.unref?.();
367
+ }
368
+
369
+ /**
370
+ * Stop background timers.
371
+ */
372
+ stop() {
373
+ if (this.#stallCheckInterval) {
374
+ clearInterval(this.#stallCheckInterval);
375
+ this.#stallCheckInterval = null;
376
+ }
377
+ if (this.#cleanupInterval) {
378
+ clearInterval(this.#cleanupInterval);
379
+ this.#cleanupInterval = null;
380
+ }
381
+ }
382
+
383
+ /**
384
+ * Process a single log line from VkLogStream.
385
+ * This is the main entry point — called from the onLine callback.
386
+ *
387
+ * @param {string} rawLine - Raw log line
388
+ * @param {object} meta - Metadata from VkLogStream
389
+ * @param {string} meta.processId - VK execution process ID
390
+ * @param {string} meta.stream - "stdout" or "stderr"
391
+ * @param {string} [meta.taskTitle] - Task title if known
392
+ * @param {string} [meta.branch] - Git branch if known
393
+ * @param {string} [meta.sessionId] - VK session ID
394
+ * @param {string} [meta.attemptId] - Attempt ID
395
+ */
396
+ processLine(rawLine, meta) {
397
+ if (!rawLine || !meta?.processId) return;
398
+
399
+ const line = stripAnsi(rawLine).trim();
400
+ if (!line) return;
401
+
402
+ this.#totalLines++;
403
+
404
+ // Get or create per-process state
405
+ const pid = meta.processId;
406
+ if (this.#completedProcesses.has(pid)) {
407
+ return;
408
+ }
409
+ let state = this.#processes.get(pid);
410
+ if (!state) {
411
+ state = createProcessState(pid);
412
+ this.#processes.set(pid, state);
413
+ }
414
+
415
+ state.lineCount++;
416
+ state.lastLineAt = Date.now();
417
+ if (meta.taskTitle && !state.taskTitle) state.taskTitle = meta.taskTitle;
418
+ if (meta.branch && !state.branch) state.branch = meta.branch;
419
+
420
+ // Skip further analysis on dead/completed processes
421
+ if (state.isDead) {
422
+ // Archive completed process on first detection
423
+ if (this.#processes.has(pid)) {
424
+ this.#completedProcesses.set(pid, state);
425
+ this.#processes.delete(pid);
426
+ }
427
+ return;
428
+ }
429
+
430
+ // ── Run all detectors ───────────────────────────────────────────
431
+ this.#detectTokenOverflow(line, state);
432
+ this.#detectModelNotSupported(line, state);
433
+ this.#detectStreamDeath(line, state);
434
+ this.#detectToolCallLoop(line, state);
435
+ this.#detectToolFailures(line, state);
436
+ this.#detectRebaseSpiral(line, state);
437
+ this.#detectGitPushLoop(line, state);
438
+ this.#detectSubagentWaste(line, state);
439
+ this.#detectCommandFailures(line, state);
440
+ this.#detectThoughtSpinning(line, state);
441
+ this.#detectSelfDebugLoop(line, state);
442
+ this.#detectRepeatedErrors(line, state);
443
+ this.#detectSessionCompletion(line, state);
444
+
445
+ // Move completed processes out of the active map immediately so stats reflect completion.
446
+ if (state.isDead && this.#processes.has(pid)) {
447
+ this.#completedProcesses.set(pid, state);
448
+ this.#processes.delete(pid);
449
+ }
450
+ }
451
+
452
+ /**
453
+ * Get anomaly statistics across all tracked processes.
454
+ * @returns {object}
455
+ */
456
+ getStats() {
457
+ const stats = {
458
+ uptimeMs: Date.now() - this.#startedAt,
459
+ totalLinesProcessed: this.#totalLines,
460
+ activeProcesses: this.#processes.size,
461
+ completedProcesses: this.#completedProcesses.size,
462
+ deadProcesses: this.#completedProcesses.size,
463
+ anomalyCounts: Object.fromEntries(this.#globalCounts),
464
+ processes: /** @type {object[]} */ ([]),
465
+ };
466
+
467
+ for (const [pid, state] of this.#processes) {
468
+ stats.processes.push({
469
+ shortId: state.shortId,
470
+ taskTitle: state.taskTitle || "(unknown)",
471
+ lineCount: state.lineCount,
472
+ isDead: state.isDead,
473
+ toolFailures: state.toolFailureCount,
474
+ rebaseCount: state.rebaseCount,
475
+ gitPushCount: state.gitPushCount,
476
+ subagentCount: state.subagentCount,
477
+ modelFailures: state.modelFailureCount,
478
+ consecutiveSameToolCount: state.consecutiveSameToolCount,
479
+ lastToolTitle: state.lastToolTitle,
480
+ idleSec: Math.round((Date.now() - state.lastLineAt) / 1000),
481
+ alertEmitCounts: Object.fromEntries(state.alertEmitCounts),
482
+ runtimeMin: Math.round((Date.now() - state.firstLineAt) / 60_000),
483
+ });
484
+ }
485
+
486
+ return stats;
487
+ }
488
+
489
+ /**
490
+ * Get a formatted status string suitable for Telegram /status command.
491
+ * @returns {string}
492
+ */
493
+ getStatusReport() {
494
+ const s = this.getStats();
495
+ const uptimeMin = Math.round(s.uptimeMs / 60_000);
496
+ const lines = [
497
+ `<b>🔍 Anomaly Detector Status</b>`,
498
+ `Uptime: ${uptimeMin}m | Lines: ${s.totalLinesProcessed.toLocaleString()}`,
499
+ `Active: ${s.activeProcesses} | Completed: ${s.completedProcesses}`,
500
+ ];
501
+
502
+ const counts = Object.entries(s.anomalyCounts);
503
+ if (counts.length > 0) {
504
+ lines.push(
505
+ `\n<b>Anomalies detected:</b>`,
506
+ ...counts.map(([type, count]) => ` ${type}: ${count}`),
507
+ );
508
+ } else {
509
+ lines.push(`\nNo anomalies detected.`);
510
+ }
511
+
512
+ // Show any active concerns
513
+ for (const proc of s.processes) {
514
+ if (proc.isDead) continue;
515
+ const concerns = [];
516
+ if (proc.consecutiveSameToolCount >= this.#thresholds.toolCallLoopWarn) {
517
+ concerns.push(
518
+ `tool loop (${proc.consecutiveSameToolCount}x ${proc.lastToolTitle})`,
519
+ );
520
+ }
521
+ if (proc.rebaseCount >= this.#thresholds.rebaseWarn) {
522
+ concerns.push(`rebase spiral (${proc.rebaseCount})`);
523
+ }
524
+ if (proc.gitPushCount >= this.#thresholds.gitPushWarn) {
525
+ concerns.push(`push loop (${proc.gitPushCount})`);
526
+ }
527
+ if (proc.idleSec >= this.#thresholds.idleStallWarnSec) {
528
+ concerns.push(`idle ${proc.idleSec}s`);
529
+ }
530
+ // Show circuit-breaker escalation status
531
+ const escalated = Object.entries(proc.alertEmitCounts || {}).filter(
532
+ ([, c]) => c >= 3,
533
+ );
534
+ if (escalated.length > 0) {
535
+ concerns.push(
536
+ `escalated: ${escalated.map(([t, c]) => `${t}(${c}x)`).join(", ")}`,
537
+ );
538
+ }
539
+ if (proc.runtimeMin >= 60) {
540
+ concerns.push(`runtime ${proc.runtimeMin}min`);
541
+ }
542
+ if (concerns.length > 0) {
543
+ lines.push(
544
+ `\n⚠️ <b>${escapeHtml(proc.shortId)}</b> (${escapeHtml(proc.taskTitle || "?")}):`,
545
+ ` ${concerns.join(", ")}`,
546
+ );
547
+ }
548
+ }
549
+
550
+ return lines.join("\n");
551
+ }
552
+
553
+ /**
554
+ * Reset a specific process's state (e.g., after restart).
555
+ * @param {string} processId
556
+ */
557
+ resetProcess(processId) {
558
+ this.#processes.delete(processId);
559
+ }
560
+
561
+ // ── Detector methods ──────────────────────────────────────────────────────
562
+
563
+ /**
564
+ * P0: Token count exceeds model limit — instant death.
565
+ */
566
+ #detectTokenOverflow(line, state) {
567
+ const match = RE_TOKEN_OVERFLOW.exec(line);
568
+ if (!match) return;
569
+
570
+ const tokenCount = parseInt(match[1], 10);
571
+ const limit = parseInt(match[2], 10);
572
+ state.isDead = true;
573
+
574
+ this.#emit({
575
+ type: AnomalyType.TOKEN_OVERFLOW,
576
+ severity: Severity.CRITICAL,
577
+ processId: state.processId,
578
+ shortId: state.shortId,
579
+ taskTitle: state.taskTitle,
580
+ message: `Token overflow: ${tokenCount.toLocaleString()} tokens vs ${limit.toLocaleString()} limit (+${(tokenCount - limit).toLocaleString()} over)`,
581
+ data: { tokenCount, limit, overflow: tokenCount - limit },
582
+ action: "kill",
583
+ });
584
+ }
585
+
586
+ /**
587
+ * P0: Model not supported — subagent dies, parent wastes ~90s retrying.
588
+ * While this is an external issue (Azure/model config), after enough failures
589
+ * the agent is wasting compute spinning in retry loops. Kill it so the slot
590
+ * is freed for a fresh attempt that might succeed after config fixes.
591
+ */
592
+ #detectModelNotSupported(line, state) {
593
+ if (!line.includes(STR_MODEL_NOT_SUPPORTED)) return;
594
+
595
+ state.modelFailureCount++;
596
+
597
+ if (state.modelFailureCount >= this.#thresholds.modelFailureKill) {
598
+ this.#emit({
599
+ type: AnomalyType.MODEL_NOT_SUPPORTED,
600
+ severity: Severity.HIGH,
601
+ processId: state.processId,
602
+ shortId: state.shortId,
603
+ taskTitle: state.taskTitle,
604
+ message: `Model not supported — ${state.modelFailureCount} failures, each wasting ~90s in retries`,
605
+ data: { failureCount: state.modelFailureCount },
606
+ action: "kill",
607
+ });
608
+ } else {
609
+ this.#emit({
610
+ type: AnomalyType.MODEL_NOT_SUPPORTED,
611
+ severity: Severity.MEDIUM,
612
+ processId: state.processId,
613
+ shortId: state.shortId,
614
+ taskTitle: state.taskTitle,
615
+ message: `Model not supported failure #${state.modelFailureCount} (~90s wasted per retry)`,
616
+ data: { failureCount: state.modelFailureCount },
617
+ action: "warn",
618
+ });
619
+ }
620
+ }
621
+
622
+ /**
623
+ * P1: Stream completed without response — session is dead.
624
+ */
625
+ #detectStreamDeath(line, state) {
626
+ if (!line.includes(STR_STREAM_DEATH)) return;
627
+
628
+ state.isDead = true;
629
+
630
+ this.#emit({
631
+ type: AnomalyType.STREAM_DEATH,
632
+ severity: Severity.HIGH,
633
+ processId: state.processId,
634
+ shortId: state.shortId,
635
+ taskTitle: state.taskTitle,
636
+ message: "Stream completed without response — session dead",
637
+ data: {},
638
+ action: "restart",
639
+ });
640
+ }
641
+
642
+ /**
643
+ * P2: Consecutive identical tool calls — agent stuck in a loop.
644
+ *
645
+ * KEY: We fingerprint the ENTIRE tool call content (minus the ever-changing
646
+ * toolCallId) so that different edits to the same file are NOT counted as
647
+ * a loop. Only truly identical calls (same title, same arguments, same
648
+ * content) increment the counter.
649
+ *
650
+ * Additionally, known-iterative tools (Editing, Reading, Searching) get
651
+ * multiplied thresholds since agents legitimately call them many times
652
+ * during normal edit→test→edit development cycles.
653
+ */
654
+ #detectToolCallLoop(line, state) {
655
+ const match = RE_TOOL_CALL_TITLE.exec(line);
656
+ if (!match) {
657
+ // Non-tool-call lines don't reset the counter (reasoning/thought between calls is normal)
658
+ return;
659
+ }
660
+
661
+ const title = match[1];
662
+
663
+ // Fingerprint the full tool call content, stripping the toolCallId which
664
+ // changes every invocation. Two calls are "identical" only when both the
665
+ // tool name AND the arguments/content are the same.
666
+ const stripped = line.replace(RE_TOOL_CALL_ID, "");
667
+ const fingerprint = djb2Hash(stripped);
668
+
669
+ if (fingerprint === state.lastToolCallFingerprint && title === state.lastToolTitle) {
670
+ state.consecutiveSameToolCount++;
671
+ } else {
672
+ state.lastToolTitle = title;
673
+ state.lastToolCallFingerprint = fingerprint;
674
+ state.consecutiveSameToolCount = 1;
675
+ }
676
+
677
+ const count = state.consecutiveSameToolCount;
678
+
679
+ // Use elevated thresholds for inherently iterative tools (editing, reading)
680
+ const iterative = isIterativeTool(title);
681
+ const warnThreshold = iterative
682
+ ? this.#thresholds.toolCallLoopWarn * 3
683
+ : this.#thresholds.toolCallLoopWarn;
684
+ const killThreshold = iterative
685
+ ? this.#thresholds.toolCallLoopKill * 3
686
+ : this.#thresholds.toolCallLoopKill;
687
+
688
+ if (count >= killThreshold) {
689
+ this.#emit({
690
+ type: AnomalyType.TOOL_CALL_LOOP,
691
+ severity: Severity.HIGH,
692
+ processId: state.processId,
693
+ shortId: state.shortId,
694
+ taskTitle: state.taskTitle,
695
+ message: `Tool call death loop: "${title}" called ${count}x consecutively (identical content)`,
696
+ data: { tool: title, count, iterative },
697
+ action: "kill",
698
+ });
699
+ } else if (count >= warnThreshold) {
700
+ this.#emit({
701
+ type: AnomalyType.TOOL_CALL_LOOP,
702
+ severity: Severity.MEDIUM,
703
+ processId: state.processId,
704
+ shortId: state.shortId,
705
+ taskTitle: state.taskTitle,
706
+ message: `Tool call loop: "${title}" called ${count}x consecutively (identical content)`,
707
+ data: { tool: title, count, iterative },
708
+ action: "warn",
709
+ });
710
+ }
711
+ }
712
+
713
+ /**
714
+ * P2: Tool failures accumulating.
715
+ */
716
+ #detectToolFailures(line, state) {
717
+ if (!RE_TOOL_UPDATE_FAILED.test(line)) return;
718
+
719
+ state.toolFailureCount++;
720
+
721
+ if (state.toolFailureCount >= this.#thresholds.toolFailureKill) {
722
+ this.#emit({
723
+ type: AnomalyType.TOOL_FAILURE_CASCADE,
724
+ severity: Severity.HIGH,
725
+ processId: state.processId,
726
+ shortId: state.shortId,
727
+ taskTitle: state.taskTitle,
728
+ message: `Tool failure cascade: ${state.toolFailureCount} failures in session`,
729
+ data: { count: state.toolFailureCount },
730
+ action: "kill",
731
+ });
732
+ } else if (state.toolFailureCount >= this.#thresholds.toolFailureWarn) {
733
+ this.#emit({
734
+ type: AnomalyType.TOOL_FAILURE_CASCADE,
735
+ severity: Severity.MEDIUM,
736
+ processId: state.processId,
737
+ shortId: state.shortId,
738
+ taskTitle: state.taskTitle,
739
+ message: `High tool failure rate: ${state.toolFailureCount} failures in session`,
740
+ data: { count: state.toolFailureCount },
741
+ action: "warn",
742
+ });
743
+ }
744
+ }
745
+
746
+ /**
747
+ * P1: Rebase --continue death spiral.
748
+ */
749
+ #detectRebaseSpiral(line, state) {
750
+ if (RE_REBASE_CONTINUE.test(line)) {
751
+ state.rebaseCount++;
752
+ } else if (RE_REBASE_ABORT.test(line)) {
753
+ state.rebaseAbortCount++;
754
+ return; // abort is recovery, don't alert
755
+ } else {
756
+ return;
757
+ }
758
+
759
+ if (state.rebaseCount >= this.#thresholds.rebaseKill) {
760
+ this.#emit({
761
+ type: AnomalyType.REBASE_SPIRAL,
762
+ severity: Severity.HIGH,
763
+ processId: state.processId,
764
+ shortId: state.shortId,
765
+ taskTitle: state.taskTitle,
766
+ message: `Rebase spiral detected: ${state.rebaseCount} rebase --continue attempts`,
767
+ data: {
768
+ rebaseCount: state.rebaseCount,
769
+ abortCount: state.rebaseAbortCount,
770
+ },
771
+ action: "kill",
772
+ });
773
+ } else if (state.rebaseCount >= this.#thresholds.rebaseWarn) {
774
+ this.#emit({
775
+ type: AnomalyType.REBASE_SPIRAL,
776
+ severity: Severity.HIGH,
777
+ processId: state.processId,
778
+ shortId: state.shortId,
779
+ taskTitle: state.taskTitle,
780
+ message: `Rebase spiral: ${state.rebaseCount} rebase --continue attempts`,
781
+ data: {
782
+ rebaseCount: state.rebaseCount,
783
+ abortCount: state.rebaseAbortCount,
784
+ },
785
+ action: "warn",
786
+ });
787
+ }
788
+ }
789
+
790
+ /**
791
+ * P2: Git push retry loop.
792
+ */
793
+ #detectGitPushLoop(line, state) {
794
+ if (!RE_GIT_PUSH.test(line)) return;
795
+
796
+ state.gitPushCount++;
797
+
798
+ if (state.gitPushCount >= this.#thresholds.gitPushKill) {
799
+ this.#emit({
800
+ type: AnomalyType.GIT_PUSH_LOOP,
801
+ severity: Severity.HIGH,
802
+ processId: state.processId,
803
+ shortId: state.shortId,
804
+ taskTitle: state.taskTitle,
805
+ message: `Git push loop detected: ${state.gitPushCount} push attempts`,
806
+ data: { count: state.gitPushCount },
807
+ action: "kill",
808
+ });
809
+ } else if (state.gitPushCount >= this.#thresholds.gitPushWarn) {
810
+ this.#emit({
811
+ type: AnomalyType.GIT_PUSH_LOOP,
812
+ severity: Severity.MEDIUM,
813
+ processId: state.processId,
814
+ shortId: state.shortId,
815
+ taskTitle: state.taskTitle,
816
+ message: `Git push loop: ${state.gitPushCount} push attempts in session`,
817
+ data: { count: state.gitPushCount },
818
+ action: "warn",
819
+ });
820
+ }
821
+ }
822
+
823
+ /**
824
+ * P2: Subagent over-spawning.
825
+ */
826
+ #detectSubagentWaste(line, state) {
827
+ if (!RE_SUBAGENT_SPAWN.test(line)) return;
828
+
829
+ state.subagentCount++;
830
+
831
+ if (state.subagentCount >= this.#thresholds.subagentKill) {
832
+ this.#emit({
833
+ type: AnomalyType.SUBAGENT_WASTE,
834
+ severity: Severity.HIGH,
835
+ processId: state.processId,
836
+ shortId: state.shortId,
837
+ taskTitle: state.taskTitle,
838
+ message: `Excessive subagent spawning: ${state.subagentCount} subagents`,
839
+ data: { count: state.subagentCount },
840
+ action: "kill",
841
+ });
842
+ } else if (state.subagentCount >= this.#thresholds.subagentWarn) {
843
+ this.#emit({
844
+ type: AnomalyType.SUBAGENT_WASTE,
845
+ severity: Severity.MEDIUM,
846
+ processId: state.processId,
847
+ shortId: state.shortId,
848
+ taskTitle: state.taskTitle,
849
+ message: `High subagent count: ${state.subagentCount} subagents spawned`,
850
+ data: { count: state.subagentCount },
851
+ action: "warn",
852
+ });
853
+ }
854
+ }
855
+
856
+ /**
857
+ * P3: Command failure rate tracking (Codex format).
858
+ */
859
+ #detectCommandFailures(line, state) {
860
+ if (RE_CMD_FAILED_CODEX.test(line)) {
861
+ state.commandCount++;
862
+ state.commandFailureCount++;
863
+ } else if (RE_CMD_COMPLETED_CODEX.test(line)) {
864
+ state.commandCount++;
865
+ } else {
866
+ return;
867
+ }
868
+
869
+ // Check failure rate after enough samples
870
+ if (state.commandCount >= 10) {
871
+ const rate = (state.commandFailureCount / state.commandCount) * 100;
872
+ if (rate >= this.#thresholds.commandFailureRateWarn) {
873
+ this.#emit({
874
+ type: AnomalyType.COMMAND_FAILURE_RATE,
875
+ severity: Severity.MEDIUM,
876
+ processId: state.processId,
877
+ shortId: state.shortId,
878
+ taskTitle: state.taskTitle,
879
+ message: `High command failure rate: ${rate.toFixed(0)}% (${state.commandFailureCount}/${state.commandCount})`,
880
+ data: {
881
+ rate,
882
+ failed: state.commandFailureCount,
883
+ total: state.commandCount,
884
+ },
885
+ action: "warn",
886
+ });
887
+ }
888
+ }
889
+ }
890
+
891
+ /**
892
+ * P3: Thought repetition (model spinning/looping).
893
+ */
894
+ #detectThoughtSpinning(line, state) {
895
+ let thoughtText = null;
896
+
897
+ // Copilot format
898
+ const thoughtMatch = RE_THOUGHT_TEXT.exec(line);
899
+ if (thoughtMatch) {
900
+ thoughtText = thoughtMatch[1];
901
+ }
902
+
903
+ if (!thoughtText) return;
904
+
905
+ // Normalize: lowercase, trim, collapse whitespace
906
+ const normalized = thoughtText.toLowerCase().trim().replace(/\s+/g, " ");
907
+ // Skip short fragments — streaming often emits single tokens ("portal",
908
+ // " trust", "the") that accumulate massive counts but aren't real repeated
909
+ // thoughts. Require at least 12 chars (~2-3 words) to count as a trackable
910
+ // thought pattern.
911
+ if (normalized.length < 12) return;
912
+
913
+ // Skip operational status messages — agents running tests, builds, or
914
+ // installations legitimately repeat status thoughts like "Running integration
915
+ // tests" many times. These are progress indicators, not loops.
916
+ if (isOperationalThought(normalized)) return;
917
+
918
+ const count = (state.thoughtCounts.get(normalized) || 0) + 1;
919
+ state.thoughtCounts.set(normalized, count);
920
+
921
+ if (count >= this.#thresholds.thoughtSpinKill) {
922
+ this.#emit({
923
+ type: AnomalyType.THOUGHT_SPINNING,
924
+ severity: Severity.HIGH,
925
+ processId: state.processId,
926
+ shortId: state.shortId,
927
+ taskTitle: state.taskTitle,
928
+ message: `Thought spinning: "${thoughtText}" repeated ${count}x — model may be looping`,
929
+ data: { thought: thoughtText, count },
930
+ action: "kill",
931
+ });
932
+ } else if (count >= this.#thresholds.thoughtSpinWarn) {
933
+ this.#emit({
934
+ type: AnomalyType.THOUGHT_SPINNING,
935
+ severity: Severity.LOW,
936
+ processId: state.processId,
937
+ shortId: state.shortId,
938
+ taskTitle: state.taskTitle,
939
+ message: `Thought repetition: "${thoughtText}" repeated ${count}x`,
940
+ data: { thought: thoughtText, count },
941
+ action: "info",
942
+ });
943
+ }
944
+ }
945
+
946
+ /**
947
+ * P3: Self-debugging reasoning loops (Codex format).
948
+ */
949
+ #detectSelfDebugLoop(line, state) {
950
+ const match = RE_REASONING_SUMMARY.exec(line);
951
+ if (!match) return;
952
+
953
+ const summary = match[1].toLowerCase();
954
+ const isDebug = SELF_DEBUG_KEYWORDS.some((kw) => summary.includes(kw));
955
+ if (!isDebug) return;
956
+
957
+ this.#emit({
958
+ type: AnomalyType.SELF_DEBUG_LOOP,
959
+ severity: Severity.LOW,
960
+ processId: state.processId,
961
+ shortId: state.shortId,
962
+ taskTitle: state.taskTitle,
963
+ message: `Agent self-debugging: "${match[1]}"`,
964
+ data: { summary: match[1] },
965
+ action: "info",
966
+ });
967
+ }
968
+
969
+ /**
970
+ * P3: Repeated error fingerprints.
971
+ */
972
+ #detectRepeatedErrors(line, state) {
973
+ // Only check lines that look like errors
974
+ if (RE_ERROR_NOISE.some((re) => re.test(line))) return;
975
+ if (!RE_ERROR_PATTERNS.some((re) => re.test(line))) return;
976
+
977
+ const fingerprint = normalizeDedupKey(line).slice(0, 120);
978
+ const count = (state.errorFingerprints.get(fingerprint) || 0) + 1;
979
+ state.errorFingerprints.set(fingerprint, count);
980
+
981
+ if (count >= this.#thresholds.repeatedErrorKill) {
982
+ this.#emit({
983
+ type: AnomalyType.REPEATED_ERROR,
984
+ severity: Severity.HIGH,
985
+ processId: state.processId,
986
+ shortId: state.shortId,
987
+ taskTitle: state.taskTitle,
988
+ message: `Repeated error (${count}x): ${line.slice(0, 150)}`,
989
+ data: { fingerprint, count },
990
+ action: "kill",
991
+ });
992
+ } else if (count >= this.#thresholds.repeatedErrorWarn) {
993
+ this.#emit({
994
+ type: AnomalyType.REPEATED_ERROR,
995
+ severity: Severity.MEDIUM,
996
+ processId: state.processId,
997
+ shortId: state.shortId,
998
+ taskTitle: state.taskTitle,
999
+ message: `Repeated error (${count}x): ${line.slice(0, 150)}`,
1000
+ data: { fingerprint, count },
1001
+ action: "warn",
1002
+ });
1003
+ }
1004
+ }
1005
+
1006
+ /**
1007
+ * Detect session completion (mark as dead to stop analysis).
1008
+ */
1009
+ #detectSessionCompletion(line, state) {
1010
+ if (RE_SESSION_DONE.test(line) || line.includes(STR_TASK_COMPLETE)) {
1011
+ state.isDead = true;
1012
+ }
1013
+ }
1014
+
1015
+ // ── Stall detection (timer-based) ─────────────────────────────────────────
1016
+
1017
+ /**
1018
+ * Check all active processes for idle stalls.
1019
+ * Called on a 30-second interval.
1020
+ */
1021
+ #checkStalls() {
1022
+ const now = Date.now();
1023
+ for (const [, state] of this.#processes) {
1024
+ if (state.isDead) continue;
1025
+ if (state.lineCount < 5) continue; // Don't alert on brand-new processes
1026
+
1027
+ const idleMs = now - state.lastLineAt;
1028
+
1029
+ if (idleMs >= this.#thresholds.idleStallKillSec * 1000) {
1030
+ this.#emit({
1031
+ type: AnomalyType.IDLE_STALL,
1032
+ severity: Severity.HIGH,
1033
+ processId: state.processId,
1034
+ shortId: state.shortId,
1035
+ taskTitle: state.taskTitle,
1036
+ message: `Agent may be stalled: no output for ${Math.round(idleMs / 1000)}s`,
1037
+ data: { idleSec: Math.round(idleMs / 1000) },
1038
+ action: "kill",
1039
+ });
1040
+ } else if (idleMs >= this.#thresholds.idleStallWarnSec * 1000) {
1041
+ this.#emit({
1042
+ type: AnomalyType.IDLE_STALL,
1043
+ severity: Severity.MEDIUM,
1044
+ processId: state.processId,
1045
+ shortId: state.shortId,
1046
+ taskTitle: state.taskTitle,
1047
+ message: `Agent may be stalled: no output for ${Math.round(idleMs / 1000)}s`,
1048
+ data: { idleSec: Math.round(idleMs / 1000) },
1049
+ action: "warn",
1050
+ });
1051
+ }
1052
+ }
1053
+ }
1054
+
1055
+ // ── Housekeeping ──────────────────────────────────────────────────────────
1056
+
1057
+ /**
1058
+ * Remove process state for processes inactive beyond cleanup threshold.
1059
+ * Cleans both active and completed process archives.
1060
+ */
1061
+ #cleanupOldProcesses() {
1062
+ const now = Date.now();
1063
+ // Clean active processes
1064
+ for (const [pid, state] of this.#processes) {
1065
+ if (now - state.lastLineAt > this.#thresholds.processCleanupMs) {
1066
+ this.#processes.delete(pid);
1067
+ }
1068
+ }
1069
+ // Clean completed process archives
1070
+ for (const [pid, state] of this.#completedProcesses) {
1071
+ if (now - state.lastLineAt > this.#thresholds.processCleanupMs) {
1072
+ this.#completedProcesses.delete(pid);
1073
+ }
1074
+ }
1075
+ }
1076
+
1077
+ // ── Emission ──────────────────────────────────────────────────────────────
1078
+
1079
+ /**
1080
+ * Emit an anomaly event with dedup protection and auto-escalation.
1081
+ *
1082
+ * Circuit breaker: When a warn-level anomaly fires 3+ times for the same
1083
+ * process (each separated by the dedup window), auto-escalate to
1084
+ * action="kill". This prevents agents from wasting hours in loops that
1085
+ * individually don't cross kill thresholds but collectively indicate a
1086
+ * stuck process.
1087
+ *
1088
+ * @param {Anomaly} anomaly
1089
+ */
1090
+ #emit(anomaly) {
1091
+ // Build dedup key: type + processId + severity (so escalations still fire)
1092
+ const dedupKey = `${anomaly.type}:${anomaly.shortId}:${anomaly.severity}`;
1093
+ const state = this.#processes.get(anomaly.processId);
1094
+
1095
+ if (state) {
1096
+ const now = Date.now();
1097
+ const lastAlert = state.alertTimestamps.get(dedupKey) || 0;
1098
+ if (now - lastAlert < this.#thresholds.alertDedupWindowMs) {
1099
+ return; // Already alerted recently
1100
+ }
1101
+ state.alertTimestamps.set(dedupKey, now);
1102
+
1103
+ // ── Circuit breaker escalation ─────────────────────────────────
1104
+ // Track how many times this anomaly type has been emitted for this
1105
+ // process. If a warn/info action fires 3+ times, auto-escalate
1106
+ // to kill — the process is stuck and won't recover on its own.
1107
+ const emitKey = anomaly.type;
1108
+ const emitCount = (state.alertEmitCounts.get(emitKey) || 0) + 1;
1109
+ state.alertEmitCounts.set(emitKey, emitCount);
1110
+
1111
+ if (anomaly.action === "warn" || anomaly.action === "info") {
1112
+ if (emitCount >= 3) {
1113
+ console.warn(
1114
+ `[anomaly-detector] circuit breaker: ${anomaly.type} fired ${emitCount}x for ${anomaly.shortId} — escalating to KILL`,
1115
+ );
1116
+ anomaly.action = "kill";
1117
+ anomaly.severity = Severity.HIGH;
1118
+ anomaly.message = `[ESCALATED] ${anomaly.message} (${emitCount} alerts over ${Math.round((now - state.firstLineAt) / 60_000)}min)`;
1119
+ }
1120
+ }
1121
+ }
1122
+
1123
+ // Increment global counter
1124
+ const prev = this.#globalCounts.get(anomaly.type) || 0;
1125
+ this.#globalCounts.set(anomaly.type, prev + 1);
1126
+
1127
+ // Invoke callback
1128
+ try {
1129
+ this.#onAnomaly(anomaly);
1130
+ } catch {
1131
+ /* callback error — ignore */
1132
+ }
1133
+
1134
+ // Send notification for HIGH+ severity
1135
+ if (
1136
+ anomaly.severity === Severity.CRITICAL ||
1137
+ anomaly.severity === Severity.HIGH
1138
+ ) {
1139
+ const icon = anomaly.severity === Severity.CRITICAL ? "🔴" : "🟠";
1140
+ const actionLabel =
1141
+ anomaly.action === "kill"
1142
+ ? "⛔ KILL"
1143
+ : anomaly.action === "restart"
1144
+ ? "🔄 RESTART"
1145
+ : "⚠️ ALERT";
1146
+
1147
+ const msg = [
1148
+ `${icon} <b>Anomaly: ${escapeHtml(anomaly.type)}</b>`,
1149
+ `Process: <code>${escapeHtml(anomaly.shortId)}</code>`,
1150
+ anomaly.taskTitle ? `Task: ${escapeHtml(anomaly.taskTitle)}` : null,
1151
+ `${escapeHtml(anomaly.message)}`,
1152
+ `Action: ${actionLabel}`,
1153
+ ]
1154
+ .filter(Boolean)
1155
+ .join("\n");
1156
+
1157
+ try {
1158
+ this.#notify(msg, { parseMode: "HTML", skipDedup: false });
1159
+ } catch {
1160
+ /* notification error — ignore */
1161
+ }
1162
+ }
1163
+ }
1164
+ }
1165
+
1166
+ // ── Factory function ────────────────────────────────────────────────────────
1167
+
1168
/**
 * Build an AnomalyDetector from the given options, start it, and return it.
 *
 * @param {object} options
 * @param {(anomaly: Anomaly) => void} [options.onAnomaly] - Custom anomaly handler
 * @param {(text: string, options?: object) => void} [options.notify] - Telegram notification fn
 * @param {Partial<typeof DEFAULT_THRESHOLDS>} [options.thresholds] - Threshold overrides
 * @returns {AnomalyDetector} The already-started detector instance.
 */
export function createAnomalyDetector(options = {}) {
  const instance = new AnomalyDetector(options);
  instance.start();
  return instance;
}
1182
+
1183
+ /**
1184
+ * @typedef {Object} Anomaly
1185
+ * @property {string} type - AnomalyType value
1186
+ * @property {string} severity - Severity value
1187
+ * @property {string} processId - Full process ID
1188
+ * @property {string} shortId - 8-char short process ID
1189
+ * @property {string|null} taskTitle - Task title if known
1190
+ * @property {string} message - Human-readable description
1191
+ * @property {object} data - Structured data for the anomaly
1192
+ * @property {string} action - Recommended action: "kill" | "restart" | "warn" | "info"
1193
+ */
1194
+
1195
+ export default AnomalyDetector;