@virtengine/openfleet 0.25.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +914 -0
- package/LICENSE +190 -0
- package/README.md +500 -0
- package/agent-endpoint.mjs +918 -0
- package/agent-hook-bridge.mjs +230 -0
- package/agent-hooks.mjs +1188 -0
- package/agent-pool.mjs +2403 -0
- package/agent-prompts.mjs +689 -0
- package/agent-sdk.mjs +141 -0
- package/anomaly-detector.mjs +1195 -0
- package/autofix.mjs +1294 -0
- package/claude-shell.mjs +708 -0
- package/cli.mjs +906 -0
- package/codex-config.mjs +1274 -0
- package/codex-model-profiles.mjs +135 -0
- package/codex-shell.mjs +762 -0
- package/config-doctor.mjs +613 -0
- package/config.mjs +1720 -0
- package/conflict-resolver.mjs +248 -0
- package/container-runner.mjs +450 -0
- package/copilot-shell.mjs +827 -0
- package/daemon-restart-policy.mjs +56 -0
- package/diff-stats.mjs +282 -0
- package/error-detector.mjs +829 -0
- package/fetch-runtime.mjs +34 -0
- package/fleet-coordinator.mjs +838 -0
- package/get-telegram-chat-id.mjs +71 -0
- package/git-safety.mjs +170 -0
- package/github-reconciler.mjs +403 -0
- package/hook-profiles.mjs +651 -0
- package/kanban-adapter.mjs +4491 -0
- package/lib/logger.mjs +645 -0
- package/maintenance.mjs +828 -0
- package/merge-strategy.mjs +1171 -0
- package/monitor.mjs +12207 -0
- package/openfleet.config.example.json +115 -0
- package/openfleet.schema.json +465 -0
- package/package.json +203 -0
- package/postinstall.mjs +187 -0
- package/pr-cleanup-daemon.mjs +978 -0
- package/preflight.mjs +408 -0
- package/prepublish-check.mjs +90 -0
- package/presence.mjs +328 -0
- package/primary-agent.mjs +282 -0
- package/publish.mjs +151 -0
- package/repo-root.mjs +29 -0
- package/restart-controller.mjs +100 -0
- package/review-agent.mjs +557 -0
- package/rotate-agent-logs.sh +133 -0
- package/sdk-conflict-resolver.mjs +973 -0
- package/session-tracker.mjs +880 -0
- package/setup.mjs +3937 -0
- package/shared-knowledge.mjs +410 -0
- package/shared-state-manager.mjs +841 -0
- package/shared-workspace-cli.mjs +199 -0
- package/shared-workspace-registry.mjs +537 -0
- package/shared-workspaces.json +18 -0
- package/startup-service.mjs +1070 -0
- package/sync-engine.mjs +1063 -0
- package/task-archiver.mjs +801 -0
- package/task-assessment.mjs +550 -0
- package/task-claims.mjs +924 -0
- package/task-complexity.mjs +581 -0
- package/task-executor.mjs +5111 -0
- package/task-store.mjs +753 -0
- package/telegram-bot.mjs +9281 -0
- package/telegram-sentinel.mjs +2010 -0
- package/ui/app.js +867 -0
- package/ui/app.legacy.js +1464 -0
- package/ui/app.monolith.js +2488 -0
- package/ui/components/charts.js +226 -0
- package/ui/components/chat-view.js +567 -0
- package/ui/components/command-palette.js +587 -0
- package/ui/components/diff-viewer.js +190 -0
- package/ui/components/forms.js +327 -0
- package/ui/components/kanban-board.js +451 -0
- package/ui/components/session-list.js +305 -0
- package/ui/components/shared.js +473 -0
- package/ui/index.html +70 -0
- package/ui/modules/api.js +297 -0
- package/ui/modules/icons.js +461 -0
- package/ui/modules/router.js +81 -0
- package/ui/modules/settings-schema.js +261 -0
- package/ui/modules/state.js +679 -0
- package/ui/modules/telegram.js +331 -0
- package/ui/modules/utils.js +270 -0
- package/ui/styles/animations.css +140 -0
- package/ui/styles/base.css +98 -0
- package/ui/styles/components.css +1915 -0
- package/ui/styles/kanban.css +286 -0
- package/ui/styles/layout.css +809 -0
- package/ui/styles/sessions.css +827 -0
- package/ui/styles/variables.css +188 -0
- package/ui/styles.css +141 -0
- package/ui/styles.monolith.css +1046 -0
- package/ui/tabs/agents.js +1417 -0
- package/ui/tabs/chat.js +74 -0
- package/ui/tabs/control.js +887 -0
- package/ui/tabs/dashboard.js +515 -0
- package/ui/tabs/infra.js +537 -0
- package/ui/tabs/logs.js +783 -0
- package/ui/tabs/settings.js +1487 -0
- package/ui/tabs/tasks.js +1385 -0
- package/ui-server.mjs +4073 -0
- package/update-check.mjs +465 -0
- package/utils.mjs +172 -0
- package/ve-kanban.mjs +654 -0
- package/ve-kanban.ps1 +1365 -0
- package/ve-kanban.sh +18 -0
- package/ve-orchestrator.mjs +340 -0
- package/ve-orchestrator.ps1 +6546 -0
- package/ve-orchestrator.sh +18 -0
- package/vibe-kanban-wrapper.mjs +41 -0
- package/vk-error-resolver.mjs +470 -0
- package/vk-log-stream.mjs +914 -0
- package/whatsapp-channel.mjs +520 -0
- package/workspace-monitor.mjs +581 -0
- package/workspace-reaper.mjs +405 -0
- package/workspace-registry.mjs +238 -0
- package/worktree-manager.mjs +1266 -0
|
@@ -0,0 +1,1195 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* anomaly-detector.mjs — Plaintext real-time anomaly detection for VK agent sessions.
|
|
3
|
+
*
|
|
4
|
+
* Detects death loops, stalls, token overflows, rebase spirals, and other
|
|
5
|
+
* wasteful agent behaviors by pattern-matching raw log lines. No AI inference —
|
|
6
|
+
* purely regex/string-based detection for low latency.
|
|
7
|
+
*
|
|
8
|
+
* Integration:
|
|
9
|
+
* Wired into VkLogStream.onLine callback in monitor.mjs.
|
|
10
|
+
* Each log line is fed to processLine(line, meta) which maintains per-process
|
|
11
|
+
* state and emits anomaly events via the onAnomaly callback.
|
|
12
|
+
*
|
|
13
|
+
* Architecture:
|
|
14
|
+
* - Per-process tracking via ProcessState objects
|
|
15
|
+
* - Sliding window counters for rate-based detection
|
|
16
|
+
* - Fingerprinted dedup to avoid alert spam
|
|
17
|
+
* - Severity levels: CRITICAL (kill), HIGH (kill at threshold/warn), MEDIUM (warn), LOW (info)
|
|
18
|
+
* - KILL action triggers at kill thresholds for all anomaly types (not just TOKEN_OVERFLOW)
|
|
19
|
+
* - Active process monitoring only (completed processes archived for analysis)
|
|
20
|
+
*
|
|
21
|
+
* Pattern catalog: See VK_FAILURE_PATTERN_CATALOG.md
|
|
22
|
+
*/
|
|
23
|
+
|
|
24
|
+
import { normalizeDedupKey, stripAnsi, escapeHtml } from "./utils.mjs";
|
|
25
|
+
|
|
26
|
+
// ── Severity levels ─────────────────────────────────────────────────────────
|
|
27
|
+
/**
 * Severity labels attached to emitted anomalies.
 *
 * Frozen so that an importer cannot accidentally mutate the shared enum.
 */
export const Severity = Object.freeze(
  /** @type {const} */ ({
    // Unrecoverable failures (e.g. TOKEN_OVERFLOW); emitted with a "kill" action.
    CRITICAL: "CRITICAL",
    // Serious issues; detectors pair this with "kill" or "restart" once a kill threshold is hit.
    HIGH: "HIGH",
    // Should warn, may need intervention.
    MEDIUM: "MEDIUM",
    // Informational.
    LOW: "LOW",
  }),
);
|
|
33
|
+
|
|
34
|
+
// ── Anomaly types ───────────────────────────────────────────────────────────
|
|
35
|
+
/**
 * Identifiers for every anomaly kind the detector can report.
 * Each key maps to an identical string value.
 *
 * Frozen so that an importer cannot accidentally mutate the shared enum.
 */
export const AnomalyType = Object.freeze(
  /** @type {const} */ ({
    TOKEN_OVERFLOW: "TOKEN_OVERFLOW",
    MODEL_NOT_SUPPORTED: "MODEL_NOT_SUPPORTED",
    STREAM_DEATH: "STREAM_DEATH",
    TOOL_CALL_LOOP: "TOOL_CALL_LOOP",
    REBASE_SPIRAL: "REBASE_SPIRAL",
    GIT_PUSH_LOOP: "GIT_PUSH_LOOP",
    SUBAGENT_WASTE: "SUBAGENT_WASTE",
    COMMAND_FAILURE_RATE: "COMMAND_FAILURE_RATE",
    TOOL_FAILURE_CASCADE: "TOOL_FAILURE_CASCADE",
    THOUGHT_SPINNING: "THOUGHT_SPINNING",
    SELF_DEBUG_LOOP: "SELF_DEBUG_LOOP",
    REPEATED_ERROR: "REPEATED_ERROR",
    IDLE_STALL: "IDLE_STALL",
  }),
);
|
|
50
|
+
|
|
51
|
+
// ── Default thresholds (configurable) ───────────────────────────────────────
|
|
52
|
+
const DEFAULT_THRESHOLDS = {
  // Consecutive identical tool calls before warning / killing.
  toolCallLoopWarn: 6,
  toolCallLoopKill: 12,

  // Number of `git rebase --continue` commands seen.
  rebaseWarn: 10,
  rebaseKill: 25,

  // `git push` attempts within one session.
  gitPushWarn: 4,
  gitPushKill: 8,

  // Subagent spawns within one session.
  subagentWarn: 10,
  subagentKill: 20,

  // Tool failures within one session.
  toolFailureWarn: 10,
  toolFailureKill: 30,

  // Command failure rate, in percent, over the sliding window.
  commandFailureRateWarn: 25,

  // Same thought text repeated this many times.
  thoughtSpinWarn: 25,
  thoughtSpinKill: 50,

  // Model-not-supported failures before kill (high threshold: the cause is external).
  modelFailureKill: 5,

  // Occurrences of the same error fingerprint.
  repeatedErrorWarn: 5,
  repeatedErrorKill: 10,

  // Seconds without any log line: warn at 5 minutes, kill at 10 minutes.
  idleStallWarnSec: 5 * 60,
  idleStallKillSec: 10 * 60,

  // The same anomaly is not re-alerted within this many milliseconds.
  alertDedupWindowMs: 5 * 60_000,

  // Tracking for a process is dropped after this many milliseconds of inactivity.
  processCleanupMs: 30 * 60_000,
};
|
|
97
|
+
|
|
98
|
+
// Status-style thoughts that agents legitimately repeat while a long-running
// operation (test suite, build, install) is in progress. They are progress
// indicators rather than loops, so they are excluded from spin detection.
const THOUGHT_SPINNING_EXCLUSIONS = [
  // "Running integration tests", "Running portal tests", "Running unit tests"
  /^running\s+\w*\s*tests?$/i,
  // "Running prettier", "Running eslint"
  /^running\s+\w+$/i,
  // "Waiting for tests to complete"
  /^waiting\s+for\s+/i,
  // "Installing dependencies"
  /^installing\s+/i,
  // "Building the project"
  /^building\s+/i,
  // "Compiling TypeScript"
  /^compiling\s+/i,
  // "Testing the implementation"
  /^testing\s+/i,
  // "Executing the command"
  /^executing\s+/i,
  // "Checking test results"
  /^checking\s+/i,
  // "Analyzing test output"
  /^analyzing\s+/i,
];

/**
 * Tell whether a thought matches one of the operational status patterns and
 * should therefore not count toward thought-spinning detection.
 *
 * @param {string} normalized - Lowercase, trimmed thought text
 * @returns {boolean} true when any exclusion pattern matches
 */
function isOperationalThought(normalized) {
  for (const pattern of THOUGHT_SPINNING_EXCLUSIONS) {
    if (pattern.test(normalized)) {
      return true;
    }
  }
  return false;
}
|
|
123
|
+
|
|
124
|
+
// ── Per-process state ───────────────────────────────────────────────────────
|
|
125
|
+
|
|
126
|
+
/**
|
|
127
|
+
* @typedef {Object} ProcessState
|
|
128
|
+
* @property {string} processId
|
|
129
|
+
* @property {string} shortId
|
|
130
|
+
* @property {number} lineCount
|
|
131
|
+
* @property {number} firstLineAt
|
|
132
|
+
* @property {number} lastLineAt
|
|
133
|
+
* @property {string|null} lastToolTitle - Last ToolCall title seen
|
|
134
|
+
* @property {number} lastToolCallFingerprint - DJB2 hash of last tool call (minus toolCallId)
|
|
135
|
+
* @property {number} consecutiveSameToolCount - How many times in a row
|
|
136
|
+
* @property {number} rebaseCount - git rebase --continue count
|
|
137
|
+
* @property {number} rebaseAbortCount
|
|
138
|
+
* @property {number} gitPushCount
|
|
139
|
+
* @property {number} subagentCount
|
|
140
|
+
* @property {number} toolFailureCount
|
|
141
|
+
* @property {number} commandCount - Total command executions
|
|
142
|
+
* @property {number} commandFailureCount - Failed command executions
|
|
143
|
+
* @property {Map<string, number>} thoughtCounts - Normalized thought text → count
|
|
144
|
+
* @property {Map<string, number>} errorFingerprints - Error fingerprint → count
|
|
145
|
+
* @property {number} modelFailureCount
|
|
146
|
+
* @property {boolean} isDead - Process known to be dead/finished
|
|
147
|
+
* @property {string|null} taskTitle
|
|
148
|
+
* @property {string|null} branch
|
|
149
|
+
* @property {Set<string>} alertsSent - Dedup keys for alerts already sent
|
|
150
|
+
* @property {Map<string, number>} alertTimestamps - Dedup key → last alert time
|
|
151
|
+
* @property {Map<string, number>} alertEmitCounts - type → total emit count (for escalation)
|
|
152
|
+
*/
|
|
153
|
+
|
|
154
|
+
/**
 * Build the initial tracking record for a newly seen process: all counters at
 * zero, empty maps/sets, and both timestamps set to the creation time.
 *
 * @param {string} processId
 * @returns {ProcessState}
 */
function createProcessState(processId) {
  const createdAt = Date.now();
  const shortId = processId.slice(0, 8);

  /** @type {ProcessState} */
  const freshState = {
    processId,
    shortId,
    lineCount: 0,
    firstLineAt: createdAt,
    lastLineAt: createdAt,
    lastToolTitle: null,
    lastToolCallFingerprint: 0,
    consecutiveSameToolCount: 0,
    rebaseCount: 0,
    rebaseAbortCount: 0,
    gitPushCount: 0,
    subagentCount: 0,
    toolFailureCount: 0,
    commandCount: 0,
    commandFailureCount: 0,
    thoughtCounts: new Map(),
    errorFingerprints: new Map(),
    modelFailureCount: 0,
    isDead: false,
    taskTitle: null,
    branch: null,
    alertsSent: new Set(),
    alertTimestamps: new Map(),
    alertEmitCounts: new Map(),
  };
  return freshState;
}
|
|
188
|
+
|
|
189
|
+
// ── Compiled patterns (computed once) ───────────────────────────────────────
|
|
190
|
+
|
|
191
|
+
// P0 — token overflow; captures (1) the prompt token count and (2) the limit.
const RE_TOKEN_OVERFLOW = /CAPIError: 400 prompt token count of (\d+) exceeds the limit of (\d+)/;

// P0 — the requested model is not supported.
const STR_MODEL_NOT_SUPPORTED = "CAPIError: 400 The requested model is not supported";

// P1 — stream ended without a completion event.
const STR_STREAM_DEATH = "Stream completed without a response.completed event";

// P1 — rebase spiral markers.
const RE_REBASE_CONTINUE = /git rebase --continue/;
const RE_REBASE_ABORT = /git rebase --abort/;

// P2 — tool call (Copilot format); captures the tool title.
const RE_TOOL_CALL_TITLE = /"ToolCall"\s*:\s*\{[^}]*"title"\s*:\s*"([^"]+)"/;

// P2 — matches the toolCallId field so it can be stripped before
// fingerprinting: the id changes on every call, the remaining content is what
// gets compared.
const RE_TOOL_CALL_ID = /"toolCallId"\s*:\s*"[^"]*"\s*,?\s*/g;
|
|
212
|
+
|
|
213
|
+
// Title prefixes of tools that agents legitimately call many times on the same
// file during normal development (edit→test→edit cycles). Matching tools get
// multiplied loop thresholds to avoid false-positive kill signals.
const ITERATIVE_TOOL_PREFIXES = [
  // replace_string_in_file, multi_replace_string_in_file
  "Editing ",
  // read_file
  "Reading ",
  // grep_search, file_search, semantic_search
  "Searching ",
  // list_dir, list_code_usages
  "Listing ",
];
|
|
222
|
+
|
|
223
|
+
/**
 * DJB2 string hash used to fingerprint tool call lines.
 * Fast and non-cryptographic; the result is kept in signed 32-bit range.
 *
 * @param {string} str
 * @returns {number} signed 32-bit hash (5381 for the empty string)
 */
function djb2Hash(str) {
  let acc = 5381;
  let idx = 0;
  while (idx < str.length) {
    const unit = str.charCodeAt(idx);
    // acc * 33 + unit, truncated to int32 by the trailing `| 0`.
    acc = ((acc << 5) + acc + unit) | 0;
    idx += 1;
  }
  return acc;
}
|
|
236
|
+
|
|
237
|
+
/**
 * Tell whether a tool title begins with one of the ITERATIVE_TOOL_PREFIXES.
 *
 * @param {string} title
 * @returns {boolean}
 */
function isIterativeTool(title) {
  for (const prefix of ITERATIVE_TOOL_PREFIXES) {
    if (title.startsWith(prefix)) {
      return true;
    }
  }
  return false;
}
|
|
245
|
+
|
|
246
|
+
// P2 — tool failure (Copilot format).
const STR_TOOL_FAILED = '"status":"failed"';
const RE_TOOL_UPDATE_FAILED = /"ToolUpdate"\s*:\s*\{[^}]*"status"\s*:\s*"failed"/;

// P2 — git push followed by whitespace or end of line.
const RE_GIT_PUSH = /git push(?:\s|$)/;

// P2 — subagent spawn (Copilot format: a ToolCall whose rawInput has a "prompt").
const RE_SUBAGENT_SPAWN = /"ToolCall"\s*:\s*\{[^}]*"rawInput"\s*:\s*\{[^}]*"prompt"\s*:/;

// P3 — command execution outcome (Codex format).
const RE_CMD_FAILED_CODEX = /"type"\s*:\s*"commandExecution"[^}]*"status"\s*:\s*"failed"/;
const RE_CMD_COMPLETED_CODEX = /"type"\s*:\s*"commandExecution"[^}]*"status"\s*:\s*"completed"/;

// P3 — thought tokens (Copilot format); captures the thought text.
const RE_THOUGHT_TEXT = /"Thought"\s*:\s*\{\s*"type"\s*:\s*"text"\s*,\s*"text"\s*:\s*"([^"]+)"/;

// P3 — reasoning summary (Codex format); captures the first summary entry.
const RE_REASONING_SUMMARY = /"type"\s*:\s*"reasoning"[^}]*"summary"\s*:\s*\["([^"]+)"/;
|
|
271
|
+
|
|
272
|
+
// Phrases in reasoning text that indicate the agent is debugging itself.
const SELF_DEBUG_KEYWORDS = [
  "troubleshooting",
  "debugging",
  "analyzing grep",
  "figuring out",
  "retrying",
  "diagnosing",
];
|
|
281
|
+
|
|
282
|
+
// Patterns that mark a log line as an error line.
const RE_ERROR_PATTERNS = [
  /\bError:\s/i,
  /\bFailed\b.*\b(?:to|with)\b/i,
  /\bfatal\b/i,
  /\bpanic\b/i,
  /\bCAPIError\b/,
];

// Noise exclusions: lines matching any of these are not counted as errors.
const RE_ERROR_NOISE = [
  /error=0/i,
  /errors: 0/i,
  /no errors/i,
  /\berror handling\b/i,
  /error_count.*:\s*0/i,
  /"status":"completed"/,
  /PASSED/,
];
|
|
301
|
+
|
|
302
|
+
// Markers that indicate a session has finished.
const RE_SESSION_DONE = /"Done"\s*:\s*"/;
const STR_TASK_COMPLETE = "task_complete";
|
|
305
|
+
|
|
306
|
+
// ── Main Detector Class ─────────────────────────────────────────────────────
|
|
307
|
+
|
|
308
|
+
export class AnomalyDetector {
|
|
309
|
+
/** @type {Map<string, ProcessState>} Per-process state (active only) */
|
|
310
|
+
#processes = new Map();
|
|
311
|
+
|
|
312
|
+
/** @type {Map<string, ProcessState>} Completed processes (archived for analysis) */
|
|
313
|
+
#completedProcesses = new Map();
|
|
314
|
+
|
|
315
|
+
/** @type {(anomaly: Anomaly) => void} */
|
|
316
|
+
#onAnomaly;
|
|
317
|
+
|
|
318
|
+
/** @type {(text: string, options?: object) => void} */
|
|
319
|
+
#notify;
|
|
320
|
+
|
|
321
|
+
/** @type {typeof DEFAULT_THRESHOLDS} */
|
|
322
|
+
#thresholds;
|
|
323
|
+
|
|
324
|
+
/** @type {NodeJS.Timeout|null} */
|
|
325
|
+
#cleanupInterval = null;
|
|
326
|
+
|
|
327
|
+
/** @type {NodeJS.Timeout|null} */
|
|
328
|
+
#stallCheckInterval = null;
|
|
329
|
+
|
|
330
|
+
/** @type {Map<string, number>} Global anomaly counters by type */
|
|
331
|
+
#globalCounts = new Map();
|
|
332
|
+
|
|
333
|
+
/** @type {number} Total lines processed */
|
|
334
|
+
#totalLines = 0;
|
|
335
|
+
|
|
336
|
+
/** @type {number} Detector start time */
|
|
337
|
+
#startedAt = Date.now();
|
|
338
|
+
|
|
339
|
+
/**
|
|
340
|
+
* @param {object} options
|
|
341
|
+
* @param {(anomaly: Anomaly) => void} options.onAnomaly - Called when anomaly detected
|
|
342
|
+
* @param {(text: string, options?: object) => void} [options.notify] - Notification function (Telegram)
|
|
343
|
+
* @param {Partial<typeof DEFAULT_THRESHOLDS>} [options.thresholds] - Override defaults
|
|
344
|
+
*/
|
|
345
|
+
constructor(options) {
|
|
346
|
+
this.#onAnomaly = options.onAnomaly || (() => {});
|
|
347
|
+
this.#notify = options.notify || (() => {});
|
|
348
|
+
this.#thresholds = { ...DEFAULT_THRESHOLDS, ...options.thresholds };
|
|
349
|
+
}
|
|
350
|
+
|
|
351
|
+
/**
|
|
352
|
+
* Start background timers (stall detection, cleanup).
|
|
353
|
+
* Call once after construction.
|
|
354
|
+
*/
|
|
355
|
+
start() {
|
|
356
|
+
// Check for idle stalls every 30 seconds
|
|
357
|
+
this.#stallCheckInterval = setInterval(() => {
|
|
358
|
+
this.#checkStalls();
|
|
359
|
+
}, 30_000);
|
|
360
|
+
this.#stallCheckInterval.unref?.();
|
|
361
|
+
|
|
362
|
+
// Clean up old process state every 10 minutes
|
|
363
|
+
this.#cleanupInterval = setInterval(() => {
|
|
364
|
+
this.#cleanupOldProcesses();
|
|
365
|
+
}, 10 * 60_000);
|
|
366
|
+
this.#cleanupInterval.unref?.();
|
|
367
|
+
}
|
|
368
|
+
|
|
369
|
+
/**
|
|
370
|
+
* Stop background timers.
|
|
371
|
+
*/
|
|
372
|
+
stop() {
|
|
373
|
+
if (this.#stallCheckInterval) {
|
|
374
|
+
clearInterval(this.#stallCheckInterval);
|
|
375
|
+
this.#stallCheckInterval = null;
|
|
376
|
+
}
|
|
377
|
+
if (this.#cleanupInterval) {
|
|
378
|
+
clearInterval(this.#cleanupInterval);
|
|
379
|
+
this.#cleanupInterval = null;
|
|
380
|
+
}
|
|
381
|
+
}
|
|
382
|
+
|
|
383
|
+
/**
|
|
384
|
+
* Process a single log line from VkLogStream.
|
|
385
|
+
* This is the main entry point — called from the onLine callback.
|
|
386
|
+
*
|
|
387
|
+
* @param {string} rawLine - Raw log line
|
|
388
|
+
* @param {object} meta - Metadata from VkLogStream
|
|
389
|
+
* @param {string} meta.processId - VK execution process ID
|
|
390
|
+
* @param {string} meta.stream - "stdout" or "stderr"
|
|
391
|
+
* @param {string} [meta.taskTitle] - Task title if known
|
|
392
|
+
* @param {string} [meta.branch] - Git branch if known
|
|
393
|
+
* @param {string} [meta.sessionId] - VK session ID
|
|
394
|
+
* @param {string} [meta.attemptId] - Attempt ID
|
|
395
|
+
*/
|
|
396
|
+
processLine(rawLine, meta) {
|
|
397
|
+
if (!rawLine || !meta?.processId) return;
|
|
398
|
+
|
|
399
|
+
const line = stripAnsi(rawLine).trim();
|
|
400
|
+
if (!line) return;
|
|
401
|
+
|
|
402
|
+
this.#totalLines++;
|
|
403
|
+
|
|
404
|
+
// Get or create per-process state
|
|
405
|
+
const pid = meta.processId;
|
|
406
|
+
if (this.#completedProcesses.has(pid)) {
|
|
407
|
+
return;
|
|
408
|
+
}
|
|
409
|
+
let state = this.#processes.get(pid);
|
|
410
|
+
if (!state) {
|
|
411
|
+
state = createProcessState(pid);
|
|
412
|
+
this.#processes.set(pid, state);
|
|
413
|
+
}
|
|
414
|
+
|
|
415
|
+
state.lineCount++;
|
|
416
|
+
state.lastLineAt = Date.now();
|
|
417
|
+
if (meta.taskTitle && !state.taskTitle) state.taskTitle = meta.taskTitle;
|
|
418
|
+
if (meta.branch && !state.branch) state.branch = meta.branch;
|
|
419
|
+
|
|
420
|
+
// Skip further analysis on dead/completed processes
|
|
421
|
+
if (state.isDead) {
|
|
422
|
+
// Archive completed process on first detection
|
|
423
|
+
if (this.#processes.has(pid)) {
|
|
424
|
+
this.#completedProcesses.set(pid, state);
|
|
425
|
+
this.#processes.delete(pid);
|
|
426
|
+
}
|
|
427
|
+
return;
|
|
428
|
+
}
|
|
429
|
+
|
|
430
|
+
// ── Run all detectors ───────────────────────────────────────────
|
|
431
|
+
this.#detectTokenOverflow(line, state);
|
|
432
|
+
this.#detectModelNotSupported(line, state);
|
|
433
|
+
this.#detectStreamDeath(line, state);
|
|
434
|
+
this.#detectToolCallLoop(line, state);
|
|
435
|
+
this.#detectToolFailures(line, state);
|
|
436
|
+
this.#detectRebaseSpiral(line, state);
|
|
437
|
+
this.#detectGitPushLoop(line, state);
|
|
438
|
+
this.#detectSubagentWaste(line, state);
|
|
439
|
+
this.#detectCommandFailures(line, state);
|
|
440
|
+
this.#detectThoughtSpinning(line, state);
|
|
441
|
+
this.#detectSelfDebugLoop(line, state);
|
|
442
|
+
this.#detectRepeatedErrors(line, state);
|
|
443
|
+
this.#detectSessionCompletion(line, state);
|
|
444
|
+
|
|
445
|
+
// Move completed processes out of the active map immediately so stats reflect completion.
|
|
446
|
+
if (state.isDead && this.#processes.has(pid)) {
|
|
447
|
+
this.#completedProcesses.set(pid, state);
|
|
448
|
+
this.#processes.delete(pid);
|
|
449
|
+
}
|
|
450
|
+
}
|
|
451
|
+
|
|
452
|
+
/**
|
|
453
|
+
* Get anomaly statistics across all tracked processes.
|
|
454
|
+
* @returns {object}
|
|
455
|
+
*/
|
|
456
|
+
getStats() {
|
|
457
|
+
const stats = {
|
|
458
|
+
uptimeMs: Date.now() - this.#startedAt,
|
|
459
|
+
totalLinesProcessed: this.#totalLines,
|
|
460
|
+
activeProcesses: this.#processes.size,
|
|
461
|
+
completedProcesses: this.#completedProcesses.size,
|
|
462
|
+
deadProcesses: this.#completedProcesses.size,
|
|
463
|
+
anomalyCounts: Object.fromEntries(this.#globalCounts),
|
|
464
|
+
processes: /** @type {object[]} */ ([]),
|
|
465
|
+
};
|
|
466
|
+
|
|
467
|
+
for (const [pid, state] of this.#processes) {
|
|
468
|
+
stats.processes.push({
|
|
469
|
+
shortId: state.shortId,
|
|
470
|
+
taskTitle: state.taskTitle || "(unknown)",
|
|
471
|
+
lineCount: state.lineCount,
|
|
472
|
+
isDead: state.isDead,
|
|
473
|
+
toolFailures: state.toolFailureCount,
|
|
474
|
+
rebaseCount: state.rebaseCount,
|
|
475
|
+
gitPushCount: state.gitPushCount,
|
|
476
|
+
subagentCount: state.subagentCount,
|
|
477
|
+
modelFailures: state.modelFailureCount,
|
|
478
|
+
consecutiveSameToolCount: state.consecutiveSameToolCount,
|
|
479
|
+
lastToolTitle: state.lastToolTitle,
|
|
480
|
+
idleSec: Math.round((Date.now() - state.lastLineAt) / 1000),
|
|
481
|
+
alertEmitCounts: Object.fromEntries(state.alertEmitCounts),
|
|
482
|
+
runtimeMin: Math.round((Date.now() - state.firstLineAt) / 60_000),
|
|
483
|
+
});
|
|
484
|
+
}
|
|
485
|
+
|
|
486
|
+
return stats;
|
|
487
|
+
}
|
|
488
|
+
|
|
489
|
+
/**
|
|
490
|
+
* Get a formatted status string suitable for Telegram /status command.
|
|
491
|
+
* @returns {string}
|
|
492
|
+
*/
|
|
493
|
+
getStatusReport() {
|
|
494
|
+
const s = this.getStats();
|
|
495
|
+
const uptimeMin = Math.round(s.uptimeMs / 60_000);
|
|
496
|
+
const lines = [
|
|
497
|
+
`<b>🔍 Anomaly Detector Status</b>`,
|
|
498
|
+
`Uptime: ${uptimeMin}m | Lines: ${s.totalLinesProcessed.toLocaleString()}`,
|
|
499
|
+
`Active: ${s.activeProcesses} | Completed: ${s.completedProcesses}`,
|
|
500
|
+
];
|
|
501
|
+
|
|
502
|
+
const counts = Object.entries(s.anomalyCounts);
|
|
503
|
+
if (counts.length > 0) {
|
|
504
|
+
lines.push(
|
|
505
|
+
`\n<b>Anomalies detected:</b>`,
|
|
506
|
+
...counts.map(([type, count]) => ` ${type}: ${count}`),
|
|
507
|
+
);
|
|
508
|
+
} else {
|
|
509
|
+
lines.push(`\nNo anomalies detected.`);
|
|
510
|
+
}
|
|
511
|
+
|
|
512
|
+
// Show any active concerns
|
|
513
|
+
for (const proc of s.processes) {
|
|
514
|
+
if (proc.isDead) continue;
|
|
515
|
+
const concerns = [];
|
|
516
|
+
if (proc.consecutiveSameToolCount >= this.#thresholds.toolCallLoopWarn) {
|
|
517
|
+
concerns.push(
|
|
518
|
+
`tool loop (${proc.consecutiveSameToolCount}x ${proc.lastToolTitle})`,
|
|
519
|
+
);
|
|
520
|
+
}
|
|
521
|
+
if (proc.rebaseCount >= this.#thresholds.rebaseWarn) {
|
|
522
|
+
concerns.push(`rebase spiral (${proc.rebaseCount})`);
|
|
523
|
+
}
|
|
524
|
+
if (proc.gitPushCount >= this.#thresholds.gitPushWarn) {
|
|
525
|
+
concerns.push(`push loop (${proc.gitPushCount})`);
|
|
526
|
+
}
|
|
527
|
+
if (proc.idleSec >= this.#thresholds.idleStallWarnSec) {
|
|
528
|
+
concerns.push(`idle ${proc.idleSec}s`);
|
|
529
|
+
}
|
|
530
|
+
// Show circuit-breaker escalation status
|
|
531
|
+
const escalated = Object.entries(proc.alertEmitCounts || {}).filter(
|
|
532
|
+
([, c]) => c >= 3,
|
|
533
|
+
);
|
|
534
|
+
if (escalated.length > 0) {
|
|
535
|
+
concerns.push(
|
|
536
|
+
`escalated: ${escalated.map(([t, c]) => `${t}(${c}x)`).join(", ")}`,
|
|
537
|
+
);
|
|
538
|
+
}
|
|
539
|
+
if (proc.runtimeMin >= 60) {
|
|
540
|
+
concerns.push(`runtime ${proc.runtimeMin}min`);
|
|
541
|
+
}
|
|
542
|
+
if (concerns.length > 0) {
|
|
543
|
+
lines.push(
|
|
544
|
+
`\n⚠️ <b>${escapeHtml(proc.shortId)}</b> (${escapeHtml(proc.taskTitle || "?")}):`,
|
|
545
|
+
` ${concerns.join(", ")}`,
|
|
546
|
+
);
|
|
547
|
+
}
|
|
548
|
+
}
|
|
549
|
+
|
|
550
|
+
return lines.join("\n");
|
|
551
|
+
}
|
|
552
|
+
|
|
553
|
+
/**
|
|
554
|
+
* Reset a specific process's state (e.g., after restart).
|
|
555
|
+
* @param {string} processId
|
|
556
|
+
*/
|
|
557
|
+
resetProcess(processId) {
|
|
558
|
+
this.#processes.delete(processId);
|
|
559
|
+
}
|
|
560
|
+
|
|
561
|
+
// ── Detector methods ──────────────────────────────────────────────────────
|
|
562
|
+
|
|
563
|
+
/**
|
|
564
|
+
* P0: Token count exceeds model limit — instant death.
|
|
565
|
+
*/
|
|
566
|
+
#detectTokenOverflow(line, state) {
|
|
567
|
+
const match = RE_TOKEN_OVERFLOW.exec(line);
|
|
568
|
+
if (!match) return;
|
|
569
|
+
|
|
570
|
+
const tokenCount = parseInt(match[1], 10);
|
|
571
|
+
const limit = parseInt(match[2], 10);
|
|
572
|
+
state.isDead = true;
|
|
573
|
+
|
|
574
|
+
this.#emit({
|
|
575
|
+
type: AnomalyType.TOKEN_OVERFLOW,
|
|
576
|
+
severity: Severity.CRITICAL,
|
|
577
|
+
processId: state.processId,
|
|
578
|
+
shortId: state.shortId,
|
|
579
|
+
taskTitle: state.taskTitle,
|
|
580
|
+
message: `Token overflow: ${tokenCount.toLocaleString()} tokens vs ${limit.toLocaleString()} limit (+${(tokenCount - limit).toLocaleString()} over)`,
|
|
581
|
+
data: { tokenCount, limit, overflow: tokenCount - limit },
|
|
582
|
+
action: "kill",
|
|
583
|
+
});
|
|
584
|
+
}
|
|
585
|
+
|
|
586
|
+
/**
|
|
587
|
+
* P0: Model not supported — subagent dies, parent wastes ~90s retrying.
|
|
588
|
+
* While this is an external issue (Azure/model config), after enough failures
|
|
589
|
+
* the agent is wasting compute spinning in retry loops. Kill it so the slot
|
|
590
|
+
* is freed for a fresh attempt that might succeed after config fixes.
|
|
591
|
+
*/
|
|
592
|
+
#detectModelNotSupported(line, state) {
|
|
593
|
+
if (!line.includes(STR_MODEL_NOT_SUPPORTED)) return;
|
|
594
|
+
|
|
595
|
+
state.modelFailureCount++;
|
|
596
|
+
|
|
597
|
+
if (state.modelFailureCount >= this.#thresholds.modelFailureKill) {
|
|
598
|
+
this.#emit({
|
|
599
|
+
type: AnomalyType.MODEL_NOT_SUPPORTED,
|
|
600
|
+
severity: Severity.HIGH,
|
|
601
|
+
processId: state.processId,
|
|
602
|
+
shortId: state.shortId,
|
|
603
|
+
taskTitle: state.taskTitle,
|
|
604
|
+
message: `Model not supported — ${state.modelFailureCount} failures, each wasting ~90s in retries`,
|
|
605
|
+
data: { failureCount: state.modelFailureCount },
|
|
606
|
+
action: "kill",
|
|
607
|
+
});
|
|
608
|
+
} else {
|
|
609
|
+
this.#emit({
|
|
610
|
+
type: AnomalyType.MODEL_NOT_SUPPORTED,
|
|
611
|
+
severity: Severity.MEDIUM,
|
|
612
|
+
processId: state.processId,
|
|
613
|
+
shortId: state.shortId,
|
|
614
|
+
taskTitle: state.taskTitle,
|
|
615
|
+
message: `Model not supported failure #${state.modelFailureCount} (~90s wasted per retry)`,
|
|
616
|
+
data: { failureCount: state.modelFailureCount },
|
|
617
|
+
action: "warn",
|
|
618
|
+
});
|
|
619
|
+
}
|
|
620
|
+
}
|
|
621
|
+
|
|
622
|
+
/**
|
|
623
|
+
* P1: Stream completed without response — session is dead.
|
|
624
|
+
*/
|
|
625
|
+
#detectStreamDeath(line, state) {
|
|
626
|
+
if (!line.includes(STR_STREAM_DEATH)) return;
|
|
627
|
+
|
|
628
|
+
state.isDead = true;
|
|
629
|
+
|
|
630
|
+
this.#emit({
|
|
631
|
+
type: AnomalyType.STREAM_DEATH,
|
|
632
|
+
severity: Severity.HIGH,
|
|
633
|
+
processId: state.processId,
|
|
634
|
+
shortId: state.shortId,
|
|
635
|
+
taskTitle: state.taskTitle,
|
|
636
|
+
message: "Stream completed without response — session dead",
|
|
637
|
+
data: {},
|
|
638
|
+
action: "restart",
|
|
639
|
+
});
|
|
640
|
+
}
|
|
641
|
+
|
|
642
|
+
/**
|
|
643
|
+
* P2: Consecutive identical tool calls — agent stuck in a loop.
|
|
644
|
+
*
|
|
645
|
+
* KEY: We fingerprint the ENTIRE tool call content (minus the ever-changing
|
|
646
|
+
* toolCallId) so that different edits to the same file are NOT counted as
|
|
647
|
+
* a loop. Only truly identical calls (same title, same arguments, same
|
|
648
|
+
* content) increment the counter.
|
|
649
|
+
*
|
|
650
|
+
* Additionally, known-iterative tools (Editing, Reading, Searching, Listing) get
|
|
651
|
+
* multiplied thresholds since agents legitimately call them many times
|
|
652
|
+
* during normal edit→test→edit development cycles.
|
|
653
|
+
*/
|
|
654
|
+
#detectToolCallLoop(line, state) {
|
|
655
|
+
const match = RE_TOOL_CALL_TITLE.exec(line);
|
|
656
|
+
if (!match) {
|
|
657
|
+
// Non-tool-call lines don't reset the counter (reasoning/thought between calls is normal)
|
|
658
|
+
return;
|
|
659
|
+
}
|
|
660
|
+
|
|
661
|
+
const title = match[1];
|
|
662
|
+
|
|
663
|
+
// Fingerprint the full tool call content, stripping the toolCallId which
|
|
664
|
+
// changes every invocation. Two calls are "identical" only when both the
|
|
665
|
+
// tool name AND the arguments/content are the same.
|
|
666
|
+
const stripped = line.replace(RE_TOOL_CALL_ID, "");
|
|
667
|
+
const fingerprint = djb2Hash(stripped);
|
|
668
|
+
|
|
669
|
+
if (fingerprint === state.lastToolCallFingerprint && title === state.lastToolTitle) {
|
|
670
|
+
state.consecutiveSameToolCount++;
|
|
671
|
+
} else {
|
|
672
|
+
state.lastToolTitle = title;
|
|
673
|
+
state.lastToolCallFingerprint = fingerprint;
|
|
674
|
+
state.consecutiveSameToolCount = 1;
|
|
675
|
+
}
|
|
676
|
+
|
|
677
|
+
const count = state.consecutiveSameToolCount;
|
|
678
|
+
|
|
679
|
+
// Use elevated thresholds for inherently iterative tools (editing, reading)
|
|
680
|
+
const iterative = isIterativeTool(title);
|
|
681
|
+
const warnThreshold = iterative
|
|
682
|
+
? this.#thresholds.toolCallLoopWarn * 3
|
|
683
|
+
: this.#thresholds.toolCallLoopWarn;
|
|
684
|
+
const killThreshold = iterative
|
|
685
|
+
? this.#thresholds.toolCallLoopKill * 3
|
|
686
|
+
: this.#thresholds.toolCallLoopKill;
|
|
687
|
+
|
|
688
|
+
if (count >= killThreshold) {
|
|
689
|
+
this.#emit({
|
|
690
|
+
type: AnomalyType.TOOL_CALL_LOOP,
|
|
691
|
+
severity: Severity.HIGH,
|
|
692
|
+
processId: state.processId,
|
|
693
|
+
shortId: state.shortId,
|
|
694
|
+
taskTitle: state.taskTitle,
|
|
695
|
+
message: `Tool call death loop: "${title}" called ${count}x consecutively (identical content)`,
|
|
696
|
+
data: { tool: title, count, iterative },
|
|
697
|
+
action: "kill",
|
|
698
|
+
});
|
|
699
|
+
} else if (count >= warnThreshold) {
|
|
700
|
+
this.#emit({
|
|
701
|
+
type: AnomalyType.TOOL_CALL_LOOP,
|
|
702
|
+
severity: Severity.MEDIUM,
|
|
703
|
+
processId: state.processId,
|
|
704
|
+
shortId: state.shortId,
|
|
705
|
+
taskTitle: state.taskTitle,
|
|
706
|
+
message: `Tool call loop: "${title}" called ${count}x consecutively (identical content)`,
|
|
707
|
+
data: { tool: title, count, iterative },
|
|
708
|
+
action: "warn",
|
|
709
|
+
});
|
|
710
|
+
}
|
|
711
|
+
}
|
|
712
|
+
|
|
713
|
+
/**
|
|
714
|
+
* P2: Tool failures accumulating.
|
|
715
|
+
*/
|
|
716
|
+
#detectToolFailures(line, state) {
|
|
717
|
+
if (!RE_TOOL_UPDATE_FAILED.test(line)) return;
|
|
718
|
+
|
|
719
|
+
state.toolFailureCount++;
|
|
720
|
+
|
|
721
|
+
if (state.toolFailureCount >= this.#thresholds.toolFailureKill) {
|
|
722
|
+
this.#emit({
|
|
723
|
+
type: AnomalyType.TOOL_FAILURE_CASCADE,
|
|
724
|
+
severity: Severity.HIGH,
|
|
725
|
+
processId: state.processId,
|
|
726
|
+
shortId: state.shortId,
|
|
727
|
+
taskTitle: state.taskTitle,
|
|
728
|
+
message: `Tool failure cascade: ${state.toolFailureCount} failures in session`,
|
|
729
|
+
data: { count: state.toolFailureCount },
|
|
730
|
+
action: "kill",
|
|
731
|
+
});
|
|
732
|
+
} else if (state.toolFailureCount >= this.#thresholds.toolFailureWarn) {
|
|
733
|
+
this.#emit({
|
|
734
|
+
type: AnomalyType.TOOL_FAILURE_CASCADE,
|
|
735
|
+
severity: Severity.MEDIUM,
|
|
736
|
+
processId: state.processId,
|
|
737
|
+
shortId: state.shortId,
|
|
738
|
+
taskTitle: state.taskTitle,
|
|
739
|
+
message: `High tool failure rate: ${state.toolFailureCount} failures in session`,
|
|
740
|
+
data: { count: state.toolFailureCount },
|
|
741
|
+
action: "warn",
|
|
742
|
+
});
|
|
743
|
+
}
|
|
744
|
+
}
|
|
745
|
+
|
|
746
|
+
/**
|
|
747
|
+
* P1: Rebase --continue death spiral.
|
|
748
|
+
*/
|
|
749
|
+
#detectRebaseSpiral(line, state) {
|
|
750
|
+
if (RE_REBASE_CONTINUE.test(line)) {
|
|
751
|
+
state.rebaseCount++;
|
|
752
|
+
} else if (RE_REBASE_ABORT.test(line)) {
|
|
753
|
+
state.rebaseAbortCount++;
|
|
754
|
+
return; // abort is recovery, don't alert
|
|
755
|
+
} else {
|
|
756
|
+
return;
|
|
757
|
+
}
|
|
758
|
+
|
|
759
|
+
if (state.rebaseCount >= this.#thresholds.rebaseKill) {
|
|
760
|
+
this.#emit({
|
|
761
|
+
type: AnomalyType.REBASE_SPIRAL,
|
|
762
|
+
severity: Severity.HIGH,
|
|
763
|
+
processId: state.processId,
|
|
764
|
+
shortId: state.shortId,
|
|
765
|
+
taskTitle: state.taskTitle,
|
|
766
|
+
message: `Rebase spiral detected: ${state.rebaseCount} rebase --continue attempts`,
|
|
767
|
+
data: {
|
|
768
|
+
rebaseCount: state.rebaseCount,
|
|
769
|
+
abortCount: state.rebaseAbortCount,
|
|
770
|
+
},
|
|
771
|
+
action: "kill",
|
|
772
|
+
});
|
|
773
|
+
} else if (state.rebaseCount >= this.#thresholds.rebaseWarn) {
|
|
774
|
+
this.#emit({
|
|
775
|
+
type: AnomalyType.REBASE_SPIRAL,
|
|
776
|
+
severity: Severity.HIGH,
|
|
777
|
+
processId: state.processId,
|
|
778
|
+
shortId: state.shortId,
|
|
779
|
+
taskTitle: state.taskTitle,
|
|
780
|
+
message: `Rebase spiral: ${state.rebaseCount} rebase --continue attempts`,
|
|
781
|
+
data: {
|
|
782
|
+
rebaseCount: state.rebaseCount,
|
|
783
|
+
abortCount: state.rebaseAbortCount,
|
|
784
|
+
},
|
|
785
|
+
action: "warn",
|
|
786
|
+
});
|
|
787
|
+
}
|
|
788
|
+
}
|
|
789
|
+
|
|
790
|
+
/**
|
|
791
|
+
* P2: Git push retry loop.
|
|
792
|
+
*/
|
|
793
|
+
#detectGitPushLoop(line, state) {
|
|
794
|
+
if (!RE_GIT_PUSH.test(line)) return;
|
|
795
|
+
|
|
796
|
+
state.gitPushCount++;
|
|
797
|
+
|
|
798
|
+
if (state.gitPushCount >= this.#thresholds.gitPushKill) {
|
|
799
|
+
this.#emit({
|
|
800
|
+
type: AnomalyType.GIT_PUSH_LOOP,
|
|
801
|
+
severity: Severity.HIGH,
|
|
802
|
+
processId: state.processId,
|
|
803
|
+
shortId: state.shortId,
|
|
804
|
+
taskTitle: state.taskTitle,
|
|
805
|
+
message: `Git push loop detected: ${state.gitPushCount} push attempts`,
|
|
806
|
+
data: { count: state.gitPushCount },
|
|
807
|
+
action: "kill",
|
|
808
|
+
});
|
|
809
|
+
} else if (state.gitPushCount >= this.#thresholds.gitPushWarn) {
|
|
810
|
+
this.#emit({
|
|
811
|
+
type: AnomalyType.GIT_PUSH_LOOP,
|
|
812
|
+
severity: Severity.MEDIUM,
|
|
813
|
+
processId: state.processId,
|
|
814
|
+
shortId: state.shortId,
|
|
815
|
+
taskTitle: state.taskTitle,
|
|
816
|
+
message: `Git push loop: ${state.gitPushCount} push attempts in session`,
|
|
817
|
+
data: { count: state.gitPushCount },
|
|
818
|
+
action: "warn",
|
|
819
|
+
});
|
|
820
|
+
}
|
|
821
|
+
}
|
|
822
|
+
|
|
823
|
+
/**
|
|
824
|
+
* P2: Subagent over-spawning.
|
|
825
|
+
*/
|
|
826
|
+
#detectSubagentWaste(line, state) {
|
|
827
|
+
if (!RE_SUBAGENT_SPAWN.test(line)) return;
|
|
828
|
+
|
|
829
|
+
state.subagentCount++;
|
|
830
|
+
|
|
831
|
+
if (state.subagentCount >= this.#thresholds.subagentKill) {
|
|
832
|
+
this.#emit({
|
|
833
|
+
type: AnomalyType.SUBAGENT_WASTE,
|
|
834
|
+
severity: Severity.HIGH,
|
|
835
|
+
processId: state.processId,
|
|
836
|
+
shortId: state.shortId,
|
|
837
|
+
taskTitle: state.taskTitle,
|
|
838
|
+
message: `Excessive subagent spawning: ${state.subagentCount} subagents`,
|
|
839
|
+
data: { count: state.subagentCount },
|
|
840
|
+
action: "kill",
|
|
841
|
+
});
|
|
842
|
+
} else if (state.subagentCount >= this.#thresholds.subagentWarn) {
|
|
843
|
+
this.#emit({
|
|
844
|
+
type: AnomalyType.SUBAGENT_WASTE,
|
|
845
|
+
severity: Severity.MEDIUM,
|
|
846
|
+
processId: state.processId,
|
|
847
|
+
shortId: state.shortId,
|
|
848
|
+
taskTitle: state.taskTitle,
|
|
849
|
+
message: `High subagent count: ${state.subagentCount} subagents spawned`,
|
|
850
|
+
data: { count: state.subagentCount },
|
|
851
|
+
action: "warn",
|
|
852
|
+
});
|
|
853
|
+
}
|
|
854
|
+
}
|
|
855
|
+
|
|
856
|
+
/**
|
|
857
|
+
* P3: Command failure rate tracking (Codex format).
|
|
858
|
+
*/
|
|
859
|
+
#detectCommandFailures(line, state) {
|
|
860
|
+
if (RE_CMD_FAILED_CODEX.test(line)) {
|
|
861
|
+
state.commandCount++;
|
|
862
|
+
state.commandFailureCount++;
|
|
863
|
+
} else if (RE_CMD_COMPLETED_CODEX.test(line)) {
|
|
864
|
+
state.commandCount++;
|
|
865
|
+
} else {
|
|
866
|
+
return;
|
|
867
|
+
}
|
|
868
|
+
|
|
869
|
+
// Check failure rate after enough samples
|
|
870
|
+
if (state.commandCount >= 10) {
|
|
871
|
+
const rate = (state.commandFailureCount / state.commandCount) * 100;
|
|
872
|
+
if (rate >= this.#thresholds.commandFailureRateWarn) {
|
|
873
|
+
this.#emit({
|
|
874
|
+
type: AnomalyType.COMMAND_FAILURE_RATE,
|
|
875
|
+
severity: Severity.MEDIUM,
|
|
876
|
+
processId: state.processId,
|
|
877
|
+
shortId: state.shortId,
|
|
878
|
+
taskTitle: state.taskTitle,
|
|
879
|
+
message: `High command failure rate: ${rate.toFixed(0)}% (${state.commandFailureCount}/${state.commandCount})`,
|
|
880
|
+
data: {
|
|
881
|
+
rate,
|
|
882
|
+
failed: state.commandFailureCount,
|
|
883
|
+
total: state.commandCount,
|
|
884
|
+
},
|
|
885
|
+
action: "warn",
|
|
886
|
+
});
|
|
887
|
+
}
|
|
888
|
+
}
|
|
889
|
+
}
|
|
890
|
+
|
|
891
|
+
/**
|
|
892
|
+
* P3: Thought repetition (model spinning/looping).
|
|
893
|
+
*/
|
|
894
|
+
#detectThoughtSpinning(line, state) {
|
|
895
|
+
let thoughtText = null;
|
|
896
|
+
|
|
897
|
+
// Copilot format
|
|
898
|
+
const thoughtMatch = RE_THOUGHT_TEXT.exec(line);
|
|
899
|
+
if (thoughtMatch) {
|
|
900
|
+
thoughtText = thoughtMatch[1];
|
|
901
|
+
}
|
|
902
|
+
|
|
903
|
+
if (!thoughtText) return;
|
|
904
|
+
|
|
905
|
+
// Normalize: lowercase, trim, collapse whitespace
|
|
906
|
+
const normalized = thoughtText.toLowerCase().trim().replace(/\s+/g, " ");
|
|
907
|
+
// Skip short fragments — streaming often emits single tokens ("portal",
|
|
908
|
+
// " trust", "the") that accumulate massive counts but aren't real repeated
|
|
909
|
+
// thoughts. Require at least 12 chars (~2-3 words) to count as a trackable
|
|
910
|
+
// thought pattern.
|
|
911
|
+
if (normalized.length < 12) return;
|
|
912
|
+
|
|
913
|
+
// Skip operational status messages — agents running tests, builds, or
|
|
914
|
+
// installations legitimately repeat status thoughts like "Running integration
|
|
915
|
+
// tests" many times. These are progress indicators, not loops.
|
|
916
|
+
if (isOperationalThought(normalized)) return;
|
|
917
|
+
|
|
918
|
+
const count = (state.thoughtCounts.get(normalized) || 0) + 1;
|
|
919
|
+
state.thoughtCounts.set(normalized, count);
|
|
920
|
+
|
|
921
|
+
if (count >= this.#thresholds.thoughtSpinKill) {
|
|
922
|
+
this.#emit({
|
|
923
|
+
type: AnomalyType.THOUGHT_SPINNING,
|
|
924
|
+
severity: Severity.HIGH,
|
|
925
|
+
processId: state.processId,
|
|
926
|
+
shortId: state.shortId,
|
|
927
|
+
taskTitle: state.taskTitle,
|
|
928
|
+
message: `Thought spinning: "${thoughtText}" repeated ${count}x — model may be looping`,
|
|
929
|
+
data: { thought: thoughtText, count },
|
|
930
|
+
action: "kill",
|
|
931
|
+
});
|
|
932
|
+
} else if (count >= this.#thresholds.thoughtSpinWarn) {
|
|
933
|
+
this.#emit({
|
|
934
|
+
type: AnomalyType.THOUGHT_SPINNING,
|
|
935
|
+
severity: Severity.LOW,
|
|
936
|
+
processId: state.processId,
|
|
937
|
+
shortId: state.shortId,
|
|
938
|
+
taskTitle: state.taskTitle,
|
|
939
|
+
message: `Thought repetition: "${thoughtText}" repeated ${count}x`,
|
|
940
|
+
data: { thought: thoughtText, count },
|
|
941
|
+
action: "info",
|
|
942
|
+
});
|
|
943
|
+
}
|
|
944
|
+
}
|
|
945
|
+
|
|
946
|
+
/**
|
|
947
|
+
* P3: Self-debugging reasoning loops (Codex format).
|
|
948
|
+
*/
|
|
949
|
+
#detectSelfDebugLoop(line, state) {
|
|
950
|
+
const match = RE_REASONING_SUMMARY.exec(line);
|
|
951
|
+
if (!match) return;
|
|
952
|
+
|
|
953
|
+
const summary = match[1].toLowerCase();
|
|
954
|
+
const isDebug = SELF_DEBUG_KEYWORDS.some((kw) => summary.includes(kw));
|
|
955
|
+
if (!isDebug) return;
|
|
956
|
+
|
|
957
|
+
this.#emit({
|
|
958
|
+
type: AnomalyType.SELF_DEBUG_LOOP,
|
|
959
|
+
severity: Severity.LOW,
|
|
960
|
+
processId: state.processId,
|
|
961
|
+
shortId: state.shortId,
|
|
962
|
+
taskTitle: state.taskTitle,
|
|
963
|
+
message: `Agent self-debugging: "${match[1]}"`,
|
|
964
|
+
data: { summary: match[1] },
|
|
965
|
+
action: "info",
|
|
966
|
+
});
|
|
967
|
+
}
|
|
968
|
+
|
|
969
|
+
/**
|
|
970
|
+
* P3: Repeated error fingerprints.
|
|
971
|
+
*/
|
|
972
|
+
#detectRepeatedErrors(line, state) {
|
|
973
|
+
// Only check lines that look like errors
|
|
974
|
+
if (RE_ERROR_NOISE.some((re) => re.test(line))) return;
|
|
975
|
+
if (!RE_ERROR_PATTERNS.some((re) => re.test(line))) return;
|
|
976
|
+
|
|
977
|
+
const fingerprint = normalizeDedupKey(line).slice(0, 120);
|
|
978
|
+
const count = (state.errorFingerprints.get(fingerprint) || 0) + 1;
|
|
979
|
+
state.errorFingerprints.set(fingerprint, count);
|
|
980
|
+
|
|
981
|
+
if (count >= this.#thresholds.repeatedErrorKill) {
|
|
982
|
+
this.#emit({
|
|
983
|
+
type: AnomalyType.REPEATED_ERROR,
|
|
984
|
+
severity: Severity.HIGH,
|
|
985
|
+
processId: state.processId,
|
|
986
|
+
shortId: state.shortId,
|
|
987
|
+
taskTitle: state.taskTitle,
|
|
988
|
+
message: `Repeated error (${count}x): ${line.slice(0, 150)}`,
|
|
989
|
+
data: { fingerprint, count },
|
|
990
|
+
action: "kill",
|
|
991
|
+
});
|
|
992
|
+
} else if (count >= this.#thresholds.repeatedErrorWarn) {
|
|
993
|
+
this.#emit({
|
|
994
|
+
type: AnomalyType.REPEATED_ERROR,
|
|
995
|
+
severity: Severity.MEDIUM,
|
|
996
|
+
processId: state.processId,
|
|
997
|
+
shortId: state.shortId,
|
|
998
|
+
taskTitle: state.taskTitle,
|
|
999
|
+
message: `Repeated error (${count}x): ${line.slice(0, 150)}`,
|
|
1000
|
+
data: { fingerprint, count },
|
|
1001
|
+
action: "warn",
|
|
1002
|
+
});
|
|
1003
|
+
}
|
|
1004
|
+
}
|
|
1005
|
+
|
|
1006
|
+
/**
|
|
1007
|
+
* Detect session completion (mark as dead to stop analysis).
|
|
1008
|
+
*/
|
|
1009
|
+
#detectSessionCompletion(line, state) {
|
|
1010
|
+
if (RE_SESSION_DONE.test(line) || line.includes(STR_TASK_COMPLETE)) {
|
|
1011
|
+
state.isDead = true;
|
|
1012
|
+
}
|
|
1013
|
+
}
|
|
1014
|
+
|
|
1015
|
+
// ── Stall detection (timer-based) ─────────────────────────────────────────
|
|
1016
|
+
|
|
1017
|
+
/**
|
|
1018
|
+
* Check all active processes for idle stalls.
|
|
1019
|
+
* Called on a 30-second interval.
|
|
1020
|
+
*/
|
|
1021
|
+
#checkStalls() {
|
|
1022
|
+
const now = Date.now();
|
|
1023
|
+
for (const [, state] of this.#processes) {
|
|
1024
|
+
if (state.isDead) continue;
|
|
1025
|
+
if (state.lineCount < 5) continue; // Don't alert on brand-new processes
|
|
1026
|
+
|
|
1027
|
+
const idleMs = now - state.lastLineAt;
|
|
1028
|
+
|
|
1029
|
+
if (idleMs >= this.#thresholds.idleStallKillSec * 1000) {
|
|
1030
|
+
this.#emit({
|
|
1031
|
+
type: AnomalyType.IDLE_STALL,
|
|
1032
|
+
severity: Severity.HIGH,
|
|
1033
|
+
processId: state.processId,
|
|
1034
|
+
shortId: state.shortId,
|
|
1035
|
+
taskTitle: state.taskTitle,
|
|
1036
|
+
message: `Agent may be stalled: no output for ${Math.round(idleMs / 1000)}s`,
|
|
1037
|
+
data: { idleSec: Math.round(idleMs / 1000) },
|
|
1038
|
+
action: "kill",
|
|
1039
|
+
});
|
|
1040
|
+
} else if (idleMs >= this.#thresholds.idleStallWarnSec * 1000) {
|
|
1041
|
+
this.#emit({
|
|
1042
|
+
type: AnomalyType.IDLE_STALL,
|
|
1043
|
+
severity: Severity.MEDIUM,
|
|
1044
|
+
processId: state.processId,
|
|
1045
|
+
shortId: state.shortId,
|
|
1046
|
+
taskTitle: state.taskTitle,
|
|
1047
|
+
message: `Agent may be stalled: no output for ${Math.round(idleMs / 1000)}s`,
|
|
1048
|
+
data: { idleSec: Math.round(idleMs / 1000) },
|
|
1049
|
+
action: "warn",
|
|
1050
|
+
});
|
|
1051
|
+
}
|
|
1052
|
+
}
|
|
1053
|
+
}
|
|
1054
|
+
|
|
1055
|
+
// ── Housekeeping ──────────────────────────────────────────────────────────
|
|
1056
|
+
|
|
1057
|
+
/**
|
|
1058
|
+
* Remove process state for processes inactive beyond cleanup threshold.
|
|
1059
|
+
* Cleans both active and completed process archives.
|
|
1060
|
+
*/
|
|
1061
|
+
#cleanupOldProcesses() {
|
|
1062
|
+
const now = Date.now();
|
|
1063
|
+
// Clean active processes
|
|
1064
|
+
for (const [pid, state] of this.#processes) {
|
|
1065
|
+
if (now - state.lastLineAt > this.#thresholds.processCleanupMs) {
|
|
1066
|
+
this.#processes.delete(pid);
|
|
1067
|
+
}
|
|
1068
|
+
}
|
|
1069
|
+
// Clean completed process archives
|
|
1070
|
+
for (const [pid, state] of this.#completedProcesses) {
|
|
1071
|
+
if (now - state.lastLineAt > this.#thresholds.processCleanupMs) {
|
|
1072
|
+
this.#completedProcesses.delete(pid);
|
|
1073
|
+
}
|
|
1074
|
+
}
|
|
1075
|
+
}
|
|
1076
|
+
|
|
1077
|
+
// ── Emission ──────────────────────────────────────────────────────────────
|
|
1078
|
+
|
|
1079
|
+
/**
|
|
1080
|
+
* Emit an anomaly event with dedup protection and auto-escalation.
|
|
1081
|
+
*
|
|
1082
|
+
* Circuit breaker: When a warn-level anomaly fires 3+ times for the same
|
|
1083
|
+
* process (each separated by the dedup window), auto-escalate to
|
|
1084
|
+
* action="kill". This prevents agents from wasting hours in loops that
|
|
1085
|
+
* individually don't cross kill thresholds but collectively indicate a
|
|
1086
|
+
* stuck process.
|
|
1087
|
+
*
|
|
1088
|
+
* @param {Anomaly} anomaly
|
|
1089
|
+
*/
|
|
1090
|
+
#emit(anomaly) {
|
|
1091
|
+
// Build dedup key: type + processId + severity (so escalations still fire)
|
|
1092
|
+
const dedupKey = `${anomaly.type}:${anomaly.shortId}:${anomaly.severity}`;
|
|
1093
|
+
const state = this.#processes.get(anomaly.processId);
|
|
1094
|
+
|
|
1095
|
+
if (state) {
|
|
1096
|
+
const now = Date.now();
|
|
1097
|
+
const lastAlert = state.alertTimestamps.get(dedupKey) || 0;
|
|
1098
|
+
if (now - lastAlert < this.#thresholds.alertDedupWindowMs) {
|
|
1099
|
+
return; // Already alerted recently
|
|
1100
|
+
}
|
|
1101
|
+
state.alertTimestamps.set(dedupKey, now);
|
|
1102
|
+
|
|
1103
|
+
// ── Circuit breaker escalation ─────────────────────────────────
|
|
1104
|
+
// Track how many times this anomaly type has been emitted for this
|
|
1105
|
+
// process. If a warn/info action fires 3+ times, auto-escalate
|
|
1106
|
+
// to kill — the process is stuck and won't recover on its own.
|
|
1107
|
+
const emitKey = anomaly.type;
|
|
1108
|
+
const emitCount = (state.alertEmitCounts.get(emitKey) || 0) + 1;
|
|
1109
|
+
state.alertEmitCounts.set(emitKey, emitCount);
|
|
1110
|
+
|
|
1111
|
+
if (anomaly.action === "warn" || anomaly.action === "info") {
|
|
1112
|
+
if (emitCount >= 3) {
|
|
1113
|
+
console.warn(
|
|
1114
|
+
`[anomaly-detector] circuit breaker: ${anomaly.type} fired ${emitCount}x for ${anomaly.shortId} — escalating to KILL`,
|
|
1115
|
+
);
|
|
1116
|
+
anomaly.action = "kill";
|
|
1117
|
+
anomaly.severity = Severity.HIGH;
|
|
1118
|
+
anomaly.message = `[ESCALATED] ${anomaly.message} (${emitCount} alerts over ${Math.round((now - state.firstLineAt) / 60_000)}min)`;
|
|
1119
|
+
}
|
|
1120
|
+
}
|
|
1121
|
+
}
|
|
1122
|
+
|
|
1123
|
+
// Increment global counter
|
|
1124
|
+
const prev = this.#globalCounts.get(anomaly.type) || 0;
|
|
1125
|
+
this.#globalCounts.set(anomaly.type, prev + 1);
|
|
1126
|
+
|
|
1127
|
+
// Invoke callback
|
|
1128
|
+
try {
|
|
1129
|
+
this.#onAnomaly(anomaly);
|
|
1130
|
+
} catch {
|
|
1131
|
+
/* callback error — ignore */
|
|
1132
|
+
}
|
|
1133
|
+
|
|
1134
|
+
// Send notification for HIGH+ severity
|
|
1135
|
+
if (
|
|
1136
|
+
anomaly.severity === Severity.CRITICAL ||
|
|
1137
|
+
anomaly.severity === Severity.HIGH
|
|
1138
|
+
) {
|
|
1139
|
+
const icon = anomaly.severity === Severity.CRITICAL ? "🔴" : "🟠";
|
|
1140
|
+
const actionLabel =
|
|
1141
|
+
anomaly.action === "kill"
|
|
1142
|
+
? "⛔ KILL"
|
|
1143
|
+
: anomaly.action === "restart"
|
|
1144
|
+
? "🔄 RESTART"
|
|
1145
|
+
: "⚠️ ALERT";
|
|
1146
|
+
|
|
1147
|
+
const msg = [
|
|
1148
|
+
`${icon} <b>Anomaly: ${escapeHtml(anomaly.type)}</b>`,
|
|
1149
|
+
`Process: <code>${escapeHtml(anomaly.shortId)}</code>`,
|
|
1150
|
+
anomaly.taskTitle ? `Task: ${escapeHtml(anomaly.taskTitle)}` : null,
|
|
1151
|
+
`${escapeHtml(anomaly.message)}`,
|
|
1152
|
+
`Action: ${actionLabel}`,
|
|
1153
|
+
]
|
|
1154
|
+
.filter(Boolean)
|
|
1155
|
+
.join("\n");
|
|
1156
|
+
|
|
1157
|
+
try {
|
|
1158
|
+
this.#notify(msg, { parseMode: "HTML", skipDedup: false });
|
|
1159
|
+
} catch {
|
|
1160
|
+
/* notification error — ignore */
|
|
1161
|
+
}
|
|
1162
|
+
}
|
|
1163
|
+
}
|
|
1164
|
+
}
|
|
1165
|
+
|
|
1166
|
+
// ── Factory function ────────────────────────────────────────────────────────
|
|
1167
|
+
|
|
1168
|
+
/**
 * Build an AnomalyDetector from the given options, start it, and hand it back.
 *
 * @param {object} options - Passed unchanged to the AnomalyDetector constructor
 * @param {(anomaly: Anomaly) => void} [options.onAnomaly] - Custom anomaly handler
 * @param {(text: string, options?: object) => void} [options.notify] - Telegram notification fn
 * @param {Partial<typeof DEFAULT_THRESHOLDS>} [options.thresholds] - Threshold overrides
 * @returns {AnomalyDetector} The detector, after start() has been called on it
 */
export function createAnomalyDetector(options = {}) {
  const instance = new AnomalyDetector(options);
  instance.start();
  return instance;
}
|
|
1182
|
+
|
|
1183
|
+
/**
|
|
1184
|
+
* @typedef {Object} Anomaly
|
|
1185
|
+
* @property {string} type - AnomalyType value
|
|
1186
|
+
* @property {string} severity - Severity value
|
|
1187
|
+
* @property {string} processId - Full process ID
|
|
1188
|
+
* @property {string} shortId - 8-char short process ID
|
|
1189
|
+
* @property {string|null} taskTitle - Task title if known
|
|
1190
|
+
* @property {string} message - Human-readable description
|
|
1191
|
+
* @property {object} data - Structured data for the anomaly
|
|
1192
|
+
* @property {string} action - Recommended action: "kill" | "restart" | "warn" | "info"
|
|
1193
|
+
*/
|
|
1194
|
+
|
|
1195
|
+
export default AnomalyDetector;
|