@consensus-tools/universal 0.9.0 → 0.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. package/dist/consensus-llm.test.d.ts +2 -0
  2. package/dist/consensus-llm.test.d.ts.map +1 -0
  3. package/dist/consensus-llm.test.js +244 -0
  4. package/dist/consensus-llm.test.js.map +1 -0
  5. package/dist/defaults.d.ts +10 -0
  6. package/dist/defaults.d.ts.map +1 -1
  7. package/dist/defaults.js +63 -2
  8. package/dist/defaults.js.map +1 -1
  9. package/dist/index.d.ts +13 -11
  10. package/dist/index.d.ts.map +1 -1
  11. package/dist/index.js +130 -49
  12. package/dist/index.js.map +1 -1
  13. package/dist/persona-reviewer-factory.d.ts +22 -0
  14. package/dist/persona-reviewer-factory.d.ts.map +1 -0
  15. package/dist/persona-reviewer-factory.js +318 -0
  16. package/dist/persona-reviewer-factory.js.map +1 -0
  17. package/dist/reputation-manager.d.ts +38 -0
  18. package/dist/reputation-manager.d.ts.map +1 -0
  19. package/dist/reputation-manager.js +154 -0
  20. package/dist/reputation-manager.js.map +1 -0
  21. package/dist/reputation-manager.test.d.ts +2 -0
  22. package/dist/reputation-manager.test.d.ts.map +1 -0
  23. package/dist/reputation-manager.test.js +111 -0
  24. package/dist/reputation-manager.test.js.map +1 -0
  25. package/dist/risk-tiers.d.ts +10 -0
  26. package/dist/risk-tiers.d.ts.map +1 -0
  27. package/dist/risk-tiers.js +46 -0
  28. package/dist/risk-tiers.js.map +1 -0
  29. package/dist/risk-tiers.test.d.ts +2 -0
  30. package/dist/risk-tiers.test.d.ts.map +1 -0
  31. package/dist/risk-tiers.test.js +40 -0
  32. package/dist/risk-tiers.test.js.map +1 -0
  33. package/dist/types.d.ts +59 -6
  34. package/dist/types.d.ts.map +1 -1
  35. package/package.json +9 -9
  36. package/src/consensus-llm.test.ts +23 -4
  37. package/src/defaults.ts +10 -4
  38. package/src/index.ts +22 -18
  39. package/src/persona-reviewer-factory.ts +90 -70
  40. package/src/reputation-manager.ts +46 -31
  41. package/src/risk-tiers.test.ts +8 -0
  42. package/src/risk-tiers.ts +7 -5
@@ -17,10 +17,24 @@ import type { RiskTierMap } from "./types.js";
17
17
  // 2. Risk tier check (low = fast-path regex only)
18
18
  // 3. Parallel LLM calls per persona (with timeout + fallback)
19
19
  // 4. Parse votes from LLM responses
20
- // 5. Synthesize ConsensusInput (Job, Submissions, Votes)
21
- // 6. Call resolveConsensus() with the configured policy
20
+ // 5. Synthesize ConsensusInput: ONE "allow" submission, all personas
21
+ // vote on it (YES = +1, NO = -1). resolveConsensus aggregates.
22
+ // 6. Determine action from consensus result
22
23
  // 7. Return LlmDecisionResult
23
24
 
25
+ // ── Safe JSON Serialization ──────────────────────────────────────────
26
+
27
+ function safeStringify(obj: unknown, indent?: number): string {
28
+ const seen = new WeakSet();
29
+ return JSON.stringify(obj, (_key, value) => {
30
+ if (typeof value === "object" && value !== null) {
31
+ if (seen.has(value)) return "[Circular]";
32
+ seen.add(value);
33
+ }
34
+ return value;
35
+ }, indent);
36
+ }
37
+
24
38
  // ── Vote Parsing ─────────────────────────────────────────────────────
25
39
 
26
40
  interface ParsedVote {
@@ -29,18 +43,22 @@ interface ParsedVote {
29
43
  rationale: string;
30
44
  }
31
45
 
32
- const VOTE_PATTERN = /\b(YES|NO|REWRITE)\b/i;
46
+ // Match an optional "VOTE:" prefix followed by YES/NO/REWRITE alone on a line (anchored to reduce injection risk)
47
+ const VOTE_LINE_PATTERN = /^(?:VOTE:\s*)?(YES|NO|REWRITE)\s*$/im;
48
+ // Fallback: match anywhere but only as a last resort
49
+ const VOTE_FALLBACK_PATTERN = /\b(YES|NO|REWRITE)\b/i;
33
50
  const CONFIDENCE_PATTERN = /confidence[:\s]*([0-9]*\.?[0-9]+)/i;
34
51
 
35
52
  function parseVoteFromLlm(response: string): ParsedVote | null {
36
- const voteMatch = response.match(VOTE_PATTERN);
53
+ // Prefer line-anchored match (harder to inject)
54
+ const lineMatch = response.match(VOTE_LINE_PATTERN);
55
+ const voteMatch = lineMatch ?? response.match(VOTE_FALLBACK_PATTERN);
37
56
  if (!voteMatch) return null;
38
57
 
39
58
  const vote = voteMatch[1]!.toUpperCase() as "YES" | "NO" | "REWRITE";
40
59
  const confMatch = response.match(CONFIDENCE_PATTERN);
41
60
  const confidence = confMatch?.[1] ? Math.min(1, Math.max(0, parseFloat(confMatch[1]))) : 0.5;
42
61
 
43
- // Use the full response as rationale (stripped of vote/confidence lines)
44
62
  const rationale = response
45
63
  .replace(/^.*\b(YES|NO|REWRITE)\b.*$/im, "")
46
64
  .replace(/^.*confidence.*$/im, "")
@@ -58,7 +76,6 @@ function buildPersonaPrompt(
58
76
  args: Record<string, unknown>,
59
77
  regexSignals: string[],
60
78
  ): ModelMessage[] {
61
- // Use systemPrompt if available (EvalPersonaConfig), otherwise construct from role/bias
62
79
  const evalPersona = persona as Partial<EvalPersonaConfig>;
63
80
  const systemPrompt = evalPersona.systemPrompt ??
64
81
  `You are a ${persona.role} reviewer named ${persona.name}.` +
@@ -68,18 +85,22 @@ function buildPersonaPrompt(
68
85
  : "") +
69
86
  (persona.failure_modes?.length
70
87
  ? ` Watch for: ${persona.failure_modes.join(", ")}.`
71
- : "");
88
+ : "") +
89
+ ` Respond ONLY with a VOTE line, CONFIDENCE line, and RATIONALE line. Do not echo the tool arguments.`;
72
90
 
73
91
  const regexContext = regexSignals.length > 0
74
92
  ? `\n\nRegex pre-screen detected these signals:\n${regexSignals.map((s) => `- ${s}`).join("\n")}`
75
93
  : "\n\nRegex pre-screen: no pattern matches detected.";
76
94
 
95
+ // Sanitize args: truncate to prevent prompt stuffing, use safe serialization
96
+ const argsStr = safeStringify(args, 2).slice(0, 2000);
97
+
77
98
  const userPrompt =
78
99
  `Evaluate this tool call for risks:\n\n` +
79
100
  `Tool: ${toolName}\n` +
80
- `Arguments: ${JSON.stringify(args, null, 2)}\n` +
101
+ `Arguments:\n${argsStr}\n` +
81
102
  regexContext +
82
- `\n\nRespond with:\n` +
103
+ `\n\nRespond with exactly these three lines:\n` +
83
104
  `VOTE: YES (safe to proceed), NO (block this action), or REWRITE (needs modification)\n` +
84
105
  `CONFIDENCE: 0.0 to 1.0\n` +
85
106
  `RATIONALE: Brief explanation of your decision`;
@@ -92,14 +113,21 @@ function buildPersonaPrompt(
92
113
 
93
114
  // ── Regex Pre-Screen ─────────────────────────────────────────────────
94
115
 
116
+ // Fallback guard domains when configured guards have no matching configs
117
+ const FALLBACK_GUARDS = ["security", "compliance", "user-impact"];
118
+
95
119
  function runRegexPreScreen(
96
120
  toolName: string,
97
121
  args: Record<string, unknown>,
98
122
  guards: string[],
99
123
  ): string[] {
100
124
  const signals: string[] = [];
125
+ // Use provided guards; if none has an entry in GUARD_CONFIGS, fall back to FALLBACK_GUARDS
126
+ const effectiveGuards = guards.filter((g) => GUARD_CONFIGS[g]).length > 0
127
+ ? guards
128
+ : FALLBACK_GUARDS;
101
129
 
102
- for (const domain of guards) {
130
+ for (const domain of effectiveGuards) {
103
131
  const config = GUARD_CONFIGS[domain];
104
132
  if (!config) continue;
105
133
 
@@ -130,28 +158,25 @@ async function callLlmWithTimeout(
130
158
  messages: ModelMessage[],
131
159
  timeoutMs: number,
132
160
  ): Promise<string> {
133
- const controller = new AbortController();
134
- const timer = setTimeout(() => controller.abort(), timeoutMs);
161
+ let timer: ReturnType<typeof setTimeout> | undefined;
135
162
 
136
163
  try {
137
164
  const result = await Promise.race([
138
165
  model(messages),
139
166
  new Promise<never>((_, reject) => {
140
- controller.signal.addEventListener("abort", () =>
141
- reject(new Error("LLM call timed out")),
142
- );
167
+ timer = setTimeout(() => reject(new Error("LLM call timed out")), timeoutMs);
143
168
  }),
144
169
  ]);
145
170
  return result;
146
171
  } finally {
147
- clearTimeout(timer);
172
+ if (timer) clearTimeout(timer);
148
173
  }
149
174
  }
150
175
 
151
176
  // ── Regex Fallback Vote ──────────────────────────────────────────────
152
177
 
153
178
  function regexFallbackVote(
154
- persona: PersonaConfig,
179
+ _persona: PersonaConfig,
155
180
  toolName: string,
156
181
  args: Record<string, unknown>,
157
182
  guards: string[],
@@ -164,10 +189,12 @@ function regexFallbackVote(
164
189
  rationale: `Regex fallback: ${signals.join("; ")}`,
165
190
  };
166
191
  }
192
+ // When LLM is unavailable AND regex finds nothing, default to block for safety.
193
+ // This prevents fail-open when all LLMs are down.
167
194
  return {
168
- vote: "YES",
169
- confidence: 0.4,
170
- rationale: "Regex fallback: no pattern matches (LLM unavailable)",
195
+ vote: "NO",
196
+ confidence: 0.3,
197
+ rationale: "Regex fallback: no pattern matches but LLM unavailable (fail-closed)",
171
198
  };
172
199
  }
173
200
 
@@ -197,7 +224,7 @@ export async function deliberate(
197
224
  ): Promise<LlmDecisionResult> {
198
225
  const decisionId = `dec_${crypto.randomUUID().slice(0, 12)}`;
199
226
  const personas = config.reputationManager.getPersonas();
200
- const guards = config.guards ?? ["security", "compliance", "user-impact"];
227
+ const guards = config.guards ?? FALLBACK_GUARDS;
201
228
 
202
229
  // 1. Regex pre-screen
203
230
  const regexSignals = runRegexPreScreen(toolName, args, guards);
@@ -205,7 +232,6 @@ export async function deliberate(
205
232
  // 2. Risk tier check
206
233
  const tier = classifyTool(toolName, config.riskTiers);
207
234
  if (tier === "low") {
208
- // Fast-path: regex only, no LLM calls
209
235
  const hasRisk = regexSignals.length > 0;
210
236
  return {
211
237
  decisionId,
@@ -244,7 +270,6 @@ export async function deliberate(
244
270
  };
245
271
  }
246
272
 
247
- // Unparseable response, fall back to regex
248
273
  const fallback = regexFallbackVote(persona, toolName, args, guards);
249
274
  return {
250
275
  personaId: persona.id,
@@ -253,7 +278,6 @@ export async function deliberate(
253
278
  source: "regex_fallback" as const,
254
279
  };
255
280
  } catch {
256
- // LLM failure, fall back to regex
257
281
  const fallback = regexFallbackVote(persona, toolName, args, guards);
258
282
  return {
259
283
  personaId: persona.id,
@@ -266,17 +290,21 @@ export async function deliberate(
266
290
  );
267
291
 
268
292
  // 4. Synthesize ConsensusInput for resolveConsensus()
269
- // Each persona creates a "submission" (their evaluation) and votes for it
293
+ //
294
+ // FIXED: Use a SINGLE "allow" submission. All personas vote on it.
295
+ // YES voters score +1, NO voters score -1, REWRITE voters score 0.
296
+ // This way resolveConsensus sees N votes on 1 submission, not N
297
+ // submissions with 1 vote each.
270
298
  const now = new Date().toISOString();
271
299
  const jobId = `job_facade_${decisionId}`;
300
+ const submissionId = `sub_${decisionId}_allow`;
272
301
 
273
- // Create a minimal Job with the configured policy
274
302
  const job = {
275
303
  id: jobId,
276
304
  boardId: "",
277
305
  status: "SUBMITTED" as const,
278
306
  title: `Deliberation: ${toolName}`,
279
- description: JSON.stringify(args),
307
+ description: "",
280
308
  createdByAgentId: "facade",
281
309
  createdAt: now,
282
310
  updatedAt: now,
@@ -288,33 +316,31 @@ export async function deliberate(
288
316
  minParticipants: 1,
289
317
  };
290
318
 
291
- // Each persona submits their evaluation
292
- const submissions = voteResults.map((v, i) => ({
293
- id: `sub_${decisionId}_${i}`,
319
+ // Single submission representing "allow this tool call"
320
+ const submissions = [{
321
+ id: submissionId,
294
322
  jobId,
295
- agentId: v.personaId,
323
+ agentId: "facade",
296
324
  submittedAt: now,
297
- summary: v.rationale,
298
- artifacts: { vote: v.vote, confidence: v.confidence, source: v.source },
299
- confidence: v.confidence,
325
+ summary: `Allow ${toolName}`,
326
+ artifacts: {},
327
+ confidence: 1.0,
300
328
  requestedPayout: 0,
301
329
  status: "SUBMITTED" as const,
302
- }));
330
+ }];
303
331
 
304
- // Each persona votes YES (+1) on their own submission
305
- // and scores based on their confidence
332
+ // Each persona votes on the single submission
306
333
  const votes = voteResults.map((v, i) => ({
307
334
  id: `vote_${decisionId}_${i}`,
308
335
  jobId,
309
336
  agentId: v.personaId,
310
- submissionId: `sub_${decisionId}_${i}`,
337
+ submissionId,
311
338
  score: v.vote === "YES" ? 1 : v.vote === "NO" ? -1 : 0,
312
339
  weight: v.confidence,
313
340
  rationale: v.rationale,
314
341
  createdAt: now,
315
342
  }));
316
343
 
317
- // Reputation function from the manager
318
344
  const reputation = (agentId: string) =>
319
345
  config.reputationManager.getReputation(agentId);
320
346
 
@@ -326,45 +352,40 @@ export async function deliberate(
326
352
  reputation,
327
353
  };
328
354
 
329
- let consensusResult: ConsensusResult;
355
+ let consensusTrace: Record<string, unknown>;
356
+
330
357
  try {
331
- consensusResult = resolveConsensus(consensusInput);
358
+ const result: ConsensusResult = resolveConsensus(consensusInput);
359
+ consensusTrace = result.consensusTrace;
360
+
361
+ // Extract the actual weighted score from the consensus trace.
362
+ // resolveConsensus always returns a "winner" (the single submission),
363
+ // but the score may be negative (more NO than YES votes).
364
+ const traceScores = (consensusTrace as any)?.scores as Record<string, number> | undefined;
365
+ const submissionScore = traceScores?.[submissionId] ?? 0;
366
+ consensusTrace = { ...consensusTrace, submissionScore };
332
367
  } catch {
333
- // If resolution fails, fall back to simple majority
334
- const yesCount = voteResults.filter((v) => v.vote === "YES").length;
335
- const majority = yesCount > voteResults.length / 2;
336
- consensusResult = {
337
- winners: majority ? ["allow"] : ["block"],
338
- winningSubmissionIds: [],
339
- consensusTrace: { policy: "fallback_majority", reason: "resolve_error" },
340
- finalArtifact: null,
341
- };
368
+ consensusTrace = { policy: "fallback_majority", reason: "resolve_error" };
342
369
  }
343
370
 
344
- // 6. Determine final action
345
- const winnerIds = new Set(consensusResult.winners);
346
- const winningVotes = voteResults.filter((v) => winnerIds.has(v.personaId));
347
- const dominantVote = winningVotes.length > 0
348
- ? winningVotes[0]!.vote
349
- : voteResults[0]?.vote ?? "YES";
371
+ // 6. Determine action from vote distribution (direct counting)
372
+ // resolveConsensus provides the audit trace; vote counting determines the action.
373
+ // This avoids the "always-a-winner" problem where resolveConsensus returns
374
+ // a winner even when the score is negative.
375
+ const yesCount = voteResults.filter((v) => v.vote === "YES").length;
376
+ const noCount = voteResults.filter((v) => v.vote === "NO").length;
377
+ const rewriteCount = voteResults.filter((v) => v.vote === "REWRITE").length;
350
378
 
351
379
  let action: "allow" | "block" | "escalate";
352
- if (dominantVote === "YES") {
380
+ if (rewriteCount > voteResults.length / 2) {
381
+ action = "escalate";
382
+ } else if (yesCount > noCount) {
353
383
  action = "allow";
354
- } else if (dominantVote === "NO") {
355
- action = "block";
356
384
  } else {
357
- action = "escalate";
358
- }
359
-
360
- // If no clear winner (empty winners), use simple vote counting
361
- if (consensusResult.winners.length === 0) {
362
- const yesCount = voteResults.filter((v) => v.vote === "YES").length;
363
- const noCount = voteResults.filter((v) => v.vote === "NO").length;
364
- action = yesCount >= noCount ? "allow" : "block";
385
+ action = "block";
365
386
  }
366
387
 
367
- // Compute aggregate score (0-1 based on vote distribution)
388
+ // Compute aggregate score
368
389
  const totalConfidence = voteResults.reduce((s, v) => s + v.confidence, 0);
369
390
  const yesConfidence = voteResults
370
391
  .filter((v) => v.vote === "YES")
@@ -376,11 +397,10 @@ export async function deliberate(
376
397
  action,
377
398
  votes: voteResults,
378
399
  policy: config.policyType,
379
- consensusTrace: consensusResult.consensusTrace,
400
+ consensusTrace,
380
401
  aggregateScore,
381
402
  };
382
403
 
383
- // 7. Record decision for reputation tracking
384
404
  config.reputationManager.recordDecision(result);
385
405
 
386
406
  return result;
@@ -9,6 +9,9 @@ import type { FeedbackSignal, LlmDecisionResult } from "./types.js";
9
9
  // Updates from human feedback signals (onFeedback), not self-consensus.
10
10
  // Triggers persona respawn when reputation drops below threshold.
11
11
 
12
+ const MAX_DECISION_LOOKBACK = 100;
13
+ const MAX_FEEDBACK_LOOKBACK = 500;
14
+
12
15
  export interface RespawnEvent {
13
16
  oldPersona: PersonaConfig;
14
17
  newPersona: PersonaConfig;
@@ -51,10 +54,16 @@ export class ReputationManager {
51
54
  recordDecision(result: LlmDecisionResult): void {
52
55
  this.decisions.set(result.decisionId, result);
53
56
  this.decisionHistory.push(result);
54
- // Keep last 100 decisions for respawn analysis
55
- if (this.decisionHistory.length > 100) {
57
+
58
+ // Cap both collections to prevent memory leaks
59
+ if (this.decisionHistory.length >= MAX_DECISION_LOOKBACK) {
56
60
  this.decisionHistory.shift();
57
61
  }
62
+ // Trim the feedback correlation map (keep most recent N entries)
63
+ if (this.decisions.size > MAX_FEEDBACK_LOOKBACK) {
64
+ const oldest = this.decisions.keys().next().value;
65
+ if (oldest) this.decisions.delete(oldest);
66
+ }
58
67
  }
59
68
 
60
69
  /** Process human feedback signal and update reputation. */
@@ -85,7 +94,7 @@ export class ReputationManager {
85
94
  this.scores.set(change.persona_id, change.reputation_after);
86
95
  }
87
96
 
88
- // Check for respawn
97
+ // Check for respawn (collect respawns, then apply)
89
98
  this.checkRespawn();
90
99
 
91
100
  // Persist if store configured
@@ -94,39 +103,44 @@ export class ReputationManager {
94
103
  return result.changes;
95
104
  }
96
105
 
97
- /** Check if any persona needs respawn. */
106
+ /** Check if any persona needs respawn. Collects replacements first to avoid mutation during iteration. */
98
107
  private checkRespawn(): void {
108
+ const replacements: Array<{ index: number; old: PersonaConfig; rep: number }> = [];
109
+
110
+ // Collect personas that need respawn (don't mutate during scan)
99
111
  for (let i = 0; i < this.personas.length; i++) {
100
112
  const persona = this.personas[i]!;
101
113
  const rep = this.scores.get(persona.id) ?? 0.55;
102
-
103
114
  if (rep < this.threshold) {
104
- // Build learning summary from decision history
105
- const decisionRecords = this.decisionHistory.map((d) => ({
106
- final_decision: d.action === "allow" ? "ALLOW" : "BLOCK",
107
- votes: d.votes.map((v) => ({
108
- persona_id: v.personaId,
109
- vote: v.vote,
110
- confidence: v.confidence,
111
- })),
112
- }));
113
-
114
- const learning = buildLearningSummary(persona.id, decisionRecords);
115
- const successor = mutatePersona(persona, learning);
116
-
117
- // Replace persona
118
- this.personas[i] = successor;
119
- this.scores.delete(persona.id);
120
- this.scores.set(successor.id, successor.reputation ?? 0.55);
121
-
122
- this.onRespawn?.({
123
- oldPersona: persona,
124
- newPersona: successor,
125
- reputation: rep,
126
- reason: `Reputation ${rep.toFixed(3)} below threshold ${this.threshold}`,
127
- });
115
+ replacements.push({ index: i, old: persona, rep });
128
116
  }
129
117
  }
118
+
119
+ // Apply replacements after scan
120
+ for (const { index, old, rep } of replacements) {
121
+ const decisionRecords = this.decisionHistory.map((d) => ({
122
+ final_decision: d.action === "allow" ? "ALLOW" : "BLOCK",
123
+ votes: d.votes.map((v) => ({
124
+ persona_id: v.personaId,
125
+ vote: v.vote,
126
+ confidence: v.confidence,
127
+ })),
128
+ }));
129
+
130
+ const learning = buildLearningSummary(old.id, decisionRecords);
131
+ const successor = mutatePersona(old, learning);
132
+
133
+ this.personas[index] = successor;
134
+ this.scores.delete(old.id);
135
+ this.scores.set(successor.id, successor.reputation ?? 0.55);
136
+
137
+ this.onRespawn?.({
138
+ oldPersona: old,
139
+ newPersona: successor,
140
+ reputation: rep,
141
+ reason: `Reputation ${rep.toFixed(3)} below threshold ${this.threshold}`,
142
+ });
143
+ }
130
144
  }
131
145
 
132
146
  /** Get current persona list (may include respawned successors). */
@@ -143,8 +157,9 @@ export class ReputationManager {
143
157
  }
144
158
  this.store.update((state) => {
145
159
  (state as any).reputation = data;
146
- }).catch(() => {
147
- // Persistence failure is non-fatal
160
+ }).catch((err) => {
161
+ // Log persistence failures instead of silently swallowing
162
+ console.warn("[consensus] Reputation persistence failed:", err); // eslint-disable-line no-console
148
163
  });
149
164
  }
150
165
 
@@ -33,4 +33,12 @@ describe("classifyTool", () => {
33
33
  expect(classifyTool("send_email", { send_email: "low" })).toBe("low");
34
34
  expect(classifyTool("get_weather", { get_weather: "high" })).toBe("high");
35
35
  });
36
+
37
+ it("prevents bypass via compound names (high-risk checked first)", () => {
38
+ // Compound names that include a destructive verb (execute, run, delete, send) must still classify as high
39
+ expect(classifyTool("execute_and_log")).toBe("high");
40
+ expect(classifyTool("run_cleanup")).toBe("high");
41
+ expect(classifyTool("delete_then_verify")).toBe("high");
42
+ expect(classifyTool("send_and_check")).toBe("high");
43
+ });
36
44
  });
package/src/risk-tiers.ts CHANGED
@@ -28,7 +28,8 @@ const LOW_RISK_PATTERNS = [
28
28
  /**
29
29
  * Classify a tool name into a risk tier.
30
30
  *
31
- * Priority: user overrides > low-risk patterns > high-risk patterns > default high.
31
+ * Priority: user overrides > high-risk patterns > low-risk patterns > default high.
32
+ * High-risk checked FIRST to prevent bypass via naming (e.g., "get_and_delete_user").
32
33
  * Unknown tools default to high-risk (safe by default).
33
34
  */
34
35
  export function classifyTool(toolName: string, overrides?: RiskTierMap): RiskTier {
@@ -36,14 +37,15 @@ export function classifyTool(toolName: string, overrides?: RiskTierMap): RiskTie
36
37
  return overrides[toolName];
37
38
  }
38
39
 
39
- for (const pattern of LOW_RISK_PATTERNS) {
40
- if (pattern.test(toolName)) return "low";
41
- }
42
-
40
+ // Check high-risk FIRST to prevent bypass via compound names
43
41
  for (const pattern of HIGH_RISK_PATTERNS) {
44
42
  if (pattern.test(toolName)) return "high";
45
43
  }
46
44
 
45
+ for (const pattern of LOW_RISK_PATTERNS) {
46
+ if (pattern.test(toolName)) return "low";
47
+ }
48
+
47
49
  // Unknown tools default to high-risk (safe by default)
48
50
  return "high";
49
51
  }