npm - @consensus-tools/universal - Versions diffs - 0.9.0 → 0.9.1 - Mend

@consensus-tools/universal 0.9.0 → 0.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (42) hide show

package/dist/consensus-llm.test.d.ts +2 -0
package/dist/consensus-llm.test.d.ts.map +1 -0
package/dist/consensus-llm.test.js +244 -0
package/dist/consensus-llm.test.js.map +1 -0
package/dist/defaults.d.ts +10 -0
package/dist/defaults.d.ts.map +1 -1
package/dist/defaults.js +63 -2
package/dist/defaults.js.map +1 -1
package/dist/index.d.ts +13 -11
package/dist/index.d.ts.map +1 -1
package/dist/index.js +130 -49
package/dist/index.js.map +1 -1
package/dist/persona-reviewer-factory.d.ts +22 -0
package/dist/persona-reviewer-factory.d.ts.map +1 -0
package/dist/persona-reviewer-factory.js +318 -0
package/dist/persona-reviewer-factory.js.map +1 -0
package/dist/reputation-manager.d.ts +38 -0
package/dist/reputation-manager.d.ts.map +1 -0
package/dist/reputation-manager.js +154 -0
package/dist/reputation-manager.js.map +1 -0
package/dist/reputation-manager.test.d.ts +2 -0
package/dist/reputation-manager.test.d.ts.map +1 -0
package/dist/reputation-manager.test.js +111 -0
package/dist/reputation-manager.test.js.map +1 -0
package/dist/risk-tiers.d.ts +10 -0
package/dist/risk-tiers.d.ts.map +1 -0
package/dist/risk-tiers.js +46 -0
package/dist/risk-tiers.js.map +1 -0
package/dist/risk-tiers.test.d.ts +2 -0
package/dist/risk-tiers.test.d.ts.map +1 -0
package/dist/risk-tiers.test.js +40 -0
package/dist/risk-tiers.test.js.map +1 -0
package/dist/types.d.ts +59 -6
package/dist/types.d.ts.map +1 -1
package/package.json +9 -9
package/src/consensus-llm.test.ts +23 -4
package/src/defaults.ts +10 -4
package/src/index.ts +22 -18
package/src/persona-reviewer-factory.ts +90 -70
package/src/reputation-manager.ts +46 -31
package/src/risk-tiers.test.ts +8 -0
package/src/risk-tiers.ts +7 -5

package/src/persona-reviewer-factory.ts CHANGED Viewed

@@ -17,10 +17,24 @@ import type { RiskTierMap } from "./types.js";
 //   2. Risk tier check (low = fast-path regex only)
 //   3. Parallel LLM calls per persona (with timeout + fallback)
 //   4. Parse votes from LLM responses
-//   5. Synthesize ConsensusInput (Job, Submissions, Votes)
-//   6. Call resolveConsensus() with the configured policy
+//   5. Synthesize ConsensusInput: ONE "allow" submission, all personas
+//      vote on it (YES = +1, NO = -1, REWRITE = 0). resolveConsensus aggregates.
+//   6. Determine action from consensus result
 //   7. Return LlmDecisionResult
+// ── Safe JSON Serialization ──────────────────────────────────────────
+function safeStringify(obj: unknown, indent?: number): string {
+  const seen = new WeakSet();
+  return JSON.stringify(obj, (_key, value) => {
+    if (typeof value === "object" && value !== null) {
+      if (seen.has(value)) return "[Circular]";
+      seen.add(value);
+    }
+    return value;
+  }, indent);
+}
 // ── Vote Parsing ─────────────────────────────────────────────────────
 interface ParsedVote {
@@ -29,18 +43,22 @@ interface ParsedVote {
   rationale: string;
 }
-const VOTE_PATTERN = /\b(YES|NO|REWRITE)\b/i;
+// Match YES/NO/REWRITE alone on a line, with an optional "VOTE:" prefix (anchored to reduce injection risk)
+const VOTE_LINE_PATTERN = /^(?:VOTE:\s*)?(YES|NO|REWRITE)\s*$/im;
+// Fallback: match anywhere but only as a last resort
+const VOTE_FALLBACK_PATTERN = /\b(YES|NO|REWRITE)\b/i;
 const CONFIDENCE_PATTERN = /confidence[:\s]*([0-9]*\.?[0-9]+)/i;
 function parseVoteFromLlm(response: string): ParsedVote | null {
-  const voteMatch = response.match(VOTE_PATTERN);
+  // Prefer line-anchored match (harder to inject)
+  const lineMatch = response.match(VOTE_LINE_PATTERN);
+  const voteMatch = lineMatch ?? response.match(VOTE_FALLBACK_PATTERN);
   if (!voteMatch) return null;
   const vote = voteMatch[1]!.toUpperCase() as "YES" | "NO" | "REWRITE";
   const confMatch = response.match(CONFIDENCE_PATTERN);
   const confidence = confMatch?.[1] ? Math.min(1, Math.max(0, parseFloat(confMatch[1]))) : 0.5;
-  // Use the full response as rationale (stripped of vote/confidence lines)
   const rationale = response
     .replace(/^.*\b(YES|NO|REWRITE)\b.*$/im, "")
     .replace(/^.*confidence.*$/im, "")
@@ -58,7 +76,6 @@ function buildPersonaPrompt(
   args: Record<string, unknown>,
   regexSignals: string[],
 ): ModelMessage[] {
-  // Use systemPrompt if available (EvalPersonaConfig), otherwise construct from role/bias
   const evalPersona = persona as Partial<EvalPersonaConfig>;
   const systemPrompt = evalPersona.systemPrompt ??
     `You are a ${persona.role} reviewer named ${persona.name}.` +
@@ -68,18 +85,22 @@ function buildPersonaPrompt(
       : "") +
     (persona.failure_modes?.length
       ? ` Watch for: ${persona.failure_modes.join(", ")}.`
-      : "");
+      : "") +
+    ` Respond ONLY with a VOTE line, CONFIDENCE line, and RATIONALE line. Do not echo the tool arguments.`;
   const regexContext = regexSignals.length > 0
     ? `\n\nRegex pre-screen detected these signals:\n${regexSignals.map((s) => `- ${s}`).join("\n")}`
     : "\n\nRegex pre-screen: no pattern matches detected.";
+  // Sanitize args: truncate to prevent prompt stuffing, use safe serialization
+  const argsStr = safeStringify(args, 2).slice(0, 2000);
   const userPrompt =
     `Evaluate this tool call for risks:\n\n` +
     `Tool: ${toolName}\n` +
-    `Arguments: ${JSON.stringify(args, null, 2)}\n` +
+    `Arguments:\n${argsStr}\n` +
     regexContext +
-    `\n\nRespond with:\n` +
+    `\n\nRespond with exactly these three lines:\n` +
     `VOTE: YES (safe to proceed), NO (block this action), or REWRITE (needs modification)\n` +
     `CONFIDENCE: 0.0 to 1.0\n` +
     `RATIONALE: Brief explanation of your decision`;
@@ -92,14 +113,21 @@ function buildPersonaPrompt(
 // ── Regex Pre-Screen ─────────────────────────────────────────────────
+// Fallback guard domains when configured guards have no matching configs
+const FALLBACK_GUARDS = ["security", "compliance", "user-impact"];
 function runRegexPreScreen(
   toolName: string,
   args: Record<string, unknown>,
   guards: string[],
 ): string[] {
   const signals: string[] = [];
+  // Use provided guards; fall back to FALLBACK_GUARDS when none of them has a GUARD_CONFIGS entry
+  const effectiveGuards = guards.filter((g) => GUARD_CONFIGS[g]).length > 0
+    ? guards
+    : FALLBACK_GUARDS;
-  for (const domain of guards) {
+  for (const domain of effectiveGuards) {
     const config = GUARD_CONFIGS[domain];
     if (!config) continue;
@@ -130,28 +158,25 @@ async function callLlmWithTimeout(
   messages: ModelMessage[],
   timeoutMs: number,
 ): Promise<string> {
-  const controller = new AbortController();
-  const timer = setTimeout(() => controller.abort(), timeoutMs);
+  let timer: ReturnType<typeof setTimeout> | undefined;
   try {
     const result = await Promise.race([
       model(messages),
       new Promise<never>((_, reject) => {
-        controller.signal.addEventListener("abort", () =>
-          reject(new Error("LLM call timed out")),
-        );
+        timer = setTimeout(() => reject(new Error("LLM call timed out")), timeoutMs);
       }),
     ]);
     return result;
   } finally {
-    clearTimeout(timer);
+    if (timer) clearTimeout(timer);
   }
 }
 // ── Regex Fallback Vote ──────────────────────────────────────────────
 function regexFallbackVote(
-  persona: PersonaConfig,
+  _persona: PersonaConfig,
   toolName: string,
   args: Record<string, unknown>,
   guards: string[],
@@ -164,10 +189,12 @@ function regexFallbackVote(
       rationale: `Regex fallback: ${signals.join("; ")}`,
     };
   }
+  // When LLM is unavailable AND regex finds nothing, default to block for safety.
+  // This prevents fail-open when all LLMs are down.
   return {
-    vote: "YES",
-    confidence: 0.4,
-    rationale: "Regex fallback: no pattern matches (LLM unavailable)",
+    vote: "NO",
+    confidence: 0.3,
+    rationale: "Regex fallback: no pattern matches but LLM unavailable (fail-closed)",
   };
 }
@@ -197,7 +224,7 @@ export async function deliberate(
 ): Promise<LlmDecisionResult> {
   const decisionId = `dec_${crypto.randomUUID().slice(0, 12)}`;
   const personas = config.reputationManager.getPersonas();
-  const guards = config.guards ?? ["security", "compliance", "user-impact"];
+  const guards = config.guards ?? FALLBACK_GUARDS;
   // 1. Regex pre-screen
   const regexSignals = runRegexPreScreen(toolName, args, guards);
@@ -205,7 +232,6 @@ export async function deliberate(
   // 2. Risk tier check
   const tier = classifyTool(toolName, config.riskTiers);
   if (tier === "low") {
-    // Fast-path: regex only, no LLM calls
     const hasRisk = regexSignals.length > 0;
     return {
       decisionId,
@@ -244,7 +270,6 @@ export async function deliberate(
           };
         }
-        // Unparseable response, fall back to regex
         const fallback = regexFallbackVote(persona, toolName, args, guards);
         return {
           personaId: persona.id,
@@ -253,7 +278,6 @@ export async function deliberate(
           source: "regex_fallback" as const,
         };
       } catch {
-        // LLM failure, fall back to regex
         const fallback = regexFallbackVote(persona, toolName, args, guards);
         return {
           personaId: persona.id,
@@ -266,17 +290,21 @@ export async function deliberate(
   );
   // 4. Synthesize ConsensusInput for resolveConsensus()
-  //    Each persona creates a "submission" (their evaluation) and votes for it
+  //
+  // FIXED: Use a SINGLE "allow" submission. All personas vote on it.
+  // YES voters score +1, NO voters score -1, REWRITE voters score 0.
+  // This way resolveConsensus sees N votes on 1 submission, not N
+  // submissions with 1 vote each.
   const now = new Date().toISOString();
   const jobId = `job_facade_${decisionId}`;
+  const submissionId = `sub_${decisionId}_allow`;
-  // Create a minimal Job with the configured policy
   const job = {
     id: jobId,
     boardId: "",
     status: "SUBMITTED" as const,
     title: `Deliberation: ${toolName}`,
-    description: JSON.stringify(args),
+    description: "",
     createdByAgentId: "facade",
     createdAt: now,
     updatedAt: now,
@@ -288,33 +316,31 @@ export async function deliberate(
     minParticipants: 1,
   };
-  // Each persona submits their evaluation
-  const submissions = voteResults.map((v, i) => ({
-    id: `sub_${decisionId}_${i}`,
+  // Single submission representing "allow this tool call"
+  const submissions = [{
+    id: submissionId,
     jobId,
-    agentId: v.personaId,
+    agentId: "facade",
     submittedAt: now,
-    summary: v.rationale,
-    artifacts: { vote: v.vote, confidence: v.confidence, source: v.source },
-    confidence: v.confidence,
+    summary: `Allow ${toolName}`,
+    artifacts: {},
+    confidence: 1.0,
     requestedPayout: 0,
     status: "SUBMITTED" as const,
-  }));
+  }];
-  // Each persona votes YES (+1) on their own submission
-  // and scores based on their confidence
+  // Each persona votes on the single submission
   const votes = voteResults.map((v, i) => ({
     id: `vote_${decisionId}_${i}`,
     jobId,
     agentId: v.personaId,
-    submissionId: `sub_${decisionId}_${i}`,
+    submissionId,
     score: v.vote === "YES" ? 1 : v.vote === "NO" ? -1 : 0,
     weight: v.confidence,
     rationale: v.rationale,
     createdAt: now,
   }));
-  // Reputation function from the manager
   const reputation = (agentId: string) =>
     config.reputationManager.getReputation(agentId);
@@ -326,45 +352,40 @@ export async function deliberate(
     reputation,
   };
-  let consensusResult: ConsensusResult;
+  let consensusTrace: Record<string, unknown>;
   try {
-    consensusResult = resolveConsensus(consensusInput);
+    const result: ConsensusResult = resolveConsensus(consensusInput);
+    consensusTrace = result.consensusTrace;
+    // Extract the actual weighted score from the consensus trace.
+    // resolveConsensus always returns a "winner" (the single submission),
+    // but the score may be negative (more NO than YES votes).
+    const traceScores = (consensusTrace as any)?.scores as Record<string, number> | undefined;
+    const submissionScore = traceScores?.[submissionId] ?? 0;
+    consensusTrace = { ...consensusTrace, submissionScore };
   } catch {
-    // If resolution fails, fall back to simple majority
-    const yesCount = voteResults.filter((v) => v.vote === "YES").length;
-    const majority = yesCount > voteResults.length / 2;
-    consensusResult = {
-      winners: majority ? ["allow"] : ["block"],
-      winningSubmissionIds: [],
-      consensusTrace: { policy: "fallback_majority", reason: "resolve_error" },
-      finalArtifact: null,
-    };
+    consensusTrace = { policy: "fallback_majority", reason: "resolve_error" };
   }
-  // 6. Determine final action
-  const winnerIds = new Set(consensusResult.winners);
-  const winningVotes = voteResults.filter((v) => winnerIds.has(v.personaId));
-  const dominantVote = winningVotes.length > 0
-    ? winningVotes[0]!.vote
-    : voteResults[0]?.vote ?? "YES";
+  // 6. Determine action from vote distribution (direct counting)
+  // resolveConsensus provides the audit trace; vote counting determines the action.
+  // This avoids the "always-a-winner" problem where resolveConsensus returns
+  // a winner even when the score is negative.
+  const yesCount = voteResults.filter((v) => v.vote === "YES").length;
+  const noCount = voteResults.filter((v) => v.vote === "NO").length;
+  const rewriteCount = voteResults.filter((v) => v.vote === "REWRITE").length;
   let action: "allow" | "block" | "escalate";
-  if (dominantVote === "YES") {
+  if (rewriteCount > voteResults.length / 2) {
+    action = "escalate";
+  } else if (yesCount > noCount) {
     action = "allow";
-  } else if (dominantVote === "NO") {
-    action = "block";
   } else {
-    action = "escalate";
-  }
-  // If no clear winner (empty winners), use simple vote counting
-  if (consensusResult.winners.length === 0) {
-    const yesCount = voteResults.filter((v) => v.vote === "YES").length;
-    const noCount = voteResults.filter((v) => v.vote === "NO").length;
-    action = yesCount >= noCount ? "allow" : "block";
+    action = "block";
   }
-  // Compute aggregate score (0-1 based on vote distribution)
+  // Compute aggregate score
   const totalConfidence = voteResults.reduce((s, v) => s + v.confidence, 0);
   const yesConfidence = voteResults
     .filter((v) => v.vote === "YES")
@@ -376,11 +397,10 @@ export async function deliberate(
     action,
     votes: voteResults,
     policy: config.policyType,
-    consensusTrace: consensusResult.consensusTrace,
+    consensusTrace,
     aggregateScore,
   };
-  // 7. Record decision for reputation tracking
   config.reputationManager.recordDecision(result);
   return result;

package/src/reputation-manager.ts CHANGED Viewed

@@ -9,6 +9,9 @@ import type { FeedbackSignal, LlmDecisionResult } from "./types.js";
 // Updates from human feedback signals (onFeedback), not self-consensus.
 // Triggers persona respawn when reputation drops below threshold.
+const MAX_DECISION_LOOKBACK = 100;
+const MAX_FEEDBACK_LOOKBACK = 500;
 export interface RespawnEvent {
   oldPersona: PersonaConfig;
   newPersona: PersonaConfig;
@@ -51,10 +54,16 @@ export class ReputationManager {
   recordDecision(result: LlmDecisionResult): void {
     this.decisions.set(result.decisionId, result);
     this.decisionHistory.push(result);
-    // Keep last 100 decisions for respawn analysis
-    if (this.decisionHistory.length > 100) {
+    // Cap both collections to prevent memory leaks
+    if (this.decisionHistory.length >= MAX_DECISION_LOOKBACK) {
       this.decisionHistory.shift();
     }
+    // Trim the feedback correlation map (keep most recent N entries)
+    if (this.decisions.size > MAX_FEEDBACK_LOOKBACK) {
+      const oldest = this.decisions.keys().next().value;
+      if (oldest) this.decisions.delete(oldest);
+    }
   }
   /** Process human feedback signal and update reputation. */
@@ -85,7 +94,7 @@ export class ReputationManager {
       this.scores.set(change.persona_id, change.reputation_after);
     }
-    // Check for respawn
+    // Check for respawn (collect respawns, then apply)
     this.checkRespawn();
     // Persist if store configured
@@ -94,39 +103,44 @@ export class ReputationManager {
     return result.changes;
   }
-  /** Check if any persona needs respawn. */
+  /** Check if any persona needs respawn. Collects replacements first to avoid mutation during iteration. */
   private checkRespawn(): void {
+    const replacements: Array<{ index: number; old: PersonaConfig; rep: number }> = [];
+    // Collect personas that need respawn (don't mutate during scan)
     for (let i = 0; i < this.personas.length; i++) {
       const persona = this.personas[i]!;
       const rep = this.scores.get(persona.id) ?? 0.55;
       if (rep < this.threshold) {
-        // Build learning summary from decision history
-        const decisionRecords = this.decisionHistory.map((d) => ({
-          final_decision: d.action === "allow" ? "ALLOW" : "BLOCK",
-          votes: d.votes.map((v) => ({
-            persona_id: v.personaId,
-            vote: v.vote,
-            confidence: v.confidence,
-          })),
-        }));
-        const learning = buildLearningSummary(persona.id, decisionRecords);
-        const successor = mutatePersona(persona, learning);
-        // Replace persona
-        this.personas[i] = successor;
-        this.scores.delete(persona.id);
-        this.scores.set(successor.id, successor.reputation ?? 0.55);
-        this.onRespawn?.({
-          oldPersona: persona,
-          newPersona: successor,
-          reputation: rep,
-          reason: `Reputation ${rep.toFixed(3)} below threshold ${this.threshold}`,
-        });
+        replacements.push({ index: i, old: persona, rep });
       }
     }
+    // Apply replacements after scan
+    for (const { index, old, rep } of replacements) {
+      const decisionRecords = this.decisionHistory.map((d) => ({
+        final_decision: d.action === "allow" ? "ALLOW" : "BLOCK",
+        votes: d.votes.map((v) => ({
+          persona_id: v.personaId,
+          vote: v.vote,
+          confidence: v.confidence,
+        })),
+      }));
+      const learning = buildLearningSummary(old.id, decisionRecords);
+      const successor = mutatePersona(old, learning);
+      this.personas[index] = successor;
+      this.scores.delete(old.id);
+      this.scores.set(successor.id, successor.reputation ?? 0.55);
+      this.onRespawn?.({
+        oldPersona: old,
+        newPersona: successor,
+        reputation: rep,
+        reason: `Reputation ${rep.toFixed(3)} below threshold ${this.threshold}`,
+      });
+    }
   }
   /** Get current persona list (may include respawned successors). */
@@ -143,8 +157,9 @@ export class ReputationManager {
     }
     this.store.update((state) => {
       (state as any).reputation = data;
-    }).catch(() => {
-      // Persistence failure is non-fatal
+    }).catch((err) => {
+      // Log persistence failures instead of silently swallowing
+      console.warn("[consensus] Reputation persistence failed:", err); // eslint-disable-line no-console
     });
   }

package/src/risk-tiers.test.ts CHANGED Viewed

@@ -33,4 +33,12 @@ describe("classifyTool", () => {
     expect(classifyTool("send_email", { send_email: "low" })).toBe("low");
     expect(classifyTool("get_weather", { get_weather: "high" })).toBe("high");
   });
+  it("prevents bypass via compound names (high-risk checked first)", () => {
+    // These names lead with an action verb (execute, run, delete, send); some also contain benign-looking words (log, verify, check)
+    expect(classifyTool("execute_and_log")).toBe("high");
+    expect(classifyTool("run_cleanup")).toBe("high");
+    expect(classifyTool("delete_then_verify")).toBe("high");
+    expect(classifyTool("send_and_check")).toBe("high");
+  });
 });

package/src/risk-tiers.ts CHANGED Viewed

@@ -28,7 +28,8 @@ const LOW_RISK_PATTERNS = [
 /**
  * Classify a tool name into a risk tier.
  *
- * Priority: user overrides > low-risk patterns > high-risk patterns > default high.
+ * Priority: user overrides > high-risk patterns > low-risk patterns > default high.
+ * High-risk checked FIRST to prevent bypass via naming (e.g., "get_and_delete_user").
  * Unknown tools default to high-risk (safe by default).
  */
 export function classifyTool(toolName: string, overrides?: RiskTierMap): RiskTier {
@@ -36,14 +37,15 @@ export function classifyTool(toolName: string, overrides?: RiskTierMap): RiskTie
     return overrides[toolName];
   }
-  for (const pattern of LOW_RISK_PATTERNS) {
-    if (pattern.test(toolName)) return "low";
-  }
+  // Check high-risk FIRST to prevent bypass via compound names
   for (const pattern of HIGH_RISK_PATTERNS) {
     if (pattern.test(toolName)) return "high";
   }
+  for (const pattern of LOW_RISK_PATTERNS) {
+    if (pattern.test(toolName)) return "low";
+  }
   // Unknown tools default to high-risk (safe by default)
   return "high";
 }