npm - @tangle-network/agent-eval - Versions diffs - 0.31.1 → 0.33.0 - Mend

@tangle-network/agent-eval 0.31.1 → 0.33.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6)

package/dist/index.js (changed)

@@ -2900,6 +2900,11 @@ var MetricsCollector = class {
 };
 // src/driver.ts
+// Stance paragraph for each rigor level of the simulated user. buildDriverSystemPrompt
+// picks one by persona.rigor (a missing rigor is treated as "demanding") and places it
+// in the driver's system prompt.
+var RIGOR_STANCE = {
+  cooperative: "Your stance: a pragmatic early adopter. You accept reasonable answers and only push back on clear gaps or outright errors.",
+  demanding: "Your stance: an experienced professional with no time to waste. You do not accept vague, hedged, or generic answers \u2014 you expect specifics, and you say so plainly when you do not get them.",
+  relentless: "Your stance: a senior partner reviewing this work for a client who will litigate if it is wrong. You interrogate every claim. You accept nothing undefended. You find the single weakest point in every answer and attack it. Courteous, never satisfied."
+};
 var AgentDriver = class {
   tc;
   client;
@@ -2929,12 +2934,14 @@ var AgentDriver = class {
     const conversationHistory = [];
     let completed = false;
     let turnsToCompletion = null;
+    let criteriaMetAtTurn = null;
     for (let turn = 1; turn <= persona.maxTurns; turn++) {
       const state = await metrics.getState();
       const userMessage = await this.decideNextMessage(persona, state, conversationHistory);
       if (userMessage === "DONE") {
         completed = true;
         turnsToCompletion = turn - 1;
+        console.log(`  SIGNED OFF by simulated ${persona.role} after turn ${turn - 1}`);
         break;
       }
       const turnStart = Date.now();
@@ -2963,11 +2970,9 @@ var AgentDriver = class {
       console.log(
         `  [turn ${turn}] ${conv.completionPercent.toFixed(0)}% \u2014 ${criteriaStr} (${(latency / 1e3).toFixed(1)}s)`
       );
-      if (conv.complete) {
-        completed = true;
-        turnsToCompletion = turn;
-        console.log(`  COMPLETE at turn ${turn}`);
-        break;
+      if (conv.complete && criteriaMetAtTurn === null) {
+        criteriaMetAtTurn = turn;
+        console.log(`  criteria met at turn ${turn} \u2014 driver continues pressure-testing`);
       }
     }
     const finalState = await metrics.getState();
@@ -2975,6 +2980,7 @@ var AgentDriver = class {
       personaId: persona.id,
       completed,
       turnsToCompletion,
+      criteriaMetAtTurn,
       totalTurns: turnMetrics.length,
       metrics: turnMetrics,
       finalState,
@@ -2985,51 +2991,13 @@ var AgentDriver = class {
   }
   /** Use the driver LLM to decide what the "user" says next */
   async decideNextMessage(persona, state, history) {
-    const lastResponse = history.length > 0 ? history[history.length - 1].content.slice(0, 2e3) : "(no conversation yet \u2014 this is the first message)";
-    const recentHistory = history.slice(-6).map((h) => `${h.role}: ${h.content.slice(0, 500)}`).join("\n\n");
-    const resp = await this.tc.chat({
-      model: this.driverModel,
-      messages: [
-        {
-          role: "system",
-          content: `You are playing the role of a ${persona.role} testing an AI agent.
-Your goal: ${persona.goal}
-${this.productContext ? `Product context:
-${this.productContext}
-` : ""}
-Current state:
-- Tasks: ${state.tasks}
-- Events: ${state.events}
-- Proposals: pending=${state.proposals.pending}, approved=${state.proposals.approved}, rejected=${state.proposals.rejected}
-- Vault files: ${state.vaultFiles.length} (${state.vaultFiles.slice(0, 10).join(", ")}${state.vaultFiles.length > 10 ? "..." : ""})
-Completion criteria met: ${this.describeCompletion(persona, state)}
-Decide what to do next:
-1. If completion is 100% \u2014 respond with exactly "DONE"
-2. If a proposal is pending \u2014 approve or reject it (with reason)
-3. If the agent is on track \u2014 push for the next deliverable
-4. If the agent is off track \u2014 give specific corrective feedback
-5. If this is the first message \u2014 start with a clear, actionable request
-Output ONLY your next message to the agent. Be specific. Be realistic.
-Don't be patient \u2014 a real ${persona.role} wouldn't accept vague answers.`
-        },
-        {
-          role: "user",
-          content: recentHistory ? `Recent conversation:
-${recentHistory}
-The agent just said:
-${lastResponse}` : "No conversation yet. Send your opening message."
-        }
-      ],
-      temperature: 0.5,
-      maxTokens: 500
+    return decideNextUserTurn(this.tc, {
+      persona,
+      state,
+      history,
+      productContext: this.productContext,
+      model: this.driverModel
     });
-    const content = resp.choices?.[0]?.message?.content ?? "";
-    return content.trim();
   }
   /** Handle pending approvals based on persona feedback patterns */
   async handleApprovals(persona, workspaceId, _state) {
@@ -3049,16 +3017,77 @@ ${lastResponse}` : "No conversation yet. Send your opening message."
       }
     }
   }
-  /** Describe which completion criteria are met */
-  describeCompletion(persona, state) {
-    const results = persona.completionCriteria.map((c) => {
-      const met = c.check(state);
-      return `${c.name}: ${met ? "MET" : "NOT MET"}`;
-    });
-    const metCount = results.filter((r) => r.includes("MET") && !r.includes("NOT")).length;
-    return `${metCount}/${persona.completionCriteria.length} \u2014 ${results.join(", ")}`;
-  }
 };
+/**
+ * Summarise which of the persona's completion criteria the given state satisfies.
+ *
+ * @param persona - object whose `completionCriteria` entries each expose `name` and `check(state)`.
+ * @param state - value handed unchanged to every criterion's `check`.
+ * @returns a string of the form `"<met>/<total> \u2014 <name>: MET, <name>: NOT MET, ..."`.
+ */
+function describeCompletion(persona, state) {
+  // Evaluate each criterion exactly once and keep the boolean. Counting from the
+  // rendered text instead would miscount any criterion whose name contains "NOT".
+  const outcomes = persona.completionCriteria.map((c) => ({
+    name: c.name,
+    met: Boolean(c.check(state))
+  }));
+  const metCount = outcomes.filter((o) => o.met).length;
+  const results = outcomes.map((o) => `${o.name}: ${o.met ? "MET" : "NOT MET"}`);
+  return `${metCount}/${persona.completionCriteria.length} \u2014 ${results.join(", ")}`;
+}
+/**
+ * Build the system prompt for the driver model that role-plays the user.
+ *
+ * @param persona - reads `role`, `goal`, `completionCriteria`, and the optional
+ *   `rigor`, `expertise`, `pressurePoints` and `curveballs`.
+ * @param state - reads `tasks`, `events`, `proposals.{pending,approved,rejected}`
+ *   and `vaultFiles` (only the first 10 file names are listed).
+ * @param productContext - optional text; when non-empty it is included under
+ *   a "Product context:" heading.
+ * @returns the full prompt text.
+ */
+function buildDriverSystemPrompt(persona, state, productContext = "") {
+  // Look the stance up as an own key and fall back to the default. A plain
+  // RIGOR_STANCE[rigor] would put the text "undefined" (or, for an inherited
+  // key such as "constructor", a function's source) into the prompt.
+  const requestedRigor = persona.rigor ?? "demanding";
+  const stance = Object.hasOwn(RIGOR_STANCE, requestedRigor) ? RIGOR_STANCE[requestedRigor] : RIGOR_STANCE.demanding;
+  const expertise = persona.expertise ? ` You are ${persona.expertise}.` : "";
+  // Optional section: points the simulated user must make the agent address unprompted.
+  const pressure = persona.pressurePoints && persona.pressurePoints.length > 0 ? `
+A competent ${persona.role} here MUST get the agent to address each of:
+${persona.pressurePoints.map((p) => `  - ${p}`).join("\n")}
+Do NOT hand these to the agent. Probe whether it surfaces them itself. If it misses one, press on exactly that gap until it delivers or demonstrably fails.
+` : "";
+  // Optional section: new developments the simulated user may introduce mid-conversation.
+  const curveballs = persona.curveballs && persona.curveballs.length > 0 ? `
+Once the agent is coasting on easy answers, introduce ONE of these as a genuine new development \u2014 never as a quiz:
+${persona.curveballs.map((c) => `  - ${c}`).join("\n")}
+` : "";
+  return `You are role-playing a real ${persona.role} putting an AI agent through its paces.${expertise}
+Your objective: ${persona.goal}
+You are deciding whether this agent's work is good enough to stake your professional reputation on. Assume it is not \u2014 until it proves otherwise.
+${stance}
+${productContext ? `Product context:
+${productContext}
+` : ""}Current workspace state:
+- Tasks: ${state.tasks} | Events: ${state.events}
+- Proposals: pending=${state.proposals.pending}, approved=${state.proposals.approved}, rejected=${state.proposals.rejected}
+- Vault files (${state.vaultFiles.length}): ${state.vaultFiles.slice(0, 10).join(", ")}${state.vaultFiles.length > 10 ? " \u2026" : ""}
+- Nominal task criteria: ${describeCompletion(persona, state)}
+${pressure}${curveballs}
+How to choose your next message:
+1. Silently judge the agent's last response the way a ${persona.role} would. Is every claim defended with a specific authority, figure, or mechanism? Or is it vague, hedged, or generic?
+2. If it is vague or hand-waved \u2014 do NOT move on. Name the gap and demand the specific authority / figure / mechanism. "It depends" is not an answer; force the decision.
+3. If it makes a claim you can challenge \u2014 challenge it. Make the agent defend or correct it.
+4. If it missed something a ${persona.role} would catch \u2014 press on exactly that, without naming it for the agent.
+5. If it is genuinely solid \u2014 escalate: go a layer deeper, or introduce a curveball.
+6. First message \u2014 state your situation as you really would: realistic, specific, with the messy detail, but do not coach the agent.
+Sign-off: respond with exactly "DONE" only when a ${persona.role} would act on this work without redoing it. Nominal task completion is NOT sign-off \u2014 sloppy-but-complete still fails. If the agent never gets there, keep pushing; never sign off on weak work.
+Output ONLY your next message to the agent \u2014 in character, first person, no meta-commentary, no stage directions.`;
+}
+/**
+ * Ask the driver model for the simulated user's next message.
+ *
+ * @param tc - client exposing `chat({ model, messages, temperature, maxTokens })`.
+ * @param opts - `{ persona, state, history, productContext?, model? }`; `history`
+ *   entries are read for `role` and `content`.
+ * @returns the model's reply text, trimmed ("" when the response carries no content).
+ */
+async function decideNextUserTurn(tc, opts) {
+  const {
+    persona,
+    state,
+    history,
+    productContext: context = "",
+    model: driverModel = "claude-sonnet-4-6"
+  } = opts;
+  // Most recent entry, capped at 2000 characters.
+  let latest = "(no conversation yet \u2014 this is the first message)";
+  if (history.length > 0) {
+    latest = history[history.length - 1].content.slice(0, 2e3);
+  }
+  // Last six entries, each capped at 500 characters, separated by blank lines.
+  const transcript = history.slice(-6).map((entry) => `${entry.role}: ${entry.content.slice(0, 500)}`).join("\n\n");
+  const userContent = transcript ? `Recent conversation:\n${transcript}\nThe agent's latest response:\n${latest}` : "No conversation yet. Send your opening message \u2014 in character, phrased as this person actually would.";
+  const reply = await tc.chat({
+    model: driverModel,
+    messages: [
+      { role: "system", content: buildDriverSystemPrompt(persona, state, context) },
+      { role: "user", content: userContent }
+    ],
+    temperature: 0.5,
+    maxTokens: 700
+  });
+  const text = reply.choices?.[0]?.message?.content ?? "";
+  return text.trim();
+}
 // src/integration-gates.ts
 function integrationManifestValidatedPayload(input) {
@@ -4520,6 +4549,194 @@ function pathExists(obj, path) {
   return true;
 }
+// src/completion-verifier.ts
+// Words that tokens() drops before requirement/candidate text is compared.
+var STOPWORDS = /* @__PURE__ */ new Set([
+  "the",
+  "a",
+  "an",
+  "of",
+  "for",
+  "and",
+  "or",
+  "to",
+  "in",
+  "on",
+  "with",
+  "by"
+]);
+// Minimum score a produced item needs to be kept as a candidate for a requirement.
+var MATCH_THRESHOLD = 0.5;
+// Minimum trimmed content length: shorter artifacts are ignored, and shorter
+// proposal bodies are treated as carrying no content.
+var MIN_CONTENT_CHARS = 50;
+/**
+ * Lower-case `s`, split it on runs of characters outside a-z / 0-9, and return
+ * the set of pieces that are longer than one character and are not stopwords.
+ */
+function tokens(s) {
+  const kept = /* @__PURE__ */ new Set();
+  for (const piece of s.toLowerCase().split(/[^a-z0-9]+/)) {
+    if (piece.length <= 1) continue;
+    if (STOPWORDS.has(piece)) continue;
+    kept.add(piece);
+  }
+  return kept;
+}
+/**
+ * Fraction of the requirement's tokens that also occur in the candidate text.
+ * Returns 0 when the requirement text yields no tokens.
+ */
+function tokenRecall(requirementText, candidateText) {
+  const wanted = tokens(requirementText);
+  if (wanted.size === 0) return 0;
+  const available = tokens(candidateText);
+  const matched = [...wanted].filter((t) => available.has(t)).length;
+  return matched / wanted.size;
+}
+/**
+ * Collect the artifacts that could satisfy `req`.
+ *
+ * An artifact is skipped when its trimmed content is shorter than MIN_CONTENT_CHARS.
+ * Otherwise it is scored by token recall of the requirement's title + category
+ * against the artifact's path + kind; a case-insensitive category/kind match
+ * lifts the score to at least 1. Scores below MATCH_THRESHOLD are dropped.
+ */
+function artifactCandidates(req, reqIndex, artifacts) {
+  const needle = `${req.title} ${req.category ?? ""}`;
+  const found = [];
+  artifacts.forEach((artifact, index) => {
+    const body = artifact.content ?? "";
+    if (body.trim().length < MIN_CONTENT_CHARS) return;
+    const recall = tokenRecall(needle, `${artifact.path ?? ""} ${artifact.kind}`);
+    const kindsAgree = req.category && artifact.kind && req.category.toLowerCase() === artifact.kind.toLowerCase();
+    const score = kindsAgree ? Math.max(recall, 1) : recall;
+    if (score < MATCH_THRESHOLD) return;
+    const label = artifact.path ?? artifact.kind;
+    found.push({
+      reqIndex,
+      itemKey: `artifact:${index}`,
+      score,
+      evidence: `artifact '${label}' matched (token recall ${score.toFixed(2)})`,
+      content: artifact.content ?? null
+    });
+  });
+  return found;
+}
+/**
+ * Collect the approved proposals that could satisfy `req`.
+ *
+ * Only proposals whose status is "approved" are considered. Each is scored by
+ * token recall of the requirement's title + category against the proposal title,
+ * and kept when the score reaches MATCH_THRESHOLD. The candidate's `content` is
+ * the proposal body, or null when its trimmed length is below MIN_CONTENT_CHARS.
+ */
+function proposalCandidates(req, reqIndex, proposals) {
+  const needle = `${req.title} ${req.category ?? ""}`;
+  const found = [];
+  for (const proposal of proposals) {
+    const approved = proposal.status === "approved";
+    if (!approved) continue;
+    const score = tokenRecall(needle, proposal.title);
+    if (score < MATCH_THRESHOLD) continue;
+    const text = proposal.content ?? "";
+    const hasEnoughContent = text.trim().length >= MIN_CONTENT_CHARS;
+    found.push({
+      reqIndex,
+      itemKey: `proposal:${proposal.id}`,
+      score,
+      evidence: `approved proposal '${proposal.title}' matched (token recall ${score.toFixed(2)})`,
+      content: hasEnoughContent ? text : null
+    });
+  }
+  return found;
+}
+/**
+ * Collect the tool calls that could satisfy `req`.
+ *
+ * Each tool name is scored by token recall of the requirement title against the
+ * name and kept when the score reaches MATCH_THRESHOLD. Tool-call candidates
+ * never carry content.
+ */
+function toolCallCandidates(req, reqIndex, toolCalls) {
+  const found = [];
+  toolCalls.forEach((toolName, index) => {
+    const score = tokenRecall(req.title, toolName);
+    if (score < MATCH_THRESHOLD) return;
+    const evidence = `tool call '${toolName}' matched (token recall ${score.toFixed(2)})`;
+    found.push({ reqIndex, itemKey: `tool:${index}`, score, evidence, content: null });
+  });
+  return found;
+}
+/**
+ * Check a produced state against a gold task spec, requirement by requirement.
+ *
+ * @param gold - `{ taskId, requirements }`; each requirement is read for `reqId`,
+ *   `title`, and optional `category` / `satisfiedBy` ("artifact", "proposal",
+ *   "tool-call"; anything missing is treated as "any").
+ * @param state - `{ artifacts, proposals, toolCalls }`.
+ * @param checkCorrectness - async `(requirement, content)` returning `{ correct, reason }`;
+ *   called only for matched items that carry content.
+ * @returns `{ taskId, requirements, completionRate, fullyComplete }`.
+ * @throws Error when `gold.requirements` is empty.
+ */
+async function verifyCompletion(gold, state, checkCorrectness) {
+  if (gold.requirements.length === 0) {
+    throw new Error(
+      `verifyCompletion: task '${gold.taskId}' has no requirements \u2014 malformed gold spec`
+    );
+  }
+  // Gather every (requirement, produced item) pairing that clears the match threshold,
+  // restricted to the item kinds the requirement allows.
+  const candidates = [];
+  gold.requirements.forEach((req, i) => {
+    const by = req.satisfiedBy ?? "any";
+    if (by === "artifact" || by === "any") {
+      candidates.push(...artifactCandidates(req, i, state.artifacts));
+    }
+    if (by === "proposal" || by === "any") {
+      candidates.push(...proposalCandidates(req, i, state.proposals));
+    }
+    if (by === "tool-call" || by === "any") {
+      candidates.push(...toolCallCandidates(req, i, state.toolCalls));
+    }
+  });
+  // Greedy one-to-one assignment, best score first: a requirement takes at most one
+  // item and an item is used for at most one requirement.
+  candidates.sort((a, b) => b.score - a.score);
+  const assigned = /* @__PURE__ */ new Map();
+  const itemTaken = /* @__PURE__ */ new Set();
+  for (const c of candidates) {
+    if (assigned.has(c.reqIndex) || itemTaken.has(c.itemKey)) continue;
+    assigned.set(c.reqIndex, c);
+    itemTaken.add(c.itemKey);
+  }
+  const requirements = [];
+  for (let i = 0; i < gold.requirements.length; i++) {
+    const req = gold.requirements[i];
+    const match = assigned.get(i);
+    const evidence = [];
+    // Stays null unless the correctness checker is actually run.
+    let correct = null;
+    if (match) {
+      evidence.push(match.evidence);
+      if (match.content !== null) {
+        const r = await checkCorrectness(req, match.content);
+        correct = r.correct;
+        evidence.push(`correctness: ${r.correct ? "pass" : "fail"} \u2014 ${r.reason}`);
+      } else {
+        evidence.push("correctness: not assessed \u2014 matched item carries no content");
+      }
+    } else {
+      const by = req.satisfiedBy ?? "any";
+      const kind = by === "any" ? "artifact/proposal/tool-call" : by;
+      evidence.push(`no produced ${kind} matched this requirement`);
+    }
+    const structurallyPresent = match !== void 0;
+    // A matched item whose correctness was not assessed (correct === null) still
+    // counts as satisfied; only an explicit `false` verdict fails it.
+    const satisfied = structurallyPresent && correct !== false;
+    requirements.push({
+      reqId: req.reqId,
+      title: req.title,
+      structurallyPresent,
+      correct,
+      satisfied,
+      evidence
+    });
+  }
+  const satisfiedCount = requirements.filter((r) => r.satisfied).length;
+  return {
+    taskId: gold.taskId,
+    requirements,
+    completionRate: satisfiedCount / requirements.length,
+    fullyComplete: satisfiedCount === requirements.length
+  };
+}
+/**
+ * Extract the `{ correct, reason }` verdict from a correctness-checker reply.
+ *
+ * The span from the first "{" to the last "}" in `raw` is parsed as JSON.
+ *
+ * @param raw - the model's reply text.
+ * @returns `{ correct, reason }`; `reason` is "" when the reply's `reason` is not a string.
+ * @throws Error when `raw` contains no brace-delimited span or `correct` is not a boolean.
+ * @throws SyntaxError when the span is not valid JSON (the original parse error is
+ *   attached as `cause`).
+ */
+function parseCorrectnessResponse(raw) {
+  const match = raw.match(/\{[\s\S]*\}/);
+  if (!match) {
+    throw new Error(`correctness checker: no JSON object in model response: ${raw.slice(0, 200)}`);
+  }
+  let parsed;
+  try {
+    parsed = JSON.parse(match[0]);
+  } catch (err) {
+    // Keep the SyntaxError type JSON.parse would have thrown, but say which
+    // checker failed and on what text, matching the other errors raised here.
+    throw new SyntaxError(
+      `correctness checker: invalid JSON in model response: ${match[0].slice(0, 200)}`,
+      { cause: err }
+    );
+  }
+  if (typeof parsed.correct !== "boolean") {
+    throw new Error(`correctness checker: 'correct' is not a boolean in: ${match[0].slice(0, 200)}`);
+  }
+  return { correct: parsed.correct, reason: typeof parsed.reason === "string" ? parsed.reason : "" };
+}
+/**
+ * Build an async `(requirement, content)` checker that asks a chat model whether
+ * the content fulfils the requirement and parses the JSON verdict it returns.
+ *
+ * @param tc - client exposing `chat({ model, messages, temperature, maxTokens })`.
+ * @param opts - optional `{ model, maxContentChars }`; the content sent to the
+ *   model is cut to `maxContentChars` characters (8000 when not given).
+ */
+function createLlmCorrectnessChecker(tc, opts = {}) {
+  const model = opts.model ?? "claude-sonnet-4-6";
+  const maxContentChars = opts.maxContentChars ?? 8e3;
+  const systemPrompt = 'You verify whether a produced work artifact actually fulfils a stated requirement. Judge fulfilment only \u2014 is the deliverable substantively present and on-point \u2014 not polish. A plan to do it later, a vague gesture, or a description of what should be done does NOT fulfil a requirement; the artifact must BE the deliverable. Respond with a single JSON object: {"correct": boolean, "reason": string (<= 30 words)}.';
+  const check = async (requirement, content) => {
+    // The category line is present only when the requirement has a category.
+    const categoryLine = requirement.category ? `Category: ${requirement.category}\n` : "";
+    const userPrompt = `Requirement: ${requirement.title}\n${categoryLine}\nProduced artifact:\n${content.slice(0, maxContentChars)}`;
+    const reply = await tc.chat({
+      model,
+      messages: [
+        { role: "system", content: systemPrompt },
+        { role: "user", content: userPrompt }
+      ],
+      temperature: 0,
+      maxTokens: 200
+    });
+    return parseCorrectnessResponse(reply.choices?.[0]?.message?.content ?? "");
+  };
+  return check;
+}
 // src/dual-agent-bench.ts
 var DualAgentBench = class {
   async run(config) {
@@ -5174,6 +5391,40 @@ function canonicalInstruction(value) {
   return normalized.length === 0 ? normalized : normalized[0].toUpperCase() + normalized.slice(1);
 }
+// src/produced-state.ts
+/**
+ * Map a MIME type to an artifact kind: "json" when it contains "json", "text"
+ * when it starts with "text/", and "file" otherwise (including when it is missing).
+ */
+function artifactKind(mimeType) {
+  if (mimeType && mimeType.includes("json")) {
+    return "json";
+  }
+  if (mimeType && mimeType.startsWith("text/")) {
+    return "text";
+  }
+  return "file";
+}
+/**
+ * Reduce an event stream to the `{ artifacts, proposals, toolCalls }` it produced.
+ *
+ * - "tool_call" events contribute their `toolName` once each, in first-seen order.
+ * - "artifact" events contribute `{ kind, path, content }`, with `path` taken from
+ *   `name`, then `uri`, then `artifactId`, and `content` defaulting to "".
+ * - "proposal_created" events contribute `{ id, title, status }`, with `status`
+ *   defaulting to "pending".
+ * Events of any other type are ignored.
+ */
+function extractProducedState(events) {
+  const produced = { artifacts: [], proposals: [], toolCalls: [] };
+  const namesSeen = /* @__PURE__ */ new Set();
+  for (const event of events) {
+    switch (event.type) {
+      case "tool_call": {
+        const toolName = event.toolName;
+        if (!toolName || namesSeen.has(toolName)) break;
+        namesSeen.add(toolName);
+        produced.toolCalls.push(toolName);
+        break;
+      }
+      case "artifact":
+        produced.artifacts.push({
+          kind: artifactKind(event.mimeType),
+          path: event.name ?? event.uri ?? event.artifactId,
+          content: event.content ?? ""
+        });
+        break;
+      case "proposal_created":
+        produced.proposals.push({
+          id: event.proposalId,
+          title: event.title,
+          status: event.status ?? "pending"
+        });
+        break;
+      default:
+        break;
+    }
+  }
+  return produced;
+}
 // src/prompt-registry.ts
 var PromptRegistry = class {
   entries = /* @__PURE__ */ new Map();
@@ -9092,8 +9343,8 @@ function ratio(numerator, denominator) {
   return denominator > 0 ? numerator / denominator : 0;
 }
 function tokenJaccard(a, b) {
-  const left = new Set(tokens(a));
-  const right = new Set(tokens(b));
+  const left = new Set(tokens2(a));
+  const right = new Set(tokens2(b));
   if (left.size === 0 || right.size === 0) return 0;
   let intersection = 0;
   for (const token of left) {
@@ -9111,7 +9362,7 @@ function tagOverlap(a, b) {
   }
   return intersection / Math.max(left.size, right.size);
 }
-function tokens(text) {
+function tokens2(text) {
   return normalize(text).split(/\s+/).filter((token) => token.length >= 3 && !STOP_WORDS.has(token));
 }
 function normalize(text) {
@@ -10545,6 +10796,7 @@ export {
   blockingKnowledgeEval,
   bonferroni,
   bootstrapCi,
+  buildDriverSystemPrompt,
   buildReflectionPrompt,
   buildReviewerPrompt,
   buildTraceAnalystTools,
@@ -10595,6 +10847,7 @@ export {
   createFeedbackTrajectory,
   createIntentMatchJudge,
   createJudgeAdapter,
+  createLlmCorrectnessChecker,
   createLlmReviewer,
   createReplayFetch,
   createRunCriticAdapter,
@@ -10607,6 +10860,7 @@ export {
   createVerifierAdapter,
   crossTraceDiff,
   crowdingDistance,
+  decideNextUserTurn,
   decideReferenceReplayPromotion,
   decideReferenceReplayRunPromotion,
   defaultIsMaterial,
@@ -10637,6 +10891,7 @@ export {
   exportRunAsOtlp,
   extractAssetUrls,
   extractErrorCount,
+  extractProducedState,
   feedbackTrajectoriesToDatasetScenarios,
   feedbackTrajectoriesToOptimizerRows,
   feedbackTrajectoryToDatasetScenario,
@@ -10714,6 +10969,7 @@ export {
   paretoChart,
   paretoFrontier,
   paretoFrontierWithCrowding,
+  parseCorrectnessResponse,
   parseFeedbackTrajectoriesJsonl,
   parseFindingSubject,
   parseRawFinding,
@@ -10825,6 +11081,7 @@ export {
   userQuestionsForKnowledgeGaps,
   validateRunRecord,
   verbosityBias,
+  verifyCompletion,
   verifyManifest,
   visualDiff,
   viteDeployRunner,