@tangle-network/agent-eval 0.31.1 → 0.32.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -2900,6 +2900,11 @@ var MetricsCollector = class {
2900
2900
  };
2901
2901
 
2902
2902
  // src/driver.ts
2903
// Stance paragraph for the simulated user, keyed by rigor level.
// buildDriverSystemPrompt looks an entry up by `persona.rigor` and falls back to
// the "demanding" key when the persona does not set one.
var RIGOR_STANCE = {
  cooperative: "Your stance: a pragmatic early adopter. You accept reasonable answers and only push back on clear gaps or outright errors.",
  demanding: "Your stance: an experienced professional with no time to waste. You do not accept vague, hedged, or generic answers \u2014 you expect specifics, and you say so plainly when you do not get them.",
  relentless: "Your stance: a senior partner reviewing this work for a client who will litigate if it is wrong. You interrogate every claim. You accept nothing undefended. You find the single weakest point in every answer and attack it. Courteous, never satisfied."
};
2903
2908
  var AgentDriver = class {
2904
2909
  tc;
2905
2910
  client;
@@ -2929,12 +2934,14 @@ var AgentDriver = class {
2929
2934
  const conversationHistory = [];
2930
2935
  let completed = false;
2931
2936
  let turnsToCompletion = null;
2937
+ let criteriaMetAtTurn = null;
2932
2938
  for (let turn = 1; turn <= persona.maxTurns; turn++) {
2933
2939
  const state = await metrics.getState();
2934
2940
  const userMessage = await this.decideNextMessage(persona, state, conversationHistory);
2935
2941
  if (userMessage === "DONE") {
2936
2942
  completed = true;
2937
2943
  turnsToCompletion = turn - 1;
2944
+ console.log(` SIGNED OFF by simulated ${persona.role} after turn ${turn - 1}`);
2938
2945
  break;
2939
2946
  }
2940
2947
  const turnStart = Date.now();
@@ -2963,11 +2970,9 @@ var AgentDriver = class {
2963
2970
  console.log(
2964
2971
  ` [turn ${turn}] ${conv.completionPercent.toFixed(0)}% \u2014 ${criteriaStr} (${(latency / 1e3).toFixed(1)}s)`
2965
2972
  );
2966
- if (conv.complete) {
2967
- completed = true;
2968
- turnsToCompletion = turn;
2969
- console.log(` COMPLETE at turn ${turn}`);
2970
- break;
2973
+ if (conv.complete && criteriaMetAtTurn === null) {
2974
+ criteriaMetAtTurn = turn;
2975
+ console.log(` criteria met at turn ${turn} \u2014 driver continues pressure-testing`);
2971
2976
  }
2972
2977
  }
2973
2978
  const finalState = await metrics.getState();
@@ -2975,6 +2980,7 @@ var AgentDriver = class {
2975
2980
  personaId: persona.id,
2976
2981
  completed,
2977
2982
  turnsToCompletion,
2983
+ criteriaMetAtTurn,
2978
2984
  totalTurns: turnMetrics.length,
2979
2985
  metrics: turnMetrics,
2980
2986
  finalState,
@@ -2992,41 +2998,19 @@ var AgentDriver = class {
2992
2998
  messages: [
2993
2999
  {
2994
3000
  role: "system",
2995
- content: `You are playing the role of a ${persona.role} testing an AI agent.
2996
- Your goal: ${persona.goal}
2997
-
2998
- ${this.productContext ? `Product context:
2999
- ${this.productContext}
3000
- ` : ""}
3001
- Current state:
3002
- - Tasks: ${state.tasks}
3003
- - Events: ${state.events}
3004
- - Proposals: pending=${state.proposals.pending}, approved=${state.proposals.approved}, rejected=${state.proposals.rejected}
3005
- - Vault files: ${state.vaultFiles.length} (${state.vaultFiles.slice(0, 10).join(", ")}${state.vaultFiles.length > 10 ? "..." : ""})
3006
-
3007
- Completion criteria met: ${this.describeCompletion(persona, state)}
3008
-
3009
- Decide what to do next:
3010
- 1. If completion is 100% \u2014 respond with exactly "DONE"
3011
- 2. If a proposal is pending \u2014 approve or reject it (with reason)
3012
- 3. If the agent is on track \u2014 push for the next deliverable
3013
- 4. If the agent is off track \u2014 give specific corrective feedback
3014
- 5. If this is the first message \u2014 start with a clear, actionable request
3015
-
3016
- Output ONLY your next message to the agent. Be specific. Be realistic.
3017
- Don't be patient \u2014 a real ${persona.role} wouldn't accept vague answers.`
3001
+ content: buildDriverSystemPrompt(persona, state, this.productContext)
3018
3002
  },
3019
3003
  {
3020
3004
  role: "user",
3021
3005
  content: recentHistory ? `Recent conversation:
3022
3006
  ${recentHistory}
3023
3007
 
3024
- The agent just said:
3025
- ${lastResponse}` : "No conversation yet. Send your opening message."
3008
+ The agent's latest response:
3009
+ ${lastResponse}` : "No conversation yet. Send your opening message \u2014 in character, phrased as this person actually would."
3026
3010
  }
3027
3011
  ],
3028
3012
  temperature: 0.5,
3029
- maxTokens: 500
3013
+ maxTokens: 700
3030
3014
  });
3031
3015
  const content = resp.choices?.[0]?.message?.content ?? "";
3032
3016
  return content.trim();
@@ -3049,16 +3033,54 @@ ${lastResponse}` : "No conversation yet. Send your opening message."
3049
3033
  }
3050
3034
  }
3051
3035
  }
3052
- /** Describe which completion criteria are met */
3053
- describeCompletion(persona, state) {
3054
- const results = persona.completionCriteria.map((c) => {
3055
- const met = c.check(state);
3056
- return `${c.name}: ${met ? "MET" : "NOT MET"}`;
3057
- });
3058
- const metCount = results.filter((r) => r.includes("MET") && !r.includes("NOT")).length;
3059
- return `${metCount}/${persona.completionCriteria.length} \u2014 ${results.join(", ")}`;
3060
- }
3061
3036
  };
3037
/**
 * Summarise which of the persona's completion criteria currently hold.
 *
 * @param {{completionCriteria: Array<{name: string, check: Function}>}} persona
 *   Each criterion's `check(state)` is called once; a truthy result means MET.
 * @param {*} state Passed unchanged to every criterion's `check`.
 * @returns {string} `"<met>/<total> \u2014 <name>: MET|NOT MET, ..."`.
 */
function describeCompletion(persona, state) {
  const outcomes = persona.completionCriteria.map((c) => ({
    name: c.name,
    met: Boolean(c.check(state))
  }));
  // Count from the check results themselves. Deriving the count from the
  // rendered text miscounts any MET criterion whose name contains "NOT"
  // (e.g. "NOTES written"), because the name is part of the rendered line.
  const metCount = outcomes.filter((o) => o.met).length;
  const results = outcomes.map((o) => `${o.name}: ${o.met ? "MET" : "NOT MET"}`);
  return `${metCount}/${persona.completionCriteria.length} \u2014 ${results.join(", ")}`;
}
3045
/**
 * Build the system prompt for the simulated user that drives the agent.
 *
 * Reads `role`, `goal`, and the optional `rigor`, `expertise`, `pressurePoints`
 * and `curveballs` from the persona, and `tasks`, `events`, `proposals` and
 * `vaultFiles` from the state.
 *
 * @param {object} persona Persona being role-played.
 * @param {object} state Current workspace state snapshot.
 * @param {string} [productContext=""] Optional product description; the
 *   "Product context" section is omitted when this is empty.
 * @returns {string} The complete system prompt.
 */
function buildDriverSystemPrompt(persona, state, productContext = "") {
  const requestedRigor = persona.rigor ?? "demanding";
  // An unrecognised rigor would otherwise interpolate the literal "undefined"
  // (or an inherited Object.prototype member) into the prompt, so anything
  // that is not an own key of RIGOR_STANCE uses the default stance.
  const rigor = Object.prototype.hasOwnProperty.call(RIGOR_STANCE, requestedRigor) ? requestedRigor : "demanding";
  const expertise = persona.expertise ? ` You are ${persona.expertise}.` : "";
  const pressure = persona.pressurePoints && persona.pressurePoints.length > 0 ? `
A competent ${persona.role} here MUST get the agent to address each of:
${persona.pressurePoints.map((p) => ` - ${p}`).join("\n")}
Do NOT hand these to the agent. Probe whether it surfaces them itself. If it misses one, press on exactly that gap until it delivers or demonstrably fails.
` : "";
  const curveballs = persona.curveballs && persona.curveballs.length > 0 ? `
Once the agent is coasting on easy answers, introduce ONE of these as a genuine new development \u2014 never as a quiz:
${persona.curveballs.map((c) => ` - ${c}`).join("\n")}
` : "";
  // The template body is deliberately unindented: every character is sent to the model.
  return `You are role-playing a real ${persona.role} putting an AI agent through its paces.${expertise}
Your objective: ${persona.goal}
You are deciding whether this agent's work is good enough to stake your professional reputation on. Assume it is not \u2014 until it proves otherwise.

${RIGOR_STANCE[rigor]}
${productContext ? `Product context:
${productContext}
` : ""}Current workspace state:
- Tasks: ${state.tasks} | Events: ${state.events}
- Proposals: pending=${state.proposals.pending}, approved=${state.proposals.approved}, rejected=${state.proposals.rejected}
- Vault files (${state.vaultFiles.length}): ${state.vaultFiles.slice(0, 10).join(", ")}${state.vaultFiles.length > 10 ? " \u2026" : ""}
- Nominal task criteria: ${describeCompletion(persona, state)}
${pressure}${curveballs}
How to choose your next message:
1. Silently judge the agent's last response the way a ${persona.role} would. Is every claim defended with a specific authority, figure, or mechanism? Or is it vague, hedged, or generic?
2. If it is vague or hand-waved \u2014 do NOT move on. Name the gap and demand the specific authority / figure / mechanism. "It depends" is not an answer; force the decision.
3. If it makes a claim you can challenge \u2014 challenge it. Make the agent defend or correct it.
4. If it missed something a ${persona.role} would catch \u2014 press on exactly that, without naming it for the agent.
5. If it is genuinely solid \u2014 escalate: go a layer deeper, or introduce a curveball.
6. First message \u2014 state your situation as you really would: realistic, specific, with the messy detail, but do not coach the agent.

Sign-off: respond with exactly "DONE" only when a ${persona.role} would act on this work without redoing it. Nominal task completion is NOT sign-off \u2014 sloppy-but-complete still fails. If the agent never gets there, keep pushing; never sign off on weak work.

Output ONLY your next message to the agent \u2014 in character, first person, no meta-commentary, no stage directions.`;
}
3062
3084
 
3063
3085
  // src/integration-gates.ts
3064
3086
  function integrationManifestValidatedPayload(input) {
@@ -4520,6 +4542,194 @@ function pathExists(obj, path) {
4520
4542
  return true;
4521
4543
  }
4522
4544
 
4545
+ // src/completion-verifier.ts
4546
// Function words ignored when comparing requirement text with produced items.
var STOPWORDS = /* @__PURE__ */ new Set([
  "the",
  "a",
  "an",
  "of",
  "for",
  "and",
  "or",
  "to",
  "in",
  "on",
  "with",
  "by"
]);
// Minimum token recall for a produced item to count as a match.
var MATCH_THRESHOLD = 0.5;
// Minimum trimmed length for produced content to be treated as substantive.
var MIN_CONTENT_CHARS = 50;
/**
 * Split text into a set of lower-cased word tokens.
 *
 * Tokens are maximal runs of Unicode letters and digits. One-character tokens
 * and STOPWORDS are dropped. ASCII input tokenises exactly as it did with an
 * ASCII-only split; letters of other scripts are now kept instead of being
 * treated as separators, which previously left such text with no tokens.
 *
 * @param {string} s Text to tokenise; null/undefined is treated as empty.
 * @returns {Set<string>} Distinct tokens in first-seen order.
 */
function tokens(s) {
  // NFC first so a letter followed by a combining mark stays one letter
  // instead of being split at the mark.
  const words = String(s ?? "").normalize("NFC").toLowerCase().split(/[^\p{L}\p{N}]+/u);
  return new Set(words.filter((t) => t.length > 1 && !STOPWORDS.has(t)));
}
4567
/**
 * Share of the requirement's distinct tokens that also occur in the candidate.
 *
 * @param {string} requirementText Text whose tokens must be covered.
 * @param {string} candidateText Text searched for those tokens.
 * @returns {number} Value in [0, 1]; 0 when the requirement yields no tokens.
 */
function tokenRecall(requirementText, candidateText) {
  const wanted = tokens(requirementText);
  if (wanted.size === 0) {
    return 0;
  }
  const available = tokens(candidateText);
  const matched = [...wanted].filter((word) => available.has(word)).length;
  return matched / wanted.size;
}
4575
/**
 * Collect the artifacts that plausibly fulfil one requirement.
 *
 * An artifact is skipped when its trimmed content is shorter than
 * MIN_CONTENT_CHARS. Otherwise it is scored by token recall of the
 * requirement's title and category against the artifact's path and kind; an
 * exact case-insensitive category/kind match raises the score to at least 1.
 * Artifacts scoring below MATCH_THRESHOLD are dropped.
 *
 * @param {{title: string, category?: string}} req Requirement being matched.
 * @param {number} reqIndex Index of `req`, echoed into every candidate.
 * @param {Array<{kind?: string, path?: string, content?: string}>} artifacts
 * @returns {Array<object>} Candidates carrying `reqIndex`, `itemKey`, `score`,
 *   `evidence` and `content`.
 */
function artifactCandidates(req, reqIndex, artifacts) {
  const reqText = `${req.title} ${req.category ?? ""}`;
  const out = [];
  artifacts.forEach((a, i) => {
    if ((a.content ?? "").trim().length < MIN_CONTENT_CHARS) return;
    // A missing kind must contribute nothing. Interpolating it directly would
    // add the word "undefined" to the candidate text and let it match a
    // requirement that happens to mention "undefined".
    const kind = a.kind ?? "";
    let score = tokenRecall(reqText, `${a.path ?? ""} ${kind}`);
    if (req.category && kind && req.category.toLowerCase() === kind.toLowerCase()) {
      score = Math.max(score, 1);
    }
    if (score < MATCH_THRESHOLD) return;
    // Fall back to the positional index so the evidence never reads 'undefined'.
    const label = a.path ?? a.kind ?? `#${i}`;
    out.push({
      reqIndex,
      itemKey: `artifact:${i}`,
      score,
      evidence: `artifact '${label}' matched (token recall ${score.toFixed(2)})`,
      content: a.content ?? null
    });
  });
  return out;
}
4595
/**
 * Collect the approved proposals that plausibly fulfil one requirement.
 *
 * Only proposals whose status is "approved" are considered. Each is scored by
 * token recall of the requirement's title and category against the proposal
 * title and dropped below MATCH_THRESHOLD. The candidate carries the proposal
 * body only when its trimmed length reaches MIN_CONTENT_CHARS, otherwise null.
 *
 * @param {{title: string, category?: string}} req Requirement being matched.
 * @param {number} reqIndex Index of `req`, echoed into every candidate.
 * @param {Array<{id: string, title?: string, status: string, content?: string}>} proposals
 * @returns {Array<object>} Candidates carrying `reqIndex`, `itemKey`, `score`,
 *   `evidence` and `content`.
 */
function proposalCandidates(req, reqIndex, proposals) {
  const reqText = `${req.title} ${req.category ?? ""}`;
  const out = [];
  for (const p of proposals) {
    if (p.status !== "approved") continue;
    // A proposal recorded without a title must simply fail to match; passing
    // undefined through to the tokenizer would throw and abort verification.
    const title = p.title ?? "";
    const score = tokenRecall(reqText, title);
    if (score < MATCH_THRESHOLD) continue;
    const body = p.content ?? "";
    out.push({
      reqIndex,
      itemKey: `proposal:${p.id}`,
      score,
      evidence: `approved proposal '${title}' matched (token recall ${score.toFixed(2)})`,
      content: body.trim().length >= MIN_CONTENT_CHARS ? body : null
    });
  }
  return out;
}
4613
/**
 * Collect the tool calls whose name plausibly fulfils one requirement.
 *
 * Each tool name is scored by token recall of the requirement title against
 * the name; names scoring below MATCH_THRESHOLD are dropped. Tool-call
 * candidates never carry content.
 *
 * @param {{title: string}} req Requirement being matched.
 * @param {number} reqIndex Index of `req`, echoed into every candidate.
 * @param {string[]} toolCalls Tool names to consider.
 * @returns {Array<object>} Candidates in the order of `toolCalls`.
 */
function toolCallCandidates(req, reqIndex, toolCalls) {
  const matches = [];
  toolCalls.forEach((toolName, position) => {
    const recall = tokenRecall(req.title, toolName);
    if (recall < MATCH_THRESHOLD) {
      return;
    }
    const candidate = {
      reqIndex,
      itemKey: `tool:${position}`,
      score: recall,
      evidence: `tool call '${toolName}' matched (token recall ${recall.toFixed(2)})`,
      content: null
    };
    matches.push(candidate);
  });
  return matches;
}
4628
/**
 * Check a produced state against a gold task specification.
 *
 * Candidates are gathered per requirement from the sources allowed by its
 * `satisfiedBy` ("artifact", "proposal", "tool-call", or any when unset),
 * sorted by descending score, and assigned greedily so that each requirement
 * receives at most one item and each item serves at most one requirement.
 * A matched item with content is passed to `checkCorrectness`; a matched item
 * without content is reported as not assessed. A requirement is satisfied when
 * it has a match and the correctness result is not `false`.
 *
 * @param {{taskId: string, requirements: Array<object>}} gold Gold task spec.
 * @param {{artifacts: Array, proposals: Array, toolCalls: Array}} state Produced state.
 * @param {Function} checkCorrectness Async `(requirement, content)` returning
 *   `{correct, reason}`.
 * @returns {Promise<object>} `{taskId, requirements, completionRate, fullyComplete}`.
 * @throws {Error} When the gold spec lists no requirements.
 */
async function verifyCompletion(gold, state, checkCorrectness) {
  if (gold.requirements.length === 0) {
    throw new Error(
      `verifyCompletion: task '${gold.taskId}' has no requirements \u2014 malformed gold spec`
    );
  }

  // Gather candidates requirement by requirement, sources in a fixed order.
  const pool = [];
  gold.requirements.forEach((requirement, reqIndex) => {
    const source = requirement.satisfiedBy ?? "any";
    const allows = (kind) => source === kind || source === "any";
    if (allows("artifact")) {
      pool.push(...artifactCandidates(requirement, reqIndex, state.artifacts));
    }
    if (allows("proposal")) {
      pool.push(...proposalCandidates(requirement, reqIndex, state.proposals));
    }
    if (allows("tool-call")) {
      pool.push(...toolCallCandidates(requirement, reqIndex, state.toolCalls));
    }
  });
  pool.sort((left, right) => right.score - left.score);

  // Greedy one-to-one assignment, best score first.
  const chosen = /* @__PURE__ */ new Map();
  const usedItems = /* @__PURE__ */ new Set();
  for (const candidate of pool) {
    const alreadyServed = chosen.has(candidate.reqIndex);
    const alreadyUsed = usedItems.has(candidate.itemKey);
    if (alreadyServed || alreadyUsed) continue;
    chosen.set(candidate.reqIndex, candidate);
    usedItems.add(candidate.itemKey);
  }

  // Requirements are judged one after another, so checker calls never overlap.
  const report = [];
  for (const [reqIndex, requirement] of gold.requirements.entries()) {
    const match = chosen.get(reqIndex);
    const evidence = [];
    let correct = null;
    if (match === void 0) {
      const source = requirement.satisfiedBy ?? "any";
      const kind = source === "any" ? "artifact/proposal/tool-call" : source;
      evidence.push(`no produced ${kind} matched this requirement`);
    } else {
      evidence.push(match.evidence);
      if (match.content === null) {
        evidence.push("correctness: not assessed \u2014 matched item carries no content");
      } else {
        const verdict = await checkCorrectness(requirement, match.content);
        correct = verdict.correct;
        evidence.push(`correctness: ${verdict.correct ? "pass" : "fail"} \u2014 ${verdict.reason}`);
      }
    }
    const structurallyPresent = match !== void 0;
    report.push({
      reqId: requirement.reqId,
      title: requirement.title,
      structurallyPresent,
      correct,
      satisfied: structurallyPresent && correct !== false,
      evidence
    });
  }

  const satisfiedCount = report.filter((entry) => entry.satisfied).length;
  return {
    taskId: gold.taskId,
    requirements: report,
    completionRate: satisfiedCount / report.length,
    fullyComplete: satisfiedCount === report.length
  };
}
4694
/**
 * Extract the `{correct, reason}` verdict from a correctness-checker reply.
 *
 * The reply may wrap the JSON object in prose or a code fence. The span from
 * the first `{` to the last `}` is tried first; if that is not a usable
 * verdict, every balanced `{...}` span is tried in order of appearance, so
 * stray braces elsewhere in the reply no longer break parsing.
 *
 * @param {string} raw Raw model reply.
 * @returns {{correct: boolean, reason: string}} `reason` is "" when the reply
 *   has no string `reason`.
 * @throws {Error} When the reply contains no `{...}` span, or when the JSON
 *   found has no boolean `correct`.
 * @throws {SyntaxError} When no span parses as JSON; the underlying parse
 *   error is attached as `cause`.
 */
function parseCorrectnessResponse(raw) {
  const text = typeof raw === "string" ? raw : String(raw ?? "");
  const greedy = text.match(/\{[\s\S]*\}/);
  if (!greedy) {
    throw new Error(`correctness checker: no JSON object in model response: ${text.slice(0, 200)}`);
  }

  // Yields every balanced {...} span, ignoring braces inside JSON strings.
  const balancedSpans = () => {
    const spans = [];
    for (let start = text.indexOf("{"); start !== -1; start = text.indexOf("{", start + 1)) {
      let depth = 0;
      let inString = false;
      let escaped = false;
      for (let i = start; i < text.length; i++) {
        const ch = text[i];
        if (inString) {
          if (escaped) escaped = false;
          else if (ch === "\\") escaped = true;
          else if (ch === '"') inString = false;
          continue;
        }
        if (ch === '"') {
          inString = true;
        } else if (ch === "{") {
          depth++;
        } else if (ch === "}") {
          depth--;
          if (depth === 0) {
            spans.push(text.slice(start, i + 1));
            break;
          }
        }
      }
    }
    return spans;
  };

  let firstParseError = null;
  let firstWrongShape = null;
  for (const candidate of [greedy[0], ...balancedSpans()]) {
    let parsed;
    try {
      parsed = JSON.parse(candidate);
    } catch (err) {
      if (firstParseError === null) firstParseError = err;
      continue;
    }
    if (parsed !== null && typeof parsed === "object" && typeof parsed.correct === "boolean") {
      return { correct: parsed.correct, reason: typeof parsed.reason === "string" ? parsed.reason : "" };
    }
    if (firstWrongShape === null) firstWrongShape = candidate;
  }

  if (firstWrongShape !== null) {
    throw new Error(`correctness checker: 'correct' is not a boolean in: ${firstWrongShape.slice(0, 200)}`);
  }
  // Still a SyntaxError, as a bare JSON.parse failure would be, but it now
  // names the checker and shows the offending text.
  throw new SyntaxError(
    `correctness checker: model response is not valid JSON: ${greedy[0].slice(0, 200)}`,
    { cause: firstParseError }
  );
}
4705
/**
 * Create a correctness checker backed by a chat model.
 *
 * @param {{chat: Function}} tc Client whose `chat` resolves to a response with
 *   `choices[0].message.content`.
 * @param {{model?: string, maxContentChars?: number}} [opts] `model` defaults
 *   to "claude-sonnet-4-6"; `maxContentChars` (default 8000) caps how much of
 *   the artifact is sent.
 * @returns {Function} Async `(requirement, content)` resolving to the verdict
 *   produced by parseCorrectnessResponse.
 */
function createLlmCorrectnessChecker(tc, opts = {}) {
  const judgeModel = opts.model ?? "claude-sonnet-4-6";
  const contentLimit = opts.maxContentChars ?? 8e3;
  const systemPrompt = 'You verify whether a produced work artifact actually fulfils a stated requirement. Judge fulfilment only \u2014 is the deliverable substantively present and on-point \u2014 not polish. A plan to do it later, a vague gesture, or a description of what should be done does NOT fulfil a requirement; the artifact must BE the deliverable. Respond with a single JSON object: {"correct": boolean, "reason": string (<= 30 words)}.';

  const check = async (requirement, content) => {
    const header = `Requirement: ${requirement.title}\n`;
    // The category line is present only when the requirement names a category.
    const categoryLine = requirement.category ? `Category: ${requirement.category}\n` : "";
    const excerpt = content.slice(0, contentLimit);
    const userPrompt = `${header}${categoryLine}\nProduced artifact:\n${excerpt}`;

    const request = {
      model: judgeModel,
      messages: [
        { role: "system", content: systemPrompt },
        { role: "user", content: userPrompt }
      ],
      temperature: 0,
      maxTokens: 200
    };
    const response = await tc.chat(request);
    // A reply without content is parsed as an empty string.
    const reply = response.choices?.[0]?.message?.content ?? "";
    return parseCorrectnessResponse(reply);
  };
  return check;
}
4732
+
4523
4733
  // src/dual-agent-bench.ts
4524
4734
  var DualAgentBench = class {
4525
4735
  async run(config) {
@@ -5174,6 +5384,40 @@ function canonicalInstruction(value) {
5174
5384
  return normalized.length === 0 ? normalized : normalized[0].toUpperCase() + normalized.slice(1);
5175
5385
  }
5176
5386
 
5387
+ // src/produced-state.ts
5388
/**
 * Map a MIME type to a coarse artifact kind.
 *
 * Media type names are case-insensitive, so the value is lower-cased (and
 * trimmed) before matching; "Application/JSON" is therefore "json", not "file".
 *
 * @param {string} [mimeType] MIME type of the artifact, if known.
 * @returns {"json"|"text"|"file"} "json" when the type mentions json, "text"
 *   for any `text/*` type, otherwise "file" (also for a missing type).
 */
function artifactKind(mimeType) {
  if (!mimeType) return "file";
  const normalized = String(mimeType).trim().toLowerCase();
  if (normalized.includes("json")) return "json";
  if (normalized.startsWith("text/")) return "text";
  return "file";
}
5394
/**
 * Reduce an event stream to the state an agent produced.
 *
 * - `tool_call` events contribute their `toolName` once each, in first-seen order.
 * - `artifact` events contribute `{kind, path, content}`: `kind` is derived from
 *   the MIME type, `path` is the first of `name`, `uri`, `artifactId` that is
 *   set, and `content` defaults to "".
 * - `proposal_created` events contribute `{id, title, status}`, with `status`
 *   defaulting to "pending".
 * Events of any other type are ignored.
 *
 * @param {Iterable<object>} events Events to scan, in order.
 * @returns {{artifacts: Array<object>, proposals: Array<object>, toolCalls: string[]}}
 */
function extractProducedState(events) {
  const artifacts = [];
  const proposals = [];
  const toolCalls = [];
  const seenTools = /* @__PURE__ */ new Set();
  for (const ev of events) {
    if (ev.type === "tool_call") {
      const name = ev.toolName;
      if (name && !seenTools.has(name)) {
        seenTools.add(name);
        toolCalls.push(name);
      }
    } else if (ev.type === "artifact") {
      const a = ev;
      artifacts.push({
        kind: artifactKind(a.mimeType),
        path: a.name ?? a.uri ?? a.artifactId,
        content: a.content ?? ""
      });
    } else if (ev.type === "proposal_created") {
      const p = ev;
      const proposal = { id: p.proposalId, title: p.title, status: p.status ?? "pending" };
      // Keep the proposal body when the event carries one: proposalCandidates
      // reads `content`, and without it a matched proposal can never be
      // correctness-checked. NOTE(review): assumes the body is exposed as a
      // string `content` field on the event; confirm against the event schema.
      if (typeof p.content === "string") {
        proposal.content = p.content;
      }
      proposals.push(proposal);
    }
  }
  return { artifacts, proposals, toolCalls };
}
5420
+
5177
5421
  // src/prompt-registry.ts
5178
5422
  var PromptRegistry = class {
5179
5423
  entries = /* @__PURE__ */ new Map();
@@ -9092,8 +9336,8 @@ function ratio(numerator, denominator) {
9092
9336
  return denominator > 0 ? numerator / denominator : 0;
9093
9337
  }
9094
9338
  function tokenJaccard(a, b) {
9095
- const left = new Set(tokens(a));
9096
- const right = new Set(tokens(b));
9339
+ const left = new Set(tokens2(a));
9340
+ const right = new Set(tokens2(b));
9097
9341
  if (left.size === 0 || right.size === 0) return 0;
9098
9342
  let intersection = 0;
9099
9343
  for (const token of left) {
@@ -9111,7 +9355,7 @@ function tagOverlap(a, b) {
9111
9355
  }
9112
9356
  return intersection / Math.max(left.size, right.size);
9113
9357
  }
9114
- function tokens(text) {
9358
+ function tokens2(text) {
9115
9359
  return normalize(text).split(/\s+/).filter((token) => token.length >= 3 && !STOP_WORDS.has(token));
9116
9360
  }
9117
9361
  function normalize(text) {
@@ -10545,6 +10789,7 @@ export {
10545
10789
  blockingKnowledgeEval,
10546
10790
  bonferroni,
10547
10791
  bootstrapCi,
10792
+ buildDriverSystemPrompt,
10548
10793
  buildReflectionPrompt,
10549
10794
  buildReviewerPrompt,
10550
10795
  buildTraceAnalystTools,
@@ -10595,6 +10840,7 @@ export {
10595
10840
  createFeedbackTrajectory,
10596
10841
  createIntentMatchJudge,
10597
10842
  createJudgeAdapter,
10843
+ createLlmCorrectnessChecker,
10598
10844
  createLlmReviewer,
10599
10845
  createReplayFetch,
10600
10846
  createRunCriticAdapter,
@@ -10637,6 +10883,7 @@ export {
10637
10883
  exportRunAsOtlp,
10638
10884
  extractAssetUrls,
10639
10885
  extractErrorCount,
10886
+ extractProducedState,
10640
10887
  feedbackTrajectoriesToDatasetScenarios,
10641
10888
  feedbackTrajectoriesToOptimizerRows,
10642
10889
  feedbackTrajectoryToDatasetScenario,
@@ -10714,6 +10961,7 @@ export {
10714
10961
  paretoChart,
10715
10962
  paretoFrontier,
10716
10963
  paretoFrontierWithCrowding,
10964
+ parseCorrectnessResponse,
10717
10965
  parseFeedbackTrajectoriesJsonl,
10718
10966
  parseFindingSubject,
10719
10967
  parseRawFinding,
@@ -10825,6 +11073,7 @@ export {
10825
11073
  userQuestionsForKnowledgeGaps,
10826
11074
  validateRunRecord,
10827
11075
  verbosityBias,
11076
+ verifyCompletion,
10828
11077
  verifyManifest,
10829
11078
  visualDiff,
10830
11079
  viteDeployRunner,