agentv 3.4.0 → 3.5.0

This diff shows the changes between two publicly available versions of a package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
package/README.md CHANGED
@@ -238,21 +238,19 @@ import json, sys
238
238
  data = json.load(sys.stdin)
239
239
  answer = data.get("answer", "")
240
240
 
241
- hits = []
242
- misses = []
241
+ assertions = []
243
242
 
244
243
  if "42" in answer:
245
- hits.append("Answer contains correct value (42)")
244
+ assertions.append({"text": "Answer contains correct value (42)", "passed": True})
246
245
  else:
247
- misses.append("Answer does not contain expected value (42)")
246
+ assertions.append({"text": "Answer does not contain expected value (42)", "passed": False})
248
247
 
249
- score = 1.0 if hits else 0.0
248
+ passed = sum(1 for a in assertions if a["passed"])
249
+ score = 1.0 if passed == len(assertions) else 0.0
250
250
 
251
251
  print(json.dumps({
252
252
  "score": score,
253
- "hits": hits,
254
- "misses": misses,
255
- "reasoning": f"Passed {len(hits)} check(s)"
253
+ "assertions": assertions,
256
254
  }))
257
255
  ```
258
256
 
@@ -1,9 +1,9 @@
1
1
  import { createRequire } from 'node:module'; const require = createRequire(import.meta.url);
2
2
  import {
3
3
  AgentvProvider
4
- } from "./chunk-AR3QEKXH.js";
4
+ } from "./chunk-BJV6MDBE.js";
5
5
  import "./chunk-5H446C7X.js";
6
6
  export {
7
7
  AgentvProvider
8
8
  };
9
- //# sourceMappingURL=agentv-provider-HDSAUUEF-LUBMM7TH.js.map
9
+ //# sourceMappingURL=agentv-provider-NFFLXG5M-TJAWCWCX.js.map
@@ -16,7 +16,7 @@ import {
16
16
  validateEvalFile,
17
17
  validateFileReferences,
18
18
  validateTargetsFile
19
- } from "./chunk-RE5I3U2S.js";
19
+ } from "./chunk-RLL4QGNL.js";
20
20
  import {
21
21
  createBuiltinRegistry,
22
22
  createProvider,
@@ -34,7 +34,7 @@ import {
34
34
  toSnakeCaseDeep as toSnakeCaseDeep2,
35
35
  transpileEvalYamlFile,
36
36
  trimBaselineResult
37
- } from "./chunk-GOZV2HN2.js";
37
+ } from "./chunk-D6G4N2H2.js";
38
38
  import {
39
39
  __commonJS,
40
40
  __esm,
@@ -4185,7 +4185,7 @@ var evalRunCommand = command({
4185
4185
  },
4186
4186
  handler: async (args) => {
4187
4187
  if (args.evalPaths.length === 0 && process.stdin.isTTY) {
4188
- const { launchInteractiveWizard } = await import("./interactive-WXXTZ7PD.js");
4188
+ const { launchInteractiveWizard } = await import("./interactive-J7SUWZH2.js");
4189
4189
  await launchInteractiveWizard();
4190
4190
  return;
4191
4191
  }
@@ -5089,9 +5089,7 @@ async function runScore(results, evaluatorConfig, testIdFilter) {
5089
5089
  originalScore: raw.score,
5090
5090
  newScore: score.score,
5091
5091
  verdict: score.verdict,
5092
- hits: score.hits,
5093
- misses: score.misses,
5094
- reasoning: score.reasoning
5092
+ assertions: score.assertions
5095
5093
  });
5096
5094
  }
5097
5095
  return scored;
@@ -5110,7 +5108,9 @@ function renderTable(scored, assertSpec) {
5110
5108
  lines.push(cols.map((col) => "\u2500".repeat(col.width)).join("\u2500\u2500"));
5111
5109
  for (const r of scored) {
5112
5110
  const verdictColor = r.verdict === "pass" ? c2.green : c2.red;
5113
- const detail = r.misses.length > 0 ? r.misses[0].slice(0, 48) : r.hits.length > 0 ? r.hits[0].slice(0, 48) : r.reasoning?.slice(0, 48) ?? "";
5111
+ const failed = r.assertions.filter((a) => !a.passed);
5112
+ const passed = r.assertions.filter((a) => a.passed);
5113
+ const detail = failed.length > 0 ? failed[0].text.slice(0, 48) : passed.length > 0 ? passed[0].text.slice(0, 48) : "";
5114
5114
  const row = [
5115
5115
  padRight2(r.testId.slice(0, 24), cols[0].width),
5116
5116
  padLeft2(formatScore(r.originalScore), cols[1].width),
@@ -5332,11 +5332,17 @@ function formatResultDetail(result, index, tree) {
5332
5332
  if (result.error) {
5333
5333
  lines.push(` ${c2.red}Error: ${result.error}${c2.reset}`);
5334
5334
  }
5335
- if (result.hits && result.hits.length > 0) {
5336
- lines.push(` ${c2.green}\u2713 Hits:${c2.reset} ${result.hits.join(", ")}`);
5337
- }
5338
- if (result.misses && result.misses.length > 0) {
5339
- lines.push(` ${c2.red}\u2717 Misses:${c2.reset} ${result.misses.join(", ")}`);
5335
+ if (result.assertions && result.assertions.length > 0) {
5336
+ const passed = result.assertions.filter((a) => a.passed);
5337
+ const failed = result.assertions.filter((a) => !a.passed);
5338
+ if (passed.length > 0)
5339
+ lines.push(
5340
+ ` ${c2.green}\u2713 Passed:${c2.reset} ${passed.map((a) => a.text).join(", ")}`
5341
+ );
5342
+ if (failed.length > 0)
5343
+ lines.push(
5344
+ ` ${c2.red}\u2717 Failed:${c2.reset} ${failed.map((a) => a.text).join(", ")}`
5345
+ );
5340
5346
  }
5341
5347
  if (result.scores && result.scores.length > 0) {
5342
5348
  lines.push(` ${c2.dim}Scores:${c2.reset} ${renderScores(result.scores)}`);
@@ -5344,10 +5350,14 @@ function formatResultDetail(result, index, tree) {
5344
5350
  if (result.trace || result.duration_ms !== void 0 || result.cost_usd !== void 0) {
5345
5351
  lines.push(` ${c2.dim}Trace:${c2.reset} ${renderFlatTrace(result)}`);
5346
5352
  }
5347
- if (result.reasoning) {
5348
- const maxLen = 200;
5349
- const truncated = result.reasoning.length > maxLen ? `${result.reasoning.slice(0, maxLen)}...` : result.reasoning;
5350
- lines.push(` ${c2.dim}Reasoning: ${truncated}${c2.reset}`);
5353
+ if (result.assertions && result.assertions.length > 0) {
5354
+ const withEvidence = result.assertions.filter((a) => a.evidence);
5355
+ if (withEvidence.length > 0) {
5356
+ const maxLen = 200;
5357
+ const evidence = withEvidence[0].evidence;
5358
+ const truncated = evidence.length > maxLen ? `${evidence.slice(0, maxLen)}...` : evidence;
5359
+ lines.push(` ${c2.dim}Evidence: ${truncated}${c2.reset}`);
5360
+ }
5351
5361
  }
5352
5362
  return lines.join("\n");
5353
5363
  }
@@ -6268,4 +6278,4 @@ export {
6268
6278
  preprocessArgv,
6269
6279
  runCli
6270
6280
  };
6271
- //# sourceMappingURL=chunk-A7ZDUB46.js.map
6281
+ //# sourceMappingURL=chunk-5GG6DDP5.js.map