@tangle-network/agent-eval 0.20.8 → 0.20.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,3 +1,8 @@
1
+ import {
2
+ BENCHMARK_SPLIT_SEED,
3
+ benchmarks_exports,
4
+ deterministicSplit
5
+ } from "./chunk-XDGJUIV2.js";
1
6
  import {
2
7
  LlmCallError,
3
8
  LlmClient,
@@ -6,9 +11,7 @@ import {
6
11
  probeLlm,
7
12
  stripFencedJson
8
13
  } from "./chunk-JAOLXRIA.js";
9
- import {
10
- __export
11
- } from "./chunk-PZ5AY32C.js";
14
+ import "./chunk-PZ5AY32C.js";
12
15
 
13
16
  // src/client.ts
14
17
  var ProductClient = class {
@@ -649,9 +652,9 @@ function feedbackTrajectoryToOptimizerRow(trajectory) {
649
652
  function feedbackTrajectoriesToOptimizerRows(trajectories) {
650
653
  return trajectories.map(feedbackTrajectoryToOptimizerRow);
651
654
  }
652
- async function replayFeedbackTrajectory(trajectory, adapter2) {
655
+ async function replayFeedbackTrajectory(trajectory, adapter) {
653
656
  try {
654
- const result = await adapter2.replay(trajectory);
657
+ const result = await adapter.replay(trajectory);
655
658
  return {
656
659
  trajectoryId: trajectory.id,
657
660
  ...result
@@ -680,10 +683,10 @@ async function replayFeedbackTrajectory(trajectory, adapter2) {
680
683
  };
681
684
  }
682
685
  }
683
- async function replayFeedbackTrajectories(trajectories, adapter2) {
686
+ async function replayFeedbackTrajectories(trajectories, adapter) {
684
687
  const results = [];
685
688
  for (const trajectory of trajectories) {
686
- results.push(await replayFeedbackTrajectory(trajectory, adapter2));
689
+ results.push(await replayFeedbackTrajectory(trajectory, adapter));
687
690
  }
688
691
  return results;
689
692
  }
@@ -2379,12 +2382,13 @@ async function runAgentControlLoop(config) {
2379
2382
  try {
2380
2383
  state = await config.observe({ history, abortSignal: controller.signal });
2381
2384
  } catch (err) {
2382
- runtimeErrors.push(runtimeError("observe", 0, err));
2385
+ const error = runtimeError("observe", 0, err);
2386
+ runtimeErrors.push(error);
2383
2387
  return finish(emitter, {
2384
2388
  intent: config.intent,
2385
2389
  pass: false,
2386
2390
  completed: false,
2387
- reason: runtimeErrors[0].message,
2391
+ reason: error.message,
2388
2392
  steps: history,
2389
2393
  finalState: void 0,
2390
2394
  finalEvals: [],
@@ -2400,12 +2404,13 @@ async function runAgentControlLoop(config) {
2400
2404
  evals = await config.validate({ intent: config.intent, state, history, abortSignal: controller.signal });
2401
2405
  await recordEvalSpans(emitter, evals, "initial", runtimeErrors, 0);
2402
2406
  } catch (err) {
2403
- runtimeErrors.push(runtimeError("validate", 0, err));
2407
+ const error = runtimeError("validate", 0, err);
2408
+ runtimeErrors.push(error);
2404
2409
  return finish(emitter, {
2405
2410
  intent: config.intent,
2406
2411
  pass: false,
2407
2412
  completed: false,
2408
- reason: runtimeErrors[0].message,
2413
+ reason: error.message,
2409
2414
  steps: history,
2410
2415
  finalState: state,
2411
2416
  finalEvals: [],
@@ -3133,11 +3138,11 @@ function isBlockingGap(requirement) {
3133
3138
  function chooseRecommendedAction(blocking, nonBlocking) {
3134
3139
  const gaps = blocking.length > 0 ? blocking : nonBlocking;
3135
3140
  if (gaps.length === 0) return "run_agent";
3136
- if (blocking.some((gap) => gap.acquisitionMode === "ask_user" || gap.fallbackPolicy === "ask")) return "ask_user";
3137
- if (blocking.some((gap) => gap.acquisitionMode === "query_connector")) return "query_connectors";
3138
- if (blocking.some((gap) => gap.acquisitionMode === "inspect_repo" || gap.acquisitionMode === "run_command")) return "inspect_repo";
3139
- if (blocking.some((gap) => gap.acquisitionMode === "search_web")) return "collect_web_data";
3140
- if (blocking.some((gap) => gap.acquisitionMode === "not_available")) return "abort_or_rescope";
3141
+ if (gaps.some((gap) => gap.acquisitionMode === "ask_user" || gap.fallbackPolicy === "ask")) return "ask_user";
3142
+ if (gaps.some((gap) => gap.acquisitionMode === "query_connector")) return "query_connectors";
3143
+ if (gaps.some((gap) => gap.acquisitionMode === "inspect_repo" || gap.acquisitionMode === "run_command")) return "inspect_repo";
3144
+ if (gaps.some((gap) => gap.acquisitionMode === "search_web")) return "collect_web_data";
3145
+ if (gaps.some((gap) => gap.acquisitionMode === "not_available")) return "abort_or_rescope";
3141
3146
  if (nonBlocking.some((gap) => gap.importance === "high")) return "build_domain_wiki";
3142
3147
  return "continue_with_caveat";
3143
3148
  }
@@ -4286,13 +4291,15 @@ var AxGepaSteeringOptimizer = class {
4286
4291
  const compiled = await optimizer.compile(
4287
4292
  selector,
4288
4293
  train,
4289
- (({ prediction, example }) => prediction?.variantId === example?.variantId ? 1 : 0),
4294
+ ({ prediction, example }) => prediction?.variantId === example?.variantId ? 1 : 0,
4290
4295
  {
4291
4296
  validationExamples: validation,
4292
4297
  maxMetricCalls: 64
4293
4298
  }
4294
4299
  );
4295
- selector.applyOptimization(compiled.optimizedProgram);
4300
+ if (compiled.optimizedProgram !== void 0) {
4301
+ selector.applyOptimization(compiled.optimizedProgram);
4302
+ }
4296
4303
  return {
4297
4304
  ...fallback,
4298
4305
  backend: "ax-gepa",
@@ -10410,20 +10417,20 @@ function mergeLayerResults(name, perAdapter, options = {}) {
10410
10417
  let durationMs = 0;
10411
10418
  const reasonParts = [];
10412
10419
  const diagnostics = {};
10413
- for (const { adapter: adapter2, result } of perAdapter) {
10420
+ for (const { adapter, result } of perAdapter) {
10414
10421
  status = worst(status, result.status);
10415
10422
  if (typeof result.score === "number") {
10416
10423
  weightedScoreSum += result.score;
10417
10424
  weightCount += 1;
10418
10425
  }
10419
10426
  durationMs = mergeDuration === "sum" ? durationMs + result.durationMs : Math.max(durationMs, result.durationMs);
10420
- reasonParts.push(`${adapter2}: ${result.status}`);
10427
+ reasonParts.push(`${adapter}: ${result.status}`);
10421
10428
  for (const f2 of result.findings) {
10422
10429
  findings.push({
10423
10430
  ...f2,
10424
10431
  layer: name,
10425
- message: prefix ? `${prefix(adapter2)} ${f2.message}` : f2.message,
10426
- detail: { ...f2.detail ?? {}, adapter: adapter2 }
10432
+ message: prefix ? `${prefix(adapter)} ${f2.message}` : f2.message,
10433
+ detail: { ...f2.detail ?? {}, adapter }
10427
10434
  });
10428
10435
  }
10429
10436
  for (const [k, v] of Object.entries(result.diagnostics ?? {})) {
@@ -10442,8 +10449,8 @@ function mergeLayerResults(name, perAdapter, options = {}) {
10442
10449
  reason: reasonParts.join(" \xB7 "),
10443
10450
  diagnostics: Object.keys(diagnostics).length > 0 ? diagnostics : void 0,
10444
10451
  detail: {
10445
- adapters: perAdapter.map(({ adapter: adapter2, result }) => ({
10446
- adapter: adapter2,
10452
+ adapters: perAdapter.map(({ adapter, result }) => ({
10453
+ adapter,
10447
10454
  status: result.status,
10448
10455
  score: result.score ?? null
10449
10456
  })),
@@ -10469,10 +10476,10 @@ function multiToolchainLayer(config) {
10469
10476
  reason: "no adapters detected"
10470
10477
  };
10471
10478
  }
10472
- const runOne = async (adapter2) => {
10473
- const adapterName = config.adapterName(adapter2);
10479
+ const runOne = async (adapter) => {
10480
+ const adapterName = config.adapterName(adapter);
10474
10481
  try {
10475
- const r = await config.run(adapter2, ctx);
10482
+ const r = await config.run(adapter, ctx);
10476
10483
  return { adapter: adapterName, result: r };
10477
10484
  } catch (err) {
10478
10485
  return {
@@ -11908,8 +11915,8 @@ function formatPct(value) {
11908
11915
  function bySplitOrder(a, b) {
11909
11916
  return ALL_SPLITS.indexOf(a) - ALL_SPLITS.indexOf(b);
11910
11917
  }
11911
- function runAdapter(adapter2, scenario, context) {
11912
- return typeof adapter2 === "function" ? adapter2(scenario, context) : adapter2.run(scenario, context);
11918
+ function runAdapter(adapter, scenario, context) {
11919
+ return typeof adapter === "function" ? adapter(scenario, context) : adapter.run(scenario, context);
11913
11920
  }
11914
11921
  function throwIfAborted(signal) {
11915
11922
  if (!signal?.aborted) return;
@@ -12325,6 +12332,24 @@ function fmt2(x) {
12325
12332
  }
12326
12333
 
12327
12334
  // src/researcher.ts
12335
+ var CallbackResearcher = class {
12336
+ constructor(callbacks) {
12337
+ this.callbacks = callbacks;
12338
+ }
12339
+ callbacks;
12340
+ inspectFailures(runs) {
12341
+ return this.callbacks.inspectFailures(runs);
12342
+ }
12343
+ proposeChange(failures) {
12344
+ return this.callbacks.proposeChange(failures);
12345
+ }
12346
+ applyChange(changes, baseline) {
12347
+ return this.callbacks.applyChange(changes, baseline);
12348
+ }
12349
+ evaluateChange(plan) {
12350
+ return this.callbacks.evaluateChange(plan);
12351
+ }
12352
+ };
12328
12353
  var NoopResearcher = class {
12329
12354
  hint;
12330
12355
  constructor(hint = "NoopResearcher: no implementation wired") {
@@ -12777,214 +12802,6 @@ function mean7(xs) {
12777
12802
  return xs.reduce((s, x) => s + x, 0) / xs.length;
12778
12803
  }
12779
12804
 
12780
- // src/benchmarks/types.ts
12781
- function fnv1a32(input) {
12782
- let h = 2166136261;
12783
- for (let i = 0; i < input.length; i++) {
12784
- h ^= input.charCodeAt(i) & 255;
12785
- h = h + ((h << 1) + (h << 4) + (h << 7) + (h << 8) + (h << 24)) >>> 0;
12786
- }
12787
- return h >>> 0;
12788
- }
12789
- var BENCHMARK_SPLIT_SEED = "agent-eval-v1";
12790
- function deterministicSplit(itemId, seed = BENCHMARK_SPLIT_SEED) {
12791
- const h = fnv1a32(`${seed}::${itemId}`);
12792
- const pos = h / 4294967296;
12793
- if (pos < 0.6) return "search";
12794
- if (pos < 0.8) return "dev";
12795
- return "holdout";
12796
- }
12797
-
12798
- // src/benchmarks/index.ts
12799
- var benchmarks_exports = {};
12800
- __export(benchmarks_exports, {
12801
- BENCHMARK_SPLIT_SEED: () => BENCHMARK_SPLIT_SEED,
12802
- deterministicSplit: () => deterministicSplit,
12803
- routing: () => routing_exports
12804
- });
12805
-
12806
- // src/benchmarks/routing/index.ts
12807
- var routing_exports = {};
12808
- __export(routing_exports, {
12809
- ROUTING_DATASET: () => ROUTING_DATASET,
12810
- RoutingAdapter: () => RoutingAdapter,
12811
- assignSplit: () => assignSplit,
12812
- evaluate: () => evaluate,
12813
- extractRouteTokens: () => extractRouteTokens,
12814
- loadDataset: () => loadDataset
12815
- });
12816
-
12817
- // src/benchmarks/routing/dataset.ts
12818
- var ROUTING_DATASET = [
12819
- {
12820
- id: "file_001",
12821
- category: "file",
12822
- prompt: "Save the meeting notes to /tmp/notes-2025-04.md as markdown.",
12823
- route: "fs.write",
12824
- synonyms: ["filesystem.write", "write_file"],
12825
- hardNegatives: ["fs.read", "chat.reply"]
12826
- },
12827
- {
12828
- id: "file_002",
12829
- category: "file",
12830
- prompt: "Read the contents of /etc/hosts and summarize the entries.",
12831
- route: "fs.read",
12832
- synonyms: ["filesystem.read", "read_file"],
12833
- hardNegatives: ["fs.write", "search.web"]
12834
- },
12835
- {
12836
- id: "file_003",
12837
- category: "file",
12838
- prompt: "List every Python file under src/ recursively.",
12839
- route: "fs.list",
12840
- synonyms: ["filesystem.list", "list_files"],
12841
- hardNegatives: ["fs.read", "search.code"]
12842
- },
12843
- {
12844
- id: "file_004",
12845
- category: "file",
12846
- prompt: "Delete the cached build at .turbo/cache.",
12847
- route: "fs.delete",
12848
- synonyms: ["filesystem.delete", "remove_file"],
12849
- hardNegatives: ["fs.write", "fs.list"]
12850
- },
12851
- {
12852
- id: "math_001",
12853
- category: "math",
12854
- prompt: "What is the integral of 3x^2 + 2x from 0 to 5?",
12855
- route: "math.integral",
12856
- synonyms: ["calculator.integral", "math.solve"],
12857
- hardNegatives: ["math.derivative", "chat.reply"]
12858
- },
12859
- {
12860
- id: "math_002",
12861
- category: "math",
12862
- prompt: "Compute the derivative of sin(x) * cos(x).",
12863
- route: "math.derivative",
12864
- synonyms: ["calculator.derivative", "math.solve"],
12865
- hardNegatives: ["math.integral", "math.algebra"]
12866
- },
12867
- {
12868
- id: "math_003",
12869
- category: "math",
12870
- prompt: "Solve 2x + 7 = 19 for x.",
12871
- route: "math.algebra",
12872
- synonyms: ["calculator.algebra", "math.solve"],
12873
- hardNegatives: ["math.derivative", "math.integral"]
12874
- },
12875
- {
12876
- id: "math_004",
12877
- category: "math",
12878
- prompt: "What is the prime factorization of 360?",
12879
- route: "math.numbertheory",
12880
- synonyms: ["calculator.factor", "math.solve"],
12881
- hardNegatives: ["math.algebra", "search.web"]
12882
- },
12883
- {
12884
- id: "search_001",
12885
- category: "search",
12886
- prompt: "Find recent papers on agent prompt optimization with held-out promotion gates.",
12887
- route: "search.web",
12888
- synonyms: ["web.search", "search.papers"],
12889
- hardNegatives: ["search.code", "chat.reply"]
12890
- },
12891
- {
12892
- id: "search_002",
12893
- category: "search",
12894
- prompt: "Search the codebase for every call site of `runProposeReview`.",
12895
- route: "search.code",
12896
- synonyms: ["code.search", "grep"],
12897
- hardNegatives: ["search.web", "fs.read"]
12898
- },
12899
- {
12900
- id: "search_003",
12901
- category: "search",
12902
- prompt: "What is the latest release of the Tangle network on GitHub?",
12903
- route: "search.web",
12904
- synonyms: ["web.search", "github.releases"],
12905
- hardNegatives: ["search.code", "chat.reply"]
12906
- },
12907
- {
12908
- id: "search_004",
12909
- category: "search",
12910
- prompt: "Find all TODO comments in the agent-eval src tree.",
12911
- route: "search.code",
12912
- synonyms: ["code.search", "grep"],
12913
- hardNegatives: ["search.web", "fs.list"]
12914
- },
12915
- {
12916
- id: "chat_001",
12917
- category: "chat",
12918
- prompt: "Hi there, how are you doing today?",
12919
- route: "chat.reply",
12920
- synonyms: ["conversation.reply"],
12921
- hardNegatives: ["search.web", "fs.read"]
12922
- },
12923
- {
12924
- id: "chat_002",
12925
- category: "chat",
12926
- prompt: "Please explain the difference between an LLM and a foundation model.",
12927
- route: "chat.reply",
12928
- synonyms: ["conversation.reply", "qa.answer"],
12929
- hardNegatives: ["search.web", "math.algebra"]
12930
- },
12931
- {
12932
- id: "chat_003",
12933
- category: "chat",
12934
- prompt: "Tell me a short joke about distributed systems.",
12935
- route: "chat.reply",
12936
- synonyms: ["conversation.reply"],
12937
- hardNegatives: ["search.web", "fs.read"]
12938
- },
12939
- {
12940
- id: "chat_004",
12941
- category: "chat",
12942
- prompt: "Acknowledge my last message with a thumbs up.",
12943
- route: "chat.reply",
12944
- synonyms: ["conversation.reply", "react"],
12945
- hardNegatives: ["fs.write", "search.web"]
12946
- }
12947
- ];
12948
-
12949
- // src/benchmarks/routing/index.ts
12950
- var RoutingAdapter = class {
12951
- async loadDataset(split) {
12952
- return ROUTING_DATASET.map((item) => ({ id: item.id, payload: item })).filter((it) => assignSplitImpl(it.id) === split);
12953
- }
12954
- async evaluate(item, response) {
12955
- const tokens2 = extractRouteTokens(response);
12956
- const correct = new Set([item.payload.route, ...item.payload.synonyms].map((s) => s.toLowerCase()));
12957
- const hardNeg = new Set(item.payload.hardNegatives.map((s) => s.toLowerCase()));
12958
- const firstMatch = tokens2.find((t) => correct.has(t.toLowerCase())) ?? null;
12959
- const firstHardNeg = tokens2.find((t) => hardNeg.has(t.toLowerCase())) ?? null;
12960
- const score = firstMatch ? 1 : 0;
12961
- return {
12962
- score,
12963
- raw: {
12964
- firstToken: tokens2[0] ?? null,
12965
- matchedRoute: firstMatch,
12966
- hitHardNegative: Boolean(firstHardNeg),
12967
- hardNegativeRoute: firstHardNeg,
12968
- category: item.payload.category
12969
- }
12970
- };
12971
- }
12972
- assignSplit(itemId) {
12973
- return assignSplitImpl(itemId);
12974
- }
12975
- };
12976
- function assignSplitImpl(itemId) {
12977
- return deterministicSplit(`routing::${itemId}`);
12978
- }
12979
- function extractRouteTokens(response) {
12980
- const matches2 = response.match(/[a-z][a-z0-9_]*\.[a-z][a-z0-9_]*/gi);
12981
- return matches2 ?? [];
12982
- }
12983
- var adapter = new RoutingAdapter();
12984
- var loadDataset = adapter.loadDataset.bind(adapter);
12985
- var evaluate = adapter.evaluate.bind(adapter);
12986
- var assignSplit = adapter.assignSplit.bind(adapter);
12987
-
12988
12805
  // src/reference-replay-steering.ts
12989
12806
  function referenceReplayRunsToSteeringRows(runs, options = {}) {
12990
12807
  const rows = [];
@@ -15436,11 +15253,22 @@ async function analyzeTraces(input, options) {
15436
15253
  findings: Array.isArray(result.findings) ? result.findings.filter((s) => typeof s === "string") : [],
15437
15254
  turns,
15438
15255
  turnCount: turns.length,
15439
- usage: analyst.getUsage(),
15440
- chatLog: analyst.getChatLog(),
15256
+ usage: normalizeRoleArrays(analyst.getUsage()),
15257
+ chatLog: normalizeRoleArrays(analyst.getChatLog()),
15441
15258
  actorPromptVersion: TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION
15442
15259
  };
15443
15260
  }
15261
+ function normalizeRoleArrays(value) {
15262
+ const record = value && typeof value === "object" ? value : {};
15263
+ return {
15264
+ actor: normalizeRecordArray(record.actor),
15265
+ responder: normalizeRecordArray(record.responder)
15266
+ };
15267
+ }
15268
+ function normalizeRecordArray(value) {
15269
+ if (!Array.isArray(value)) return [];
15270
+ return value.map((item) => item && typeof item === "object" ? { ...item } : { value: item });
15271
+ }
15444
15272
 
15445
15273
  // src/trace-analyst/insights.ts
15446
15274
  var DOMAIN_STOP_WORDS = /* @__PURE__ */ new Set([
@@ -15696,6 +15524,7 @@ export {
15696
15524
  BudgetBreachError,
15697
15525
  BudgetGuard,
15698
15526
  BuilderSession,
15527
+ CallbackResearcher,
15699
15528
  ConvergenceTracker,
15700
15529
  CostLedger,
15701
15530
  CostTracker,