npm - @tangle-network/agent-eval - Versions diffs - 0.20.8 → 0.20.9 - Mend

@tangle-network/agent-eval 0.20.8 → 0.20.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21)

package/LICENSE +21 -0
package/README.md +9 -6
package/dist/benchmarks/index.d.ts +1 -0
package/dist/benchmarks/index.js +12 -0
package/dist/benchmarks/index.js.map +1 -0
package/dist/chunk-XDGJUIV2.js +219 -0
package/dist/chunk-XDGJUIV2.js.map +1 -0
package/dist/index-CEWY1rmu.d.ts +290 -0
package/dist/index.d.ts +37 -298
package/dist/index.js +68 -239
package/dist/index.js.map +1 -1
package/dist/openapi.json +477 -0
package/docs/concepts.md +4 -4
package/docs/knowledge-readiness.md +2 -2
package/docs/wire-protocol.md +3 -3
package/package.json +14 -7
package/examples/benchmarks/README.md +0 -44
package/examples/benchmarks/gsm8k/index.ts +0 -126
package/examples/benchmarks/swebench-lite/index.ts +0 -178
package/examples/multi-shot-optimization/index.ts +0 -114
package/examples/same-sandbox-harness/index.ts +0 -63

package/dist/index.js CHANGED

@@ -1,3 +1,8 @@
+import {
+  BENCHMARK_SPLIT_SEED,
+  benchmarks_exports,
+  deterministicSplit
+} from "./chunk-XDGJUIV2.js";
 import {
   LlmCallError,
   LlmClient,
@@ -6,9 +11,7 @@ import {
   probeLlm,
   stripFencedJson
 } from "./chunk-JAOLXRIA.js";
-import {
-  __export
-} from "./chunk-PZ5AY32C.js";
+import "./chunk-PZ5AY32C.js";
 // src/client.ts
 var ProductClient = class {
@@ -649,9 +652,9 @@ function feedbackTrajectoryToOptimizerRow(trajectory) {
 function feedbackTrajectoriesToOptimizerRows(trajectories) {
   return trajectories.map(feedbackTrajectoryToOptimizerRow);
 }
-async function replayFeedbackTrajectory(trajectory, adapter2) {
+async function replayFeedbackTrajectory(trajectory, adapter) {
   try {
-    const result = await adapter2.replay(trajectory);
+    const result = await adapter.replay(trajectory);
     return {
       trajectoryId: trajectory.id,
       ...result
@@ -680,10 +683,10 @@ async function replayFeedbackTrajectory(trajectory, adapter2) {
     };
   }
 }
-async function replayFeedbackTrajectories(trajectories, adapter2) {
+async function replayFeedbackTrajectories(trajectories, adapter) {
   const results = [];
   for (const trajectory of trajectories) {
-    results.push(await replayFeedbackTrajectory(trajectory, adapter2));
+    results.push(await replayFeedbackTrajectory(trajectory, adapter));
   }
   return results;
 }
@@ -2379,12 +2382,13 @@ async function runAgentControlLoop(config) {
     try {
       state = await config.observe({ history, abortSignal: controller.signal });
     } catch (err) {
-      runtimeErrors.push(runtimeError("observe", 0, err));
+      const error = runtimeError("observe", 0, err);
+      runtimeErrors.push(error);
       return finish(emitter, {
         intent: config.intent,
         pass: false,
         completed: false,
-        reason: runtimeErrors[0].message,
+        reason: error.message,
         steps: history,
         finalState: void 0,
         finalEvals: [],
@@ -2400,12 +2404,13 @@ async function runAgentControlLoop(config) {
       evals = await config.validate({ intent: config.intent, state, history, abortSignal: controller.signal });
       await recordEvalSpans(emitter, evals, "initial", runtimeErrors, 0);
     } catch (err) {
-      runtimeErrors.push(runtimeError("validate", 0, err));
+      const error = runtimeError("validate", 0, err);
+      runtimeErrors.push(error);
       return finish(emitter, {
         intent: config.intent,
         pass: false,
         completed: false,
-        reason: runtimeErrors[0].message,
+        reason: error.message,
         steps: history,
         finalState: state,
         finalEvals: [],
@@ -3133,11 +3138,11 @@ function isBlockingGap(requirement) {
 function chooseRecommendedAction(blocking, nonBlocking) {
   const gaps = blocking.length > 0 ? blocking : nonBlocking;
   if (gaps.length === 0) return "run_agent";
-  if (blocking.some((gap) => gap.acquisitionMode === "ask_user" || gap.fallbackPolicy === "ask")) return "ask_user";
-  if (blocking.some((gap) => gap.acquisitionMode === "query_connector")) return "query_connectors";
-  if (blocking.some((gap) => gap.acquisitionMode === "inspect_repo" || gap.acquisitionMode === "run_command")) return "inspect_repo";
-  if (blocking.some((gap) => gap.acquisitionMode === "search_web")) return "collect_web_data";
-  if (blocking.some((gap) => gap.acquisitionMode === "not_available")) return "abort_or_rescope";
+  if (gaps.some((gap) => gap.acquisitionMode === "ask_user" || gap.fallbackPolicy === "ask")) return "ask_user";
+  if (gaps.some((gap) => gap.acquisitionMode === "query_connector")) return "query_connectors";
+  if (gaps.some((gap) => gap.acquisitionMode === "inspect_repo" || gap.acquisitionMode === "run_command")) return "inspect_repo";
+  if (gaps.some((gap) => gap.acquisitionMode === "search_web")) return "collect_web_data";
+  if (gaps.some((gap) => gap.acquisitionMode === "not_available")) return "abort_or_rescope";
   if (nonBlocking.some((gap) => gap.importance === "high")) return "build_domain_wiki";
   return "continue_with_caveat";
 }
@@ -4286,13 +4291,15 @@ var AxGepaSteeringOptimizer = class {
     const compiled = await optimizer.compile(
       selector,
       train,
-      (({ prediction, example }) => prediction?.variantId === example?.variantId ? 1 : 0),
+      ({ prediction, example }) => prediction?.variantId === example?.variantId ? 1 : 0,
       {
         validationExamples: validation,
         maxMetricCalls: 64
       }
     );
-    selector.applyOptimization(compiled.optimizedProgram);
+    if (compiled.optimizedProgram !== void 0) {
+      selector.applyOptimization(compiled.optimizedProgram);
+    }
     return {
       ...fallback,
       backend: "ax-gepa",
@@ -10410,20 +10417,20 @@ function mergeLayerResults(name, perAdapter, options = {}) {
   let durationMs = 0;
   const reasonParts = [];
   const diagnostics = {};
-  for (const { adapter: adapter2, result } of perAdapter) {
+  for (const { adapter, result } of perAdapter) {
     status = worst(status, result.status);
     if (typeof result.score === "number") {
       weightedScoreSum += result.score;
       weightCount += 1;
     }
     durationMs = mergeDuration === "sum" ? durationMs + result.durationMs : Math.max(durationMs, result.durationMs);
-    reasonParts.push(`${adapter2}: ${result.status}`);
+    reasonParts.push(`${adapter}: ${result.status}`);
     for (const f2 of result.findings) {
       findings.push({
         ...f2,
         layer: name,
-        message: prefix ? `${prefix(adapter2)} ${f2.message}` : f2.message,
-        detail: { ...f2.detail ?? {}, adapter: adapter2 }
+        message: prefix ? `${prefix(adapter)} ${f2.message}` : f2.message,
+        detail: { ...f2.detail ?? {}, adapter }
       });
     }
     for (const [k, v] of Object.entries(result.diagnostics ?? {})) {
@@ -10442,8 +10449,8 @@ function mergeLayerResults(name, perAdapter, options = {}) {
     reason: reasonParts.join(" \xB7 "),
     diagnostics: Object.keys(diagnostics).length > 0 ? diagnostics : void 0,
     detail: {
-      adapters: perAdapter.map(({ adapter: adapter2, result }) => ({
-        adapter: adapter2,
+      adapters: perAdapter.map(({ adapter, result }) => ({
+        adapter,
         status: result.status,
         score: result.score ?? null
       })),
@@ -10469,10 +10476,10 @@ function multiToolchainLayer(config) {
           reason: "no adapters detected"
         };
       }
-      const runOne = async (adapter2) => {
-        const adapterName = config.adapterName(adapter2);
+      const runOne = async (adapter) => {
+        const adapterName = config.adapterName(adapter);
         try {
-          const r = await config.run(adapter2, ctx);
+          const r = await config.run(adapter, ctx);
           return { adapter: adapterName, result: r };
         } catch (err) {
           return {
@@ -11908,8 +11915,8 @@ function formatPct(value) {
 function bySplitOrder(a, b) {
   return ALL_SPLITS.indexOf(a) - ALL_SPLITS.indexOf(b);
 }
-function runAdapter(adapter2, scenario, context) {
-  return typeof adapter2 === "function" ? adapter2(scenario, context) : adapter2.run(scenario, context);
+function runAdapter(adapter, scenario, context) {
+  return typeof adapter === "function" ? adapter(scenario, context) : adapter.run(scenario, context);
 }
 function throwIfAborted(signal) {
   if (!signal?.aborted) return;
@@ -12325,6 +12332,24 @@ function fmt2(x) {
 }
 // src/researcher.ts
+var CallbackResearcher = class {
+  constructor(callbacks) {
+    this.callbacks = callbacks;
+  }
+  callbacks;
+  inspectFailures(runs) {
+    return this.callbacks.inspectFailures(runs);
+  }
+  proposeChange(failures) {
+    return this.callbacks.proposeChange(failures);
+  }
+  applyChange(changes, baseline) {
+    return this.callbacks.applyChange(changes, baseline);
+  }
+  evaluateChange(plan) {
+    return this.callbacks.evaluateChange(plan);
+  }
+};
 var NoopResearcher = class {
   hint;
   constructor(hint = "NoopResearcher: no implementation wired") {
@@ -12777,214 +12802,6 @@ function mean7(xs) {
   return xs.reduce((s, x) => s + x, 0) / xs.length;
 }
-// src/benchmarks/types.ts
-function fnv1a32(input) {
-  let h = 2166136261;
-  for (let i = 0; i < input.length; i++) {
-    h ^= input.charCodeAt(i) & 255;
-    h = h + ((h << 1) + (h << 4) + (h << 7) + (h << 8) + (h << 24)) >>> 0;
-  }
-  return h >>> 0;
-}
-var BENCHMARK_SPLIT_SEED = "agent-eval-v1";
-function deterministicSplit(itemId, seed = BENCHMARK_SPLIT_SEED) {
-  const h = fnv1a32(`${seed}::${itemId}`);
-  const pos = h / 4294967296;
-  if (pos < 0.6) return "search";
-  if (pos < 0.8) return "dev";
-  return "holdout";
-}
-// src/benchmarks/index.ts
-var benchmarks_exports = {};
-__export(benchmarks_exports, {
-  BENCHMARK_SPLIT_SEED: () => BENCHMARK_SPLIT_SEED,
-  deterministicSplit: () => deterministicSplit,
-  routing: () => routing_exports
-});
-// src/benchmarks/routing/index.ts
-var routing_exports = {};
-__export(routing_exports, {
-  ROUTING_DATASET: () => ROUTING_DATASET,
-  RoutingAdapter: () => RoutingAdapter,
-  assignSplit: () => assignSplit,
-  evaluate: () => evaluate,
-  extractRouteTokens: () => extractRouteTokens,
-  loadDataset: () => loadDataset
-});
-// src/benchmarks/routing/dataset.ts
-var ROUTING_DATASET = [
-  {
-    id: "file_001",
-    category: "file",
-    prompt: "Save the meeting notes to /tmp/notes-2025-04.md as markdown.",
-    route: "fs.write",
-    synonyms: ["filesystem.write", "write_file"],
-    hardNegatives: ["fs.read", "chat.reply"]
-  },
-  {
-    id: "file_002",
-    category: "file",
-    prompt: "Read the contents of /etc/hosts and summarize the entries.",
-    route: "fs.read",
-    synonyms: ["filesystem.read", "read_file"],
-    hardNegatives: ["fs.write", "search.web"]
-  },
-  {
-    id: "file_003",
-    category: "file",
-    prompt: "List every Python file under src/ recursively.",
-    route: "fs.list",
-    synonyms: ["filesystem.list", "list_files"],
-    hardNegatives: ["fs.read", "search.code"]
-  },
-  {
-    id: "file_004",
-    category: "file",
-    prompt: "Delete the cached build at .turbo/cache.",
-    route: "fs.delete",
-    synonyms: ["filesystem.delete", "remove_file"],
-    hardNegatives: ["fs.write", "fs.list"]
-  },
-  {
-    id: "math_001",
-    category: "math",
-    prompt: "What is the integral of 3x^2 + 2x from 0 to 5?",
-    route: "math.integral",
-    synonyms: ["calculator.integral", "math.solve"],
-    hardNegatives: ["math.derivative", "chat.reply"]
-  },
-  {
-    id: "math_002",
-    category: "math",
-    prompt: "Compute the derivative of sin(x) * cos(x).",
-    route: "math.derivative",
-    synonyms: ["calculator.derivative", "math.solve"],
-    hardNegatives: ["math.integral", "math.algebra"]
-  },
-  {
-    id: "math_003",
-    category: "math",
-    prompt: "Solve 2x + 7 = 19 for x.",
-    route: "math.algebra",
-    synonyms: ["calculator.algebra", "math.solve"],
-    hardNegatives: ["math.derivative", "math.integral"]
-  },
-  {
-    id: "math_004",
-    category: "math",
-    prompt: "What is the prime factorization of 360?",
-    route: "math.numbertheory",
-    synonyms: ["calculator.factor", "math.solve"],
-    hardNegatives: ["math.algebra", "search.web"]
-  },
-  {
-    id: "search_001",
-    category: "search",
-    prompt: "Find recent papers on agent prompt optimization with held-out promotion gates.",
-    route: "search.web",
-    synonyms: ["web.search", "search.papers"],
-    hardNegatives: ["search.code", "chat.reply"]
-  },
-  {
-    id: "search_002",
-    category: "search",
-    prompt: "Search the codebase for every call site of `runProposeReview`.",
-    route: "search.code",
-    synonyms: ["code.search", "grep"],
-    hardNegatives: ["search.web", "fs.read"]
-  },
-  {
-    id: "search_003",
-    category: "search",
-    prompt: "What is the latest release of the Tangle network on GitHub?",
-    route: "search.web",
-    synonyms: ["web.search", "github.releases"],
-    hardNegatives: ["search.code", "chat.reply"]
-  },
-  {
-    id: "search_004",
-    category: "search",
-    prompt: "Find all TODO comments in the agent-eval src tree.",
-    route: "search.code",
-    synonyms: ["code.search", "grep"],
-    hardNegatives: ["search.web", "fs.list"]
-  },
-  {
-    id: "chat_001",
-    category: "chat",
-    prompt: "Hi there, how are you doing today?",
-    route: "chat.reply",
-    synonyms: ["conversation.reply"],
-    hardNegatives: ["search.web", "fs.read"]
-  },
-  {
-    id: "chat_002",
-    category: "chat",
-    prompt: "Please explain the difference between an LLM and a foundation model.",
-    route: "chat.reply",
-    synonyms: ["conversation.reply", "qa.answer"],
-    hardNegatives: ["search.web", "math.algebra"]
-  },
-  {
-    id: "chat_003",
-    category: "chat",
-    prompt: "Tell me a short joke about distributed systems.",
-    route: "chat.reply",
-    synonyms: ["conversation.reply"],
-    hardNegatives: ["search.web", "fs.read"]
-  },
-  {
-    id: "chat_004",
-    category: "chat",
-    prompt: "Acknowledge my last message with a thumbs up.",
-    route: "chat.reply",
-    synonyms: ["conversation.reply", "react"],
-    hardNegatives: ["fs.write", "search.web"]
-  }
-];
-// src/benchmarks/routing/index.ts
-var RoutingAdapter = class {
-  async loadDataset(split) {
-    return ROUTING_DATASET.map((item) => ({ id: item.id, payload: item })).filter((it) => assignSplitImpl(it.id) === split);
-  }
-  async evaluate(item, response) {
-    const tokens2 = extractRouteTokens(response);
-    const correct = new Set([item.payload.route, ...item.payload.synonyms].map((s) => s.toLowerCase()));
-    const hardNeg = new Set(item.payload.hardNegatives.map((s) => s.toLowerCase()));
-    const firstMatch = tokens2.find((t) => correct.has(t.toLowerCase())) ?? null;
-    const firstHardNeg = tokens2.find((t) => hardNeg.has(t.toLowerCase())) ?? null;
-    const score = firstMatch ? 1 : 0;
-    return {
-      score,
-      raw: {
-        firstToken: tokens2[0] ?? null,
-        matchedRoute: firstMatch,
-        hitHardNegative: Boolean(firstHardNeg),
-        hardNegativeRoute: firstHardNeg,
-        category: item.payload.category
-      }
-    };
-  }
-  assignSplit(itemId) {
-    return assignSplitImpl(itemId);
-  }
-};
-function assignSplitImpl(itemId) {
-  return deterministicSplit(`routing::${itemId}`);
-}
-function extractRouteTokens(response) {
-  const matches2 = response.match(/[a-z][a-z0-9_]*\.[a-z][a-z0-9_]*/gi);
-  return matches2 ?? [];
-}
-var adapter = new RoutingAdapter();
-var loadDataset = adapter.loadDataset.bind(adapter);
-var evaluate = adapter.evaluate.bind(adapter);
-var assignSplit = adapter.assignSplit.bind(adapter);
 // src/reference-replay-steering.ts
 function referenceReplayRunsToSteeringRows(runs, options = {}) {
   const rows = [];
@@ -15436,11 +15253,22 @@ async function analyzeTraces(input, options) {
     findings: Array.isArray(result.findings) ? result.findings.filter((s) => typeof s === "string") : [],
     turns,
     turnCount: turns.length,
-    usage: analyst.getUsage(),
-    chatLog: analyst.getChatLog(),
+    usage: normalizeRoleArrays(analyst.getUsage()),
+    chatLog: normalizeRoleArrays(analyst.getChatLog()),
     actorPromptVersion: TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION
   };
 }
+function normalizeRoleArrays(value) {
+  const record = value && typeof value === "object" ? value : {};
+  return {
+    actor: normalizeRecordArray(record.actor),
+    responder: normalizeRecordArray(record.responder)
+  };
+}
+function normalizeRecordArray(value) {
+  if (!Array.isArray(value)) return [];
+  return value.map((item) => item && typeof item === "object" ? { ...item } : { value: item });
+}
 // src/trace-analyst/insights.ts
 var DOMAIN_STOP_WORDS = /* @__PURE__ */ new Set([
@@ -15696,6 +15524,7 @@ export {
   BudgetBreachError,
   BudgetGuard,
   BuilderSession,
+  CallbackResearcher,
   ConvergenceTracker,
   CostLedger,
   CostTracker,