npm - @tangle-network/agent-eval - Versions diffs - 0.65.0 → 0.66.0 - Mend

@tangle-network/agent-eval 0.65.0 → 0.66.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

package/CHANGELOG.md +12 -0
package/dist/adapters/otel.d.ts +1 -1
package/dist/campaign/index.d.ts +4 -3
package/dist/campaign/index.js +18 -19
package/dist/campaign/index.js.map +1 -1
package/dist/{chunk-7TPYV2ER.js → chunk-6XQIEUQ2.js} +140 -7
package/dist/chunk-6XQIEUQ2.js.map +1 -0
package/dist/{chunk-HKINEDRZ.js → chunk-DFS3FEXO.js} +3 -2
package/dist/chunk-DFS3FEXO.js.map +1 -0
package/dist/{chunk-4ODZXQV2.js → chunk-Q56RRLEC.js} +635 -2
package/dist/chunk-Q56RRLEC.js.map +1 -0
package/dist/chunk-RDK3P4JE.js +482 -0
package/dist/chunk-RDK3P4JE.js.map +1 -0
package/dist/contract/index.d.ts +10 -8
package/dist/contract/index.js +11 -12
package/dist/contract/index.js.map +1 -1
package/dist/hosted/index.d.ts +1 -1
package/dist/hosted/index.js +1 -1
package/dist/{index-CzhtwYBT.d.ts → index-DSEHMwvS.d.ts} +4 -2
package/dist/index.d.ts +246 -3
package/dist/index.js +292 -2
package/dist/index.js.map +1 -1
package/dist/openapi.json +1 -1
package/dist/provenance-BZUFC1_D.d.ts +292 -0
package/dist/{registry-DPly4_hZ.d.ts → registry-BzAEvqAt.d.ts} +1 -1
package/dist/{run-campaign-5J3ED2UJ.js → run-campaign-BVY3RGAZ.js} +2 -3
package/dist/{provenance-lqyLpOYR.d.ts → run-improvement-loop-BKpM5T4t.d.ts} +51 -329
package/package.json +1 -1
package/dist/chunk-4ODZXQV2.js.map +0 -1
package/dist/chunk-7TPYV2ER.js.map +0 -1
package/dist/chunk-CZRKD2X2.js +0 -1104
package/dist/chunk-CZRKD2X2.js.map +0 -1
package/dist/chunk-E22YUOAL.js +0 -111
package/dist/chunk-E22YUOAL.js.map +0 -1
package/dist/chunk-HKINEDRZ.js.map +0 -1
/package/dist/{run-campaign-5J3ED2UJ.js.map → run-campaign-BVY3RGAZ.js.map} +0 -0

package/dist/index.js CHANGED Viewed

@@ -14,24 +14,28 @@ import {
   Dataset,
   HoldoutLockedError,
   buildReflectionPrompt,
+  campaignMeanComposite,
   crowdingDistance,
   dominates,
+  gepaDriver,
   hashScenarios,
+  heldOutGate,
   paretoFrontier,
   paretoFrontierWithCrowding,
   parseReflectionResponse,
   redTeamDataset,
   redTeamReport,
   runCanaries,
+  runImprovementLoop,
   scalarScore,
   scoreRedTeamOutput,
   toolNamesForRun
-} from "./chunk-4ODZXQV2.js";
+} from "./chunk-Q56RRLEC.js";
 import {
   BackendIntegrityError,
   assertRealBackend,
   summarizeBackendIntegrity
-} from "./chunk-E22YUOAL.js";
+} from "./chunk-6XQIEUQ2.js";
 import {
   BENCHMARK_SPLIT_SEED,
   benchmarks_exports,
@@ -10430,6 +10434,284 @@ function traceJudgeEnsemble(judges, judgeNames, opts) {
     }
   };
 }
+// src/campaign/distillation/agreement-judge.ts
+var AGREEMENT_DIM = "agreement"; // Dimension key for the overall agreement score: the default declared dimension, and the key score() stores the comparator's score under.
+function buildAgreementJudge(options) {
+  const name = options.name ?? "gold-agreement";
+  const goldOnly = options.goldOnly ?? true;
+  const declaredDims = options.dimensionKeys ?? [AGREEMENT_DIM];
+  return {
+    name,
+    dimensions: declaredDims.map((key) => ({
+      key,
+      description: `Per-field agreement between the produced label and the gold label on '${key}'`
+    })),
+    appliesTo: goldOnly ? (scenario) => scenario.kind === "gold" : void 0,
+    score({ artifact, scenario }) {
+      const { score, dimensions } = options.compareLabels(artifact, scenario.label);
+      if (!Number.isFinite(score) || score < 0 || score > 1) {
+        throw new Error(
+          `buildAgreementJudge: comparator returned out-of-range score ${score} for scenario '${scenario.id}' (must be in [0,1])`
+        );
+      }
+      const outDims = { [AGREEMENT_DIM]: score, ...dimensions };
+      const weakest = Object.entries(dimensions).sort((a, b) => a[1] - b[1])[0];
+      const notes = weakest ? `agreement ${score.toFixed(3)}; weakest field '${weakest[0]}' (${weakest[1].toFixed(3)})` : `agreement ${score.toFixed(3)}`;
+      return { composite: score, dimensions: outDims, notes };
+    }
+  };
+}
+function fieldAgreement(spec) {
+  const categorical = spec.categorical ?? [];
+  const array = spec.array ?? [];
+  if (categorical.length === 0 && array.length === 0) {
+    throw new Error("fieldAgreement: at least one categorical or array field is required");
+  }
+  return (produced, gold) => {
+    const p = produced ?? {};
+    const g = gold ?? {};
+    const dimensions = {};
+    for (const field of categorical) {
+      dimensions[field] = categoricalAgreement(p[field], g[field]);
+    }
+    for (const field of array) {
+      dimensions[field] = jaccard(asArray(p[field]), asArray(g[field]));
+    }
+    const values = Object.values(dimensions);
+    const score = values.length === 0 ? 0 : values.reduce((a, b) => a + b, 0) / values.length;
+    return { score, dimensions };
+  };
+}
+function categoricalAgreement(produced, gold) {
+  if (produced === void 0 && gold === void 0) return 1;
+  return normalizeScalar(produced) === normalizeScalar(gold) ? 1 : 0;
+}
+function normalizeScalar(value) {
+  if (value === void 0) return "__undefined__";
+  if (value === null) return "__null__";
+  return JSON.stringify(value);
+}
+function asArray(value) {
+  if (Array.isArray(value)) return value;
+  if (value === void 0 || value === null) return [];
+  return [value];
+}
+function jaccard(a, b) {
+  const sa = new Set(a.map((x) => JSON.stringify(x)));
+  const sb = new Set(b.map((x) => JSON.stringify(x)));
+  if (sa.size === 0 && sb.size === 0) return 1;
+  let inter = 0;
+  for (const x of sa) if (sb.has(x)) inter++;
+  const union = sa.size + sb.size - inter;
+  return union === 0 ? 1 : inter / union;
+}
+// src/campaign/distillation/gold-scenarios.ts
+import { readFileSync as readFileSync7 } from "fs";
+function loadGoldScenarios(jsonlPath) {
+  const text = readFileSync7(jsonlPath, "utf8");
+  return parseGoldJsonl(text, jsonlPath);
+}
+function parseGoldJsonl(text, sourceLabel = "<inline>") {
+  const out = [];
+  const lines = text.split("\n");
+  for (let i = 0; i < lines.length; i++) {
+    const raw = lines[i].trim();
+    if (raw.length === 0) continue;
+    let parsed;
+    try {
+      parsed = JSON.parse(raw);
+    } catch (err) {
+      throw new Error(
+        `loadGoldScenarios: ${sourceLabel}:${i + 1} is not valid JSON \u2014 ${err instanceof Error ? err.message : String(err)}`
+      );
+    }
+    const rawId = parsed.scenarioId ?? parsed.id;
+    if (typeof rawId !== "string" || rawId.length === 0) {
+      throw new Error(
+        `loadGoldScenarios: ${sourceLabel}:${i + 1} missing string \`scenarioId\`/\`id\``
+      );
+    }
+    const id = rawId.replace(/:/g, "__");
+    if (parsed.input === void 0) {
+      throw new Error(`loadGoldScenarios: ${sourceLabel}:${i + 1} (${rawId}) missing \`input\``);
+    }
+    if (parsed.label === void 0) {
+      throw new Error(`loadGoldScenarios: ${sourceLabel}:${i + 1} (${rawId}) missing \`label\``);
+    }
+    const scenario = {
+      id,
+      kind: "gold",
+      input: parsed.input,
+      label: parsed.label
+    };
+    const tags = [];
+    if (id !== rawId) tags.push(`gold-id:${rawId}`);
+    if (parsed.split !== void 0) tags.push(`split:${parsed.split}`);
+    if (tags.length > 0) scenario.tags = tags;
+    out.push(scenario);
+  }
+  if (out.length === 0) {
+    throw new Error(`loadGoldScenarios: ${sourceLabel} contained no gold records`);
+  }
+  return out;
+}
+function splitGold(scenarios, options = {}) {
+  const testEveryNth = options.testEveryNth ?? 4;
+  if (!Number.isInteger(testEveryNth) || testEveryNth < 2) {
+    throw new Error("splitGold: testEveryNth must be an integer \u2265 2 (else train or test is empty)");
+  }
+  const train = [];
+  const test = [];
+  let implicitIndex = 0;
+  for (const scenario of scenarios) {
+    const explicit = explicitSplit(scenario);
+    if (explicit === "train") {
+      train.push(scenario);
+    } else if (explicit === "test") {
+      test.push(scenario);
+    } else {
+      if (implicitIndex % testEveryNth === 0) test.push(scenario);
+      else train.push(scenario);
+      implicitIndex += 1;
+    }
+  }
+  return { train, test };
+}
+function explicitSplit(scenario) {
+  for (const tag of scenario.tags ?? []) {
+    if (tag === "split:train") return "train";
+    if (tag === "split:test") return "test";
+  }
+  return void 0;
+}
+// src/campaign/distillation/run-distillation.ts
+async function runDistillation(opts) {
+  if (opts.train.length === 0) throw new Error("runDistillation: train split is empty");
+  if (opts.holdout.length === 0) throw new Error("runDistillation: holdout split is empty");
+  const chat = createChatClient(opts.llm);
+  const render = opts.renderStudentPrompt ?? defaultRenderStudentPrompt;
+  const parse = opts.parseStudentLabel ?? defaultParseStudentLabel;
+  const runDir = opts.runDir ?? `.evolve/distillation/${Date.now()}`;
+  const studentTemperature = opts.studentTemperature ?? 0;
+  const studentMaxTokens = opts.studentMaxTokens ?? 1024;
+  const driver = gepaDriver({
+    llm: opts.reflectionLlm,
+    model: opts.optimizerModel,
+    target: "a cheap single-shot analyst system prompt that reproduces an expensive workflow gold verdict",
+    mutationPrimitives: opts.mutationPrimitives ?? DEFAULT_MUTATION_PRIMITIVES2,
+    constraints: opts.constraints
+  });
+  const gate = opts.gate ?? heldOutGate({
+    scenarios: opts.holdout,
+    deltaThreshold: opts.deltaThreshold ?? 0
+  });
+  const loop = await runImprovementLoop({
+    baselineSurface: opts.baselinePrompt,
+    scenarios: opts.train,
+    holdoutScenarios: opts.holdout,
+    judges: [opts.judge],
+    driver,
+    gate,
+    autoOnPromote: "none",
+    // the loop NEVER opens a PR — the caller decides
+    populationSize: opts.populationSize ?? 4,
+    maxGenerations: opts.maxGenerations ?? 3,
+    reps: opts.reps ?? 1,
+    runDir,
+    // The student spends tokens; tracing must stay on (the driver is wired and
+    // runImprovementLoop refuses tracing='off' with a driver).
+    tracing: "on",
+    dispatchWithSurface: async (surface, scenario, ctx) => {
+      const prompt = render({
+        surface: typeof surface === "string" ? surface : JSON.stringify(surface),
+        input: scenario.input,
+        scenarioId: scenario.id
+      });
+      const response = await chat.chat(
+        {
+          model: opts.studentModel,
+          messages: prompt,
+          jsonMode: true,
+          temperature: studentTemperature,
+          maxTokens: studentMaxTokens
+        },
+        { signal: ctx.signal }
+      );
+      reportUsage(ctx.cost, response);
+      return parse(response.content, scenario.id);
+    }
+  });
+  const winnerPrompt = typeof loop.winnerSurface === "string" ? loop.winnerSurface : opts.baselinePrompt;
+  const baseline = campaignMeanComposite(loop.baselineOnHoldout);
+  const winner = campaignMeanComposite(loop.winnerOnHoldout);
+  return {
+    ...loop,
+    winnerPrompt,
+    holdoutAgreement: { baseline, winner, delta: winner - baseline }
+  };
+}
+function reportUsage(cost, response) {
+  if (typeof response.costUsd === "number") cost.observe(response.costUsd, "distillation-student");
+  cost.observeTokens({
+    input: response.usage.promptTokens,
+    output: response.usage.completionTokens,
+    cached: response.usage.cachedPromptTokens
+  });
+}
+var DEFAULT_MUTATION_PRIMITIVES2 = [ // Fallback mutation hints handed to gepaDriver when opts.mutationPrimitives is not provided.
+  "Add an explicit output-schema instruction so the model emits exactly the gold label fields as JSON.",
+  "Add a one-line decision rule for each verdict field the student keeps getting wrong.",
+  "Add a worked example mapping a representative input to its correct gold label.",
+  "Tighten ambiguous phrasing that lets the student hedge instead of committing to a verdict.",
+  "Add a guardrail that forces the student to set boolean risk flags (e.g. leak risk) when the triggering condition is present."
+];
+function defaultRenderStudentPrompt(args) {
+  return [
+    { role: "system", content: args.surface },
+    {
+      role: "user",
+      content: `Input:
+${stableStringify(args.input)}
+Respond with ONLY a single JSON object \u2014 the verdict. No prose, no code fences.`
+    }
+  ];
+}
+function defaultParseStudentLabel(rawContent, scenarioId) {
+  const stripped = stripFence(rawContent).trim();
+  if (stripped.length === 0) {
+    throw new Error(`distillation student returned empty output for scenario '${scenarioId}'`);
+  }
+  try {
+    return JSON.parse(stripped);
+  } catch (err) {
+    throw new Error(
+      `distillation student returned non-JSON for scenario '${scenarioId}': ${err instanceof Error ? err.message : String(err)} \u2014 raw: ${stripped.slice(0, 200)}`
+    );
+  }
+}
+function stripFence(text) {
+  const fenced = /```(?:json)?\s*([\s\S]*?)\s*```/.exec(text);
+  return fenced ? fenced[1] ?? text : text;
+}
+function stableStringify(value) {
+  return JSON.stringify(value, replacerSortKeys(), 2);
+}
+function replacerSortKeys() {
+  return (_key, value) => {
+    if (value && typeof value === "object" && !Array.isArray(value)) {
+      const sorted = {};
+      for (const k of Object.keys(value).sort()) {
+        sorted[k] = value[k];
+      }
+      return sorted;
+    }
+    return value;
+  };
+}
 export {
   AGENT_PROFILE_KINDS,
   ANALYST_SEVERITIES,
@@ -10572,6 +10854,7 @@ export {
   bonferroni,
   bootstrapCi,
   buildAgentProfileCell,
+  buildAgreementJudge,
   buildDriverSystemPrompt,
   buildReflectionPrompt,
   buildReviewerPrompt,
@@ -10646,8 +10929,10 @@ export {
   decideReferenceReplayRunPromotion,
   defaultIsMaterial,
   defaultJudges,
+  defaultParseStudentLabel,
   defaultProviderRedactor,
   defaultReferenceReplayMatcher,
+  defaultRenderStudentPrompt,
   defaultTraceInsightPanel,
   deployGateLayer,
   describeTraceInsightScope,
@@ -10678,6 +10963,7 @@ export {
   feedbackTrajectoriesToOptimizerRows,
   feedbackTrajectoryToDatasetScenario,
   feedbackTrajectoryToOptimizerRow,
+  fieldAgreement,
   fileContains,
   fileExists,
   findAutoMatchNoExpectation,
@@ -10732,6 +11018,7 @@ export {
   linterJudge,
   llmSpanFromProvider,
   llmSpans,
+  loadGoldScenarios,
   loadScorecard,
   loadScorerFromGrader,
   localCommandRunner,
@@ -10759,6 +11046,7 @@ export {
   parseCorrectnessResponse,
   parseFeedbackTrajectoriesJsonl,
   parseFindingSubject,
+  parseGoldJsonl,
   parseRawFinding,
   parseReflectionResponse,
   parseRunRecordSafe,
@@ -10810,6 +11098,7 @@ export {
   runBehavioralCanaries,
   runCanaries,
   runCounterfactual,
+  runDistillation,
   runE2EWorkflow,
   runEvalCampaign,
   runExpectations,
@@ -10844,6 +11133,7 @@ export {
   serializeFeedbackTrajectoriesJsonl,
   signManifest,
   soc2Report,
+  splitGold,
   statusAdvanced,
   stopOnNoProgress,
   stopOnRepeatedAction,