@tangle-network/agent-eval 0.65.0 → 0.67.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. package/CHANGELOG.md +25 -0
  2. package/dist/adapters/otel.d.ts +1 -1
  3. package/dist/campaign/index.d.ts +110 -6
  4. package/dist/campaign/index.js +26 -19
  5. package/dist/campaign/index.js.map +1 -1
  6. package/dist/{chunk-7TPYV2ER.js → chunk-6XQIEUQ2.js} +140 -7
  7. package/dist/chunk-6XQIEUQ2.js.map +1 -0
  8. package/dist/{chunk-HKINEDRZ.js → chunk-DFS3FEXO.js} +3 -2
  9. package/dist/chunk-DFS3FEXO.js.map +1 -0
  10. package/dist/chunk-MZ2IYGGN.js +592 -0
  11. package/dist/chunk-MZ2IYGGN.js.map +1 -0
  12. package/dist/{chunk-4ODZXQV2.js → chunk-NV2PF37Q.js} +645 -2
  13. package/dist/chunk-NV2PF37Q.js.map +1 -0
  14. package/dist/contract/index.d.ts +11 -9
  15. package/dist/contract/index.js +11 -12
  16. package/dist/contract/index.js.map +1 -1
  17. package/dist/hosted/index.d.ts +1 -1
  18. package/dist/hosted/index.js +1 -1
  19. package/dist/{index-CzhtwYBT.d.ts → index-DSEHMwvS.d.ts} +4 -2
  20. package/dist/index.d.ts +251 -7
  21. package/dist/index.js +292 -2
  22. package/dist/index.js.map +1 -1
  23. package/dist/openapi.json +1 -1
  24. package/dist/provenance-CChUqexv.d.ts +314 -0
  25. package/dist/{registry-DPly4_hZ.d.ts → registry-BGKyX6bw.d.ts} +2 -2
  26. package/dist/release-report-CN8hJlhk.d.ts +233 -0
  27. package/dist/reporting.d.ts +4 -3
  28. package/dist/{run-campaign-5J3ED2UJ.js → run-campaign-BVY3RGAZ.js} +2 -3
  29. package/dist/{provenance-lqyLpOYR.d.ts → run-improvement-loop-BKpM5T4t.d.ts} +51 -329
  30. package/dist/statistics-B7yCbi9i.d.ts +253 -0
  31. package/dist/{types-DhqpAi_z.d.ts → types-Croy5h7V.d.ts} +1 -1
  32. package/package.json +1 -1
  33. package/dist/chunk-4ODZXQV2.js.map +0 -1
  34. package/dist/chunk-7TPYV2ER.js.map +0 -1
  35. package/dist/chunk-CZRKD2X2.js +0 -1104
  36. package/dist/chunk-CZRKD2X2.js.map +0 -1
  37. package/dist/chunk-E22YUOAL.js +0 -111
  38. package/dist/chunk-E22YUOAL.js.map +0 -1
  39. package/dist/chunk-HKINEDRZ.js.map +0 -1
  40. package/dist/release-report-DGoeObZT.d.ts +0 -484
  41. package/dist/{run-campaign-5J3ED2UJ.js.map → run-campaign-BVY3RGAZ.js.map} +0 -0
package/dist/index.js CHANGED
@@ -14,24 +14,28 @@ import {
14
14
  Dataset,
15
15
  HoldoutLockedError,
16
16
  buildReflectionPrompt,
17
+ campaignMeanComposite,
17
18
  crowdingDistance,
18
19
  dominates,
20
+ gepaDriver,
19
21
  hashScenarios,
22
+ heldOutGate,
20
23
  paretoFrontier,
21
24
  paretoFrontierWithCrowding,
22
25
  parseReflectionResponse,
23
26
  redTeamDataset,
24
27
  redTeamReport,
25
28
  runCanaries,
29
+ runImprovementLoop,
26
30
  scalarScore,
27
31
  scoreRedTeamOutput,
28
32
  toolNamesForRun
29
- } from "./chunk-4ODZXQV2.js";
33
+ } from "./chunk-NV2PF37Q.js";
30
34
  import {
31
35
  BackendIntegrityError,
32
36
  assertRealBackend,
33
37
  summarizeBackendIntegrity
34
- } from "./chunk-E22YUOAL.js";
38
+ } from "./chunk-6XQIEUQ2.js";
35
39
  import {
36
40
  BENCHMARK_SPLIT_SEED,
37
41
  benchmarks_exports,
@@ -10430,6 +10434,284 @@ function traceJudgeEnsemble(judges, judgeNames, opts) {
10430
10434
  }
10431
10435
  };
10432
10436
  }
10437
+
10438
+ // src/campaign/distillation/agreement-judge.ts
10439
// Key under which the overall agreement score is reported in a judge result.
var AGREEMENT_DIM = "agreement";

/**
 * Build a judge that scores a produced artifact against a scenario's label
 * using a caller-supplied comparator.
 *
 * @param {object} options
 * @param {(artifact: unknown, label: unknown) => {score: number, dimensions?: Record<string, number> | null}} options.compareLabels
 *   Comparator; `score` must be a finite number in [0, 1]. `dimensions` is
 *   optional and is treated as empty when absent or null.
 * @param {string} [options.name="gold-agreement"] Judge name.
 * @param {boolean} [options.goldOnly=true] When true, `appliesTo` accepts only
 *   scenarios whose `kind` is `"gold"`; when false, `appliesTo` is undefined.
 * @param {string[]} [options.dimensionKeys] Declared dimension keys
 *   (defaults to just the overall agreement key).
 * @returns {{name: string, dimensions: Array<{key: string, description: string}>, appliesTo: Function | undefined, score: Function}}
 * @throws {Error} From `score` when the comparator's score is not a finite
 *   number within [0, 1].
 */
function buildAgreementJudge(options) {
  const name = options.name ?? "gold-agreement";
  const goldOnly = options.goldOnly ?? true;
  const declaredDims = options.dimensionKeys ?? [AGREEMENT_DIM];
  return {
    name,
    dimensions: declaredDims.map((key) => ({
      key,
      description: `Per-field agreement between the produced label and the gold label on '${key}'`
    })),
    appliesTo: goldOnly ? (scenario) => scenario.kind === "gold" : void 0,
    score({ artifact, scenario }) {
      const comparison = options.compareLabels(artifact, scenario.label);
      const { score } = comparison;
      // A comparator may legitimately return only `{ score }`; without this
      // default, Object.entries() below would throw a bare TypeError.
      const dimensions = comparison.dimensions ?? {};
      if (!Number.isFinite(score) || score < 0 || score > 1) {
        throw new Error(
          `buildAgreementJudge: comparator returned out-of-range score ${score} for scenario '${scenario.id}' (must be in [0,1])`
        );
      }
      const outDims = { [AGREEMENT_DIM]: score, ...dimensions };
      // Lowest-scoring per-field dimension, surfaced in the notes.
      const weakest = Object.entries(dimensions).sort((a, b) => a[1] - b[1])[0];
      const notes = weakest
        ? `agreement ${score.toFixed(3)}; weakest field '${weakest[0]}' (${weakest[1].toFixed(3)})`
        : `agreement ${score.toFixed(3)}`;
      return { composite: score, dimensions: outDims, notes };
    }
  };
}
10465
/**
 * Create a label comparator that averages per-field agreement.
 *
 * Fields listed in `spec.categorical` are compared with
 * `categoricalAgreement`; fields listed in `spec.array` are coerced with
 * `asArray` and compared with `jaccard`.
 *
 * @param {{categorical?: string[], array?: string[]}} spec
 * @returns {(produced: unknown, gold: unknown) => {score: number, dimensions: Record<string, number>}}
 * @throws {Error} When neither field list has any entries.
 */
function fieldAgreement(spec) {
  const categoricalFields = spec.categorical ?? [];
  const arrayFields = spec.array ?? [];
  const hasAnyField = categoricalFields.length !== 0 || arrayFields.length !== 0;
  if (!hasAnyField) {
    throw new Error("fieldAgreement: at least one categorical or array field is required");
  }

  return (produced, gold) => {
    // A missing label on either side is treated as an empty record.
    const producedLabel = produced ?? {};
    const goldLabel = gold ?? {};
    const dimensions = {};

    for (const name of categoricalFields) {
      dimensions[name] = categoricalAgreement(producedLabel[name], goldLabel[name]);
    }
    for (const name of arrayFields) {
      const producedItems = asArray(producedLabel[name]);
      const goldItems = asArray(goldLabel[name]);
      dimensions[name] = jaccard(producedItems, goldItems);
    }

    const perField = Object.values(dimensions);
    if (perField.length === 0) {
      return { score: 0, dimensions };
    }
    let total = 0;
    for (const value of perField) {
      total += value;
    }
    return { score: total / perField.length, dimensions };
  };
}
10486
/**
 * Exact-match agreement for a single categorical field.
 *
 * @param {unknown} produced Value from the produced label.
 * @param {unknown} gold Value from the gold label.
 * @returns {0 | 1} 1 when both are undefined or their normalized forms are equal, else 0.
 */
function categoricalAgreement(produced, gold) {
  const bothAbsent = produced === void 0 && gold === void 0;
  if (bothAbsent) {
    return 1;
  }
  const isMatch = normalizeScalar(produced) === normalizeScalar(gold);
  return isMatch ? 1 : 0;
}
10490
/**
 * Turn a value into a comparable string key.
 *
 * `undefined` and `null` map to distinct sentinel strings; everything else
 * is passed through `JSON.stringify`.
 *
 * @param {unknown} value
 * @returns {string | undefined} Whatever `JSON.stringify` yields for non-nullish input.
 */
function normalizeScalar(value) {
  switch (value) {
    case void 0:
      return "__undefined__";
    case null:
      return "__null__";
    default:
      return JSON.stringify(value);
  }
}
10495
/**
 * Coerce a value to an array.
 *
 * @param {unknown} value
 * @returns {unknown[]} `[]` for null/undefined, the same array instance for
 *   arrays, otherwise a one-element array wrapping the value.
 */
function asArray(value) {
  if (value === void 0 || value === null) {
    return [];
  }
  return Array.isArray(value) ? value : [value];
}
10500
/**
 * Jaccard similarity of two arrays, comparing elements by their
 * `JSON.stringify` form (duplicates collapse).
 *
 * @param {unknown[]} a
 * @param {unknown[]} b
 * @returns {number} |A ∩ B| / |A ∪ B|, or 1 when both sets are empty.
 */
function jaccard(a, b) {
  const toKey = (item) => JSON.stringify(item);
  const left = new Set(a.map(toKey));
  const right = new Set(b.map(toKey));
  if (left.size === 0 && right.size === 0) {
    return 1;
  }

  let shared = 0;
  left.forEach((key) => {
    if (right.has(key)) {
      shared += 1;
    }
  });

  const unionSize = left.size + right.size - shared;
  if (unionSize === 0) {
    return 1;
  }
  return shared / unionSize;
}
10509
+
10510
+ // src/campaign/distillation/gold-scenarios.ts
10511
+ import { readFileSync as readFileSync7 } from "fs";
10512
/**
 * Read a JSONL file synchronously as UTF-8 and parse it with
 * `parseGoldJsonl`, using the path as the source label in error messages.
 *
 * @param {string} jsonlPath Path to the JSONL file.
 * @returns {object[]} Parsed scenarios.
 */
function loadGoldScenarios(jsonlPath) {
  return parseGoldJsonl(readFileSync7(jsonlPath, "utf8"), jsonlPath);
}
10516
/**
 * Parse JSONL text into gold scenarios.
 *
 * Each non-blank line must be a JSON object with a non-empty string
 * `scenarioId` (or `id`), an `input`, and a `label`. Colons in the id are
 * replaced with `__`; when that changes the id, the original is kept as a
 * `gold-id:<raw>` tag. A `split` value, if present, becomes a
 * `split:<value>` tag.
 *
 * @param {string} text JSONL content.
 * @param {string} [sourceLabel="<inline>"] Name used in error messages.
 * @returns {Array<{id: string, kind: "gold", input: unknown, label: unknown, tags?: string[]}>}
 * @throws {Error} On invalid JSON, a line that is not a JSON object, a
 *   missing `scenarioId`/`id`, `input` or `label`, or when no records are found.
 */
function parseGoldJsonl(text, sourceLabel = "<inline>") {
  const out = [];
  const lines = text.split("\n");
  for (let i = 0; i < lines.length; i++) {
    // trim() also drops a trailing "\r" from CRLF files.
    const raw = lines[i].trim();
    if (raw.length === 0) continue;
    const where = `${sourceLabel}:${i + 1}`;
    let parsed;
    try {
      parsed = JSON.parse(raw);
    } catch (err) {
      throw new Error(
        `loadGoldScenarios: ${where} is not valid JSON \u2014 ${err instanceof Error ? err.message : String(err)}`,
        { cause: err }
      );
    }
    // `null`, numbers, strings and arrays are valid JSON but not records.
    // Rejecting them here keeps the source:line context, instead of a bare
    // TypeError from the property reads below.
    if (parsed === null || typeof parsed !== "object" || Array.isArray(parsed)) {
      throw new Error(`loadGoldScenarios: ${where} is not a JSON object`);
    }
    const rawId = parsed.scenarioId ?? parsed.id;
    if (typeof rawId !== "string" || rawId.length === 0) {
      throw new Error(
        `loadGoldScenarios: ${where} missing string \`scenarioId\`/\`id\``
      );
    }
    const id = rawId.replace(/:/g, "__");
    if (parsed.input === void 0) {
      throw new Error(`loadGoldScenarios: ${where} (${rawId}) missing \`input\``);
    }
    if (parsed.label === void 0) {
      throw new Error(`loadGoldScenarios: ${where} (${rawId}) missing \`label\``);
    }
    const scenario = {
      id,
      kind: "gold",
      input: parsed.input,
      label: parsed.label
    };
    const tags = [];
    if (id !== rawId) tags.push(`gold-id:${rawId}`);
    if (parsed.split !== void 0) tags.push(`split:${parsed.split}`);
    if (tags.length > 0) scenario.tags = tags;
    out.push(scenario);
  }
  if (out.length === 0) {
    throw new Error(`loadGoldScenarios: ${sourceLabel} contained no gold records`);
  }
  return out;
}
10560
/**
 * Partition scenarios into train and test sets.
 *
 * Scenarios carrying an explicit `split:train` / `split:test` tag go where
 * the tag says. The rest are assigned by position among untagged scenarios:
 * every `testEveryNth`-th one (starting with the first) goes to test.
 *
 * @param {object[]} scenarios
 * @param {{testEveryNth?: number}} [options] `testEveryNth` defaults to 4.
 * @returns {{train: object[], test: object[]}}
 * @throws {Error} When `testEveryNth` is not an integer >= 2.
 */
function splitGold(scenarios, options = {}) {
  const testEveryNth = options.testEveryNth ?? 4;
  const strideIsValid = Number.isInteger(testEveryNth) && testEveryNth >= 2;
  if (!strideIsValid) {
    throw new Error("splitGold: testEveryNth must be an integer \u2265 2 (else train or test is empty)");
  }

  const buckets = { train: [], test: [] };
  // Counts only scenarios without an explicit split tag.
  let untaggedSeen = 0;
  for (const scenario of scenarios) {
    let target = explicitSplit(scenario);
    if (target !== "train" && target !== "test") {
      target = untaggedSeen % testEveryNth === 0 ? "test" : "train";
      untaggedSeen += 1;
    }
    buckets[target].push(scenario);
  }
  return buckets;
}
10582
/**
 * Find the first explicit split tag on a scenario.
 *
 * @param {{tags?: string[]}} scenario
 * @returns {"train" | "test" | undefined} The split named by the first
 *   `split:train` / `split:test` tag, or undefined when there is none.
 */
function explicitSplit(scenario) {
  const tags = scenario.tags ?? [];
  for (const tag of tags) {
    switch (tag) {
      case "split:train":
        return "train";
      case "split:test":
        return "test";
      default:
        break;
    }
  }
  return void 0;
}
10589
+
10590
+ // src/campaign/distillation/run-distillation.ts
10591
/**
 * Run an improvement loop that evolves a student system prompt against the
 * train split and evaluates it on the holdout split.
 *
 * The student is called once per (surface, scenario) through a chat client
 * created from `opts.llm`; its token/cost usage is reported to `ctx.cost`
 * and its response content is parsed into a label.
 *
 * @param {object} opts Distillation options (train/holdout scenarios, judge,
 *   baseline prompt, models, and optional overrides read below).
 * @returns {Promise<object>} The loop result spread together with
 *   `winnerPrompt` and `holdoutAgreement: { baseline, winner, delta }`.
 * @throws {Error} When the train or holdout split is empty.
 */
async function runDistillation(opts) {
  if (opts.train.length === 0) throw new Error("runDistillation: train split is empty");
  if (opts.holdout.length === 0) throw new Error("runDistillation: holdout split is empty");

  const chat = createChatClient(opts.llm);
  const renderPrompt = opts.renderStudentPrompt ?? defaultRenderStudentPrompt;
  const parseLabel = opts.parseStudentLabel ?? defaultParseStudentLabel;
  const runDir = opts.runDir ?? `.evolve/distillation/${Date.now()}`;
  const studentTemperature = opts.studentTemperature ?? 0;
  const studentMaxTokens = opts.studentMaxTokens ?? 1024;

  const driver = gepaDriver({
    llm: opts.reflectionLlm,
    model: opts.optimizerModel,
    target: "a cheap single-shot analyst system prompt that reproduces an expensive workflow gold verdict",
    mutationPrimitives: opts.mutationPrimitives ?? DEFAULT_MUTATION_PRIMITIVES2,
    constraints: opts.constraints
  });
  const gate = opts.gate ?? heldOutGate({
    scenarios: opts.holdout,
    deltaThreshold: opts.deltaThreshold ?? 0
  });

  // One student call: render the prompt, query the model, record usage,
  // then parse the raw content into a label.
  const dispatchStudent = async (surface, scenario, ctx) => {
    const surfaceText = typeof surface === "string" ? surface : JSON.stringify(surface);
    const messages = renderPrompt({
      surface: surfaceText,
      input: scenario.input,
      scenarioId: scenario.id
    });
    const request = {
      model: opts.studentModel,
      messages,
      jsonMode: true,
      temperature: studentTemperature,
      maxTokens: studentMaxTokens
    };
    const response = await chat.chat(request, { signal: ctx.signal });
    reportUsage(ctx.cost, response);
    return parseLabel(response.content, scenario.id);
  };

  const loop = await runImprovementLoop({
    baselineSurface: opts.baselinePrompt,
    scenarios: opts.train,
    holdoutScenarios: opts.holdout,
    judges: [opts.judge],
    driver,
    gate,
    // the loop NEVER opens a PR — the caller decides
    autoOnPromote: "none",
    populationSize: opts.populationSize ?? 4,
    maxGenerations: opts.maxGenerations ?? 3,
    reps: opts.reps ?? 1,
    runDir,
    // The student spends tokens; tracing must stay on (the driver is wired and
    // runImprovementLoop refuses tracing='off' with a driver).
    tracing: "on",
    dispatchWithSurface: dispatchStudent
  });

  // Fall back to the baseline prompt when the winning surface is not a string.
  const winnerIsText = typeof loop.winnerSurface === "string";
  const winnerPrompt = winnerIsText ? loop.winnerSurface : opts.baselinePrompt;
  const baseline = campaignMeanComposite(loop.baselineOnHoldout);
  const winner = campaignMeanComposite(loop.winnerOnHoldout);
  const holdoutAgreement = { baseline, winner, delta: winner - baseline };
  return { ...loop, winnerPrompt, holdoutAgreement };
}
10656
/**
 * Forward one chat response's usage to a cost tracker.
 *
 * @param {{observe: Function, observeTokens: Function}} cost Tracker receiving the usage.
 * @param {{costUsd?: number, usage: {promptTokens: number, completionTokens: number, cachedPromptTokens?: number}}} response
 * @returns {void}
 */
function reportUsage(cost, response) {
  const { costUsd } = response;
  // Dollar cost is only reported when it is actually a number.
  if (typeof costUsd === "number") {
    cost.observe(costUsd, "distillation-student");
  }
  const { promptTokens, completionTokens, cachedPromptTokens } = response.usage;
  cost.observeTokens({
    input: promptTokens,
    output: completionTokens,
    cached: cachedPromptTokens
  });
}
10664
// Default prompt-mutation hints handed to the driver as `mutationPrimitives`
// when the caller of runDistillation does not supply `opts.mutationPrimitives`.
var DEFAULT_MUTATION_PRIMITIVES2 = [
  "Add an explicit output-schema instruction so the model emits exactly the gold label fields as JSON.",
  "Add a one-line decision rule for each verdict field the student keeps getting wrong.",
  "Add a worked example mapping a representative input to its correct gold label.",
  "Tighten ambiguous phrasing that lets the student hedge instead of committing to a verdict.",
  "Add a guardrail that forces the student to set boolean risk flags (e.g. leak risk) when the triggering condition is present."
];
10671
/**
 * Default student prompt: the surface as the system message, and the
 * scenario input (pretty-printed with sorted keys) plus a JSON-only
 * instruction as the user message.
 *
 * @param {{surface: string, input: unknown}} args
 * @returns {Array<{role: string, content: string}>} System and user messages, in that order.
 */
function defaultRenderStudentPrompt(args) {
  const systemMessage = { role: "system", content: args.surface };
  const userMessage = {
    role: "user",
    content: `Input:
${stableStringify(args.input)}

Respond with ONLY a single JSON object \u2014 the verdict. No prose, no code fences.`
  };
  return [systemMessage, userMessage];
}
10683
/**
 * Default student-output parser: strip an optional code fence, trim, and
 * parse the remainder as JSON.
 *
 * @param {string} rawContent Raw model output.
 * @param {string} scenarioId Used only in error messages.
 * @returns {unknown} The parsed JSON value.
 * @throws {Error} When the output is empty after stripping, or is not valid
 *   JSON (the message includes up to 200 characters of the raw text).
 */
function defaultParseStudentLabel(rawContent, scenarioId) {
  const candidate = stripFence(rawContent).trim();
  if (candidate.length === 0) {
    throw new Error(`distillation student returned empty output for scenario '${scenarioId}'`);
  }

  try {
    return JSON.parse(candidate);
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    const preview = candidate.slice(0, 200);
    throw new Error(
      `distillation student returned non-JSON for scenario '${scenarioId}': ${reason} \u2014 raw: ${preview}`
    );
  }
}
10696
/**
 * Return the contents of the first triple-backtick fenced block (optionally
 * tagged `json`), or the text unchanged when no fence is found.
 *
 * @param {string} text
 * @returns {string}
 */
function stripFence(text) {
  const match = /```(?:json)?\s*([\s\S]*?)\s*```/.exec(text);
  if (!match) {
    return text;
  }
  return match[1] ?? text;
}
10700
/**
 * Pretty-print a value as JSON (2-space indent) using the key-sorting replacer.
 *
 * @param {unknown} value
 * @returns {string | undefined} Whatever `JSON.stringify` yields for the value.
 */
function stableStringify(value) {
  const sortKeys = replacerSortKeys();
  return JSON.stringify(value, sortKeys, 2);
}
10703
/**
 * Create a `JSON.stringify` replacer that emits plain-object keys in sorted
 * order. Arrays, primitives and `null` pass through unchanged.
 *
 * @returns {(key: string, value: unknown) => unknown}
 */
function replacerSortKeys() {
  return (_key, value) => {
    if (value === null || typeof value !== "object" || Array.isArray(value)) {
      return value;
    }
    // Object.fromEntries defines own data properties, so an own "__proto__"
    // key (as produced by JSON.parse) survives; assigning it with
    // `sorted[k] = …` would set the prototype and drop the key from output.
    const sortedEntries = Object.keys(value)
      .sort()
      .map((k) => [k, value[k]]);
    return Object.fromEntries(sortedEntries);
  };
}
10433
10715
  export {
10434
10716
  AGENT_PROFILE_KINDS,
10435
10717
  ANALYST_SEVERITIES,
@@ -10572,6 +10854,7 @@ export {
10572
10854
  bonferroni,
10573
10855
  bootstrapCi,
10574
10856
  buildAgentProfileCell,
10857
+ buildAgreementJudge,
10575
10858
  buildDriverSystemPrompt,
10576
10859
  buildReflectionPrompt,
10577
10860
  buildReviewerPrompt,
@@ -10646,8 +10929,10 @@ export {
10646
10929
  decideReferenceReplayRunPromotion,
10647
10930
  defaultIsMaterial,
10648
10931
  defaultJudges,
10932
+ defaultParseStudentLabel,
10649
10933
  defaultProviderRedactor,
10650
10934
  defaultReferenceReplayMatcher,
10935
+ defaultRenderStudentPrompt,
10651
10936
  defaultTraceInsightPanel,
10652
10937
  deployGateLayer,
10653
10938
  describeTraceInsightScope,
@@ -10678,6 +10963,7 @@ export {
10678
10963
  feedbackTrajectoriesToOptimizerRows,
10679
10964
  feedbackTrajectoryToDatasetScenario,
10680
10965
  feedbackTrajectoryToOptimizerRow,
10966
+ fieldAgreement,
10681
10967
  fileContains,
10682
10968
  fileExists,
10683
10969
  findAutoMatchNoExpectation,
@@ -10732,6 +11018,7 @@ export {
10732
11018
  linterJudge,
10733
11019
  llmSpanFromProvider,
10734
11020
  llmSpans,
11021
+ loadGoldScenarios,
10735
11022
  loadScorecard,
10736
11023
  loadScorerFromGrader,
10737
11024
  localCommandRunner,
@@ -10759,6 +11046,7 @@ export {
10759
11046
  parseCorrectnessResponse,
10760
11047
  parseFeedbackTrajectoriesJsonl,
10761
11048
  parseFindingSubject,
11049
+ parseGoldJsonl,
10762
11050
  parseRawFinding,
10763
11051
  parseReflectionResponse,
10764
11052
  parseRunRecordSafe,
@@ -10810,6 +11098,7 @@ export {
10810
11098
  runBehavioralCanaries,
10811
11099
  runCanaries,
10812
11100
  runCounterfactual,
11101
+ runDistillation,
10813
11102
  runE2EWorkflow,
10814
11103
  runEvalCampaign,
10815
11104
  runExpectations,
@@ -10844,6 +11133,7 @@ export {
10844
11133
  serializeFeedbackTrajectoriesJsonl,
10845
11134
  signManifest,
10846
11135
  soc2Report,
11136
+ splitGold,
10847
11137
  statusAdvanced,
10848
11138
  stopOnNoProgress,
10849
11139
  stopOnRepeatedAction,