@tangle-network/agent-eval 0.17.3 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -12692,6 +12692,274 @@ function samePopulation(a, b) {
12692
12692
  return b.every((id) => setA.has(id));
12693
12693
  }
12694
12694
 
12695
+ // src/multi-shot-optimization.ts
12696
/**
 * Runs a prompt-evolution search and, when a gate is configured, re-checks the
 * search winner against the baseline before promoting it.
 *
 * The baseline is the first seed variant. The gate is only evaluated when
 * `config.gate` is set and the search winner is not the baseline itself; if the
 * gate decision does not promote, the baseline (and its search aggregate) is
 * reported as the promoted variant instead.
 *
 * @param config Optimization settings; checked by `validateConfig` before any work starts.
 * @returns The evolution result, the search winner and its aggregate, the
 *   promoted variant and its aggregate, and the gate outcome (`null` when the
 *   gate was not evaluated).
 */
async function runMultiShotOptimization(config) {
  validateConfig(config);

  // Every trial scored during the search phase is tagged with the "search" split.
  const scoreAdapter = {
    score: ({ variant, scenarioId, rep }) => scoreOne(config, variant, scenarioId, rep, "search")
  };

  // Forward mutation requests to the caller's adapter, always passing the
  // top/bottom trial lists as explicit keys.
  const mutateAdapter = {
    mutate: (args) => {
      const { topTrials, bottomTrials } = args;
      return config.mutateAdapter.mutate({ ...args, topTrials, bottomTrials });
    }
  };

  const evolution = await runPromptEvolution({
    runId: config.runId,
    target: config.target,
    seedVariants: config.seedVariants,
    scenarioIds: config.searchScenarioIds,
    reps: config.reps,
    generations: config.generations,
    populationSize: config.populationSize,
    scoreConcurrency: config.scoreConcurrency ?? 1,
    scoreAdapter,
    mutateAdapter,
    objectives: config.objectives ?? defaultMultiShotObjectives(),
    scalarWeights: config.scalarWeights,
    earlyStopOnNoImprovement: config.earlyStopOnNoImprovement,
    cache: config.cache,
    onProgress: config.onProgress
  });

  const baseline = config.seedVariants[0];
  const winner = evolution.bestVariant;

  // Start from "promote the search winner"; the gate may override this below.
  const outcome = {
    evolution,
    searchBestVariant: winner,
    searchBestAggregate: evolution.bestAggregate,
    promotedVariant: winner,
    promotedAggregate: evolution.bestAggregate,
    gate: null
  };

  // Nothing to gate when no gate is configured or the baseline already won.
  if (!config.gate || winner.id === baseline.id) {
    return outcome;
  }

  outcome.gate = await evaluateMultiShotGate(config, baseline, winner);
  if (!outcome.gate.decision.promote) {
    outcome.promotedVariant = baseline;
    outcome.promotedAggregate = aggregateFor(evolution, baseline.id);
  }
  return outcome;
}
12744
/**
 * Builds the objectives used when the caller supplies none: maximize an
 * aggregate's `meanScore` and minimize its `meanCost`.
 *
 * @returns A new two-element array (score first, cost second) on every call.
 */
function defaultMultiShotObjectives() {
  const scoreObjective = {
    name: "score",
    direction: "maximize",
    value: (aggregate) => aggregate.meanScore
  };
  const costObjective = {
    name: "cost",
    direction: "minimize",
    value: (aggregate) => aggregate.meanCost
  };
  return [scoreObjective, costObjective];
}
12750
/**
 * Converts a multi-shot trial into a trial-trace record.
 *
 * The trace id is `<variantId>/<scenarioId>/r<rep>`. Each ASI item becomes one
 * expectation entry; items without an `expectationId` get the positional id
 * `asi-<index>`, and a missing `matched` flag is reported as `false`.
 *
 * @param trial Trial to convert (reads `variantId`, `scenarioId`, `rep`, `score`,
 *   `asi`, `emitted`, `trace` and `metrics`).
 * @returns An object with `id`, `score`, `inputName`, `expectations`, `emitted` and `metrics`.
 */
function trialTraceFromMultiShotTrial(trial) {
  const toExpectation = (item, index) => ({
    id: item.expectationId ?? `asi-${index}`,
    phrase: item.message,
    matched: item.matched ?? false
  });

  const id = `${trial.variantId}/${trial.scenarioId}/r${trial.rep}`;
  const findings = trial.asi ?? [];

  // Prefer the emitted text recorded on the trial; otherwise derive an excerpt
  // from its trace.
  let emitted = trial.emitted;
  if (emitted === null || emitted === undefined) {
    emitted = traceExcerpt(trial.trace);
  }

  return {
    id,
    score: trial.score,
    inputName: trial.scenarioId,
    expectations: findings.map(toExpectation),
    emitted,
    metrics: trial.metrics
  };
}
12764
/**
 * Scores the baseline and the candidate on the gate's search scenarios and on
 * the holdout scenarios, then asks a `HeldOutGate` for a decision.
 *
 * Search scenarios come from `gate.searchScenarioIds` when present, otherwise
 * from `config.searchScenarioIds`. The repetition count comes from `gate.reps`
 * when present, otherwise from `config.reps`. For every (scenario, rep) pair
 * the baseline is scored first and the candidate second, one after the other.
 *
 * @param config Optimization config; `config.gate` must be set.
 * @param baseline Variant the candidate is compared against.
 * @param candidate Variant being considered for promotion.
 * @returns The gate decision together with the validated run records of both variants.
 */
async function evaluateMultiShotGate(config, baseline, candidate) {
  const gateConfig = config.gate;
  const reps = gateConfig.reps ?? config.reps;
  const candidateRuns = [];
  const baselineRuns = [];

  // Scores both variants on every (scenario, rep) of one split and appends the
  // validated records to the shared run lists.
  const scoreSplit = async (scenarioIds, split) => {
    for (const scenarioId of scenarioIds) {
      for (let rep = 0; rep < reps; rep += 1) {
        const seed = seedFor(config, scenarioId, rep);
        const baselineTrial = await scoreOne(config, baseline, scenarioId, rep, split);
        const candidateTrial = await scoreOne(config, candidate, scenarioId, rep, split);
        baselineRuns.push(toValidatedRecord(config, baseline, scenarioId, rep, split, seed, baselineTrial));
        candidateRuns.push(toValidatedRecord(config, candidate, scenarioId, rep, split, seed, candidateTrial));
      }
    }
  };

  await scoreSplit(gateConfig.searchScenarioIds ?? config.searchScenarioIds, "search");
  await scoreSplit(gateConfig.holdoutScenarioIds, "holdout");

  const decision = new HeldOutGate(gateConfig.gate).evaluate(candidateRuns, baselineRuns);
  return { decision, candidateRuns, baselineRuns };
}
12791
/**
 * Runs one variant on one scenario repetition and scores the result.
 *
 * Never rejects because of a runner or scorer failure: any error thrown while
 * running or scoring is converted into a failed trial (`ok: false`, `score: 0`,
 * `metrics: { error: 1 }`) carrying a single critical ASI item attributed to
 * `config.target`.
 *
 * If the runner succeeded and only the scoring step failed, the failed trial
 * still reports the run's own `costUsd`, `durationMs` and `trace`, so work that
 * was actually performed is not recorded as free.
 *
 * @param config Optimization config (uses `runner`, `scorer`, `target` and the seed settings).
 * @param variant Variant under test; its `id` is copied onto the trial.
 * @param scenarioId Scenario to run.
 * @param rep Repetition index.
 * @param split Split label forwarded to the runner and scorer ("search" or "holdout").
 * @returns The trial record.
 */
async function scoreOne(config, variant, scenarioId, rep, split) {
  const seed = seedFor(config, scenarioId, rep);
  const input = { variant, scenarioId, rep, split, seed };
  // Declared outside the try block so the failure path can tell whether the
  // runner had already produced a result when the error occurred.
  let run;
  try {
    run = await config.runner.run(input);
    const scored = await config.scorer.score({ ...input, run });
    const asi = scored.asi ?? [];
    return {
      variantId: variant.id,
      scenarioId,
      rep,
      ok: scored.ok ?? true,
      score: clamp013(scored.score),
      // Scorer-reported figures win over the runner's; default to 0 when neither reports.
      cost: scored.costUsd ?? run.costUsd ?? 0,
      durationMs: scored.durationMs ?? run.durationMs ?? 0,
      metrics: {
        ...numericMetrics(scored.metrics),
        ...asiMetrics(asi)
      },
      split,
      seed,
      trace: run.trace,
      asi,
      emitted: scored.emitted ?? traceExcerpt(run.trace),
      metadata: scored.metadata
    };
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    return {
      variantId: variant.id,
      scenarioId,
      rep,
      ok: false,
      score: 0,
      // Keep what the run reported when only the scoring step failed.
      cost: run?.costUsd ?? 0,
      durationMs: run?.durationMs ?? 0,
      metrics: { error: 1 },
      error: message,
      split,
      seed,
      // Only add a trace key when a run exists, so runner failures keep their original shape.
      ...(run?.trace !== undefined ? { trace: run.trace } : {}),
      asi: [{
        severity: "critical",
        message,
        responsibleSurface: config.target
      }],
      emitted: ""
    };
  }
}
12839
/**
 * Builds a run record for one gate trial via the caller-supplied
 * `config.gate.toRunRecord` and passes it through `validateRunRecord`.
 *
 * @returns Whatever `validateRunRecord` returns for the mapped record.
 */
function toValidatedRecord(config, variant, scenarioId, rep, split, seed, trial) {
  const context = { variant, scenarioId, rep, split, seed, trial };
  return validateRunRecord(config.gate.toRunRecord(context));
}
12843
+ function validateConfig(config) {
12844
+ if (!config.runId.trim()) throw new Error("runMultiShotOptimization: runId must not be empty");
12845
+ if (!config.target.trim()) throw new Error("runMultiShotOptimization: target must not be empty");
12846
+ if (config.seedVariants.length === 0) {
12847
+ throw new Error("runMultiShotOptimization: seedVariants must not be empty");
12848
+ }
12849
+ if (config.searchScenarioIds.length === 0) {
12850
+ throw new Error("runMultiShotOptimization: searchScenarioIds must not be empty");
12851
+ }
12852
+ requirePositiveInteger(config.reps, "reps");
12853
+ requirePositiveInteger(config.generations, "generations");
12854
+ requirePositiveInteger(config.populationSize, "populationSize");
12855
+ if (config.scoreConcurrency !== void 0) requirePositiveInteger(config.scoreConcurrency, "scoreConcurrency");
12856
+ if (config.populationSize < config.seedVariants.length) {
12857
+ throw new Error("runMultiShotOptimization: populationSize must be >= seedVariants.length");
12858
+ }
12859
+ assertUnique(config.seedVariants.map((v) => v.id), "seedVariants.id");
12860
+ assertUnique(config.searchScenarioIds, "searchScenarioIds");
12861
+ if (config.gate) {
12862
+ if (config.gate.holdoutScenarioIds.length === 0) {
12863
+ throw new Error("runMultiShotOptimization: gate.holdoutScenarioIds must not be empty");
12864
+ }
12865
+ if (config.gate.reps !== void 0) requirePositiveInteger(config.gate.reps, "gate.reps");
12866
+ assertUnique(config.gate.holdoutScenarioIds, "gate.holdoutScenarioIds");
12867
+ if (config.gate.searchScenarioIds) assertUnique(config.gate.searchScenarioIds, "gate.searchScenarioIds");
12868
+ const searchIds = new Set(config.searchScenarioIds);
12869
+ for (const id of config.gate.holdoutScenarioIds) {
12870
+ if (searchIds.has(id)) {
12871
+ throw new Error(`runMultiShotOptimization: holdout scenario "${id}" also appears in searchScenarioIds`);
12872
+ }
12873
+ }
12874
+ const baselineId = config.seedVariants[0].id;
12875
+ if (config.gate.gate.baselineKey !== baselineId) {
12876
+ throw new Error(
12877
+ `runMultiShotOptimization: gate.gate.baselineKey must match first seed variant id "${baselineId}"`
12878
+ );
12879
+ }
12880
+ }
12881
+ }
12882
+ function requirePositiveInteger(value, name) {
12883
+ if (!Number.isInteger(value) || value <= 0) {
12884
+ throw new Error(`runMultiShotOptimization: ${name} must be a positive integer`);
12885
+ }
12886
+ }
12887
+ function assertUnique(values, name) {
12888
+ const seen = /* @__PURE__ */ new Set();
12889
+ for (const value of values) {
12890
+ if (!value.trim()) throw new Error(`runMultiShotOptimization: ${name} must not contain empty values`);
12891
+ if (seen.has(value)) throw new Error(`runMultiShotOptimization: duplicate ${name} "${value}"`);
12892
+ seen.add(value);
12893
+ }
12894
+ }
12895
+ function aggregateFor(evolution, variantId) {
12896
+ const final = evolution.generations[evolution.generations.length - 1];
12897
+ const aggregate2 = final?.aggregates.find((a) => a.variantId === variantId);
12898
+ if (!aggregate2) {
12899
+ throw new Error(`runMultiShotOptimization: missing aggregate for variant "${variantId}"`);
12900
+ }
12901
+ return aggregate2;
12902
+ }
12903
/**
 * Derives the deterministic seed for one (scenario, repetition) pair.
 *
 * The seed is `config.seedBase` (default 0) plus a 32-bit hash of
 * `<scenarioId>/r<rep>`, reduced modulo `Number.MAX_SAFE_INTEGER`. The `/r`
 * separator keeps pairs such as ("a1", 0) and ("a", 10) from hashing the same
 * text and therefore sharing a seed.
 *
 * @param config Optimization config; only `seedBase` is read.
 * @param scenarioId Scenario identifier.
 * @param rep Repetition index.
 * @returns The seed for this scenario repetition.
 */
function seedFor(config, scenarioId, rep) {
  const base = config.seedBase ?? 0;
  return (base + stableHash2(`${scenarioId}/r${rep}`)) % Number.MAX_SAFE_INTEGER;
}

/**
 * Hashes a string with 32-bit FNV-1a over its UTF-16 code units.
 *
 * @param input Text to hash.
 * @returns An unsigned 32-bit integer.
 */
function stableHash2(input) {
  let h = 2166136261; // FNV-1a 32-bit offset basis
  for (let i = 0; i < input.length; i++) {
    h ^= input.charCodeAt(i);
    h = Math.imul(h, 16777619); // FNV 32-bit prime; Math.imul keeps the product in 32 bits
  }
  return h >>> 0; // reinterpret the signed result as unsigned
}
12915
/**
 * Clamps a score into the closed range [0, 1].
 *
 * @param n Candidate score.
 * @returns 0 for anything that is not a finite number (NaN, ±Infinity,
 *   non-numbers), otherwise `n` limited to [0, 1].
 */
function clamp013(n) {
  if (!Number.isFinite(n)) {
    return 0;
  }
  // `<=` (rather than `<`) also maps -0 to +0.
  if (n <= 0) {
    return 0;
  }
  return n >= 1 ? 1 : n;
}
12919
/**
 * Copies only the finite-number entries of a metrics object.
 *
 * @param metrics Metrics object; `null`/`undefined` is treated as empty.
 * @returns A new object holding the entries whose value passes `Number.isFinite`.
 */
function numericMetrics(metrics) {
  const source = metrics ?? {};
  const finiteOnly = {};
  for (const key of Object.keys(source)) {
    const value = source[key];
    if (!Number.isFinite(value)) continue;
    finiteOnly[key] = value;
  }
  return finiteOnly;
}
12926
/**
 * Summarizes a list of ASI items as numeric counters.
 *
 * `asi` is the total number of items. `asi.<severity>` and `surface.<key>`
 * counters are tallied over the first 1000 items only; severities are passed
 * through `normalizeSeverity` and surface names through `metricKeySegment`.
 * Items with a falsy `responsibleSurface` add no surface counter.
 *
 * @param asi ASI items to summarize.
 * @returns An object of counters keyed as described above.
 */
function asiMetrics(asi) {
  const counters = { asi: asi.length };
  const bump = (key) => {
    counters[key] = (counters[key] ?? 0) + 1;
  };

  for (const item of asi.slice(0, 1e3)) {
    bump(`asi.${normalizeSeverity(item.severity)}`);
    if (item.responsibleSurface) {
      bump(`surface.${metricKeySegment(item.responsibleSurface)}`);
    }
  }
  return counters;
}
12938
/**
 * Maps an arbitrary severity value onto the four recognized levels.
 *
 * @param severity Value to normalize.
 * @returns `severity` itself when it is exactly "info", "warning", "error" or
 *   "critical"; otherwise "error".
 */
function normalizeSeverity(severity) {
  switch (severity) {
    case "info":
    case "warning":
    case "error":
    case "critical":
      return severity;
    default:
      return "error";
  }
}
12944
/**
 * Turns free text into a segment usable inside a metric key.
 *
 * Surrounding whitespace is trimmed, every run of characters other than ASCII
 * letters, digits, `.`, `_` and `-` becomes a single `_`, and the result is cut
 * to at most 80 characters.
 *
 * @param raw Text to sanitize.
 * @returns The sanitized segment, or "unknown" when nothing is left.
 */
function metricKeySegment(raw) {
  const trimmed = raw.trim();
  const sanitized = trimmed.replace(/[^a-zA-Z0-9._-]+/g, "_");
  const clipped = sanitized.slice(0, 80);
  return clipped === "" ? "unknown" : clipped;
}
12947
+ function traceExcerpt(trace) {
12948
+ if (!trace) return void 0;
12949
+ if (typeof trace.output === "string") return trace.output;
12950
+ if (trace.transcript) return trace.transcript;
12951
+ if (trace.turns) {
12952
+ try {
12953
+ const clipped = trace.turns.slice(0, 20);
12954
+ const suffix = trace.turns.length > clipped.length ? ` ... ${trace.turns.length - clipped.length} more turn(s)` : "";
12955
+ return `${JSON.stringify(clipped).slice(0, 2e3)}${suffix}`;
12956
+ } catch {
12957
+ return "[unserializable trace turns]";
12958
+ }
12959
+ }
12960
+ return void 0;
12961
+ }
12962
+
12695
12963
  // src/jsonl-trial-cache.ts
12696
12964
  import { appendFileSync as appendFileSync4, existsSync as existsSync6, mkdirSync as mkdirSync4, readFileSync as readFileSync5 } from "fs";
12697
12965
  import { dirname as dirname4 } from "path";
@@ -13811,6 +14079,7 @@ export {
13811
14079
  decideReferenceReplayPromotion,
13812
14080
  decideReferenceReplayRunPromotion,
13813
14081
  defaultJudges,
14082
+ defaultMultiShotObjectives,
13814
14083
  defaultReferenceReplayMatcher,
13815
14084
  deployGateLayer,
13816
14085
  distillPlaybook,
@@ -13948,6 +14217,7 @@ export {
13948
14217
  runJudgeFleet,
13949
14218
  runKeywordCoverageJudge,
13950
14219
  runKeywordCoverageJudgeUrl,
14220
+ runMultiShotOptimization,
13951
14221
  runPromptEvolution,
13952
14222
  runProposeReview,
13953
14223
  runProposeReviewAsControlLoop,
@@ -13991,6 +14261,7 @@ export {
13991
14261
  toolSpans,
13992
14262
  toolSuccessRubric,
13993
14263
  toolWasteView,
14264
+ trialTraceFromMultiShotTrial,
13994
14265
  typoMutator,
13995
14266
  urlContains,
13996
14267
  validateRunRecord,