@tangle-network/agent-eval 0.11.1 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,3 +1,12 @@
1
+ import {
2
+ LlmCallError,
3
+ LlmClient,
4
+ callLlm,
5
+ callLlmJson,
6
+ probeLlm,
7
+ stripFencedJson
8
+ } from "./chunk-ITN4YOZY.js";
9
+
1
10
  // src/client.ts
2
11
  var ProductClient = class {
3
12
  baseUrl;
@@ -410,7 +419,7 @@ function confidenceInterval(scores, confidence = 0.95) {
410
419
  if (scores.length === 0) return { mean: 0, lower: 0, upper: 0 };
411
420
  if (scores.length === 1) return { mean: scores[0], lower: scores[0], upper: scores[0] };
412
421
  const n = scores.length;
413
- const mean5 = scores.reduce((a, b) => a + b, 0) / n;
422
+ const mean7 = scores.reduce((a, b) => a + b, 0) / n;
414
423
  const B = 1e3;
415
424
  const bootstrapMeans = [];
416
425
  for (let i = 0; i < B; i++) {
@@ -425,7 +434,7 @@ function confidenceInterval(scores, confidence = 0.95) {
425
434
  const lowerIdx = Math.floor(alpha / 2 * B);
426
435
  const upperIdx = Math.floor((1 - alpha / 2) * B) - 1;
427
436
  return {
428
- mean: mean5,
437
+ mean: mean7,
429
438
  lower: bootstrapMeans[lowerIdx],
430
439
  upper: bootstrapMeans[Math.min(upperIdx, B - 1)]
431
440
  };
@@ -513,11 +522,11 @@ function pairedTTest(before, after) {
513
522
  const n = before.length;
514
523
  if (n < 2) return { t: 0, df: 0, p: 1 };
515
524
  const diffs = before.map((b, i) => after[i] - b);
516
- const mean5 = diffs.reduce((a, b) => a + b, 0) / n;
517
- const variance2 = diffs.reduce((acc, d) => acc + (d - mean5) ** 2, 0) / (n - 1);
525
+ const mean7 = diffs.reduce((a, b) => a + b, 0) / n;
526
+ const variance2 = diffs.reduce((acc, d) => acc + (d - mean7) ** 2, 0) / (n - 1);
518
527
  const se = Math.sqrt(variance2 / n);
519
- if (se === 0) return { t: mean5 === 0 ? 0 : Infinity, df: n - 1, p: mean5 === 0 ? 1 : 0 };
520
- const t = mean5 / se;
528
+ if (se === 0) return { t: mean7 === 0 ? 0 : Infinity, df: n - 1, p: mean7 === 0 ? 1 : 0 };
529
+ const t = mean7 / se;
521
530
  const df = n - 1;
522
531
  const p = 2 * (1 - studentTCdf(Math.abs(t), df));
523
532
  return { t, df, p };
@@ -541,9 +550,9 @@ function wilcoxonSignedRank(before, after) {
541
550
  }
542
551
  let wPlus = 0;
543
552
  for (let k = 0; k < n; k++) if (diffs[k] > 0) wPlus += ranks3[k];
544
- const mean5 = n * (n + 1) / 4;
553
+ const mean7 = n * (n + 1) / 4;
545
554
  const variance2 = n * (n + 1) * (2 * n + 1) / 24;
546
- const z = (wPlus - mean5) / Math.sqrt(variance2);
555
+ const z = (wPlus - mean7) / Math.sqrt(variance2);
547
556
  const p = 2 * (1 - normalCdf(Math.abs(z)));
548
557
  return { w: wPlus, p };
549
558
  }
@@ -1926,6 +1935,244 @@ function rand(bytes) {
1926
1935
  return Array.from(arr).map((b) => b.toString(16).padStart(2, "0")).join("");
1927
1936
  }
1928
1937
 
1938
+ // src/experiment-tracker-fs.ts
1939
/**
 * Experiment store that persists records as newline-delimited JSON files in
 * `options.dir` and serves reads from an in-memory index rebuilt from those
 * files on first use.
 *
 * Files are named `experiments.ndjson` / `runs.ndjson`; once the active file
 * reaches `maxBytes` it is renamed to `<name>.<Date.now()>.ndjson` and a new
 * active file is started.
 */
var FileSystemExperimentStore = class {
  dir;
  maxBytes;
  index;
  loaded = false;
  /** In-flight load, shared so concurrent first callers end up with ONE index. */
  loading = null;
  /**
   * @param {{ dir: string, maxBytes?: number }} options - target directory and
   *   rotation threshold in bytes (defaults to 32 MiB).
   */
  constructor(options) {
    this.dir = options.dir;
    this.maxBytes = options.maxBytes ?? 32 * 1024 * 1024;
  }
  /** Records the experiment in the index, then appends it to `experiments.ndjson`. */
  async saveExperiment(exp) {
    const idx = await this.load();
    await idx.saveExperiment(exp);
    await this.append("experiments", exp);
  }
  async getExperiment(id) {
    const idx = await this.load();
    return idx.getExperiment(id);
  }
  async listExperiments() {
    const idx = await this.load();
    return idx.listExperiments();
  }
  /** Records the run in the index, then appends it to `runs.ndjson`. */
  async saveRun(run) {
    const idx = await this.load();
    await idx.saveRun(run);
    await this.append("runs", run);
  }
  async getRun(id) {
    const idx = await this.load();
    return idx.getRun(id);
  }
  async listRuns(experimentId) {
    const idx = await this.load();
    return idx.listRuns(experimentId);
  }
  /** Creates the storage directory (and parents) if it does not exist yet. */
  async ensureDir() {
    const fs = await import("fs/promises");
    await fs.mkdir(this.dir, { recursive: true });
  }
  /**
   * Appends one JSON line to `<name>.ndjson`, rolling the active file over
   * first when it has reached `maxBytes`.
   */
  async append(name, record) {
    await this.ensureDir();
    const fs = await import("fs/promises");
    const path = await import("path");
    const active = path.join(this.dir, `${name}.ndjson`);
    try {
      const stat = await fs.stat(active);
      if (stat.size >= this.maxBytes) {
        const rolled = path.join(this.dir, `${name}.${Date.now()}.ndjson`);
        await fs.rename(active, rolled);
      }
    } catch {
      // Rotation is best-effort: the active file may simply not exist yet, and
      // a failed rename must not prevent the record from being written.
    }
    await fs.appendFile(active, JSON.stringify(record) + "\n", "utf8");
  }
  /**
   * Returns the in-memory index, replaying the on-disk files on first use.
   * Concurrent callers share a single replay; previously each caller built its
   * own index and all but the last one assigned were silently discarded.
   */
  async load() {
    if (this.loaded && this.index) return this.index;
    if (!this.loading) {
      this.loading = this.replay().then((store) => {
        this.index = store;
        this.loaded = true;
        return store;
      }).finally(() => {
        // Cleared on success and failure so a failed load can be retried.
        this.loading = null;
      });
    }
    return this.loading;
  }
  /**
   * Builds a fresh index from every `*.ndjson` file in `dir`, in name order.
   * A missing directory or file is treated as "nothing stored yet"; any other
   * I/O error is rethrown instead of producing a silently empty index.
   */
  async replay() {
    const fs = await import("fs/promises");
    const path = await import("path");
    const store = new InMemoryExperimentStore();
    let entries;
    try {
      entries = await fs.readdir(this.dir);
    } catch (err) {
      if (err?.code === "ENOENT") return store;
      throw err;
    }
    const files = entries.filter((f) => f.endsWith(".ndjson")).sort((a, b) => a.localeCompare(b));
    for (const file of files) {
      const base = file.split(".")[0];
      if (base !== "experiments" && base !== "runs") continue;
      let content;
      try {
        content = await fs.readFile(path.join(this.dir, file), "utf8");
      } catch (err) {
        // The file can disappear between readdir and readFile (e.g. rotation).
        if (err?.code === "ENOENT") continue;
        throw err;
      }
      for (const line of content.split("\n")) {
        if (!line.trim()) continue;
        try {
          const record = JSON.parse(line);
          if (base === "experiments") {
            await store.saveExperiment(record);
          } else {
            await store.saveRun(record);
          }
        } catch {
          // Skip a truncated or malformed record and keep replaying the rest.
          continue;
        }
      }
    }
    return store;
  }
};
2027
+
2028
+ // src/experiment-tracker-d1.ts
2029
var SCHEMA_VERSION = 1;
/**
 * Experiment store that keeps experiments and runs in SQL tables, reached
 * through a database handle exposing `exec()` and `prepare().bind()`.
 * (Presumably a Cloudflare D1 binding, going by the source file name — confirm.)
 *
 * Table names are `<tablePrefix>experiments`, `<tablePrefix>runs` and
 * `<tablePrefix>meta`; the prefix defaults to `agent_eval_`.
 */
var D1ExperimentStore = class {
  db;
  experimentsTable;
  runsTable;
  metaTable;
  schemaReady = false;
  constructor(options) {
    const tablePrefix = options.tablePrefix ?? "agent_eval_";
    this.db = options.db;
    this.experimentsTable = `${tablePrefix}experiments`;
    this.runsTable = `${tablePrefix}runs`;
    this.metaTable = `${tablePrefix}meta`;
  }
  /**
   * Idempotent schema setup. Safe to call before every operation; the second
   * call short-circuits via `schemaReady`. Most consumers will call it once
   * during Worker bootstrap.
   */
  async ensureSchema() {
    if (this.schemaReady) {
      return;
    }
    const { experimentsTable, runsTable, metaTable } = this;
    const ddl = `
      CREATE TABLE IF NOT EXISTS ${experimentsTable} (
        id TEXT PRIMARY KEY,
        name TEXT NOT NULL,
        created_at TEXT NOT NULL,
        metadata_json TEXT
      );
      CREATE TABLE IF NOT EXISTS ${runsTable} (
        id TEXT PRIMARY KEY,
        experiment_id TEXT NOT NULL,
        name TEXT,
        status TEXT NOT NULL,
        started_at TEXT NOT NULL,
        completed_at TEXT,
        config_json TEXT NOT NULL,
        report_json TEXT,
        error TEXT
      );
      CREATE INDEX IF NOT EXISTS idx_${runsTable}_experiment ON ${runsTable}(experiment_id);
      CREATE INDEX IF NOT EXISTS idx_${runsTable}_started ON ${runsTable}(started_at);
      CREATE TABLE IF NOT EXISTS ${metaTable} (
        key TEXT PRIMARY KEY,
        value TEXT NOT NULL
      );
      INSERT OR REPLACE INTO ${metaTable}(key, value) VALUES ('schema_version', '${SCHEMA_VERSION}');
    `;
    // The whole script is sent as one whitespace-collapsed line.
    const singleLine = ddl.trim().replace(/\s+/g, " ");
    await this.db.exec(singleLine);
    this.schemaReady = true;
  }
  /** Inserts the experiment, or overwrites the row with the same id. */
  async saveExperiment(exp) {
    await this.ensureSchema();
    const statement = this.db.prepare(
      `INSERT INTO ${this.experimentsTable}(id, name, created_at, metadata_json)
       VALUES (?1, ?2, ?3, ?4)
       ON CONFLICT(id) DO UPDATE SET
         name = excluded.name,
         created_at = excluded.created_at,
         metadata_json = excluded.metadata_json`
    );
    const metadataJson = exp.metadata ? JSON.stringify(exp.metadata) : null;
    await statement.bind(exp.id, exp.name, exp.createdAt, metadataJson).run();
  }
  /** Returns the experiment with the given id, or `null` when no row matches. */
  async getExperiment(id) {
    await this.ensureSchema();
    const statement = this.db.prepare(
      `SELECT id, name, created_at, metadata_json
       FROM ${this.experimentsTable}
       WHERE id = ?1`
    );
    const row = await statement.bind(id).first();
    if (!row) {
      return null;
    }
    return rowToExperiment(row);
  }
  /** Lists all experiments, newest `created_at` first. */
  async listExperiments() {
    await this.ensureSchema();
    const statement = this.db.prepare(
      `SELECT id, name, created_at, metadata_json
       FROM ${this.experimentsTable}
       ORDER BY created_at DESC`
    );
    const { results } = await statement.all();
    return results.map(rowToExperiment);
  }
  /** Inserts the run, or overwrites the row with the same id. */
  async saveRun(run) {
    await this.ensureSchema();
    const statement = this.db.prepare(
      `INSERT INTO ${this.runsTable}(id, experiment_id, name, status, started_at, completed_at, config_json, report_json, error)
       VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9)
       ON CONFLICT(id) DO UPDATE SET
         experiment_id = excluded.experiment_id,
         name = excluded.name,
         status = excluded.status,
         started_at = excluded.started_at,
         completed_at = excluded.completed_at,
         config_json = excluded.config_json,
         report_json = excluded.report_json,
         error = excluded.error`
    );
    // Optional fields are stored as SQL NULL when absent.
    const params = [
      run.id,
      run.experimentId,
      run.name ?? null,
      run.status,
      run.startedAt,
      run.completedAt ?? null,
      JSON.stringify(run.config),
      run.report ? JSON.stringify(run.report) : null,
      run.error ?? null
    ];
    await statement.bind(...params).run();
  }
  /** Returns the run with the given id, or `null` when no row matches. */
  async getRun(id) {
    await this.ensureSchema();
    const statement = this.db.prepare(
      `SELECT id, experiment_id, name, status, started_at, completed_at, config_json, report_json, error
       FROM ${this.runsTable}
       WHERE id = ?1`
    );
    const row = await statement.bind(id).first();
    if (!row) {
      return null;
    }
    return rowToRun(row);
  }
  /** Lists the runs of one experiment, newest `started_at` first. */
  async listRuns(experimentId) {
    await this.ensureSchema();
    const statement = this.db.prepare(
      `SELECT id, experiment_id, name, status, started_at, completed_at, config_json, report_json, error
       FROM ${this.runsTable}
       WHERE experiment_id = ?1
       ORDER BY started_at DESC`
    );
    const { results } = await statement.bind(experimentId).all();
    return results.map(rowToRun);
  }
};
2154
/**
 * Converts an experiments-table row into an experiment record.
 * `metadata` is only present when `metadata_json` is truthy, in which case it
 * is parsed from JSON.
 */
function rowToExperiment(row) {
  const experiment = {
    id: row.id,
    name: row.name,
    createdAt: row.created_at
  };
  if (row.metadata_json) {
    experiment.metadata = JSON.parse(row.metadata_json);
  }
  return experiment;
}
2162
/**
 * Converts a runs-table row into a run record.
 * `config` is always parsed from `config_json`; `name`, `completedAt`,
 * `report` and `error` are only included when their column is truthy.
 */
function rowToRun(row) {
  // Properties are added in a fixed sequence so key order matches the row layout.
  const run = { id: row.id, experimentId: row.experiment_id };
  if (row.name) {
    run.name = row.name;
  }
  run.status = row.status;
  run.startedAt = row.started_at;
  if (row.completed_at) {
    run.completedAt = row.completed_at;
  }
  run.config = JSON.parse(row.config_json);
  if (row.report_json) {
    run.report = JSON.parse(row.report_json);
  }
  if (row.error) {
    run.error = row.error;
  }
  return run;
}
2175
+
1929
2176
  // src/power-analysis.ts
1930
2177
  function requiredSampleSize(opts) {
1931
2178
  const effect = opts.effect;
@@ -2486,6 +2733,56 @@ function paretoFrontier(candidates, objectives) {
2486
2733
  }));
2487
2734
  return { frontier, dominated, dominanceMap };
2488
2735
  }
2736
/**
 * Collapses several objectives into one weighted score per candidate.
 *
 * Each objective is min-max normalised over the finite values in
 * `candidates`, flipped for non-"maximize" objectives, and combined using
 * weights normalised to sum to 1.
 *
 * @param {Array} candidates - items to score.
 * @param {Array<{name: string, direction: string, value: Function}>} objectives
 * @param {{weights?: Object<string, number>}} [options] - per-objective
 *   weights keyed by objective name; a missing weight counts as 1.
 * @returns {Array<{candidate: *, score: number}>} one entry per candidate, in
 *   input order. Non-finite objective values contribute nothing to the score.
 */
function scalarScore(candidates, objectives, options = {}) {
  if (candidates.length === 0) return [];
  const weights = options.weights ?? {};
  const weightOf = (obj) => weights[obj.name] ?? 1;
  const totalWeight = objectives.reduce((sum, obj) => sum + weightOf(obj), 0);
  // Evaluate every objective exactly once per candidate and reuse the values.
  const columns = objectives.map((obj) => candidates.map((c) => obj.value(c)));
  const ranges = columns.map((values) => {
    // Plain loop instead of Math.min(...values): spreading a very large
    // population can exceed the engine's argument limit.
    let min = Infinity;
    let max = -Infinity;
    for (const v of values) {
      if (!Number.isFinite(v)) continue;
      if (v < min) min = v;
      if (v > max) max = v;
    }
    if (min === Infinity) return { min: 0, max: 1 };
    // A constant objective gets a unit range so normalisation never divides by 0.
    return { min, max: max === min ? min + 1 : max };
  });
  return candidates.map((candidate, row) => {
    let score = 0;
    objectives.forEach((obj, col) => {
      const v = columns[col][row];
      if (!Number.isFinite(v)) return;
      const { min, max } = ranges[col];
      const normalised = (v - min) / (max - min);
      const directional = obj.direction === "maximize" ? normalised : 1 - normalised;
      // With a zero weight sum the objective contributes nothing, rather than
      // poisoning every score with 0/0 = NaN.
      const weight = totalWeight === 0 ? 0 : weightOf(obj) / totalWeight;
      score += directional * weight;
    });
    return { candidate, score };
  });
}
2761
/**
 * Computes a crowding distance for each candidate: per objective, candidates
 * are sorted by value, the two extremes get `Infinity`, and every interior
 * candidate accumulates the gap between its neighbours divided by the
 * objective's value range (1 when the range is 0).
 *
 * @param {Array} candidates - items to measure; may be empty.
 * @param {Array<{value: Function}>} objectives
 * @returns {Array<{candidate: *, distance: number}>} one entry per candidate,
 *   in input order.
 */
function crowdingDistance(candidates, objectives) {
  // Nothing to measure; without this guard the boundary lookups below would
  // call obj.value(undefined).
  if (candidates.length === 0) return [];
  const distances = new Map(candidates.map((c) => [c, 0]));
  const last = candidates.length - 1;
  for (const obj of objectives) {
    // Evaluate the objective once per candidate instead of inside the comparator.
    const ordered = candidates.map((candidate) => ({ candidate, value: obj.value(candidate) })).sort((a, b) => a.value - b.value);
    const range = ordered[last].value - ordered[0].value || 1;
    distances.set(ordered[0].candidate, Infinity);
    distances.set(ordered[last].candidate, Infinity);
    for (let i = 1; i < last; i++) {
      const current = distances.get(ordered[i].candidate);
      // Already a boundary point for some objective; Infinity stays Infinity.
      if (current === Infinity) continue;
      const gap = ordered[i + 1].value - ordered[i - 1].value;
      distances.set(ordered[i].candidate, current + gap / range);
    }
  }
  return candidates.map((c) => ({ candidate: c, distance: distances.get(c) ?? 0 }));
}
2780
/**
 * Returns the Pareto frontier of `candidates` as `{ candidate, distance }`
 * entries ordered by crowding distance, largest first. An empty frontier
 * yields an empty array.
 */
function paretoFrontierWithCrowding(candidates, objectives) {
  const nonDominated = paretoFrontier(candidates, objectives).frontier;
  if (nonDominated.length === 0) {
    return [];
  }
  const ranked = crowdingDistance(nonDominated, objectives);
  ranked.sort((left, right) => right.distance - left.distance);
  return ranked;
}
2489
2786
 
2490
2787
  // src/harness-optimizer.ts
2491
2788
  var DEFAULT_HARNESS_OBJECTIVES = [
@@ -5095,10 +5392,10 @@ function analyzeSeries(values, options = {}) {
5095
5392
  return { state: "insufficient-data", windowMean: 0, windowCv: 0, tailRun: 0, stable: false };
5096
5393
  }
5097
5394
  const tail = values.slice(-window);
5098
- const mean5 = tail.reduce((a, b) => a + b, 0) / tail.length;
5099
- const variance2 = tail.reduce((acc, v) => acc + (v - mean5) ** 2, 0) / tail.length;
5395
+ const mean7 = tail.reduce((a, b) => a + b, 0) / tail.length;
5396
+ const variance2 = tail.reduce((acc, v) => acc + (v - mean7) ** 2, 0) / tail.length;
5100
5397
  const stdDev = Math.sqrt(variance2);
5101
- const refMean = Math.abs(mean5) > 1e-9 ? Math.abs(mean5) : 1;
5398
+ const refMean = Math.abs(mean7) > 1e-9 ? Math.abs(mean7) : 1;
5102
5399
  const cv = stdDev / refMean;
5103
5400
  const stable = tail.length >= window && cv <= stableCv;
5104
5401
  let tailRun = 0;
@@ -5119,7 +5416,7 @@ function analyzeSeries(values, options = {}) {
5119
5416
  } else {
5120
5417
  state = "noisy";
5121
5418
  }
5122
- return { state, windowMean: mean5, windowCv: cv, tailRun, stable };
5419
+ return { state, windowMean: mean7, windowCv: cv, tailRun, stable };
5123
5420
  }
5124
5421
 
5125
5422
  // src/state-continuity.ts
@@ -6047,12 +6344,12 @@ async function paraphraseRobustness(prompt, mutators, scoreFn, options = {}) {
6047
6344
  variantScores.push({ mutator: id, score, mutated });
6048
6345
  all.push(score);
6049
6346
  }
6050
- const mean5 = all.reduce((a, b) => a + b, 0) / all.length;
6051
- const variance2 = all.reduce((a, v) => a + (v - mean5) ** 2, 0) / all.length;
6347
+ const mean7 = all.reduce((a, b) => a + b, 0) / all.length;
6348
+ const variance2 = all.reduce((a, v) => a + (v - mean7) ** 2, 0) / all.length;
6052
6349
  const stdDev = Math.sqrt(variance2);
6053
- const ref = Math.abs(mean5) > 1e-9 ? Math.abs(mean5) : 1;
6350
+ const ref = Math.abs(mean7) > 1e-9 ? Math.abs(mean7) : 1;
6054
6351
  const robustness = Math.max(0, 1 - stdDev / ref);
6055
- return { originalScore, variantScores, meanScore: mean5, stdDev, robustness };
6352
+ return { originalScore, variantScores, meanScore: mean7, stdDev, robustness };
6056
6353
  }
6057
6354
  var lowercaseMutator = (p) => p.toLowerCase();
6058
6355
  var sentenceReorderMutator = (p, seed) => {
@@ -6973,8 +7270,8 @@ async function prmBestOfN(store, grader, runIds) {
6973
7270
  if (runIds.length === 0) throw new Error("prmBestOfN: at least 1 candidate required");
6974
7271
  const graded = await Promise.all(runIds.map((id) => grader.grade(store, id)));
6975
7272
  const ranked = [...graded].sort((a, b) => b.aggregateScore - a.aggregateScore);
6976
- const mean5 = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length;
6977
- const variance2 = graded.reduce((a, g) => a + (g.aggregateScore - mean5) ** 2, 0) / graded.length;
7273
+ const mean7 = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length;
7274
+ const variance2 = graded.reduce((a, g) => a + (g.aggregateScore - mean7) ** 2, 0) / graded.length;
6978
7275
  return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
6979
7276
  }
6980
7277
  async function prmEnsembleBestOfN(store, graders, runIds) {
@@ -6996,8 +7293,8 @@ async function prmEnsembleBestOfN(store, graders, runIds) {
6996
7293
  const ranked = [...byRun.values()].sort(
6997
7294
  (a, b) => (bordaScores.get(b.runId) ?? 0) - (bordaScores.get(a.runId) ?? 0)
6998
7295
  );
6999
- const mean5 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
7000
- const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean5) ** 2, 0) / ranked.length;
7296
+ const mean7 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
7297
+ const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean7) ** 2, 0) / ranked.length;
7001
7298
  return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
7002
7299
  }
7003
7300
 
@@ -7527,8 +7824,8 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
7527
7824
  const sRuns = runs.filter((r) => r.scenarioId === s.id);
7528
7825
  const scores = sRuns.map((r) => r.outcome?.score).filter((x) => typeof x === "number");
7529
7826
  if (scores.length < 3) continue;
7530
- const mean5 = scores.reduce((a, b) => a + b, 0) / scores.length;
7531
- const variance2 = scores.reduce((a, b) => a + (b - mean5) ** 2, 0) / scores.length;
7827
+ const mean7 = scores.reduce((a, b) => a + b, 0) / scores.length;
7828
+ const variance2 = scores.reduce((a, b) => a + (b - mean7) ** 2, 0) / scores.length;
7532
7829
  if (variance2 > varianceThreshold) {
7533
7830
  targets.push({
7534
7831
  reason: "high-variance",
@@ -8008,212 +8305,6 @@ async function euAiActReport(ctx, signals) {
8008
8305
  };
8009
8306
  }
8010
8307
 
8011
- // src/llm-client.ts
8012
/**
 * Error raised for a failed LLM HTTP call. Carries the HTTP status, the raw
 * response body and the requested model alongside the message.
 */
var LlmCallError = class extends Error {
  status;
  body;
  model;
  constructor(message, status, body, model) {
    super(message);
    Object.assign(this, { status, body, model });
    this.name = "LlmCallError";
  }
};
8024
- var DEFAULT_BASE_URL = "https://router.tangle.tools/v1";
8025
- var DEFAULT_TIMEOUT_MS = 6e4;
8026
- var DEFAULT_MAX_RETRIES = 3;
8027
- var RETRYABLE_STATUS = /* @__PURE__ */ new Set([429, 502, 503, 504]);
8028
/**
 * Decides whether a failed attempt is worth retrying: an `LlmCallError` is
 * retryable when its status is in `RETRYABLE_STATUS`; any other `Error` is
 * retryable when it is an abort/timeout or its message matches a known
 * transient network failure. Non-Error values are never retryable.
 */
function isRetryableError(err) {
  if (err instanceof LlmCallError) {
    return RETRYABLE_STATUS.has(err.status);
  }
  if (!(err instanceof Error)) {
    return false;
  }
  const abortedOrTimedOut = err.name === "AbortError" || err.name === "TimeoutError";
  return abortedOrTimedOut || /fetch failed|ECONNRESET|ETIMEDOUT|EAI_AGAIN/i.test(err.message);
}
8035
/**
 * Reads the `retry-after` header and converts it to a delay in milliseconds.
 * A positive number is taken as seconds; otherwise the value is parsed as a
 * date and the delay is the time remaining until then (never negative).
 * Returns `null` when the header is absent or unparseable.
 */
function parseRetryAfter(headers) {
  const raw = headers.get("retry-after");
  if (!raw) {
    return null;
  }
  const seconds = Number(raw);
  if (Number.isFinite(seconds) && seconds > 0) {
    return seconds * 1e3;
  }
  const when = Date.parse(raw);
  return Number.isFinite(when) ? Math.max(0, when - Date.now()) : null;
}
8044
/** Exponential backoff delay: 500 ms doubled per attempt, capped at 16 s. */
function backoffMs(attempt) {
  const uncapped = 500 * 2 ** attempt;
  return Math.min(uncapped, 16e3);
}
8047
/**
 * Builds the request headers: JSON content negotiation plus authentication.
 * A custom `authHeader` ({ name, value }) takes precedence; otherwise a
 * `Bearer` Authorization header is added when `bearer` or `apiKey` is set.
 */
function buildHeaders(opts) {
  const headers = { "Content-Type": "application/json", Accept: "application/json" };
  const { authHeader } = opts;
  if (authHeader) {
    headers[authHeader.name] = authHeader.value;
    return headers;
  }
  if (opts.bearer || opts.apiKey) {
    const token = opts.bearer ?? opts.apiKey;
    headers.Authorization = `Bearer ${token}`;
  }
  return headers;
}
8059
/**
 * True when a 400 response body mentions one of the phrases this client treats
 * as "structured output was rejected" (case-insensitive substring match).
 */
function isSchemaRejection(status, body) {
  if (status !== 400) {
    return false;
  }
  const haystack = body.toLowerCase();
  const markers = ["response_format", "json_schema", "is unavailable", "not supported"];
  return markers.some((marker) => haystack.includes(marker));
}
8064
/**
 * Builds the chat-completions request payload from `req`.
 * Temperature defaults to 0. A `jsonSchema` produces a strict `json_schema`
 * response format unless `forceJsonObject` is set, in which case (or when only
 * `jsonMode` is requested) the looser `json_object` format is used.
 */
function buildBody(req, forceJsonObject) {
  const payload = {
    model: req.model,
    messages: req.messages,
    temperature: req.temperature ?? 0
  };
  if (req.maxTokens != null) {
    payload.max_tokens = req.maxTokens;
  }
  const { jsonSchema } = req;
  if (jsonSchema && !forceJsonObject) {
    const { name, schema } = jsonSchema;
    payload.response_format = { type: "json_schema", json_schema: { name, schema, strict: true } };
    return payload;
  }
  if (req.jsonMode || jsonSchema) {
    payload.response_format = { type: "json_object" };
  }
  return payload;
}
8081
/** Resolves (with no value) after roughly `ms` milliseconds. */
async function sleep(ms) {
  await new Promise((done) => {
    setTimeout(done, ms);
  });
}
8084
/**
 * Removes a surrounding Markdown code fence (``` or ```json) from `raw` and
 * returns the trimmed inner text; input without such a fence is returned
 * trimmed but otherwise unchanged.
 */
function stripFencedJson(raw) {
  const text = raw.trim();
  const fenced = /^```(?:json)?\s*\n?([\s\S]*?)\n?```\s*$/.exec(text);
  if (fenced === null) {
    return text;
  }
  return fenced[1].trim();
}
8089
/**
 * Sends one chat-completions request, retrying transient failures.
 *
 * @param {object} req - request; `model` and `messages` are forwarded via
 *   `buildBody`, `timeoutMs` overrides the per-attempt timeout.
 * @param {object} [opts] - transport options: `baseUrl`, `defaultTimeoutMs`,
 *   `maxRetries` (total number of attempts), `fetch`, plus the auth fields
 *   read by `buildHeaders`.
 * @returns {Promise<object>} `{ content, usage, costUsd, model, durationMs, raw }`.
 * @throws {LlmCallError} for a non-OK response that is not retried.
 * @throws the underlying error for network/abort failures that are not retried.
 */
async function callLlm(req, opts = {}) {
  const baseUrl = (opts.baseUrl ?? DEFAULT_BASE_URL).replace(/\/+$/, "");
  const url = `${baseUrl}/chat/completions`;
  const timeoutMs = req.timeoutMs ?? opts.defaultTimeoutMs ?? DEFAULT_TIMEOUT_MS;
  const maxRetries = opts.maxRetries ?? DEFAULT_MAX_RETRIES;
  // Always make at least one attempt. With `maxRetries: 0` (or a negative/NaN
  // value) the loop used to be skipped entirely and `Error("undefined")` was
  // thrown without any request being sent.
  const maxAttempts = maxRetries >= 1 ? maxRetries : 1;
  const fetchFn = opts.fetch ?? globalThis.fetch;
  const headers = buildHeaders(opts);
  let lastErr;
  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    const controller = new AbortController();
    const timeoutHandle = setTimeout(() => controller.abort(), timeoutMs);
    const started = Date.now();
    try {
      const res = await fetchFn(url, {
        method: "POST",
        headers,
        body: JSON.stringify(buildBody(req, false)),
        signal: controller.signal
      });
      if (!res.ok) {
        const body = await res.text();
        // The timer is only cleared once the body has been read, so a stalled
        // response body is still bounded by the timeout.
        clearTimeout(timeoutHandle);
        const err = new LlmCallError(
          `LLM call ${res.status}: ${body.slice(0, 300)}`,
          res.status,
          body,
          req.model
        );
        if (RETRYABLE_STATUS.has(res.status) && attempt < maxAttempts - 1) {
          lastErr = err;
          // Prefer the server-provided delay over our own backoff schedule.
          const retryAfter = parseRetryAfter(res.headers);
          await sleep(retryAfter ?? backoffMs(attempt));
          continue;
        }
        throw err;
      }
      const json = await res.json();
      clearTimeout(timeoutHandle);
      const choice = json.choices?.[0];
      const usageRaw = json.usage ?? {};
      const costFromProxy = json._response_cost ?? json.cost_usd;
      return {
        content: choice?.message?.content ?? "",
        usage: {
          promptTokens: Number(usageRaw.prompt_tokens ?? 0),
          completionTokens: Number(usageRaw.completion_tokens ?? 0),
          totalTokens: Number(usageRaw.total_tokens ?? 0),
          cachedPromptTokens: usageRaw.prompt_tokens_details && typeof usageRaw.prompt_tokens_details === "object" ? Number(
            usageRaw.prompt_tokens_details.cached_tokens ?? 0
          ) : void 0
        },
        costUsd: typeof costFromProxy === "number" ? costFromProxy : null,
        model: json.model ?? req.model,
        durationMs: Date.now() - started,
        raw: json
      };
    } catch (err) {
      clearTimeout(timeoutHandle);
      lastErr = err;
      if (attempt < maxAttempts - 1 && isRetryableError(err)) {
        await sleep(backoffMs(attempt));
        continue;
      }
      throw err;
    }
  }
  throw lastErr instanceof Error ? lastErr : new Error(String(lastErr));
}
8156
/**
 * Calls the LLM and parses its reply as JSON.
 * JSON mode is enabled by default unless a `jsonSchema` is supplied. If the
 * provider rejects the schema request (see `isSchemaRejection`), the call is
 * repeated once in plain JSON mode without the schema.
 *
 * @returns {Promise<{value: *, result: object}>} parsed value plus raw call result.
 */
async function callLlmJson(req, opts = {}) {
  const parsed = (result) => ({ value: parseJsonSafely(result.content, result.model), result });
  try {
    const firstReq = { ...req, jsonMode: req.jsonMode ?? !req.jsonSchema };
    return parsed(await callLlm(firstReq, opts));
  } catch (err) {
    const schemaRejected = err instanceof LlmCallError && isSchemaRejection(err.status, err.body) && req.jsonSchema;
    if (!schemaRejected) {
      throw err;
    }
    // Degrade to schema-less JSON mode for providers without json_schema support.
    const degradedReq = { ...req, jsonMode: true, jsonSchema: void 0 };
    return parsed(await callLlm(degradedReq, opts));
  }
}
8171
/**
 * Parses LLM output as JSON after removing an optional Markdown code fence.
 *
 * @param {string} content - raw model output.
 * @param {string} model - model name, used only in the error message.
 * @returns {*} the parsed JSON value.
 * @throws {Error} when the content is not valid JSON; the message includes the
 *   parser message and the first 800 characters of the raw content, and the
 *   original parse error is attached as `cause`.
 */
function parseJsonSafely(content, model) {
  const stripped = stripFencedJson(content);
  try {
    return JSON.parse(stripped);
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    throw new Error(
      `LLM returned non-JSON content (model=${model}): ${reason}\n--- raw content ---\n${content.slice(0, 800)}`,
      // Keep the original SyntaxError reachable for debugging.
      { cause: err }
    );
  }
}
8183
/**
 * Sends a minimal "ping" request to `model` and reports whether it succeeded.
 * Never throws: failures are returned as `{ ok: false, error }`.
 *
 * @returns {Promise<{ok: boolean, latencyMs: number, error: string|null}>}
 */
async function probeLlm(model, opts = {}) {
  const startedAt = Date.now();
  const elapsed = () => Date.now() - startedAt;
  try {
    const ping = {
      model,
      messages: [{ role: "user", content: "ping" }],
      maxTokens: 64,
      timeoutMs: opts.timeoutMs ?? 3e4
    };
    await callLlm(ping, opts);
  } catch (err) {
    const error = err instanceof Error ? err.message : String(err);
    return { ok: false, latencyMs: elapsed(), error };
  }
  return { ok: true, latencyMs: elapsed(), error: null };
}
8204
/**
 * Thin wrapper that remembers default call options; per-call options are
 * layered on top of them, with per-call keys winning.
 */
var LlmClient = class {
  opts;
  constructor(opts = {}) {
    this.opts = opts;
  }
  /** Same as `callLlm`, using the client's defaults merged with `per`. */
  call(req, per) {
    const merged = { ...this.opts, ...per };
    return callLlm(req, merged);
  }
  /** Same as `callLlmJson`, using the client's defaults merged with `per`. */
  callJson(req, per) {
    const merged = { ...this.opts, ...per };
    return callLlmJson(req, merged);
  }
};
8216
-
8217
8308
  // src/multi-layer-verifier.ts
8218
8309
  function gradeSemanticStatus(input) {
8219
8310
  if (!input.available) return "error";
@@ -9771,7 +9862,7 @@ function buildScenarioScore(scenario, matches2, falsePositives) {
9771
9862
  const total = scenario.references.length;
9772
9863
  const matchedWeight = matches2.filter((match) => match.matched).reduce((sum2, match) => sum2 + match.weight, 0);
9773
9864
  const totalWeight = matches2.reduce((sum2, match) => sum2 + match.weight, 0);
9774
- const precision = ratio(matched, matched + falsePositives);
9865
+ const precision2 = ratio(matched, matched + falsePositives);
9775
9866
  const recall = ratio(matched, total);
9776
9867
  return {
9777
9868
  scenarioId: scenario.id,
@@ -9781,9 +9872,9 @@ function buildScenarioScore(scenario, matches2, falsePositives) {
9781
9872
  falsePositives,
9782
9873
  matchedWeight,
9783
9874
  totalWeight,
9784
- precision,
9875
+ precision: precision2,
9785
9876
  recall,
9786
- f1: f1(precision, recall),
9877
+ f1: f1(precision2, recall),
9787
9878
  matches: matches2
9788
9879
  };
9789
9880
  }
@@ -9801,7 +9892,7 @@ function aggregateScenarioScores(scores) {
9801
9892
  const falsePositives = sum(scores.map((score) => score.falsePositives));
9802
9893
  const matchedWeight = sum(scores.map((score) => score.matchedWeight));
9803
9894
  const totalWeight = sum(scores.map((score) => score.totalWeight));
9804
- const precision = ratio(matched, matched + falsePositives);
9895
+ const precision2 = ratio(matched, matched + falsePositives);
9805
9896
  const recall = ratio(matched, total);
9806
9897
  return {
9807
9898
  matched,
@@ -9809,9 +9900,9 @@ function aggregateScenarioScores(scores) {
9809
9900
  falsePositives,
9810
9901
  matchedWeight,
9811
9902
  totalWeight,
9812
- precision,
9903
+ precision: precision2,
9813
9904
  recall,
9814
- f1: f1(precision, recall),
9905
+ f1: f1(precision2, recall),
9815
9906
  weightedRecall: ratio(matchedWeight, totalWeight)
9816
9907
  };
9817
9908
  }
@@ -9831,8 +9922,8 @@ function emptyAggregate() {
9831
9922
  function hasSplit(score, split) {
9832
9923
  return score.bySplit[split] !== void 0;
9833
9924
  }
9834
/** Harmonic mean of precision and recall; 0 when both are 0. */
function f1(precision2, recall) {
  const denominator = precision2 + recall;
  if (denominator === 0) {
    return 0;
  }
  return 2 * precision2 * recall / denominator;
}
9837
9928
  function ratio(numerator, denominator) {
9838
9929
  return denominator > 0 ? numerator / denominator : 0;
@@ -9956,14 +10047,14 @@ function referenceReplayRunsToSteeringRows(runs, options = {}) {
9956
10047
  function referenceReplayScenarioToRunScore(scenarioScore, durationMs = 0) {
9957
10048
  const success = scenarioScore.f1;
9958
10049
  const recall = scenarioScore.recall;
9959
- const precision = scenarioScore.precision;
10050
+ const precision2 = scenarioScore.precision;
9960
10051
  const failed = scenarioScore.total > 0 && scenarioScore.matched === 0;
9961
10052
  return {
9962
10053
  success,
9963
10054
  goalProgress: recall,
9964
- repoGroundedness: precision,
9965
- driftPenalty: 1 - precision,
9966
- toolUseQuality: precision,
10055
+ repoGroundedness: precision2,
10056
+ driftPenalty: 1 - precision2,
10057
+ toolUseQuality: precision2,
9967
10058
  patchQuality: 0,
9968
10059
  testReality: scenarioScore.total > 0 ? 1 : 0,
9969
10060
  finalGate: success,
@@ -9972,10 +10063,569 @@ function referenceReplayScenarioToRunScore(scenarioScore, durationMs = 0) {
9972
10063
  wallSeconds: Math.max(0, durationMs / 1e3),
9973
10064
  notes: [
9974
10065
  `reference-replay matched ${scenarioScore.matched}/${scenarioScore.total}`,
9975
- `precision=${precision.toFixed(3)} recall=${recall.toFixed(3)} f1=${success.toFixed(3)}`
10066
+ `precision=${precision2.toFixed(3)} recall=${recall.toFixed(3)} f1=${success.toFixed(3)}`
9976
10067
  ]
9977
10068
  };
9978
10069
  }
10070
+
10071
+ // src/prompt-evolution.ts
10072
/**
 * Trial cache backed by a plain `Map` held on the public `store` property.
 */
var InMemoryTrialCache = class {
  constructor() {
    this.store = /* @__PURE__ */ new Map();
  }

  /** Returns the value stored under `key`, or `undefined` when absent. */
  get(key) {
    const { store } = this;
    return store.get(key);
  }

  /** Stores `value` under `key`, replacing any previous entry. */
  set(key, value) {
    const { store } = this;
    store.set(key, value);
  }

  /** Number of entries currently held. */
  size() {
    const { store } = this;
    return store.size;
  }

  /** Removes every entry. */
  clear() {
    const { store } = this;
    store.clear();
  }
};
10087
/**
 * Generational loop: score the population, aggregate, compute the Pareto
 * front and a scalar ranking, record a per-generation report, then breed the
 * next population.
 *
 * Stops after `config.generations` rounds, or earlier when the winner and the
 * Pareto front are unchanged from the previous generation (unless
 * `config.earlyStopOnNoImprovement === false`).
 *
 * @param {object} config - Reads `seedVariants`, `generations`, `scenarioIds`,
 *   `objectives`, `scalarWeights`, `runId`, `target`, `onProgress` and
 *   `earlyStopOnNoImprovement`; the whole object is forwarded to the scoring
 *   and breeding helpers.
 * @returns {Promise<object>} `{ runId, target, generations, bestVariant, bestAggregate }`.
 */
async function runPromptEvolution(config) {
  const notify = (event) => config.onProgress?.(event);
  const history = [];
  let population = [...config.seedVariants];
  let bestVariant = population[0];
  let bestAggregate = null;

  for (let generation = 0; generation < config.generations; generation += 1) {
    notify({ type: "generation-start", generation, populationSize: population.length });

    const trials = await scorePopulation(population, config, generation);
    const aggregates = aggregateTrials(population, config.scenarioIds, trials);
    const front = paretoFrontierWithCrowding(aggregates, config.objectives);
    const currentFrontIds = new Set(front.map((entry) => entry.candidate.variantId));

    // Rank by scalarised score, best first; the top entry is this generation's winner.
    const ranked = scalarScore(aggregates, config.objectives, { weights: config.scalarWeights });
    ranked.sort((left, right) => right.score - left.score);
    const winnerId = ranked[0]?.candidate.variantId ?? aggregates[0]?.variantId ?? population[0].id;

    const report = {
      runId: config.runId,
      target: config.target,
      generation,
      variants: population,
      aggregates,
      paretoFrontIds: front.map((entry) => entry.candidate.variantId),
      winnerId,
      trials
    };
    history.push(report);
    notify({ type: "generation-complete", report });

    const winnerAggregate = aggregates.find((agg) => agg.variantId === winnerId);
    if (winnerAggregate) {
      const winner = population.find((variant) => variant.id === winnerId);
      if (winner) bestVariant = winner;
      bestAggregate = winnerAggregate;
    }

    const mayStopEarly = config.earlyStopOnNoImprovement !== false && history.length >= 2;
    if (mayStopEarly) {
      const previous = history[history.length - 2];
      const unchanged = previous.winnerId === winnerId && samePopulation(previous.paretoFrontIds, [...currentFrontIds]);
      if (unchanged) {
        notify({ type: "converged", generation, reason: "no improvement vs previous generation" });
        break;
      }
    }

    // No point breeding after the final generation has been scored.
    if (generation === config.generations - 1) break;
    population = await nextPopulation(population, aggregates, trials, front, config, generation + 1);
  }

  return {
    runId: config.runId,
    target: config.target,
    generations: history,
    bestVariant,
    bestAggregate: bestAggregate ?? aggregateTrials(population, config.scenarioIds, []).find((agg) => agg.variantId === bestVariant.id)
  };
}
10138
/**
 * Scores every (variant, scenario, rep) combination, consulting
 * `config.cache` first and emitting a `trial-complete` progress event for
 * each trial.
 *
 * @param {Array<object>} population - Variants to score (each read for `id`).
 * @param {object} config - Reads `scenarioIds`, `reps`, `cache`,
 *   `scoreAdapter`, `onProgress` and `scoreConcurrency`.
 * @param {number} generation - Generation index echoed in progress events.
 * @returns {Promise<Array<object>>} Trial results, ordered variant → scenario → rep.
 */
async function scorePopulation(population, config, generation) {
  const makeJob = (variant, scenarioId, rep) => async () => {
    const report = (outcome, cached) => {
      config.onProgress?.({
        type: "trial-complete",
        generation,
        variantId: variant.id,
        scenarioId,
        rep,
        ok: outcome.ok,
        score: outcome.score,
        cached
      });
    };

    const cacheKey = `${variant.id}|${scenarioId}|${rep}`;
    const hit = config.cache?.get(cacheKey);
    if (hit) {
      report(hit, true);
      return hit;
    }

    const fresh = await config.scoreAdapter.score({ variant, scenarioId, rep });
    config.cache?.set(cacheKey, fresh);
    report(fresh, false);
    return fresh;
  };

  const jobs = [];
  for (const variant of population) {
    for (const scenarioId of config.scenarioIds) {
      for (let rep = 0; rep < config.reps; rep += 1) {
        jobs.push(makeJob(variant, scenarioId, rep));
      }
    }
  }
  return runWithConcurrency(jobs, config.scoreConcurrency);
}
10178
/**
 * Runs async thunks with a bounded number of concurrent workers and returns
 * their results in job order.
 *
 * The worker count is normalised so an unusable `concurrency` can no longer
 * skip the jobs: `undefined`/`NaN` fall back to 1 (previously
 * `Math.max(1, NaN)` produced zero workers and an array of holes), `Infinity`
 * means "one worker per job" (previously a RangeError), fractions are floored,
 * and the count never exceeds the number of jobs.
 *
 * @param {Array<() => Promise<unknown>>} jobs - Thunks to execute.
 * @param {number} [concurrency] - Desired maximum number of in-flight jobs.
 * @returns {Promise<unknown[]>} Results, index-aligned with `jobs`.
 */
async function runWithConcurrency(jobs, concurrency) {
  const results = new Array(jobs.length);

  const requested = Number(concurrency);
  let workerCount;
  if (requested === Infinity) workerCount = jobs.length;
  else if (Number.isFinite(requested)) workerCount = Math.max(1, Math.floor(requested));
  else workerCount = 1;
  workerCount = Math.min(workerCount, jobs.length);

  // Workers share one cursor; each claims the next unclaimed index.
  let next = 0;
  async function worker() {
    while (next < jobs.length) {
      const i = next++;
      results[i] = await jobs[i]();
    }
  }

  await Promise.all(Array.from({ length: workerCount }, () => worker()));
  return results;
}
10192
/**
 * Rolls raw trials up into per-scenario and per-variant summaries.
 *
 * Scenario-level means are taken over successful (`ok`) trials only;
 * `okRate` is successes divided by all trials for that scenario (0 when there
 * are none). Variant-level figures are the mean of the scenario-level ones.
 *
 * @param {Array<object>} population - Variants (each read for `id`).
 * @param {Array<string>} scenarioIds - Scenarios to summarise for every variant.
 * @param {Array<object>} trials - Trial records carrying `variantId`,
 *   `scenarioId`, `ok`, `score` and optional `cost`, `durationMs`, `metrics`.
 * @returns {Array<object>} One aggregate per variant, in population order.
 */
function aggregateTrials(population, scenarioIds, trials) {
  const averageBy = (rows, pick) => mean5(rows.map((row) => pick(row)));

  const summariseScenario = (variantId, ownTrials, scenarioId) => {
    const attempts = ownTrials.filter((trial) => trial.scenarioId === scenarioId);
    const successes = attempts.filter((trial) => trial.ok);
    const metrics = aggregateMetrics(successes.map((trial) => trial.metrics ?? {}));
    return {
      variantId,
      scenarioId,
      meanScore: averageBy(successes, (trial) => trial.score),
      meanCost: averageBy(successes, (trial) => trial.cost ?? 0),
      meanDurationMs: averageBy(successes, (trial) => trial.durationMs ?? 0),
      okRate: attempts.length === 0 ? 0 : successes.length / attempts.length,
      trials: attempts.length,
      metrics
    };
  };

  return population.map((variant) => {
    const ownTrials = trials.filter((trial) => trial.variantId === variant.id);
    const scenarios = scenarioIds.map((scenarioId) => summariseScenario(variant.id, ownTrials, scenarioId));
    return {
      variantId: variant.id,
      meanScore: averageBy(scenarios, (scenario) => scenario.meanScore),
      meanCost: averageBy(scenarios, (scenario) => scenario.meanCost),
      meanDurationMs: averageBy(scenarios, (scenario) => scenario.meanDurationMs),
      okRate: averageBy(scenarios, (scenario) => scenario.okRate),
      scenarios,
      metrics: aggregateMetrics(scenarios.map((scenario) => scenario.metrics))
    };
  });
}
10221
/**
 * Averages each metric key across a list of metric records.
 *
 * Values that fail `Number.isFinite` are ignored, so a key appears in the
 * output only if at least one row supplied a finite number for it.
 *
 * @param {Array<Record<string, unknown>>} rows - Metric records.
 * @returns {Record<string, number>} Mean per key, in first-seen key order.
 */
function aggregateMetrics(rows) {
  const tallies = /* @__PURE__ */ new Map();
  for (const row of rows) {
    for (const [key, value] of Object.entries(row)) {
      if (!Number.isFinite(value)) continue;
      let tally = tallies.get(key);
      if (!tally) {
        tally = { sum: 0, count: 0 };
        tallies.set(key, tally);
      }
      tally.sum += value;
      tally.count += 1;
    }
  }

  const averages = {};
  for (const [key, { sum, count }] of tallies) averages[key] = sum / count;
  return averages;
}
10235
/**
 * Arithmetic mean of a numeric array.
 *
 * @param {number[]} xs - Values to average.
 * @returns {number} The mean, or 0 for an empty array.
 */
function mean5(xs) {
  return xs.length === 0 ? 0 : xs.reduce((sum, x) => sum + x, 0) / xs.length;
}
10239
/**
 * Builds the next generation: Pareto-front members survive, and the remaining
 * slots (up to `config.populationSize`) are filled with children requested
 * from `config.mutateAdapter` for the top-ranked parent.
 *
 * @param {Array<object>} current - Current population.
 * @param {Array<object>} aggregates - Per-variant aggregates for `current`.
 * @param {Array<object>} trials - Trials scored this generation.
 * @param {Array<object>} front - Pareto front entries (`candidate.variantId` is read).
 * @param {object} config - Reads `objectives`, `scalarWeights`,
 *   `populationSize` and `mutateAdapter`.
 * @param {number} nextGeneration - Generation number stamped on children.
 * @returns {Promise<Array<object>>} Survivors followed by new children.
 */
async function nextPopulation(current, aggregates, trials, front, config, nextGeneration) {
  const frontIds = new Set(front.map((entry) => entry.candidate.variantId));
  const survivors = current.filter((variant) => frontIds.has(variant.id));

  // The best scalar-scored variant becomes the parent for all children.
  const ranking = scalarScore(aggregates, config.objectives, { weights: config.scalarWeights });
  ranking.sort((left, right) => right.score - left.score);
  const parentId = ranking[0]?.candidate.variantId ?? current[0].id;
  const parent = current.find((variant) => variant.id === parentId) ?? current[0];
  const parentAggregate = aggregates.find((agg) => agg.variantId === parent.id) ?? aggregates[0];

  const topTrials = topKTrialsByScore(trials, parent.id, 3);
  const bottomTrials = bottomKTrialsByScore(trials, parent.id, 3);
  const childCount = Math.max(0, config.populationSize - survivors.length);
  if (!(childCount > 0)) return [...survivors];

  const proposed = await config.mutateAdapter.mutate({
    parent,
    parentAggregate,
    topTrials,
    bottomTrials,
    childCount,
    generation: nextGeneration
  });
  // Never accept more children than requested; stamp lineage on each one.
  const children = proposed.slice(0, childCount).map((child) => ({ ...child, generation: nextGeneration, parentId: parent.id }));
  return [...survivors, ...children];
}
10263
/**
 * Highest-scoring successful trials of one variant.
 *
 * @param {Array<object>} trials - Trial records (`variantId`, `ok`, `score`).
 * @param {string} variantId - Variant whose trials are wanted.
 * @param {number} k - Maximum number of trials to return.
 * @returns {Array<object>} Up to `k` trials, best score first; input is not mutated.
 */
function topKTrialsByScore(trials, variantId, k) {
  const eligible = trials.filter((trial) => trial.variantId === variantId && trial.ok);
  eligible.sort((left, right) => right.score - left.score);
  return eligible.slice(0, k);
}
10266
/**
 * Lowest-scoring successful trials of one variant.
 *
 * @param {Array<object>} trials - Trial records (`variantId`, `ok`, `score`).
 * @param {string} variantId - Variant whose trials are wanted.
 * @param {number} k - Maximum number of trials to return.
 * @returns {Array<object>} Up to `k` trials, worst score first; input is not mutated.
 */
function bottomKTrialsByScore(trials, variantId, k) {
  const eligible = trials.filter((trial) => trial.variantId === variantId && trial.ok);
  eligible.sort((left, right) => left.score - right.score);
  return eligible.slice(0, k);
}
10269
/**
 * Tells whether two id lists denote the same set of ids.
 *
 * Both sides are compared as sets, so order and duplicates are irrelevant.
 * The earlier length-check-plus-one-way-containment was asymmetric when
 * duplicates were present (`["a","b"]` vs `["a","a"]` reported "same"), and
 * it reported a difference when only one side had been de-duplicated.
 *
 * @param {Array<string>} a - First id list.
 * @param {Array<string>} b - Second id list.
 * @returns {boolean} `true` when both lists contain exactly the same distinct ids.
 */
function samePopulation(a, b) {
  const left = new Set(a);
  const right = new Set(b);
  if (left.size !== right.size) return false;
  for (const id of right) {
    if (!left.has(id)) return false;
  }
  return true;
}
10274
+
10275
+ // src/golden-matcher.ts
10276
/**
 * Checks each golden against the pooled, lower-cased text of all candidates.
 *
 * @param {Array<object>} goldens - Golden expectations (see `goldenMatched`).
 * @param {Array<unknown>} candidates - Candidate findings.
 * @param {{ text?: (candidate: unknown) => string }} [options] - Optional
 *   text extractor; defaults to `defaultExtract5`.
 * @returns {{ matches: boolean[], hits: number, total: number }} Per-golden
 *   match flags (index-aligned with `goldens`), the number matched, and the
 *   number of goldens.
 */
function matchGoldens(goldens, candidates, options = {}) {
  const toText = options.text ?? defaultExtract5;
  const haystacks = candidates.map((candidate) => toText(candidate).toLowerCase());
  const flags = goldens.map((golden) => goldenMatched(golden, haystacks));
  const hits = flags.filter((flag) => Boolean(flag)).length;
  return { matches: flags, hits, total: goldens.length };
}
10286
/**
 * Default candidate-to-text conversion.
 *
 * Strings pass through; objects yield their own enumerable string values
 * joined by single spaces; anything else is stringified, with
 * `null`/`undefined` becoming the empty string.
 *
 * @param {unknown} candidate - Candidate finding.
 * @returns {string} Text representation.
 */
function defaultExtract5(candidate) {
  if (typeof candidate === "string") return candidate;
  if (!candidate || typeof candidate !== "object") return String(candidate ?? "");
  return Object.values(candidate)
    .filter((value) => typeof value === "string")
    .join(" ");
}
10297
/**
 * Decides whether a golden is satisfied by any haystack.
 *
 * A golden matches when one of its `any` phrases (lower-cased and trimmed;
 * blanks skipped) is a substring of some haystack, or when one of its
 * `anyRegex` patterns (compiled case-insensitively) matches some haystack.
 * Patterns that fail to compile are ignored.
 *
 * @param {{ any: string[], anyRegex?: string[] }} golden - Golden expectation.
 * @param {string[]} haystacks - Candidate texts; callers pass them lower-cased.
 * @returns {boolean} Whether the golden matched.
 */
function goldenMatched(golden, haystacks) {
  const phraseHit = golden.any.some((phrase) => {
    const needle = phrase.toLowerCase().trim();
    return needle !== "" && haystacks.some((haystack) => haystack.includes(needle));
  });
  if (phraseHit) return true;

  return (golden.anyRegex ?? []).some((pattern) => {
    let compiled;
    try {
      compiled = new RegExp(pattern, "i");
    } catch {
      // Best effort: an invalid pattern simply cannot match.
      return false;
    }
    return haystacks.some((haystack) => compiled.test(haystack));
  });
}
10314
/** Default weight per golden severity, used by `weightedRecall`. */
var DEFAULT_SEVERITY_WEIGHTS = {
  critical: 3,
  major: 2,
  minor: 1
};

/**
 * Severity-weighted recall over a set of goldens.
 *
 * Each golden contributes `weights[golden.severity]` (1 when the severity has
 * no entry). Returns 1 when there are no goldens or the total weight is 0.
 *
 * @param {Array<{ severity: string }>} goldens - Golden expectations.
 * @param {{ matches: boolean[] }} result - Match flags index-aligned with `goldens`.
 * @param {Record<string, number>} [weights] - Weight per severity.
 * @returns {number} Matched weight divided by total weight.
 */
function weightedRecall(goldens, result, weights = DEFAULT_SEVERITY_WEIGHTS) {
  if (goldens.length === 0) return 1;
  const weightOf = (golden) => weights[golden.severity] ?? 1;

  let total = 0;
  for (const golden of goldens) total += weightOf(golden);
  if (total === 0) return 1;

  let hit = 0;
  for (let i = 0; i < goldens.length; i += 1) {
    hit += result.matches[i] ? weightOf(goldens[i]) : 0;
  }
  return hit / total;
}
10329
/**
 * Fraction of candidates that match at least one golden.
 *
 * Matching follows the same rules as `goldenMatched`: phrases are lower-cased
 * AND trimmed, and blank phrases are skipped. Previously this function did
 * not trim, so a whitespace-only phrase matched any text containing a space
 * and a padded phrase failed to match. Regex patterns are compiled once per
 * golden rather than once per candidate; patterns that fail to compile are
 * ignored.
 *
 * @param {Array<{ any: string[], anyRegex?: string[] }>} goldens - Golden expectations.
 * @param {Array<unknown>} candidates - Candidate findings.
 * @param {{ text?: (candidate: unknown) => string }} [options] - Optional
 *   text extractor; defaults to `defaultExtract5`.
 * @returns {number} Matched candidates divided by all candidates; 1 when
 *   there are no candidates.
 */
function precision(goldens, candidates, options = {}) {
  if (candidates.length === 0) return 1;
  const extract = options.text ?? defaultExtract5;

  // Normalise every golden once, up front, instead of inside the candidate loop.
  const matchers = goldens.map((golden) => ({
    needles: golden.any
      .map((phrase) => phrase.toLowerCase().trim())
      .filter((needle) => needle.length > 0),
    patterns: (golden.anyRegex ?? []).flatMap((pattern) => {
      try {
        return [new RegExp(pattern, "i")];
      } catch {
        // Best effort: an invalid pattern simply cannot match.
        return [];
      }
    })
  }));

  let matched = 0;
  for (const candidate of candidates) {
    const haystack = extract(candidate).toLowerCase();
    const hit = matchers.some(
      ({ needles, patterns }) => needles.some((needle) => haystack.includes(needle)) || patterns.some((re) => re.test(haystack))
    );
    if (hit) matched++;
  }
  return matched / candidates.length;
}
10348
+
10349
+ // src/orthogonality.ts
10350
/**
 * Measures how dissimilar the findings of several passes are.
 *
 * Each pass's findings are turned into a bag-of-words vector; orthogonality
 * is 1 minus the mean pairwise cosine similarity, clamped to [0, 1]. With
 * fewer than two passes the result is 1 with no similarities.
 *
 * @param {{ passes: Array<{ findings: unknown[] }>, text?: Function, minTokenLength?: number }} input
 *   Passes plus an optional renderer (default `defaultRender`) and minimum
 *   token length (default 4).
 * @returns {{ orthogonality: number, passCount: number, similarities: number[] }}
 */
function passOrthogonality(input) {
  const { passes } = input;
  const passCount = passes.length;
  if (passCount < 2) {
    return { orthogonality: 1, passCount, similarities: [] };
  }

  const render = input.text ?? defaultRender;
  const minLen = input.minTokenLength ?? 4;
  const vectors = passes.map((pass) => bagOfWords(pass.findings, render, minLen));

  // Every unordered pair (i, j) with i < j, in row-major order.
  const similarities = [];
  vectors.forEach((left, i) => {
    for (const right of vectors.slice(i + 1)) {
      similarities.push(cosineSimilarity(left, right));
    }
  });

  const total = similarities.reduce((sum, value) => sum + value, 0);
  const meanSimilarity = similarities.length === 0 ? 0 : total / similarities.length;
  return {
    orthogonality: Math.max(0, Math.min(1, 1 - meanSimilarity)),
    passCount,
    similarities
  };
}
10371
/**
 * Default finding-to-text conversion.
 *
 * Strings pass through; objects yield their own enumerable string values
 * joined by single spaces; anything else is stringified, with
 * `null`/`undefined` becoming the empty string.
 *
 * @param {unknown} item - Finding to render.
 * @returns {string} Text representation.
 */
function defaultRender(item) {
  if (typeof item === "string") return item;
  const isObject = Boolean(item) && typeof item === "object";
  if (!isObject) return String(item ?? "");

  const textValues = Object.values(item).filter((value) => typeof value === "string");
  return textValues.join(" ");
}
10382
/**
 * Builds a token → count map from the rendered, lower-cased text of `items`.
 *
 * Tokens are maximal runs of Unicode letters, combining marks and numbers.
 * The earlier ASCII-only split (`/[^a-z0-9]+/`) broke accented and non-Latin
 * words into fragments that mostly fell under `minLen`, leaving such text
 * with an (almost) empty bag. Tokenisation of ASCII-only text is unchanged.
 *
 * @param {Array<unknown>} items - Findings to tokenise.
 * @param {(item: unknown) => string} render - Converts an item to text.
 * @param {number} minLen - Minimum token length (UTF-16 units) to keep.
 * @returns {Map<string, number>} Occurrence count per kept token.
 */
function bagOfWords(items, render, minLen) {
  const bag = /* @__PURE__ */ new Map();
  for (const item of items) {
    const text = render(item).toLowerCase();
    for (const tok of text.split(/[^\p{L}\p{M}\p{N}]+/u)) {
      if (!(tok.length >= minLen)) continue;
      bag.set(tok, (bag.get(tok) ?? 0) + 1);
    }
  }
  return bag;
}
10392
/**
 * Cosine similarity between two sparse count vectors.
 *
 * @param {Map<string, number>} a - First token → count map.
 * @param {Map<string, number>} b - Second token → count map.
 * @returns {number} Dot product over the product of magnitudes; 0 when
 *   either vector has zero magnitude.
 */
function cosineSimilarity(a, b) {
  const squaredNorm = (vector) => {
    let total = 0;
    for (const weight of vector.values()) total += weight * weight;
    return total;
  };

  const aMag = squaredNorm(a);
  const bMag = squaredNorm(b);
  if (aMag === 0 || bMag === 0) return 0;

  let dot = 0;
  for (const [token, weight] of a) {
    const other = b.get(token);
    if (other) dot += weight * other;
  }
  return dot / (Math.sqrt(aMag) * Math.sqrt(bMag));
}
10405
+
10406
+ // src/promotion-gate.ts
10407
/**
 * Percentile-bootstrap confidence interval for `mean(candidate) - mean(baseline)`,
 * plus a promotion verdict.
 *
 * Verdicts: `ADVANCE` when the whole interval is above 0, `REVERT` when it is
 * wholly below 0, otherwise `KEEP` if the observed delta is non-negative and
 * `INCONCLUSIVE` if it is negative. When either sample is empty, or the
 * combined size is below `minTotalSamples`, no resampling is done and the
 * result is `INCONCLUSIVE` with an infinite interval and `iterations: 0`.
 *
 * @param {number[]} baseline - Baseline scores.
 * @param {number[]} candidate - Candidate scores.
 * @param {{ alpha?: number, iterations?: number, minTotalSamples?: number, seed?: number }} [options]
 *   Defaults: alpha 0.05, 1000 iterations, 6 minimum samples; the seed
 *   defaults to a hash of both samples.
 * @returns {object} `{ baselineMean, candidateMean, delta, ciLower, ciUpper, iterations, alpha, verdict }`.
 */
function bootstrapCi(baseline, candidate, options = {}) {
  const alpha = options.alpha ?? 0.05;
  const iterations = options.iterations ?? 1e3;
  const minTotal = options.minTotalSamples ?? 6;
  const rng = mulberry32(options.seed ?? hashSeed(baseline, candidate));

  const baselineMean = mean6(baseline);
  const candidateMean = mean6(candidate);
  const delta = candidateMean - baselineMean;
  const observed = { baselineMean, candidateMean, delta };

  const underpowered = baseline.length + candidate.length < minTotal || baseline.length === 0 || candidate.length === 0;
  if (underpowered) {
    return {
      ...observed,
      ciLower: -Infinity,
      ciUpper: Infinity,
      iterations: 0,
      alpha,
      verdict: "INCONCLUSIVE"
    };
  }

  // Baseline is resampled before candidate on every round; the order fixes the RNG stream.
  const deltas = new Array(iterations);
  for (let round = 0; round < iterations; round += 1) {
    const resampledBaselineMean = mean6(resample(baseline, rng));
    const resampledCandidateMean = mean6(resample(candidate, rng));
    deltas[round] = resampledCandidateMean - resampledBaselineMean;
  }
  deltas.sort((left, right) => left - right);

  const tail = alpha / 2;
  const lowerIdx = Math.floor(tail * iterations);
  const upperIdx = Math.floor((1 - tail) * iterations) - 1;
  const ciLower = deltas[Math.max(0, lowerIdx)];
  const ciUpper = deltas[Math.min(iterations - 1, upperIdx)];

  let verdict = "INCONCLUSIVE";
  if (ciLower > 0) verdict = "ADVANCE";
  else if (ciUpper < 0) verdict = "REVERT";
  else if (delta >= 0) verdict = "KEEP";

  return { ...observed, ciLower, ciUpper, iterations, alpha, verdict };
}
10454
/**
 * Arithmetic mean of a numeric array.
 *
 * @param {number[]} xs - Values to average.
 * @returns {number} The mean, or 0 for an empty array.
 */
function mean6(xs) {
  const count = xs.length;
  if (count === 0) return 0;
  let total = 0;
  for (let i = 0; i < count; i += 1) total += xs[i];
  return total / count;
}
10460
/**
 * Draws `xs.length` elements from `xs` with replacement.
 *
 * @param {Array<unknown>} xs - Source sample.
 * @param {() => number} rng - Generator returning values in [0, 1); called
 *   once per drawn element.
 * @returns {Array<unknown>} New array of the same length as `xs`.
 */
function resample(xs, rng) {
  const size = xs.length;
  const drawn = [];
  while (drawn.length < size) {
    const pick = Math.floor(rng() * size);
    drawn.push(xs[pick]);
  }
  return drawn;
}
10465
/**
 * Seeded 32-bit pseudo-random generator (mulberry32 mixing function).
 *
 * The counter is wrapped to 32 bits on every step. Previously it grew without
 * bound; after about 4.9 million draws it exceeded 2^53, the double lost its
 * low bits, and the output stream degraded. Because the mixing only ever
 * looks at the counter modulo 2^32, the first ~4.9 million outputs for a
 * given seed are identical to the previous implementation.
 *
 * @param {number} seed - Seed; coerced to an unsigned 32-bit integer.
 * @returns {() => number} Generator returning values in [0, 1).
 */
function mulberry32(seed) {
  let state = seed >>> 0;
  return () => {
    // Weyl-sequence step, kept inside uint32 so it never loses precision.
    state = (state + 1831565813) >>> 0;
    let r = state;
    r = Math.imul(r ^ r >>> 15, r | 1);
    r ^= r + Math.imul(r ^ r >>> 7, r | 61);
    return ((r ^ r >>> 14) >>> 0) / 4294967296;
  };
}
10475
/**
 * Derives a 32-bit seed from two numeric samples with an FNV-1a style hash
 * over the raw bytes of each value's float64 representation, `a` then `b`.
 *
 * @param {number[]} a - First sample.
 * @param {number[]} b - Second sample.
 * @returns {number} Unsigned 32-bit hash.
 */
function hashSeed(a, b) {
  // One scratch float64 cell, viewed as its 8 bytes in platform byte order.
  const cell = new Float64Array(1);
  const cellBytes = new Uint8Array(cell.buffer);

  let hash = 2166136261;
  for (const sample of [a, b]) {
    for (const value of sample) {
      cell[0] = value;
      for (const byte of cellBytes) {
        hash ^= byte;
        hash = Math.imul(hash, 16777619);
      }
    }
  }
  return hash >>> 0;
}
10487
/**
 * Scores baseline and candidate outputs with a judge, then runs the
 * bootstrap promotion gate on the two score sets.
 *
 * Baseline outputs are fully scored before candidate outputs start, so at
 * most `judgeConcurrency` judge calls are started by this function at a time.
 *
 * @param {object} args - Reads `baselineOutputs`, `candidateOutputs`, `judge`,
 *   and optional `judgeConcurrency` (default 4), `alpha`, `iterations`, `seed`.
 * @returns {Promise<object>} The `bootstrapCi` result plus `baselineSamples`
 *   and `candidateSamples` counts.
 */
async function judgeReplayGate(args) {
  const concurrency = args.judgeConcurrency ?? 4;
  const baselineScores = await scoreAll(args.baselineOutputs, args.judge, concurrency);
  const candidateScores = await scoreAll(args.candidateOutputs, args.judge, concurrency);

  // Forward only the options the caller actually set, so bootstrapCi defaults apply.
  const ciOptions = {};
  if (args.alpha !== undefined) ciOptions.alpha = args.alpha;
  if (args.iterations !== undefined) ciOptions.iterations = args.iterations;
  if (args.seed !== undefined) ciOptions.seed = args.seed;

  const ci = bootstrapCi(baselineScores, candidateScores, ciOptions);
  return {
    ...ci,
    baselineSamples: baselineScores.length,
    candidateSamples: candidateScores.length
  };
}
10502
/**
 * Scores every output with `judge`, running a bounded number of judge calls
 * concurrently. Non-finite judge results are recorded as 0.
 *
 * The worker count is normalised so an unusable `concurrency` can no longer
 * skip scoring: `NaN`/`undefined` fall back to 1 (previously zero workers
 * started and an array of holes was returned), `Infinity` means "one worker
 * per output" (previously a RangeError), fractions are floored, and the count
 * never exceeds the number of outputs.
 *
 * @param {Array<unknown>} outputs - Outputs to score.
 * @param {(output: unknown) => number | Promise<number>} judge - Scoring function.
 * @param {number} concurrency - Desired maximum number of in-flight judge calls.
 * @returns {Promise<number[]>} Scores, index-aligned with `outputs`.
 */
async function scoreAll(outputs, judge, concurrency) {
  const results = new Array(outputs.length);

  const requested = Number(concurrency);
  let workerCount;
  if (requested === Infinity) workerCount = outputs.length;
  else if (Number.isFinite(requested)) workerCount = Math.max(1, Math.floor(requested));
  else workerCount = 1;
  workerCount = Math.min(workerCount, outputs.length);

  // Workers share one cursor; each claims the next unclaimed index.
  let next = 0;
  async function worker() {
    while (next < outputs.length) {
      const i = next++;
      const v = await judge(outputs[i]);
      results[i] = Number.isFinite(v) ? v : 0;
    }
  }

  await Promise.all(Array.from({ length: workerCount }, () => worker()));
  return results;
}
10516
+
10517
+ // src/reflective-mutation.ts
10518
/**
 * Default list of allowed edit moves shown to the reflecting model by
 * `buildReflectionPrompt` when the caller supplies no `mutationPrimitives`.
 */
var DEFAULT_MUTATION_PRIMITIVES = [
  'Strengthen an imperative ("should" \u2192 "must")',
  "Add a concrete example pulled from a missed-golden phrase",
  "Remove a redundant rule that did not improve recall",
  'Add a counterfactual ("if X is missing, the score is capped at Y")',
  "Reorder sections so the highest-impact rule is first",
  "Replace abstract language with a domain-specific noun the trial misses"
];
10526
/**
 * Renders the markdown prompt that asks a model to propose mutations of a
 * prompt component, using the worst trials as failure evidence and the best
 * trials as behaviour to preserve.
 *
 * Sections, in order: target heading and instructions, current variant (JSON),
 * failures (only when there are bottom trials), successes (only when there are
 * top trials), allowed mutation primitives, and the output schema.
 *
 * @param {object} ctx - Reads `target`, `parentPayload`, `topTrials`,
 *   `bottomTrials`, `childCount` and optional `mutationPrimitives`
 *   (defaults to `DEFAULT_MUTATION_PRIMITIVES`). Trials are read for `id`,
 *   `score`, and optional `inputName`, `expectations`, `emitted`.
 * @returns {string} The prompt, lines joined with "\n".
 */
function buildReflectionPrompt(ctx) {
  const primitives = ctx.mutationPrimitives ?? DEFAULT_MUTATION_PRIMITIVES;
  const lines = [];
  const add = (...chunk) => {
    lines.push(...chunk);
  };
  const inputSuffix = (trial) => (trial.inputName ? ` (${trial.inputName})` : "");

  add(`# Mutation target: ${ctx.target}`, "");
  add(
    `You are tuning the prompt component named \`${ctx.target}\`. The current variant is shown below; you have ${ctx.topTrials.length} top trials and ${ctx.bottomTrials.length} bottom trials as evidence. Propose ${ctx.childCount} mutation${ctx.childCount === 1 ? "" : "s"} that fix specific weaknesses visible in the bottom trials. Avoid blank rephrasings.`,
    ""
  );
  add("## Current variant", "```json", JSON.stringify(ctx.parentPayload, null, 2), "```", "");

  if (ctx.bottomTrials.length > 0) {
    add("## Failures (bottom trials) \u2014 what went wrong", "");
    for (const trial of ctx.bottomTrials) {
      add(`### Trial \`${trial.id}\` \u2014 score ${trial.score.toFixed(2)}${inputSuffix(trial)}`);

      const missed = (trial.expectations ?? []).filter((expectation) => !expectation.matched);
      if (missed.length > 0) {
        add("", "**Missed expectations:**");
        for (const miss of missed) {
          add(`- \`${miss.id}\`: should match phrase \`${quote(miss.phrase)}\``);
        }
      }

      if (trial.emitted) {
        // Cap the echoed output at 600 characters.
        add("", "**What the agent emitted:**", "```", truncate3(trial.emitted, 600), "```");
      }
      add("");
    }
  }

  if (ctx.topTrials.length > 0) {
    add("## Successes (top trials) \u2014 what to preserve", "");
    for (const trial of ctx.topTrials) {
      add(`- \`${trial.id}\`: score ${trial.score.toFixed(2)}${inputSuffix(trial)}`);
    }
    add("");
  }

  add("## Allowed mutation primitives", "");
  for (const primitive of primitives) add(`- ${primitive}`);
  add("");

  const schemaExample = {
    proposals: [
      {
        label: "<short label, \u2264 40 chars>",
        rationale: "<which failure this targets and which primitive you used>",
        payload: "<full payload of the new variant \u2014 same shape as the current variant>"
      }
    ]
  };
  add("## Output schema", "");
  add("Respond with a JSON object \u2014 no prose, no markdown fences:");
  add("```json", JSON.stringify(schemaExample, null, 2), "```");

  return lines.join("\n");
}
10593
/**
 * Shortens `s` to at most `max` UTF-16 units and appends a truncation marker.
 *
 * If the cut would fall between the two halves of a surrogate pair, the cut
 * moves back one unit so the result never ends in a lone high surrogate
 * (which would become U+FFFD once the text is encoded).
 *
 * @param {string} s - Text to shorten.
 * @param {number} max - Maximum number of UTF-16 units kept from `s`.
 * @returns {string} `s` unchanged when it fits, otherwise the shortened text
 *   followed by "… [truncated]".
 */
function truncate3(s, max) {
  if (s.length <= max) return s;
  let cut = max;
  const before = s.charCodeAt(cut - 1);
  const after = s.charCodeAt(cut);
  const splitsPair = before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
  if (splitsPair) cut -= 1;
  return s.slice(0, cut) + "\u2026 [truncated]";
}
10597
/**
 * Escapes backticks so the text can sit inside a markdown inline-code span.
 *
 * @param {string} s - Raw text.
 * @returns {string} `s` with every backtick preceded by a backslash.
 */
function quote(s) {
  const pieces = s.split("`");
  return pieces.join("\\`");
}
10600
/**
 * Extracts mutation proposals from a model response.
 *
 * Tolerates a surrounding markdown fence and prose around the JSON object:
 * the text between the first "{" and the last "}" is parsed. Returns an empty
 * list when no object is found, the JSON is invalid, or `proposals` is not an
 * array. Entries that are not objects or lack a `payload` key are skipped;
 * a missing `label` becomes "mutation" and a missing `rationale` becomes "".
 *
 * The cap is enforced before anything is collected, so `maxProposals` of 0
 * (or a negative number) yields no proposals; previously the check ran only
 * after the first push, so one proposal leaked through. `undefined`, `null`
 * and `NaN` mean "no cap".
 *
 * @param {string} raw - Raw model output.
 * @param {number} [maxProposals] - Maximum number of proposals to return.
 * @returns {Array<{ label: string, rationale: string, payload: unknown }>}
 */
function parseReflectionResponse(raw, maxProposals) {
  const limit = maxProposals == null || Number.isNaN(maxProposals) ? Infinity : maxProposals;
  if (limit <= 0) return [];

  let text = raw.trim();
  if (text.startsWith("```")) text = text.replace(/^```(?:json)?\n?/, "").replace(/\n?```$/, "");
  const start = text.indexOf("{");
  const end = text.lastIndexOf("}");
  if (start < 0 || end <= start) return [];

  let parsed;
  try {
    parsed = JSON.parse(text.slice(start, end + 1));
  } catch {
    // Malformed model output is an expected outcome, reported as "no proposals".
    return [];
  }
  if (!parsed || typeof parsed !== "object") return [];
  const proposalsRaw = parsed.proposals;
  if (!Array.isArray(proposalsRaw)) return [];

  const out = [];
  for (const p of proposalsRaw) {
    if (!p || typeof p !== "object") continue;
    if (!("payload" in p)) continue;
    out.push({
      label: typeof p.label === "string" ? p.label : "mutation",
      rationale: typeof p.rationale === "string" ? p.rationale : "",
      payload: p.payload
    });
    if (out.length >= limit) break;
  }
  return out;
}
9979
10629
  export {
9980
10630
  AgentDriver,
9981
10631
  AxGepaSteeringOptimizer,
@@ -9985,21 +10635,25 @@ export {
9985
10635
  BuilderSession,
9986
10636
  ConvergenceTracker,
9987
10637
  CostTracker,
10638
+ D1ExperimentStore,
9988
10639
  DEFAULT_AGENT_SLOS,
9989
10640
  DEFAULT_COMPLEXITY_WEIGHTS,
9990
10641
  DEFAULT_RULES as DEFAULT_FAILURE_RULES,
9991
10642
  DEFAULT_FINDERS,
9992
10643
  DEFAULT_HARNESS_OBJECTIVES,
10644
+ DEFAULT_MUTATION_PRIMITIVES,
9993
10645
  DEFAULT_MUTATORS,
9994
10646
  DEFAULT_REDACTION_RULES,
9995
10647
  DEFAULT_RED_TEAM_CORPUS,
9996
10648
  DEFAULT_RUN_SCORE_WEIGHTS,
10649
+ DEFAULT_SEVERITY_WEIGHTS,
9997
10650
  Dataset,
9998
10651
  DockerSandboxDriver,
9999
10652
  DualAgentBench,
10000
10653
  ERROR_COUNT_PATTERNS,
10001
10654
  ExperimentTracker,
10002
10655
  FAILURE_CLASSES,
10656
+ FileSystemExperimentStore,
10003
10657
  FileSystemOutcomeStore,
10004
10658
  FileSystemTraceStore,
10005
10659
  HoldoutAuditor,
@@ -10008,6 +10662,7 @@ export {
10008
10662
  InMemoryExperimentStore,
10009
10663
  InMemoryOutcomeStore,
10010
10664
  InMemoryTraceStore,
10665
+ InMemoryTrialCache,
10011
10666
  InMemoryWorkspaceInspector,
10012
10667
  JudgeRunner,
10013
10668
  LlmCallError,
@@ -10043,7 +10698,9 @@ export {
10043
10698
  benjaminiHochberg,
10044
10699
  bisect,
10045
10700
  bonferroni,
10701
+ bootstrapCi,
10046
10702
  budgetBreachView,
10703
+ buildReflectionPrompt,
10047
10704
  buildReviewerPrompt,
10048
10705
  buildTrajectory,
10049
10706
  byteLengthRange,
@@ -10081,6 +10738,7 @@ export {
10081
10738
  createLlmReviewer,
10082
10739
  createSemanticConceptJudge,
10083
10740
  crossTraceDiff,
10741
+ crowdingDistance,
10084
10742
  decideReferenceReplayPromotion,
10085
10743
  decideReferenceReplayRunPromotion,
10086
10744
  defaultJudges,
@@ -10114,6 +10772,7 @@ export {
10114
10772
  formatBenchmarkReport,
10115
10773
  formatDriverReport,
10116
10774
  formatFindings,
10775
+ precision as goldenPrecision,
10117
10776
  gradeSemanticStatus,
10118
10777
  groupBy,
10119
10778
  hashContent,
@@ -10135,6 +10794,7 @@ export {
10135
10794
  jsonlReferenceReplayStore,
10136
10795
  jsonlReviewStore,
10137
10796
  judgeAgreementView,
10797
+ judgeReplayGate,
10138
10798
  judgeSpans,
10139
10799
  keyPreserved,
10140
10800
  linterJudge,
@@ -10144,6 +10804,7 @@ export {
10144
10804
  localCommandRunner,
10145
10805
  lowercaseMutator,
10146
10806
  mannWhitneyU,
10807
+ matchGoldens,
10147
10808
  mergeLayerResults,
10148
10809
  mergeSteeringBundle,
10149
10810
  multiToolchainLayer,
@@ -10155,7 +10816,10 @@ export {
10155
10816
  pairedTTest,
10156
10817
  paraphraseRobustness,
10157
10818
  paretoFrontier,
10819
+ paretoFrontierWithCrowding,
10820
+ parseReflectionResponse,
10158
10821
  partialCredit,
10822
+ passOrthogonality,
10159
10823
  pixelDeltaRatio,
10160
10824
  politenessPrefixMutator,
10161
10825
  positionalBias,
@@ -10195,12 +10859,14 @@ export {
10195
10859
  runJudgeFleet,
10196
10860
  runKeywordCoverageJudge,
10197
10861
  runKeywordCoverageJudgeUrl,
10862
+ runPromptEvolution,
10198
10863
  runProposeReview,
10199
10864
  runReferenceReplay,
10200
10865
  runSelfPlay,
10201
10866
  runSemanticConceptJudge,
10202
10867
  runTestGradedScenario,
10203
10868
  runsForScenario,
10869
+ scalarScore,
10204
10870
  scanForMuffledGates,
10205
10871
  scoreAllProjects,
10206
10872
  scoreContinuity,
@@ -10237,6 +10903,7 @@ export {
10237
10903
  viteDeployRunner,
10238
10904
  vitestTestParser,
10239
10905
  weightedMean,
10906
+ weightedRecall,
10240
10907
  welchsTTest,
10241
10908
  whitespaceCollapseMutator,
10242
10909
  wilcoxonSignedRank