@tangle-network/agent-eval 0.11.1 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +96 -11
- package/dist/chunk-ITN4YOZY.js +215 -0
- package/dist/chunk-ITN4YOZY.js.map +1 -0
- package/dist/chunk-OZPRSK4A.js +594 -0
- package/dist/chunk-OZPRSK4A.js.map +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +104 -0
- package/dist/cli.js.map +1 -0
- package/dist/index.d.ts +597 -4
- package/dist/index.js +908 -241
- package/dist/index.js.map +1 -1
- package/dist/sink-fetch-C0B8ximv.d.ts +101 -0
- package/dist/telemetry/file.d.ts +19 -0
- package/dist/telemetry/file.js +40 -0
- package/dist/telemetry/file.js.map +1 -0
- package/dist/telemetry/index.d.ts +38 -0
- package/dist/telemetry/index.js +128 -0
- package/dist/telemetry/index.js.map +1 -0
- package/dist/wire/index.d.ts +211 -0
- package/dist/wire/index.js +56 -0
- package/dist/wire/index.js.map +1 -0
- package/package.json +27 -3
package/dist/index.js
CHANGED
|
@@ -1,3 +1,12 @@
|
|
|
1
|
+
import {
|
|
2
|
+
LlmCallError,
|
|
3
|
+
LlmClient,
|
|
4
|
+
callLlm,
|
|
5
|
+
callLlmJson,
|
|
6
|
+
probeLlm,
|
|
7
|
+
stripFencedJson
|
|
8
|
+
} from "./chunk-ITN4YOZY.js";
|
|
9
|
+
|
|
1
10
|
// src/client.ts
|
|
2
11
|
var ProductClient = class {
|
|
3
12
|
baseUrl;
|
|
@@ -410,7 +419,7 @@ function confidenceInterval(scores, confidence = 0.95) {
|
|
|
410
419
|
if (scores.length === 0) return { mean: 0, lower: 0, upper: 0 };
|
|
411
420
|
if (scores.length === 1) return { mean: scores[0], lower: scores[0], upper: scores[0] };
|
|
412
421
|
const n = scores.length;
|
|
413
|
-
const
|
|
422
|
+
const mean7 = scores.reduce((a, b) => a + b, 0) / n;
|
|
414
423
|
const B = 1e3;
|
|
415
424
|
const bootstrapMeans = [];
|
|
416
425
|
for (let i = 0; i < B; i++) {
|
|
@@ -425,7 +434,7 @@ function confidenceInterval(scores, confidence = 0.95) {
|
|
|
425
434
|
const lowerIdx = Math.floor(alpha / 2 * B);
|
|
426
435
|
const upperIdx = Math.floor((1 - alpha / 2) * B) - 1;
|
|
427
436
|
return {
|
|
428
|
-
mean:
|
|
437
|
+
mean: mean7,
|
|
429
438
|
lower: bootstrapMeans[lowerIdx],
|
|
430
439
|
upper: bootstrapMeans[Math.min(upperIdx, B - 1)]
|
|
431
440
|
};
|
|
@@ -513,11 +522,11 @@ function pairedTTest(before, after) {
|
|
|
513
522
|
const n = before.length;
|
|
514
523
|
if (n < 2) return { t: 0, df: 0, p: 1 };
|
|
515
524
|
const diffs = before.map((b, i) => after[i] - b);
|
|
516
|
-
const
|
|
517
|
-
const variance2 = diffs.reduce((acc, d) => acc + (d -
|
|
525
|
+
const mean7 = diffs.reduce((a, b) => a + b, 0) / n;
|
|
526
|
+
const variance2 = diffs.reduce((acc, d) => acc + (d - mean7) ** 2, 0) / (n - 1);
|
|
518
527
|
const se = Math.sqrt(variance2 / n);
|
|
519
|
-
if (se === 0) return { t:
|
|
520
|
-
const t =
|
|
528
|
+
if (se === 0) return { t: mean7 === 0 ? 0 : Infinity, df: n - 1, p: mean7 === 0 ? 1 : 0 };
|
|
529
|
+
const t = mean7 / se;
|
|
521
530
|
const df = n - 1;
|
|
522
531
|
const p = 2 * (1 - studentTCdf(Math.abs(t), df));
|
|
523
532
|
return { t, df, p };
|
|
@@ -541,9 +550,9 @@ function wilcoxonSignedRank(before, after) {
|
|
|
541
550
|
}
|
|
542
551
|
let wPlus = 0;
|
|
543
552
|
for (let k = 0; k < n; k++) if (diffs[k] > 0) wPlus += ranks3[k];
|
|
544
|
-
const
|
|
553
|
+
const mean7 = n * (n + 1) / 4;
|
|
545
554
|
const variance2 = n * (n + 1) * (2 * n + 1) / 24;
|
|
546
|
-
const z = (wPlus -
|
|
555
|
+
const z = (wPlus - mean7) / Math.sqrt(variance2);
|
|
547
556
|
const p = 2 * (1 - normalCdf(Math.abs(z)));
|
|
548
557
|
return { w: wPlus, p };
|
|
549
558
|
}
|
|
@@ -1926,6 +1935,244 @@ function rand(bytes) {
|
|
|
1926
1935
|
return Array.from(arr).map((b) => b.toString(16).padStart(2, "0")).join("");
|
|
1927
1936
|
}
|
|
1928
1937
|
|
|
1938
|
+
// src/experiment-tracker-fs.ts
|
|
1939
|
+
/**
 * Experiment store that keeps an in-memory index and persists every saved
 * record as one JSON line in `<dir>/experiments.ndjson` or `<dir>/runs.ndjson`.
 *
 * When the active file reaches `maxBytes` it is renamed to
 * `<name>.<timestamp>.ndjson` and a fresh active file is started. On first use
 * every `*.ndjson` file in `dir` is replayed into the index.
 */
var FileSystemExperimentStore = class {
  dir;
  maxBytes;
  index;
  loaded = false;
  /** In-flight index build, shared so concurrent callers replay the files once. */
  loading;
  /**
   * @param {{ dir: string, maxBytes?: number }} options - Target directory and
   *   the size at which the active file is rolled (defaults to 32 MiB).
   */
  constructor(options) {
    this.dir = options.dir;
    this.maxBytes = options.maxBytes ?? 32 * 1024 * 1024;
  }
  /** Stores the experiment in the index, then appends it to `experiments.ndjson`. */
  async saveExperiment(exp) {
    const idx = await this.load();
    await idx.saveExperiment(exp);
    await this.append("experiments", exp);
  }
  /** Looks an experiment up in the in-memory index. */
  async getExperiment(id) {
    const idx = await this.load();
    return idx.getExperiment(id);
  }
  /** Lists experiments from the in-memory index. */
  async listExperiments() {
    const idx = await this.load();
    return idx.listExperiments();
  }
  /** Stores the run in the index, then appends it to `runs.ndjson`. */
  async saveRun(run) {
    const idx = await this.load();
    await idx.saveRun(run);
    await this.append("runs", run);
  }
  /** Looks a run up in the in-memory index. */
  async getRun(id) {
    const idx = await this.load();
    return idx.getRun(id);
  }
  /** Lists the runs of one experiment from the in-memory index. */
  async listRuns(experimentId) {
    const idx = await this.load();
    return idx.listRuns(experimentId);
  }
  /** Creates `dir` (and any missing parents) if it does not exist yet. */
  async ensureDir() {
    const fs = await import("fs/promises");
    await fs.mkdir(this.dir, { recursive: true });
  }
  /**
   * Appends `record` as a single JSON line to `<name>.ndjson`, rolling the
   * active file first when it has reached `maxBytes`.
   */
  async append(name, record) {
    await this.ensureDir();
    const fs = await import("fs/promises");
    const path = await import("path");
    const active = path.join(this.dir, `${name}.ndjson`);
    try {
      const stat = await fs.stat(active);
      if (stat.size >= this.maxBytes) {
        const rolled = path.join(this.dir, `${name}.${Date.now()}.ndjson`);
        await fs.rename(active, rolled);
      }
    } catch {
      // Rotation is deliberately best-effort: the active file may simply not
      // exist yet, and a failed roll must not block the write. Genuine I/O
      // problems surface from appendFile below.
    }
    await fs.appendFile(active, `${JSON.stringify(record)}\n`, "utf8");
  }
  /**
   * Returns the in-memory index, building it from disk on first use.
   * Concurrent callers share one build so they all end up with the same index
   * instance; without this, a record saved into one caller's index could be
   * dropped when a second build replaced `this.index`.
   */
  async load() {
    if (this.loaded && this.index) return this.index;
    if (!this.loading) {
      this.loading = this.buildIndex().finally(() => {
        this.loading = void 0;
      });
    }
    return this.loading;
  }
  /**
   * Replays every experiments/runs NDJSON file in `dir` into a fresh
   * in-memory store and installs it as the index.
   *
   * A missing directory or file is treated as "nothing stored yet"; any other
   * filesystem error is rethrown so the store is never marked as loaded with a
   * silently incomplete index.
   */
  async buildIndex() {
    const fs = await import("fs/promises");
    const path = await import("path");
    const store = new InMemoryExperimentStore();
    let entries = [];
    try {
      entries = await fs.readdir(this.dir);
    } catch (err) {
      if (err?.code !== "ENOENT") throw err;
    }
    // NOTE(review): rolled files (`name.<digits>.ndjson`) are expected to sort
    // before the active `name.ndjson`, so newer records are replayed last —
    // confirm the in-memory store keeps the last record saved per id.
    const sorted = entries.filter((f) => f.endsWith(".ndjson")).sort((a, b) => a.localeCompare(b));
    for (const file of sorted) {
      const base = file.split(".")[0];
      // Skip unrelated NDJSON files without reading them.
      if (base !== "experiments" && base !== "runs") continue;
      let content;
      try {
        content = await fs.readFile(path.join(this.dir, file), "utf8");
      } catch (err) {
        // The file can vanish between readdir and readFile.
        if (err?.code === "ENOENT") continue;
        throw err;
      }
      for (const line of content.split("\n")) {
        if (!line.trim()) continue;
        try {
          const record = JSON.parse(line);
          if (base === "experiments") {
            await store.saveExperiment(record);
          } else {
            await store.saveRun(record);
          }
        } catch {
          // A corrupt or rejected line must not abort the replay of the
          // remaining records; skip just this one.
          continue;
        }
      }
    }
    this.index = store;
    this.loaded = true;
    return store;
  }
};
|
|
2027
|
+
|
|
2028
|
+
// src/experiment-tracker-d1.ts
|
|
2029
|
+
// Written to the meta table under the 'schema_version' key by ensureSchema().
var SCHEMA_VERSION = 1;
/**
 * Experiment store backed by a D1-style database handle exposing `exec()` and
 * `prepare().bind().run()/first()/all()`.
 *
 * Table names are `<tablePrefix>experiments`, `<tablePrefix>runs` and
 * `<tablePrefix>meta`. Because the prefix is spliced into SQL as an
 * identifier (it cannot be bound as a parameter), it is validated in the
 * constructor.
 */
var D1ExperimentStore = class {
  db;
  experimentsTable;
  runsTable;
  metaTable;
  schemaReady = false;
  /**
   * @param {{ db: object, tablePrefix?: string }} options - Database handle and
   *   optional table prefix (defaults to "agent_eval_").
   * @throws {TypeError} When `tablePrefix` is not empty and is not a plain SQL
   *   identifier (letters, digits, underscores, not starting with a digit).
   */
  constructor(options) {
    this.db = options.db;
    const prefix = options.tablePrefix ?? "agent_eval_";
    // Anything else would either break the DDL or allow SQL injection through
    // the interpolated table names.
    if (typeof prefix !== "string" || !/^(?:[A-Za-z_][A-Za-z0-9_]*)?$/.test(prefix)) {
      throw new TypeError(
        `D1ExperimentStore: tablePrefix must contain only letters, digits and underscores and must not start with a digit (got ${JSON.stringify(prefix)})`
      );
    }
    this.experimentsTable = `${prefix}experiments`;
    this.runsTable = `${prefix}runs`;
    this.metaTable = `${prefix}meta`;
  }
  /**
   * Idempotent schema setup. Safe to call before every operation; the second
   * call short-circuits via `schemaReady`. Most consumers will call it once
   * during Worker bootstrap.
   */
  async ensureSchema() {
    if (this.schemaReady) return;
    const ddl = `
      CREATE TABLE IF NOT EXISTS ${this.experimentsTable} (
        id TEXT PRIMARY KEY,
        name TEXT NOT NULL,
        created_at TEXT NOT NULL,
        metadata_json TEXT
      );
      CREATE TABLE IF NOT EXISTS ${this.runsTable} (
        id TEXT PRIMARY KEY,
        experiment_id TEXT NOT NULL,
        name TEXT,
        status TEXT NOT NULL,
        started_at TEXT NOT NULL,
        completed_at TEXT,
        config_json TEXT NOT NULL,
        report_json TEXT,
        error TEXT
      );
      CREATE INDEX IF NOT EXISTS idx_${this.runsTable}_experiment ON ${this.runsTable}(experiment_id);
      CREATE INDEX IF NOT EXISTS idx_${this.runsTable}_started ON ${this.runsTable}(started_at);
      CREATE TABLE IF NOT EXISTS ${this.metaTable} (
        key TEXT PRIMARY KEY,
        value TEXT NOT NULL
      );
      INSERT OR REPLACE INTO ${this.metaTable}(key, value) VALUES ('schema_version', '${SCHEMA_VERSION}');
    `;
    // The whole script is collapsed onto a single line before being handed to exec().
    await this.db.exec(ddl.trim().replace(/\s+/g, " "));
    this.schemaReady = true;
  }
  /** Inserts the experiment, or overwrites every column when the id already exists. */
  async saveExperiment(exp) {
    await this.ensureSchema();
    await this.db.prepare(
      `INSERT INTO ${this.experimentsTable}(id, name, created_at, metadata_json)
       VALUES (?1, ?2, ?3, ?4)
       ON CONFLICT(id) DO UPDATE SET
         name = excluded.name,
         created_at = excluded.created_at,
         metadata_json = excluded.metadata_json`
    ).bind(exp.id, exp.name, exp.createdAt, exp.metadata ? JSON.stringify(exp.metadata) : null).run();
  }
  /** Returns the experiment with the given id, or `null` when no row matches. */
  async getExperiment(id) {
    await this.ensureSchema();
    const row = await this.db.prepare(
      `SELECT id, name, created_at, metadata_json
       FROM ${this.experimentsTable}
       WHERE id = ?1`
    ).bind(id).first();
    return row ? rowToExperiment(row) : null;
  }
  /** Returns all experiments ordered by `created_at`, newest first. */
  async listExperiments() {
    await this.ensureSchema();
    const { results } = await this.db.prepare(
      `SELECT id, name, created_at, metadata_json
       FROM ${this.experimentsTable}
       ORDER BY created_at DESC`
    ).all();
    return results.map(rowToExperiment);
  }
  /** Inserts the run, or overwrites every column when the id already exists. */
  async saveRun(run) {
    await this.ensureSchema();
    await this.db.prepare(
      `INSERT INTO ${this.runsTable}(id, experiment_id, name, status, started_at, completed_at, config_json, report_json, error)
       VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9)
       ON CONFLICT(id) DO UPDATE SET
         experiment_id = excluded.experiment_id,
         name = excluded.name,
         status = excluded.status,
         started_at = excluded.started_at,
         completed_at = excluded.completed_at,
         config_json = excluded.config_json,
         report_json = excluded.report_json,
         error = excluded.error`
    ).bind(
      run.id,
      run.experimentId,
      run.name ?? null,
      run.status,
      run.startedAt,
      run.completedAt ?? null,
      JSON.stringify(run.config),
      run.report ? JSON.stringify(run.report) : null,
      run.error ?? null
    ).run();
  }
  /** Returns the run with the given id, or `null` when no row matches. */
  async getRun(id) {
    await this.ensureSchema();
    const row = await this.db.prepare(
      `SELECT id, experiment_id, name, status, started_at, completed_at, config_json, report_json, error
       FROM ${this.runsTable}
       WHERE id = ?1`
    ).bind(id).first();
    return row ? rowToRun(row) : null;
  }
  /** Returns the runs of one experiment ordered by `started_at`, newest first. */
  async listRuns(experimentId) {
    await this.ensureSchema();
    const { results } = await this.db.prepare(
      `SELECT id, experiment_id, name, status, started_at, completed_at, config_json, report_json, error
       FROM ${this.runsTable}
       WHERE experiment_id = ?1
       ORDER BY started_at DESC`
    ).bind(experimentId).all();
    return results.map(rowToRun);
  }
};
|
|
2154
|
+
/**
 * Converts an experiments-table row into an experiment object.
 * `metadata` is only present when the row carries a non-empty `metadata_json`.
 */
function rowToExperiment(row) {
  const experiment = {
    id: row.id,
    name: row.name,
    createdAt: row.created_at
  };
  if (row.metadata_json) {
    experiment.metadata = JSON.parse(row.metadata_json);
  }
  return experiment;
}
|
|
2162
|
+
/**
 * Converts a runs-table row into a run object.
 * Optional fields (`name`, `completedAt`, `report`, `error`) are omitted when
 * the corresponding column is empty; `config_json` is always parsed.
 */
function rowToRun(row) {
  // Properties are added in a fixed order so the resulting key order is stable.
  const run = { id: row.id, experimentId: row.experiment_id };
  if (row.name) run.name = row.name;
  run.status = row.status;
  run.startedAt = row.started_at;
  if (row.completed_at) run.completedAt = row.completed_at;
  run.config = JSON.parse(row.config_json);
  if (row.report_json) run.report = JSON.parse(row.report_json);
  if (row.error) run.error = row.error;
  return run;
}
|
|
2175
|
+
|
|
1929
2176
|
// src/power-analysis.ts
|
|
1930
2177
|
function requiredSampleSize(opts) {
|
|
1931
2178
|
const effect = opts.effect;
|
|
@@ -2486,6 +2733,56 @@ function paretoFrontier(candidates, objectives) {
|
|
|
2486
2733
|
}));
|
|
2487
2734
|
return { frontier, dominated, dominanceMap };
|
|
2488
2735
|
}
|
|
2736
|
+
/**
 * Collapses several objectives into one weighted score per candidate.
 *
 * Each objective is min-max normalised over the finite values seen across
 * `candidates`, flipped for objectives whose `direction` is not "maximize",
 * and combined using `options.weights[name]` (default 1) divided by the sum of
 * all weights. Non-finite objective values contribute nothing.
 *
 * @param {Array} candidates - Items to score.
 * @param {Array<{name: string, direction: string, value: Function}>} objectives
 * @param {{weights?: Object}} [options]
 * @returns {Array<{candidate: *, score: number}>} One entry per candidate, in input order.
 */
function scalarScore(candidates, objectives, options = {}) {
  if (candidates.length === 0) return [];
  const weights = options.weights ?? {};
  const totalWeight = objectives.reduce((s, o) => s + (weights[o.name] ?? 1), 0);
  // Evaluate every objective exactly once per candidate and reuse the values
  // for both the range scan and the scoring pass.
  const columns = objectives.map((obj) => candidates.map((c) => obj.value(c)));
  const ranges = columns.map((values) => {
    // Plain loop instead of Math.min(...values): spreading a very large array
    // into call arguments can overflow the call stack.
    let min = Infinity;
    let max = -Infinity;
    for (const v of values) {
      if (!Number.isFinite(v)) continue;
      if (v < min) min = v;
      if (v > max) max = v;
    }
    if (min === Infinity) return { min: 0, max: 1 };
    // A flat objective gets a unit-wide range so the division below is defined.
    return { min, max: max === min ? min + 1 : max };
  });
  return candidates.map((c, row) => {
    let score = 0;
    objectives.forEach((obj, col) => {
      const v = columns[col][row];
      if (!Number.isFinite(v)) return;
      const { min, max } = ranges[col];
      const normalised = (v - min) / (max - min);
      const directional = obj.direction === "maximize" ? normalised : 1 - normalised;
      // Weights that sum to zero would turn every score into NaN (0 / 0);
      // treat that case as "no objective contributes".
      const weight = totalWeight === 0 ? 0 : (weights[obj.name] ?? 1) / totalWeight;
      score += directional * weight;
    });
    return { candidate: c, score };
  });
}
|
|
2761
|
+
function crowdingDistance(candidates, objectives) {
|
|
2762
|
+
const distances = new Map(candidates.map((c) => [c, 0]));
|
|
2763
|
+
for (const obj of objectives) {
|
|
2764
|
+
const sorted = [...candidates].sort((a, b) => obj.value(a) - obj.value(b));
|
|
2765
|
+
const min = obj.value(sorted[0]);
|
|
2766
|
+
const max = obj.value(sorted[sorted.length - 1]);
|
|
2767
|
+
const range = max - min || 1;
|
|
2768
|
+
distances.set(sorted[0], Infinity);
|
|
2769
|
+
distances.set(sorted[sorted.length - 1], Infinity);
|
|
2770
|
+
for (let i = 1; i < sorted.length - 1; i++) {
|
|
2771
|
+
const prev = obj.value(sorted[i - 1]);
|
|
2772
|
+
const next = obj.value(sorted[i + 1]);
|
|
2773
|
+
const current = distances.get(sorted[i]);
|
|
2774
|
+
if (current === Infinity) continue;
|
|
2775
|
+
distances.set(sorted[i], current + (next - prev) / range);
|
|
2776
|
+
}
|
|
2777
|
+
}
|
|
2778
|
+
return candidates.map((c) => ({ candidate: c, distance: distances.get(c) ?? 0 }));
|
|
2779
|
+
}
|
|
2780
|
+
/**
 * Returns the Pareto frontier of `candidates` as `{ candidate, distance }`
 * entries ordered by descending crowding distance.
 * An empty frontier yields an empty array.
 */
function paretoFrontierWithCrowding(candidates, objectives) {
  const frontier = paretoFrontier(candidates, objectives).frontier;
  if (frontier.length === 0) {
    return [];
  }
  const ranked = crowdingDistance(frontier, objectives);
  ranked.sort((left, right) => right.distance - left.distance);
  return ranked;
}
|
|
2489
2786
|
|
|
2490
2787
|
// src/harness-optimizer.ts
|
|
2491
2788
|
var DEFAULT_HARNESS_OBJECTIVES = [
|
|
@@ -5095,10 +5392,10 @@ function analyzeSeries(values, options = {}) {
|
|
|
5095
5392
|
return { state: "insufficient-data", windowMean: 0, windowCv: 0, tailRun: 0, stable: false };
|
|
5096
5393
|
}
|
|
5097
5394
|
const tail = values.slice(-window);
|
|
5098
|
-
const
|
|
5099
|
-
const variance2 = tail.reduce((acc, v) => acc + (v -
|
|
5395
|
+
const mean7 = tail.reduce((a, b) => a + b, 0) / tail.length;
|
|
5396
|
+
const variance2 = tail.reduce((acc, v) => acc + (v - mean7) ** 2, 0) / tail.length;
|
|
5100
5397
|
const stdDev = Math.sqrt(variance2);
|
|
5101
|
-
const refMean = Math.abs(
|
|
5398
|
+
const refMean = Math.abs(mean7) > 1e-9 ? Math.abs(mean7) : 1;
|
|
5102
5399
|
const cv = stdDev / refMean;
|
|
5103
5400
|
const stable = tail.length >= window && cv <= stableCv;
|
|
5104
5401
|
let tailRun = 0;
|
|
@@ -5119,7 +5416,7 @@ function analyzeSeries(values, options = {}) {
|
|
|
5119
5416
|
} else {
|
|
5120
5417
|
state = "noisy";
|
|
5121
5418
|
}
|
|
5122
|
-
return { state, windowMean:
|
|
5419
|
+
return { state, windowMean: mean7, windowCv: cv, tailRun, stable };
|
|
5123
5420
|
}
|
|
5124
5421
|
|
|
5125
5422
|
// src/state-continuity.ts
|
|
@@ -6047,12 +6344,12 @@ async function paraphraseRobustness(prompt, mutators, scoreFn, options = {}) {
|
|
|
6047
6344
|
variantScores.push({ mutator: id, score, mutated });
|
|
6048
6345
|
all.push(score);
|
|
6049
6346
|
}
|
|
6050
|
-
const
|
|
6051
|
-
const variance2 = all.reduce((a, v) => a + (v -
|
|
6347
|
+
const mean7 = all.reduce((a, b) => a + b, 0) / all.length;
|
|
6348
|
+
const variance2 = all.reduce((a, v) => a + (v - mean7) ** 2, 0) / all.length;
|
|
6052
6349
|
const stdDev = Math.sqrt(variance2);
|
|
6053
|
-
const ref = Math.abs(
|
|
6350
|
+
const ref = Math.abs(mean7) > 1e-9 ? Math.abs(mean7) : 1;
|
|
6054
6351
|
const robustness = Math.max(0, 1 - stdDev / ref);
|
|
6055
|
-
return { originalScore, variantScores, meanScore:
|
|
6352
|
+
return { originalScore, variantScores, meanScore: mean7, stdDev, robustness };
|
|
6056
6353
|
}
|
|
6057
6354
|
/** Prompt mutator that returns the prompt converted to lower case. */
var lowercaseMutator = (prompt) => {
  return prompt.toLowerCase();
};
|
|
6058
6355
|
var sentenceReorderMutator = (p, seed) => {
|
|
@@ -6973,8 +7270,8 @@ async function prmBestOfN(store, grader, runIds) {
|
|
|
6973
7270
|
if (runIds.length === 0) throw new Error("prmBestOfN: at least 1 candidate required");
|
|
6974
7271
|
const graded = await Promise.all(runIds.map((id) => grader.grade(store, id)));
|
|
6975
7272
|
const ranked = [...graded].sort((a, b) => b.aggregateScore - a.aggregateScore);
|
|
6976
|
-
const
|
|
6977
|
-
const variance2 = graded.reduce((a, g) => a + (g.aggregateScore -
|
|
7273
|
+
const mean7 = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length;
|
|
7274
|
+
const variance2 = graded.reduce((a, g) => a + (g.aggregateScore - mean7) ** 2, 0) / graded.length;
|
|
6978
7275
|
return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
|
|
6979
7276
|
}
|
|
6980
7277
|
async function prmEnsembleBestOfN(store, graders, runIds) {
|
|
@@ -6996,8 +7293,8 @@ async function prmEnsembleBestOfN(store, graders, runIds) {
|
|
|
6996
7293
|
const ranked = [...byRun.values()].sort(
|
|
6997
7294
|
(a, b) => (bordaScores.get(b.runId) ?? 0) - (bordaScores.get(a.runId) ?? 0)
|
|
6998
7295
|
);
|
|
6999
|
-
const
|
|
7000
|
-
const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore -
|
|
7296
|
+
const mean7 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
|
|
7297
|
+
const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean7) ** 2, 0) / ranked.length;
|
|
7001
7298
|
return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
|
|
7002
7299
|
}
|
|
7003
7300
|
|
|
@@ -7527,8 +7824,8 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
|
|
|
7527
7824
|
const sRuns = runs.filter((r) => r.scenarioId === s.id);
|
|
7528
7825
|
const scores = sRuns.map((r) => r.outcome?.score).filter((x) => typeof x === "number");
|
|
7529
7826
|
if (scores.length < 3) continue;
|
|
7530
|
-
const
|
|
7531
|
-
const variance2 = scores.reduce((a, b) => a + (b -
|
|
7827
|
+
const mean7 = scores.reduce((a, b) => a + b, 0) / scores.length;
|
|
7828
|
+
const variance2 = scores.reduce((a, b) => a + (b - mean7) ** 2, 0) / scores.length;
|
|
7532
7829
|
if (variance2 > varianceThreshold) {
|
|
7533
7830
|
targets.push({
|
|
7534
7831
|
reason: "high-variance",
|
|
@@ -8008,212 +8305,6 @@ async function euAiActReport(ctx, signals) {
|
|
|
8008
8305
|
};
|
|
8009
8306
|
}
|
|
8010
8307
|
|
|
8011
|
-
// src/llm-client.ts
|
|
8012
|
-
/**
 * Error raised for a failed LLM HTTP call. Carries the HTTP `status`, the raw
 * response `body` and the `model` the request was made for.
 */
var LlmCallError = class extends Error {
  status;
  body;
  model;
  constructor(message, status, body, model) {
    super(message);
    Object.assign(this, { status, body, model });
    this.name = "LlmCallError";
  }
};
|
|
8024
|
-
// Base URL used when the caller does not supply one.
var DEFAULT_BASE_URL = "https://router.tangle.tools/v1";
// Per-request timeout in milliseconds (one minute).
var DEFAULT_TIMEOUT_MS = 60 * 1000;
// Default value for the `maxRetries` option.
var DEFAULT_MAX_RETRIES = 3;
// HTTP statuses treated as transient and therefore worth retrying.
var RETRYABLE_STATUS = /* @__PURE__ */ new Set([429, 502, 503, 504]);
|
|
8028
|
-
/**
 * Decides whether a failed attempt should be retried.
 * An `LlmCallError` is retryable when its status is in `RETRYABLE_STATUS`; any
 * other `Error` is retryable when it is an abort/timeout or its message names
 * a known transient network failure. Non-Error values are never retryable.
 */
function isRetryableError(err) {
  if (err instanceof LlmCallError) {
    return RETRYABLE_STATUS.has(err.status);
  }
  if (!(err instanceof Error)) {
    return false;
  }
  const abortedOrTimedOut = err.name === "AbortError" || err.name === "TimeoutError";
  return abortedOrTimedOut || /fetch failed|ECONNRESET|ETIMEDOUT|EAI_AGAIN/i.test(err.message);
}
|
|
8035
|
-
/**
 * Reads the `retry-after` response header and converts it to a delay.
 *
 * @param {{get(name: string): (string|null)}} headers - Response headers.
 * @returns {number|null} Delay in milliseconds, or `null` when the header is
 *   missing or is neither a non-negative number of seconds nor a parseable date.
 */
function parseRetryAfter(headers) {
  const h = headers.get("retry-after");
  if (!h) return null;
  const asNumber = Number(h);
  // Accept 0 as a delay in its own right ("retry immediately") instead of
  // letting it fall through to Date.parse("0"), whose result for such a
  // string is engine-specific.
  if (Number.isFinite(asNumber) && asNumber >= 0) return asNumber * 1e3;
  const asDate = Date.parse(h);
  // A date in the past means no waiting is required.
  if (Number.isFinite(asDate)) return Math.max(0, asDate - Date.now());
  return null;
}
|
|
8044
|
-
/**
 * Exponential backoff delay for a zero-based attempt number:
 * 500 ms doubled per attempt, capped at 16 seconds.
 */
function backoffMs(attempt) {
  const uncapped = 500 * 2 ** attempt;
  return Math.min(uncapped, 16e3);
}
|
|
8047
|
-
/**
 * Builds the request headers for an LLM call.
 * A custom `authHeader` ({ name, value }) takes precedence; otherwise a
 * `Bearer` Authorization header is added when `bearer` or `apiKey` is set.
 */
function buildHeaders(opts) {
  const headers = {
    "Content-Type": "application/json",
    Accept: "application/json"
  };
  const { authHeader } = opts;
  if (authHeader) {
    headers[authHeader.name] = authHeader.value;
    return headers;
  }
  if (opts.bearer || opts.apiKey) {
    headers.Authorization = `Bearer ${opts.bearer ?? opts.apiKey}`;
  }
  return headers;
}
|
|
8059
|
-
/**
 * Returns true when a 400 response body mentions one of the phrases that
 * indicate the structured-output request format was rejected.
 */
function isSchemaRejection(status, body) {
  if (status !== 400) {
    return false;
  }
  const haystack = body.toLowerCase();
  const markers = ["response_format", "json_schema", "is unavailable", "not supported"];
  return markers.some((marker) => haystack.includes(marker));
}
|
|
8064
|
-
/**
 * Builds the chat-completions request payload.
 * `temperature` defaults to 0 and `max_tokens` is only set when `maxTokens`
 * is provided. A `jsonSchema` produces a strict `json_schema` response format
 * unless `forceJsonObject` is set; in that case, or when only `jsonMode` is
 * requested, the plain `json_object` format is used.
 */
function buildBody(req, forceJsonObject) {
  const payload = {
    model: req.model,
    messages: req.messages,
    temperature: req.temperature ?? 0
  };
  if (req.maxTokens != null) {
    payload.max_tokens = req.maxTokens;
  }
  const useSchema = Boolean(req.jsonSchema) && !forceJsonObject;
  if (useSchema) {
    const { name, schema } = req.jsonSchema;
    payload.response_format = {
      type: "json_schema",
      json_schema: { name, schema, strict: true }
    };
    return payload;
  }
  if (req.jsonMode || req.jsonSchema) {
    payload.response_format = { type: "json_object" };
  }
  return payload;
}
|
|
8081
|
-
/** Resolves (with no value) after roughly `ms` milliseconds. */
async function sleep(ms) {
  await new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
|
|
8084
|
-
/**
 * Removes a surrounding Markdown code fence (optionally tagged `json`) from
 * `raw` and returns the trimmed inner text; unfenced input is only trimmed.
 */
function stripFencedJson(raw) {
  const text = raw.trim();
  const fenced = /^```(?:json)?\s*\n?([\s\S]*?)\n?```\s*$/.exec(text);
  if (fenced === null) {
    return text;
  }
  return fenced[1].trim();
}
|
|
8089
|
-
/**
 * Sends one chat-completions request, retrying transient failures.
 *
 * @param {object} req - Request: `model`, `messages`, and optional
 *   `temperature`, `maxTokens`, `jsonMode`, `jsonSchema`, `timeoutMs`.
 * @param {object} [opts] - Transport options: `baseUrl`, `defaultTimeoutMs`,
 *   `maxRetries`, `fetch`, plus the auth options read by `buildHeaders`.
 * @returns {Promise<object>} `content`, `usage`, `costUsd`, `model`,
 *   `durationMs` (for the successful attempt) and the `raw` response JSON.
 * @throws {LlmCallError} For a non-OK HTTP response that is not retried.
 * @throws {Error} For network or timeout failures once attempts are exhausted.
 */
async function callLlm(req, opts = {}) {
  const baseUrl = (opts.baseUrl ?? DEFAULT_BASE_URL).replace(/\/+$/, "");
  const url = `${baseUrl}/chat/completions`;
  const timeoutMs = req.timeoutMs ?? opts.defaultTimeoutMs ?? DEFAULT_TIMEOUT_MS;
  const maxRetries = opts.maxRetries ?? DEFAULT_MAX_RETRIES;
  // `maxRetries` is the total number of attempts. Always make at least one:
  // a value of 0 (or NaN) previously skipped the request entirely and threw
  // an Error whose message was "undefined".
  const attempts = maxRetries >= 1 ? maxRetries : 1;
  const fetchFn = opts.fetch ?? globalThis.fetch;
  const headers = buildHeaders(opts);
  // The payload is identical for every attempt, so serialise it once.
  const payload = JSON.stringify(buildBody(req, false));
  let lastErr;
  for (let attempt = 0; attempt < attempts; attempt++) {
    const controller = new AbortController();
    const timeoutHandle = setTimeout(() => controller.abort(), timeoutMs);
    const started = Date.now();
    let retryDelayMs;
    try {
      const res = await fetchFn(url, {
        method: "POST",
        headers,
        body: payload,
        signal: controller.signal
      });
      if (!res.ok) {
        const body = await res.text();
        const err = new LlmCallError(
          `LLM call ${res.status}: ${body.slice(0, 300)}`,
          res.status,
          body,
          req.model
        );
        if (!(RETRYABLE_STATUS.has(res.status) && attempt < attempts - 1)) {
          throw err;
        }
        lastErr = err;
        // Prefer the server-provided delay over our own backoff.
        retryDelayMs = parseRetryAfter(res.headers) ?? backoffMs(attempt);
      } else {
        const json = await res.json();
        const choice = json.choices?.[0];
        const usageRaw = json.usage ?? {};
        const costFromProxy = json._response_cost ?? json.cost_usd;
        return {
          content: choice?.message?.content ?? "",
          usage: {
            promptTokens: Number(usageRaw.prompt_tokens ?? 0),
            completionTokens: Number(usageRaw.completion_tokens ?? 0),
            totalTokens: Number(usageRaw.total_tokens ?? 0),
            cachedPromptTokens: usageRaw.prompt_tokens_details && typeof usageRaw.prompt_tokens_details === "object" ? Number(
              usageRaw.prompt_tokens_details.cached_tokens ?? 0
            ) : void 0
          },
          costUsd: typeof costFromProxy === "number" ? costFromProxy : null,
          model: json.model ?? req.model,
          durationMs: Date.now() - started,
          raw: json
        };
      }
    } catch (err) {
      lastErr = err;
      if (!(attempt < attempts - 1 && isRetryableError(err))) {
        throw err;
      }
      retryDelayMs = backoffMs(attempt);
    } finally {
      // Cleared only after the body has been read, so the timeout also bounds
      // a response whose headers arrive but whose body stalls.
      clearTimeout(timeoutHandle);
    }
    await sleep(retryDelayMs);
  }
  // Defensive: every iteration above returns, throws, or schedules a retry.
  throw lastErr instanceof Error ? lastErr : new Error(String(lastErr));
}
|
|
8156
|
-
/**
 * Calls the LLM and parses the reply as JSON.
 * JSON mode is enabled by default when no `jsonSchema` is given. If the
 * provider rejects a schema request (see `isSchemaRejection`), the call is
 * repeated once in plain JSON mode without the schema.
 *
 * @returns {Promise<{value: *, result: object}>} Parsed value plus the raw call result.
 */
async function callLlmJson(req, opts = {}) {
  const callAndParse = async (request) => {
    const result = await callLlm(request, opts);
    const value = parseJsonSafely(result.content, result.model);
    return { value, result };
  };
  try {
    return await callAndParse({ ...req, jsonMode: req.jsonMode ?? !req.jsonSchema });
  } catch (err) {
    const schemaRejected = err instanceof LlmCallError && isSchemaRejection(err.status, err.body) && req.jsonSchema;
    if (!schemaRejected) {
      throw err;
    }
    return callAndParse({ ...req, jsonMode: true, jsonSchema: void 0 });
  }
}
|
|
8171
|
-
/**
 * Parses LLM output as JSON after stripping an optional Markdown code fence.
 *
 * @param {string} content - Raw model output.
 * @param {string} model - Model name, included in the error message.
 * @returns {*} The parsed value.
 * @throws {Error} When the content is not valid JSON. The message contains the
 *   parser error and the first 800 characters of the raw content; the original
 *   parse error is attached as `cause`.
 */
function parseJsonSafely(content, model) {
  const stripped = stripFencedJson(content);
  try {
    return JSON.parse(stripped);
  } catch (err) {
    throw new Error(
      `LLM returned non-JSON content (model=${model}): ${err instanceof Error ? err.message : String(err)}
--- raw content ---
${content.slice(0, 800)}`,
      // Keep the original SyntaxError (and its stack) reachable for callers.
      { cause: err }
    );
  }
}
|
|
8183
|
-
/**
 * Sends a minimal "ping" request to `model` and reports whether it succeeded.
 * Never throws: failures are returned as `{ ok: false, error }`.
 *
 * @returns {Promise<{ok: boolean, latencyMs: number, error: (string|null)}>}
 */
async function probeLlm(model, opts = {}) {
  const start = Date.now();
  const elapsed = () => Date.now() - start;
  const request = {
    model,
    messages: [{ role: "user", content: "ping" }],
    maxTokens: 64,
    timeoutMs: opts.timeoutMs ?? 3e4
  };
  try {
    await callLlm(request, opts);
  } catch (err) {
    const error = err instanceof Error ? err.message : String(err);
    return { ok: false, latencyMs: elapsed(), error };
  }
  return { ok: true, latencyMs: elapsed(), error: null };
}
|
|
8204
|
-
/**
 * Thin wrapper that remembers default call options. Per-call options are
 * merged over the defaults, with per-call values winning.
 */
var LlmClient = class {
  opts;
  constructor(opts = {}) {
    this.opts = opts;
  }
  /** Same as `callLlm`, using the stored options merged with `per`. */
  call(req, per) {
    const merged = { ...this.opts, ...per };
    return callLlm(req, merged);
  }
  /** Same as `callLlmJson`, using the stored options merged with `per`. */
  callJson(req, per) {
    const merged = { ...this.opts, ...per };
    return callLlmJson(req, merged);
  }
};
|
|
8216
|
-
|
|
8217
8308
|
// src/multi-layer-verifier.ts
|
|
8218
8309
|
function gradeSemanticStatus(input) {
|
|
8219
8310
|
if (!input.available) return "error";
|
|
@@ -9771,7 +9862,7 @@ function buildScenarioScore(scenario, matches2, falsePositives) {
|
|
|
9771
9862
|
const total = scenario.references.length;
|
|
9772
9863
|
const matchedWeight = matches2.filter((match) => match.matched).reduce((sum2, match) => sum2 + match.weight, 0);
|
|
9773
9864
|
const totalWeight = matches2.reduce((sum2, match) => sum2 + match.weight, 0);
|
|
9774
|
-
const
|
|
9865
|
+
const precision2 = ratio(matched, matched + falsePositives);
|
|
9775
9866
|
const recall = ratio(matched, total);
|
|
9776
9867
|
return {
|
|
9777
9868
|
scenarioId: scenario.id,
|
|
@@ -9781,9 +9872,9 @@ function buildScenarioScore(scenario, matches2, falsePositives) {
|
|
|
9781
9872
|
falsePositives,
|
|
9782
9873
|
matchedWeight,
|
|
9783
9874
|
totalWeight,
|
|
9784
|
-
precision,
|
|
9875
|
+
precision: precision2,
|
|
9785
9876
|
recall,
|
|
9786
|
-
f1: f1(
|
|
9877
|
+
f1: f1(precision2, recall),
|
|
9787
9878
|
matches: matches2
|
|
9788
9879
|
};
|
|
9789
9880
|
}
|
|
@@ -9801,7 +9892,7 @@ function aggregateScenarioScores(scores) {
|
|
|
9801
9892
|
const falsePositives = sum(scores.map((score) => score.falsePositives));
|
|
9802
9893
|
const matchedWeight = sum(scores.map((score) => score.matchedWeight));
|
|
9803
9894
|
const totalWeight = sum(scores.map((score) => score.totalWeight));
|
|
9804
|
-
const
|
|
9895
|
+
const precision2 = ratio(matched, matched + falsePositives);
|
|
9805
9896
|
const recall = ratio(matched, total);
|
|
9806
9897
|
return {
|
|
9807
9898
|
matched,
|
|
@@ -9809,9 +9900,9 @@ function aggregateScenarioScores(scores) {
|
|
|
9809
9900
|
falsePositives,
|
|
9810
9901
|
matchedWeight,
|
|
9811
9902
|
totalWeight,
|
|
9812
|
-
precision,
|
|
9903
|
+
precision: precision2,
|
|
9813
9904
|
recall,
|
|
9814
|
-
f1: f1(
|
|
9905
|
+
f1: f1(precision2, recall),
|
|
9815
9906
|
weightedRecall: ratio(matchedWeight, totalWeight)
|
|
9816
9907
|
};
|
|
9817
9908
|
}
|
|
@@ -9831,8 +9922,8 @@ function emptyAggregate() {
|
|
|
9831
9922
|
/**
 * Reports whether `score.bySplit` carries a defined entry for `split`.
 *
 * @param {{ bySplit: Record<string, unknown> }} score - Object exposing a `bySplit` lookup.
 * @param {string} split - Key to look up.
 * @returns {boolean} `true` unless the entry is `undefined` (a `null` entry counts as present).
 */
function hasSplit(score, split) {
  const entry = score.bySplit[split];
  return entry !== undefined;
}
|
|
9834
|
-
function f1(
|
|
9835
|
-
return
|
|
9925
|
+
/**
 * Harmonic mean of precision and recall.
 *
 * @param {number} precision2 - Precision value.
 * @param {number} recall - Recall value.
 * @returns {number} `2PR / (P + R)`, or 0 when `P + R` is exactly 0.
 */
function f1(precision2, recall) {
  const denominator = precision2 + recall;
  if (denominator === 0) {
    return 0;
  }
  return 2 * precision2 * recall / denominator;
}
|
|
9837
9928
|
function ratio(numerator, denominator) {
|
|
9838
9929
|
return denominator > 0 ? numerator / denominator : 0;
|
|
@@ -9956,14 +10047,14 @@ function referenceReplayRunsToSteeringRows(runs, options = {}) {
|
|
|
9956
10047
|
function referenceReplayScenarioToRunScore(scenarioScore, durationMs = 0) {
|
|
9957
10048
|
const success = scenarioScore.f1;
|
|
9958
10049
|
const recall = scenarioScore.recall;
|
|
9959
|
-
const
|
|
10050
|
+
const precision2 = scenarioScore.precision;
|
|
9960
10051
|
const failed = scenarioScore.total > 0 && scenarioScore.matched === 0;
|
|
9961
10052
|
return {
|
|
9962
10053
|
success,
|
|
9963
10054
|
goalProgress: recall,
|
|
9964
|
-
repoGroundedness:
|
|
9965
|
-
driftPenalty: 1 -
|
|
9966
|
-
toolUseQuality:
|
|
10055
|
+
repoGroundedness: precision2,
|
|
10056
|
+
driftPenalty: 1 - precision2,
|
|
10057
|
+
toolUseQuality: precision2,
|
|
9967
10058
|
patchQuality: 0,
|
|
9968
10059
|
testReality: scenarioScore.total > 0 ? 1 : 0,
|
|
9969
10060
|
finalGate: success,
|
|
@@ -9972,10 +10063,569 @@ function referenceReplayScenarioToRunScore(scenarioScore, durationMs = 0) {
|
|
|
9972
10063
|
wallSeconds: Math.max(0, durationMs / 1e3),
|
|
9973
10064
|
notes: [
|
|
9974
10065
|
`reference-replay matched ${scenarioScore.matched}/${scenarioScore.total}`,
|
|
9975
|
-
`precision=${
|
|
10066
|
+
`precision=${precision2.toFixed(3)} recall=${recall.toFixed(3)} f1=${success.toFixed(3)}`
|
|
9976
10067
|
]
|
|
9977
10068
|
};
|
|
9978
10069
|
}
|
|
10070
|
+
|
|
10071
|
+
// src/prompt-evolution.ts
|
|
10072
|
+
/**
 * Trial cache backed by a plain `Map`. Nothing is persisted; entries live as
 * long as the instance does.
 */
var InMemoryTrialCache = class {
  /** Backing storage, keyed by whatever string the caller supplies. */
  store = /* @__PURE__ */ new Map();

  /** Returns the value stored under `key`, or `undefined` when absent. */
  get(key) {
    const { store } = this;
    return store.get(key);
  }

  /** Stores `value` under `key`, replacing any previous entry. Returns nothing. */
  set(key, value) {
    const { store } = this;
    store.set(key, value);
  }

  /** Number of entries currently held. */
  size() {
    const { store } = this;
    return store.size;
  }

  /** Removes every entry. */
  clear() {
    const { store } = this;
    store.clear();
  }
};
|
|
10087
|
+
/**
 * Runs the generational prompt-evolution loop.
 *
 * Each generation: score the population, aggregate the trials, compute the
 * Pareto front and a scalar ranking, record a report, then (unless stopping)
 * ask `nextPopulation` for the following generation.
 *
 * The loop ends when `config.generations` reports have been produced, or
 * early when the winner id and the set of front ids are unchanged from the
 * previous generation (disabled by `config.earlyStopOnNoImprovement === false`).
 *
 * @param {object} config - Run configuration; reads `seedVariants`,
 *   `generations`, `scenarioIds`, `objectives`, `scalarWeights`, `runId`,
 *   `target`, `earlyStopOnNoImprovement` and the optional `onProgress` callback
 *   (plus whatever `scorePopulation` / `nextPopulation` read).
 * @returns {Promise<object>} `{ runId, target, generations, bestVariant, bestAggregate }`.
 */
async function runPromptEvolution(config) {
  // Keep the call on `config` so the callback sees the same `this` as before.
  const emit = (event) => config.onProgress?.(event);
  const byScoreDescending = (left, right) => right.score - left.score;

  const generations = [];
  let population = [...config.seedVariants];
  let bestVariant = population[0];
  let bestAggregate = null;

  for (let generation = 0; generation < config.generations; generation += 1) {
    emit({ type: "generation-start", generation, populationSize: population.length });

    const trials = await scorePopulation(population, config, generation);
    const aggregates = aggregateTrials(population, config.scenarioIds, trials);
    const front = paretoFrontierWithCrowding(aggregates, config.objectives);
    const paretoFrontIds = front.map((entry) => entry.candidate.variantId);

    const scored = scalarScore(aggregates, config.objectives, { weights: config.scalarWeights });
    scored.sort(byScoreDescending);
    const winnerId = scored[0]?.candidate.variantId ?? aggregates[0]?.variantId ?? population[0].id;

    const report = {
      runId: config.runId,
      target: config.target,
      generation,
      variants: population,
      aggregates,
      paretoFrontIds,
      winnerId,
      trials
    };
    generations.push(report);
    emit({ type: "generation-complete", report });

    // Only promote a new best when the winner actually has an aggregate.
    const winnerAggregate = aggregates.find((aggregate) => aggregate.variantId === winnerId);
    if (winnerAggregate) {
      const winner = population.find((variant) => variant.id === winnerId);
      bestVariant = winner ? winner : bestVariant;
      bestAggregate = winnerAggregate;
    }

    const previous = generations.length >= 2 ? generations[generations.length - 2] : void 0;
    if (config.earlyStopOnNoImprovement !== false && previous) {
      // The current front is compared as a de-duplicated id list.
      const currentFrontIds = [...new Set(paretoFrontIds)];
      const unchanged = previous.winnerId === winnerId && samePopulation(previous.paretoFrontIds, currentFrontIds);
      if (unchanged) {
        emit({ type: "converged", generation, reason: "no improvement vs previous generation" });
        break;
      }
    }

    const isFinalGeneration = generation === config.generations - 1;
    if (isFinalGeneration) {
      break;
    }
    population = await nextPopulation(population, aggregates, trials, front, config, generation + 1);
  }

  // With no scored winner, fall back to an aggregate built from zero trials.
  const fallbackAggregate = () =>
    aggregateTrials(population, config.scenarioIds, []).find((aggregate) => aggregate.variantId === bestVariant.id);

  return {
    runId: config.runId,
    target: config.target,
    generations,
    bestVariant,
    bestAggregate: bestAggregate ?? fallbackAggregate()
  };
}
|
|
10138
|
+
/**
 * Scores every (variant, scenario, rep) combination of the population.
 *
 * One job is queued per combination, in variant → scenario → rep order, and
 * the jobs are handed to `runWithConcurrency` with `config.scoreConcurrency`.
 * A truthy entry in `config.cache` under `"<variantId>|<scenarioId>|<rep>"`
 * is returned as-is; otherwise `config.scoreAdapter.score` is awaited and the
 * result is written back to the cache. Either way a `"trial-complete"`
 * progress event is emitted.
 *
 * @param {Array<{ id: string }>} population - Variants to score.
 * @param {object} config - Reads `scenarioIds`, `reps`, `cache`, `scoreAdapter`,
 *   `scoreConcurrency` and the optional `onProgress` callback.
 * @param {number} generation - Generation index, echoed in progress events.
 * @returns {Promise<Array>} Trial results in job order.
 */
async function scorePopulation(population, config, generation) {
  const announce = (variant, scenarioId, rep, trial, cached) =>
    config.onProgress?.({
      type: "trial-complete",
      generation,
      variantId: variant.id,
      scenarioId,
      rep,
      ok: trial.ok,
      score: trial.score,
      cached
    });

  const makeJob = (variant, scenarioId, rep) => async () => {
    const cacheKey = `${variant.id}|${scenarioId}|${rep}`;
    const hit = config.cache?.get(cacheKey);
    if (hit) {
      announce(variant, scenarioId, rep, hit, true);
      return hit;
    }
    const fresh = await config.scoreAdapter.score({ variant, scenarioId, rep });
    config.cache?.set(cacheKey, fresh);
    announce(variant, scenarioId, rep, fresh, false);
    return fresh;
  };

  const jobs = [];
  for (const variant of population) {
    for (const scenarioId of config.scenarioIds) {
      for (let rep = 0; rep < config.reps; rep += 1) {
        jobs.push(makeJob(variant, scenarioId, rep));
      }
    }
  }
  return runWithConcurrency(jobs, config.scoreConcurrency);
}
|
|
10178
|
+
/**
 * Runs `jobs` with at most `concurrency` of them in flight, preserving order.
 *
 * The worker count is sanitised: values that do not coerce to a number
 * (including `undefined` and `NaN`) fall back to 1, fractions are floored,
 * values below 1 become 1, and the count never exceeds `jobs.length` (so
 * `Infinity` means "all at once"). Previously a missing or `NaN` concurrency
 * started zero workers and returned an array of holes without running any
 * job, and `Infinity` threw a `RangeError`.
 *
 * @param {Array<() => Promise<unknown>>} jobs - Thunks to invoke.
 * @param {number} [concurrency] - Maximum number of jobs in flight.
 * @returns {Promise<unknown[]>} Results; index `i` holds the result of `jobs[i]`.
 *   Rejects with the first job failure.
 */
async function runWithConcurrency(jobs, concurrency) {
  const results = new Array(jobs.length);
  const parsed = Number(concurrency);
  const requested = Number.isNaN(parsed) ? 1 : Math.floor(parsed);
  const limit = Math.min(jobs.length, Math.max(1, requested));
  let next = 0;

  // Workers claim the next index synchronously, so no job runs twice.
  async function worker() {
    while (next < jobs.length) {
      const i = next++;
      results[i] = await jobs[i]();
    }
  }

  await Promise.all(Array.from({ length: limit }, () => worker()));
  return results;
}
|
|
10192
|
+
/**
 * Rolls raw trials up into one aggregate per variant.
 *
 * For every variant a per-scenario summary is built from that variant's
 * trials: means of `score`, `cost` and `durationMs` over the trials with a
 * truthy `ok` (missing cost/duration count as 0), the ok-rate over all trials
 * of the scenario, the trial count, and averaged metrics. The variant-level
 * figures are means of the per-scenario figures.
 *
 * @param {Array<{ id: string }>} population - Variants to summarise.
 * @param {string[]} scenarioIds - Scenarios every variant is summarised over.
 * @param {Array<object>} trials - Trials carrying `variantId`, `scenarioId`, `ok`, `score`, ...
 * @returns {Array<object>} One aggregate per variant, in population order.
 */
function aggregateTrials(population, scenarioIds, trials) {
  const summariseScenario = (variantId, scenarioId, variantTrials) => {
    const attempted = variantTrials.filter((trial) => trial.scenarioId === scenarioId);
    const succeeded = attempted.filter((trial) => trial.ok);
    return {
      variantId,
      scenarioId,
      meanScore: mean5(succeeded.map((trial) => trial.score)),
      meanCost: mean5(succeeded.map((trial) => trial.cost ?? 0)),
      meanDurationMs: mean5(succeeded.map((trial) => trial.durationMs ?? 0)),
      okRate: attempted.length === 0 ? 0 : succeeded.length / attempted.length,
      trials: attempted.length,
      metrics: aggregateMetrics(succeeded.map((trial) => trial.metrics ?? {}))
    };
  };

  const aggregates = [];
  for (const variant of population) {
    const variantTrials = trials.filter((trial) => trial.variantId === variant.id);
    const scenarios = scenarioIds.map((scenarioId) => summariseScenario(variant.id, scenarioId, variantTrials));
    const meanOf = (field) => mean5(scenarios.map((scenario) => scenario[field]));
    aggregates.push({
      variantId: variant.id,
      meanScore: meanOf("meanScore"),
      meanCost: meanOf("meanCost"),
      meanDurationMs: meanOf("meanDurationMs"),
      okRate: meanOf("okRate"),
      scenarios,
      metrics: aggregateMetrics(scenarios.map((scenario) => scenario.metrics))
    });
  }
  return aggregates;
}
|
|
10221
|
+
/**
 * Averages metric rows key by key.
 *
 * Only values for which `Number.isFinite` holds are collected; a key with no
 * finite sample does not appear in the result.
 *
 * @param {Array<Record<string, unknown>>} rows - Metric objects.
 * @returns {Record<string, number>} Mean of the finite samples for each key.
 */
function aggregateMetrics(rows) {
  const samplesByKey = /* @__PURE__ */ new Map();
  for (const row of rows) {
    for (const [key, value] of Object.entries(row)) {
      if (!Number.isFinite(value)) continue;
      if (!samplesByKey.has(key)) {
        samplesByKey.set(key, []);
      }
      samplesByKey.get(key).push(value);
    }
  }

  const averaged = {};
  samplesByKey.forEach((samples, key) => {
    averaged[key] = mean5(samples);
  });
  return averaged;
}
|
|
10235
|
+
/**
 * Arithmetic mean of `xs`; an empty array yields 0.
 *
 * @param {number[]} xs - Values to average.
 * @returns {number} Sum divided by length, or 0 for no values.
 */
function mean5(xs) {
  const count = xs.length;
  if (count === 0) {
    return 0;
  }
  let total = 0;
  xs.forEach((value) => {
    total += value;
  });
  return total / count;
}
|
|
10239
|
+
/**
 * Builds the population for the next generation.
 *
 * Variants whose ids appear on the front survive unchanged. The variant
 * ranked first by `scalarScore` (falling back to the first current variant)
 * becomes the parent, and `config.mutateAdapter.mutate` is asked for enough
 * children to fill up to `config.populationSize`. Surplus children are
 * dropped, and each kept child is stamped with `generation` and `parentId`.
 *
 * @param {Array<{ id: string }>} current - Current population.
 * @param {Array<object>} aggregates - Per-variant aggregates for `current`.
 * @param {Array<object>} trials - Raw trials of the current generation.
 * @param {Array<{ candidate: { variantId: string } }>} front - Front entries.
 * @param {object} config - Reads `objectives`, `scalarWeights`, `populationSize`, `mutateAdapter`.
 * @param {number} nextGeneration - Index assigned to the children.
 * @returns {Promise<Array<object>>} Survivors followed by the new children.
 */
async function nextPopulation(current, aggregates, trials, front, config, nextGeneration) {
  const frontIds = new Set(front.map((entry) => entry.candidate.variantId));
  const survivors = current.filter((variant) => frontIds.has(variant.id));

  const ranking = scalarScore(aggregates, config.objectives, { weights: config.scalarWeights });
  ranking.sort((left, right) => right.score - left.score);
  const parentId = ranking[0]?.candidate.variantId ?? current[0].id;
  const parent = current.find((variant) => variant.id === parentId) ?? current[0];
  const parentAggregate = aggregates.find((aggregate) => aggregate.variantId === parent.id) ?? aggregates[0];

  // Evidence handed to the mutator: the parent's three best and three worst ok trials.
  const topTrials = topKTrialsByScore(trials, parent.id, 3);
  const bottomTrials = bottomKTrialsByScore(trials, parent.id, 3);

  const childCount = Math.max(0, config.populationSize - survivors.length);
  if (!(childCount > 0)) {
    return [...survivors];
  }

  const proposed = await config.mutateAdapter.mutate({
    parent,
    parentAggregate,
    topTrials,
    bottomTrials,
    childCount,
    generation: nextGeneration
  });
  const children = proposed
    .slice(0, childCount)
    .map((child) => ({ ...child, generation: nextGeneration, parentId: parent.id }));
  return [...survivors, ...children];
}
|
|
10263
|
+
/**
 * Picks the `k` highest-scoring ok trials of one variant.
 *
 * @param {Array<object>} trials - Trials carrying `variantId`, `ok` and `score`.
 * @param {string} variantId - Variant to select.
 * @param {number} k - Maximum number of trials returned.
 * @returns {Array<object>} New array sorted by descending score; the input is not reordered.
 */
function topKTrialsByScore(trials, variantId, k) {
  const eligible = trials.filter((trial) => trial.variantId === variantId && trial.ok);
  eligible.sort((left, right) => right.score - left.score);
  return eligible.slice(0, k);
}
|
|
10266
|
+
/**
 * Picks the `k` lowest-scoring ok trials of one variant.
 *
 * @param {Array<object>} trials - Trials carrying `variantId`, `ok` and `score`.
 * @param {string} variantId - Variant to select.
 * @param {number} k - Maximum number of trials returned.
 * @returns {Array<object>} New array sorted by ascending score; the input is not reordered.
 */
function bottomKTrialsByScore(trials, variantId, k) {
  const eligible = trials.filter((trial) => trial.variantId === variantId && trial.ok);
  eligible.sort((left, right) => left.score - right.score);
  return eligible.slice(0, k);
}
|
|
10269
|
+
/**
 * Checks whether two id lists have equal length and every id of `b` occurs in `a`.
 *
 * @param {string[]} a - Reference ids.
 * @param {string[]} b - Ids to check against `a`.
 * @returns {boolean} `false` on a length mismatch or when `b` holds an id missing from `a`.
 */
function samePopulation(a, b) {
  if (a.length !== b.length) {
    return false;
  }
  const known = new Set(a);
  for (const id of b) {
    if (!known.has(id)) {
      return false;
    }
  }
  return true;
}
|
|
10274
|
+
|
|
10275
|
+
// src/golden-matcher.ts
|
|
10276
|
+
/**
 * Matches every golden against the candidate texts.
 *
 * Candidates are converted to text with `options.text` (or the default
 * extractor) and lower-cased once; each golden is then tested against all of
 * them with `goldenMatched`.
 *
 * @param {Array<object>} goldens - Golden expectations.
 * @param {Array<unknown>} candidates - Candidate findings.
 * @param {{ text?: (candidate: unknown) => string }} [options] - Optional text extractor.
 * @returns {{ matches: boolean[], hits: number, total: number }} One flag per
 *   golden, the number of truthy flags, and the number of goldens.
 */
function matchGoldens(goldens, candidates, options = {}) {
  const toText = options.text ?? defaultExtract5;
  const haystacks = [];
  for (const candidate of candidates) {
    haystacks.push(toText(candidate).toLowerCase());
  }

  const flags = goldens.map((golden) => goldenMatched(golden, haystacks));
  let hits = 0;
  for (const flag of flags) {
    if (flag) hits += 1;
  }
  return { matches: flags, hits, total: goldens.length };
}
|
|
10286
|
+
/**
 * Default candidate-to-text conversion.
 *
 * Strings pass through; for objects (arrays included) the string-valued own
 * enumerable values are joined with single spaces; anything else goes through
 * `String`, with `null`/`undefined` becoming the empty string.
 *
 * @param {unknown} candidate - Value to render.
 * @returns {string} Text representation.
 */
function defaultExtract5(candidate) {
  if (typeof candidate === "string") {
    return candidate;
  }
  const isObject = Boolean(candidate) && typeof candidate === "object";
  if (!isObject) {
    return String(candidate ?? "");
  }
  return Object.values(candidate)
    .filter((value) => typeof value === "string")
    .join(" ");
}
|
|
10297
|
+
/**
 * Decides whether a golden is covered by any of the haystacks.
 *
 * Phrases in `golden.any` are lower-cased and trimmed, blank phrases are
 * ignored, and a phrase matches when it is a substring of some haystack.
 * If no phrase matches, each `golden.anyRegex` pattern is compiled
 * case-insensitively and tested; patterns that fail to compile are skipped.
 *
 * @param {{ any: string[], anyRegex?: string[] }} golden - Expectation to check.
 * @param {string[]} haystacks - Candidate texts (the caller lower-cases them).
 * @returns {boolean} `true` on the first phrase or pattern hit.
 */
function goldenMatched(golden, haystacks) {
  const phraseHit = golden.any.some((phrase) => {
    const needle = phrase.toLowerCase().trim();
    return needle !== "" && haystacks.some((haystack) => haystack.includes(needle));
  });
  if (phraseHit) {
    return true;
  }

  return (golden.anyRegex ?? []).some((pattern) => {
    let compiled;
    try {
      compiled = new RegExp(pattern, "i");
    } catch {
      // Best effort: an invalid pattern simply never matches.
      return false;
    }
    return haystacks.some((haystack) => compiled.test(haystack));
  });
}
|
|
10314
|
+
/**
 * Default weight per severity label; used as the default `weights` argument
 * of `weightedRecall`.
 */
var DEFAULT_SEVERITY_WEIGHTS = { critical: 3, major: 2, minor: 1 };
|
|
10319
|
+
/**
 * Severity-weighted recall: matched weight divided by total weight.
 *
 * A golden's weight is `weights[golden.severity]`, defaulting to 1 when that
 * lookup is nullish. Returns 1 when there are no goldens or when the total
 * weight is 0.
 *
 * @param {Array<{ severity: string }>} goldens - Golden expectations.
 * @param {{ matches: boolean[] }} result - Match flags aligned with `goldens` by index.
 * @param {Record<string, number>} [weights] - Weight per severity label.
 * @returns {number} Matched weight over total weight.
 */
function weightedRecall(goldens, result, weights = DEFAULT_SEVERITY_WEIGHTS) {
  if (goldens.length === 0) {
    return 1;
  }
  const weightOf = (golden) => weights[golden.severity] ?? 1;

  let total = 0;
  goldens.forEach((golden) => {
    total += weightOf(golden);
  });
  if (total === 0) {
    return 1;
  }

  let hit = 0;
  goldens.forEach((golden, index) => {
    hit += result.matches[index] ? weightOf(golden) : 0;
  });
  return hit / total;
}
|
|
10329
|
+
/**
 * Fraction of candidates that match at least one golden.
 *
 * Phrases are normalised exactly as `goldenMatched` does — lower-cased and
 * trimmed, with blank phrases ignored — so precision and recall agree on what
 * counts as a match. (Untrimmed phrases used to let a whitespace-only phrase
 * match any candidate containing a space.) Phrases and regular expressions
 * are prepared once up front instead of once per candidate; patterns that
 * fail to compile are skipped.
 *
 * @param {Array<{ any: string[], anyRegex?: string[] }>} goldens - Golden expectations.
 * @param {Array<unknown>} candidates - Candidate findings.
 * @param {{ text?: (candidate: unknown) => string }} [options] - Optional text extractor.
 * @returns {number} Matched candidates over all candidates; 1 when there are no candidates.
 */
function precision(goldens, candidates, options = {}) {
  if (candidates.length === 0) return 1;
  const extract = options.text ?? defaultExtract5;

  const needles = [];
  const patterns = [];
  for (const golden of goldens) {
    for (const phrase of golden.any) {
      const needle = phrase.toLowerCase().trim();
      if (needle) needles.push(needle);
    }
    for (const source of golden.anyRegex ?? []) {
      try {
        patterns.push(new RegExp(source, "i"));
      } catch {
        // Best effort: an invalid pattern simply never matches.
      }
    }
  }

  let matched = 0;
  for (const candidate of candidates) {
    const haystack = extract(candidate).toLowerCase();
    const hit =
      needles.some((needle) => haystack.includes(needle)) ||
      patterns.some((pattern) => pattern.test(haystack));
    if (hit) matched += 1;
  }
  return matched / candidates.length;
}
|
|
10348
|
+
|
|
10349
|
+
// src/orthogonality.ts
|
|
10350
|
+
/**
 * Scores how different the findings of several passes are from each other.
 *
 * Each pass's findings become a bag-of-words vector; the score is one minus
 * the mean pairwise cosine similarity, clamped to [0, 1]. With fewer than two
 * passes the score is 1 and no similarities are reported.
 *
 * @param {{ passes: Array<{ findings: unknown[] }>, text?: Function, minTokenLength?: number }} input
 *   Passes plus an optional renderer and minimum token length (default 4).
 * @returns {{ orthogonality: number, passCount: number, similarities: number[] }}
 *   Similarities are listed in pair order (0,1), (0,2), …, (1,2), ….
 */
function passOrthogonality(input) {
  const { passes } = input;
  const passCount = passes.length;
  if (passCount < 2) {
    return { orthogonality: 1, passCount, similarities: [] };
  }

  const render = input.text ?? defaultRender;
  const minLen = input.minTokenLength ?? 4;
  const vectors = passes.map((pass) => bagOfWords(pass.findings, render, minLen));

  const similarities = [];
  vectors.forEach((left, index) => {
    for (const right of vectors.slice(index + 1)) {
      similarities.push(cosineSimilarity(left, right));
    }
  });

  let meanSimilarity = 0;
  if (similarities.length !== 0) {
    meanSimilarity = similarities.reduce((sum, value) => sum + value, 0) / similarities.length;
  }
  const clamped = Math.max(0, Math.min(1, 1 - meanSimilarity));
  return { orthogonality: clamped, passCount, similarities };
}
|
|
10371
|
+
/**
 * Default finding-to-text conversion.
 *
 * Strings pass through; for objects (arrays included) the string-valued own
 * enumerable values are joined with single spaces; anything else goes through
 * `String`, with `null`/`undefined` becoming the empty string.
 *
 * @param {unknown} item - Value to render.
 * @returns {string} Text representation.
 */
function defaultRender(item) {
  switch (typeof item) {
    case "string":
      return item;
    case "object": {
      if (!item) break;
      let text = "";
      let first = true;
      for (const value of Object.values(item)) {
        if (typeof value !== "string") continue;
        text += first ? value : ` ${value}`;
        first = false;
      }
      return text;
    }
    default:
      break;
  }
  return String(item ?? "");
}
|
|
10382
|
+
/**
 * Builds a token → occurrence-count map from rendered items.
 *
 * Each item is rendered, lower-cased and split on runs of characters outside
 * `[a-z0-9]`; tokens shorter than `minLen` are dropped.
 *
 * @param {Iterable<unknown>} items - Items to tokenise.
 * @param {(item: unknown) => string} render - Converts an item to text.
 * @param {number} minLen - Minimum token length kept.
 * @returns {Map<string, number>} Occurrence count per token.
 */
function bagOfWords(items, render, minLen) {
  const counts = /* @__PURE__ */ new Map();
  for (const item of items) {
    const tokens = render(item).toLowerCase().split(/[^a-z0-9]+/);
    for (const token of tokens) {
      if (!(token.length >= minLen)) continue;
      const seen = counts.get(token) ?? 0;
      counts.set(token, seen + 1);
    }
  }
  return counts;
}
|
|
10392
|
+
/**
 * Cosine similarity of two sparse count vectors stored as Maps.
 *
 * @param {Map<string, number>} a - First vector.
 * @param {Map<string, number>} b - Second vector.
 * @returns {number} Dot product over the product of magnitudes; 0 when either vector has zero magnitude.
 */
function cosineSimilarity(a, b) {
  const squaredMagnitude = (vector) => {
    let total = 0;
    for (const count of vector.values()) {
      total += count * count;
    }
    return total;
  };

  const aMag = squaredMagnitude(a);
  const bMag = squaredMagnitude(b);
  if (aMag === 0 || bMag === 0) {
    return 0;
  }

  let dot = 0;
  a.forEach((count, term) => {
    const other = b.get(term);
    if (other) {
      dot += count * other;
    }
  });
  return dot / (Math.sqrt(aMag) * Math.sqrt(bMag));
}
|
|
10405
|
+
|
|
10406
|
+
// src/promotion-gate.ts
|
|
10407
|
+
/**
 * Bootstrap confidence interval for `mean(candidate) - mean(baseline)`.
 *
 * Both samples are resampled with replacement `iterations` times using a
 * seeded generator (seed from `options.seed`, otherwise derived from the
 * data), and the percentile interval at level `alpha` is read from the sorted
 * resampled deltas.
 *
 * Verdicts: `"ADVANCE"` when the interval lies above 0, `"REVERT"` when it
 * lies below 0, `"KEEP"` when it spans 0 and the observed delta is ≥ 0,
 * otherwise `"INCONCLUSIVE"`. When either sample is empty or the combined
 * size is below `minTotalSamples`, no resampling happens and the result is
 * `"INCONCLUSIVE"` with an infinite interval and `iterations: 0`.
 *
 * @param {number[]} baseline - Baseline scores.
 * @param {number[]} candidate - Candidate scores.
 * @param {{ alpha?: number, iterations?: number, minTotalSamples?: number, seed?: number }} [options]
 *   Defaults: `alpha` 0.05, `iterations` 1000, `minTotalSamples` 6.
 * @returns {object} `{ baselineMean, candidateMean, delta, ciLower, ciUpper, iterations, alpha, verdict }`.
 */
function bootstrapCi(baseline, candidate, options = {}) {
  const alpha = options.alpha ?? 0.05;
  const iterations = options.iterations ?? 1e3;
  const minTotal = options.minTotalSamples ?? 6;
  const rng = mulberry32(options.seed ?? hashSeed(baseline, candidate));

  const baselineMean = mean6(baseline);
  const candidateMean = mean6(candidate);
  const observed = { baselineMean, candidateMean, delta: candidateMean - baselineMean };

  const tooFewSamples =
    baseline.length + candidate.length < minTotal || baseline.length === 0 || candidate.length === 0;
  if (tooFewSamples) {
    return {
      ...observed,
      ciLower: -Infinity,
      ciUpper: Infinity,
      iterations: 0,
      alpha,
      verdict: "INCONCLUSIVE"
    };
  }

  // Baseline is always resampled before candidate so the draw order is stable.
  const deltas = new Array(iterations);
  for (let draw = 0; draw < iterations; draw += 1) {
    const baselineDraw = resample(baseline, rng);
    const candidateDraw = resample(candidate, rng);
    deltas[draw] = mean6(candidateDraw) - mean6(baselineDraw);
  }
  deltas.sort((left, right) => left - right);

  const lowerIdx = Math.floor(alpha / 2 * iterations);
  const upperIdx = Math.floor((1 - alpha / 2) * iterations) - 1;
  const ciLower = deltas[Math.max(0, lowerIdx)];
  const ciUpper = deltas[Math.min(iterations - 1, upperIdx)];

  let verdict = "INCONCLUSIVE";
  if (ciLower > 0) {
    verdict = "ADVANCE";
  } else if (ciUpper < 0) {
    verdict = "REVERT";
  } else if (observed.delta >= 0) {
    verdict = "KEEP";
  }

  return { ...observed, ciLower, ciUpper, iterations, alpha, verdict };
}
|
|
10454
|
+
/**
 * Arithmetic mean of `xs`; an empty array yields 0.
 *
 * @param {number[]} xs - Values to average.
 * @returns {number} Sum divided by length, or 0 for no values.
 */
function mean6(xs) {
  const count = xs.length;
  if (count === 0) {
    return 0;
  }
  let total = 0;
  for (let i = 0; i < count; i += 1) {
    total += xs[i];
  }
  return total / count;
}
|
|
10460
|
+
/**
 * Draws `xs.length` values from `xs` with replacement.
 *
 * @param {Array<unknown>} xs - Source sample.
 * @param {() => number} rng - Generator returning values in [0, 1); called once per draw.
 * @returns {Array<unknown>} New array of the same length as `xs`.
 */
function resample(xs, rng) {
  const size = xs.length;
  return Array.from({ length: size }, () => xs[Math.floor(rng() * size)]);
}
|
|
10465
|
+
/**
 * Creates a seeded Mulberry32 pseudo-random generator.
 *
 * The state is wrapped to an unsigned 32-bit integer on every step.
 * Previously it grew without bound as a double; after roughly 4.9 million
 * draws it exceeded 2^53, additions started rounding, and the stream diverged
 * from the 32-bit-wrapped generator. Output for earlier draws is unchanged.
 *
 * Not suitable for security-sensitive randomness.
 *
 * @param {number} seed - Seed; coerced to an unsigned 32-bit integer.
 * @returns {() => number} Generator returning values in [0, 1).
 */
function mulberry32(seed) {
  let state = seed >>> 0;
  return () => {
    state = (state + 1831565813) >>> 0;
    let r = state;
    r = Math.imul(r ^ (r >>> 15), r | 1);
    r ^= r + Math.imul(r ^ (r >>> 7), r | 61);
    return ((r ^ (r >>> 14)) >>> 0) / 4294967296;
  };
}
|
|
10475
|
+
/**
 * Derives a 32-bit seed from two numeric samples.
 *
 * The float64 bytes of every value of `a`, then of `b`, are fed through an
 * xor-then-multiply step (multiplier 16777619, starting from 2166136261),
 * with the product truncated to 32 bits.
 *
 * @param {Iterable<number>} a - First sample.
 * @param {Iterable<number>} b - Second sample.
 * @returns {number} Unsigned 32-bit hash.
 */
function hashSeed(a, b) {
  // One scratch buffer is reused for every value; `bytes` views its memory.
  const scratch = new Float64Array(1);
  const bytes = new Uint8Array(scratch.buffer);
  let h = 2166136261;

  const absorb = (values) => {
    for (const value of values) {
      scratch[0] = value;
      for (let i = 0; i < bytes.length; i += 1) {
        h ^= bytes[i];
        h = Math.imul(h, 16777619);
      }
    }
  };

  absorb(a);
  absorb(b);
  return h >>> 0;
}
|
|
10487
|
+
/**
 * Scores baseline and candidate outputs with a judge and bootstraps the
 * difference of their means.
 *
 * Baseline outputs are scored first, then candidate outputs, each with up to
 * `args.judgeConcurrency` (default 4) judge calls in flight. `alpha`,
 * `iterations` and `seed` are forwarded to `bootstrapCi` only when defined.
 *
 * @param {object} args - Reads `baselineOutputs`, `candidateOutputs`, `judge`,
 *   and optional `judgeConcurrency`, `alpha`, `iterations`, `seed`.
 * @returns {Promise<object>} The `bootstrapCi` result plus `baselineSamples`
 *   and `candidateSamples` counts.
 */
async function judgeReplayGate(args) {
  const concurrency = args.judgeConcurrency ?? 4;
  const baselineScores = await scoreAll(args.baselineOutputs, args.judge, concurrency);
  const candidateScores = await scoreAll(args.candidateOutputs, args.judge, concurrency);

  // Forward only the tuning knobs the caller actually set.
  const ciOptions = {};
  for (const knob of ["alpha", "iterations", "seed"]) {
    if (args[knob] !== undefined) {
      ciOptions[knob] = args[knob];
    }
  }

  const ci = bootstrapCi(baselineScores, candidateScores, ciOptions);
  return {
    ...ci,
    baselineSamples: baselineScores.length,
    candidateSamples: candidateScores.length
  };
}
|
|
10502
|
+
/**
 * Scores every output with `judge`, keeping at most `concurrency` calls in flight.
 *
 * Non-finite judge results are recorded as 0. The worker count is sanitised:
 * values that do not coerce to a number (including `NaN`) fall back to 1,
 * fractions are floored, values below 1 become 1, and the count never exceeds
 * `outputs.length` (so `Infinity` means "all at once"). Previously a `NaN`
 * concurrency started zero workers and returned an array of holes, and
 * `Infinity` threw a `RangeError`.
 *
 * @param {Array<unknown>} outputs - Outputs to judge.
 * @param {(output: unknown) => number | Promise<number>} judge - Scoring function.
 * @param {number} concurrency - Maximum number of judge calls in flight.
 * @returns {Promise<number[]>} Scores; index `i` belongs to `outputs[i]`.
 *   Rejects with the first judge failure.
 */
async function scoreAll(outputs, judge, concurrency) {
  const results = new Array(outputs.length);
  const parsed = Number(concurrency);
  const requested = Number.isNaN(parsed) ? 1 : Math.floor(parsed);
  const limit = Math.min(outputs.length, Math.max(1, requested));
  let next = 0;

  // Workers claim the next index synchronously, so no output is judged twice.
  async function worker() {
    while (next < outputs.length) {
      const i = next++;
      const score = await judge(outputs[i]);
      results[i] = Number.isFinite(score) ? score : 0;
    }
  }

  await Promise.all(Array.from({ length: limit }, () => worker()));
  return results;
}
|
|
10516
|
+
|
|
10517
|
+
// src/reflective-mutation.ts
|
|
10518
|
+
/**
 * Default list of mutation primitives; `buildReflectionPrompt` renders them
 * as bullets when the context supplies no `mutationPrimitives` of its own.
 */
var DEFAULT_MUTATION_PRIMITIVES = [
  // Wording strength
  'Strengthen an imperative ("should" \u2192 "must")',
  // Evidence-driven additions
  "Add a concrete example pulled from a missed-golden phrase",
  // Pruning
  "Remove a redundant rule that did not improve recall",
  // Explicit consequences
  'Add a counterfactual ("if X is missing, the score is capped at Y")',
  // Ordering
  "Reorder sections so the highest-impact rule is first",
  // Vocabulary
  "Replace abstract language with a domain-specific noun the trial misses"
];
|
|
10526
|
+
/**
 * Renders the markdown prompt that asks for mutations of a prompt variant.
 *
 * Sections, in order: target header and instructions, the current variant as
 * JSON, failures from the bottom trials (missed expectations and the emitted
 * text, cut at 600 characters), successes from the top trials, the allowed
 * mutation primitives, and the JSON output schema. The failure and success
 * sections are omitted when their trial list is empty.
 *
 * @param {object} ctx - Reads `target`, `parentPayload`, `childCount`,
 *   `topTrials`, `bottomTrials` and optional `mutationPrimitives`.
 * @returns {string} Prompt text with lines joined by "\n".
 */
function buildReflectionPrompt(ctx) {
  const primitives = ctx.mutationPrimitives ?? DEFAULT_MUTATION_PRIMITIVES;
  const plural = ctx.childCount === 1 ? "" : "s";
  const scoreLabel = (trial) =>
    `score ${trial.score.toFixed(2)}${trial.inputName ? ` (${trial.inputName})` : ""}`;

  // Entries stay separate array items; `join` renders an undefined payload as "".
  const lines = [
    `# Mutation target: ${ctx.target}`,
    "",
    `You are tuning the prompt component named \`${ctx.target}\`. The current variant is shown below; you have ${ctx.topTrials.length} top trials and ${ctx.bottomTrials.length} bottom trials as evidence. Propose ${ctx.childCount} mutation${plural} that fix specific weaknesses visible in the bottom trials. Avoid blank rephrasings.`,
    "",
    "## Current variant",
    "```json",
    JSON.stringify(ctx.parentPayload, null, 2),
    "```",
    ""
  ];

  if (ctx.bottomTrials.length > 0) {
    lines.push("## Failures (bottom trials) \u2014 what went wrong", "");
    for (const trial of ctx.bottomTrials) {
      lines.push(`### Trial \`${trial.id}\` \u2014 ${scoreLabel(trial)}`);

      const missed = (trial.expectations ?? []).filter((expectation) => !expectation.matched);
      if (missed.length > 0) {
        lines.push("", "**Missed expectations:**");
        for (const miss of missed) {
          lines.push(`- \`${miss.id}\`: should match phrase \`${quote(miss.phrase)}\``);
        }
      }

      if (trial.emitted) {
        lines.push("", "**What the agent emitted:**", "```", truncate3(trial.emitted, 600), "```");
      }
      lines.push("");
    }
  }

  if (ctx.topTrials.length > 0) {
    lines.push("## Successes (top trials) \u2014 what to preserve", "");
    for (const trial of ctx.topTrials) {
      lines.push(`- \`${trial.id}\`: ${scoreLabel(trial)}`);
    }
    lines.push("");
  }

  lines.push("## Allowed mutation primitives", "");
  for (const primitive of primitives) {
    lines.push(`- ${primitive}`);
  }

  const schemaExample = {
    proposals: [
      {
        label: "<short label, \u2264 40 chars>",
        rationale: "<which failure this targets and which primitive you used>",
        payload: "<full payload of the new variant \u2014 same shape as the current variant>"
      }
    ]
  };
  lines.push(
    "",
    "## Output schema",
    "",
    "Respond with a JSON object \u2014 no prose, no markdown fences:",
    "```json",
    JSON.stringify(schemaExample, null, 2),
    "```"
  );

  return lines.join("\n");
}
|
|
10593
|
+
/**
 * Cuts `s` to `max` characters and appends an ellipsis marker when it is longer.
 *
 * @param {string} s - Text to shorten.
 * @param {number} max - Maximum length kept before the marker.
 * @returns {string} `s` unchanged when short enough, otherwise the prefix plus "… [truncated]".
 */
function truncate3(s, max) {
  const fits = s.length <= max;
  return fits ? s : `${s.slice(0, max)}\u2026 [truncated]`;
}
|
|
10597
|
+
/**
 * Escapes backticks so the text can sit inside a markdown inline-code span.
 *
 * @param {string} s - Raw text.
 * @returns {string} Text with every backtick preceded by a backslash.
 */
function quote(s) {
  const pieces = s.split("`");
  return pieces.join("\\`");
}
|
|
10600
|
+
/**
 * Extracts mutation proposals from a raw model response.
 *
 * A leading/trailing markdown fence is stripped, the text between the first
 * "{" and the last "}" is parsed as JSON, and every object in its `proposals`
 * array that has a `payload` key becomes a proposal. A non-string `label`
 * defaults to "mutation" and a non-string `rationale` to "". Anything that
 * cannot be parsed yields an empty list rather than an error.
 *
 * The limit is checked before a proposal is added, so `maxProposals` of 0
 * (or a negative value) returns no proposals; previously it returned one.
 *
 * @param {string} raw - Raw response text.
 * @param {number} [maxProposals] - Maximum number of proposals returned; unlimited when omitted.
 * @returns {Array<{ label: string, rationale: string, payload: unknown }>} Parsed proposals.
 */
function parseReflectionResponse(raw, maxProposals) {
  let text = raw.trim();
  if (text.startsWith("```")) text = text.replace(/^```(?:json)?\n?/, "").replace(/\n?```$/, "");

  const start = text.indexOf("{");
  const end = text.lastIndexOf("}");
  if (start < 0 || end <= start) return [];

  let parsed;
  try {
    parsed = JSON.parse(text.slice(start, end + 1));
  } catch {
    // Malformed JSON is an expected outcome of model output: report "no proposals".
    return [];
  }
  if (!parsed || typeof parsed !== "object") return [];

  const proposalsRaw = parsed.proposals;
  if (!Array.isArray(proposalsRaw)) return [];

  const limit = maxProposals ?? Infinity;
  const out = [];
  for (const p of proposalsRaw) {
    if (out.length >= limit) break;
    if (!p || typeof p !== "object") continue;
    if (!("payload" in p)) continue;
    out.push({
      label: typeof p.label === "string" ? p.label : "mutation",
      rationale: typeof p.rationale === "string" ? p.rationale : "",
      payload: p.payload
    });
  }
  return out;
}
|
|
9979
10629
|
export {
|
|
9980
10630
|
AgentDriver,
|
|
9981
10631
|
AxGepaSteeringOptimizer,
|
|
@@ -9985,21 +10635,25 @@ export {
|
|
|
9985
10635
|
BuilderSession,
|
|
9986
10636
|
ConvergenceTracker,
|
|
9987
10637
|
CostTracker,
|
|
10638
|
+
D1ExperimentStore,
|
|
9988
10639
|
DEFAULT_AGENT_SLOS,
|
|
9989
10640
|
DEFAULT_COMPLEXITY_WEIGHTS,
|
|
9990
10641
|
DEFAULT_RULES as DEFAULT_FAILURE_RULES,
|
|
9991
10642
|
DEFAULT_FINDERS,
|
|
9992
10643
|
DEFAULT_HARNESS_OBJECTIVES,
|
|
10644
|
+
DEFAULT_MUTATION_PRIMITIVES,
|
|
9993
10645
|
DEFAULT_MUTATORS,
|
|
9994
10646
|
DEFAULT_REDACTION_RULES,
|
|
9995
10647
|
DEFAULT_RED_TEAM_CORPUS,
|
|
9996
10648
|
DEFAULT_RUN_SCORE_WEIGHTS,
|
|
10649
|
+
DEFAULT_SEVERITY_WEIGHTS,
|
|
9997
10650
|
Dataset,
|
|
9998
10651
|
DockerSandboxDriver,
|
|
9999
10652
|
DualAgentBench,
|
|
10000
10653
|
ERROR_COUNT_PATTERNS,
|
|
10001
10654
|
ExperimentTracker,
|
|
10002
10655
|
FAILURE_CLASSES,
|
|
10656
|
+
FileSystemExperimentStore,
|
|
10003
10657
|
FileSystemOutcomeStore,
|
|
10004
10658
|
FileSystemTraceStore,
|
|
10005
10659
|
HoldoutAuditor,
|
|
@@ -10008,6 +10662,7 @@ export {
|
|
|
10008
10662
|
InMemoryExperimentStore,
|
|
10009
10663
|
InMemoryOutcomeStore,
|
|
10010
10664
|
InMemoryTraceStore,
|
|
10665
|
+
InMemoryTrialCache,
|
|
10011
10666
|
InMemoryWorkspaceInspector,
|
|
10012
10667
|
JudgeRunner,
|
|
10013
10668
|
LlmCallError,
|
|
@@ -10043,7 +10698,9 @@ export {
|
|
|
10043
10698
|
benjaminiHochberg,
|
|
10044
10699
|
bisect,
|
|
10045
10700
|
bonferroni,
|
|
10701
|
+
bootstrapCi,
|
|
10046
10702
|
budgetBreachView,
|
|
10703
|
+
buildReflectionPrompt,
|
|
10047
10704
|
buildReviewerPrompt,
|
|
10048
10705
|
buildTrajectory,
|
|
10049
10706
|
byteLengthRange,
|
|
@@ -10081,6 +10738,7 @@ export {
|
|
|
10081
10738
|
createLlmReviewer,
|
|
10082
10739
|
createSemanticConceptJudge,
|
|
10083
10740
|
crossTraceDiff,
|
|
10741
|
+
crowdingDistance,
|
|
10084
10742
|
decideReferenceReplayPromotion,
|
|
10085
10743
|
decideReferenceReplayRunPromotion,
|
|
10086
10744
|
defaultJudges,
|
|
@@ -10114,6 +10772,7 @@ export {
|
|
|
10114
10772
|
formatBenchmarkReport,
|
|
10115
10773
|
formatDriverReport,
|
|
10116
10774
|
formatFindings,
|
|
10775
|
+
precision as goldenPrecision,
|
|
10117
10776
|
gradeSemanticStatus,
|
|
10118
10777
|
groupBy,
|
|
10119
10778
|
hashContent,
|
|
@@ -10135,6 +10794,7 @@ export {
|
|
|
10135
10794
|
jsonlReferenceReplayStore,
|
|
10136
10795
|
jsonlReviewStore,
|
|
10137
10796
|
judgeAgreementView,
|
|
10797
|
+
judgeReplayGate,
|
|
10138
10798
|
judgeSpans,
|
|
10139
10799
|
keyPreserved,
|
|
10140
10800
|
linterJudge,
|
|
@@ -10144,6 +10804,7 @@ export {
|
|
|
10144
10804
|
localCommandRunner,
|
|
10145
10805
|
lowercaseMutator,
|
|
10146
10806
|
mannWhitneyU,
|
|
10807
|
+
matchGoldens,
|
|
10147
10808
|
mergeLayerResults,
|
|
10148
10809
|
mergeSteeringBundle,
|
|
10149
10810
|
multiToolchainLayer,
|
|
@@ -10155,7 +10816,10 @@ export {
|
|
|
10155
10816
|
pairedTTest,
|
|
10156
10817
|
paraphraseRobustness,
|
|
10157
10818
|
paretoFrontier,
|
|
10819
|
+
paretoFrontierWithCrowding,
|
|
10820
|
+
parseReflectionResponse,
|
|
10158
10821
|
partialCredit,
|
|
10822
|
+
passOrthogonality,
|
|
10159
10823
|
pixelDeltaRatio,
|
|
10160
10824
|
politenessPrefixMutator,
|
|
10161
10825
|
positionalBias,
|
|
@@ -10195,12 +10859,14 @@ export {
|
|
|
10195
10859
|
runJudgeFleet,
|
|
10196
10860
|
runKeywordCoverageJudge,
|
|
10197
10861
|
runKeywordCoverageJudgeUrl,
|
|
10862
|
+
runPromptEvolution,
|
|
10198
10863
|
runProposeReview,
|
|
10199
10864
|
runReferenceReplay,
|
|
10200
10865
|
runSelfPlay,
|
|
10201
10866
|
runSemanticConceptJudge,
|
|
10202
10867
|
runTestGradedScenario,
|
|
10203
10868
|
runsForScenario,
|
|
10869
|
+
scalarScore,
|
|
10204
10870
|
scanForMuffledGates,
|
|
10205
10871
|
scoreAllProjects,
|
|
10206
10872
|
scoreContinuity,
|
|
@@ -10237,6 +10903,7 @@ export {
|
|
|
10237
10903
|
viteDeployRunner,
|
|
10238
10904
|
vitestTestParser,
|
|
10239
10905
|
weightedMean,
|
|
10906
|
+
weightedRecall,
|
|
10240
10907
|
welchsTTest,
|
|
10241
10908
|
whitespaceCollapseMutator,
|
|
10242
10909
|
wilcoxonSignedRank
|