@tangle-network/agent-eval 0.12.0 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +184 -11
- package/dist/chunk-ITN4YOZY.js +215 -0
- package/dist/chunk-ITN4YOZY.js.map +1 -0
- package/dist/chunk-OZPRSK4A.js +594 -0
- package/dist/chunk-OZPRSK4A.js.map +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +104 -0
- package/dist/cli.js.map +1 -0
- package/dist/index.d.ts +548 -1
- package/dist/index.js +876 -210
- package/dist/index.js.map +1 -1
- package/dist/wire/index.d.ts +211 -0
- package/dist/wire/index.js +56 -0
- package/dist/wire/index.js.map +1 -0
- package/package.json +17 -3
package/dist/index.js
CHANGED
|
@@ -1,3 +1,12 @@
|
|
|
1
|
+
import {
|
|
2
|
+
LlmCallError,
|
|
3
|
+
LlmClient,
|
|
4
|
+
callLlm,
|
|
5
|
+
callLlmJson,
|
|
6
|
+
probeLlm,
|
|
7
|
+
stripFencedJson
|
|
8
|
+
} from "./chunk-ITN4YOZY.js";
|
|
9
|
+
|
|
1
10
|
// src/client.ts
|
|
2
11
|
var ProductClient = class {
|
|
3
12
|
baseUrl;
|
|
@@ -1926,6 +1935,244 @@ function rand(bytes) {
|
|
|
1926
1935
|
return Array.from(arr).map((b) => b.toString(16).padStart(2, "0")).join("");
|
|
1927
1936
|
}
|
|
1928
1937
|
|
|
1938
|
+
// src/experiment-tracker-fs.ts
|
|
1939
|
+
/**
 * Experiment store persisted as newline-delimited JSON files in one directory.
 *
 * Writes go to `<dir>/experiments.ndjson` and `<dir>/runs.ndjson`. Once an
 * active file reaches `maxBytes` it is renamed to `<name>.<timestamp>.ndjson`
 * and a fresh active file is started. Reads are served from an in-memory
 * index (an `InMemoryExperimentStore`) that is built once by replaying every
 * `.ndjson` file in the directory.
 */
var FileSystemExperimentStore = class {
  dir;
  maxBytes;
  index;
  loaded = false;
  /** In-flight index build, shared by concurrent callers of `load()`. */
  loading;
  /**
   * @param options `dir` is the storage directory; `maxBytes` is the rollover
   *   threshold for the active file (defaults to 32 MiB).
   */
  constructor(options) {
    this.dir = options.dir;
    this.maxBytes = options.maxBytes ?? 32 * 1024 * 1024;
  }
  /** Records the experiment in the index, then appends it to `experiments.ndjson`. */
  async saveExperiment(exp) {
    const idx = await this.load();
    await idx.saveExperiment(exp);
    await this.append("experiments", exp);
  }
  async getExperiment(id) {
    const idx = await this.load();
    return idx.getExperiment(id);
  }
  async listExperiments() {
    const idx = await this.load();
    return idx.listExperiments();
  }
  /** Records the run in the index, then appends it to `runs.ndjson`. */
  async saveRun(run) {
    const idx = await this.load();
    await idx.saveRun(run);
    await this.append("runs", run);
  }
  async getRun(id) {
    const idx = await this.load();
    return idx.getRun(id);
  }
  async listRuns(experimentId) {
    const idx = await this.load();
    return idx.listRuns(experimentId);
  }
  /** Creates the storage directory (and parents) if it does not exist. */
  async ensureDir() {
    const fs = await import("fs/promises");
    await fs.mkdir(this.dir, { recursive: true });
  }
  /**
   * Appends one JSON line to `<dir>/<name>.ndjson`, rolling the active file
   * over first when it has reached `maxBytes`.
   */
  async append(name, record) {
    await this.ensureDir();
    const fs = await import("fs/promises");
    const path = await import("path");
    const active = path.join(this.dir, `${name}.ndjson`);
    try {
      const stat = await fs.stat(active);
      if (stat.size >= this.maxBytes) {
        const rolled = path.join(this.dir, `${name}.${Date.now()}.ndjson`);
        await fs.rename(active, rolled);
      }
    } catch {
      // Best effort: the active file usually just does not exist yet. Any
      // other failure will resurface from appendFile below.
    }
    await fs.appendFile(active, JSON.stringify(record) + "\n", "utf8");
  }
  /**
   * Returns the in-memory index, building it on first use.
   *
   * Concurrent callers share a single in-flight build. Without that, each
   * caller would replay the directory into its own index and only the last
   * one would be kept, so records saved through the others would vanish from
   * subsequent reads.
   */
  async load() {
    if (this.loaded && this.index) return this.index;
    this.loading ??= this.buildIndex().finally(() => {
      this.loading = void 0;
    });
    return this.loading;
  }
  /**
   * Replays every `.ndjson` file in the directory (sorted by name) into a
   * fresh index. Blank and unparseable lines are skipped; a file is routed by
   * the part of its name before the first dot (`experiments` or `runs`).
   */
  async buildIndex() {
    const fs = await import("fs/promises");
    const path = await import("path");
    const store = new InMemoryExperimentStore();
    try {
      const entries = await fs.readdir(this.dir);
      const sorted = entries.filter((f) => f.endsWith(".ndjson")).sort((a, b) => a.localeCompare(b));
      for (const file of sorted) {
        const full = path.join(this.dir, file);
        const content = await fs.readFile(full, "utf8");
        const base = file.split(".")[0];
        for (const line of content.split("\n")) {
          if (!line.trim()) continue;
          let record;
          try {
            record = JSON.parse(line);
          } catch {
            continue;
          }
          if (base === "experiments") {
            await store.saveExperiment(record);
          } else if (base === "runs") {
            await store.saveRun(record);
          }
        }
      }
    } catch {
      // Best effort: a missing directory simply means an empty store.
    }
    this.index = store;
    this.loaded = true;
    return store;
  }
};
|
|
2027
|
+
|
|
2028
|
+
// src/experiment-tracker-d1.ts
|
|
2029
|
+
// Value written to the meta table's 'schema_version' row by D1ExperimentStore.ensureSchema().
var SCHEMA_VERSION = 1;
|
|
2030
|
+
/**
 * Experiment store backed by a database handle exposing `exec()` and
 * `prepare().bind().run()/first()/all()`. Table names are derived from a
 * configurable prefix; the schema is created lazily before the first query.
 */
var D1ExperimentStore = class {
  db;
  experimentsTable;
  runsTable;
  metaTable;
  schemaReady = false;
  /**
   * @param options `db` is the database handle; `tablePrefix` is prepended to
   *   every table name (defaults to "agent_eval_").
   */
  constructor(options) {
    const { db, tablePrefix } = options;
    this.db = db;
    const prefix = tablePrefix ?? "agent_eval_";
    this.experimentsTable = `${prefix}experiments`;
    this.runsTable = `${prefix}runs`;
    this.metaTable = `${prefix}meta`;
  }
  /**
   * Creates tables and indexes if they are missing and stamps the schema
   * version. Every statement is IF NOT EXISTS / OR REPLACE, and repeat calls
   * return immediately once `schemaReady` is set.
   */
  async ensureSchema() {
    if (this.schemaReady) {
      return;
    }
    const ddl = `
      CREATE TABLE IF NOT EXISTS ${this.experimentsTable} (
        id TEXT PRIMARY KEY,
        name TEXT NOT NULL,
        created_at TEXT NOT NULL,
        metadata_json TEXT
      );
      CREATE TABLE IF NOT EXISTS ${this.runsTable} (
        id TEXT PRIMARY KEY,
        experiment_id TEXT NOT NULL,
        name TEXT,
        status TEXT NOT NULL,
        started_at TEXT NOT NULL,
        completed_at TEXT,
        config_json TEXT NOT NULL,
        report_json TEXT,
        error TEXT
      );
      CREATE INDEX IF NOT EXISTS idx_${this.runsTable}_experiment ON ${this.runsTable}(experiment_id);
      CREATE INDEX IF NOT EXISTS idx_${this.runsTable}_started ON ${this.runsTable}(started_at);
      CREATE TABLE IF NOT EXISTS ${this.metaTable} (
        key TEXT PRIMARY KEY,
        value TEXT NOT NULL
      );
      INSERT OR REPLACE INTO ${this.metaTable}(key, value) VALUES ('schema_version', '${SCHEMA_VERSION}');
    `;
    // The whole script is collapsed onto a single line before execution.
    const script = ddl.trim().replace(/\s+/g, " ");
    await this.db.exec(script);
    this.schemaReady = true;
  }
  /** Inserts the experiment, or overwrites every column when the id already exists. */
  async saveExperiment(exp) {
    await this.ensureSchema();
    const statement = this.db.prepare(
      `INSERT INTO ${this.experimentsTable}(id, name, created_at, metadata_json)
       VALUES (?1, ?2, ?3, ?4)
       ON CONFLICT(id) DO UPDATE SET
         name = excluded.name,
         created_at = excluded.created_at,
         metadata_json = excluded.metadata_json`
    );
    const metadataJson = exp.metadata ? JSON.stringify(exp.metadata) : null;
    await statement.bind(exp.id, exp.name, exp.createdAt, metadataJson).run();
  }
  /** @returns the experiment with the given id, or null when no row matches. */
  async getExperiment(id) {
    await this.ensureSchema();
    const statement = this.db.prepare(
      `SELECT id, name, created_at, metadata_json
       FROM ${this.experimentsTable}
       WHERE id = ?1`
    );
    const row = await statement.bind(id).first();
    if (!row) {
      return null;
    }
    return rowToExperiment(row);
  }
  /** @returns all experiments, newest `created_at` first. */
  async listExperiments() {
    await this.ensureSchema();
    const statement = this.db.prepare(
      `SELECT id, name, created_at, metadata_json
       FROM ${this.experimentsTable}
       ORDER BY created_at DESC`
    );
    const { results } = await statement.all();
    return results.map(rowToExperiment);
  }
  /** Inserts the run, or overwrites every column when the id already exists. */
  async saveRun(run) {
    await this.ensureSchema();
    const statement = this.db.prepare(
      `INSERT INTO ${this.runsTable}(id, experiment_id, name, status, started_at, completed_at, config_json, report_json, error)
       VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9)
       ON CONFLICT(id) DO UPDATE SET
         experiment_id = excluded.experiment_id,
         name = excluded.name,
         status = excluded.status,
         started_at = excluded.started_at,
         completed_at = excluded.completed_at,
         config_json = excluded.config_json,
         report_json = excluded.report_json,
         error = excluded.error`
    );
    // Optional fields are bound as null when absent.
    const params = [
      run.id,
      run.experimentId,
      run.name ?? null,
      run.status,
      run.startedAt,
      run.completedAt ?? null,
      JSON.stringify(run.config),
      run.report ? JSON.stringify(run.report) : null,
      run.error ?? null
    ];
    await statement.bind(...params).run();
  }
  /** @returns the run with the given id, or null when no row matches. */
  async getRun(id) {
    await this.ensureSchema();
    const statement = this.db.prepare(
      `SELECT id, experiment_id, name, status, started_at, completed_at, config_json, report_json, error
       FROM ${this.runsTable}
       WHERE id = ?1`
    );
    const row = await statement.bind(id).first();
    if (!row) {
      return null;
    }
    return rowToRun(row);
  }
  /** @returns the runs of one experiment, newest `started_at` first. */
  async listRuns(experimentId) {
    await this.ensureSchema();
    const statement = this.db.prepare(
      `SELECT id, experiment_id, name, status, started_at, completed_at, config_json, report_json, error
       FROM ${this.runsTable}
       WHERE experiment_id = ?1
       ORDER BY started_at DESC`
    );
    const { results } = await statement.bind(experimentId).all();
    return results.map(rowToRun);
  }
};
|
|
2154
|
+
/**
 * Maps an experiments-table row to an experiment record. `metadata` is
 * included only when `metadata_json` is non-empty, in which case it is parsed.
 */
function rowToExperiment(row) {
  const experiment = {
    id: row.id,
    name: row.name,
    createdAt: row.created_at
  };
  if (row.metadata_json) {
    experiment.metadata = JSON.parse(row.metadata_json);
  }
  return experiment;
}
|
|
2162
|
+
/**
 * Maps a runs-table row to a run record. `name`, `completedAt`, `report` and
 * `error` are included only when the corresponding column is truthy;
 * `config_json` is always parsed.
 */
function rowToRun(row) {
  const run = {
    id: row.id,
    experimentId: row.experiment_id
  };
  if (row.name) {
    run.name = row.name;
  }
  run.status = row.status;
  run.startedAt = row.started_at;
  if (row.completed_at) {
    run.completedAt = row.completed_at;
  }
  run.config = JSON.parse(row.config_json);
  if (row.report_json) {
    run.report = JSON.parse(row.report_json);
  }
  if (row.error) {
    run.error = row.error;
  }
  return run;
}
|
|
2175
|
+
|
|
1929
2176
|
// src/power-analysis.ts
|
|
1930
2177
|
function requiredSampleSize(opts) {
|
|
1931
2178
|
const effect = opts.effect;
|
|
@@ -8058,212 +8305,6 @@ async function euAiActReport(ctx, signals) {
|
|
|
8058
8305
|
};
|
|
8059
8306
|
}
|
|
8060
8307
|
|
|
8061
|
-
// src/llm-client.ts
|
|
8062
|
-
/**
 * Error raised for a non-OK LLM HTTP response. Carries the HTTP status, the
 * raw response body and the model the request targeted.
 */
var LlmCallError = class extends Error {
  status;
  body;
  model;
  /**
   * @param message human-readable description
   * @param status HTTP status code of the failed response
   * @param body raw response body text
   * @param model model named in the request
   */
  constructor(message, status, body, model) {
    super(message);
    this.status = status;
    this.body = body;
    this.model = model;
    this.name = "LlmCallError";
  }
};
|
|
8074
|
-
// Defaults for the LLM client: endpoint root, per-request timeout, attempt
// budget, and the HTTP statuses that are worth retrying.
var DEFAULT_BASE_URL = "https://router.tangle.tools/v1";
var DEFAULT_TIMEOUT_MS = 60_000;
var DEFAULT_MAX_RETRIES = 3;
var RETRYABLE_STATUS = /* @__PURE__ */ new Set([
  429,
  502,
  503,
  504
]);
|
|
8078
|
-
/**
 * Decides whether a failed attempt should be retried: an LlmCallError with a
 * retryable status, or any other Error that is an abort/timeout or whose
 * message matches a known transient network failure.
 */
function isRetryableError(err) {
  if (err instanceof LlmCallError) {
    return RETRYABLE_STATUS.has(err.status);
  }
  if (!(err instanceof Error)) {
    return false;
  }
  if (err.name === "AbortError" || err.name === "TimeoutError") {
    return true;
  }
  return /fetch failed|ECONNRESET|ETIMEDOUT|EAI_AGAIN/i.test(err.message);
}
|
|
8085
|
-
/**
 * Reads the `retry-after` header and converts it to a delay in milliseconds.
 * A positive number is taken as seconds; otherwise the value is parsed as a
 * date and the remaining time (never negative) is returned.
 *
 * @param headers any object with a `get(name)` method
 * @returns the delay in ms, or null when the header is absent or unparseable
 */
function parseRetryAfter(headers) {
  const raw = headers.get("retry-after");
  if (!raw) {
    return null;
  }
  const seconds = Number(raw);
  if (Number.isFinite(seconds) && seconds > 0) {
    return seconds * 1e3;
  }
  const when = Date.parse(raw);
  return Number.isFinite(when) ? Math.max(0, when - Date.now()) : null;
}
|
|
8094
|
-
/**
 * Exponential backoff: 500 ms doubled per attempt, capped at 16 seconds.
 *
 * @param attempt zero-based attempt index
 */
function backoffMs(attempt) {
  const uncapped = 500 * 2 ** attempt;
  return Math.min(uncapped, 16e3);
}
|
|
8097
|
-
/**
 * Builds the request headers. A custom `authHeader` ({ name, value }) takes
 * precedence; otherwise a Bearer token is added when `bearer` or `apiKey` is
 * set, preferring `bearer`.
 */
function buildHeaders(opts) {
  const headers = { "Content-Type": "application/json", Accept: "application/json" };
  const { authHeader } = opts;
  if (authHeader) {
    headers[authHeader.name] = authHeader.value;
    return headers;
  }
  if (opts.bearer || opts.apiKey) {
    headers.Authorization = `Bearer ${opts.bearer ?? opts.apiKey}`;
  }
  return headers;
}
|
|
8109
|
-
/**
 * True when a 400 response body mentions one of the phrases taken to mean the
 * provider rejected the structured-output request.
 */
function isSchemaRejection(status, body) {
  if (status !== 400) {
    return false;
  }
  const haystack = body.toLowerCase();
  const markers = ["response_format", "json_schema", "is unavailable", "not supported"];
  return markers.some((marker) => haystack.includes(marker));
}
|
|
8114
|
-
/**
 * Builds the chat-completions request body.
 *
 * @param req request with `model`, `messages` and optional `temperature`,
 *   `maxTokens`, `jsonSchema` ({ name, schema }) and `jsonMode`
 * @param forceJsonObject when true, a schema request is sent as plain
 *   `json_object` mode instead of `json_schema`
 */
function buildBody(req, forceJsonObject) {
  const { model, messages, jsonSchema } = req;
  const body = { model, messages, temperature: req.temperature ?? 0 };
  if (req.maxTokens != null) {
    body.max_tokens = req.maxTokens;
  }
  const useSchema = jsonSchema && !forceJsonObject;
  if (useSchema) {
    const { name, schema } = jsonSchema;
    body.response_format = {
      type: "json_schema",
      json_schema: { name, schema, strict: true }
    };
    return body;
  }
  if (req.jsonMode || jsonSchema) {
    body.response_format = { type: "json_object" };
  }
  return body;
}
|
|
8131
|
-
/** Resolves (with no value) after `ms` milliseconds. */
async function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
|
|
8134
|
-
/**
 * Removes a surrounding Markdown code fence (with or without a `json` tag)
 * from the text. Input that is not a single fenced block is returned trimmed.
 */
function stripFencedJson(raw) {
  const text = raw.trim();
  const fenced = /^```(?:json)?\s*\n?([\s\S]*?)\n?```\s*$/.exec(text);
  if (fenced === null) {
    return text;
  }
  return fenced[1].trim();
}
|
|
8139
|
-
/**
 * Sends one chat-completions request, retrying transient failures.
 *
 * Fixes over the previous version:
 * - The abort timer now stays armed until the response body has been read, so
 *   a body stream that stalls after the headers arrive times out instead of
 *   hanging indefinitely.
 * - At least one attempt is always made. A `maxRetries` of 0 (or any value
 *   below 1) used to skip the request entirely and throw `Error("undefined")`.
 *
 * @param req request: `model`, `messages`, plus optional `timeoutMs` and the
 *   fields consumed by `buildBody`
 * @param opts client options: `baseUrl`, `defaultTimeoutMs`, `maxRetries`
 *   (total number of attempts), `fetch`, plus the auth fields consumed by
 *   `buildHeaders`
 * @returns content, token usage, proxy-reported cost (or null), model,
 *   duration of the successful attempt and the raw JSON payload
 * @throws LlmCallError for a non-OK response that is not retried; otherwise
 *   whatever the final attempt threw
 */
async function callLlm(req, opts = {}) {
  const baseUrl = (opts.baseUrl ?? DEFAULT_BASE_URL).replace(/\/+$/, "");
  const url = `${baseUrl}/chat/completions`;
  const timeoutMs = req.timeoutMs ?? opts.defaultTimeoutMs ?? DEFAULT_TIMEOUT_MS;
  const configuredAttempts = opts.maxRetries ?? DEFAULT_MAX_RETRIES;
  // Also covers NaN, which fails the >= comparison.
  const maxAttempts = configuredAttempts >= 1 ? configuredAttempts : 1;
  const fetchFn = opts.fetch ?? globalThis.fetch;
  const headers = buildHeaders(opts);
  let lastErr;
  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    const isLastAttempt = attempt >= maxAttempts - 1;
    const controller = new AbortController();
    const timeoutHandle = setTimeout(() => controller.abort(), timeoutMs);
    const started = Date.now();
    let retryDelayMs;
    try {
      const res = await fetchFn(url, {
        method: "POST",
        headers,
        body: JSON.stringify(buildBody(req, false)),
        signal: controller.signal
      });
      if (res.ok) {
        const json = await res.json();
        const choice = json.choices?.[0];
        const usageRaw = json.usage ?? {};
        const costFromProxy = json._response_cost ?? json.cost_usd;
        return {
          content: choice?.message?.content ?? "",
          usage: {
            promptTokens: Number(usageRaw.prompt_tokens ?? 0),
            completionTokens: Number(usageRaw.completion_tokens ?? 0),
            totalTokens: Number(usageRaw.total_tokens ?? 0),
            cachedPromptTokens: usageRaw.prompt_tokens_details && typeof usageRaw.prompt_tokens_details === "object" ? Number(
              usageRaw.prompt_tokens_details.cached_tokens ?? 0
            ) : void 0
          },
          costUsd: typeof costFromProxy === "number" ? costFromProxy : null,
          model: json.model ?? req.model,
          durationMs: Date.now() - started,
          raw: json
        };
      }
      const body = await res.text();
      const httpErr = new LlmCallError(
        `LLM call ${res.status}: ${body.slice(0, 300)}`,
        res.status,
        body,
        req.model
      );
      if (isLastAttempt || !RETRYABLE_STATUS.has(res.status)) {
        throw httpErr;
      }
      lastErr = httpErr;
      // Honor the server's Retry-After hint when it provides one.
      retryDelayMs = parseRetryAfter(res.headers) ?? backoffMs(attempt);
    } catch (err) {
      lastErr = err;
      if (isLastAttempt || !isRetryableError(err)) {
        throw err;
      }
      retryDelayMs = backoffMs(attempt);
    } finally {
      // Runs after the body has been consumed (or the attempt failed), so the
      // timeout covers the whole exchange but never fires during the backoff.
      clearTimeout(timeoutHandle);
    }
    await sleep(retryDelayMs);
  }
  // Defensive: every final attempt above either returns or throws.
  throw lastErr instanceof Error ? lastErr : new Error(String(lastErr));
}
|
|
8206
|
-
/**
 * Calls the LLM and parses the reply as JSON. JSON mode defaults to on unless
 * a schema is supplied. If the provider rejects the schema request, the call
 * is repeated once in plain JSON mode without the schema.
 *
 * @returns `{ value, result }`: the parsed JSON and the underlying call result
 * @throws the original error when it is not a schema rejection
 */
async function callLlmJson(req, opts = {}) {
  const runAndParse = async (request) => {
    const result = await callLlm(request, opts);
    const value = parseJsonSafely(result.content, result.model);
    return { value, result };
  };
  try {
    return await runAndParse({ ...req, jsonMode: req.jsonMode ?? !req.jsonSchema });
  } catch (err) {
    const schemaRejected = err instanceof LlmCallError && isSchemaRejection(err.status, err.body) && req.jsonSchema;
    if (!schemaRejected) {
      throw err;
    }
    return runAndParse({ ...req, jsonMode: true, jsonSchema: void 0 });
  }
}
|
|
8221
|
-
/**
 * Parses LLM output as JSON after stripping any code fence.
 *
 * @throws Error naming the model, the parser's message and the first 800
 *   characters of the raw content when the text is not valid JSON
 */
function parseJsonSafely(content, model) {
  const candidate = stripFencedJson(content);
  try {
    return JSON.parse(candidate);
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    const header = `LLM returned non-JSON content (model=${model}): ${reason}`;
    throw new Error(`${header}\n--- raw content ---\n${content.slice(0, 800)}`);
  }
}
|
|
8233
|
-
/**
 * Health probe: sends a tiny "ping" prompt to the model and reports whether
 * the call succeeded and how long it took. Never throws.
 *
 * @param model model to probe
 * @param opts client options, forwarded to `callLlm`; `opts.timeoutMs`
 *   overrides the 30-second probe timeout
 * @returns `{ ok, latencyMs, error }`, where `error` is null on success
 */
async function probeLlm(model, opts = {}) {
  const start = Date.now();
  const request = {
    model,
    messages: [{ role: "user", content: "ping" }],
    maxTokens: 64,
    timeoutMs: opts.timeoutMs ?? 3e4
  };
  let error = null;
  try {
    await callLlm(request, opts);
  } catch (err) {
    error = err instanceof Error ? err.message : String(err);
  }
  return { ok: error === null, latencyMs: Date.now() - start, error };
}
|
|
8254
|
-
/**
 * Thin wrapper that remembers a set of client options and merges per-call
 * overrides on top of them.
 */
var LlmClient = class {
  opts;
  /** @param opts default options applied to every call */
  constructor(opts = {}) {
    this.opts = opts;
  }
  /** Runs `callLlm` with the defaults overlaid by `per`. */
  call(req, per) {
    const merged = { ...this.opts, ...per };
    return callLlm(req, merged);
  }
  /** Runs `callLlmJson` with the defaults overlaid by `per`. */
  callJson(req, per) {
    const merged = { ...this.opts, ...per };
    return callLlmJson(req, merged);
  }
};
|
|
8266
|
-
|
|
8267
8308
|
// src/multi-layer-verifier.ts
|
|
8268
8309
|
function gradeSemanticStatus(input) {
|
|
8269
8310
|
if (!input.available) return "error";
|
|
@@ -9497,6 +9538,49 @@ function extractErrorCount(text, opts = {}) {
|
|
|
9497
9538
|
// src/reference-replay.ts
|
|
9498
9539
|
import { appendFileSync as appendFileSync2, existsSync as existsSync4, mkdirSync as mkdirSync2, readFileSync as readFileSync4 } from "fs";
|
|
9499
9540
|
import { dirname as dirname2 } from "path";
|
|
9541
|
+
|
|
9542
|
+
// src/concurrency.ts
|
|
9543
|
+
/**
 * Minimal async mutex. Callers queue in FIFO order; releasing hands the lock
 * directly to the next waiter, or frees it when nobody is waiting.
 */
var Mutex = class {
  locked = false;
  waiters = [];
  /**
   * Takes the lock, waiting for it if necessary.
   *
   * @returns a function that releases the lock
   */
  async acquire() {
    const releaseHandle = () => this.release();
    if (this.locked) {
      return new Promise((resolve) => {
        this.waiters.push(() => {
          resolve(releaseHandle);
        });
      });
    }
    this.locked = true;
    return releaseHandle;
  }
  /** Passes the lock to the oldest waiter; unlocks when the queue is empty. */
  release() {
    const wake = this.waiters.shift();
    if (!wake) {
      this.locked = false;
      return;
    }
    wake();
  }
  /** Runs `fn` while holding the lock and releases it even if `fn` throws. */
  async runExclusive(fn) {
    const release = await this.acquire();
    try {
      return await fn();
    } finally {
      release();
    }
  }
  /** True iff someone holds the lock right now. Diagnostics only. */
  get isLocked() {
    return this.locked;
  }
  /** Pending waiter count. Diagnostics only. */
  get pending() {
    return this.waiters.length;
  }
};
|
|
9582
|
+
|
|
9583
|
+
// src/reference-replay.ts
|
|
9500
9584
|
var DEFAULT_MATCH_THRESHOLD = 0.55;
|
|
9501
9585
|
var ALL_SPLITS = ["train", "dev", "test", "holdout"];
|
|
9502
9586
|
async function runReferenceReplay(cases, options) {
|
|
@@ -9597,15 +9681,29 @@ function inMemoryReferenceReplayStore(initial = []) {
|
|
|
9597
9681
|
}
|
|
9598
9682
|
};
|
|
9599
9683
|
}
|
|
9684
|
+
// One mutex per JSONL path, so every store opened on the same path shares it.
var jsonlStoreLocks = /* @__PURE__ */ new Map();
/** Returns the mutex registered for `path`, creating it on first request. */
function getJsonlStoreLock(path) {
  const existing = jsonlStoreLocks.get(path);
  if (existing) {
    return existing;
  }
  const created = new Mutex();
  jsonlStoreLocks.set(path, created);
  return created;
}
|
|
9600
9693
|
/**
 * Reference-replay store backed by a JSONL file. Both operations run under
 * the mutex registered for `path`.
 *
 * @returns an object with `save(run)`, which appends one JSON line (creating
 *   the parent directory first), and `list()`, which returns the parsed
 *   records or an empty array when the file does not exist
 */
function jsonlReferenceReplayStore(path) {
  const lock = getJsonlStoreLock(path);
  const appendRun = (run) => {
    mkdirSync2(dirname2(path), { recursive: true });
    appendFileSync2(path, JSON.stringify(run) + "\n");
  };
  const readRuns = () => existsSync4(path) ? readJsonl(path) : [];
  return {
    async save(run) {
      await lock.runExclusive(() => {
        appendRun(run);
      });
    },
    async list() {
      return lock.runExclusive(() => readRuns());
    }
  };
}
|
|
@@ -10231,6 +10329,561 @@ function samePopulation(a, b) {
|
|
|
10231
10329
|
return b.every((id) => setA.has(id));
|
|
10232
10330
|
}
|
|
10233
10331
|
|
|
10332
|
+
// src/jsonl-trial-cache.ts
|
|
10333
|
+
import { appendFileSync as appendFileSync4, existsSync as existsSync6, mkdirSync as mkdirSync4, readFileSync as readFileSync5 } from "fs";
|
|
10334
|
+
import { dirname as dirname4 } from "path";
|
|
10335
|
+
|
|
10336
|
+
// src/locked-jsonl-appender.ts
|
|
10337
|
+
import { appendFileSync as appendFileSync3, existsSync as existsSync5, mkdirSync as mkdirSync3 } from "fs";
|
|
10338
|
+
import { dirname as dirname3 } from "path";
|
|
10339
|
+
// One mutex per file path, shared by every appender created for that path.
var mutexes = /* @__PURE__ */ new Map();
/** Returns the mutex registered for `path`, creating it on first request. */
function getMutex(path) {
  const known = mutexes.get(path);
  if (known) {
    return known;
  }
  const fresh = new Mutex();
  mutexes.set(path, fresh);
  return fresh;
}
|
|
10348
|
+
/**
 * Appends JSON lines to a file while holding the mutex registered for that
 * path. The parent directory is created at construction time if it is missing.
 */
var LockedJsonlAppender = class {
  path;
  mutex;
  /** @param path target file; its mutex is obtained through `getMutex` */
  constructor(path) {
    this.path = path;
    this.mutex = getMutex(path);
    const parentDir = dirname3(path);
    if (!existsSync5(parentDir)) {
      mkdirSync3(parentDir, { recursive: true });
    }
  }
  /** Serializes `entry` up front, then writes it as one line under the lock. */
  async append(entry) {
    const line = JSON.stringify(entry) + "\n";
    await this.mutex.runExclusive(() => {
      appendFileSync3(this.path, line);
    });
  }
};
|
|
10366
|
+
/**
 * Empties the per-path mutex registry. Appenders created earlier keep the
 * mutex they already hold; appenders created afterwards get a fresh one.
 */
function resetLockedAppendersForTesting() {
  mutexes.clear();
}
|
|
10369
|
+
|
|
10370
|
+
// src/jsonl-trial-cache.ts
|
|
10371
|
+
/**
 * Key → result cache persisted as a JSONL file. Existing lines are loaded
 * into memory at construction time (later lines win); writes update the map
 * immediately and append a `{ key, result, writtenAt }` line to the file.
 *
 * Fixes over the previous version:
 * - Lines that parse as JSON but are not entry objects (for example `5` or
 *   `[]`) are skipped instead of being stored under the key `undefined`.
 * - A failed background append in `set()` is reported with `console.warn`
 *   rather than surfacing as an unhandled promise rejection.
 */
var JsonlTrialCache = class {
  map = /* @__PURE__ */ new Map();
  path;
  appender;
  /** @param path JSONL file backing the cache; created lazily on first write */
  constructor(path) {
    this.path = path;
    if (existsSync6(path)) {
      for (const line of readFileSync5(path, "utf-8").split("\n")) {
        if (!line.trim()) continue;
        let entry;
        try {
          entry = JSON.parse(line);
        } catch {
          // Tolerate torn or corrupt lines.
          continue;
        }
        const isEntry = entry !== null && typeof entry === "object" && "key" in entry;
        if (!isEntry) continue;
        this.map.set(entry.key, entry.result);
      }
    } else {
      mkdirSync4(dirname4(path), { recursive: true });
    }
    this.appender = new LockedJsonlAppender(path);
  }
  /** @returns the cached result for `key`, or undefined when absent */
  get(key) {
    return this.map.get(key);
  }
  /**
   * Stores the value in memory and schedules the line to be appended in the
   * background. Returns before the line is on disk.
   */
  set(key, value) {
    this.map.set(key, value);
    const line = { key, result: value, writtenAt: Date.now() };
    // Fire-and-forget by design; the value stays cached in memory even when
    // persisting it fails.
    this.appender.append(line).catch((err) => {
      console.warn(`JsonlTrialCache: failed to persist entry to ${this.path}: ${err instanceof Error ? err.message : String(err)}`);
    });
  }
  /** @returns the number of cached keys */
  size() {
    return this.map.size;
  }
  /**
   * Synchronous fallback path for tests / CLI tools that want to be sure
   * the line is on disk before returning. Bypasses the mutex (single-
   * threaded callers only).
   */
  setSync(key, value) {
    this.map.set(key, value);
    const line = { key, result: value, writtenAt: Date.now() };
    appendFileSync4(this.path, `${JSON.stringify(line)}\n`);
  }
};
|
|
10414
|
+
|
|
10415
|
+
// src/evolution-telemetry.ts
|
|
10416
|
+
import { appendFileSync as appendFileSync5, existsSync as existsSync7, mkdirSync as mkdirSync5, readFileSync as readFileSync6, writeFileSync } from "fs";
|
|
10417
|
+
import { dirname as dirname5 } from "path";
|
|
10418
|
+
/** Append-only log of mutation attempts, one JSON line per record. */
var MutationTelemetry = class {
  appender;
  /** @param path JSONL file the records are appended to */
  constructor(path) {
    const appender = new LockedJsonlAppender(path);
    this.appender = appender;
  }
  /** Appends one attempt record and resolves once the appender has written it. */
  async record(attempt) {
    const { appender } = this;
    await appender.append(attempt);
  }
};
|
|
10427
|
+
/** Append-only log of trial attempts, one JSON line per record. */
var TrialTelemetry = class {
  appender;
  /** @param path JSONL file the records are appended to */
  constructor(path) {
    const appender = new LockedJsonlAppender(path);
    this.appender = appender;
  }
  /** Appends one attempt record and resolves once the appender has written it. */
  async record(attempt) {
    const { appender } = this;
    await appender.append(attempt);
  }
};
|
|
10436
|
+
/**
 * Records variant lineage nodes. State is held in memory and persisted as an
 * append-only JSONL event log at `path`; `compact()` additionally writes the
 * consolidated state to `<path>.snapshot`.
 *
 * On construction the snapshot is loaded first, then the log is replayed on
 * top of it (later entries for the same id are merged over earlier ones). A
 * log file holding a single JSON array — the legacy format — is still
 * understood.
 *
 * Fixes over the previous version:
 * - A legacy array stored on one line used to be replayed as a single bogus
 *   node keyed `undefined`; it is now detected and loaded as an array.
 * - Log entries that are not objects carrying an `id` are skipped.
 * - The first `upsert` on a legacy file used to truncate it and write only
 *   the new node, dropping every legacy node from disk. The file is now
 *   rewritten as JSONL holding the full in-memory state.
 */
var LineageRecorder = class {
  path;
  snapshotPath;
  mutex = new Mutex();
  nodes = /* @__PURE__ */ new Map();
  kindOf;
  /**
   * @param path event-log file; its parent directory is created if missing
   * @param kindOf optional classifier used by `upsertVariant`; defaults to
   *   `defaultKindOf`
   */
  constructor(path, kindOf) {
    this.path = path;
    this.snapshotPath = `${path}.snapshot`;
    this.kindOf = kindOf ?? defaultKindOf;
    mkdirSync5(dirname5(path), { recursive: true });
    const isNode = (v) => v !== null && typeof v === "object" && !Array.isArray(v) && v.id !== void 0;
    if (existsSync7(this.snapshotPath)) {
      try {
        const parsed = JSON.parse(readFileSync6(this.snapshotPath, "utf-8"));
        for (const n of parsed) this.nodes.set(n.id, n);
      } catch {
        // An unreadable snapshot is ignored; the log is still replayed below.
      }
    }
    if (!existsSync7(path)) return;
    let raw;
    try {
      raw = readFileSync6(path, "utf-8");
    } catch {
      return;
    }
    if (raw.trimStart().startsWith("[")) {
      try {
        const parsed = JSON.parse(raw);
        if (Array.isArray(parsed)) {
          // Legacy nodes only fill gaps; they never override snapshot state.
          for (const n of parsed) {
            if (isNode(n) && !this.nodes.has(n.id)) this.nodes.set(n.id, n);
          }
          return;
        }
      } catch {
        // Not a valid array after all; fall through to line-by-line replay.
      }
    }
    for (const line of raw.split("\n")) {
      if (!line.trim()) continue;
      let entry;
      try {
        entry = JSON.parse(line);
      } catch {
        continue;
      }
      if (!isNode(entry)) continue;
      this.nodes.set(entry.id, { ...this.nodes.get(entry.id), ...entry });
    }
  }
  /**
   * Merges `node` over any existing node with the same id and persists the
   * merged node, under the recorder's mutex.
   */
  async upsert(node) {
    await this.mutex.runExclusive(() => {
      const merged = { ...this.nodes.get(node.id), ...node };
      this.nodes.set(node.id, merged);
      let legacyArrayFile = false;
      try {
        if (existsSync7(this.path)) {
          const content = readFileSync6(this.path, { encoding: "utf-8", flag: "r" });
          legacyArrayFile = content.trimStart().startsWith("[");
        }
      } catch {
        // Could not inspect the file; treat it as a regular log.
      }
      if (legacyArrayFile) {
        // Migrate: replace the array with one JSON line per known node. The
        // in-memory state already contains the node being upserted.
        const lines = [...this.nodes.values()].map((n) => JSON.stringify(n));
        writeFileSync(this.path, `${lines.join("\n")}\n`);
        return;
      }
      appendFileSync5(this.path, `${JSON.stringify(merged)}\n`);
    });
  }
  /**
   * Upserts a node derived from a variant: id, parent id (null when absent),
   * generation, kind, and the rationale when one is present.
   */
  async upsertVariant(variant) {
    await this.upsert({
      id: variant.id,
      parentId: variant.parentId ?? null,
      generation: variant.generation,
      kind: this.kindOf(variant),
      ...variant.rationale ? { rationale: variant.rationale } : {}
    });
  }
  /** @returns a new array holding the current nodes */
  snapshot() {
    return [...this.nodes.values()];
  }
  /**
   * Write the current consolidated state to `<path>.snapshot` so external
   * tools can read it without replaying the event log. Idempotent.
   */
  async compact() {
    await this.mutex.runExclusive(() => {
      writeFileSync(this.snapshotPath, JSON.stringify([...this.nodes.values()], null, 2));
    });
  }
};
|
|
10518
|
+
/**
 * Default variant classifier: "seed" when `parentId` is undefined, "code"
 * when the payload is an object with a truthy `codeMutation`, else "prompt".
 */
function defaultKindOf(variant) {
  if (variant.parentId === void 0) {
    return "seed";
  }
  const { payload } = variant;
  const isCodeMutation = payload && typeof payload === "object" && payload.codeMutation;
  return isCodeMutation ? "code" : "prompt";
}
|
|
10524
|
+
/** Creates a fresh per-generation cost bucket with every counter at zero. */
function emptyGenBucket() {
  const counters = [
    "mutatorPromptUsd",
    "mutatorCodeUsd",
    "scorerPromptUsd",
    "scorerCodeUsd",
    "trialsCounted",
    "cachedTrials"
  ];
  return Object.fromEntries(counters.map((counter) => [counter, 0]));
}
|
|
10534
|
+
/**
 * Running ledger of mutation and scoring spend, persisted as JSON at `path`.
 *
 * Totals are kept both overall and per generation (`byGeneration`, keyed by the
 * stringified generation number). Every mutating call runs inside
 * `this.mutex.runExclusive` and rewrites the whole file synchronously.
 */
var CostLedger = class {
  totals = {
    mutatorPromptUsd: 0,
    mutatorCodeUsd: 0,
    scorerPromptUsd: 0,
    scorerCodeUsd: 0,
    trialsCounted: 0,
    cachedTrials: 0,
    poolBusyMs: 0,
    poolUtilizationPct: 0,
    byGeneration: {}
  };
  path;
  mutex = new Mutex();
  /**
   * Open a ledger. If `path` exists its totals are restored; otherwise the
   * parent directory is created so the first persist can succeed.
   *
   * @param {string} path - Location of the ledger JSON file.
   */
  constructor(path) {
    this.path = path;
    if (!existsSync7(path)) {
      mkdirSync5(dirname5(path), { recursive: true });
      return;
    }
    try {
      const loaded = JSON.parse(readFileSync6(path, "utf-8"));
      if (!loaded || typeof loaded !== "object") {
        throw new TypeError("ledger file does not contain a JSON object");
      }
      for (const k of Object.keys(this.totals)) {
        if (k === "byGeneration") {
          if (loaded.byGeneration && typeof loaded.byGeneration === "object") {
            this.totals.byGeneration = this.sanitizeGenerations(loaded.byGeneration);
          }
          continue;
        }
        const v = loaded[k];
        if (typeof v === "number" && Number.isFinite(v)) {
          this.totals[k] = v;
        }
      }
    } catch (err) {
      // Still best-effort (a damaged ledger must not block a run), but say so:
      // silently restarting from zero would hide that earlier spend was lost.
      console.warn(`[cost-ledger] could not restore ${path}; continuing with the totals read so far:`, err);
    }
  }
  /**
   * Copy restored per-generation buckets, forcing every known counter to a
   * finite number. Without this, a bucket that lacks a counter (or holds a
   * string/null) would turn later `+=` updates into NaN or concatenation.
   *
   * @param {object} raw - The `byGeneration` value read from disk.
   * @returns {object} A plain object of repaired buckets.
   */
  sanitizeGenerations(raw) {
    const restored = {};
    const counterNames = Object.keys(emptyGenBucket());
    for (const [generation, candidate] of Object.entries(raw)) {
      if (!candidate || typeof candidate !== "object") continue;
      const bucket = { ...candidate };
      for (const name of counterNames) {
        const value = bucket[name];
        if (typeof value !== "number" || !Number.isFinite(value)) bucket[name] = 0;
      }
      restored[generation] = bucket;
    }
    return restored;
  }
  /**
   * Return the bucket for `generation`, creating a zeroed one on first use.
   *
   * @param {number | undefined} generation
   * @returns {object | null} The bucket, or null when no generation was given.
   */
  genBucket(generation) {
    if (generation === void 0) return null;
    const key = String(generation);
    if (!this.totals.byGeneration[key]) {
      this.totals.byGeneration[key] = emptyGenBucket();
    }
    return this.totals.byGeneration[key];
  }
  /**
   * Record the cost of one mutation call and persist.
   *
   * @param {string} channel - "prompt" is booked as prompt-mutator spend; any other value as code-mutator spend.
   * @param {number} usd - Cost in USD. Non-finite values are booked as 0 so one bad reading cannot turn the totals into NaN.
   * @param {{ generation?: number }} [opts] - Optional generation to attribute the cost to.
   * @returns {Promise<void>}
   */
  async addMutation(channel, usd, opts = {}) {
    const amount = Number.isFinite(usd) ? usd : 0;
    await this.mutex.runExclusive(() => {
      const bucket = this.genBucket(opts.generation);
      if (channel === "prompt") {
        this.totals.mutatorPromptUsd += amount;
        if (bucket) bucket.mutatorPromptUsd += amount;
      } else {
        this.totals.mutatorCodeUsd += amount;
        if (bucket) bucket.mutatorCodeUsd += amount;
      }
      this.persist();
    });
  }
  /**
   * Record one scored trial and persist. Cached trials are counted in both
   * `cachedTrials` and `trialsCounted`, but add no cost.
   *
   * @param {string} channel - "prompt" is booked as prompt-scorer spend; any other value as code-scorer spend.
   * @param {number} usd - Cost in USD; ignored when `cached` is truthy. Non-finite values are booked as 0.
   * @param {boolean} cached - Whether the trial result came from a cache.
   * @param {{ generation?: number }} [opts] - Optional generation to attribute the trial to.
   * @returns {Promise<void>}
   */
  async addTrial(channel, usd, cached, opts = {}) {
    const amount = Number.isFinite(usd) ? usd : 0;
    await this.mutex.runExclusive(() => {
      const bucket = this.genBucket(opts.generation);
      if (cached) {
        this.totals.cachedTrials++;
        if (bucket) bucket.cachedTrials++;
      } else if (channel === "prompt") {
        this.totals.scorerPromptUsd += amount;
        if (bucket) bucket.scorerPromptUsd += amount;
      } else {
        this.totals.scorerCodeUsd += amount;
        if (bucket) bucket.scorerCodeUsd += amount;
      }
      this.totals.trialsCounted++;
      if (bucket) bucket.trialsCounted++;
      this.persist();
    });
  }
  /**
   * Overwrite the pool utilisation figures and persist.
   *
   * @param {number} busyMs - Accumulated busy time in milliseconds.
   * @param {number} totalMs - Elapsed time in milliseconds; a value <= 0 yields 0%.
   * @returns {Promise<void>}
   */
  async setPoolUtilization(busyMs, totalMs) {
    await this.mutex.runExclusive(() => {
      this.totals.poolBusyMs = busyMs;
      this.totals.poolUtilizationPct = totalMs > 0 ? 100 * busyMs / totalMs : 0;
      this.persist();
    });
  }
  /**
   * Produce a read-only view of the ledger.
   *
   * @returns {object} The counters, `totalUsd` (sum of the four USD counters)
   *   and `byGeneration` as an array sorted by ascending generation number.
   */
  snapshot() {
    const totalUsd = this.totals.mutatorPromptUsd + this.totals.mutatorCodeUsd + this.totals.scorerPromptUsd + this.totals.scorerCodeUsd;
    const byGeneration = Object.entries(this.totals.byGeneration).map(([g, b]) => ({ generation: Number(g), ...b })).sort((a, b) => a.generation - b.generation);
    return {
      totalUsd,
      mutatorPromptUsd: this.totals.mutatorPromptUsd,
      mutatorCodeUsd: this.totals.mutatorCodeUsd,
      scorerPromptUsd: this.totals.scorerPromptUsd,
      scorerCodeUsd: this.totals.scorerCodeUsd,
      trialsCounted: this.totals.trialsCounted,
      cachedTrials: this.totals.cachedTrials,
      poolBusyMs: this.totals.poolBusyMs,
      poolUtilizationPct: this.totals.poolUtilizationPct,
      byGeneration
    };
  }
  /** Synchronously rewrite the ledger file with the current totals as indented JSON. */
  persist() {
    writeFileSync(this.path, JSON.stringify(this.totals, null, 2));
  }
};
|
|
10644
|
+
|
|
10645
|
+
// src/composite-mutator.ts
|
|
10646
|
+
/**
 * Combine a primary and an optional secondary mutator behind one `mutate`
 * call, choosing between them per generation according to `opts.policy`.
 *
 * Policies:
 *  - "primary-only":   always the primary mutator.
 *  - "secondary-only": the secondary mutator (primary if none is wired).
 *  - "alternate":      secondary on odd generations, primary on even ones.
 *  - "plateau":        primary until the last `plateauPatience` score deltas
 *                      are all below `plateauThreshold`, then split the
 *                      children between both mutators.
 *  - anything else:    falls back to the primary mutator.
 *
 * @param {object} opts
 * @param {string} opts.policy - One of the policies above.
 * @param {{ mutate: Function }} opts.primary - Mutator used by default.
 * @param {{ mutate: Function }} [opts.secondary] - Optional second mutator.
 * @param {number} [opts.plateauThreshold=0.02] - A delta below this counts as stagnant.
 * @param {number} [opts.plateauPatience=2] - Number of consecutive deltas inspected.
 * @param {Function} [opts.onPolicyDecision] - Called with `{ generation, chose, reason }` before mutating.
 * @returns {{ mutate: (args: object) => Promise<Array> }} The composite mutator.
 */
function createCompositeMutator(opts) {
  const recentScores = [];
  const plateauThreshold = opts.plateauThreshold ?? 0.02;
  const plateauPatience = opts.plateauPatience ?? 2;
  // Decide which mutator(s) to run; the parent's mean score is recorded on
  // every call, whatever the policy.
  function pickMode(args) {
    recentScores.push(args.parentAggregate.meanScore);
    switch (opts.policy) {
      case "primary-only":
        return { mode: "primary", reason: "policy=primary-only" };
      case "secondary-only":
        if (!opts.secondary) return { mode: "primary", reason: "secondary-only requested but no secondary mutator wired" };
        return { mode: "secondary", reason: "policy=secondary-only" };
      case "alternate":
        if (!opts.secondary) return { mode: "primary", reason: "alternate requested but no secondary mutator wired" };
        return args.generation % 2 === 1 ? { mode: "secondary", reason: `alternate: gen${args.generation} odd \u2192 secondary` } : { mode: "primary", reason: `alternate: gen${args.generation} even \u2192 primary` };
      case "plateau": {
        if (!opts.secondary) return { mode: "primary", reason: "plateau requested but no secondary mutator wired" };
        if (recentScores.length <= plateauPatience) {
          return { mode: "primary", reason: "plateau: warming up with primary mutations" };
        }
        // Last `plateauPatience + 1` scores give `plateauPatience` deltas.
        const window = recentScores.slice(-plateauPatience - 1);
        const deltas = window.slice(1).map((v, i) => v - window[i]);
        const stagnant = deltas.every((d) => d < plateauThreshold);
        if (stagnant) {
          return {
            mode: "split",
            reason: `plateau detected (${deltas.map((d) => d.toFixed(3)).join(", ")}) \u2192 split`
          };
        }
        return {
          mode: "primary",
          reason: `plateau: still improving (${deltas[deltas.length - 1].toFixed(3)})`
        };
      }
      default:
        // Without this branch an unrecognised or missing policy returned
        // undefined and `mutate` crashed while destructuring the decision.
        return { mode: "primary", reason: `unknown policy ${String(opts.policy)} \u2192 primary` };
    }
  }
  return {
    async mutate(args) {
      const { mode, reason } = pickMode(args);
      opts.onPolicyDecision?.({ generation: args.generation, chose: mode, reason });
      if (mode === "secondary" && opts.secondary) return opts.secondary.mutate(args);
      if (mode === "split" && opts.secondary) {
        // The secondary mutator gets the larger half when childCount is odd.
        const secondaryShare = Math.ceil(args.childCount / 2);
        const primaryShare = args.childCount - secondaryShare;
        // A mutator whose share is exactly 0 is skipped rather than invoked
        // for zero children (e.g. primary when childCount is 1).
        const [primaryChildren, secondaryChildren] = await Promise.all([
          primaryShare === 0 ? [] : opts.primary.mutate({ ...args, childCount: primaryShare }),
          secondaryShare === 0 ? [] : opts.secondary.mutate({ ...args, childCount: secondaryShare })
        ]);
        return [...primaryChildren, ...secondaryChildren];
      }
      return opts.primary.mutate(args);
    }
  };
}
|
|
10701
|
+
|
|
10702
|
+
// src/sandbox-pool.ts
|
|
10703
|
+
/**
 * Create a bounded, lazily-filled pool of sandbox slots.
 *
 * Slots are minted on demand through `opts.factory.create` until `opts.size`
 * is reached; further callers queue until a slot is released. All bookkeeping
 * runs in synchronous sections (no `await` between a check and the state
 * change it guards), so the decisions are atomic on the JS event loop.
 *
 * @param {object} opts
 * @param {number} opts.size - Maximum number of slots; must be >= 1.
 * @param {object} opts.factory
 * @param {(id: string) => any} opts.factory.create - Builds the resource for a new slot (may be async).
 * @param {(slot: object) => any} [opts.factory.reset] - Optional clean-up run on every release.
 * @param {(slot: object) => any} opts.factory.destroy - Tears a slot down during `drain`.
 * @returns {object} `{ checkout, withSlot, drain, poolSize, activeCheckouts, utilization }`.
 * @throws {Error} When `opts.size` is not a number >= 1.
 */
function createSandboxPool(opts) {
  // Written as a negated `>=` so that NaN/undefined are rejected too; a NaN
  // size would otherwise never mint a slot and every checkout would hang.
  if (!(opts.size >= 1)) throw new Error(`sandbox pool size must be >= 1 (got ${opts.size})`);
  const slots = [];
  // Queue of { resolve, reject } for callers waiting on a slot.
  const waiters = [];
  // Slots whose factory.create() is still in flight. They count against the
  // size limit so concurrent checkouts cannot over-provision the pool.
  let pendingCreates = 0;
  let nextSlotId = 0;
  let totalCheckouts = 0;
  let busyMs = 0;
  const startedAt = Date.now();

  // Create a new slot, already marked busy for the caller that minted it.
  async function mintSlot() {
    const id = `slot_${nextSlotId++}`;
    pendingCreates++;
    let resource;
    try {
      resource = await opts.factory.create(id);
    } catch (err) {
      pendingCreates--;
      // The reserved capacity is free again. A queued caller may have been
      // waiting only because of this reservation, so let it try to mint its
      // own slot instead of waiting for a release that may never come.
      const next = waiters.shift();
      if (next) mintSlot().then(next.resolve, next.reject);
      throw err;
    }
    pendingCreates--;
    const state = { slot: { id, resource }, busy: true, retired: false };
    slots.push(state);
    return state;
  }

  // Hand out an idle slot, mint one if capacity allows, otherwise queue.
  async function acquireSlot() {
    const idle = slots.find((s) => !s.busy);
    if (idle) {
      idle.busy = true;
      return idle;
    }
    if (slots.length + pendingCreates < opts.size) {
      return mintSlot();
    }
    // Registered in the same synchronous section as the checks above, so a
    // release cannot slip in between "pool is full" and "start waiting".
    return new Promise((resolve, reject) => {
      waiters.push({ resolve, reject });
    });
  }

  // Reset the slot (best effort) and pass it to the next waiter, if any.
  function releaseSlot(state) {
    void (async () => {
      if (!state.retired) {
        try {
          if (opts.factory.reset) await opts.factory.reset(state.slot);
        } catch (err) {
          console.warn(`[sandbox-pool] reset failed for slot ${state.slot.id}:`, err);
        }
      }
      state.busy = false;
      // A slot removed by drain() has been destroyed; never hand it out again.
      if (state.retired) return;
      const next = waiters.shift();
      if (next) {
        state.busy = true;
        next.resolve(state);
      }
    })();
  }

  /**
   * Acquire a slot, waiting if the pool is exhausted.
   *
   * @returns {Promise<{ slot: object, release: () => void }>} The slot and an
   *   idempotent `release` callback. Rejects if slot creation fails or the
   *   pool is drained while waiting.
   */
  async function checkout() {
    const state = await acquireSlot();
    const checkoutStart = Date.now();
    totalCheckouts++;
    let released = false;
    return {
      slot: state.slot,
      release: () => {
        // Guard against double release, which would double-count busy time
        // and could give the same slot to two callers.
        if (released) return;
        released = true;
        busyMs += Date.now() - checkoutStart;
        releaseSlot(state);
      }
    };
  }

  /**
   * Run `fn` with a checked-out slot and release it afterwards, even on error.
   *
   * @param {(slot: object) => any} fn
   * @returns {Promise<any>} Whatever `fn` resolves to.
   */
  async function withSlot(fn) {
    const { slot, release } = await checkout();
    try {
      return await fn(slot);
    } finally {
      release();
    }
  }

  /**
   * Remove every slot from the pool and destroy it. Callers still queued for
   * a slot are rejected instead of being left pending forever. Destroy
   * failures are collected, not thrown.
   *
   * @returns {Promise<void>}
   */
  async function drain() {
    const taken = slots.splice(0, slots.length);
    for (const state of taken) state.retired = true;
    for (const waiter of waiters.splice(0, waiters.length)) {
      waiter.reject(new Error("sandbox pool drained while waiting for a slot"));
    }
    // The async wrapper turns a synchronous throw from destroy() into a
    // rejection, so one bad slot cannot abort the teardown of the others.
    await Promise.allSettled(taken.map(async (s) => opts.factory.destroy(s.slot)));
  }

  /**
   * @returns {{ busyMs: number, totalMs: number, checkouts: number }} Busy
   *   time accumulated by released checkouts, time since pool creation, and
   *   the number of successful checkouts.
   */
  function utilization() {
    return {
      busyMs,
      totalMs: Date.now() - startedAt,
      checkouts: totalCheckouts
    };
  }

  return {
    checkout,
    withSlot,
    drain,
    poolSize: () => slots.length,
    activeCheckouts: () => slots.filter((s) => s.busy).length,
    utilization
  };
}
|
|
10803
|
+
|
|
10804
|
+
// src/code-mutator.ts
|
|
10805
|
+
/**
 * Build a mutator that produces code variants by running `opts.runner` inside
 * a slot borrowed from `opts.pool`.
 *
 * For every outcome the runner returns, in order: telemetry is recorded (if
 * `opts.mutationTelemetry` is set), the cost is booked (if `opts.costLedger`
 * is set and the outcome has a `costUsd`), and successful outcomes become
 * variants that are also upserted into `opts.lineage` when present. A runner
 * that throws yields a single failed outcome with reason "runner_error".
 *
 * @param {object} opts
 * @param {object} opts.pool - Provides `withSlot(fn)` and `utilization()`.
 * @param {Function} opts.runner - Receives `{ slot, parent, parentAggregate, topTrials, bottomTrials, childCount, generation }`.
 * @param {Function} opts.toVariantPayload - `(outcome, parent) => payload` for successful outcomes.
 * @param {Function} [opts.childIdFor] - `(parent, generation, index) => id`.
 * @param {Function} [opts.labelFor] - `(outcome, parent, generation, index) => label`.
 * @param {object} [opts.mutationTelemetry] - Provides `record(entry)`.
 * @param {object} [opts.costLedger] - Provides `addMutation` and `setPoolUtilization`.
 * @param {object} [opts.lineage] - Provides `upsert(node)`.
 * @returns {{ mutate: (args: object) => Promise<Array<object>> }}
 */
function createSandboxCodeMutator(opts) {
  const fallbackChildId = (parent, generation, index) => `${parent.id}.g${generation}.code.${index}`;
  const fallbackLabel = (outcome, parent, _generation, index) => outcome.description?.slice(0, 80) ?? `${parent.label} \u2192 code.${index}`;
  const childIdFor = opts.childIdFor ?? fallbackChildId;
  const labelFor = opts.labelFor ?? fallbackLabel;

  // Run the caller-supplied runner in a pooled slot; a throw becomes one failed outcome.
  function collectOutcomes(args, startedAt) {
    return opts.pool.withSlot(async (slot) => {
      const request = {
        slot,
        parent: args.parent,
        parentAggregate: args.parentAggregate,
        topTrials: args.topTrials,
        bottomTrials: args.bottomTrials,
        childCount: args.childCount,
        generation: args.generation
      };
      try {
        return await opts.runner(request);
      } catch (err) {
        const message = err instanceof Error ? err.message : String(err);
        return [{
          ok: false,
          failureReason: "runner_error",
          description: message,
          latencyMs: Date.now() - startedAt
        }];
      }
    });
  }

  // Shape one telemetry entry; failed outcomes carry a null childId.
  function telemetryEntry(outcome, parent, generation, childId) {
    return {
      ts: Date.now(),
      channel: "code",
      generation,
      parentId: parent.id,
      childId: outcome.ok ? childId : null,
      ok: outcome.ok,
      failureReason: outcome.failureReason,
      description: outcome.description,
      latencyMs: outcome.latencyMs,
      diffBytes: outcome.diffBytes,
      filesTouched: outcome.filesTouched,
      agentSteps: outcome.agentSteps,
      costUsd: outcome.costUsd
    };
  }

  async function mutate(args) {
    const { parent, generation } = args;
    const startedAt = Date.now();
    const outcomes = await collectOutcomes(args, startedAt);
    const variants = [];
    // The position advances for failed outcomes too, so default ids and
    // labels reflect the outcome's place in the runner's result.
    let position = 0;
    for (const outcome of outcomes) {
      const index = position;
      position += 1;
      const childId = outcome.childId ?? childIdFor(parent, generation, index);
      if (opts.mutationTelemetry) {
        await opts.mutationTelemetry.record(telemetryEntry(outcome, parent, generation, childId));
      }
      if (opts.costLedger && outcome.costUsd !== void 0) {
        await opts.costLedger.addMutation("code", outcome.costUsd, { generation });
      }
      if (!outcome.ok) continue;
      const payload = opts.toVariantPayload(outcome, parent);
      const label = labelFor(outcome, parent, generation, index);
      const variant = { id: childId, payload, generation, parentId: parent.id, label };
      if (outcome.rationale) variant.rationale = outcome.rationale;
      variants.push(variant);
      if (opts.lineage) {
        const node = {
          id: variant.id,
          parentId: variant.parentId ?? null,
          generation: variant.generation,
          kind: "code"
        };
        if (variant.rationale) node.rationale = variant.rationale;
        await opts.lineage.upsert(node);
      }
    }
    if (opts.costLedger) {
      const { busyMs, totalMs } = opts.pool.utilization();
      await opts.costLedger.setPoolUtilization(busyMs, totalMs);
    }
    return variants;
  }

  return { mutate };
}
|
|
10886
|
+
|
|
10234
10887
|
// src/golden-matcher.ts
|
|
10235
10888
|
function matchGoldens(goldens, candidates, options = {}) {
|
|
10236
10889
|
const extract = options.text ?? defaultExtract5;
|
|
@@ -10593,7 +11246,9 @@ export {
|
|
|
10593
11246
|
BudgetGuard,
|
|
10594
11247
|
BuilderSession,
|
|
10595
11248
|
ConvergenceTracker,
|
|
11249
|
+
CostLedger,
|
|
10596
11250
|
CostTracker,
|
|
11251
|
+
D1ExperimentStore,
|
|
10597
11252
|
DEFAULT_AGENT_SLOS,
|
|
10598
11253
|
DEFAULT_COMPLEXITY_WEIGHTS,
|
|
10599
11254
|
DEFAULT_RULES as DEFAULT_FAILURE_RULES,
|
|
@@ -10611,6 +11266,7 @@ export {
|
|
|
10611
11266
|
ERROR_COUNT_PATTERNS,
|
|
10612
11267
|
ExperimentTracker,
|
|
10613
11268
|
FAILURE_CLASSES,
|
|
11269
|
+
FileSystemExperimentStore,
|
|
10614
11270
|
FileSystemOutcomeStore,
|
|
10615
11271
|
FileSystemTraceStore,
|
|
10616
11272
|
HoldoutAuditor,
|
|
@@ -10621,12 +11277,17 @@ export {
|
|
|
10621
11277
|
InMemoryTraceStore,
|
|
10622
11278
|
InMemoryTrialCache,
|
|
10623
11279
|
InMemoryWorkspaceInspector,
|
|
11280
|
+
JsonlTrialCache,
|
|
10624
11281
|
JudgeRunner,
|
|
11282
|
+
LineageRecorder,
|
|
10625
11283
|
LlmCallError,
|
|
10626
11284
|
LlmClient,
|
|
11285
|
+
LockedJsonlAppender,
|
|
10627
11286
|
MODEL_PRICING,
|
|
10628
11287
|
MetricsCollector,
|
|
10629
11288
|
MultiLayerVerifier,
|
|
11289
|
+
MutationTelemetry,
|
|
11290
|
+
Mutex,
|
|
10630
11291
|
OTEL_AGENT_EVAL_SCOPE,
|
|
10631
11292
|
OptimizationLoop,
|
|
10632
11293
|
PairwiseSteeringOptimizer,
|
|
@@ -10644,6 +11305,7 @@ export {
|
|
|
10644
11305
|
TRACE_SCHEMA_VERSION,
|
|
10645
11306
|
TokenCounter,
|
|
10646
11307
|
TraceEmitter,
|
|
11308
|
+
TrialTelemetry,
|
|
10647
11309
|
UNIVERSAL_FINDERS,
|
|
10648
11310
|
adversarialJudge,
|
|
10649
11311
|
aggregateLlm,
|
|
@@ -10688,11 +11350,14 @@ export {
|
|
|
10688
11350
|
correlateLayers,
|
|
10689
11351
|
correlationStudy,
|
|
10690
11352
|
createAntiSlopJudge,
|
|
11353
|
+
createCompositeMutator,
|
|
10691
11354
|
createCustomJudge,
|
|
10692
11355
|
createDefaultReviewer,
|
|
10693
11356
|
createDomainExpertJudge,
|
|
10694
11357
|
createIntentMatchJudge,
|
|
10695
11358
|
createLlmReviewer,
|
|
11359
|
+
createSandboxCodeMutator,
|
|
11360
|
+
createSandboxPool,
|
|
10696
11361
|
createSemanticConceptJudge,
|
|
10697
11362
|
crossTraceDiff,
|
|
10698
11363
|
crowdingDistance,
|
|
@@ -10803,6 +11468,7 @@ export {
|
|
|
10803
11468
|
replayScorerOverCorpus,
|
|
10804
11469
|
replayTraceThroughJudge,
|
|
10805
11470
|
requiredSampleSize,
|
|
11471
|
+
resetLockedAppendersForTesting,
|
|
10806
11472
|
resumeBuilderSession,
|
|
10807
11473
|
rowCount,
|
|
10808
11474
|
rowWhere,
|