@tangle-network/agent-eval 0.12.0 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,3 +1,12 @@
1
+ import {
2
+ LlmCallError,
3
+ LlmClient,
4
+ callLlm,
5
+ callLlmJson,
6
+ probeLlm,
7
+ stripFencedJson
8
+ } from "./chunk-ITN4YOZY.js";
9
+
1
10
  // src/client.ts
2
11
  var ProductClient = class {
3
12
  baseUrl;
@@ -1926,6 +1935,244 @@ function rand(bytes) {
1926
1935
  return Array.from(arr).map((b) => b.toString(16).padStart(2, "0")).join("");
1927
1936
  }
1928
1937
 
1938
+ // src/experiment-tracker-fs.ts
1939
/**
 * Experiment store persisted as newline-delimited JSON files in a directory.
 *
 * Writes are appended to `<dir>/experiments.ndjson` and `<dir>/runs.ndjson`.
 * Once an active file reaches `maxBytes` it is renamed to
 * `<name>.<Date.now()>.ndjson` and a new active file is started. Reads are
 * answered from an in-memory index that is built once per instance by
 * replaying every `.ndjson` file found in the directory.
 */
var FileSystemExperimentStore = class {
  dir;
  maxBytes;
  index;
  loaded = false;
  // In-flight index build. Concurrent callers of load() share it so that they
  // all receive the same index instead of each building a private one.
  loading;
  /**
   * @param {{ dir: string, maxBytes?: number }} options
   *   `dir` is the storage directory; `maxBytes` is the rotation threshold
   *   for an active file and defaults to 32 MiB.
   */
  constructor(options) {
    this.dir = options.dir;
    this.maxBytes = options.maxBytes ?? 32 * 1024 * 1024;
  }
  /** Records the experiment in the index, then appends it to disk. */
  async saveExperiment(exp) {
    const idx = await this.load();
    await idx.saveExperiment(exp);
    await this.append("experiments", exp);
  }
  /** Looks an experiment up in the in-memory index. */
  async getExperiment(id) {
    const idx = await this.load();
    return idx.getExperiment(id);
  }
  /** Lists experiments from the in-memory index. */
  async listExperiments() {
    const idx = await this.load();
    return idx.listExperiments();
  }
  /** Records the run in the index, then appends it to disk. */
  async saveRun(run) {
    const idx = await this.load();
    await idx.saveRun(run);
    await this.append("runs", run);
  }
  /** Looks a run up in the in-memory index. */
  async getRun(id) {
    const idx = await this.load();
    return idx.getRun(id);
  }
  /** Lists the runs of one experiment from the in-memory index. */
  async listRuns(experimentId) {
    const idx = await this.load();
    return idx.listRuns(experimentId);
  }
  /** Creates the storage directory (and missing parents) if needed. */
  async ensureDir() {
    const fs = await import("fs/promises");
    await fs.mkdir(this.dir, { recursive: true });
  }
  /**
   * Appends one record as a JSON line to `<name>.ndjson`, rotating the active
   * file first when it has reached `maxBytes`.
   */
  async append(name, record) {
    await this.ensureDir();
    const fs = await import("fs/promises");
    const path = await import("path");
    const active = path.join(this.dir, `${name}.ndjson`);
    try {
      const stat = await fs.stat(active);
      if (stat.size >= this.maxBytes) {
        const rolled = path.join(this.dir, `${name}.${Date.now()}.ndjson`);
        await fs.rename(active, rolled);
      }
    } catch {
      // Rotation is best-effort: the active file may not exist yet, and a
      // failed rename must not prevent the record from being written below.
    }
    await fs.appendFile(active, `${JSON.stringify(record)}\n`, "utf8");
  }
  /**
   * Returns the in-memory index, building it from disk on first use.
   * Concurrent calls made before the first build finishes share one build.
   * A failed build is not cached, so the next call retries.
   */
  async load() {
    if (this.loaded && this.index) return this.index;
    if (!this.loading) {
      this.loading = this.readIndexFromDisk().then((store) => {
        this.index = store;
        this.loaded = true;
        return store;
      }).finally(() => {
        this.loading = void 0;
      });
    }
    return this.loading;
  }
  /**
   * Replays every `.ndjson` file in the directory (in `localeCompare` order of
   * file name) into a fresh in-memory store. A missing directory yields an
   * empty store; other I/O errors are propagated to the caller.
   */
  async readIndexFromDisk() {
    const fs = await import("fs/promises");
    const path = await import("path");
    const store = new InMemoryExperimentStore();
    let entries;
    try {
      entries = await fs.readdir(this.dir);
    } catch (err) {
      // Nothing has been written yet: start with an empty index.
      if (err?.code === "ENOENT") return store;
      throw err;
    }
    const files = entries.filter((f) => f.endsWith(".ndjson")).sort((a, b) => a.localeCompare(b));
    for (const file of files) {
      let content;
      try {
        content = await fs.readFile(path.join(this.dir, file), "utf8");
      } catch (err) {
        // The file vanished between readdir and readFile; skip it.
        if (err?.code === "ENOENT") continue;
        throw err;
      }
      // "experiments.ndjson" and "experiments.<ts>.ndjson" share the base name.
      const base = file.split(".")[0];
      for (const line of content.split("\n")) {
        if (!line.trim()) continue;
        try {
          const record = JSON.parse(line);
          if (base === "experiments") {
            await store.saveExperiment(record);
          } else if (base === "runs") {
            await store.saveRun(record);
          }
        } catch {
          // Skip a line that is not valid JSON or that the index rejects,
          // rather than abandoning the rest of the replay.
          continue;
        }
      }
    }
    return store;
  }
};
2027
+
2028
+ // src/experiment-tracker-d1.ts
2029
// Version written to the meta table under the key 'schema_version'.
var SCHEMA_VERSION = 1;
/**
 * Experiment store backed by a D1-style SQL database handle.
 *
 * The handle must provide `exec(sql)` and `prepare(sql).bind(...).run()`,
 * `.first()` and `.all()`, as used by the methods below. Three tables are
 * used, all sharing a configurable name prefix: `<prefix>experiments`,
 * `<prefix>runs` and `<prefix>meta`.
 */
var D1ExperimentStore = class {
  db;
  experimentsTable;
  runsTable;
  metaTable;
  schemaReady = false;
  /**
   * @param {{ db: object, tablePrefix?: string }} options
   *   `tablePrefix` defaults to "agent_eval_".
   * @throws {Error} when `tablePrefix` is not empty and is not a plain SQL
   *   identifier (letters, digits, underscore; not starting with a digit).
   */
  constructor(options) {
    this.db = options.db;
    const prefix = options.tablePrefix ?? "agent_eval_";
    // Table names are spliced into SQL text (identifiers cannot be bound as
    // parameters), so reject anything that is not a bare identifier.
    if (!/^(?:[A-Za-z_][A-Za-z0-9_]*)?$/.test(prefix)) {
      throw new Error(
        `Invalid tablePrefix ${JSON.stringify(prefix)}: use only letters, digits and underscores, and do not start with a digit`
      );
    }
    this.experimentsTable = `${prefix}experiments`;
    this.runsTable = `${prefix}runs`;
    this.metaTable = `${prefix}meta`;
  }
  /**
   * Idempotent schema setup. Safe to call before every operation; after the
   * first successful call it short-circuits via `schemaReady`. Most consumers
   * will call it once during Worker bootstrap.
   */
  async ensureSchema() {
    if (this.schemaReady) return;
    const ddl = `
      CREATE TABLE IF NOT EXISTS ${this.experimentsTable} (
        id TEXT PRIMARY KEY,
        name TEXT NOT NULL,
        created_at TEXT NOT NULL,
        metadata_json TEXT
      );
      CREATE TABLE IF NOT EXISTS ${this.runsTable} (
        id TEXT PRIMARY KEY,
        experiment_id TEXT NOT NULL,
        name TEXT,
        status TEXT NOT NULL,
        started_at TEXT NOT NULL,
        completed_at TEXT,
        config_json TEXT NOT NULL,
        report_json TEXT,
        error TEXT
      );
      CREATE INDEX IF NOT EXISTS idx_${this.runsTable}_experiment ON ${this.runsTable}(experiment_id);
      CREATE INDEX IF NOT EXISTS idx_${this.runsTable}_started ON ${this.runsTable}(started_at);
      CREATE TABLE IF NOT EXISTS ${this.metaTable} (
        key TEXT PRIMARY KEY,
        value TEXT NOT NULL
      );
      INSERT OR REPLACE INTO ${this.metaTable}(key, value) VALUES ('schema_version', '${SCHEMA_VERSION}');
    `;
    // The DDL is collapsed onto a single line before being handed to exec().
    await this.db.exec(ddl.trim().replace(/\s+/g, " "));
    this.schemaReady = true;
  }
  /** Inserts the experiment, or overwrites the row that has the same id. */
  async saveExperiment(exp) {
    await this.ensureSchema();
    await this.db.prepare(
      `INSERT INTO ${this.experimentsTable}(id, name, created_at, metadata_json)
       VALUES (?1, ?2, ?3, ?4)
       ON CONFLICT(id) DO UPDATE SET
         name = excluded.name,
         created_at = excluded.created_at,
         metadata_json = excluded.metadata_json`
    ).bind(exp.id, exp.name, exp.createdAt, exp.metadata ? JSON.stringify(exp.metadata) : null).run();
  }
  /** Returns the experiment with the given id, or null when there is none. */
  async getExperiment(id) {
    await this.ensureSchema();
    const row = await this.db.prepare(
      `SELECT id, name, created_at, metadata_json
       FROM ${this.experimentsTable}
       WHERE id = ?1`
    ).bind(id).first();
    return row ? rowToExperiment(row) : null;
  }
  /** Returns all experiments ordered by `created_at`, newest first. */
  async listExperiments() {
    await this.ensureSchema();
    const { results } = await this.db.prepare(
      `SELECT id, name, created_at, metadata_json
       FROM ${this.experimentsTable}
       ORDER BY created_at DESC`
    ).all();
    return results.map(rowToExperiment);
  }
  /** Inserts the run, or overwrites the row that has the same id. */
  async saveRun(run) {
    await this.ensureSchema();
    await this.db.prepare(
      `INSERT INTO ${this.runsTable}(id, experiment_id, name, status, started_at, completed_at, config_json, report_json, error)
       VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9)
       ON CONFLICT(id) DO UPDATE SET
         experiment_id = excluded.experiment_id,
         name = excluded.name,
         status = excluded.status,
         started_at = excluded.started_at,
         completed_at = excluded.completed_at,
         config_json = excluded.config_json,
         report_json = excluded.report_json,
         error = excluded.error`
    ).bind(
      run.id,
      run.experimentId,
      run.name ?? null,
      run.status,
      run.startedAt,
      run.completedAt ?? null,
      JSON.stringify(run.config),
      run.report ? JSON.stringify(run.report) : null,
      run.error ?? null
    ).run();
  }
  /** Returns the run with the given id, or null when there is none. */
  async getRun(id) {
    await this.ensureSchema();
    const row = await this.db.prepare(
      `SELECT id, experiment_id, name, status, started_at, completed_at, config_json, report_json, error
       FROM ${this.runsTable}
       WHERE id = ?1`
    ).bind(id).first();
    return row ? rowToRun(row) : null;
  }
  /** Returns the runs of one experiment ordered by `started_at`, newest first. */
  async listRuns(experimentId) {
    await this.ensureSchema();
    const { results } = await this.db.prepare(
      `SELECT id, experiment_id, name, status, started_at, completed_at, config_json, report_json, error
       FROM ${this.runsTable}
       WHERE experiment_id = ?1
       ORDER BY started_at DESC`
    ).bind(experimentId).all();
    return results.map(rowToRun);
  }
};
2154
/**
 * Converts an experiments-table row into an experiment record.
 * `metadata` is only present when the row carries a non-empty
 * `metadata_json` value, which is parsed as JSON.
 */
function rowToExperiment(row) {
  const experiment = {
    id: row.id,
    name: row.name,
    createdAt: row.created_at
  };
  if (row.metadata_json) {
    experiment.metadata = JSON.parse(row.metadata_json);
  }
  return experiment;
}
2162
/**
 * Converts a runs-table row into a run record.
 *
 * Optional columns (`name`, `completed_at`, `error`) are copied whenever they
 * are not NULL, so an empty string written by the caller survives a round
 * trip instead of being dropped. `config_json` is always parsed;
 * `report_json` is parsed only when it holds a non-empty value.
 */
function rowToRun(row) {
  const run = {
    id: row.id,
    experimentId: row.experiment_id
  };
  if (row.name != null) run.name = row.name;
  run.status = row.status;
  run.startedAt = row.started_at;
  if (row.completed_at != null) run.completedAt = row.completed_at;
  run.config = JSON.parse(row.config_json);
  // An empty string is not valid JSON, so it is treated like NULL here.
  if (row.report_json) run.report = JSON.parse(row.report_json);
  if (row.error != null) run.error = row.error;
  return run;
}
2175
+
1929
2176
  // src/power-analysis.ts
1930
2177
  function requiredSampleSize(opts) {
1931
2178
  const effect = opts.effect;
@@ -8058,212 +8305,6 @@ async function euAiActReport(ctx, signals) {
8058
8305
  };
8059
8306
  }
8060
8307
 
8061
- // src/llm-client.ts
8062
/**
 * Error raised for a failed LLM HTTP call. Carries the HTTP status, the
 * response body text and the model the request was made for.
 */
var LlmCallError = class extends Error {
  status;
  body;
  model;
  constructor(message, status, body, model) {
    super(message);
    Object.assign(this, { status, body, model });
    this.name = "LlmCallError";
  }
};
8074
// Defaults applied by callLlm when neither the request nor the options
// override them.
// Base URL; "/chat/completions" is appended to it.
var DEFAULT_BASE_URL = "https://router.tangle.tools/v1";
// Per-attempt timeout in milliseconds.
var DEFAULT_TIMEOUT_MS = 60000;
// Upper bound on attempts made by the retry loop.
var DEFAULT_MAX_RETRIES = 3;
// HTTP status codes that are retried.
var RETRYABLE_STATUS = /* @__PURE__ */ new Set([429, 502, 503, 504]);
8078
/**
 * Decides whether a failed attempt may be retried.
 * An LlmCallError is retryable when its status is in RETRYABLE_STATUS; any
 * other Error is retryable when it is an abort/timeout error or its message
 * names a transient network failure. Non-Error values are never retryable.
 */
function isRetryableError(err) {
  if (err instanceof LlmCallError) {
    return RETRYABLE_STATUS.has(err.status);
  }
  if (!(err instanceof Error)) {
    return false;
  }
  const { name, message } = err;
  if (name === "AbortError" || name === "TimeoutError") {
    return true;
  }
  return /fetch failed|ECONNRESET|ETIMEDOUT|EAI_AGAIN/i.test(message);
}
8085
/**
 * Reads the `retry-after` header and converts it to a delay in milliseconds.
 *
 * @param {{ get(name: string): string | null }} headers
 * @returns {number | null} the delay in ms (never negative), or null when the
 *   header is absent or cannot be interpreted.
 */
function parseRetryAfter(headers) {
  const raw = headers.get("retry-after");
  if (!raw || raw.trim() === "") return null;
  // Delay-seconds form. Zero is a legitimate value ("retry immediately") and
  // must be handled here rather than falling through to date parsing, whose
  // result for a bare number is implementation-defined.
  const seconds = Number(raw);
  if (Number.isFinite(seconds) && seconds >= 0) return seconds * 1e3;
  // HTTP-date form: wait until that instant, or not at all if it has passed.
  const when = Date.parse(raw);
  if (Number.isFinite(when)) return Math.max(0, when - Date.now());
  return null;
}
8094
/**
 * Exponential backoff delay for a zero-based attempt number:
 * 500 ms doubled per attempt, capped at 16 000 ms.
 */
function backoffMs(attempt) {
  const uncapped = 500 * 2 ** attempt;
  return uncapped < 16e3 ? uncapped : 16e3;
}
8097
/**
 * Builds the request headers for an LLM call.
 *
 * Always sets JSON `Content-Type` and `Accept`. Authentication is taken from
 * `opts.authHeader` ({ name, value }) when given; otherwise a Bearer
 * `Authorization` header is built from `opts.bearer`, falling back to
 * `opts.apiKey`. No auth header is set when none of them is provided.
 */
function buildHeaders(opts) {
  const headers = {
    "Content-Type": "application/json",
    Accept: "application/json"
  };
  if (opts.authHeader) {
    headers[opts.authHeader.name] = opts.authHeader.value;
    return headers;
  }
  // Use the same truthiness rule for choosing the token as for deciding
  // whether to send one, so an empty `bearer` cannot shadow a real `apiKey`.
  const token = opts.bearer || opts.apiKey;
  if (token) {
    headers.Authorization = `Bearer ${token}`;
  }
  return headers;
}
8109
/**
 * Reports whether a failed response looks like the server refusing the
 * requested response format: status 400 and a body that mentions one of the
 * known marker phrases (case-insensitive).
 */
function isSchemaRejection(status, body) {
  if (status !== 400) {
    return false;
  }
  const haystack = body.toLowerCase();
  const markers = ["response_format", "json_schema", "is unavailable", "not supported"];
  return markers.some((marker) => haystack.includes(marker));
}
8114
/**
 * Builds the JSON payload for a chat-completions request.
 *
 * `temperature` defaults to 0 and `max_tokens` is only sent when
 * `req.maxTokens` is set. The response format is a strict `json_schema` when
 * a schema is given and `forceJsonObject` is falsy, a plain `json_object`
 * when JSON mode is requested (or a schema is given but forced down), and
 * omitted otherwise.
 */
function buildBody(req, forceJsonObject) {
  const { model, messages, maxTokens, jsonSchema, jsonMode } = req;
  const payload = { model, messages, temperature: req.temperature ?? 0 };
  if (maxTokens != null) {
    payload.max_tokens = maxTokens;
  }
  const wantsSchema = Boolean(jsonSchema) && !forceJsonObject;
  if (wantsSchema) {
    payload.response_format = {
      type: "json_schema",
      json_schema: { name: jsonSchema.name, schema: jsonSchema.schema, strict: true }
    };
    return payload;
  }
  if (jsonMode || jsonSchema) {
    payload.response_format = { type: "json_object" };
  }
  return payload;
}
8131
/** Resolves (with no value) after roughly `ms` milliseconds. */
async function sleep(ms) {
  await new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
8134
/**
 * Removes a surrounding Markdown code fence (``` or ```json) from a string.
 * Returns the trimmed fence contents when the whole input is one fenced
 * block, otherwise the trimmed input.
 */
function stripFencedJson(raw) {
  const text = raw.trim();
  const fenced = /^```(?:json)?\s*\n?([\s\S]*?)\n?```\s*$/.exec(text);
  if (fenced === null) {
    return text;
  }
  return fenced[1].trim();
}
8139
/**
 * Sends one chat-completions request, retrying transient failures.
 *
 * @param req  Request: `model`, `messages`, and optionally `temperature`,
 *   `maxTokens`, `jsonMode`, `jsonSchema`, `timeoutMs`.
 * @param opts Options: `baseUrl`, `defaultTimeoutMs`, `maxRetries` (upper
 *   bound on attempts; at least one attempt is always made), `fetch`, plus
 *   the auth options read by buildHeaders.
 * @returns `{ content, usage, costUsd, model, durationMs, raw }` built from
 *   the JSON response.
 * @throws {LlmCallError} for a non-OK HTTP response that is not retried.
 * @throws whatever the fetch implementation or body parsing throws once the
 *   error is not retryable or the attempts are used up.
 */
async function callLlm(req, opts = {}) {
  const baseUrl = (opts.baseUrl ?? DEFAULT_BASE_URL).replace(/\/+$/, "");
  const url = `${baseUrl}/chat/completions`;
  const timeoutMs = req.timeoutMs ?? opts.defaultTimeoutMs ?? DEFAULT_TIMEOUT_MS;
  // A zero, negative or non-numeric setting still results in one attempt.
  const maxAttempts = Math.max(1, Number(opts.maxRetries ?? DEFAULT_MAX_RETRIES) || 1);
  const fetchFn = opts.fetch ?? globalThis.fetch;
  const headers = buildHeaders(opts);
  let lastErr;
  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    const isLastAttempt = attempt >= maxAttempts - 1;
    const controller = new AbortController();
    const timeoutHandle = setTimeout(() => controller.abort(), timeoutMs);
    const started = Date.now();
    let retryDelayMs;
    try {
      const res = await fetchFn(url, {
        method: "POST",
        headers,
        body: JSON.stringify(buildBody(req, false)),
        signal: controller.signal
      });
      if (!res.ok) {
        const body = await res.text();
        const err = new LlmCallError(
          `LLM call ${res.status}: ${body.slice(0, 300)}`,
          res.status,
          body,
          req.model
        );
        if (!RETRYABLE_STATUS.has(res.status) || isLastAttempt) {
          throw err;
        }
        lastErr = err;
        // Prefer the server's hint over our own backoff schedule.
        retryDelayMs = parseRetryAfter(res.headers) ?? backoffMs(attempt);
      } else {
        const json = await res.json();
        const choice = json.choices?.[0];
        const usageRaw = json.usage ?? {};
        const costFromProxy = json._response_cost ?? json.cost_usd;
        return {
          content: choice?.message?.content ?? "",
          usage: {
            promptTokens: Number(usageRaw.prompt_tokens ?? 0),
            completionTokens: Number(usageRaw.completion_tokens ?? 0),
            totalTokens: Number(usageRaw.total_tokens ?? 0),
            cachedPromptTokens: usageRaw.prompt_tokens_details && typeof usageRaw.prompt_tokens_details === "object" ? Number(
              usageRaw.prompt_tokens_details.cached_tokens ?? 0
            ) : void 0
          },
          costUsd: typeof costFromProxy === "number" ? costFromProxy : null,
          model: json.model ?? req.model,
          durationMs: Date.now() - started,
          raw: json
        };
      }
    } catch (err) {
      lastErr = err;
      if (isLastAttempt || !isRetryableError(err)) {
        throw err;
      }
      retryDelayMs = backoffMs(attempt);
    } finally {
      // The timer stays armed until the response body has been consumed, so a
      // stalled body read is aborted too; it is always released before any
      // retry delay begins.
      clearTimeout(timeoutHandle);
    }
    await sleep(retryDelayMs);
  }
  // Not reachable while at least one attempt runs; kept as a safeguard.
  throw lastErr instanceof Error ? lastErr : new Error(String(lastErr));
}
8206
/**
 * Calls the LLM and parses its reply as JSON.
 *
 * JSON mode defaults to on unless a schema is supplied. When a request that
 * carries a schema fails with what isSchemaRejection recognises as a format
 * rejection, the call is repeated once in plain JSON mode without the schema.
 * Every other failure is rethrown unchanged.
 *
 * @returns `{ value, result }`: the parsed JSON and the underlying call result.
 */
async function callLlmJson(req, opts = {}) {
  const invoke = async (request) => {
    const result = await callLlm(request, opts);
    return { value: parseJsonSafely(result.content, result.model), result };
  };
  try {
    return await invoke({ ...req, jsonMode: req.jsonMode ?? !req.jsonSchema });
  } catch (err) {
    const schemaRefused = err instanceof LlmCallError && isSchemaRejection(err.status, err.body) && req.jsonSchema;
    if (!schemaRefused) {
      throw err;
    }
    return invoke({ ...req, jsonMode: true, jsonSchema: void 0 });
  }
}
8221
/**
 * Parses LLM output as JSON after removing a surrounding code fence.
 *
 * @param {string} content Raw text returned by the model.
 * @param {string} model   Model name, used only in the error message.
 * @returns the parsed JSON value.
 * @throws {Error} when the text is not valid JSON. The message includes the
 *   parser's message and the first 800 characters of the raw content; the
 *   original parse error is attached as `cause`.
 */
function parseJsonSafely(content, model) {
  const stripped = stripFencedJson(content);
  try {
    return JSON.parse(stripped);
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    throw new Error(
      `LLM returned non-JSON content (model=${model}): ${reason}
--- raw content ---
${content.slice(0, 800)}`,
      { cause: err }
    );
  }
}
8233
/**
 * Checks that a model answers by sending a minimal "ping" request.
 * Never throws: failures are reported in the result.
 *
 * @returns `{ ok, latencyMs, error }` where `error` is null on success and
 *   the failure message otherwise. The request timeout is `opts.timeoutMs`,
 *   defaulting to 30 000 ms.
 */
async function probeLlm(model, opts = {}) {
  const startedAt = Date.now();
  let failure = null;
  try {
    const ping = {
      model,
      messages: [{ role: "user", content: "ping" }],
      maxTokens: 64,
      timeoutMs: opts.timeoutMs ?? 3e4
    };
    await callLlm(ping, opts);
  } catch (err) {
    failure = { message: err instanceof Error ? err.message : String(err) };
  }
  const latencyMs = Date.now() - startedAt;
  if (failure === null) {
    return { ok: true, latencyMs, error: null };
  }
  return { ok: false, latencyMs, error: failure.message };
}
8254
/**
 * Thin wrapper that remembers a set of call options.
 * Per-call options are merged over the stored ones (per-call keys win)
 * before delegating to callLlm / callLlmJson.
 */
var LlmClient = class {
  opts;
  constructor(opts = {}) {
    this.opts = opts;
  }
  /** Delegates to callLlm with the merged options. */
  call(req, per) {
    const merged = { ...this.opts, ...per };
    return callLlm(req, merged);
  }
  /** Delegates to callLlmJson with the merged options. */
  callJson(req, per) {
    const merged = { ...this.opts, ...per };
    return callLlmJson(req, merged);
  }
};
8266
-
8267
8308
  // src/multi-layer-verifier.ts
8268
8309
  function gradeSemanticStatus(input) {
8269
8310
  if (!input.available) return "error";
@@ -10594,6 +10635,7 @@ export {
10594
10635
  BuilderSession,
10595
10636
  ConvergenceTracker,
10596
10637
  CostTracker,
10638
+ D1ExperimentStore,
10597
10639
  DEFAULT_AGENT_SLOS,
10598
10640
  DEFAULT_COMPLEXITY_WEIGHTS,
10599
10641
  DEFAULT_RULES as DEFAULT_FAILURE_RULES,
@@ -10611,6 +10653,7 @@ export {
10611
10653
  ERROR_COUNT_PATTERNS,
10612
10654
  ExperimentTracker,
10613
10655
  FAILURE_CLASSES,
10656
+ FileSystemExperimentStore,
10614
10657
  FileSystemOutcomeStore,
10615
10658
  FileSystemTraceStore,
10616
10659
  HoldoutAuditor,