@tangle-network/agent-eval 0.20.11 → 0.21.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +76 -0
- package/README.md +137 -170
- package/dist/benchmarks/index.d.ts +2 -1
- package/dist/{chunk-JAOLXRIA.js → chunk-3GN6U53I.js} +205 -4
- package/dist/chunk-3GN6U53I.js.map +1 -0
- package/dist/chunk-3IX6QTB7.js +1349 -0
- package/dist/chunk-3IX6QTB7.js.map +1 -0
- package/dist/chunk-5IIQKMD5.js +236 -0
- package/dist/chunk-5IIQKMD5.js.map +1 -0
- package/dist/chunk-ARZ6BEV6.js +1310 -0
- package/dist/chunk-ARZ6BEV6.js.map +1 -0
- package/dist/chunk-HRZELXCR.js +1354 -0
- package/dist/chunk-HRZELXCR.js.map +1 -0
- package/dist/chunk-KRR4VMH7.js +423 -0
- package/dist/chunk-KRR4VMH7.js.map +1 -0
- package/dist/chunk-SNUHRBDL.js +154 -0
- package/dist/chunk-SNUHRBDL.js.map +1 -0
- package/dist/chunk-WOK2RTWG.js +1920 -0
- package/dist/chunk-WOK2RTWG.js.map +1 -0
- package/dist/{chunk-LSR4IAYN.js → chunk-WOPGKVN4.js} +2 -2
- package/dist/chunk-YUFXO3TU.js +148 -0
- package/dist/chunk-YUFXO3TU.js.map +1 -0
- package/dist/cli.js +3 -2
- package/dist/cli.js.map +1 -1
- package/dist/control-cxwMOAsy.d.ts +259 -0
- package/dist/control.d.ts +6 -0
- package/dist/control.js +30 -0
- package/dist/control.js.map +1 -0
- package/dist/dataset-B9qvlm_o.d.ts +112 -0
- package/dist/emitter-B2XqDKFU.d.ts +121 -0
- package/dist/feedback-trajectory-CB0A32o3.d.ts +346 -0
- package/dist/{index-1PZOtZFr.d.ts → index-c5saLbKD.d.ts} +2 -133
- package/dist/index.d.ts +178 -2945
- package/dist/index.js +1066 -6185
- package/dist/index.js.map +1 -1
- package/dist/multi-shot-optimization-Bvtz294B.d.ts +598 -0
- package/dist/openapi.json +1 -1
- package/dist/optimization.d.ts +146 -0
- package/dist/optimization.js +60 -0
- package/dist/optimization.js.map +1 -0
- package/dist/reporting-Da2ihlcM.d.ts +672 -0
- package/dist/reporting.d.ts +5 -0
- package/dist/reporting.js +36 -0
- package/dist/reporting.js.map +1 -0
- package/dist/run-record-CX_jcAyr.d.ts +134 -0
- package/dist/store-u47QaJ9G.d.ts +297 -0
- package/dist/traces.d.ts +914 -0
- package/dist/traces.js +120 -0
- package/dist/traces.js.map +1 -0
- package/dist/wire/index.js +3 -2
- package/docs/concepts.md +16 -11
- package/docs/feature-guide.md +10 -17
- package/docs/integration-launch-gates.md +77 -0
- package/docs/product-eval-adoption.md +27 -0
- package/docs/research-report-methodology.md +155 -0
- package/docs/trace-analysis.md +75 -0
- package/package.json +30 -12
- package/dist/chunk-JAOLXRIA.js.map +0 -1
- /package/dist/{chunk-LSR4IAYN.js.map → chunk-WOPGKVN4.js.map} +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/run-record.ts"],"sourcesContent":["/**\n * Paper-grade RunRecord schema + runtime validator.\n *\n * Every run that participates in a promotion gate, paper table, or\n * researcher loop SHOULD be recorded as a `RunRecord`. The mandatory\n * fields are exactly those the paper \"Two Loops, Three Roles\" requires\n * for reproducibility: who/what/when/cost/seed/hash, plus the search vs\n * holdout split tag and either a `searchScore` or a `holdoutScore`.\n *\n * This is intentionally NOT a replacement for the rich `Run` /\n * `ProposeReviewReport` / `ScenarioResult` types already in the\n * package. Those are runtime structures with full provenance. A\n * `RunRecord` is the analysis-time projection — the JSON-friendly\n * row you'd put in a parquet file or paste into a notebook.\n *\n * Validate at the boundary:\n *\n * const rec = validateRunRecord(rawJson) // throws on missing\n * const ok = isRunRecord(rawJson) // boolean check\n * const rec = parseRunRecordSafe(rawJson) // { ok, value | error }\n *\n * The validator runs in pure TS — zod is intentionally NOT a\n * dependency. Round-trip tested in `tests/run-record.test.ts`.\n */\n\n/** Search/dev/holdout split tag. 'search' is the paper-grade alias for the\n * combined train+test pool that the optimizer is allowed to read. */\nexport type RunSplitTag = 'search' | 'dev' | 'holdout'\n\nexport interface RunTokenUsage {\n input: number\n output: number\n cached?: number\n}\n\nexport interface RunJudgeMetadata {\n model: string\n promptVersion: string\n /** [0,1] confidence the judge declared. Constant judge confidence\n * across many runs is a fallback signal (see `canary.ts`). */\n confidence: number\n /** True if the judge degraded to a fallback path (rules-only,\n * prior-call cache, etc.). The canary uses this to alert. */\n fallback: boolean\n}\n\nexport interface RunOutcome {\n /** Score on the search/optimization split. 
Optional because a\n * holdout-only evaluation only fills `holdoutScore`. */\n searchScore?: number\n /** Score on the held-out split. Optional because a search-only run\n * only fills `searchScore`. At least one must be present. */\n holdoutScore?: number\n /** Bag of any other metric the run produced — judge dimensions,\n * pass/fail counters, latency stats, etc. Numeric only — keeps\n * reporters honest. */\n raw: Record<string, number>\n}\n\n/**\n * Mandatory paper-grade fields for a single evaluation run. Optional\n * fields are extension points; mandatory fields throw if missing.\n *\n * Hash discipline:\n * - `promptHash` is the sha256 of the EFFECTIVE prompt sent to the\n * model (after any steering bundle merge).\n * - `configHash` is the sha256 of the effective run config (model,\n * temperature, tools, judges, splits). The pair (promptHash,\n * configHash) uniquely identifies an experimental cell.\n *\n * Model snapshot discipline:\n * - `model` MUST encode a snapshot version. Bare aliases like\n * `claude-sonnet-4` or `gpt-4o` are banned — they remap silently.\n * Use `claude-sonnet-4-6@2025-04-15` or `gpt-4o-2024-11-20`.\n */\nexport interface RunRecord {\n /** UUID for the run. */\n runId: string\n /** Logical experiment grouping (a treatment vs a baseline within\n * the same sweep should share `experimentId`). */\n experimentId: string\n /** Stable identifier for the candidate (variant) being run. The\n * promotion gate compares two `candidateId`s on matched items. */\n candidateId: string\n /** RNG seed for the run. Always recorded — silent re-seeding is\n * the most common cause of non-reproducible numbers. */\n seed: number\n /** Model identifier WITH snapshot version. */\n model: string\n /** sha256 of the effective prompt (post-steering). */\n promptHash: string\n /** sha256 of the effective config. */\n configHash: string\n /** Git SHA the harness was run from. */\n commitSha: string\n /** End-to-end wall-clock duration in milliseconds. 
*/\n wallMs: number\n /** Time spent queued before execution started, if known. */\n queueMs?: number\n /** Total USD cost. Mandatory — runs without a cost number are\n * unbounded by definition and must not be admitted into the gate. */\n costUsd: number\n /** Token usage breakdown. */\n tokenUsage: RunTokenUsage\n /** Judge-side metadata, if a judge was used. */\n judgeMetadata?: RunJudgeMetadata\n /** Per-split scores + raw bag. */\n outcome: RunOutcome\n /** Categorical failure tag, when the run failed and the harness\n * classified it. Free-form string; standard tags live in\n * `failure-taxonomy.ts`. */\n failureMode?: string\n /** Which split this run was drawn from. */\n splitTag: RunSplitTag\n}\n\n// ── Validation ───────────────────────────────────────────────────────\n\nconst MANDATORY_TOP_LEVEL = [\n 'runId',\n 'experimentId',\n 'candidateId',\n 'seed',\n 'model',\n 'promptHash',\n 'configHash',\n 'commitSha',\n 'wallMs',\n 'costUsd',\n 'tokenUsage',\n 'outcome',\n 'splitTag',\n] as const\n\nconst SPLIT_TAGS: ReadonlyArray<RunSplitTag> = ['search', 'dev', 'holdout']\n\nexport class RunRecordValidationError extends Error {\n readonly path: string\n constructor(message: string, path = '') {\n super(path ? `${message} (at ${path})` : message)\n this.name = 'RunRecordValidationError'\n this.path = path\n }\n}\n\n/**\n * Strict validator. Throws `RunRecordValidationError` on the first\n * missing or wrongly-typed field. 
Returns the input cast to\n * `RunRecord` on success — the validator does not coerce.\n */\nexport function validateRunRecord(input: unknown): RunRecord {\n if (input === null || typeof input !== 'object') {\n throw new RunRecordValidationError('expected object')\n }\n const obj = input as Record<string, unknown>\n\n for (const key of MANDATORY_TOP_LEVEL) {\n if (!(key in obj)) {\n throw new RunRecordValidationError(`missing mandatory field \"${key}\"`)\n }\n }\n\n expectString(obj.runId, 'runId')\n expectString(obj.experimentId, 'experimentId')\n expectString(obj.candidateId, 'candidateId')\n expectFiniteNumber(obj.seed, 'seed')\n expectString(obj.model, 'model')\n expectString(obj.promptHash, 'promptHash')\n expectString(obj.configHash, 'configHash')\n expectString(obj.commitSha, 'commitSha')\n expectFiniteNumber(obj.wallMs, 'wallMs')\n if (obj.queueMs !== undefined) expectFiniteNumber(obj.queueMs, 'queueMs')\n expectFiniteNumber(obj.costUsd, 'costUsd')\n\n // Snapshot discipline: bare model aliases are not paper-grade.\n if (!modelHasSnapshot(obj.model as string)) {\n throw new RunRecordValidationError(\n `model \"${obj.model}\" lacks a snapshot version (use 'name@YYYY-MM-DD' or 'name-YYYYMMDD')`,\n 'model',\n )\n }\n\n // Token usage.\n const tu = obj.tokenUsage\n if (tu === null || typeof tu !== 'object') {\n throw new RunRecordValidationError('tokenUsage must be an object', 'tokenUsage')\n }\n const tuRec = tu as Record<string, unknown>\n expectFiniteNumber(tuRec.input, 'tokenUsage.input')\n expectFiniteNumber(tuRec.output, 'tokenUsage.output')\n if (tuRec.cached !== undefined) expectFiniteNumber(tuRec.cached, 'tokenUsage.cached')\n\n // Judge metadata, optional.\n if (obj.judgeMetadata !== undefined) {\n const jm = obj.judgeMetadata\n if (jm === null || typeof jm !== 'object') {\n throw new RunRecordValidationError('judgeMetadata must be an object', 'judgeMetadata')\n }\n const jmRec = jm as Record<string, unknown>\n expectString(jmRec.model, 
'judgeMetadata.model')\n expectString(jmRec.promptVersion, 'judgeMetadata.promptVersion')\n expectFiniteNumber(jmRec.confidence, 'judgeMetadata.confidence')\n if (typeof jmRec.fallback !== 'boolean') {\n throw new RunRecordValidationError('judgeMetadata.fallback must be boolean', 'judgeMetadata.fallback')\n }\n }\n\n // Outcome.\n const out = obj.outcome\n if (out === null || typeof out !== 'object') {\n throw new RunRecordValidationError('outcome must be an object', 'outcome')\n }\n const outRec = out as Record<string, unknown>\n if (outRec.searchScore !== undefined) expectFiniteNumber(outRec.searchScore, 'outcome.searchScore')\n if (outRec.holdoutScore !== undefined) expectFiniteNumber(outRec.holdoutScore, 'outcome.holdoutScore')\n if (outRec.searchScore === undefined && outRec.holdoutScore === undefined) {\n throw new RunRecordValidationError(\n 'outcome must define searchScore or holdoutScore (or both)',\n 'outcome',\n )\n }\n const raw = outRec.raw\n if (raw === null || typeof raw !== 'object') {\n throw new RunRecordValidationError('outcome.raw must be an object', 'outcome.raw')\n }\n for (const [k, v] of Object.entries(raw as Record<string, unknown>)) {\n expectFiniteNumber(v, `outcome.raw.${k}`)\n }\n\n // Failure mode optional.\n if (obj.failureMode !== undefined) expectString(obj.failureMode, 'failureMode')\n\n // Split tag.\n if (typeof obj.splitTag !== 'string' || !SPLIT_TAGS.includes(obj.splitTag as RunSplitTag)) {\n throw new RunRecordValidationError(\n `splitTag must be one of ${SPLIT_TAGS.join(', ')}, got ${String(obj.splitTag)}`,\n 'splitTag',\n )\n }\n\n return input as RunRecord\n}\n\n/** Boolean validator — convenience for filtering arrays. */\nexport function isRunRecord(input: unknown): input is RunRecord {\n try {\n validateRunRecord(input)\n return true\n } catch {\n return false\n }\n}\n\n/** Non-throwing validator — returns a discriminated union. 
*/\nexport function parseRunRecordSafe(\n input: unknown,\n):\n | { ok: true; value: RunRecord }\n | { ok: false; error: RunRecordValidationError } {\n try {\n return { ok: true, value: validateRunRecord(input) }\n } catch (e) {\n if (e instanceof RunRecordValidationError) return { ok: false, error: e }\n throw e\n }\n}\n\n/** Round-trip helper — `JSON.parse(JSON.stringify(record))` then validate. */\nexport function roundTripRunRecord(record: RunRecord): RunRecord {\n const json = JSON.stringify(record)\n return validateRunRecord(JSON.parse(json))\n}\n\n// ── Internals ────────────────────────────────────────────────────────\n\nfunction expectString(value: unknown, path: string): void {\n if (typeof value !== 'string' || value.length === 0) {\n throw new RunRecordValidationError(`expected non-empty string`, path)\n }\n}\n\nfunction expectFiniteNumber(value: unknown, path: string): void {\n if (typeof value !== 'number' || !Number.isFinite(value)) {\n throw new RunRecordValidationError(`expected finite number`, path)\n }\n}\n\n/**\n * Heuristic snapshot check. 
Accepts:\n * - `name@YYYY-MM-DD` (Anthropic style: `claude-sonnet-4-6@2025-04-15`)\n * - `name-YYYYMMDD` (OpenAI style: `gpt-4o-2024-11-20`)\n * - `name@<arbitrary-token>` (allow opaque snapshots like `@v3`)\n * - explicit `:date-...` Vertex-style tags\n *\n * Rejects bare aliases like `claude-sonnet-4` or `gpt-4o` that remap\n * silently as providers ship new snapshots.\n */\nfunction modelHasSnapshot(model: string): boolean {\n if (model.includes('@')) return true\n if (/-\\d{8}$/.test(model)) return true\n if (/-\\d{4}-\\d{2}-\\d{2}$/.test(model)) return true\n if (/:date-/.test(model)) return true\n return false\n}\n"],"mappings":";AAsHA,IAAM,sBAAsB;AAAA,EAC1B;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAEA,IAAM,aAAyC,CAAC,UAAU,OAAO,SAAS;AAEnE,IAAM,2BAAN,cAAuC,MAAM;AAAA,EACzC;AAAA,EACT,YAAY,SAAiB,OAAO,IAAI;AACtC,UAAM,OAAO,GAAG,OAAO,QAAQ,IAAI,MAAM,OAAO;AAChD,SAAK,OAAO;AACZ,SAAK,OAAO;AAAA,EACd;AACF;AAOO,SAAS,kBAAkB,OAA2B;AAC3D,MAAI,UAAU,QAAQ,OAAO,UAAU,UAAU;AAC/C,UAAM,IAAI,yBAAyB,iBAAiB;AAAA,EACtD;AACA,QAAM,MAAM;AAEZ,aAAW,OAAO,qBAAqB;AACrC,QAAI,EAAE,OAAO,MAAM;AACjB,YAAM,IAAI,yBAAyB,4BAA4B,GAAG,GAAG;AAAA,IACvE;AAAA,EACF;AAEA,eAAa,IAAI,OAAO,OAAO;AAC/B,eAAa,IAAI,cAAc,cAAc;AAC7C,eAAa,IAAI,aAAa,aAAa;AAC3C,qBAAmB,IAAI,MAAM,MAAM;AACnC,eAAa,IAAI,OAAO,OAAO;AAC/B,eAAa,IAAI,YAAY,YAAY;AACzC,eAAa,IAAI,YAAY,YAAY;AACzC,eAAa,IAAI,WAAW,WAAW;AACvC,qBAAmB,IAAI,QAAQ,QAAQ;AACvC,MAAI,IAAI,YAAY,OAAW,oBAAmB,IAAI,SAAS,SAAS;AACxE,qBAAmB,IAAI,SAAS,SAAS;AAGzC,MAAI,CAAC,iBAAiB,IAAI,KAAe,GAAG;AAC1C,UAAM,IAAI;AAAA,MACR,UAAU,IAAI,KAAK;AAAA,MACnB;AAAA,IACF;AAAA,EACF;AAGA,QAAM,KAAK,IAAI;AACf,MAAI,OAAO,QAAQ,OAAO,OAAO,UAAU;AACzC,UAAM,IAAI,yBAAyB,gCAAgC,YAAY;AAAA,EACjF;AACA,QAAM,QAAQ;AACd,qBAAmB,MAAM,OAAO,kBAAkB;AAClD,qBAAmB,MAAM,QAAQ,mBAAmB;AACpD,MAAI,MAAM,WAAW,OAAW,oBAAmB,MAAM,QAAQ,mBAAmB;AAGpF,MAAI,IAAI,kBAAkB,QAAW;AACnC,UAAM,KAAK,IAAI;AACf,QAAI,OAAO,QAAQ,OAAO,OAAO,UAAU;AACzC,YAAM,IAAI,yBAAyB,mCAAmC,eAAe;AAAA,IACvF;AA
CA,UAAM,QAAQ;AACd,iBAAa,MAAM,OAAO,qBAAqB;AAC/C,iBAAa,MAAM,eAAe,6BAA6B;AAC/D,uBAAmB,MAAM,YAAY,0BAA0B;AAC/D,QAAI,OAAO,MAAM,aAAa,WAAW;AACvC,YAAM,IAAI,yBAAyB,0CAA0C,wBAAwB;AAAA,IACvG;AAAA,EACF;AAGA,QAAM,MAAM,IAAI;AAChB,MAAI,QAAQ,QAAQ,OAAO,QAAQ,UAAU;AAC3C,UAAM,IAAI,yBAAyB,6BAA6B,SAAS;AAAA,EAC3E;AACA,QAAM,SAAS;AACf,MAAI,OAAO,gBAAgB,OAAW,oBAAmB,OAAO,aAAa,qBAAqB;AAClG,MAAI,OAAO,iBAAiB,OAAW,oBAAmB,OAAO,cAAc,sBAAsB;AACrG,MAAI,OAAO,gBAAgB,UAAa,OAAO,iBAAiB,QAAW;AACzE,UAAM,IAAI;AAAA,MACR;AAAA,MACA;AAAA,IACF;AAAA,EACF;AACA,QAAM,MAAM,OAAO;AACnB,MAAI,QAAQ,QAAQ,OAAO,QAAQ,UAAU;AAC3C,UAAM,IAAI,yBAAyB,iCAAiC,aAAa;AAAA,EACnF;AACA,aAAW,CAAC,GAAG,CAAC,KAAK,OAAO,QAAQ,GAA8B,GAAG;AACnE,uBAAmB,GAAG,eAAe,CAAC,EAAE;AAAA,EAC1C;AAGA,MAAI,IAAI,gBAAgB,OAAW,cAAa,IAAI,aAAa,aAAa;AAG9E,MAAI,OAAO,IAAI,aAAa,YAAY,CAAC,WAAW,SAAS,IAAI,QAAuB,GAAG;AACzF,UAAM,IAAI;AAAA,MACR,2BAA2B,WAAW,KAAK,IAAI,CAAC,SAAS,OAAO,IAAI,QAAQ,CAAC;AAAA,MAC7E;AAAA,IACF;AAAA,EACF;AAEA,SAAO;AACT;AAGO,SAAS,YAAY,OAAoC;AAC9D,MAAI;AACF,sBAAkB,KAAK;AACvB,WAAO;AAAA,EACT,QAAQ;AACN,WAAO;AAAA,EACT;AACF;AAGO,SAAS,mBACd,OAGiD;AACjD,MAAI;AACF,WAAO,EAAE,IAAI,MAAM,OAAO,kBAAkB,KAAK,EAAE;AAAA,EACrD,SAAS,GAAG;AACV,QAAI,aAAa,yBAA0B,QAAO,EAAE,IAAI,OAAO,OAAO,EAAE;AACxE,UAAM;AAAA,EACR;AACF;AAGO,SAAS,mBAAmB,QAA8B;AAC/D,QAAM,OAAO,KAAK,UAAU,MAAM;AAClC,SAAO,kBAAkB,KAAK,MAAM,IAAI,CAAC;AAC3C;AAIA,SAAS,aAAa,OAAgB,MAAoB;AACxD,MAAI,OAAO,UAAU,YAAY,MAAM,WAAW,GAAG;AACnD,UAAM,IAAI,yBAAyB,6BAA6B,IAAI;AAAA,EACtE;AACF;AAEA,SAAS,mBAAmB,OAAgB,MAAoB;AAC9D,MAAI,OAAO,UAAU,YAAY,CAAC,OAAO,SAAS,KAAK,GAAG;AACxD,UAAM,IAAI,yBAAyB,0BAA0B,IAAI;AAAA,EACnE;AACF;AAYA,SAAS,iBAAiB,OAAwB;AAChD,MAAI,MAAM,SAAS,GAAG,EAAG,QAAO;AAChC,MAAI,UAAU,KAAK,KAAK,EAAG,QAAO;AAClC,MAAI,sBAAsB,KAAK,KAAK,EAAG,QAAO;AAC9C,MAAI,SAAS,KAAK,KAAK,EAAG,QAAO;AACjC,SAAO;AACT;","names":[]}
|
package/dist/cli.js
CHANGED
|
@@ -5,8 +5,9 @@ import {
|
|
|
5
5
|
runRpcBatch,
|
|
6
6
|
runRpcOnce,
|
|
7
7
|
startServer
|
|
8
|
-
} from "./chunk-
|
|
9
|
-
import "./chunk-
|
|
8
|
+
} from "./chunk-WOPGKVN4.js";
|
|
9
|
+
import "./chunk-3GN6U53I.js";
|
|
10
|
+
import "./chunk-SNUHRBDL.js";
|
|
10
11
|
import "./chunk-PZ5AY32C.js";
|
|
11
12
|
|
|
12
13
|
// src/cli.ts
|
package/dist/cli.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/cli.ts"],"sourcesContent":["#!/usr/bin/env node\n/**\n * agent-eval CLI.\n *\n * agent-eval serve [--port 5005] [--host 127.0.0.1]\n * agent-eval rpc <method> # one request from stdin → one response on stdout\n * agent-eval rpc-batch <method> # JSONL stdin → JSONL stdout\n * agent-eval openapi [--out path] # write OpenAPI spec\n * agent-eval version\n *\n * <method> is one of: judge, listRubrics, version. When omitted, the\n * stdin payload must be a full {method, params} envelope.\n */\nimport { writeFileSync } from 'node:fs'\n\nimport { buildOpenApi } from './wire/openapi'\nimport { handleVersion } from './wire/handlers'\nimport { runRpcBatch, runRpcOnce } from './wire/rpc'\nimport { startServer } from './wire/server'\n\ninterface Args {\n command: string\n positional: string[]\n flags: Record<string, string>\n}\n\nfunction parseArgs(argv: string[]): Args {\n const [command, ...rest] = argv\n const positional: string[] = []\n const flags: Record<string, string> = {}\n for (let i = 0; i < rest.length; i++) {\n const tok = rest[i]\n if (tok.startsWith('--')) {\n const key = tok.slice(2)\n const next = rest[i + 1]\n if (next != null && !next.startsWith('--')) {\n flags[key] = next\n i++\n } else {\n flags[key] = 'true'\n }\n } else {\n positional.push(tok)\n }\n }\n return { command: command ?? 'help', positional, flags }\n}\n\nconst HELP = `agent-eval — wire-protocol entry point.\n\nCommands:\n serve [--port 5005] [--host 127.0.0.1]\n Start the HTTP server. POST /v1/judge, GET /v1/rubrics, GET /v1/version, GET /openapi.json.\n rpc <method>\n Read one JSON object from stdin (the params for <method>), write one\n JSON object to stdout. 
Method ∈ {judge, listRubrics, version}.\n rpc-batch <method>\n Like 'rpc' but JSONL in / JSONL out.\n openapi [--out openapi.json]\n Write the OpenAPI 3.1 spec.\n version\n Print server + wire-protocol version JSON.\n\nWithout arguments, prints this help.`\n\nasync function main(): Promise<number> {\n const { command, positional, flags } = parseArgs(process.argv.slice(2))\n\n switch (command) {\n case 'serve': {\n const port = Number(flags.port ?? 5005)\n const host = flags.host ?? '127.0.0.1'\n const server = startServer({ port, host })\n // Keep process alive on SIGINT/SIGTERM\n const shutdown = (sig: string) => {\n // eslint-disable-next-line no-console\n console.log(`[agent-eval] received ${sig}, shutting down`)\n server.close(() => process.exit(0))\n // Force exit after 5s if close hangs\n setTimeout(() => process.exit(1), 5000).unref()\n }\n process.on('SIGINT', () => shutdown('SIGINT'))\n process.on('SIGTERM', () => shutdown('SIGTERM'))\n // Block forever\n await new Promise(() => {})\n return 0\n }\n case 'rpc': {\n const [method] = positional\n return await runRpcOnce(method)\n }\n case 'rpc-batch': {\n const [method] = positional\n return await runRpcBatch(method)\n }\n case 'openapi': {\n const out = flags.out ?? 
'openapi.json'\n const spec = buildOpenApi(handleVersion().version)\n writeFileSync(out, JSON.stringify(spec, null, 2) + '\\n', 'utf-8')\n // eslint-disable-next-line no-console\n console.log(`[agent-eval] wrote OpenAPI 3.1 spec to ${out}`)\n return 0\n }\n case 'version': {\n process.stdout.write(JSON.stringify(handleVersion(), null, 2) + '\\n')\n return 0\n }\n case 'help':\n case '--help':\n case '-h':\n case '':\n process.stdout.write(HELP + '\\n')\n return 0\n default:\n process.stderr.write(`unknown command: ${command}\\n${HELP}\\n`)\n return 1\n }\n}\n\nmain()\n .then((code) => process.exit(code))\n .catch((err) => {\n // eslint-disable-next-line no-console\n console.error('[agent-eval] cli error:', err)\n process.exit(1)\n })\n"],"mappings":"
|
|
1
|
+
{"version":3,"sources":["../src/cli.ts"],"sourcesContent":["#!/usr/bin/env node\n/**\n * agent-eval CLI.\n *\n * agent-eval serve [--port 5005] [--host 127.0.0.1]\n * agent-eval rpc <method> # one request from stdin → one response on stdout\n * agent-eval rpc-batch <method> # JSONL stdin → JSONL stdout\n * agent-eval openapi [--out path] # write OpenAPI spec\n * agent-eval version\n *\n * <method> is one of: judge, listRubrics, version. When omitted, the\n * stdin payload must be a full {method, params} envelope.\n */\nimport { writeFileSync } from 'node:fs'\n\nimport { buildOpenApi } from './wire/openapi'\nimport { handleVersion } from './wire/handlers'\nimport { runRpcBatch, runRpcOnce } from './wire/rpc'\nimport { startServer } from './wire/server'\n\ninterface Args {\n command: string\n positional: string[]\n flags: Record<string, string>\n}\n\nfunction parseArgs(argv: string[]): Args {\n const [command, ...rest] = argv\n const positional: string[] = []\n const flags: Record<string, string> = {}\n for (let i = 0; i < rest.length; i++) {\n const tok = rest[i]\n if (tok.startsWith('--')) {\n const key = tok.slice(2)\n const next = rest[i + 1]\n if (next != null && !next.startsWith('--')) {\n flags[key] = next\n i++\n } else {\n flags[key] = 'true'\n }\n } else {\n positional.push(tok)\n }\n }\n return { command: command ?? 'help', positional, flags }\n}\n\nconst HELP = `agent-eval — wire-protocol entry point.\n\nCommands:\n serve [--port 5005] [--host 127.0.0.1]\n Start the HTTP server. POST /v1/judge, GET /v1/rubrics, GET /v1/version, GET /openapi.json.\n rpc <method>\n Read one JSON object from stdin (the params for <method>), write one\n JSON object to stdout. 
Method ∈ {judge, listRubrics, version}.\n rpc-batch <method>\n Like 'rpc' but JSONL in / JSONL out.\n openapi [--out openapi.json]\n Write the OpenAPI 3.1 spec.\n version\n Print server + wire-protocol version JSON.\n\nWithout arguments, prints this help.`\n\nasync function main(): Promise<number> {\n const { command, positional, flags } = parseArgs(process.argv.slice(2))\n\n switch (command) {\n case 'serve': {\n const port = Number(flags.port ?? 5005)\n const host = flags.host ?? '127.0.0.1'\n const server = startServer({ port, host })\n // Keep process alive on SIGINT/SIGTERM\n const shutdown = (sig: string) => {\n // eslint-disable-next-line no-console\n console.log(`[agent-eval] received ${sig}, shutting down`)\n server.close(() => process.exit(0))\n // Force exit after 5s if close hangs\n setTimeout(() => process.exit(1), 5000).unref()\n }\n process.on('SIGINT', () => shutdown('SIGINT'))\n process.on('SIGTERM', () => shutdown('SIGTERM'))\n // Block forever\n await new Promise(() => {})\n return 0\n }\n case 'rpc': {\n const [method] = positional\n return await runRpcOnce(method)\n }\n case 'rpc-batch': {\n const [method] = positional\n return await runRpcBatch(method)\n }\n case 'openapi': {\n const out = flags.out ?? 
'openapi.json'\n const spec = buildOpenApi(handleVersion().version)\n writeFileSync(out, JSON.stringify(spec, null, 2) + '\\n', 'utf-8')\n // eslint-disable-next-line no-console\n console.log(`[agent-eval] wrote OpenAPI 3.1 spec to ${out}`)\n return 0\n }\n case 'version': {\n process.stdout.write(JSON.stringify(handleVersion(), null, 2) + '\\n')\n return 0\n }\n case 'help':\n case '--help':\n case '-h':\n case '':\n process.stdout.write(HELP + '\\n')\n return 0\n default:\n process.stderr.write(`unknown command: ${command}\\n${HELP}\\n`)\n return 1\n }\n}\n\nmain()\n .then((code) => process.exit(code))\n .catch((err) => {\n // eslint-disable-next-line no-console\n console.error('[agent-eval] cli error:', err)\n process.exit(1)\n })\n"],"mappings":";;;;;;;;;;;;;AAaA,SAAS,qBAAqB;AAa9B,SAAS,UAAU,MAAsB;AACvC,QAAM,CAAC,SAAS,GAAG,IAAI,IAAI;AAC3B,QAAM,aAAuB,CAAC;AAC9B,QAAM,QAAgC,CAAC;AACvC,WAAS,IAAI,GAAG,IAAI,KAAK,QAAQ,KAAK;AACpC,UAAM,MAAM,KAAK,CAAC;AAClB,QAAI,IAAI,WAAW,IAAI,GAAG;AACxB,YAAM,MAAM,IAAI,MAAM,CAAC;AACvB,YAAM,OAAO,KAAK,IAAI,CAAC;AACvB,UAAI,QAAQ,QAAQ,CAAC,KAAK,WAAW,IAAI,GAAG;AAC1C,cAAM,GAAG,IAAI;AACb;AAAA,MACF,OAAO;AACL,cAAM,GAAG,IAAI;AAAA,MACf;AAAA,IACF,OAAO;AACL,iBAAW,KAAK,GAAG;AAAA,IACrB;AAAA,EACF;AACA,SAAO,EAAE,SAAS,WAAW,QAAQ,YAAY,MAAM;AACzD;AAEA,IAAM,OAAO;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAiBb,eAAe,OAAwB;AACrC,QAAM,EAAE,SAAS,YAAY,MAAM,IAAI,UAAU,QAAQ,KAAK,MAAM,CAAC,CAAC;AAEtE,UAAQ,SAAS;AAAA,IACf,KAAK,SAAS;AACZ,YAAM,OAAO,OAAO,MAAM,QAAQ,IAAI;AACtC,YAAM,OAAO,MAAM,QAAQ;AAC3B,YAAM,SAAS,YAAY,EAAE,MAAM,KAAK,CAAC;AAEzC,YAAM,WAAW,CAAC,QAAgB;AAEhC,gBAAQ,IAAI,yBAAyB,GAAG,iBAAiB;AACzD,eAAO,MAAM,MAAM,QAAQ,KAAK,CAAC,CAAC;AAElC,mBAAW,MAAM,QAAQ,KAAK,CAAC,GAAG,GAAI,EAAE,MAAM;AAAA,MAChD;AACA,cAAQ,GAAG,UAAU,MAAM,SAAS,QAAQ,CAAC;AAC7C,cAAQ,GAAG,WAAW,MAAM,SAAS,SAAS,CAAC;AAE/C,YAAM,IAAI,QAAQ,MAAM;AAAA,MAAC,CAAC;AAC1B,aAAO;AAAA,IACT;AAAA,IACA,KAAK,OAAO;AACV,YAAM,CAAC,MAAM,IAAI;AACjB,aAAO,MAAM,WAAW,MAAM;AAAA,IAChC;AAAA,IACA,KAAK,aAAa;AAChB
,YAAM,CAAC,MAAM,IAAI;AACjB,aAAO,MAAM,YAAY,MAAM;AAAA,IACjC;AAAA,IACA,KAAK,WAAW;AACd,YAAM,MAAM,MAAM,OAAO;AACzB,YAAM,OAAO,aAAa,cAAc,EAAE,OAAO;AACjD,oBAAc,KAAK,KAAK,UAAU,MAAM,MAAM,CAAC,IAAI,MAAM,OAAO;AAEhE,cAAQ,IAAI,0CAA0C,GAAG,EAAE;AAC3D,aAAO;AAAA,IACT;AAAA,IACA,KAAK,WAAW;AACd,cAAQ,OAAO,MAAM,KAAK,UAAU,cAAc,GAAG,MAAM,CAAC,IAAI,IAAI;AACpE,aAAO;AAAA,IACT;AAAA,IACA,KAAK;AAAA,IACL,KAAK;AAAA,IACL,KAAK;AAAA,IACL,KAAK;AACH,cAAQ,OAAO,MAAM,OAAO,IAAI;AAChC,aAAO;AAAA,IACT;AACE,cAAQ,OAAO,MAAM,oBAAoB,OAAO;AAAA,EAAK,IAAI;AAAA,CAAI;AAC7D,aAAO;AAAA,EACX;AACF;AAEA,KAAK,EACF,KAAK,CAAC,SAAS,QAAQ,KAAK,IAAI,CAAC,EACjC,MAAM,CAAC,QAAQ;AAEd,UAAQ,MAAM,2BAA2B,GAAG;AAC5C,UAAQ,KAAK,CAAC;AAChB,CAAC;","names":[]}
|
|
@@ -0,0 +1,259 @@
|
|
|
1
|
+
import { c as ControlEvalResult, i as ControlRunResult, F as FeedbackLabel, A as ProposedSideEffect, j as ControlRuntimeConfig } from './feedback-trajectory-CB0A32o3.js';
|
|
2
|
+
import { R as RunSplitTag, e as RunTokenUsage, a as RunRecord } from './run-record-CX_jcAyr.js';
|
|
3
|
+
import { T as TraceStore, F as FailureClass } from './store-u47QaJ9G.js';
|
|
4
|
+
import { T as TraceEmitter } from './emitter-B2XqDKFU.js';
|
|
5
|
+
|
|
6
|
+
interface RunEvidenceMetadata {
|
|
7
|
+
experimentId: string;
|
|
8
|
+
candidateId: string;
|
|
9
|
+
seed: number;
|
|
10
|
+
model: string;
|
|
11
|
+
promptHash: string;
|
|
12
|
+
configHash: string;
|
|
13
|
+
commitSha: string;
|
|
14
|
+
splitTag: RunSplitTag;
|
|
15
|
+
tokenUsage: RunTokenUsage;
|
|
16
|
+
queueMs?: number;
|
|
17
|
+
judgeMetadata?: RunRecord['judgeMetadata'];
|
|
18
|
+
raw?: Record<string, number>;
|
|
19
|
+
}
|
|
20
|
+
interface ControlRunToRunRecordOptions extends RunEvidenceMetadata {
|
|
21
|
+
runId?: string;
|
|
22
|
+
score?: number;
|
|
23
|
+
failureMode?: string;
|
|
24
|
+
}
|
|
25
|
+
/**
|
|
26
|
+
* Project a completed control-loop run into the strict RunRecord shape used by
|
|
27
|
+
* release gates, optimizer tables, and research reports.
|
|
28
|
+
*
|
|
29
|
+
* The control loop owns live execution evidence. The caller still supplies the
|
|
30
|
+
* experimental cell metadata because prompt/config hashes, split assignment,
|
|
31
|
+
* model snapshot, and commit SHA are product/harness concerns.
|
|
32
|
+
*/
|
|
33
|
+
declare function controlRunToRunRecord<TState, TAction, TActionResult, TEval extends ControlEvalResult = ControlEvalResult>(run: ControlRunResult<TState, TAction, TActionResult, TEval>, options: ControlRunToRunRecordOptions): RunRecord;
|
|
34
|
+
declare function scoreFromEvals(evals: readonly ControlEvalResult[]): number | undefined;
|
|
35
|
+
|
|
36
|
+
interface ActionExecutionPolicy {
|
|
37
|
+
allowedTypes?: string[];
|
|
38
|
+
blockedTypes?: string[];
|
|
39
|
+
alwaysRequireApprovalTypes?: string[];
|
|
40
|
+
autoApproveTypes?: string[];
|
|
41
|
+
requireApprovalForExternalSideEffects?: boolean;
|
|
42
|
+
requireApprovalAboveCostUsd?: number;
|
|
43
|
+
maxActionCostUsd?: number;
|
|
44
|
+
remainingBudgetUsd?: number;
|
|
45
|
+
expectedOutcomeRequired?: boolean;
|
|
46
|
+
killCriteriaRequired?: boolean;
|
|
47
|
+
}
|
|
48
|
+
interface ActionPolicyDecision {
|
|
49
|
+
allowed: boolean;
|
|
50
|
+
blocked: boolean;
|
|
51
|
+
requiresApproval: boolean;
|
|
52
|
+
reasons: string[];
|
|
53
|
+
label?: FeedbackLabel;
|
|
54
|
+
}
|
|
55
|
+
declare function evaluateActionPolicy(action: ProposedSideEffect, policy?: ActionExecutionPolicy, options?: {
|
|
56
|
+
createdAt?: string;
|
|
57
|
+
}): ActionPolicyDecision;
|
|
58
|
+
|
|
59
|
+
/**
|
|
60
|
+
* Propose / Verify / Review — the core multi-shot primitive.
|
|
61
|
+
*
|
|
62
|
+
* shot N: propose(state, priorReview) → new state
|
|
63
|
+
* verify(state) → pass/fail, optional layers
|
|
64
|
+
* review(state, verification, memory) → observations + next-shot
|
|
65
|
+
* instruction + shouldContinue
|
|
66
|
+
* memory.append(entry)
|
|
67
|
+
*
|
|
68
|
+
* Roles are strictly separated:
|
|
69
|
+
*
|
|
70
|
+
* - The WORKER is whatever the caller wraps in `propose`. It is
|
|
71
|
+
* stateful — caller owns its resume/session mechanism.
|
|
72
|
+
* - The VERIFIER grades the state. It produces the ground truth.
|
|
73
|
+
* The reviewer cannot overturn or downgrade a verification layer.
|
|
74
|
+
* - The REVIEWER is stateless per call. Its continuity is the
|
|
75
|
+
* `ReviewMemoryStore` — durable JSONL by default, or any store
|
|
76
|
+
* implementing the interface. It reads memory + trace summary +
|
|
77
|
+
* verification and directs the NEXT proposer shot.
|
|
78
|
+
*
|
|
79
|
+
* This shape is load-bearing. The reviewer never grades; the verifier
|
|
80
|
+
* never directs. Two processes, two prompts, two concerns — which is
|
|
81
|
+
* what keeps the loop from confirmation-biasing itself into "all
|
|
82
|
+
* passed" when it didn't.
|
|
83
|
+
*
|
|
84
|
+
* Short-circuits and soft-fails are both first-class:
|
|
85
|
+
* - verify.pass === true → reviewer LLM call is skipped, memory
|
|
86
|
+
* records a success entry, loop exits.
|
|
87
|
+
* - review throws → the shot still counts; the loop uses the
|
|
88
|
+
* last-known instruction (or `fallbackInstruction`) for the next
|
|
89
|
+
* propose call. A transient reviewer failure must NEVER abort a
|
|
90
|
+
* valid arc.
|
|
91
|
+
*
|
|
92
|
+
* Composable: `propose` itself can be another `runProposeReview` call.
|
|
93
|
+
* That's the dogfooding path — a harness built on this primitive is in
|
|
94
|
+
* turn evaluable by it.
|
|
95
|
+
*/
|
|
96
|
+
|
|
97
|
+
interface Verification {
|
|
98
|
+
pass: boolean;
|
|
99
|
+
score?: number;
|
|
100
|
+
failingLayers?: string[];
|
|
101
|
+
details?: unknown;
|
|
102
|
+
}
|
|
103
|
+
interface Review {
|
|
104
|
+
observations: string;
|
|
105
|
+
diagnosis: string;
|
|
106
|
+
nextShotInstruction: string;
|
|
107
|
+
shouldContinue: boolean;
|
|
108
|
+
confidence: number;
|
|
109
|
+
}
|
|
110
|
+
/**
 * One record held by a `ReviewMemoryStore`: a `Review` plus the shot it
 * belongs to and a trimmed copy of that shot's verification.
 */
interface ReviewMemoryEntry extends Review {
  /** Shot number this entry was recorded for. */
  shot: number;
  /**
   * When the entry was recorded.
   * NOTE(review): units/epoch are not stated in this declaration — confirm.
   */
  timestamp: number;
  /** Verification summary kept with the entry (no `details` payload). */
  verification: {
    pass: boolean;
    score?: number;
    failingLayers?: Array<string>;
  };
}
|
|
119
|
+
/**
 * Argument handed to a `ProposeFn` for a single shot.
 *
 * @typeParam State - Caller-defined state type threaded through the loop.
 */
interface ProposeInput<State> {
  /** Shot number of this propose call. */
  shot: number;
  /** The goal string supplied in the loop configuration. */
  goal: string;
  /** State the proposer starts this shot from. */
  state: State;
  /** Review carried over from an earlier shot, or `null` when there is none. */
  priorReview: Review | null;
  /** Signal the proposer can observe for cancellation. */
  abortSignal: AbortSignal;
  /** Optional trace emitter made available to the proposer. */
  emitter?: TraceEmitter;
}
|
|
127
|
+
/**
 * Value a `ProposeFn` resolves to.
 *
 * @typeParam State - Caller-defined state type.
 * @typeParam Summary - Caller-defined trace-summary type (defaults to `unknown`).
 */
interface ProposeOutput<State, Summary = unknown> {
  /** State produced by the proposer for this shot. */
  state: State;
  /** Optional summary of the shot's trace, later passed to the reviewer. */
  traceSummary?: Summary;
}
|
|
131
|
+
/**
 * Argument handed to a `ReviewFn` for a single shot.
 *
 * @typeParam State - Caller-defined state type.
 * @typeParam Summary - Caller-defined trace-summary type (defaults to `unknown`).
 */
interface ReviewInput<State, Summary = unknown> {
  /** Shot number being reviewed. */
  shot: number;
  /** The goal string supplied in the loop configuration. */
  goal: string;
  /** State produced by the proposer for this shot. */
  state: State;
  /** Verification result for `state`. */
  verification: Verification;
  /** Trace summary from the proposer, or `undefined` when none was returned. */
  traceSummary: Summary | undefined;
  /** Review-memory entries available to the reviewer. */
  memory: Array<ReviewMemoryEntry>;
}
|
|
139
|
+
/**
 * The WORKER callback: receives a `ProposeInput` and resolves to the new
 * state plus an optional trace summary.
 */
type ProposeFn<State, Summary = unknown> = (
  input: ProposeInput<State>,
) => Promise<ProposeOutput<State, Summary>>;
|
|
140
|
+
/**
 * The VERIFIER callback: grades a state and resolves to a `Verification`.
 */
type VerifyFn<State> = (
  state: State,
) => Promise<Verification>;
|
|
141
|
+
/**
 * The REVIEWER callback: receives a `ReviewInput` and resolves to a `Review`.
 */
type ReviewFn<State, Summary = unknown> = (
  input: ReviewInput<State, Summary>,
) => Promise<Review>;
|
|
142
|
+
/**
 * Storage contract for review memory. Any object with these two async
 * methods can serve as the reviewer's memory.
 */
interface ReviewMemoryStore {
  /** Resolve to the stored entries. */
  load(): Promise<Array<ReviewMemoryEntry>>;
  /** Add one entry to the store. */
  append(entry: ReviewMemoryEntry): Promise<void>;
}
|
|
146
|
+
/**
 * Configuration accepted by `runProposeReview`.
 *
 * @typeParam State - Caller-defined state type threaded through the loop.
 * @typeParam Summary - Caller-defined trace-summary type (defaults to `unknown`).
 */
interface ProposeReviewConfig<State, Summary = unknown> {
  /** Goal string forwarded to the proposer and reviewer inputs. */
  goal: string;
  /** State the loop starts from. */
  initialState: State;
  /** WORKER callback. */
  propose: ProposeFn<State, Summary>;
  /** VERIFIER callback. */
  verify: VerifyFn<State>;
  /** REVIEWER callback. */
  review: ReviewFn<State, Summary>;

  /** Hard cap on the number of shots. Defaults to 10. */
  maxShots?: number;
  /** Wall-clock cap in milliseconds. Defaults to 10 minutes. */
  maxWallMs?: number;

  /**
   * Early-termination threshold: when the reviewer reports confidence at or
   * below this floor for `confidenceFloorWindow` consecutive shots, the loop
   * stops. Defaults to 0.3. A negative floor disables the check.
   */
  confidenceFloor?: number;
  /**
   * Number of consecutive low-confidence shots that triggers early
   * termination. Defaults to 2. A window of 0 disables the check.
   */
  confidenceFloorWindow?: number;

  /** Review memory store. An in-memory store is used when omitted. */
  memory?: ReviewMemoryStore;

  /** When provided, a Run and per-shot spans are emitted to this store. */
  store?: TraceStore;
  scenarioId?: string;
  projectId?: string;
  variantId?: string;

  /**
   * Instruction used when the reviewer soft-fails on shot 1, where no prior
   * instruction exists to fall back on. Defaults to a generic
   * "inspect failures and fix".
   */
  fallbackInstruction?: string;
}
|
|
176
|
+
/**
 * Per-shot record listed in `ProposeReviewReport.shots`.
 */
interface ProposeReviewShot<State, Summary = unknown> {
  /** Shot number. */
  shot: number;
  /** State after this shot's propose call. */
  state: State;
  /** Verification result for `state`. */
  verification: Verification;
  /** Trace summary returned by the proposer, or `undefined` when absent. */
  traceSummary: Summary | undefined;
  /** Review associated with this shot. */
  review: Review;
  /**
   * Whether a reviewer result was available for this shot.
   * NOTE(review): presumably `false` when the reviewer soft-failed — confirm.
   */
  reviewAvailable: boolean;
  /** Optional error text from the reviewer. */
  reviewError?: string;
  /** Duration of the shot, in milliseconds per the `Ms` suffix. */
  durationMs: number;
}
|
|
186
|
+
/**
 * Value `runProposeReview` resolves to.
 */
interface ProposeReviewReport<State, Summary = unknown> {
  /**
   * Identifier of the emitted Run, or `null`.
   * NOTE(review): presumably `null` when no trace `store` was configured — confirm.
   */
  runId: string | null;
  /** Whether the loop completed. */
  completed: boolean;
  /** One record per executed shot. */
  shots: Array<ProposeReviewShot<State, Summary>>;
  /** State at the end of the loop. */
  finalState: State;
  /** Verification result at the end of the loop. */
  finalVerification: Verification;
  /** Optional failure classification. */
  failureClass?: FailureClass;
  /** Wall-clock time, in milliseconds per the `Ms` suffix. */
  wallMs: number;
  /** Numeric score for the run. */
  score: number;
}
|
|
196
|
+
/**
 * Create an in-memory `ReviewMemoryStore`.
 *
 * @param initial - Optional entries to seed the store with.
 * @returns A store implementing `load` / `append`.
 */
declare function inMemoryReviewStore(
  initial?: ReviewMemoryEntry[],
): ReviewMemoryStore;
|
|
197
|
+
/**
 * Create a JSONL-backed `ReviewMemoryStore`.
 *
 * @param path - Location of the JSONL file backing the store.
 * @returns A store implementing `load` / `append`.
 */
declare function jsonlReviewStore(
  path: string,
): ReviewMemoryStore;
|
|
198
|
+
/**
 * Run the propose / verify / review loop described by `config`.
 *
 * @typeParam State - Caller-defined state type.
 * @typeParam Summary - Caller-defined trace-summary type (defaults to `unknown`).
 * @param config - Callbacks, limits and optional stores for the loop.
 * @returns A report with the per-shot records and the final state/verification.
 */
declare function runProposeReview<State, Summary = unknown>(
  config: ProposeReviewConfig<State, Summary>,
): Promise<ProposeReviewReport<State, Summary>>;
|
|
199
|
+
/**
 * Callable that sends a system prompt and a user prompt and resolves to an
 * untyped result. The result is `unknown`, so callers must narrow it.
 */
interface LlmJsonCall {
  (req: { system: string; user: string }): Promise<unknown>;
}
|
|
205
|
+
/**
 * Options accepted by `createLlmReviewer`.
 */
interface LlmReviewerConfig<State, Summary = unknown> {
  /** Function used to make the LLM call. */
  callJson: LlmJsonCall;
  /** Optional renderer that turns the state into text. */
  renderState?: (state: State) => string;
  /** Optional renderer that turns the (possibly absent) trace summary into text. */
  renderTraceSummary?: (summary: Summary | undefined) => string;
  /** Text appended to the default system prompt. */
  systemPromptAddendum?: string;
}
|
|
212
|
+
/**
 * Build a `ReviewFn` from an LLM reviewer configuration.
 *
 * @param cfg - LLM call function plus optional renderers and prompt addendum.
 * @returns A reviewer callback usable as the `review` field of the loop config.
 */
declare function createLlmReviewer<State, Summary = unknown>(
  cfg: LlmReviewerConfig<State, Summary>,
): ReviewFn<State, Summary>;
|
|
213
|
+
|
|
214
|
+
/**
 * State type used when the propose/review loop is run through
 * `runProposeReviewAsControlLoop`.
 */
interface ProposeReviewControlState<State, Summary = unknown> {
  /** Shot number. */
  shot: number;
  /** Caller-defined state. */
  state: State;
  /** Review from an earlier shot, or `null` when there is none. */
  priorReview: Review | null;
  /** Verification result for `state`. */
  verification: Verification;
  /** Optional trace summary. */
  traceSummary?: Summary;
  /** Review-memory entries. */
  memory: Array<ReviewMemoryEntry>;
  /** Whether the loop has completed. */
  completed: boolean;
  /** Whether a reviewer result was available. */
  reviewAvailable: boolean;
  /** Optional error text from the reviewer. */
  reviewError?: string;
}
|
|
225
|
+
/**
 * Action type used by `runProposeReviewAsControlLoop`. The only variant is
 * tagged `'propose-review-shot'`.
 */
interface ProposeReviewControlAction {
  /** Literal tag identifying the action. */
  type: 'propose-review-shot';
  /** Shot number the action refers to. */
  shot: number;
}
|
|
229
|
+
/**
 * Action-result type used by `runProposeReviewAsControlLoop`.
 */
interface ProposeReviewControlResult<State, Summary = unknown> {
  /** Caller-defined state after the action. */
  state: State;
  /** Verification result for `state`. */
  verification: Verification;
  /** Optional trace summary. */
  traceSummary?: Summary;
  /** Review for the shot, or `null` when there is none. */
  review: Review | null;
  /** Whether a reviewer result was available. */
  reviewAvailable: boolean;
  /** Optional error text from the reviewer. */
  reviewError?: string;
}
|
|
237
|
+
/**
 * Configuration accepted by `runProposeReviewAsControlLoop`. It declares the
 * same field names as `ProposeReviewConfig`, plus `failureClassFromVerification`
 * and `actionFailure`.
 */
interface ProposeReviewControlConfig<State, Summary = unknown> {
  /** Goal string. */
  goal: string;
  /** State the loop starts from. */
  initialState: State;
  /** WORKER callback. */
  propose: ProposeFn<State, Summary>;
  /** VERIFIER callback. */
  verify: VerifyFn<State>;
  /** REVIEWER callback. */
  review: ReviewFn<State, Summary>;

  /** Optional cap on the number of shots. */
  maxShots?: number;
  /** Optional wall-clock cap, in milliseconds per the `Ms` suffix. */
  maxWallMs?: number;

  /** Optional review memory store. */
  memory?: ReviewMemoryStore;
  /** Optional trace store. */
  store?: TraceStore;
  scenarioId?: string;
  projectId?: string;
  variantId?: string;

  /** Optional instruction to use when no reviewer instruction is available. */
  fallbackInstruction?: string;
  /** Optional confidence floor. */
  confidenceFloor?: number;
  /** Optional confidence-floor window. */
  confidenceFloorWindow?: number;

  /** Optional mapping from a verification to a failure class. */
  failureClassFromVerification?: (verification: Verification) => FailureClass | undefined;
  /** Optional action-failure setting, typed by the control runtime's own config. */
  actionFailure?: ControlRuntimeConfig<
    ProposeReviewControlState<State, Summary>,
    ProposeReviewControlAction,
    ProposeReviewControlResult<State, Summary>
  >['actionFailure'];
}
|
|
256
|
+
/**
 * Run the propose/review loop and resolve to a `ControlRunResult`.
 *
 * @param config - Loop callbacks and limits, plus control-runtime options.
 * @returns The control-run result, parameterized by the propose/review
 *   state, action and result types.
 */
declare function runProposeReviewAsControlLoop<State, Summary = unknown>(
  config: ProposeReviewControlConfig<State, Summary>,
): Promise<
  ControlRunResult<
    ProposeReviewControlState<State, Summary>,
    ProposeReviewControlAction,
    ProposeReviewControlResult<State, Summary>
  >
>;
|
|
257
|
+
/**
 * Derive a failure class from a verification result.
 *
 * @param verification - The verification to classify.
 * @returns A `FailureClass`, or `undefined` when none applies.
 */
declare function controlFailureClassFromVerification(
  verification: Verification,
): FailureClass | undefined;
|
|
258
|
+
|
|
259
|
+
export { type ActionExecutionPolicy as A, type ControlRunToRunRecordOptions as C, type LlmJsonCall as L, type ProposeFn as P, type Review as R, type Verification as V, type ActionPolicyDecision as a, type LlmReviewerConfig as b, type ProposeInput as c, type ProposeOutput as d, type ProposeReviewConfig as e, type ProposeReviewControlAction as f, type ProposeReviewControlConfig as g, type ProposeReviewControlResult as h, type ProposeReviewControlState as i, type ProposeReviewReport as j, type ProposeReviewShot as k, type ReviewFn as l, type ReviewInput as m, type ReviewMemoryEntry as n, type ReviewMemoryStore as o, type RunEvidenceMetadata as p, type VerifyFn as q, controlFailureClassFromVerification as r, controlRunToRunRecord as s, createLlmReviewer as t, evaluateActionPolicy as u, inMemoryReviewStore as v, jsonlReviewStore as w, runProposeReview as x, runProposeReviewAsControlLoop as y, scoreFromEvals as z };
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
export { d as ControlActionFailureMode, e as ControlActionOutcome, f as ControlBudget, g as ControlContext, h as ControlDecision, c as ControlEvalResult, i as ControlRunResult, j as ControlRuntimeConfig, k as ControlRuntimeError, C as ControlSeverity, l as ControlStep, m as ControlStopPolicies, S as StopDecision, B as allCriticalPassed, M as objectiveEval, T as runAgentControlLoop, V as stopOnNoProgress, W as stopOnRepeatedAction, X as subjectiveEval } from './feedback-trajectory-CB0A32o3.js';
|
|
2
|
+
export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, p as RunEvidenceMetadata, s as controlRunToRunRecord, u as evaluateActionPolicy, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-cxwMOAsy.js';
|
|
3
|
+
import './dataset-B9qvlm_o.js';
|
|
4
|
+
import './emitter-B2XqDKFU.js';
|
|
5
|
+
import './store-u47QaJ9G.js';
|
|
6
|
+
import './run-record-CX_jcAyr.js';
|
package/dist/control.js
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import {
|
|
2
|
+
allCriticalPassed,
|
|
3
|
+
controlRunToRunRecord,
|
|
4
|
+
evaluateActionPolicy,
|
|
5
|
+
objectiveEval,
|
|
6
|
+
runAgentControlLoop,
|
|
7
|
+
runProposeReview,
|
|
8
|
+
runProposeReviewAsControlLoop,
|
|
9
|
+
scoreFromEvals,
|
|
10
|
+
stopOnNoProgress,
|
|
11
|
+
stopOnRepeatedAction,
|
|
12
|
+
subjectiveEval
|
|
13
|
+
} from "./chunk-ARZ6BEV6.js";
|
|
14
|
+
import "./chunk-YUFXO3TU.js";
|
|
15
|
+
import "./chunk-5IIQKMD5.js";
|
|
16
|
+
import "./chunk-PZ5AY32C.js";
|
|
17
|
+
export {
|
|
18
|
+
allCriticalPassed,
|
|
19
|
+
controlRunToRunRecord,
|
|
20
|
+
evaluateActionPolicy,
|
|
21
|
+
objectiveEval,
|
|
22
|
+
runAgentControlLoop,
|
|
23
|
+
runProposeReview,
|
|
24
|
+
runProposeReviewAsControlLoop,
|
|
25
|
+
scoreFromEvals,
|
|
26
|
+
stopOnNoProgress,
|
|
27
|
+
stopOnRepeatedAction,
|
|
28
|
+
subjectiveEval
|
|
29
|
+
};
|
|
30
|
+
//# sourceMappingURL=control.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":[],"sourcesContent":[],"mappings":"","names":[]}
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Dataset — versioned, sliceable, content-hashed scenario collection.
|
|
3
|
+
*
|
|
4
|
+
* Scenarios stop being ephemeral arrays and become first-class
|
|
5
|
+
* artifacts. Every Dataset carries:
|
|
6
|
+
* - content hash (sha256 over canonicalized scenario array)
|
|
7
|
+
* - provenance (contributor, createdAt, sourceUrl)
|
|
8
|
+
* - split labels (train | dev | test | holdout)
|
|
9
|
+
* - difficulty tiers (easy | medium | hard | extreme)
|
|
10
|
+
* - tags (free-form, per-scenario)
|
|
11
|
+
*
|
|
12
|
+
* `Dataset.slice({ difficulty, split, holdout, seed })` returns a
|
|
13
|
+
* deterministic, reproducible subset. Holdout slices are locked: you
|
|
14
|
+
* can read them but `mutate` throws, which prevents "oh I'll just
|
|
15
|
+
* tweak that one scenario" contamination drift.
|
|
16
|
+
*/
|
|
17
|
+
/** Split label a scenario can carry. */
type DatasetSplit =
  | 'train'
  | 'dev'
  | 'test'
  | 'holdout';
|
|
18
|
+
/** Difficulty tier a scenario can carry. */
type DatasetDifficulty =
  | 'easy'
  | 'medium'
  | 'hard'
  | 'extreme';
|
|
19
|
+
/**
 * A single scenario stored in a `Dataset`.
 */
interface DatasetScenario {
  /** Scenario identifier. */
  id: string;
  /** Arbitrary payload; the framework does not interpret it. */
  payload: unknown;
  /** Optional split label. */
  split?: DatasetSplit;
  /** Optional difficulty tier. */
  difficulty?: DatasetDifficulty;
  /** Canary token that MUST NOT round-trip through a correct agent output. */
  canary?: string;
  /**
   * Behavioral-canary forbidden pattern: either a plain string or a
   * serialized regex (`/.../flags`) that the agent under test MUST NOT emit.
   * Consumed by
   * {@link import('./canary').checkBehavioralCanary | checkBehavioralCanary},
   * which inverts the contamination-style semantic — finding the pattern in
   * the agent output is a LEAK / failure, not a positive signal.
   *
   * When omitted, {@link canary} is used instead.
   */
  forbiddenPattern?: string;
  /** Free-form string tags for this scenario. */
  tags?: Record<string, string>;
}
|
|
39
|
+
/**
 * Provenance metadata attached to a `Dataset`.
 */
interface DatasetProvenance {
  /** Optional contributor name. */
  contributor?: string;
  /**
   * Creation time, as a string.
   * NOTE(review): the expected format is not stated in this declaration — confirm.
   */
  createdAt: string;
  /** Optional URL of the source. */
  sourceUrl?: string;
  /** Optional license text or identifier. */
  license?: string;
  /** Optional description. */
  description?: string;
  /** Monotonic human-readable version (e.g. "2026.04.20"). */
  version: string;
}
|
|
48
|
+
/**
 * Manifest describing a `Dataset`: name, provenance, content hash and counts.
 */
interface DatasetManifest {
  /** Dataset name. */
  name: string;
  /** Provenance metadata. */
  provenance: DatasetProvenance;
  /** sha256 hex digest over the canonicalized scenarios. */
  contentHash: string;
  /** Number of scenarios. */
  scenarioCount: number;
  /** Scenario count for each split label. */
  splitCounts: Record<DatasetSplit, number>;
}
|
|
56
|
+
/**
 * Options accepted by `Dataset.slice`.
 */
interface SliceOptions {
  /** Keep only scenarios with this split label. */
  split?: DatasetSplit;
  /** Keep only scenarios with this difficulty tier. */
  difficulty?: DatasetDifficulty;
  /** Number of scenarios to take (seeded random sample). Omit to take every match. */
  limit?: number;
  /** Seed for the random sample taken when `limit` is set. */
  seed?: number;
  /** Extra predicate, applied after the split/difficulty filters. */
  filter?: (scenario: DatasetScenario) => boolean;
  /** When true, scenarios marked as holdout are included. Defaults to false. */
  includeHoldout?: boolean;
}
|
|
67
|
+
/**
 * Error for locked holdout datasets, which throw on mutation. Callers that
 * need a mutable dataset should fork it instead.
 */
declare class HoldoutLockedError extends Error {
  /**
   * @param datasetName - Name of the dataset the error refers to.
   */
  constructor(datasetName: string);
}
|
|
71
|
+
/**
 * A named collection of scenarios with provenance, which can be sliced,
 * cloned, locked and serialized to JSON Lines.
 */
declare class Dataset {
  /** Dataset name. */
  readonly name: string;
  /** Provenance metadata. */
  readonly provenance: DatasetProvenance;
  private scenarios;
  private locked;

  /**
   * @param init - Name, provenance, scenarios and an optional `locked` flag.
   */
  constructor(init: {
    name: string;
    provenance: DatasetProvenance;
    scenarios: DatasetScenario[];
    locked?: boolean;
  });

  /** Every scenario, read-only. Use `slice` or `clone` to work with a subset or a copy. */
  all(): readonly DatasetScenario[];

  /** Size of the dataset. */
  get size(): number;

  /**
   * Return a deterministic subset. A seed is REQUIRED whenever `limit` is
   * set, so that identical arguments yield the identical slice on any machine.
   */
  slice(options?: SliceOptions): DatasetScenario[];

  /**
   * Build the manifest (name, provenance, content hash, counts). The content
   * hash is deterministic over the canonicalized scenarios.
   */
  manifest(): Promise<DatasetManifest>;

  /** Return a fresh, unlocked copy — for post-release forks that need mutation. */
  clone(overrides?: Partial<{
    name: string;
    version: string;
  }>): Dataset;

  /** Lock the dataset. */
  lock(): void;

  /** Add a scenario. */
  add(scenario: DatasetScenario): void;

  /** Remove the scenario with the given id. */
  remove(scenarioId: string): void;

  /**
   * Serialize to JSON Lines. The output is stable and deterministic
   * byte-for-byte, which suits contamination-verifiable archives on disk.
   */
  toJsonl(): string;

  /**
   * Build a dataset from JSON Lines text.
   *
   * @param jsonl - The serialized scenarios.
   * @param manifest - Manifest fields other than the hash and counts.
   */
  static fromJsonl(
    jsonl: string,
    manifest: Omit<DatasetManifest, 'contentHash' | 'scenarioCount' | 'splitCounts'>,
  ): Dataset;
}
|
|
110
|
+
/**
 * Compute a hash string for a list of scenarios.
 * NOTE(review): presumably the sha256 content hash reported in
 * `DatasetManifest.contentHash` — confirm against the implementation.
 *
 * @param scenarios - Scenarios to hash.
 * @returns A promise resolving to the hash string.
 */
declare function hashScenarios(
  scenarios: DatasetScenario[],
): Promise<string>;
|
|
111
|
+
|
|
112
|
+
export { type DatasetSplit as D, HoldoutLockedError as H, type SliceOptions as S, type DatasetScenario as a, Dataset as b, type DatasetManifest as c, type DatasetDifficulty as d, type DatasetProvenance as e, hashScenarios as h };
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
import { T as TraceStore, R as RunOutcome, a as Run, S as Span, b as SpanKind, L as LlmSpan, c as ToolSpan, d as RetrievalSpan, J as JudgeSpan, e as SandboxSpan, E as EventKind, f as TraceEvent, B as BudgetLedgerEntry, A as Artifact, M as Message } from './store-u47QaJ9G.js';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* TraceEmitter — hierarchical span builder that auto-parents using an
|
|
5
|
+
* internal stack. One emitter per Run; emitters do NOT share state.
|
|
6
|
+
*
|
|
7
|
+
* Convenience methods (`llm`, `tool`, `retrieval`, `sandbox`; not `recordJudge`)
|
|
8
|
+
* return a `SpanHandle` with `.end()` / `.fail()` so callers don't
|
|
9
|
+
* have to thread spanIds manually. For async workflows that can't use
|
|
10
|
+
* the stack (e.g. fan-out parallel calls), pass `parentSpanId`
|
|
11
|
+
* explicitly.
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
/**
 * Handle for an open span, so callers can finish it without threading span
 * ids by hand.
 *
 * @typeParam S - Concrete span type (defaults to `Span`).
 */
interface SpanHandle<S extends Span = Span> {
  /** The span this handle refers to. */
  span: S;
  /** End the span, optionally applying a partial update. */
  end(patch?: Partial<S>): Promise<void>;
  /** Mark the span as failed with an error, optionally applying a partial update. */
  fail(error: string | Error, patch?: Partial<S>): Promise<void>;
}
|
|
19
|
+
/**
 * Context object passed to every `RunCompleteHook`.
 */
interface RunCompleteHookContext {
  /** Identifier of the run. */
  runId: string;
  /** Emitter for the run. */
  emitter: TraceEmitter;
  /** Trace store for the run. */
  store: TraceStore;
  /** Outcome the caller passed to `endRun`; `undefined` for `abortRun`. */
  outcome?: RunOutcome;
  /** Final run status. */
  status:
    | 'completed'
    | 'failed'
    | 'aborted';
}
|
|
28
|
+
/**
 * Callback run when a run completes. It may be synchronous or return a promise.
 */
type RunCompleteHook = (
  ctx: RunCompleteHookContext,
) => Promise<void> | void;
|
|
29
|
+
/**
 * Optional settings for the `TraceEmitter` constructor.
 */
interface TraceEmitterOptions {
  /** Optional run id. */
  runId?: string;
  /** Clock override, for deterministic tests. */
  now?: () => number;
  /** Id-generator override, for deterministic tests. */
  id?: () => string;
  /**
   * Hooks invoked after `endRun` / `abortRun` has written the final run
   * state. Intended for trace-analyst auto-execution, integrity assertions
   * and outbound notifications. Hooks run one after another, in the order given.
   *
   * A throwing hook is swallowed by default and logged as a `note` event on
   * the run, so that auto-orchestration cannot crash the underlying flow.
   * Use `hookErrors: 'throw'` to propagate the error instead.
   */
  onRunComplete?: RunCompleteHook[];
  /** Hook error policy: `'swallow'` (default) or `'throw'`. */
  hookErrors?: 'swallow' | 'throw';
}
|
|
48
|
+
/**
 * Emits runs, spans, events, budget entries and artifacts to a `TraceStore`.
 */
declare class TraceEmitter {
  private store;
  private stack;
  private _runId;
  private now;
  private id;
  private hooks;
  private hookErrors;

  /**
   * @param store - Trace store the emitter is bound to.
   * @param options - Optional run id, clock/id overrides and completion hooks.
   */
  constructor(store: TraceStore, options?: TraceEmitterOptions);

  /** Identifier of the run. */
  get runId(): string;

  /** The trace store. */
  get traceStore(): TraceStore;

  /** Register an additional hook after construction (e.g. to attach the trace analyst). */
  addRunCompleteHook(hook: RunCompleteHook): void;

  /**
   * Begin a Run.
   *
   * The persisted Run shape requires `scenarioId`, so every downstream Run
   * has a non-empty value and filters/aggregations stay simple. The INPUT
   * accepts it as optional: when it is omitted, startRun substitutes a
   * default (`run.layer ?? run.tags?.['kind'] ?? 'runtime'`). This spares
   * runtime / operator / meta-eval runs, which have no curated-scenario
   * corpus to anchor to, from inventing placeholder strings at the call site.
   */
  startRun(
    run: Omit<Run, 'runId' | 'scenarioId' | 'startedAt' | 'status'> & {
      scenarioId?: string;
    },
  ): Promise<Run>;

  /** End the run, optionally recording an outcome. */
  endRun(outcome?: RunOutcome): Promise<void>;

  /** Abort the run with a reason. */
  abortRun(reason: string): Promise<void>;

  private runHooks;

  /**
   * Open a span of the given kind and name. `parentSpanId` can be passed
   * explicitly to set the parent.
   */
  span<S extends Span = Span>(
    init: {
      kind: SpanKind;
      name: string;
      parentSpanId?: string;
      attributes?: Record<string, unknown>;
    } & Partial<Omit<S, 'spanId' | 'runId' | 'startedAt' | 'kind' | 'name'>>,
  ): Promise<SpanHandle<S>>;

  private handle;
  private pop;

  /** Open an LLM span and resolve to its handle. */
  llm(init: Omit<LlmSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<LlmSpan>>;

  /** Open a tool span and resolve to its handle. */
  tool(init: Omit<ToolSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<ToolSpan>>;

  /** Open a retrieval span and resolve to its handle. */
  retrieval(init: Omit<RetrievalSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<RetrievalSpan>>;

  /**
   * Record a judge verdict. Unlike the span openers, this resolves to the
   * `JudgeSpan` itself rather than a handle, and the input omits `endedAt`.
   */
  recordJudge(verdict: Omit<JudgeSpan, 'spanId' | 'runId' | 'kind' | 'startedAt' | 'endedAt'>): Promise<JudgeSpan>;

  /** Open a sandbox span and resolve to its handle. */
  sandbox(init: Omit<SandboxSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<SandboxSpan>>;

  /** Emit a trace event, optionally tied to a span and carrying a payload. */
  emit(event: {
    kind: EventKind;
    spanId?: string;
    payload?: Record<string, unknown>;
  }): Promise<TraceEvent>;

  /** Record a budget ledger entry; `timestamp` is optional on input. */
  recordBudget(
    entry: Omit<BudgetLedgerEntry, 'runId' | 'timestamp'> & {
      timestamp?: number;
    },
  ): Promise<BudgetLedgerEntry>;

  /** Record an artifact; `artifactId` and `runId` are not supplied by the caller. */
  recordArtifact(artifact: Omit<Artifact, 'artifactId' | 'runId'>): Promise<Artifact>;

  /**
   * Execute `fn` inside a span. The span is ended automatically when `fn`
   * succeeds and failed automatically when it throws. Resolves to whatever
   * `fn` returns. This covers the 95% case.
   */
  within<T>(
    init: Parameters<TraceEmitter['span']>[0],
    fn: (handle: SpanHandle) => Promise<T>,
  ): Promise<T>;
}
|
|
105
|
+
/**
 * Build the argument object for an LLM span from a provider-shaped response.
 *
 * @param args - Model name, messages and output text, plus optional span
 *   name, token usage, cost in USD and finish reason.
 * @returns An `LlmSpan` without `spanId`, `runId`, `kind` and `startedAt`,
 *   which is the parameter type of `TraceEmitter.llm`.
 */
declare function llmSpanFromProvider(
  args: {
    name?: string;
    model: string;
    messages: Message[];
    output: string;
    usage?: {
      inputTokens?: number;
      outputTokens?: number;
      cachedTokens?: number;
      reasoningTokens?: number;
    };
    costUsd?: number;
    finishReason?: string;
  },
): Omit<LlmSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>;
|
|
120
|
+
|
|
121
|
+
export { type RunCompleteHook as R, type SpanHandle as S, TraceEmitter as T, type RunCompleteHookContext as a, type TraceEmitterOptions as b, llmSpanFromProvider as l };
|