@tangle-network/agent-eval 0.20.11 → 0.20.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +99 -170
- package/dist/benchmarks/index.d.ts +2 -1
- package/dist/{chunk-JAOLXRIA.js → chunk-75MCTH7P.js} +8 -2
- package/dist/chunk-75MCTH7P.js.map +1 -0
- package/dist/chunk-HKYRWNHV.js +1354 -0
- package/dist/chunk-HKYRWNHV.js.map +1 -0
- package/dist/{chunk-LSR4IAYN.js → chunk-HNJLMAJ2.js} +2 -2
- package/dist/chunk-IKFVX537.js +717 -0
- package/dist/chunk-IKFVX537.js.map +1 -0
- package/dist/chunk-KWUAAIHR.js +1764 -0
- package/dist/chunk-KWUAAIHR.js.map +1 -0
- package/dist/chunk-MCMV7DUL.js +1310 -0
- package/dist/chunk-MCMV7DUL.js.map +1 -0
- package/dist/chunk-ODFINDLQ.js +413 -0
- package/dist/chunk-ODFINDLQ.js.map +1 -0
- package/dist/chunk-PKCVBYTQ.js +200 -0
- package/dist/chunk-PKCVBYTQ.js.map +1 -0
- package/dist/chunk-YUFXO3TU.js +148 -0
- package/dist/chunk-YUFXO3TU.js.map +1 -0
- package/dist/cli.js +2 -2
- package/dist/control-C8NKbF3w.d.ts +258 -0
- package/dist/control.d.ts +5 -0
- package/dist/control.js +30 -0
- package/dist/control.js.map +1 -0
- package/dist/dataset-B9qvlm_o.d.ts +112 -0
- package/dist/emitter-BYO2nSDA.d.ts +387 -0
- package/dist/feedback-trajectory-BGQ_ANCN.d.ts +345 -0
- package/dist/{index-1PZOtZFr.d.ts → index-c5saLbKD.d.ts} +2 -133
- package/dist/index.d.ts +115 -2870
- package/dist/index.js +1049 -6156
- package/dist/index.js.map +1 -1
- package/dist/multi-shot-optimization-Bvtz294B.d.ts +598 -0
- package/dist/openapi.json +1 -1
- package/dist/optimization.d.ts +145 -0
- package/dist/optimization.js +60 -0
- package/dist/optimization.js.map +1 -0
- package/dist/reporting.d.ts +426 -0
- package/dist/reporting.js +32 -0
- package/dist/reporting.js.map +1 -0
- package/dist/run-record-CX_jcAyr.d.ts +134 -0
- package/dist/traces.d.ts +658 -0
- package/dist/traces.js +100 -0
- package/dist/traces.js.map +1 -0
- package/dist/wire/index.js +2 -2
- package/docs/concepts.md +16 -11
- package/docs/feature-guide.md +10 -17
- package/docs/integration-launch-gates.md +77 -0
- package/docs/product-eval-adoption.md +27 -0
- package/docs/trace-analysis.md +75 -0
- package/package.json +21 -1
- package/dist/chunk-JAOLXRIA.js.map +0 -1
- /package/dist/{chunk-LSR4IAYN.js.map → chunk-HNJLMAJ2.js.map} +0 -0
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
/**
 * Paper-grade RunRecord schema + runtime validator.
 *
 * Every run that participates in a promotion gate, paper table, or
 * researcher loop SHOULD be recorded as a `RunRecord`. The mandatory
 * fields are exactly those the paper "Two Loops, Three Roles" requires
 * for reproducibility: who/what/when/cost/seed/hash, plus the search vs
 * holdout split tag and at least one of `searchScore` / `holdoutScore`.
 *
 * This is intentionally NOT a replacement for the rich `Run` /
 * `ProposeReviewReport` / `ScenarioResult` types already in the
 * package. Those are runtime structures with full provenance. A
 * `RunRecord` is the analysis-time projection — the JSON-friendly
 * row you'd put in a parquet file or paste into a notebook.
 *
 * Validate at the boundary:
 *
 *   const rec = validateRunRecord(rawJson)     // throws on a missing or wrongly-typed field
 *   const ok = isRunRecord(rawJson)            // boolean check
 *   const res = parseRunRecordSafe(rawJson)    // { ok, value | error }
 *
 * The validator runs in pure TS — zod is intentionally NOT a
 * dependency. Round-trip tested in `tests/run-record.test.ts`.
 */
|
|
25
|
+
/**
 * Search/dev/holdout split tag.
 *
 * 'search' is the paper-grade alias for the combined train+test pool that
 * the optimizer is allowed to read.
 */
type RunSplitTag = 'search' | 'dev' | 'holdout';
|
|
28
|
+
/**
 * Token usage breakdown for a run (the shape of `RunRecord.tokenUsage`).
 */
interface RunTokenUsage {
  /** Input token count. NOTE(review): presumably prompt-side tokens — confirm. */
  input: number;
  /** Output token count. NOTE(review): presumably completion-side tokens — confirm. */
  output: number;
  /**
   * Cached token count, when reported.
   * NOTE(review): whether this is a subset of `input` is not stated here — confirm.
   */
  cached?: number;
}
|
|
33
|
+
/**
 * Judge-side metadata, recorded on a `RunRecord` when a judge was used.
 */
interface RunJudgeMetadata {
  /** Model identifier (presumably the judge's model — confirm against the producer). */
  model: string;
  /** Version label of the prompt (presumably the judge prompt — confirm). */
  promptVersion: string;
  /**
   * [0,1] confidence the judge declared. Constant judge confidence
   * across many runs is a fallback signal (see `canary.ts`).
   */
  confidence: number;
  /**
   * True if the judge degraded to a fallback path (rules-only,
   * prior-call cache, etc.). The canary uses this to alert.
   */
  fallback: boolean;
}
|
|
43
|
+
/**
 * Per-split scores plus a bag of raw numeric metrics for one run.
 *
 * At least one of `searchScore` / `holdoutScore` must be present. Both
 * fields are optional at the type level, so this interface alone does not
 * enforce that rule.
 */
interface RunOutcome {
  /**
   * Score on the search/optimization split. Optional because a
   * holdout-only evaluation only fills `holdoutScore`.
   */
  searchScore?: number;
  /**
   * Score on the held-out split. Optional because a search-only run
   * only fills `searchScore`. At least one must be present.
   */
  holdoutScore?: number;
  /**
   * Bag of any other metric the run produced — judge dimensions,
   * pass/fail counters, latency stats, etc. Numeric only — keeps
   * reporters honest.
   */
  raw: Record<string, number>;
}
|
|
55
|
+
/**
 * Mandatory paper-grade fields for a single evaluation run. Optional
 * fields are extension points; mandatory fields throw if missing.
 *
 * Hash discipline:
 *   - `promptHash` is the sha256 of the EFFECTIVE prompt sent to the
 *     model (after any steering bundle merge).
 *   - `configHash` is the sha256 of the effective run config (model,
 *     temperature, tools, judges, splits). The pair (promptHash,
 *     configHash) uniquely identifies an experimental cell.
 *
 * Model snapshot discipline:
 *   - `model` MUST encode a snapshot version. Bare aliases like
 *     `claude-sonnet-4` or `gpt-4o` are banned — they remap silently.
 *     Use `claude-sonnet-4-6@2025-04-15` or `gpt-4o-2024-11-20`.
 */
interface RunRecord {
  /** UUID for the run. */
  runId: string;
  /**
   * Logical experiment grouping (a treatment vs a baseline within
   * the same sweep should share `experimentId`).
   */
  experimentId: string;
  /**
   * Stable identifier for the candidate (variant) being run. The
   * promotion gate compares two `candidateId`s on matched items.
   */
  candidateId: string;
  /**
   * RNG seed for the run. Always recorded — silent re-seeding is
   * the most common cause of non-reproducible numbers.
   */
  seed: number;
  /** Model identifier WITH snapshot version. */
  model: string;
  /** sha256 of the effective prompt (post-steering). */
  promptHash: string;
  /** sha256 of the effective config. */
  configHash: string;
  /** Git SHA the harness was run from. */
  commitSha: string;
  /** End-to-end wall-clock duration in milliseconds. */
  wallMs: number;
  /** Time spent queued before execution started, if known. */
  queueMs?: number;
  /**
   * Total USD cost. Mandatory — runs without a cost number are
   * unbounded by definition and must not be admitted into the gate.
   */
  costUsd: number;
  /** Token usage breakdown. */
  tokenUsage: RunTokenUsage;
  /** Judge-side metadata, if a judge was used. */
  judgeMetadata?: RunJudgeMetadata;
  /** Per-split scores + raw bag. */
  outcome: RunOutcome;
  /**
   * Categorical failure tag, when the run failed and the harness
   * classified it. Free-form string; standard tags live in
   * `failure-taxonomy.ts`.
   */
  failureMode?: string;
  /** Which split this run was drawn from. */
  splitTag: RunSplitTag;
}
|
|
111
|
+
/**
 * Error type carried by the run-record validators (see the `error` arm of
 * `parseRunRecordSafe`'s result).
 */
declare class RunRecordValidationError extends Error {
  /**
   * NOTE(review): presumably the location of the offending field inside the
   * validated input (e.g. a dotted field path) — confirm against the
   * implementation.
   */
  readonly path: string;
  /**
   * @param message Human-readable description of the validation failure.
   * @param path Optional; the value used when omitted is not visible from
   *   this declaration.
   */
  constructor(message: string, path?: string);
}
|
|
115
|
+
/**
 * Strict validator.
 *
 * The validator does not coerce: on success the input itself is returned,
 * cast to `RunRecord`.
 *
 * @param input Untrusted value to check (e.g. parsed JSON).
 * @returns The same `input`, typed as `RunRecord`.
 * @throws RunRecordValidationError on the first missing or wrongly-typed field.
 */
declare function validateRunRecord(input: unknown): RunRecord;
|
|
121
|
+
/**
 * Boolean validator — convenience for filtering arrays.
 *
 * Declared as a type predicate, so a `true` result narrows `input` to
 * `RunRecord` at the call site.
 *
 * @param input Untrusted value to check.
 */
declare function isRunRecord(input: unknown): input is RunRecord;
|
|
123
|
+
/**
 * Non-throwing validator — returns a discriminated union.
 *
 * Branch on `ok`: the `true` arm carries the validated record as `value`,
 * the `false` arm carries the failure as `error`.
 *
 * @param input Untrusted value to check.
 */
declare function parseRunRecordSafe(input: unknown):
  | { ok: true; value: RunRecord }
  | { ok: false; error: RunRecordValidationError };
|
|
131
|
+
/**
 * Round-trip helper — `JSON.parse(JSON.stringify(record))` then validate.
 *
 * @param record Record to serialize and re-validate.
 * @returns The record obtained after the JSON round trip.
 *   NOTE(review): presumably throws `RunRecordValidationError` if the
 *   round-tripped value no longer validates — confirm against the implementation.
 */
declare function roundTripRunRecord(record: RunRecord): RunRecord;
|
|
133
|
+
|
|
134
|
+
export { type RunSplitTag as R, type RunRecord as a, type RunJudgeMetadata as b, type RunOutcome as c, RunRecordValidationError as d, type RunTokenUsage as e, isRunRecord as i, parseRunRecordSafe as p, roundTripRunRecord as r, validateRunRecord as v };
|