@tangle-network/agent-eval 0.20.10 → 0.20.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. package/README.md +129 -126
  2. package/dist/benchmarks/index.d.ts +2 -1
  3. package/dist/{chunk-JAOLXRIA.js → chunk-75MCTH7P.js} +8 -2
  4. package/dist/chunk-75MCTH7P.js.map +1 -0
  5. package/dist/chunk-HKYRWNHV.js +1354 -0
  6. package/dist/chunk-HKYRWNHV.js.map +1 -0
  7. package/dist/{chunk-LSR4IAYN.js → chunk-HNJLMAJ2.js} +2 -2
  8. package/dist/chunk-IKFVX537.js +717 -0
  9. package/dist/chunk-IKFVX537.js.map +1 -0
  10. package/dist/chunk-KWUAAIHR.js +1764 -0
  11. package/dist/chunk-KWUAAIHR.js.map +1 -0
  12. package/dist/chunk-MCMV7DUL.js +1310 -0
  13. package/dist/chunk-MCMV7DUL.js.map +1 -0
  14. package/dist/chunk-ODFINDLQ.js +413 -0
  15. package/dist/chunk-ODFINDLQ.js.map +1 -0
  16. package/dist/chunk-PKCVBYTQ.js +200 -0
  17. package/dist/chunk-PKCVBYTQ.js.map +1 -0
  18. package/dist/chunk-YUFXO3TU.js +148 -0
  19. package/dist/chunk-YUFXO3TU.js.map +1 -0
  20. package/dist/cli.js +2 -2
  21. package/dist/control-C8NKbF3w.d.ts +258 -0
  22. package/dist/control.d.ts +5 -0
  23. package/dist/control.js +30 -0
  24. package/dist/control.js.map +1 -0
  25. package/dist/dataset-B9qvlm_o.d.ts +112 -0
  26. package/dist/emitter-BYO2nSDA.d.ts +387 -0
  27. package/dist/feedback-trajectory-BGQ_ANCN.d.ts +345 -0
  28. package/dist/{index-1PZOtZFr.d.ts → index-c5saLbKD.d.ts} +2 -133
  29. package/dist/index.d.ts +115 -2870
  30. package/dist/index.js +1049 -6156
  31. package/dist/index.js.map +1 -1
  32. package/dist/multi-shot-optimization-Bvtz294B.d.ts +598 -0
  33. package/dist/openapi.json +1 -1
  34. package/dist/optimization.d.ts +145 -0
  35. package/dist/optimization.js +60 -0
  36. package/dist/optimization.js.map +1 -0
  37. package/dist/reporting.d.ts +426 -0
  38. package/dist/reporting.js +32 -0
  39. package/dist/reporting.js.map +1 -0
  40. package/dist/run-record-CX_jcAyr.d.ts +134 -0
  41. package/dist/traces.d.ts +658 -0
  42. package/dist/traces.js +100 -0
  43. package/dist/traces.js.map +1 -0
  44. package/dist/wire/index.js +2 -2
  45. package/docs/concepts.md +16 -11
  46. package/docs/feature-guide.md +10 -17
  47. package/docs/integration-launch-gates.md +77 -0
  48. package/docs/product-eval-adoption.md +221 -0
  49. package/docs/trace-analysis.md +75 -0
  50. package/package.json +21 -1
  51. package/dist/chunk-JAOLXRIA.js.map +0 -1
  52. package/dist/{chunk-LSR4IAYN.js.map → chunk-HNJLMAJ2.js.map} +0 -0
@@ -0,0 +1,134 @@
1
+ /**
2
+ * Paper-grade RunRecord schema + runtime validator.
3
+ *
4
+ * Every run that participates in a promotion gate, paper table, or
5
+ * researcher loop SHOULD be recorded as a `RunRecord`. The mandatory
6
+ * fields are exactly those the paper "Two Loops, Three Roles" requires
7
+ * for reproducibility: who/what/when/cost/seed/hash, plus the search vs
8
+ * holdout split tag and either a `searchScore` or a `holdoutScore`.
9
+ *
10
+ * This is intentionally NOT a replacement for the rich `Run` /
11
+ * `ProposeReviewReport` / `ScenarioResult` types already in the
12
+ * package. Those are runtime structures with full provenance. A
13
+ * `RunRecord` is the analysis-time projection — the JSON-friendly
14
+ * row you'd put in a parquet file or paste into a notebook.
15
+ *
16
+ * Validate at the boundary:
17
+ *
18
+ *   const rec = validateRunRecord(rawJson)   // throws on a missing or wrongly-typed field
19
+ * const ok = isRunRecord(rawJson) // boolean check
20
+ *   const result = parseRunRecordSafe(rawJson) // { ok: true, value } | { ok: false, error }
21
+ *
22
+ * The validator runs in pure TS — zod is intentionally NOT a
23
+ * dependency. Round-trip tested in `tests/run-record.test.ts`.
24
+ */
25
+ /** Search/dev/holdout split tag. 'search' is the paper-grade alias for the
26
+ * combined train+test pool that the optimizer is allowed to read. */
27
+ type RunSplitTag = 'search' | 'dev' | 'holdout';
28
+ interface RunTokenUsage {
29
+ input: number;
30
+ output: number;
31
+ cached?: number;
32
+ }
33
+ interface RunJudgeMetadata {
34
+ model: string;
35
+ promptVersion: string;
36
+ /** [0,1] confidence the judge declared. Constant judge confidence
37
+ * across many runs is a fallback signal (see `canary.ts`). */
38
+ confidence: number;
39
+ /** True if the judge degraded to a fallback path (rules-only,
40
+ * prior-call cache, etc.). The canary uses this to alert. */
41
+ fallback: boolean;
42
+ }
43
+ interface RunOutcome {
44
+ /** Score on the search/optimization split. Optional because a
45
+ * holdout-only evaluation only fills `holdoutScore`. */
46
+ searchScore?: number;
47
+ /** Score on the held-out split. Optional because a search-only run
48
+ * only fills `searchScore`. At least one must be present. */
49
+ holdoutScore?: number;
50
+ /** Bag of any other metric the run produced — judge dimensions,
51
+ * pass/fail counters, latency stats, etc. Numeric only — keeps
52
+ * reporters honest. */
53
+ raw: Record<string, number>;
54
+ }
55
+ /**
56
+ * Mandatory paper-grade fields for a single evaluation run. Optional
57
+ * fields are extension points; `validateRunRecord` throws if a mandatory field is missing.
58
+ *
59
+ * Hash discipline:
60
+ * - `promptHash` is the sha256 of the EFFECTIVE prompt sent to the
61
+ * model (after any steering bundle merge).
62
+ * - `configHash` is the sha256 of the effective run config (model,
63
+ * temperature, tools, judges, splits). The pair (promptHash,
64
+ * configHash) uniquely identifies an experimental cell.
65
+ *
66
+ * Model snapshot discipline:
67
+ * - `model` MUST encode a snapshot version. Bare aliases like
68
+ * `claude-sonnet-4` or `gpt-4o` are banned — they remap silently.
69
+ * Use `claude-sonnet-4-6@2025-04-15` or `gpt-4o-2024-11-20`.
70
+ */
71
+ interface RunRecord {
72
+ /** UUID for the run. */
73
+ runId: string;
74
+ /** Logical experiment grouping (a treatment vs a baseline within
75
+ * the same sweep should share `experimentId`). */
76
+ experimentId: string;
77
+ /** Stable identifier for the candidate (variant) being run. The
78
+ * promotion gate compares two `candidateId`s on matched items. */
79
+ candidateId: string;
80
+ /** RNG seed for the run. Always recorded — silent re-seeding is
81
+ * the most common cause of non-reproducible numbers. */
82
+ seed: number;
83
+ /** Model identifier WITH snapshot version. */
84
+ model: string;
85
+ /** sha256 of the effective prompt (post-steering). */
86
+ promptHash: string;
87
+ /** sha256 of the effective config. */
88
+ configHash: string;
89
+ /** Git SHA the harness was run from. */
90
+ commitSha: string;
91
+ /** End-to-end wall-clock duration in milliseconds. */
92
+ wallMs: number;
93
+ /** Time spent queued before execution started, if known. */
94
+ queueMs?: number;
95
+ /** Total USD cost. Mandatory — runs without a cost number are
96
+ * unbounded by definition and must not be admitted into the gate. */
97
+ costUsd: number;
98
+ /** Token usage breakdown. */
99
+ tokenUsage: RunTokenUsage;
100
+ /** Judge-side metadata, if a judge was used. */
101
+ judgeMetadata?: RunJudgeMetadata;
102
+ /** Per-split scores + raw bag. */
103
+ outcome: RunOutcome;
104
+ /** Categorical failure tag, when the run failed and the harness
105
+ * classified it. Free-form string; standard tags live in
106
+ * `failure-taxonomy.ts`. */
107
+ failureMode?: string;
108
+ /** Which split this run was drawn from. */
109
+ splitTag: RunSplitTag;
110
+ }
111
+ declare class RunRecordValidationError extends Error {
112
+ readonly path: string;
113
+ constructor(message: string, path?: string);
114
+ }
115
+ /**
116
+ * Strict validator. Throws `RunRecordValidationError` on the first
117
+ * missing or wrongly-typed field. Returns the input cast to
118
+ * `RunRecord` on success — the validator does not coerce.
119
+ */
120
+ declare function validateRunRecord(input: unknown): RunRecord;
121
+ /** Boolean validator — convenience for filtering arrays. */
122
+ declare function isRunRecord(input: unknown): input is RunRecord;
123
+ /** Non-throwing validator — returns a discriminated union. */
124
+ declare function parseRunRecordSafe(input: unknown): {
125
+ ok: true;
126
+ value: RunRecord;
127
+ } | {
128
+ ok: false;
129
+ error: RunRecordValidationError;
130
+ };
131
+ /** Round-trip helper — `JSON.parse(JSON.stringify(record))` then validate. */
132
+ declare function roundTripRunRecord(record: RunRecord): RunRecord;
133
+
134
+ export { type RunSplitTag as R, type RunRecord as a, type RunJudgeMetadata as b, type RunOutcome as c, RunRecordValidationError as d, type RunTokenUsage as e, isRunRecord as i, parseRunRecordSafe as p, roundTripRunRecord as r, validateRunRecord as v };