@tangle-network/agent-eval 0.20.11 → 0.21.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. package/CHANGELOG.md +76 -0
  2. package/README.md +137 -170
  3. package/dist/benchmarks/index.d.ts +2 -1
  4. package/dist/{chunk-JAOLXRIA.js → chunk-3GN6U53I.js} +205 -4
  5. package/dist/chunk-3GN6U53I.js.map +1 -0
  6. package/dist/chunk-3IX6QTB7.js +1349 -0
  7. package/dist/chunk-3IX6QTB7.js.map +1 -0
  8. package/dist/chunk-5IIQKMD5.js +236 -0
  9. package/dist/chunk-5IIQKMD5.js.map +1 -0
  10. package/dist/chunk-ARZ6BEV6.js +1310 -0
  11. package/dist/chunk-ARZ6BEV6.js.map +1 -0
  12. package/dist/chunk-HRZELXCR.js +1354 -0
  13. package/dist/chunk-HRZELXCR.js.map +1 -0
  14. package/dist/chunk-KRR4VMH7.js +423 -0
  15. package/dist/chunk-KRR4VMH7.js.map +1 -0
  16. package/dist/chunk-SNUHRBDL.js +154 -0
  17. package/dist/chunk-SNUHRBDL.js.map +1 -0
  18. package/dist/chunk-WOK2RTWG.js +1920 -0
  19. package/dist/chunk-WOK2RTWG.js.map +1 -0
  20. package/dist/{chunk-LSR4IAYN.js → chunk-WOPGKVN4.js} +2 -2
  21. package/dist/chunk-YUFXO3TU.js +148 -0
  22. package/dist/chunk-YUFXO3TU.js.map +1 -0
  23. package/dist/cli.js +3 -2
  24. package/dist/cli.js.map +1 -1
  25. package/dist/control-cxwMOAsy.d.ts +259 -0
  26. package/dist/control.d.ts +6 -0
  27. package/dist/control.js +30 -0
  28. package/dist/control.js.map +1 -0
  29. package/dist/dataset-B9qvlm_o.d.ts +112 -0
  30. package/dist/emitter-B2XqDKFU.d.ts +121 -0
  31. package/dist/feedback-trajectory-CB0A32o3.d.ts +346 -0
  32. package/dist/{index-1PZOtZFr.d.ts → index-c5saLbKD.d.ts} +2 -133
  33. package/dist/index.d.ts +178 -2945
  34. package/dist/index.js +1066 -6185
  35. package/dist/index.js.map +1 -1
  36. package/dist/multi-shot-optimization-Bvtz294B.d.ts +598 -0
  37. package/dist/openapi.json +1 -1
  38. package/dist/optimization.d.ts +146 -0
  39. package/dist/optimization.js +60 -0
  40. package/dist/optimization.js.map +1 -0
  41. package/dist/reporting-Da2ihlcM.d.ts +672 -0
  42. package/dist/reporting.d.ts +5 -0
  43. package/dist/reporting.js +36 -0
  44. package/dist/reporting.js.map +1 -0
  45. package/dist/run-record-CX_jcAyr.d.ts +134 -0
  46. package/dist/store-u47QaJ9G.d.ts +297 -0
  47. package/dist/traces.d.ts +914 -0
  48. package/dist/traces.js +120 -0
  49. package/dist/traces.js.map +1 -0
  50. package/dist/wire/index.js +3 -2
  51. package/docs/concepts.md +16 -11
  52. package/docs/feature-guide.md +10 -17
  53. package/docs/integration-launch-gates.md +77 -0
  54. package/docs/product-eval-adoption.md +27 -0
  55. package/docs/research-report-methodology.md +155 -0
  56. package/docs/trace-analysis.md +75 -0
  57. package/package.json +30 -12
  58. package/dist/chunk-JAOLXRIA.js.map +0 -1
  59. /package/dist/{chunk-LSR4IAYN.js.map → chunk-WOPGKVN4.js.map} +0 -0
package/dist/traces.js ADDED
@@ -0,0 +1,120 @@
1
+ import {
2
+ DEFAULT_REDACTION_RULES,
3
+ DEFAULT_TRACE_ANALYST_BUDGETS,
4
+ FAILURE_CLASSES,
5
+ FileSystemTraceStore,
6
+ InMemoryTraceStore,
7
+ OTEL_AGENT_EVAL_SCOPE,
8
+ OtlpFileTraceStore,
9
+ REDACTION_VERSION,
10
+ RunIntegrityError,
11
+ SpanNotFoundError,
12
+ TRACE_ANALYST_ACTOR_DESCRIPTION,
13
+ TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION,
14
+ TRACE_ANALYST_SUBAGENT_DESCRIPTION,
15
+ TRACE_ANALYST_TRUNCATION_MARKER_PREFIX,
16
+ TRACE_SCHEMA_VERSION,
17
+ TraceFileMissingError,
18
+ TraceNotFoundError,
19
+ aggregateLlm,
20
+ analyzeTraces,
21
+ argHash,
22
+ assertRunCaptured,
23
+ buildTraceAnalystTools,
24
+ buildTraceInsightContext,
25
+ buildTraceInsightPrompt,
26
+ defaultTraceInsightPanel,
27
+ describeTraceInsightScope,
28
+ domainEvidencePattern,
29
+ exportRunAsOtlp,
30
+ groupBy,
31
+ inferDomainKeywords,
32
+ isJudgeSpan,
33
+ isLlmSpan,
34
+ isRetrievalSpan,
35
+ isSandboxSpan,
36
+ isToolSpan,
37
+ judgeSpans,
38
+ llmSpans,
39
+ planTraceInsightQuestions,
40
+ redactString,
41
+ redactValue,
42
+ runFailureClass,
43
+ runsForScenario,
44
+ scoreTraceInsightReadiness,
45
+ throwIfRunIncomplete,
46
+ tokenizeDomainWords,
47
+ toolSpans,
48
+ traceAnalystFunctionGroup,
49
+ traceAnalystOnRunComplete
50
+ } from "./chunk-WOK2RTWG.js";
51
+ import {
52
+ TraceEmitter,
53
+ llmSpanFromProvider
54
+ } from "./chunk-5IIQKMD5.js";
55
+ import {
56
+ FileSystemRawProviderSink,
57
+ InMemoryRawProviderSink,
58
+ NoopRawProviderSink,
59
+ defaultProviderRedactor,
60
+ providerFromBaseUrl
61
+ } from "./chunk-SNUHRBDL.js";
62
+ import "./chunk-PZ5AY32C.js";
63
+ export {
64
+ DEFAULT_REDACTION_RULES,
65
+ DEFAULT_TRACE_ANALYST_BUDGETS,
66
+ FAILURE_CLASSES,
67
+ FileSystemRawProviderSink,
68
+ FileSystemTraceStore,
69
+ InMemoryRawProviderSink,
70
+ InMemoryTraceStore,
71
+ NoopRawProviderSink,
72
+ OTEL_AGENT_EVAL_SCOPE,
73
+ OtlpFileTraceStore,
74
+ REDACTION_VERSION,
75
+ RunIntegrityError,
76
+ SpanNotFoundError,
77
+ TRACE_ANALYST_ACTOR_DESCRIPTION,
78
+ TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION,
79
+ TRACE_ANALYST_SUBAGENT_DESCRIPTION,
80
+ TRACE_ANALYST_TRUNCATION_MARKER_PREFIX,
81
+ TRACE_SCHEMA_VERSION,
82
+ TraceEmitter,
83
+ TraceFileMissingError,
84
+ TraceNotFoundError,
85
+ aggregateLlm,
86
+ analyzeTraces,
87
+ argHash,
88
+ assertRunCaptured,
89
+ buildTraceAnalystTools,
90
+ buildTraceInsightContext,
91
+ buildTraceInsightPrompt,
92
+ defaultProviderRedactor,
93
+ defaultTraceInsightPanel,
94
+ describeTraceInsightScope,
95
+ domainEvidencePattern,
96
+ exportRunAsOtlp,
97
+ groupBy,
98
+ inferDomainKeywords,
99
+ isJudgeSpan,
100
+ isLlmSpan,
101
+ isRetrievalSpan,
102
+ isSandboxSpan,
103
+ isToolSpan,
104
+ judgeSpans,
105
+ llmSpanFromProvider,
106
+ llmSpans,
107
+ planTraceInsightQuestions,
108
+ providerFromBaseUrl,
109
+ redactString,
110
+ redactValue,
111
+ runFailureClass,
112
+ runsForScenario,
113
+ scoreTraceInsightReadiness,
114
+ throwIfRunIncomplete,
115
+ tokenizeDomainWords,
116
+ toolSpans,
117
+ traceAnalystFunctionGroup,
118
+ traceAnalystOnRunComplete
119
+ };
120
+ //# sourceMappingURL=traces.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":[],"sourcesContent":[],"mappings":"","names":[]}
@@ -24,8 +24,9 @@ import {
24
24
  runRpcBatch,
25
25
  runRpcOnce,
26
26
  startServer
27
- } from "../chunk-LSR4IAYN.js";
28
- import "../chunk-JAOLXRIA.js";
27
+ } from "../chunk-WOPGKVN4.js";
28
+ import "../chunk-3GN6U53I.js";
29
+ import "../chunk-SNUHRBDL.js";
29
30
  import "../chunk-PZ5AY32C.js";
30
31
  export {
31
32
  BUILTIN_RUBRICS,
package/docs/concepts.md CHANGED
@@ -1,14 +1,15 @@
1
1
  # Concepts
2
2
 
3
- Read this once and the rest of agent-eval makes sense.
3
+ `agent-eval` is for deciding whether an agent run should pass, keep working, be
4
+ replayed, be optimized, or be promoted.
4
5
 
5
- ## What is agent-eval?
6
+ It exists because agent output is not evidence. A model can say a task is done
7
+ while the build fails, the browser flow is broken, the integration was never
8
+ connected, or the answer lacks required sources. The package gives products a
9
+ shared way to record runs, check outcomes, classify failures, compare variants,
10
+ and make release decisions.
6
11
 
7
- A library for **deciding whether a code generator or content generator did its job.** You give it a thing the generator produced (a scaffold, a patch, a tweet, a JSON config), and you get back a structured verdict: pass/fail, dimension scores, a reason in plain English.
8
-
9
- It exists because LLMs lie about whether they succeeded. A model will say "Done!" and ship code that doesn't compile. agent-eval is the layer between the model's output and your decision to ship.
10
-
11
- ## The three things you'll touch most
12
+ ## Main Objects
12
13
 
13
14
  | Thing | What it is | One-line example |
14
15
  |---|---|---|
@@ -17,7 +18,8 @@ It exists because LLMs lie about whether they succeeded. A model will say "Done!
17
18
  | **Verifier** | A pipeline of judges run in order, with dependencies. | "install → typecheck → build → semantic" |
18
19
  | **Feedback trajectory** | A multi-shot record of attempts, approvals, rejections, edits, metrics, and policy outcomes. | "draft → user rejects → revised draft → approved → measured" |
19
20
 
20
- That's the whole framework. Everything else (sessions, traces, layers) is plumbing around those three.
21
+ Everything else exists to make those objects useful in real product loops:
22
+ traces, datasets, control runtime, optimizers, statistics, and reports.
21
23
 
22
24
  When the thing being evaluated is an agent that should keep working, use
23
25
  [`runAgentControlLoop`](./control-runtime.md). It turns validators into a
@@ -62,7 +64,7 @@ shape stays the same.
62
64
  Those trajectories can be converted into preference memory, `DatasetScenario`
63
65
  rows, optimizer rows, and held-out examples for overfit checks.
64
66
 
65
- ## The three-layer eval (for code generators)
67
+ ## Code Generator Eval
66
68
 
67
69
  When the artifact is generated code, agent-eval scores it at three independent layers. Each layer fails differently, and you want to know which one broke:
68
70
 
@@ -125,7 +127,7 @@ Two rules that will save you bugs:
125
127
 
126
128
  2. **Pair LLM judges with build outcomes.** An LLM judge will rate non-compiling code as "looks right" (0.8). Always short-circuit on `buildOutcome.passed === false` before any LLM judging.
127
129
 
128
- ## The trace model (skip on first read)
130
+ ## Trace Model
129
131
 
130
132
  Every operation emits structured spans into a `TraceStore`. A run is a tree:
131
133
 
@@ -142,7 +144,10 @@ builder-session [span]
142
144
 
143
145
  Spans are append-only and have stable ids — replay is reading the same store back. OTLP export ships them out for distributed tracing.
144
146
 
145
- You don't need to build the trace tree by hand. `BuilderSession` does it for you. Look at the trace store when you're debugging a flaky run; ignore it otherwise.
147
+ You usually should not build this tree by hand. Product runtimes,
148
+ `runAgentControlLoop`, harnesses, and verifiers should emit it while they run.
149
+ Use traces when debugging a flaky run, building replay data, or explaining a
150
+ release decision.
146
151
 
147
152
  ## Where to go next
148
153
 
@@ -3,7 +3,7 @@
3
3
  This page explains the main `agent-eval` primitives in plain English first,
4
4
  then shows when to use each one.
5
5
 
6
- ## ELI5
6
+ ## Overview
7
7
 
8
8
  LLM agents can write code, drafts, research, plans, and actions. The hard part
9
9
  is knowing whether they actually did a good job, whether they should keep
@@ -41,7 +41,7 @@ trying, and whether a change made them better or worse.
41
41
 
42
42
  ## Integration Patterns
43
43
 
44
- ### Recommended Agent Product Shape
44
+ ### Recommended Product Shape
45
45
 
46
46
  Use this shape when the product needs to keep pushing work forward instead of
47
47
  only answering once:
@@ -175,21 +175,6 @@ Store as `FeedbackTrajectory`, then derive:
175
175
  logs, screenshots, or browser state. Use separate sandboxes for parallel
176
176
  variants or destructive checks.
177
177
 
178
- ## Same-Sandbox Example
179
-
180
- `examples/same-sandbox-harness/` shows the common coding/browser pattern:
181
-
182
- ```text
183
- one sandbox/workdir -> install/build/test -> inspect evidence -> emit judge span
184
- ```
185
-
186
- Use this when a judge needs evidence produced by earlier harness phases. Use
187
- isolated sandboxes when variants run in parallel or a phase can corrupt the
188
- workspace.
189
- - Treat telemetry as evidence, not control flow. A trace sink outage should be
190
- visible in `runtimeErrors`, but it should not stop the worker from completing
191
- the user task.
192
-
193
178
  ## Highest-ROI Adoption Order
194
179
 
195
180
  1. Wrap one real product workflow in `runAgentControlLoop`.
@@ -211,3 +196,11 @@ reusable:
211
196
 
212
197
  Core should provide shapes, stores, runners, scoring, traces, and converters.
213
198
  Downstream integrations provide domain state, policy, tools, and storage.
199
+
200
+ ## Examples
201
+
202
+ - `examples/same-sandbox-harness`: one workdir for install/build/test plus
203
+ evidence inspection.
204
+ - `examples/multi-shot-optimization`: full-trajectory optimization with a
205
+ holdout gate.
206
+ - `examples/benchmarks`: benchmark adapter contracts and reference wrappers.
@@ -0,0 +1,77 @@
1
+ # Integration Launch Gates
2
+
3
+ Use these gates when a product lets generated apps or agents use user-owned
4
+ connections through an integration hub.
5
+
6
+ The eval should wrap the real product path:
7
+
8
+ ```txt
9
+ user prompt
10
+ -> product emits IntegrationManifest
11
+ -> platform resolves connections and grants
12
+ -> sandbox receives capability bundle
13
+ -> generated app invokes integration action
14
+ -> platform enforces policy, approval, idempotency, audit
15
+ ```
16
+
17
+ ## Deterministic Gates
18
+
19
+ - The generated app declares an integration manifest before sandbox launch.
20
+ - Manifest validation passes.
21
+ - Required connections and scopes are present before execution.
22
+ - Sandbox environment contains a capability bundle, not raw provider tokens.
23
+ - Reads invoke through the platform bridge.
24
+ - Writes return `approval_required` unless product policy explicitly allows
25
+ them.
26
+ - Approved writes are bound to the same action, input hash, connection, and
27
+ subject.
28
+ - Revoked grants or expired capabilities stop invocation.
29
+ - Resumed or long-running sandboxes receive a refreshed bundle before expiry.
30
+ - Audit includes grant creation, capability issue, invoke success/failure,
31
+ approval resolution, and revoke events.
32
+
33
+ ## Failure Classes
34
+
35
+ `agent-eval` classifies integration failures separately from prompt/tool
36
+ failures:
37
+
38
+ - `bad_integration_manifest`
39
+ - `missing_integration_connection`
40
+ - `missing_integration_scope`
41
+ - `integration_approval_required`
42
+ - `integration_auth_expired`
43
+ - `integration_provider_failure`
44
+ - `unsafe_integration_write_denied`
45
+
46
+ Use the helper payload builders and eval builders so products emit the same
47
+ trace evidence:
48
+
49
+ ```ts
50
+ const gate = {
51
+ connectorId: 'google-calendar',
52
+ actionId: 'events.create',
53
+ valid: true,
54
+ missingConnections: [],
55
+ missingScopes: ['calendar.events.write'],
56
+ }
57
+
58
+ const evals = integrationGateEvals(gate)
59
+
60
+ await emitter.emit({
61
+ kind: 'custom',
62
+ payload: integrationManifestResolvedPayload(gate),
63
+ })
64
+
65
+ await emitter.emit({
66
+ kind: 'custom',
67
+ payload: integrationInvokeFailedPayload({
68
+ connectorId: 'google-calendar',
69
+ actionId: 'events.create',
70
+ code: 'scope_denied',
71
+ message: 'calendar.events.write was not granted',
72
+ }),
73
+ })
74
+ ```
75
+
76
+ The classifier then reports the real missing surface instead of burying the
77
+ failure under `tool_recovery_failure` or `unknown`.
@@ -98,6 +98,23 @@ replayable eval data.
98
98
  production run -> feedback trajectory -> dataset scenario -> optimizer row
99
99
  ```
100
100
 
101
+ For promotion-grade runs, also project the completed control result into a
102
+ strict `RunRecord`:
103
+
104
+ ```ts
105
+ const record = controlRunToRunRecord(controlResult, {
106
+ experimentId,
107
+ candidateId,
108
+ seed,
109
+ model: 'gpt-4o-2024-11-20',
110
+ promptHash,
111
+ configHash,
112
+ commitSha,
113
+ splitTag: 'holdout',
114
+ tokenUsage,
115
+ })
116
+ ```
117
+
101
118
  ## Datasets And Holdouts
102
119
 
103
120
  Use four splits:
@@ -107,6 +124,10 @@ Use four splits:
107
124
  - `test`: normal reporting.
108
125
  - `holdout`: promotion-only gate.
109
126
 
127
+ The low-level `RunRecord` schema uses `search | dev | holdout`; map `train`
128
+ and normal non-holdout test/report rows to `search` when producing promotion
129
+ tables.
130
+
110
131
  Do not inspect or tune against holdout failures during optimization. If a
111
132
  holdout failure reveals a real product bug, fix the bug and rotate the holdout
112
133
  set with a signed note.
@@ -149,6 +170,7 @@ A launch or promotion should require:
149
170
  - cost and latency within budget
150
171
  - no unresolved canary or contamination failures
151
172
  - trace evidence for representative successes and failures
173
+ - TraceAnalyst findings for failure-heavy or regression-heavy corpora
152
174
  - human-readable report with failure clusters and next actions
153
175
 
154
176
  `evaluateReleaseConfidence()` and the paired statistics helpers provide the
@@ -177,6 +199,11 @@ Use `@tangle-network/agent-integrations` manifests as readiness inputs. Gate
177
199
  missing connections, missing scopes, approval-required writes, and stale tokens
178
200
  before blaming the agent prompt.
179
201
 
202
+ For generated apps and sandbox agents, also run the
203
+ [Integration Launch Gates](./integration-launch-gates.md). The eval should prove
204
+ that app code invokes through the integration bridge, not provider SDKs with raw
205
+ OAuth tokens.
206
+
180
207
  ### Voice Agent
181
208
 
182
209
  Record transcript, timing, interruptions, tool calls, and task outcome. Judge
@@ -0,0 +1,155 @@
1
+ # researchReport — methodology
2
+
3
+ This document is the methodological brief for `researchReport` (exported from
4
+ `@tangle-network/agent-eval` and `@tangle-network/agent-eval/reporting`). It
5
+ exists so a launch reviewer, peer reviewer, or auditor can quickly verify that
6
+ the verdict embedded in any rendered report is defensible, reproducible, and
7
+ appropriate to the data.
8
+
9
+ The companion code is `src/summary-report.ts`. Each item below names the
10
+ corresponding function or option so the doc and the code don't drift.
11
+
12
+ ## Inputs
13
+
14
+ - `runs: RunRecord[]` — every record carries `runId`, `candidateId`, `seed`,
15
+ `experimentId`, `splitTag`, and an `outcome` with the configured score.
16
+ - `comparator: string` — the candidate id treated as the null reference. Must
17
+ be selected before data inspection; `preregistrationHash` should pin this.
18
+ - `split: 'search' | 'holdout'` — defaults to `holdout`. Decisions on `search`
19
+ are descriptive only; promotion calls require the holdout.
20
+ - `rope: { low, high }` — Region of Practical Equivalence on the paired delta,
21
+ in score units. Must come from the domain owner — there is no
22
+ statistically-defensible default.
23
+ - `minPairs` (soft floor, default 20) and `RESEARCH_REPORT_HARD_PAIR_FLOOR`
24
+ (hard floor, 6). Below the soft floor, the verdict is `needs_more_data`
25
+ (unless an earlier rule in the decision order below matches first) and the report carries the MDE at the current N.
26
+ - `fdr` (default 0.05), `confidence` (default 0.95), `mdePower` (default 0.8),
27
+ `mdeAlpha` (default = `fdr`).
28
+
29
+ ## Pairing
30
+
31
+ Pairs are joined by `(experimentId, seed)` so the comparator and candidate
32
+ share scenario *and* seed. This is the same join `gainHistogram` uses; see
33
+ `pairScoresByKey` in `src/summary-report.ts`. Records on the wrong split or
34
+ with non-finite scores are dropped before pairing.
35
+
36
+ ## Decision rule
37
+
38
+ In order — first match wins:
39
+
40
+ 1. `comparator` itself → `hold` (baseline).
41
+ 2. No comparator → `hold` if on the cost/quality Pareto frontier, else
42
+ `needs_more_data`. The verdict is descriptive, not causal.
43
+ 3. Held-out gate verdict ≠ `promote` → `reject`. The gate is *necessary but
44
+ not sufficient*; even a `promote` gate must clear the paired test below.
45
+ 4. Paired N < `RESEARCH_REPORT_HARD_PAIR_FLOOR` → `needs_more_data` with a
46
+ "below hard floor" reason. Bootstrap CIs degenerate at this size.
47
+ 5. ROPE configured AND paired-delta CI ⊂ ROPE → `equivalent`.
48
+ 6. Paired-delta CI upper bound < 0 → `reject` (CI excludes a non-negative
49
+ effect). Note: this uses **paired delta only** — not the marginal mean.
50
+ 7. Paired N < `minPairs` (soft floor) → `needs_more_data` with the MDE at
51
+ current N attached so the verdict is actionable.
52
+ 8. BH-adjusted q ≤ `fdr` AND CI lower bound > 0 → `promote`. The BH q-value
53
+ controls FDR across all candidates in the same sweep; the bootstrap CI
54
+ provides an effect-size guarantee independent of the test.
55
+ 9. Otherwise → `hold`.
56
+
57
+ ## Statistical primitives used
58
+
59
+ | Quantity | Function | Source file |
60
+ |---|---|---|
61
+ | Marginal CI on score mean | `confidenceInterval` | `statistics.ts` |
62
+ | Cohen's d vs comparator | `cohensD` | `statistics.ts` |
63
+ | Wilcoxon signed-rank (paired) | `wilcoxonSignedRank` | `statistics.ts` |
64
+ | BH-FDR q-values | `benjaminiHochberg` | `power-analysis.ts` |
65
+ | Paired bootstrap CI on median delta | `pairedBootstrap` | `paired-stats.ts` |
66
+ | Bayesian-bootstrap-style Pr(Δ>0), Pr(Δ∈ROPE) | `bootstrapMeanSamples` | `summary-report.ts` (private) |
67
+ | Minimum detectable paired effect | `pairedMde` | `power-analysis.ts` |
68
+ | Run fingerprint | `hashJson(canonicalize(...))` | `pre-registration.ts` |
69
+
70
+ The Pr(Δ>0) and Pr(Δ∈ROPE) summaries use the bootstrap-prior duality of
71
+ [Rubin 1981]: under a non-informative Dirichlet prior, the bootstrap
72
+ distribution of a sample statistic is its posterior. The probabilities are
73
+ posterior summaries of the **mean** delta, while the bootstrap CI is on the
74
+ **median** delta — the median is more robust to the heavy-tailed score
75
+ distributions seen in agent benchmarks; the mean lets us read off the
76
+ Bayesian-style probability of superiority in a single number.
77
+
78
+ ## MDE
79
+
80
+ The minimum detectable paired effect at N pairs, two-sided α, and power β (here β denotes the power itself, e.g. 0.8, not the type-II error rate):
81
+
82
+ $$d_\text{min} = \frac{z_{1-\alpha/2} + z_\beta}{\sqrt{N}}$$
83
+
84
+ reported on the standardised scale, then multiplied by the observed
85
+ paired-delta SD to get the MDE in score units. Consumers reading a `needs_more_data`
86
+ verdict can use the MDE to budget the next round of runs:
87
+
88
+ - Observed paired SD = 0.10 score units, paired N = 20, α = 0.05, β = 0.8 →
89
+ d_min ≈ 0.63 standardised → MDE ≈ 0.063 score units. If the smallest
90
+ effect that would change a launch decision is below this, run more pairs.
91
+
92
+ ## Provenance
93
+
94
+ Every report carries:
95
+
96
+ - `runFingerprint`: SHA-256 over the canonicalised list of
97
+ `(runId, candidateId, splitTag)` triples (sorted by runId), plus the
98
+ comparator id and split. Same `(runs, comparator, split)` produces the same
99
+ fingerprint regardless of input order.
100
+ - `preregistrationHash`: the caller passes the hash of a signed
101
+ `HypothesisManifest` (see `pre-registration.ts`). The fingerprint and the
102
+ preregistration hash together let a reader verify both *what data the
103
+ report saw* and *what protocol it was supposed to run.*
104
+
105
+ Reports without a `preregistrationHash` carry a "post-hoc" warning in the
106
+ risks list and the executive summary. Treat them as descriptive only.
107
+
108
+ ## Alternatives considered
109
+
110
+ - **Paired t-test instead of Wilcoxon + bootstrap.** Rejected: agent score
111
+ distributions are heavy-tailed (judges saturate near 0 and 1) and the t
112
+ approximation breaks down with the small N typical of holdouts.
113
+ - **Unpaired Mann–Whitney.** Rejected: matched scenarios make pairing free,
114
+ and unpaired tests throw away the variance reduction. Use the paired test
115
+ by default.
116
+ - **Sequential / always-valid inference (e-values, mSPRT, alpha-spending).**
117
+ Out of scope for a single-look report. If users iterate, wrap this report
118
+ in an alpha-spending schedule, or commit to one preregistered look.
119
+ - **Hierarchical Bayesian shrinkage across many candidates.** Future work.
120
+ The current ranking is on raw paired statistics and over-credits the top
121
+ candidate when many are tested.
122
+ - **Calibration / coverage simulation on the bootstrap CI.** Future work; we
123
+ rely on the asymptotic guarantee plus the hard pair floor to keep coverage
124
+ reasonable.
125
+
126
+ ## When NOT to apply
127
+
128
+ - Paired N below the hard floor (6) on any candidate.
129
+ - Comparator chosen by inspecting the data (post-hoc selection inflates
130
+ false-discovery rates beyond the BH guarantee).
131
+ - Mid-run distribution shift: judge model swap, rubric change, infrastructure
132
+ outage. Pair exchangeability is violated and the bootstrap is not valid.
133
+ - Scenarios drawn non-randomly from a stream the candidate can influence
134
+ (data-leak across runs). The pairing is no longer ignorable.
135
+ - Highly skewed cost distributions: the Pareto frontier still works but the
136
+ marginal CI on cost may be misleading.
137
+
138
+ ## Citations
139
+
140
+ - Benjamini, Y. & Hochberg, Y. (1995). Controlling the false discovery rate:
141
+ a practical and powerful approach to multiple testing. *JRSS B*,
142
+ 57(1), 289–300.
143
+ - Wilcoxon, F. (1945). Individual comparisons by ranking methods.
144
+ *Biometrics Bulletin*, 1(6), 80–83.
145
+ - Efron, B. (1979). Bootstrap methods: another look at the jackknife.
146
+ *Annals of Statistics*, 7(1), 1–26.
147
+ - Rubin, D. B. (1981). The Bayesian bootstrap.
148
+ *Annals of Statistics*, 9(1), 130–134.
149
+ - Kruschke, J. K. (2018). Rejecting or accepting parameter values in
150
+ Bayesian estimation. *Advances in Methods and Practices in
151
+ Psychological Science*, 1(2), 270–280. (ROPE.)
152
+ - Howard, S. R., Ramdas, A., McAuliffe, J. & Sekhon, J. (2021).
153
+ Time-uniform, nonparametric, nonasymptotic confidence sequences.
154
+ *Annals of Statistics*, 49(2), 1055–1080. (Background reading on
155
+ always-valid inference for sequential extensions.)
@@ -0,0 +1,75 @@
1
+ # Trace Analysis
2
+
3
+ Trace analysis is the bridge between raw product telemetry and useful eval work.
4
+
5
+ ```txt
6
+ live product run
7
+ -> TraceEmitter / TraceStore
8
+ -> TraceAnalyst investigates trace corpora
9
+ -> findings become ASI, failures, replay cases, and release actions
10
+ ```
11
+
12
+ ## When To Use TraceAnalyst
13
+
14
+ Use `TraceAnalyst` when you have more than a few traces and need to answer:
15
+
16
+ - which failure modes are recurring?
17
+ - which spans explain a regression?
18
+ - did retrieval, integrations, sandbox, or policy block the run?
19
+ - are failed runs missing evidence that the optimizer needs?
20
+ - which product surfaces deserve the next fix?
21
+
22
+ Use summary tables and release confidence for promotion decisions. Use
23
+ TraceAnalyst to explain the evidence behind those decisions.
24
+
25
+ ## Minimal Flow
26
+
27
+ ```ts
28
+ import {
29
+ OtlpFileTraceStore,
30
+ analyzeTraces,
31
+ } from '@tangle-network/agent-eval'
32
+
33
+ const result = await analyzeTraces({
34
+ question: 'Why did app-runtime holdout runs fail this week?',
35
+ }, {
36
+ source: new OtlpFileTraceStore({ path: 'traces/otlp.jsonl' }),
37
+ ai,
38
+ model: 'gpt-4o-2024-11-20',
39
+ })
40
+
41
+ console.log(result.findings)
42
+ ```
43
+
44
+ Products can pass any `TraceAnalysisStore`; they do not need to use the file
45
+ store in production.
46
+
47
+ ## Required Trace Shape
48
+
49
+ Every serious product run should include:
50
+
51
+ - `runId`, `projectId`, `scenarioId`, `variantId`, and `layer`
52
+ - commit, prompt hash, config hash, model fingerprint, and dataset version
53
+ - LLM spans with model, inputs, outputs, token counts, and cost
54
+ - tool/integration spans with arguments, result summaries, and error codes
55
+ - retrieval spans with query, source ids, hit scores, and freshness metadata
56
+ - sandbox/build/test/deploy spans with exit codes and log artifacts
57
+ - custom events for knowledge readiness and integration gates
58
+ - final run outcome with pass/score/failure class
59
+
60
+ Do not put secrets, raw OAuth tokens, or unredacted PII in traces.
61
+
62
+ ## Product Loop
63
+
64
+ The product loop should not treat traces as a separate debug dump. The intended
65
+ path is:
66
+
67
+ 1. Wrap the real workflow in `runAgentControlLoop` or the product runtime.
68
+ 2. Emit canonical spans/events while the user task runs.
69
+ 3. Convert the completed run to `FeedbackTrajectory` for replay.
70
+ 4. Convert promotion-grade runs to `RunRecord` with `controlRunToRunRecord`.
71
+ 5. Run TraceAnalyst over failure-heavy trace sets.
72
+ 6. Feed findings into `ActionableSideInfo`, failure clusters, and release
73
+ reports.
74
+
75
+ That turns normal product usage into eval data instead of isolated logs.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tangle-network/agent-eval",
3
- "version": "0.20.11",
3
+ "version": "0.21.0",
4
4
  "description": "Trace-first evaluation infrastructure for agent systems: traces, harnesses, verifier pipelines, judges, datasets, gates, optimization, and reporting.",
5
5
  "homepage": "https://github.com/tangle-network/agent-eval#readme",
6
6
  "repository": {
@@ -19,6 +19,26 @@
19
19
  "import": "./dist/index.js",
20
20
  "default": "./dist/index.js"
21
21
  },
22
+ "./control": {
23
+ "types": "./dist/control.d.ts",
24
+ "import": "./dist/control.js",
25
+ "default": "./dist/control.js"
26
+ },
27
+ "./optimization": {
28
+ "types": "./dist/optimization.d.ts",
29
+ "import": "./dist/optimization.js",
30
+ "default": "./dist/optimization.js"
31
+ },
32
+ "./reporting": {
33
+ "types": "./dist/reporting.d.ts",
34
+ "import": "./dist/reporting.js",
35
+ "default": "./dist/reporting.js"
36
+ },
37
+ "./traces": {
38
+ "types": "./dist/traces.d.ts",
39
+ "import": "./dist/traces.js",
40
+ "default": "./dist/traces.js"
41
+ },
22
42
  "./telemetry": {
23
43
  "types": "./dist/telemetry/index.d.ts",
24
44
  "import": "./dist/telemetry/index.js",
@@ -54,15 +74,6 @@
54
74
  "publishConfig": {
55
75
  "access": "public"
56
76
  },
57
- "scripts": {
58
- "build": "tsup && pnpm openapi",
59
- "dev": "tsup --watch",
60
- "prepare": "pnpm build",
61
- "test": "vitest run",
62
- "test:watch": "vitest",
63
- "typecheck": "tsc --noEmit",
64
- "openapi": "node dist/cli.js openapi --out dist/openapi.json"
65
- },
66
77
  "dependencies": {
67
78
  "@asteasolutions/zod-to-openapi": "^8.5.0",
68
79
  "@ax-llm/ax": "^19.0.25",
@@ -82,5 +93,12 @@
82
93
  "node": ">=20"
83
94
  },
84
95
  "license": "MIT",
85
- "packageManager": "pnpm@10.22.0"
86
- }
96
+ "scripts": {
97
+ "build": "tsup && pnpm openapi",
98
+ "dev": "tsup --watch",
99
+ "test": "vitest run",
100
+ "test:watch": "vitest",
101
+ "typecheck": "tsc --noEmit",
102
+ "openapi": "node dist/cli.js openapi --out dist/openapi.json"
103
+ }
104
+ }