@tangle-network/agent-eval 0.20.11 → 0.21.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +76 -0
- package/README.md +137 -170
- package/dist/benchmarks/index.d.ts +2 -1
- package/dist/{chunk-JAOLXRIA.js → chunk-3GN6U53I.js} +205 -4
- package/dist/chunk-3GN6U53I.js.map +1 -0
- package/dist/chunk-3IX6QTB7.js +1349 -0
- package/dist/chunk-3IX6QTB7.js.map +1 -0
- package/dist/chunk-5IIQKMD5.js +236 -0
- package/dist/chunk-5IIQKMD5.js.map +1 -0
- package/dist/chunk-ARZ6BEV6.js +1310 -0
- package/dist/chunk-ARZ6BEV6.js.map +1 -0
- package/dist/chunk-HRZELXCR.js +1354 -0
- package/dist/chunk-HRZELXCR.js.map +1 -0
- package/dist/chunk-KRR4VMH7.js +423 -0
- package/dist/chunk-KRR4VMH7.js.map +1 -0
- package/dist/chunk-SNUHRBDL.js +154 -0
- package/dist/chunk-SNUHRBDL.js.map +1 -0
- package/dist/chunk-WOK2RTWG.js +1920 -0
- package/dist/chunk-WOK2RTWG.js.map +1 -0
- package/dist/{chunk-LSR4IAYN.js → chunk-WOPGKVN4.js} +2 -2
- package/dist/chunk-YUFXO3TU.js +148 -0
- package/dist/chunk-YUFXO3TU.js.map +1 -0
- package/dist/cli.js +3 -2
- package/dist/cli.js.map +1 -1
- package/dist/control-cxwMOAsy.d.ts +259 -0
- package/dist/control.d.ts +6 -0
- package/dist/control.js +30 -0
- package/dist/control.js.map +1 -0
- package/dist/dataset-B9qvlm_o.d.ts +112 -0
- package/dist/emitter-B2XqDKFU.d.ts +121 -0
- package/dist/feedback-trajectory-CB0A32o3.d.ts +346 -0
- package/dist/{index-1PZOtZFr.d.ts → index-c5saLbKD.d.ts} +2 -133
- package/dist/index.d.ts +178 -2945
- package/dist/index.js +1066 -6185
- package/dist/index.js.map +1 -1
- package/dist/multi-shot-optimization-Bvtz294B.d.ts +598 -0
- package/dist/openapi.json +1 -1
- package/dist/optimization.d.ts +146 -0
- package/dist/optimization.js +60 -0
- package/dist/optimization.js.map +1 -0
- package/dist/reporting-Da2ihlcM.d.ts +672 -0
- package/dist/reporting.d.ts +5 -0
- package/dist/reporting.js +36 -0
- package/dist/reporting.js.map +1 -0
- package/dist/run-record-CX_jcAyr.d.ts +134 -0
- package/dist/store-u47QaJ9G.d.ts +297 -0
- package/dist/traces.d.ts +914 -0
- package/dist/traces.js +120 -0
- package/dist/traces.js.map +1 -0
- package/dist/wire/index.js +3 -2
- package/docs/concepts.md +16 -11
- package/docs/feature-guide.md +10 -17
- package/docs/integration-launch-gates.md +77 -0
- package/docs/product-eval-adoption.md +27 -0
- package/docs/research-report-methodology.md +155 -0
- package/docs/trace-analysis.md +75 -0
- package/package.json +30 -12
- package/dist/chunk-JAOLXRIA.js.map +0 -1
- /package/dist/{chunk-LSR4IAYN.js.map → chunk-WOPGKVN4.js.map} +0 -0
package/dist/traces.js
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
import {
|
|
2
|
+
DEFAULT_REDACTION_RULES,
|
|
3
|
+
DEFAULT_TRACE_ANALYST_BUDGETS,
|
|
4
|
+
FAILURE_CLASSES,
|
|
5
|
+
FileSystemTraceStore,
|
|
6
|
+
InMemoryTraceStore,
|
|
7
|
+
OTEL_AGENT_EVAL_SCOPE,
|
|
8
|
+
OtlpFileTraceStore,
|
|
9
|
+
REDACTION_VERSION,
|
|
10
|
+
RunIntegrityError,
|
|
11
|
+
SpanNotFoundError,
|
|
12
|
+
TRACE_ANALYST_ACTOR_DESCRIPTION,
|
|
13
|
+
TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION,
|
|
14
|
+
TRACE_ANALYST_SUBAGENT_DESCRIPTION,
|
|
15
|
+
TRACE_ANALYST_TRUNCATION_MARKER_PREFIX,
|
|
16
|
+
TRACE_SCHEMA_VERSION,
|
|
17
|
+
TraceFileMissingError,
|
|
18
|
+
TraceNotFoundError,
|
|
19
|
+
aggregateLlm,
|
|
20
|
+
analyzeTraces,
|
|
21
|
+
argHash,
|
|
22
|
+
assertRunCaptured,
|
|
23
|
+
buildTraceAnalystTools,
|
|
24
|
+
buildTraceInsightContext,
|
|
25
|
+
buildTraceInsightPrompt,
|
|
26
|
+
defaultTraceInsightPanel,
|
|
27
|
+
describeTraceInsightScope,
|
|
28
|
+
domainEvidencePattern,
|
|
29
|
+
exportRunAsOtlp,
|
|
30
|
+
groupBy,
|
|
31
|
+
inferDomainKeywords,
|
|
32
|
+
isJudgeSpan,
|
|
33
|
+
isLlmSpan,
|
|
34
|
+
isRetrievalSpan,
|
|
35
|
+
isSandboxSpan,
|
|
36
|
+
isToolSpan,
|
|
37
|
+
judgeSpans,
|
|
38
|
+
llmSpans,
|
|
39
|
+
planTraceInsightQuestions,
|
|
40
|
+
redactString,
|
|
41
|
+
redactValue,
|
|
42
|
+
runFailureClass,
|
|
43
|
+
runsForScenario,
|
|
44
|
+
scoreTraceInsightReadiness,
|
|
45
|
+
throwIfRunIncomplete,
|
|
46
|
+
tokenizeDomainWords,
|
|
47
|
+
toolSpans,
|
|
48
|
+
traceAnalystFunctionGroup,
|
|
49
|
+
traceAnalystOnRunComplete
|
|
50
|
+
} from "./chunk-WOK2RTWG.js";
|
|
51
|
+
import {
|
|
52
|
+
TraceEmitter,
|
|
53
|
+
llmSpanFromProvider
|
|
54
|
+
} from "./chunk-5IIQKMD5.js";
|
|
55
|
+
import {
|
|
56
|
+
FileSystemRawProviderSink,
|
|
57
|
+
InMemoryRawProviderSink,
|
|
58
|
+
NoopRawProviderSink,
|
|
59
|
+
defaultProviderRedactor,
|
|
60
|
+
providerFromBaseUrl
|
|
61
|
+
} from "./chunk-SNUHRBDL.js";
|
|
62
|
+
import "./chunk-PZ5AY32C.js";
|
|
63
|
+
export {
|
|
64
|
+
DEFAULT_REDACTION_RULES,
|
|
65
|
+
DEFAULT_TRACE_ANALYST_BUDGETS,
|
|
66
|
+
FAILURE_CLASSES,
|
|
67
|
+
FileSystemRawProviderSink,
|
|
68
|
+
FileSystemTraceStore,
|
|
69
|
+
InMemoryRawProviderSink,
|
|
70
|
+
InMemoryTraceStore,
|
|
71
|
+
NoopRawProviderSink,
|
|
72
|
+
OTEL_AGENT_EVAL_SCOPE,
|
|
73
|
+
OtlpFileTraceStore,
|
|
74
|
+
REDACTION_VERSION,
|
|
75
|
+
RunIntegrityError,
|
|
76
|
+
SpanNotFoundError,
|
|
77
|
+
TRACE_ANALYST_ACTOR_DESCRIPTION,
|
|
78
|
+
TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION,
|
|
79
|
+
TRACE_ANALYST_SUBAGENT_DESCRIPTION,
|
|
80
|
+
TRACE_ANALYST_TRUNCATION_MARKER_PREFIX,
|
|
81
|
+
TRACE_SCHEMA_VERSION,
|
|
82
|
+
TraceEmitter,
|
|
83
|
+
TraceFileMissingError,
|
|
84
|
+
TraceNotFoundError,
|
|
85
|
+
aggregateLlm,
|
|
86
|
+
analyzeTraces,
|
|
87
|
+
argHash,
|
|
88
|
+
assertRunCaptured,
|
|
89
|
+
buildTraceAnalystTools,
|
|
90
|
+
buildTraceInsightContext,
|
|
91
|
+
buildTraceInsightPrompt,
|
|
92
|
+
defaultProviderRedactor,
|
|
93
|
+
defaultTraceInsightPanel,
|
|
94
|
+
describeTraceInsightScope,
|
|
95
|
+
domainEvidencePattern,
|
|
96
|
+
exportRunAsOtlp,
|
|
97
|
+
groupBy,
|
|
98
|
+
inferDomainKeywords,
|
|
99
|
+
isJudgeSpan,
|
|
100
|
+
isLlmSpan,
|
|
101
|
+
isRetrievalSpan,
|
|
102
|
+
isSandboxSpan,
|
|
103
|
+
isToolSpan,
|
|
104
|
+
judgeSpans,
|
|
105
|
+
llmSpanFromProvider,
|
|
106
|
+
llmSpans,
|
|
107
|
+
planTraceInsightQuestions,
|
|
108
|
+
providerFromBaseUrl,
|
|
109
|
+
redactString,
|
|
110
|
+
redactValue,
|
|
111
|
+
runFailureClass,
|
|
112
|
+
runsForScenario,
|
|
113
|
+
scoreTraceInsightReadiness,
|
|
114
|
+
throwIfRunIncomplete,
|
|
115
|
+
tokenizeDomainWords,
|
|
116
|
+
toolSpans,
|
|
117
|
+
traceAnalystFunctionGroup,
|
|
118
|
+
traceAnalystOnRunComplete
|
|
119
|
+
};
|
|
120
|
+
//# sourceMappingURL=traces.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":[],"sourcesContent":[],"mappings":"","names":[]}
|
package/dist/wire/index.js
CHANGED
|
@@ -24,8 +24,9 @@ import {
|
|
|
24
24
|
runRpcBatch,
|
|
25
25
|
runRpcOnce,
|
|
26
26
|
startServer
|
|
27
|
-
} from "../chunk-LSR4IAYN.js";
|
|
28
|
-
import "../chunk-JAOLXRIA.js";
|
|
27
|
+
} from "../chunk-WOPGKVN4.js";
|
|
28
|
+
import "../chunk-3GN6U53I.js";
|
|
29
|
+
import "../chunk-SNUHRBDL.js";
|
|
29
30
|
import "../chunk-PZ5AY32C.js";
|
|
30
31
|
export {
|
|
31
32
|
BUILTIN_RUBRICS,
|
package/docs/concepts.md
CHANGED
|
@@ -1,14 +1,15 @@
|
|
|
1
1
|
# Concepts
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
`agent-eval` is for deciding whether an agent run should pass, keep working, be
|
|
4
|
+
replayed, be optimized, or be promoted.
|
|
4
5
|
|
|
5
|
-
|
|
6
|
+
It exists because agent output is not evidence. A model can say a task is done
|
|
7
|
+
while the build fails, the browser flow is broken, the integration was never
|
|
8
|
+
connected, or the answer lacks required sources. The package gives products a
|
|
9
|
+
shared way to record runs, check outcomes, classify failures, compare variants,
|
|
10
|
+
and make release decisions.
|
|
6
11
|
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
It exists because LLMs lie about whether they succeeded. A model will say "Done!" and ship code that doesn't compile. agent-eval is the layer between the model's output and your decision to ship.
|
|
10
|
-
|
|
11
|
-
## The three things you'll touch most
|
|
12
|
+
## Main Objects
|
|
12
13
|
|
|
13
14
|
| Thing | What it is | One-line example |
|
|
14
15
|
|---|---|---|
|
|
@@ -17,7 +18,8 @@ It exists because LLMs lie about whether they succeeded. A model will say "Done!
|
|
|
17
18
|
| **Verifier** | A pipeline of judges run in order, with dependencies. | "install → typecheck → build → semantic" |
|
|
18
19
|
| **Feedback trajectory** | A multi-shot record of attempts, approvals, rejections, edits, metrics, and policy outcomes. | "draft → user rejects → revised draft → approved → measured" |
|
|
19
20
|
|
|
20
|
-
|
|
21
|
+
Everything else exists to make those objects useful in real product loops:
|
|
22
|
+
traces, datasets, control runtime, optimizers, statistics, and reports.
|
|
21
23
|
|
|
22
24
|
When the thing being evaluated is an agent that should keep working, use
|
|
23
25
|
[`runAgentControlLoop`](./control-runtime.md). It turns validators into a
|
|
@@ -62,7 +64,7 @@ shape stays the same.
|
|
|
62
64
|
Those trajectories can be converted into preference memory, `DatasetScenario`
|
|
63
65
|
rows, optimizer rows, and held-out examples for overfit checks.
|
|
64
66
|
|
|
65
|
-
##
|
|
67
|
+
## Code Generator Eval
|
|
66
68
|
|
|
67
69
|
When the artifact is generated code, agent-eval scores it at three independent layers. Each layer fails differently, and you want to know which one broke:
|
|
68
70
|
|
|
@@ -125,7 +127,7 @@ Two rules that will save you bugs:
|
|
|
125
127
|
|
|
126
128
|
2. **Pair LLM judges with build outcomes.** An LLM judge will rate non-compiling code as "looks right" (0.8). Always short-circuit on `buildOutcome.passed === false` before any LLM judging.
|
|
127
129
|
|
|
128
|
-
##
|
|
130
|
+
## Trace Model
|
|
129
131
|
|
|
130
132
|
Every operation emits structured spans into a `TraceStore`. A run is a tree:
|
|
131
133
|
|
|
@@ -142,7 +144,10 @@ builder-session [span]
|
|
|
142
144
|
|
|
143
145
|
Spans are append-only and have stable ids — replay is reading the same store back. OTLP export ships them out for distributed tracing.
|
|
144
146
|
|
|
145
|
-
You
|
|
147
|
+
You usually should not build this tree by hand. Product runtimes,
|
|
148
|
+
`runAgentControlLoop`, harnesses, and verifiers should emit it while they run.
|
|
149
|
+
Use traces when debugging a flaky run, building replay data, or explaining a
|
|
150
|
+
release decision.
|
|
146
151
|
|
|
147
152
|
## Where to go next
|
|
148
153
|
|
package/docs/feature-guide.md
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
This page explains the main `agent-eval` primitives in plain English first,
|
|
4
4
|
then shows when to use each one.
|
|
5
5
|
|
|
6
|
-
##
|
|
6
|
+
## Overview
|
|
7
7
|
|
|
8
8
|
LLM agents can write code, drafts, research, plans, and actions. The hard part
|
|
9
9
|
is knowing whether they actually did a good job, whether they should keep
|
|
@@ -41,7 +41,7 @@ trying, and whether a change made them better or worse.
|
|
|
41
41
|
|
|
42
42
|
## Integration Patterns
|
|
43
43
|
|
|
44
|
-
### Recommended
|
|
44
|
+
### Recommended Product Shape
|
|
45
45
|
|
|
46
46
|
Use this shape when the product needs to keep pushing work forward instead of
|
|
47
47
|
only answering once:
|
|
@@ -175,21 +175,6 @@ Store as `FeedbackTrajectory`, then derive:
|
|
|
175
175
|
logs, screenshots, or browser state. Use separate sandboxes for parallel
|
|
176
176
|
variants or destructive checks.
|
|
177
177
|
|
|
178
|
-
## Same-Sandbox Example
|
|
179
|
-
|
|
180
|
-
`examples/same-sandbox-harness/` shows the common coding/browser pattern:
|
|
181
|
-
|
|
182
|
-
```text
|
|
183
|
-
one sandbox/workdir -> install/build/test -> inspect evidence -> emit judge span
|
|
184
|
-
```
|
|
185
|
-
|
|
186
|
-
Use this when a judge needs evidence produced by earlier harness phases. Use
|
|
187
|
-
isolated sandboxes when variants run in parallel or a phase can corrupt the
|
|
188
|
-
workspace.
|
|
189
|
-
- Treat telemetry as evidence, not control flow. A trace sink outage should be
|
|
190
|
-
visible in `runtimeErrors`, but it should not stop the worker from completing
|
|
191
|
-
the user task.
|
|
192
|
-
|
|
193
178
|
## Highest-ROI Adoption Order
|
|
194
179
|
|
|
195
180
|
1. Wrap one real product workflow in `runAgentControlLoop`.
|
|
@@ -211,3 +196,11 @@ reusable:
|
|
|
211
196
|
|
|
212
197
|
Core should provide shapes, stores, runners, scoring, traces, and converters.
|
|
213
198
|
Downstream integrations provide domain state, policy, tools, and storage.
|
|
199
|
+
|
|
200
|
+
## Examples
|
|
201
|
+
|
|
202
|
+
- `examples/same-sandbox-harness`: one workdir for install/build/test plus
|
|
203
|
+
evidence inspection.
|
|
204
|
+
- `examples/multi-shot-optimization`: full-trajectory optimization with a
|
|
205
|
+
holdout gate.
|
|
206
|
+
- `examples/benchmarks`: benchmark adapter contracts and reference wrappers.
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
# Integration Launch Gates
|
|
2
|
+
|
|
3
|
+
Use these gates when a product lets generated apps or agents use user-owned
|
|
4
|
+
connections through an integration hub.
|
|
5
|
+
|
|
6
|
+
The eval should wrap the real product path:
|
|
7
|
+
|
|
8
|
+
```txt
|
|
9
|
+
user prompt
|
|
10
|
+
-> product emits IntegrationManifest
|
|
11
|
+
-> platform resolves connections and grants
|
|
12
|
+
-> sandbox receives capability bundle
|
|
13
|
+
-> generated app invokes integration action
|
|
14
|
+
-> platform enforces policy, approval, idempotency, audit
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
## Deterministic Gates
|
|
18
|
+
|
|
19
|
+
- The generated app declares an integration manifest before sandbox launch.
|
|
20
|
+
- Manifest validation passes.
|
|
21
|
+
- Required connections and scopes are present before execution.
|
|
22
|
+
- Sandbox environment contains a capability bundle, not raw provider tokens.
|
|
23
|
+
- Reads invoke through the platform bridge.
|
|
24
|
+
- Writes return `approval_required` unless product policy explicitly allows
|
|
25
|
+
them.
|
|
26
|
+
- Approved writes are bound to the same action, input hash, connection, and
|
|
27
|
+
subject.
|
|
28
|
+
- Revoked grants or expired capabilities stop invocation.
|
|
29
|
+
- Resumed or long-running sandboxes receive a refreshed bundle before expiry.
|
|
30
|
+
- Audit includes grant creation, capability issue, invoke success/failure,
|
|
31
|
+
approval resolution, and revoke events.
|
|
32
|
+
|
|
33
|
+
## Failure Classes
|
|
34
|
+
|
|
35
|
+
`agent-eval` classifies integration failures separately from prompt/tool
|
|
36
|
+
failures:
|
|
37
|
+
|
|
38
|
+
- `bad_integration_manifest`
|
|
39
|
+
- `missing_integration_connection`
|
|
40
|
+
- `missing_integration_scope`
|
|
41
|
+
- `integration_approval_required`
|
|
42
|
+
- `integration_auth_expired`
|
|
43
|
+
- `integration_provider_failure`
|
|
44
|
+
- `unsafe_integration_write_denied`
|
|
45
|
+
|
|
46
|
+
Use the helper payload builders and eval builders so products emit the same
|
|
47
|
+
trace evidence:
|
|
48
|
+
|
|
49
|
+
```ts
|
|
50
|
+
const gate = {
|
|
51
|
+
connectorId: 'google-calendar',
|
|
52
|
+
actionId: 'events.create',
|
|
53
|
+
valid: true,
|
|
54
|
+
missingConnections: [],
|
|
55
|
+
missingScopes: ['calendar.events.write'],
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
const evals = integrationGateEvals(gate)
|
|
59
|
+
|
|
60
|
+
await emitter.emit({
|
|
61
|
+
kind: 'custom',
|
|
62
|
+
payload: integrationManifestResolvedPayload(gate),
|
|
63
|
+
})
|
|
64
|
+
|
|
65
|
+
await emitter.emit({
|
|
66
|
+
kind: 'custom',
|
|
67
|
+
payload: integrationInvokeFailedPayload({
|
|
68
|
+
connectorId: 'google-calendar',
|
|
69
|
+
actionId: 'events.create',
|
|
70
|
+
code: 'scope_denied',
|
|
71
|
+
message: 'calendar.events.write was not granted',
|
|
72
|
+
}),
|
|
73
|
+
})
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
The classifier then reports the real missing surface instead of burying the
|
|
77
|
+
failure under `tool_recovery_failure` or `unknown`.
|
|
@@ -98,6 +98,23 @@ replayable eval data.
|
|
|
98
98
|
production run -> feedback trajectory -> dataset scenario -> optimizer row
|
|
99
99
|
```
|
|
100
100
|
|
|
101
|
+
For promotion-grade runs, also project the completed control result into a
|
|
102
|
+
strict `RunRecord`:
|
|
103
|
+
|
|
104
|
+
```ts
|
|
105
|
+
const record = controlRunToRunRecord(controlResult, {
|
|
106
|
+
experimentId,
|
|
107
|
+
candidateId,
|
|
108
|
+
seed,
|
|
109
|
+
model: 'gpt-4o-2024-11-20',
|
|
110
|
+
promptHash,
|
|
111
|
+
configHash,
|
|
112
|
+
commitSha,
|
|
113
|
+
splitTag: 'holdout',
|
|
114
|
+
tokenUsage,
|
|
115
|
+
})
|
|
116
|
+
```
|
|
117
|
+
|
|
101
118
|
## Datasets And Holdouts
|
|
102
119
|
|
|
103
120
|
Use four splits:
|
|
@@ -107,6 +124,10 @@ Use four splits:
|
|
|
107
124
|
- `test`: normal reporting.
|
|
108
125
|
- `holdout`: promotion-only gate.
|
|
109
126
|
|
|
127
|
+
The low-level `RunRecord` schema uses `search | dev | holdout`; map `train`
|
|
128
|
+
and normal non-holdout test/report rows to `search` when producing promotion
|
|
129
|
+
tables.
|
|
130
|
+
|
|
110
131
|
Do not inspect or tune against holdout failures during optimization. If a
|
|
111
132
|
holdout failure reveals a real product bug, fix the bug and rotate the holdout
|
|
112
133
|
set with a signed note.
|
|
@@ -149,6 +170,7 @@ A launch or promotion should require:
|
|
|
149
170
|
- cost and latency within budget
|
|
150
171
|
- no unresolved canary or contamination failures
|
|
151
172
|
- trace evidence for representative successes and failures
|
|
173
|
+
- TraceAnalyst findings for failure-heavy or regression-heavy corpora
|
|
152
174
|
- human-readable report with failure clusters and next actions
|
|
153
175
|
|
|
154
176
|
`evaluateReleaseConfidence()` and the paired statistics helpers provide the
|
|
@@ -177,6 +199,11 @@ Use `@tangle-network/agent-integrations` manifests as readiness inputs. Gate
|
|
|
177
199
|
missing connections, missing scopes, approval-required writes, and stale tokens
|
|
178
200
|
before blaming the agent prompt.
|
|
179
201
|
|
|
202
|
+
For generated apps and sandbox agents, also run the
|
|
203
|
+
[Integration Launch Gates](./integration-launch-gates.md). The eval should prove
|
|
204
|
+
that app code invokes through the integration bridge, not provider SDKs with raw
|
|
205
|
+
OAuth tokens.
|
|
206
|
+
|
|
180
207
|
### Voice Agent
|
|
181
208
|
|
|
182
209
|
Record transcript, timing, interruptions, tool calls, and task outcome. Judge
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
# researchReport — methodology
|
|
2
|
+
|
|
3
|
+
This document is the methodological brief for `researchReport` (exported from
|
|
4
|
+
`@tangle-network/agent-eval` and `@tangle-network/agent-eval/reporting`). It
|
|
5
|
+
exists so a launch reviewer, peer reviewer, or auditor can quickly verify that
|
|
6
|
+
the verdict embedded in any rendered report is defensible, reproducible, and
|
|
7
|
+
appropriate to the data.
|
|
8
|
+
|
|
9
|
+
The companion code is `src/summary-report.ts`. Each item below names the
|
|
10
|
+
corresponding function or option so the doc and the code don't drift.
|
|
11
|
+
|
|
12
|
+
## Inputs
|
|
13
|
+
|
|
14
|
+
- `runs: RunRecord[]` — every record carries `runId`, `candidateId`, `seed`,
|
|
15
|
+
`experimentId`, `splitTag`, and an `outcome` with the configured score.
|
|
16
|
+
- `comparator: string` — the candidate id treated as the null reference. Must
|
|
17
|
+
be selected before data inspection; `preregistrationHash` should pin this.
|
|
18
|
+
- `split: 'search' | 'holdout'` — defaults to `holdout`. Decisions on `search`
|
|
19
|
+
are descriptive only; promotion calls require the holdout.
|
|
20
|
+
- `rope: { low, high }` — Region of Practical Equivalence on the paired delta,
|
|
21
|
+
in score units. Must come from the domain owner — there is no
|
|
22
|
+
statistically-defensible default.
|
|
23
|
+
- `minPairs` (soft floor, default 20) and `RESEARCH_REPORT_HARD_PAIR_FLOOR`
|
|
24
|
+
(hard floor, 6). Below the soft floor, the verdict is `needs_more_data` and
|
|
25
|
+
the report carries the MDE at the current N.
|
|
26
|
+
- `fdr` (default 0.05), `confidence` (default 0.95), `mdePower` (default 0.8),
|
|
27
|
+
`mdeAlpha` (default = `fdr`).
|
|
28
|
+
|
|
29
|
+
## Pairing
|
|
30
|
+
|
|
31
|
+
Pairs are joined by `(experimentId, seed)` so the comparator and candidate
|
|
32
|
+
share scenario *and* seed. This is the same join `gainHistogram` uses; see
|
|
33
|
+
`pairScoresByKey` in `src/summary-report.ts`. Records on the wrong split or
|
|
34
|
+
with non-finite scores are dropped before pairing.
|
|
35
|
+
|
|
36
|
+
## Decision rule
|
|
37
|
+
|
|
38
|
+
In order — first match wins:
|
|
39
|
+
|
|
40
|
+
1. `comparator` itself → `hold` (baseline).
|
|
41
|
+
2. No comparator → `hold` if on the cost/quality Pareto frontier, else
|
|
42
|
+
`needs_more_data`. The verdict is descriptive, not causal.
|
|
43
|
+
3. Held-out gate verdict ≠ `promote` → `reject`. The gate is *necessary but
|
|
44
|
+
not sufficient*; even a `promote` gate must clear the paired test below.
|
|
45
|
+
4. Paired N < `RESEARCH_REPORT_HARD_PAIR_FLOOR` → `needs_more_data` with a
|
|
46
|
+
"below hard floor" reason. Bootstrap CIs degenerate at this size.
|
|
47
|
+
5. ROPE configured AND paired-delta CI ⊂ ROPE → `equivalent`.
|
|
48
|
+
6. Paired-delta CI upper bound < 0 → `reject` (CI excludes a non-negative
|
|
49
|
+
effect). Note: this uses **paired delta only** — not the marginal mean.
|
|
50
|
+
7. Paired N < `minPairs` (soft floor) → `needs_more_data` with the MDE at
|
|
51
|
+
current N attached so the verdict is actionable.
|
|
52
|
+
8. BH-adjusted q ≤ `fdr` AND CI lower bound > 0 → `promote`. The BH q-value
|
|
53
|
+
controls FDR across all candidates in the same sweep; the bootstrap CI
|
|
54
|
+
provides an effect-size guarantee independent of the test.
|
|
55
|
+
9. Otherwise → `hold`.
|
|
56
|
+
|
|
57
|
+
## Statistical primitives used
|
|
58
|
+
|
|
59
|
+
| Quantity | Function | Source file |
|
|
60
|
+
|---|---|---|
|
|
61
|
+
| Marginal CI on score mean | `confidenceInterval` | `statistics.ts` |
|
|
62
|
+
| Cohen's d vs comparator | `cohensD` | `statistics.ts` |
|
|
63
|
+
| Wilcoxon signed-rank (paired) | `wilcoxonSignedRank` | `statistics.ts` |
|
|
64
|
+
| BH-FDR q-values | `benjaminiHochberg` | `power-analysis.ts` |
|
|
65
|
+
| Paired bootstrap CI on median delta | `pairedBootstrap` | `paired-stats.ts` |
|
|
66
|
+
| Bayesian-bootstrap-style Pr(Δ>0), Pr(Δ∈ROPE) | `bootstrapMeanSamples` | `summary-report.ts` (private) |
|
|
67
|
+
| Minimum detectable paired effect | `pairedMde` | `power-analysis.ts` |
|
|
68
|
+
| Run fingerprint | `hashJson(canonicalize(...))` | `pre-registration.ts` |
|
|
69
|
+
|
|
70
|
+
The Pr(Δ>0) and Pr(Δ∈ROPE) summaries use the bootstrap-prior duality of
|
|
71
|
+
[Rubin 1981]: under a non-informative Dirichlet prior, the bootstrap
|
|
72
|
+
distribution of a sample statistic approximates its posterior. We expose these as
|
|
73
|
+
posterior summaries on the **mean** delta and the bootstrap CI on the
|
|
74
|
+
**median** delta — the median is more robust to the heavy-tailed score
|
|
75
|
+
distributions seen in agent benchmarks; the mean lets us read off the
|
|
76
|
+
Bayesian-style probability of superiority in a single number.
|
|
77
|
+
|
|
78
|
+
## MDE
|
|
79
|
+
|
|
80
|
+
The minimum detectable paired effect at N pairs, two-sided α, and power β:
|
|
81
|
+
|
|
82
|
+
$$d_\text{min} = \frac{z_{1-\alpha/2} + z_\beta}{\sqrt{N}}$$
|
|
83
|
+
|
|
84
|
+
reported on the standardised scale, then multiplied by the observed paired-
|
|
85
|
+
delta SD to get the MDE in score units. Consumers reading a `needs_more_data`
|
|
86
|
+
verdict can use the MDE to budget the next round of runs:
|
|
87
|
+
|
|
88
|
+
- Observed paired SD = 0.10 score units, paired N = 20, α = 0.05, β = 0.8 →
|
|
89
|
+
d_min ≈ 0.63 standardised → MDE ≈ 0.063 score units. If the smallest
|
|
90
|
+
effect that would change a launch decision is below this, run more pairs.
|
|
91
|
+
|
|
92
|
+
## Provenance
|
|
93
|
+
|
|
94
|
+
Every report carries:
|
|
95
|
+
|
|
96
|
+
- `runFingerprint`: SHA-256 over the canonicalised list of
|
|
97
|
+
`(runId, candidateId, splitTag)` triples (sorted by runId), plus the
|
|
98
|
+
comparator id and split. Same `(runs, comparator, split)` produces the same
|
|
99
|
+
fingerprint regardless of input order.
|
|
100
|
+
- `preregistrationHash`: the caller passes the hash of a signed
|
|
101
|
+
`HypothesisManifest` (see `pre-registration.ts`). The fingerprint and the
|
|
102
|
+
preregistration hash together let a reader verify both *what data the
|
|
103
|
+
report saw* and *what protocol it was supposed to run.*
|
|
104
|
+
|
|
105
|
+
Reports without a `preregistrationHash` carry a "post-hoc" warning in the
|
|
106
|
+
risks list and the executive summary. Treat them as descriptive only.
|
|
107
|
+
|
|
108
|
+
## Alternatives considered
|
|
109
|
+
|
|
110
|
+
- **Paired t-test instead of Wilcoxon + bootstrap.** Rejected: agent score
|
|
111
|
+
distributions are heavy-tailed (judges saturate near 0 and 1) and the t
|
|
112
|
+
approximation breaks down with the small N typical of holdouts.
|
|
113
|
+
- **Unpaired Mann–Whitney.** Rejected: matched scenarios make pairing free,
|
|
114
|
+
and unpaired tests throw away the variance reduction. Use the paired test
|
|
115
|
+
by default.
|
|
116
|
+
- **Sequential / always-valid inference (e-values, mSPRT, alpha-spending).**
|
|
117
|
+
Out of scope for a single-look report. If users iterate, wrap this report
|
|
118
|
+
in an alpha-spending schedule, or commit to one preregistered look.
|
|
119
|
+
- **Hierarchical Bayesian shrinkage across many candidates.** Future work.
|
|
120
|
+
The current ranking is on raw paired statistics and over-credits the top
|
|
121
|
+
candidate when many are tested.
|
|
122
|
+
- **Calibration / coverage simulation on the bootstrap CI.** Future work; we
|
|
123
|
+
rely on the asymptotic guarantee plus the hard pair floor to keep coverage
|
|
124
|
+
reasonable.
|
|
125
|
+
|
|
126
|
+
## When NOT to apply
|
|
127
|
+
|
|
128
|
+
- Paired N below the hard floor (6) on any candidate.
|
|
129
|
+
- Comparator chosen by inspecting the data (post-hoc selection inflates
|
|
130
|
+
false-discovery rates beyond the BH guarantee).
|
|
131
|
+
- Mid-run distribution shift: judge model swap, rubric change, infrastructure
|
|
132
|
+
outage. Pair exchangeability is violated and the bootstrap is not valid.
|
|
133
|
+
- Scenarios drawn non-randomly from a stream the candidate can influence
|
|
134
|
+
(data-leak across runs). The pairing is no longer ignorable.
|
|
135
|
+
- Highly skewed cost distributions: the Pareto frontier still works but the
|
|
136
|
+
marginal CI on cost may be misleading.
|
|
137
|
+
|
|
138
|
+
## Citations
|
|
139
|
+
|
|
140
|
+
- Benjamini, Y. & Hochberg, Y. (1995). Controlling the false discovery rate:
|
|
141
|
+
a practical and powerful approach to multiple testing. *JRSS B*,
|
|
142
|
+
57(1), 289–300.
|
|
143
|
+
- Wilcoxon, F. (1945). Individual comparisons by ranking methods.
|
|
144
|
+
*Biometrics Bulletin*, 1(6), 80–83.
|
|
145
|
+
- Efron, B. (1979). Bootstrap methods: another look at the jackknife.
|
|
146
|
+
*Annals of Statistics*, 7(1), 1–26.
|
|
147
|
+
- Rubin, D. B. (1981). The Bayesian bootstrap.
|
|
148
|
+
*Annals of Statistics*, 9(1), 130–134.
|
|
149
|
+
- Kruschke, J. K. (2018). Rejecting or accepting parameter values in
|
|
150
|
+
Bayesian estimation. *Advances in Methods and Practices in
|
|
151
|
+
Psychological Science*, 1(2), 270–280. (ROPE.)
|
|
152
|
+
- Howard, S. R., Ramdas, A., McAuliffe, J., Sekhon, J. (2021).
|
|
153
|
+
Time-uniform, nonparametric, nonasymptotic confidence sequences.
|
|
154
|
+
*Annals of Statistics*, 49(2), 1055–1080. (Background reading on
|
|
155
|
+
always-valid inference for sequential extensions.)
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
# Trace Analysis
|
|
2
|
+
|
|
3
|
+
Trace analysis is the bridge between raw product telemetry and useful eval work.
|
|
4
|
+
|
|
5
|
+
```txt
|
|
6
|
+
live product run
|
|
7
|
+
-> TraceEmitter / TraceStore
|
|
8
|
+
-> TraceAnalyst investigates trace corpora
|
|
9
|
+
-> findings become ASI, failures, replay cases, and release actions
|
|
10
|
+
```
|
|
11
|
+
|
|
12
|
+
## When To Use TraceAnalyst
|
|
13
|
+
|
|
14
|
+
Use `TraceAnalyst` when you have more than a few traces and need to answer:
|
|
15
|
+
|
|
16
|
+
- which failure modes are recurring?
|
|
17
|
+
- which spans explain a regression?
|
|
18
|
+
- did retrieval, integrations, sandbox, or policy block the run?
|
|
19
|
+
- are failed runs missing evidence that the optimizer needs?
|
|
20
|
+
- which product surfaces deserve the next fix?
|
|
21
|
+
|
|
22
|
+
Use summary tables and release confidence for promotion decisions. Use
|
|
23
|
+
TraceAnalyst to explain the evidence behind those decisions.
|
|
24
|
+
|
|
25
|
+
## Minimal Flow
|
|
26
|
+
|
|
27
|
+
```ts
|
|
28
|
+
import {
|
|
29
|
+
OtlpFileTraceStore,
|
|
30
|
+
analyzeTraces,
|
|
31
|
+
} from '@tangle-network/agent-eval'
|
|
32
|
+
|
|
33
|
+
const result = await analyzeTraces({
|
|
34
|
+
question: 'Why did app-runtime holdout runs fail this week?',
|
|
35
|
+
}, {
|
|
36
|
+
source: new OtlpFileTraceStore({ path: 'traces/otlp.jsonl' }),
|
|
37
|
+
ai,
|
|
38
|
+
model: 'gpt-4o-2024-11-20',
|
|
39
|
+
})
|
|
40
|
+
|
|
41
|
+
console.log(result.findings)
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
Products can pass any `TraceAnalysisStore`; they do not need to use the file
|
|
45
|
+
store in production.
|
|
46
|
+
|
|
47
|
+
## Required Trace Shape
|
|
48
|
+
|
|
49
|
+
Every serious product run should include:
|
|
50
|
+
|
|
51
|
+
- `runId`, `projectId`, `scenarioId`, `variantId`, and `layer`
|
|
52
|
+
- commit, prompt hash, config hash, model fingerprint, and dataset version
|
|
53
|
+
- LLM spans with model, inputs, outputs, token counts, and cost
|
|
54
|
+
- tool/integration spans with arguments, result summaries, and error codes
|
|
55
|
+
- retrieval spans with query, source ids, hit scores, and freshness metadata
|
|
56
|
+
- sandbox/build/test/deploy spans with exit codes and log artifacts
|
|
57
|
+
- custom events for knowledge readiness and integration gates
|
|
58
|
+
- final run outcome with pass/score/failure class
|
|
59
|
+
|
|
60
|
+
Do not put secrets, raw OAuth tokens, or unredacted PII in traces.
|
|
61
|
+
|
|
62
|
+
## Product Loop
|
|
63
|
+
|
|
64
|
+
The product loop should not treat traces as a separate debug dump. The intended
|
|
65
|
+
path is:
|
|
66
|
+
|
|
67
|
+
1. Wrap the real workflow in `runAgentControlLoop` or the product runtime.
|
|
68
|
+
2. Emit canonical spans/events while the user task runs.
|
|
69
|
+
3. Convert the completed run to `FeedbackTrajectory` for replay.
|
|
70
|
+
4. Convert promotion-grade runs to `RunRecord` with `controlRunToRunRecord`.
|
|
71
|
+
5. Run TraceAnalyst over failure-heavy trace sets.
|
|
72
|
+
6. Feed findings into `ActionableSideInfo`, failure clusters, and release
|
|
73
|
+
reports.
|
|
74
|
+
|
|
75
|
+
That way, normal product usage becomes eval data instead of isolated logs.
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@tangle-network/agent-eval",
|
|
3
|
-
"version": "0.20.11",
|
|
3
|
+
"version": "0.21.0",
|
|
4
4
|
"description": "Trace-first evaluation infrastructure for agent systems: traces, harnesses, verifier pipelines, judges, datasets, gates, optimization, and reporting.",
|
|
5
5
|
"homepage": "https://github.com/tangle-network/agent-eval#readme",
|
|
6
6
|
"repository": {
|
|
@@ -19,6 +19,26 @@
|
|
|
19
19
|
"import": "./dist/index.js",
|
|
20
20
|
"default": "./dist/index.js"
|
|
21
21
|
},
|
|
22
|
+
"./control": {
|
|
23
|
+
"types": "./dist/control.d.ts",
|
|
24
|
+
"import": "./dist/control.js",
|
|
25
|
+
"default": "./dist/control.js"
|
|
26
|
+
},
|
|
27
|
+
"./optimization": {
|
|
28
|
+
"types": "./dist/optimization.d.ts",
|
|
29
|
+
"import": "./dist/optimization.js",
|
|
30
|
+
"default": "./dist/optimization.js"
|
|
31
|
+
},
|
|
32
|
+
"./reporting": {
|
|
33
|
+
"types": "./dist/reporting.d.ts",
|
|
34
|
+
"import": "./dist/reporting.js",
|
|
35
|
+
"default": "./dist/reporting.js"
|
|
36
|
+
},
|
|
37
|
+
"./traces": {
|
|
38
|
+
"types": "./dist/traces.d.ts",
|
|
39
|
+
"import": "./dist/traces.js",
|
|
40
|
+
"default": "./dist/traces.js"
|
|
41
|
+
},
|
|
22
42
|
"./telemetry": {
|
|
23
43
|
"types": "./dist/telemetry/index.d.ts",
|
|
24
44
|
"import": "./dist/telemetry/index.js",
|
|
@@ -54,15 +74,6 @@
|
|
|
54
74
|
"publishConfig": {
|
|
55
75
|
"access": "public"
|
|
56
76
|
},
|
|
57
|
-
"scripts": {
|
|
58
|
-
"build": "tsup && pnpm openapi",
|
|
59
|
-
"dev": "tsup --watch",
|
|
60
|
-
"prepare": "pnpm build",
|
|
61
|
-
"test": "vitest run",
|
|
62
|
-
"test:watch": "vitest",
|
|
63
|
-
"typecheck": "tsc --noEmit",
|
|
64
|
-
"openapi": "node dist/cli.js openapi --out dist/openapi.json"
|
|
65
|
-
},
|
|
66
77
|
"dependencies": {
|
|
67
78
|
"@asteasolutions/zod-to-openapi": "^8.5.0",
|
|
68
79
|
"@ax-llm/ax": "^19.0.25",
|
|
@@ -82,5 +93,12 @@
|
|
|
82
93
|
"node": ">=20"
|
|
83
94
|
},
|
|
84
95
|
"license": "MIT",
|
|
85
|
-
"
|
|
86
|
-
|
|
96
|
+
"scripts": {
|
|
97
|
+
"build": "tsup && pnpm openapi",
|
|
98
|
+
"dev": "tsup --watch",
|
|
99
|
+
"test": "vitest run",
|
|
100
|
+
"test:watch": "vitest",
|
|
101
|
+
"typecheck": "tsc --noEmit",
|
|
102
|
+
"openapi": "node dist/cli.js openapi --out dist/openapi.json"
|
|
103
|
+
}
|
|
104
|
+
}
|