@tangle-network/agent-eval 0.20.10 → 0.20.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +129 -126
- package/dist/benchmarks/index.d.ts +2 -1
- package/dist/{chunk-JAOLXRIA.js → chunk-75MCTH7P.js} +8 -2
- package/dist/chunk-75MCTH7P.js.map +1 -0
- package/dist/chunk-HKYRWNHV.js +1354 -0
- package/dist/chunk-HKYRWNHV.js.map +1 -0
- package/dist/{chunk-LSR4IAYN.js → chunk-HNJLMAJ2.js} +2 -2
- package/dist/chunk-IKFVX537.js +717 -0
- package/dist/chunk-IKFVX537.js.map +1 -0
- package/dist/chunk-KWUAAIHR.js +1764 -0
- package/dist/chunk-KWUAAIHR.js.map +1 -0
- package/dist/chunk-MCMV7DUL.js +1310 -0
- package/dist/chunk-MCMV7DUL.js.map +1 -0
- package/dist/chunk-ODFINDLQ.js +413 -0
- package/dist/chunk-ODFINDLQ.js.map +1 -0
- package/dist/chunk-PKCVBYTQ.js +200 -0
- package/dist/chunk-PKCVBYTQ.js.map +1 -0
- package/dist/chunk-YUFXO3TU.js +148 -0
- package/dist/chunk-YUFXO3TU.js.map +1 -0
- package/dist/cli.js +2 -2
- package/dist/control-C8NKbF3w.d.ts +258 -0
- package/dist/control.d.ts +5 -0
- package/dist/control.js +30 -0
- package/dist/control.js.map +1 -0
- package/dist/dataset-B9qvlm_o.d.ts +112 -0
- package/dist/emitter-BYO2nSDA.d.ts +387 -0
- package/dist/feedback-trajectory-BGQ_ANCN.d.ts +345 -0
- package/dist/{index-1PZOtZFr.d.ts → index-c5saLbKD.d.ts} +2 -133
- package/dist/index.d.ts +115 -2870
- package/dist/index.js +1049 -6156
- package/dist/index.js.map +1 -1
- package/dist/multi-shot-optimization-Bvtz294B.d.ts +598 -0
- package/dist/openapi.json +1 -1
- package/dist/optimization.d.ts +145 -0
- package/dist/optimization.js +60 -0
- package/dist/optimization.js.map +1 -0
- package/dist/reporting.d.ts +426 -0
- package/dist/reporting.js +32 -0
- package/dist/reporting.js.map +1 -0
- package/dist/run-record-CX_jcAyr.d.ts +134 -0
- package/dist/traces.d.ts +658 -0
- package/dist/traces.js +100 -0
- package/dist/traces.js.map +1 -0
- package/dist/wire/index.js +2 -2
- package/docs/concepts.md +16 -11
- package/docs/feature-guide.md +10 -17
- package/docs/integration-launch-gates.md +77 -0
- package/docs/product-eval-adoption.md +221 -0
- package/docs/trace-analysis.md +75 -0
- package/package.json +21 -1
- package/dist/chunk-JAOLXRIA.js.map +0 -1
- package/dist/{chunk-LSR4IAYN.js.map → chunk-HNJLMAJ2.js.map} +0 -0
package/dist/traces.js
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
import {
|
|
2
|
+
DEFAULT_REDACTION_RULES,
|
|
3
|
+
DEFAULT_TRACE_ANALYST_BUDGETS,
|
|
4
|
+
FAILURE_CLASSES,
|
|
5
|
+
FileSystemTraceStore,
|
|
6
|
+
InMemoryTraceStore,
|
|
7
|
+
OTEL_AGENT_EVAL_SCOPE,
|
|
8
|
+
OtlpFileTraceStore,
|
|
9
|
+
REDACTION_VERSION,
|
|
10
|
+
SpanNotFoundError,
|
|
11
|
+
TRACE_ANALYST_ACTOR_DESCRIPTION,
|
|
12
|
+
TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION,
|
|
13
|
+
TRACE_ANALYST_SUBAGENT_DESCRIPTION,
|
|
14
|
+
TRACE_ANALYST_TRUNCATION_MARKER_PREFIX,
|
|
15
|
+
TRACE_SCHEMA_VERSION,
|
|
16
|
+
TraceFileMissingError,
|
|
17
|
+
TraceNotFoundError,
|
|
18
|
+
aggregateLlm,
|
|
19
|
+
analyzeTraces,
|
|
20
|
+
argHash,
|
|
21
|
+
buildTraceAnalystTools,
|
|
22
|
+
buildTraceInsightContext,
|
|
23
|
+
buildTraceInsightPrompt,
|
|
24
|
+
defaultTraceInsightPanel,
|
|
25
|
+
describeTraceInsightScope,
|
|
26
|
+
domainEvidencePattern,
|
|
27
|
+
exportRunAsOtlp,
|
|
28
|
+
groupBy,
|
|
29
|
+
inferDomainKeywords,
|
|
30
|
+
isJudgeSpan,
|
|
31
|
+
isLlmSpan,
|
|
32
|
+
isRetrievalSpan,
|
|
33
|
+
isSandboxSpan,
|
|
34
|
+
isToolSpan,
|
|
35
|
+
judgeSpans,
|
|
36
|
+
llmSpans,
|
|
37
|
+
planTraceInsightQuestions,
|
|
38
|
+
redactString,
|
|
39
|
+
redactValue,
|
|
40
|
+
runFailureClass,
|
|
41
|
+
runsForScenario,
|
|
42
|
+
scoreTraceInsightReadiness,
|
|
43
|
+
tokenizeDomainWords,
|
|
44
|
+
toolSpans,
|
|
45
|
+
traceAnalystFunctionGroup
|
|
46
|
+
} from "./chunk-KWUAAIHR.js";
|
|
47
|
+
import {
|
|
48
|
+
TraceEmitter,
|
|
49
|
+
llmSpanFromProvider
|
|
50
|
+
} from "./chunk-PKCVBYTQ.js";
|
|
51
|
+
import "./chunk-PZ5AY32C.js";
|
|
52
|
+
export {
|
|
53
|
+
DEFAULT_REDACTION_RULES,
|
|
54
|
+
DEFAULT_TRACE_ANALYST_BUDGETS,
|
|
55
|
+
FAILURE_CLASSES,
|
|
56
|
+
FileSystemTraceStore,
|
|
57
|
+
InMemoryTraceStore,
|
|
58
|
+
OTEL_AGENT_EVAL_SCOPE,
|
|
59
|
+
OtlpFileTraceStore,
|
|
60
|
+
REDACTION_VERSION,
|
|
61
|
+
SpanNotFoundError,
|
|
62
|
+
TRACE_ANALYST_ACTOR_DESCRIPTION,
|
|
63
|
+
TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION,
|
|
64
|
+
TRACE_ANALYST_SUBAGENT_DESCRIPTION,
|
|
65
|
+
TRACE_ANALYST_TRUNCATION_MARKER_PREFIX,
|
|
66
|
+
TRACE_SCHEMA_VERSION,
|
|
67
|
+
TraceEmitter,
|
|
68
|
+
TraceFileMissingError,
|
|
69
|
+
TraceNotFoundError,
|
|
70
|
+
aggregateLlm,
|
|
71
|
+
analyzeTraces,
|
|
72
|
+
argHash,
|
|
73
|
+
buildTraceAnalystTools,
|
|
74
|
+
buildTraceInsightContext,
|
|
75
|
+
buildTraceInsightPrompt,
|
|
76
|
+
defaultTraceInsightPanel,
|
|
77
|
+
describeTraceInsightScope,
|
|
78
|
+
domainEvidencePattern,
|
|
79
|
+
exportRunAsOtlp,
|
|
80
|
+
groupBy,
|
|
81
|
+
inferDomainKeywords,
|
|
82
|
+
isJudgeSpan,
|
|
83
|
+
isLlmSpan,
|
|
84
|
+
isRetrievalSpan,
|
|
85
|
+
isSandboxSpan,
|
|
86
|
+
isToolSpan,
|
|
87
|
+
judgeSpans,
|
|
88
|
+
llmSpanFromProvider,
|
|
89
|
+
llmSpans,
|
|
90
|
+
planTraceInsightQuestions,
|
|
91
|
+
redactString,
|
|
92
|
+
redactValue,
|
|
93
|
+
runFailureClass,
|
|
94
|
+
runsForScenario,
|
|
95
|
+
scoreTraceInsightReadiness,
|
|
96
|
+
tokenizeDomainWords,
|
|
97
|
+
toolSpans,
|
|
98
|
+
traceAnalystFunctionGroup
|
|
99
|
+
};
|
|
100
|
+
//# sourceMappingURL=traces.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":[],"sourcesContent":[],"mappings":"","names":[]}
|
package/dist/wire/index.js
CHANGED
|
@@ -24,8 +24,8 @@ import {
|
|
|
24
24
|
runRpcBatch,
|
|
25
25
|
runRpcOnce,
|
|
26
26
|
startServer
|
|
27
|
-
} from "../chunk-LSR4IAYN.js";
|
|
28
|
-
import "../chunk-JAOLXRIA.js";
|
|
27
|
+
} from "../chunk-HNJLMAJ2.js";
|
|
28
|
+
import "../chunk-75MCTH7P.js";
|
|
29
29
|
import "../chunk-PZ5AY32C.js";
|
|
30
30
|
export {
|
|
31
31
|
BUILTIN_RUBRICS,
|
package/docs/concepts.md
CHANGED
|
@@ -1,14 +1,15 @@
|
|
|
1
1
|
# Concepts
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
`agent-eval` is for deciding whether an agent run should pass, keep working, be
|
|
4
|
+
replayed, be optimized, or be promoted.
|
|
4
5
|
|
|
5
|
-
|
|
6
|
+
It exists because agent output is not evidence. A model can say a task is done
|
|
7
|
+
while the build fails, the browser flow is broken, the integration was never
|
|
8
|
+
connected, or the answer lacks required sources. The package gives products a
|
|
9
|
+
shared way to record runs, check outcomes, classify failures, compare variants,
|
|
10
|
+
and make release decisions.
|
|
6
11
|
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
It exists because LLMs lie about whether they succeeded. A model will say "Done!" and ship code that doesn't compile. agent-eval is the layer between the model's output and your decision to ship.
|
|
10
|
-
|
|
11
|
-
## The three things you'll touch most
|
|
12
|
+
## Main Objects
|
|
12
13
|
|
|
13
14
|
| Thing | What it is | One-line example |
|
|
14
15
|
|---|---|---|
|
|
@@ -17,7 +18,8 @@ It exists because LLMs lie about whether they succeeded. A model will say "Done!
|
|
|
17
18
|
| **Verifier** | A pipeline of judges run in order, with dependencies. | "install → typecheck → build → semantic" |
|
|
18
19
|
| **Feedback trajectory** | A multi-shot record of attempts, approvals, rejections, edits, metrics, and policy outcomes. | "draft → user rejects → revised draft → approved → measured" |
|
|
19
20
|
|
|
20
|
-
|
|
21
|
+
Everything else exists to make those objects useful in real product loops:
|
|
22
|
+
traces, datasets, control runtime, optimizers, statistics, and reports.
|
|
21
23
|
|
|
22
24
|
When the thing being evaluated is an agent that should keep working, use
|
|
23
25
|
[`runAgentControlLoop`](./control-runtime.md). It turns validators into a
|
|
@@ -62,7 +64,7 @@ shape stays the same.
|
|
|
62
64
|
Those trajectories can be converted into preference memory, `DatasetScenario`
|
|
63
65
|
rows, optimizer rows, and held-out examples for overfit checks.
|
|
64
66
|
|
|
65
|
-
##
|
|
67
|
+
## Code Generator Eval
|
|
66
68
|
|
|
67
69
|
When the artifact is generated code, agent-eval scores it at three independent layers. Each layer fails differently, and you want to know which one broke:
|
|
68
70
|
|
|
@@ -125,7 +127,7 @@ Two rules that will save you bugs:
|
|
|
125
127
|
|
|
126
128
|
2. **Pair LLM judges with build outcomes.** An LLM judge will rate non-compiling code as "looks right" (0.8). Always short-circuit on `buildOutcome.passed === false` before any LLM judging.
|
|
127
129
|
|
|
128
|
-
##
|
|
130
|
+
## Trace Model
|
|
129
131
|
|
|
130
132
|
Every operation emits structured spans into a `TraceStore`. A run is a tree:
|
|
131
133
|
|
|
@@ -142,7 +144,10 @@ builder-session [span]
|
|
|
142
144
|
|
|
143
145
|
Spans are append-only and have stable ids — replay is reading the same store back. OTLP export ships them out for distributed tracing.
|
|
144
146
|
|
|
145
|
-
You
|
|
147
|
+
You usually should not build this tree by hand. Product runtimes,
|
|
148
|
+
`runAgentControlLoop`, harnesses, and verifiers should emit it while they run.
|
|
149
|
+
Use traces when debugging a flaky run, building replay data, or explaining a
|
|
150
|
+
release decision.
|
|
146
151
|
|
|
147
152
|
## Where to go next
|
|
148
153
|
|
package/docs/feature-guide.md
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
This page explains the main `agent-eval` primitives in plain English first,
|
|
4
4
|
then shows when to use each one.
|
|
5
5
|
|
|
6
|
-
##
|
|
6
|
+
## Overview
|
|
7
7
|
|
|
8
8
|
LLM agents can write code, drafts, research, plans, and actions. The hard part
|
|
9
9
|
is knowing whether they actually did a good job, whether they should keep
|
|
@@ -41,7 +41,7 @@ trying, and whether a change made them better or worse.
|
|
|
41
41
|
|
|
42
42
|
## Integration Patterns
|
|
43
43
|
|
|
44
|
-
### Recommended
|
|
44
|
+
### Recommended Product Shape
|
|
45
45
|
|
|
46
46
|
Use this shape when the product needs to keep pushing work forward instead of
|
|
47
47
|
only answering once:
|
|
@@ -175,21 +175,6 @@ Store as `FeedbackTrajectory`, then derive:
|
|
|
175
175
|
logs, screenshots, or browser state. Use separate sandboxes for parallel
|
|
176
176
|
variants or destructive checks.
|
|
177
177
|
|
|
178
|
-
## Same-Sandbox Example
|
|
179
|
-
|
|
180
|
-
`examples/same-sandbox-harness/` shows the common coding/browser pattern:
|
|
181
|
-
|
|
182
|
-
```text
|
|
183
|
-
one sandbox/workdir -> install/build/test -> inspect evidence -> emit judge span
|
|
184
|
-
```
|
|
185
|
-
|
|
186
|
-
Use this when a judge needs evidence produced by earlier harness phases. Use
|
|
187
|
-
isolated sandboxes when variants run in parallel or a phase can corrupt the
|
|
188
|
-
workspace.
|
|
189
|
-
- Treat telemetry as evidence, not control flow. A trace sink outage should be
|
|
190
|
-
visible in `runtimeErrors`, but it should not stop the worker from completing
|
|
191
|
-
the user task.
|
|
192
|
-
|
|
193
178
|
## Highest-ROI Adoption Order
|
|
194
179
|
|
|
195
180
|
1. Wrap one real product workflow in `runAgentControlLoop`.
|
|
@@ -211,3 +196,11 @@ reusable:
|
|
|
211
196
|
|
|
212
197
|
Core should provide shapes, stores, runners, scoring, traces, and converters.
|
|
213
198
|
Downstream integrations provide domain state, policy, tools, and storage.
|
|
199
|
+
|
|
200
|
+
## Examples
|
|
201
|
+
|
|
202
|
+
- `examples/same-sandbox-harness`: one workdir for install/build/test plus
|
|
203
|
+
evidence inspection.
|
|
204
|
+
- `examples/multi-shot-optimization`: full-trajectory optimization with a
|
|
205
|
+
holdout gate.
|
|
206
|
+
- `examples/benchmarks`: benchmark adapter contracts and reference wrappers.
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
# Integration Launch Gates
|
|
2
|
+
|
|
3
|
+
Use these gates when a product lets generated apps or agents use user-owned
|
|
4
|
+
connections through an integration hub.
|
|
5
|
+
|
|
6
|
+
The eval should wrap the real product path:
|
|
7
|
+
|
|
8
|
+
```txt
|
|
9
|
+
user prompt
|
|
10
|
+
-> product emits IntegrationManifest
|
|
11
|
+
-> platform resolves connections and grants
|
|
12
|
+
-> sandbox receives capability bundle
|
|
13
|
+
-> generated app invokes integration action
|
|
14
|
+
-> platform enforces policy, approval, idempotency, audit
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
## Deterministic Gates
|
|
18
|
+
|
|
19
|
+
- The generated app declares an integration manifest before sandbox launch.
|
|
20
|
+
- Manifest validation passes.
|
|
21
|
+
- Required connections and scopes are present before execution.
|
|
22
|
+
- Sandbox environment contains a capability bundle, not raw provider tokens.
|
|
23
|
+
- Reads invoke through the platform bridge.
|
|
24
|
+
- Writes return `approval_required` unless product policy explicitly allows
|
|
25
|
+
them.
|
|
26
|
+
- Approved writes are bound to the same action, input hash, connection, and
|
|
27
|
+
subject.
|
|
28
|
+
- Revoked grants or expired capabilities stop invocation.
|
|
29
|
+
- Resumed or long-running sandboxes receive a refreshed bundle before expiry.
|
|
30
|
+
- Audit includes grant creation, capability issue, invoke success/failure,
|
|
31
|
+
approval resolution, and revoke events.
|
|
32
|
+
|
|
33
|
+
## Failure Classes
|
|
34
|
+
|
|
35
|
+
`agent-eval` classifies integration failures separately from prompt/tool
|
|
36
|
+
failures:
|
|
37
|
+
|
|
38
|
+
- `bad_integration_manifest`
|
|
39
|
+
- `missing_integration_connection`
|
|
40
|
+
- `missing_integration_scope`
|
|
41
|
+
- `integration_approval_required`
|
|
42
|
+
- `integration_auth_expired`
|
|
43
|
+
- `integration_provider_failure`
|
|
44
|
+
- `unsafe_integration_write_denied`
|
|
45
|
+
|
|
46
|
+
Use the helper payload builders and eval builders so products emit the same
|
|
47
|
+
trace evidence:
|
|
48
|
+
|
|
49
|
+
```ts
|
|
50
|
+
const gate = {
|
|
51
|
+
connectorId: 'google-calendar',
|
|
52
|
+
actionId: 'events.create',
|
|
53
|
+
valid: true,
|
|
54
|
+
missingConnections: [],
|
|
55
|
+
missingScopes: ['calendar.events.write'],
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
const evals = integrationGateEvals(gate)
|
|
59
|
+
|
|
60
|
+
await emitter.emit({
|
|
61
|
+
kind: 'custom',
|
|
62
|
+
payload: integrationManifestResolvedPayload(gate),
|
|
63
|
+
})
|
|
64
|
+
|
|
65
|
+
await emitter.emit({
|
|
66
|
+
kind: 'custom',
|
|
67
|
+
payload: integrationInvokeFailedPayload({
|
|
68
|
+
connectorId: 'google-calendar',
|
|
69
|
+
actionId: 'events.create',
|
|
70
|
+
code: 'scope_denied',
|
|
71
|
+
message: 'calendar.events.write was not granted',
|
|
72
|
+
}),
|
|
73
|
+
})
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
The classifier then reports the real missing surface instead of burying the
|
|
77
|
+
failure under `tool_recovery_failure` or `unknown`.
|
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
# Product Eval Adoption
|
|
2
|
+
|
|
3
|
+
This guide is for teams adding `@tangle-network/agent-eval` to a real agent
|
|
4
|
+
product. The package supplies evaluation contracts and runtime primitives. Your
|
|
5
|
+
product supplies the actual workflow adapter, state, credentials, tools, UI, and
|
|
6
|
+
storage.
|
|
7
|
+
|
|
8
|
+
## Goal
|
|
9
|
+
|
|
10
|
+
Use the same loop for production, replay, and optimization:
|
|
11
|
+
|
|
12
|
+
```txt
|
|
13
|
+
real user task
|
|
14
|
+
-> product adapter observes state
|
|
15
|
+
-> validators and judges grade state
|
|
16
|
+
-> control loop decides next action
|
|
17
|
+
-> product agent acts in the real environment
|
|
18
|
+
-> trace + feedback trajectory are stored
|
|
19
|
+
-> datasets and optimizers replay the same adapter
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
If production and eval use different loops, benchmark gains will not transfer.
|
|
23
|
+
|
|
24
|
+
## What The Product Owns
|
|
25
|
+
|
|
26
|
+
The product owns:
|
|
27
|
+
|
|
28
|
+
- task state and domain models
|
|
29
|
+
- credentials, tenant policy, approval, and side-effect rules
|
|
30
|
+
- browser, sandbox, CLI, connector, or voice drivers
|
|
31
|
+
- database and trace persistence
|
|
32
|
+
- user/reviewer feedback collection
|
|
33
|
+
- deployment and live canary routing
|
|
34
|
+
- model gateway configuration
|
|
35
|
+
|
|
36
|
+
`agent-eval` owns:
|
|
37
|
+
|
|
38
|
+
- trace, run, dataset, feedback, and score contracts
|
|
39
|
+
- control-loop mechanics
|
|
40
|
+
- verifier and judge orchestration
|
|
41
|
+
- failure taxonomy
|
|
42
|
+
- paired statistics and holdout gates
|
|
43
|
+
- optimizer inputs and promotion reports
|
|
44
|
+
|
|
45
|
+
## Minimal Production Adapter
|
|
46
|
+
|
|
47
|
+
Start with a small adapter that mirrors one real workflow.
|
|
48
|
+
|
|
49
|
+
```ts
|
|
50
|
+
interface ProductEvalAdapter<TState, TAction> {
|
|
51
|
+
observe(taskId: string): Promise<TState>
|
|
52
|
+
validate(state: TState): Promise<ControlEvalResult[]>
|
|
53
|
+
decide(input: {
|
|
54
|
+
state: TState
|
|
55
|
+
evals: ControlEvalResult[]
|
|
56
|
+
history: unknown[]
|
|
57
|
+
}): Promise<TAction | 'stop'>
|
|
58
|
+
act(taskId: string, action: TAction): Promise<void>
|
|
59
|
+
}
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
Keep the adapter product-owned until at least two products need the same shape.
|
|
63
|
+
|
|
64
|
+
## Validator Order
|
|
65
|
+
|
|
66
|
+
Use deterministic checks before judges.
|
|
67
|
+
|
|
68
|
+
1. **State validity**: schema, required files, required DB rows, required
|
|
69
|
+
connections.
|
|
70
|
+
2. **Runtime gates**: install, build, typecheck, tests, serve, deploy smoke.
|
|
71
|
+
3. **Policy gates**: approvals, side effects, budget, credentials, data
|
|
72
|
+
freshness.
|
|
73
|
+
4. **Behavior gates**: browser flows, API calls, generated app preview, voice
|
|
74
|
+
transcript checks.
|
|
75
|
+
5. **Semantic judges**: intent fit, quality, completeness, safety,
|
|
76
|
+
professional correctness.
|
|
77
|
+
|
|
78
|
+
Semantic judges should never turn a failed build into a pass.
|
|
79
|
+
|
|
80
|
+
## Traces And Feedback
|
|
81
|
+
|
|
82
|
+
Every serious run should record:
|
|
83
|
+
|
|
84
|
+
- task id and scenario id
|
|
85
|
+
- git commit
|
|
86
|
+
- model and provider
|
|
87
|
+
- prompt/config hashes
|
|
88
|
+
- tool calls and retrieval spans
|
|
89
|
+
- build/test/deploy output
|
|
90
|
+
- cost, latency, and token use
|
|
91
|
+
- user/reviewer feedback
|
|
92
|
+
- final outcome and failure class
|
|
93
|
+
|
|
94
|
+
Convert runs into `FeedbackTrajectory` records so normal product usage becomes
|
|
95
|
+
replayable eval data.
|
|
96
|
+
|
|
97
|
+
```txt
|
|
98
|
+
production run -> feedback trajectory -> dataset scenario -> optimizer row
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
For promotion-grade runs, also project the completed control result into a
|
|
102
|
+
strict `RunRecord`:
|
|
103
|
+
|
|
104
|
+
```ts
|
|
105
|
+
const record = controlRunToRunRecord(controlResult, {
|
|
106
|
+
experimentId,
|
|
107
|
+
candidateId,
|
|
108
|
+
seed,
|
|
109
|
+
model: 'gpt-4o-2024-11-20',
|
|
110
|
+
promptHash,
|
|
111
|
+
configHash,
|
|
112
|
+
commitSha,
|
|
113
|
+
splitTag: 'holdout',
|
|
114
|
+
tokenUsage,
|
|
115
|
+
})
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
## Datasets And Holdouts
|
|
119
|
+
|
|
120
|
+
Use four splits:
|
|
121
|
+
|
|
122
|
+
- `train`: optimizer search.
|
|
123
|
+
- `dev`: tuning and threshold selection.
|
|
124
|
+
- `test`: normal reporting.
|
|
125
|
+
- `holdout`: promotion-only gate.
|
|
126
|
+
|
|
127
|
+
The low-level `RunRecord` schema uses `search | dev | holdout`; map `train`
|
|
128
|
+
and normal non-holdout test/report rows to `search` when producing promotion
|
|
129
|
+
tables.
|
|
130
|
+
|
|
131
|
+
Do not inspect or tune against holdout failures during optimization. If a
|
|
132
|
+
holdout failure reveals a real product bug, fix the bug and rotate the holdout
|
|
133
|
+
set with a signed note.
|
|
134
|
+
|
|
135
|
+
## Optimization
|
|
136
|
+
|
|
137
|
+
Use `runMultiShotOptimization()` when the system is a multi-step agent, not a
|
|
138
|
+
single prompt.
|
|
139
|
+
|
|
140
|
+
Good optimization targets:
|
|
141
|
+
|
|
142
|
+
- system prompt
|
|
143
|
+
- tool descriptions
|
|
144
|
+
- retrieval policy
|
|
145
|
+
- data acquisition policy
|
|
146
|
+
- user-question policy
|
|
147
|
+
- evaluator threshold
|
|
148
|
+
- agent topology
|
|
149
|
+
- scaffold/template choice
|
|
150
|
+
|
|
151
|
+
Bad optimization targets:
|
|
152
|
+
|
|
153
|
+
- hidden holdout examples
|
|
154
|
+
- production credentials
|
|
155
|
+
- brittle string checks that do not match user value
|
|
156
|
+
- fake workflows that do not call the product adapter
|
|
157
|
+
|
|
158
|
+
Use actionable side information so the optimizer knows whether a failure belongs
|
|
159
|
+
to prompt, tools, retrieval, data acquisition, sandbox, evaluator, or product
|
|
160
|
+
runtime.
|
|
161
|
+
|
|
162
|
+
## Release Gate
|
|
163
|
+
|
|
164
|
+
A launch or promotion should require:
|
|
165
|
+
|
|
166
|
+
- enough runs for the target risk level
|
|
167
|
+
- paired improvement over the current baseline
|
|
168
|
+
- no critical regression on test
|
|
169
|
+
- holdout pass or explicit rejection
|
|
170
|
+
- cost and latency within budget
|
|
171
|
+
- no unresolved canary or contamination failures
|
|
172
|
+
- trace evidence for representative successes and failures
|
|
173
|
+
- TraceAnalyst findings for failure-heavy or regression-heavy corpora
|
|
174
|
+
- human-readable report with failure clusters and next actions
|
|
175
|
+
|
|
176
|
+
`evaluateReleaseConfidence()` and the paired statistics helpers provide the
|
|
177
|
+
decision data. The product decides the business threshold.
|
|
178
|
+
|
|
179
|
+
## Product Patterns
|
|
180
|
+
|
|
181
|
+
### Coding Or Builder Agent
|
|
182
|
+
|
|
183
|
+
Use sandbox/build/test/serve/browser validators. Add intent and semantic
|
|
184
|
+
concept judges only after the generated app runs.
|
|
185
|
+
|
|
186
|
+
### Browser Agent
|
|
187
|
+
|
|
188
|
+
Record browser steps, screenshots, network errors, console errors, and final
|
|
189
|
+
state. Use deterministic DOM/API assertions before visual or semantic judges.
|
|
190
|
+
|
|
191
|
+
### Domain Agent
|
|
192
|
+
|
|
193
|
+
Use domain fixtures, jurisdiction/date metadata, retrieval spans, and
|
|
194
|
+
professional judges. Fail missing/stale evidence separately from bad reasoning.
|
|
195
|
+
|
|
196
|
+
### Workflow Or Integration Agent
|
|
197
|
+
|
|
198
|
+
Use `@tangle-network/agent-integrations` manifests as readiness inputs. Gate
|
|
199
|
+
missing connections, missing scopes, approval-required writes, and stale tokens
|
|
200
|
+
before blaming the agent prompt.
|
|
201
|
+
|
|
202
|
+
For generated apps and sandbox agents, also run the
|
|
203
|
+
[Integration Launch Gates](./integration-launch-gates.md). The eval should prove
|
|
204
|
+
that app code invokes through the integration bridge, not provider SDKs with raw
|
|
205
|
+
OAuth tokens.
|
|
206
|
+
|
|
207
|
+
### Voice Agent
|
|
208
|
+
|
|
209
|
+
Record transcript, timing, interruptions, tool calls, and task outcome. Judge
|
|
210
|
+
conversation quality separately from tool success and policy compliance.
|
|
211
|
+
|
|
212
|
+
## Anti-Patterns
|
|
213
|
+
|
|
214
|
+
- Evaluating only final prose for an agent that actually builds, browses, or
|
|
215
|
+
calls tools.
|
|
216
|
+
- Letting an LLM judge override failed tests.
|
|
217
|
+
- Optimizing on examples that users will never hit.
|
|
218
|
+
- Recording traces as logs but never converting them to datasets.
|
|
219
|
+
- Calling every failure a prompt failure when context, data, auth, or runtime
|
|
220
|
+
readiness was missing.
|
|
221
|
+
- Shipping reports without run ids, commits, model ids, or evidence links.
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
# Trace Analysis
|
|
2
|
+
|
|
3
|
+
Trace analysis is the bridge between raw product telemetry and useful eval work.
|
|
4
|
+
|
|
5
|
+
```txt
|
|
6
|
+
live product run
|
|
7
|
+
-> TraceEmitter / TraceStore
|
|
8
|
+
-> TraceAnalyst investigates trace corpora
|
|
9
|
+
-> findings become ASI, failures, replay cases, and release actions
|
|
10
|
+
```
|
|
11
|
+
|
|
12
|
+
## When To Use TraceAnalyst
|
|
13
|
+
|
|
14
|
+
Use `TraceAnalyst` when you have more than a few traces and need to answer:
|
|
15
|
+
|
|
16
|
+
- which failure modes are recurring?
|
|
17
|
+
- which spans explain a regression?
|
|
18
|
+
- did retrieval, integrations, sandbox, or policy block the run?
|
|
19
|
+
- are failed runs missing evidence that the optimizer needs?
|
|
20
|
+
- which product surfaces deserve the next fix?
|
|
21
|
+
|
|
22
|
+
Use summary tables and release confidence for promotion decisions. Use
|
|
23
|
+
TraceAnalyst to explain the evidence behind those decisions.
|
|
24
|
+
|
|
25
|
+
## Minimal Flow
|
|
26
|
+
|
|
27
|
+
```ts
|
|
28
|
+
import {
|
|
29
|
+
OtlpFileTraceStore,
|
|
30
|
+
analyzeTraces,
|
|
31
|
+
} from '@tangle-network/agent-eval'
|
|
32
|
+
|
|
33
|
+
const result = await analyzeTraces({
|
|
34
|
+
question: 'Why did app-runtime holdout runs fail this week?',
|
|
35
|
+
}, {
|
|
36
|
+
source: new OtlpFileTraceStore({ path: 'traces/otlp.jsonl' }),
|
|
37
|
+
ai,
|
|
38
|
+
model: 'gpt-4o-2024-11-20',
|
|
39
|
+
})
|
|
40
|
+
|
|
41
|
+
console.log(result.findings)
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
Products can pass any `TraceAnalysisStore`; they do not need to use the file
|
|
45
|
+
store in production.
|
|
46
|
+
|
|
47
|
+
## Required Trace Shape
|
|
48
|
+
|
|
49
|
+
Every serious product run should include:
|
|
50
|
+
|
|
51
|
+
- `runId`, `projectId`, `scenarioId`, `variantId`, and `layer`
|
|
52
|
+
- commit, prompt hash, config hash, model fingerprint, and dataset version
|
|
53
|
+
- LLM spans with model, inputs, outputs, token counts, and cost
|
|
54
|
+
- tool/integration spans with arguments, result summaries, and error codes
|
|
55
|
+
- retrieval spans with query, source ids, hit scores, and freshness metadata
|
|
56
|
+
- sandbox/build/test/deploy spans with exit codes and log artifacts
|
|
57
|
+
- custom events for knowledge readiness and integration gates
|
|
58
|
+
- final run outcome with pass/score/failure class
|
|
59
|
+
|
|
60
|
+
Do not put secrets, raw OAuth tokens, or unredacted PII in traces.
|
|
61
|
+
|
|
62
|
+
## Product Loop
|
|
63
|
+
|
|
64
|
+
The product loop should not treat traces as a separate debug dump. The intended
|
|
65
|
+
path is:
|
|
66
|
+
|
|
67
|
+
1. Wrap the real workflow in `runAgentControlLoop` or the product runtime.
|
|
68
|
+
2. Emit canonical spans/events while the user task runs.
|
|
69
|
+
3. Convert the completed run to `FeedbackTrajectory` for replay.
|
|
70
|
+
4. Convert promotion-grade runs to `RunRecord` with `controlRunToRunRecord`.
|
|
71
|
+
5. Run TraceAnalyst over failure-heavy trace sets.
|
|
72
|
+
6. Feed findings into `ActionableSideInfo`, failure clusters, and release
|
|
73
|
+
reports.
|
|
74
|
+
|
|
75
|
+
That makes normal product usage become eval data instead of isolated logs.
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@tangle-network/agent-eval",
|
|
3
|
-
"version": "0.20.10",
|
|
3
|
+
"version": "0.20.12",
|
|
4
4
|
"description": "Trace-first evaluation infrastructure for agent systems: traces, harnesses, verifier pipelines, judges, datasets, gates, optimization, and reporting.",
|
|
5
5
|
"homepage": "https://github.com/tangle-network/agent-eval#readme",
|
|
6
6
|
"repository": {
|
|
@@ -19,6 +19,26 @@
|
|
|
19
19
|
"import": "./dist/index.js",
|
|
20
20
|
"default": "./dist/index.js"
|
|
21
21
|
},
|
|
22
|
+
"./control": {
|
|
23
|
+
"types": "./dist/control.d.ts",
|
|
24
|
+
"import": "./dist/control.js",
|
|
25
|
+
"default": "./dist/control.js"
|
|
26
|
+
},
|
|
27
|
+
"./optimization": {
|
|
28
|
+
"types": "./dist/optimization.d.ts",
|
|
29
|
+
"import": "./dist/optimization.js",
|
|
30
|
+
"default": "./dist/optimization.js"
|
|
31
|
+
},
|
|
32
|
+
"./reporting": {
|
|
33
|
+
"types": "./dist/reporting.d.ts",
|
|
34
|
+
"import": "./dist/reporting.js",
|
|
35
|
+
"default": "./dist/reporting.js"
|
|
36
|
+
},
|
|
37
|
+
"./traces": {
|
|
38
|
+
"types": "./dist/traces.d.ts",
|
|
39
|
+
"import": "./dist/traces.js",
|
|
40
|
+
"default": "./dist/traces.js"
|
|
41
|
+
},
|
|
22
42
|
"./telemetry": {
|
|
23
43
|
"types": "./dist/telemetry/index.d.ts",
|
|
24
44
|
"import": "./dist/telemetry/index.js",
|