@tangle-network/agent-eval 0.20.10 → 0.20.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. package/README.md +129 -126
  2. package/dist/benchmarks/index.d.ts +2 -1
  3. package/dist/{chunk-JAOLXRIA.js → chunk-75MCTH7P.js} +8 -2
  4. package/dist/chunk-75MCTH7P.js.map +1 -0
  5. package/dist/chunk-HKYRWNHV.js +1354 -0
  6. package/dist/chunk-HKYRWNHV.js.map +1 -0
  7. package/dist/{chunk-LSR4IAYN.js → chunk-HNJLMAJ2.js} +2 -2
  8. package/dist/chunk-IKFVX537.js +717 -0
  9. package/dist/chunk-IKFVX537.js.map +1 -0
  10. package/dist/chunk-KWUAAIHR.js +1764 -0
  11. package/dist/chunk-KWUAAIHR.js.map +1 -0
  12. package/dist/chunk-MCMV7DUL.js +1310 -0
  13. package/dist/chunk-MCMV7DUL.js.map +1 -0
  14. package/dist/chunk-ODFINDLQ.js +413 -0
  15. package/dist/chunk-ODFINDLQ.js.map +1 -0
  16. package/dist/chunk-PKCVBYTQ.js +200 -0
  17. package/dist/chunk-PKCVBYTQ.js.map +1 -0
  18. package/dist/chunk-YUFXO3TU.js +148 -0
  19. package/dist/chunk-YUFXO3TU.js.map +1 -0
  20. package/dist/cli.js +2 -2
  21. package/dist/control-C8NKbF3w.d.ts +258 -0
  22. package/dist/control.d.ts +5 -0
  23. package/dist/control.js +30 -0
  24. package/dist/control.js.map +1 -0
  25. package/dist/dataset-B9qvlm_o.d.ts +112 -0
  26. package/dist/emitter-BYO2nSDA.d.ts +387 -0
  27. package/dist/feedback-trajectory-BGQ_ANCN.d.ts +345 -0
  28. package/dist/{index-1PZOtZFr.d.ts → index-c5saLbKD.d.ts} +2 -133
  29. package/dist/index.d.ts +115 -2870
  30. package/dist/index.js +1049 -6156
  31. package/dist/index.js.map +1 -1
  32. package/dist/multi-shot-optimization-Bvtz294B.d.ts +598 -0
  33. package/dist/openapi.json +1 -1
  34. package/dist/optimization.d.ts +145 -0
  35. package/dist/optimization.js +60 -0
  36. package/dist/optimization.js.map +1 -0
  37. package/dist/reporting.d.ts +426 -0
  38. package/dist/reporting.js +32 -0
  39. package/dist/reporting.js.map +1 -0
  40. package/dist/run-record-CX_jcAyr.d.ts +134 -0
  41. package/dist/traces.d.ts +658 -0
  42. package/dist/traces.js +100 -0
  43. package/dist/traces.js.map +1 -0
  44. package/dist/wire/index.js +2 -2
  45. package/docs/concepts.md +16 -11
  46. package/docs/feature-guide.md +10 -17
  47. package/docs/integration-launch-gates.md +77 -0
  48. package/docs/product-eval-adoption.md +221 -0
  49. package/docs/trace-analysis.md +75 -0
  50. package/package.json +21 -1
  51. package/dist/chunk-JAOLXRIA.js.map +0 -1
  52. package/dist/{chunk-LSR4IAYN.js.map → chunk-HNJLMAJ2.js.map} +0 -0
package/dist/traces.js ADDED
@@ -0,0 +1,100 @@
1
+ import {
2
+ DEFAULT_REDACTION_RULES,
3
+ DEFAULT_TRACE_ANALYST_BUDGETS,
4
+ FAILURE_CLASSES,
5
+ FileSystemTraceStore,
6
+ InMemoryTraceStore,
7
+ OTEL_AGENT_EVAL_SCOPE,
8
+ OtlpFileTraceStore,
9
+ REDACTION_VERSION,
10
+ SpanNotFoundError,
11
+ TRACE_ANALYST_ACTOR_DESCRIPTION,
12
+ TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION,
13
+ TRACE_ANALYST_SUBAGENT_DESCRIPTION,
14
+ TRACE_ANALYST_TRUNCATION_MARKER_PREFIX,
15
+ TRACE_SCHEMA_VERSION,
16
+ TraceFileMissingError,
17
+ TraceNotFoundError,
18
+ aggregateLlm,
19
+ analyzeTraces,
20
+ argHash,
21
+ buildTraceAnalystTools,
22
+ buildTraceInsightContext,
23
+ buildTraceInsightPrompt,
24
+ defaultTraceInsightPanel,
25
+ describeTraceInsightScope,
26
+ domainEvidencePattern,
27
+ exportRunAsOtlp,
28
+ groupBy,
29
+ inferDomainKeywords,
30
+ isJudgeSpan,
31
+ isLlmSpan,
32
+ isRetrievalSpan,
33
+ isSandboxSpan,
34
+ isToolSpan,
35
+ judgeSpans,
36
+ llmSpans,
37
+ planTraceInsightQuestions,
38
+ redactString,
39
+ redactValue,
40
+ runFailureClass,
41
+ runsForScenario,
42
+ scoreTraceInsightReadiness,
43
+ tokenizeDomainWords,
44
+ toolSpans,
45
+ traceAnalystFunctionGroup
46
+ } from "./chunk-KWUAAIHR.js";
47
+ import {
48
+ TraceEmitter,
49
+ llmSpanFromProvider
50
+ } from "./chunk-PKCVBYTQ.js";
51
+ import "./chunk-PZ5AY32C.js";
52
+ export {
53
+ DEFAULT_REDACTION_RULES,
54
+ DEFAULT_TRACE_ANALYST_BUDGETS,
55
+ FAILURE_CLASSES,
56
+ FileSystemTraceStore,
57
+ InMemoryTraceStore,
58
+ OTEL_AGENT_EVAL_SCOPE,
59
+ OtlpFileTraceStore,
60
+ REDACTION_VERSION,
61
+ SpanNotFoundError,
62
+ TRACE_ANALYST_ACTOR_DESCRIPTION,
63
+ TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION,
64
+ TRACE_ANALYST_SUBAGENT_DESCRIPTION,
65
+ TRACE_ANALYST_TRUNCATION_MARKER_PREFIX,
66
+ TRACE_SCHEMA_VERSION,
67
+ TraceEmitter,
68
+ TraceFileMissingError,
69
+ TraceNotFoundError,
70
+ aggregateLlm,
71
+ analyzeTraces,
72
+ argHash,
73
+ buildTraceAnalystTools,
74
+ buildTraceInsightContext,
75
+ buildTraceInsightPrompt,
76
+ defaultTraceInsightPanel,
77
+ describeTraceInsightScope,
78
+ domainEvidencePattern,
79
+ exportRunAsOtlp,
80
+ groupBy,
81
+ inferDomainKeywords,
82
+ isJudgeSpan,
83
+ isLlmSpan,
84
+ isRetrievalSpan,
85
+ isSandboxSpan,
86
+ isToolSpan,
87
+ judgeSpans,
88
+ llmSpanFromProvider,
89
+ llmSpans,
90
+ planTraceInsightQuestions,
91
+ redactString,
92
+ redactValue,
93
+ runFailureClass,
94
+ runsForScenario,
95
+ scoreTraceInsightReadiness,
96
+ tokenizeDomainWords,
97
+ toolSpans,
98
+ traceAnalystFunctionGroup
99
+ };
100
+ //# sourceMappingURL=traces.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":[],"sourcesContent":[],"mappings":"","names":[]}
@@ -24,8 +24,8 @@ import {
24
24
  runRpcBatch,
25
25
  runRpcOnce,
26
26
  startServer
27
- } from "../chunk-LSR4IAYN.js";
28
- import "../chunk-JAOLXRIA.js";
27
+ } from "../chunk-HNJLMAJ2.js";
28
+ import "../chunk-75MCTH7P.js";
29
29
  import "../chunk-PZ5AY32C.js";
30
30
  export {
31
31
  BUILTIN_RUBRICS,
package/docs/concepts.md CHANGED
@@ -1,14 +1,15 @@
1
1
  # Concepts
2
2
 
3
- Read this once and the rest of agent-eval makes sense.
3
+ `agent-eval` is for deciding whether an agent run should pass, keep working, be
4
+ replayed, be optimized, or be promoted.
4
5
 
5
- ## What is agent-eval?
6
+ It exists because agent output is not evidence. A model can say a task is done
7
+ while the build fails, the browser flow is broken, the integration was never
8
+ connected, or the answer lacks required sources. The package gives products a
9
+ shared way to record runs, check outcomes, classify failures, compare variants,
10
+ and make release decisions.
6
11
 
7
- A library for **deciding whether a code generator or content generator did its job.** You give it a thing the generator produced (a scaffold, a patch, a tweet, a JSON config), and you get back a structured verdict: pass/fail, dimension scores, a reason in plain English.
8
-
9
- It exists because LLMs lie about whether they succeeded. A model will say "Done!" and ship code that doesn't compile. agent-eval is the layer between the model's output and your decision to ship.
10
-
11
- ## The three things you'll touch most
12
+ ## Main Objects
12
13
 
13
14
  | Thing | What it is | One-line example |
14
15
  |---|---|---|
@@ -17,7 +18,8 @@ It exists because LLMs lie about whether they succeeded. A model will say "Done!
17
18
  | **Verifier** | A pipeline of judges run in order, with dependencies. | "install → typecheck → build → semantic" |
18
19
  | **Feedback trajectory** | A multi-shot record of attempts, approvals, rejections, edits, metrics, and policy outcomes. | "draft → user rejects → revised draft → approved → measured" |
19
20
 
20
- That's the whole framework. Everything else (sessions, traces, layers) is plumbing around those three.
21
+ Everything else exists to make those objects useful in real product loops:
22
+ traces, datasets, control runtime, optimizers, statistics, and reports.
21
23
 
22
24
  When the thing being evaluated is an agent that should keep working, use
23
25
  [`runAgentControlLoop`](./control-runtime.md). It turns validators into a
@@ -62,7 +64,7 @@ shape stays the same.
62
64
  Those trajectories can be converted into preference memory, `DatasetScenario`
63
65
  rows, optimizer rows, and held-out examples for overfit checks.
64
66
 
65
- ## The three-layer eval (for code generators)
67
+ ## Code Generator Eval
66
68
 
67
69
  When the artifact is generated code, agent-eval scores it at three independent layers. Each layer fails differently, and you want to know which one broke:
68
70
 
@@ -125,7 +127,7 @@ Two rules that will save you bugs:
125
127
 
126
128
  2. **Pair LLM judges with build outcomes.** An LLM judge will rate non-compiling code as "looks right" (0.8). Always short-circuit on `buildOutcome.passed === false` before any LLM judging.
127
129
 
128
- ## The trace model (skip on first read)
130
+ ## Trace Model
129
131
 
130
132
  Every operation emits structured spans into a `TraceStore`. A run is a tree:
131
133
 
@@ -142,7 +144,10 @@ builder-session [span]
142
144
 
143
145
  Spans are append-only and have stable ids — replay is reading the same store back. OTLP export ships them out for distributed tracing.
144
146
 
145
- You don't need to build the trace tree by hand. `BuilderSession` does it for you. Look at the trace store when you're debugging a flaky run; ignore it otherwise.
147
+ You usually should not build this tree by hand. Product runtimes,
148
+ `runAgentControlLoop`, harnesses, and verifiers should emit it while they run.
149
+ Use traces when debugging a flaky run, building replay data, or explaining a
150
+ release decision.
146
151
 
147
152
  ## Where to go next
148
153
 
@@ -3,7 +3,7 @@
3
3
  This page explains the main `agent-eval` primitives in plain English first,
4
4
  then shows when to use each one.
5
5
 
6
- ## ELI5
6
+ ## Overview
7
7
 
8
8
  LLM agents can write code, drafts, research, plans, and actions. The hard part
9
9
  is knowing whether they actually did a good job, whether they should keep
@@ -41,7 +41,7 @@ trying, and whether a change made them better or worse.
41
41
 
42
42
  ## Integration Patterns
43
43
 
44
- ### Recommended Agent Product Shape
44
+ ### Recommended Product Shape
45
45
 
46
46
  Use this shape when the product needs to keep pushing work forward instead of
47
47
  only answering once:
@@ -175,21 +175,6 @@ Store as `FeedbackTrajectory`, then derive:
175
175
  logs, screenshots, or browser state. Use separate sandboxes for parallel
176
176
  variants or destructive checks.
177
177
 
178
- ## Same-Sandbox Example
179
-
180
- `examples/same-sandbox-harness/` shows the common coding/browser pattern:
181
-
182
- ```text
183
- one sandbox/workdir -> install/build/test -> inspect evidence -> emit judge span
184
- ```
185
-
186
- Use this when a judge needs evidence produced by earlier harness phases. Use
187
- isolated sandboxes when variants run in parallel or a phase can corrupt the
188
- workspace.
189
- - Treat telemetry as evidence, not control flow. A trace sink outage should be
190
- visible in `runtimeErrors`, but it should not stop the worker from completing
191
- the user task.
192
-
193
178
  ## Highest-ROI Adoption Order
194
179
 
195
180
  1. Wrap one real product workflow in `runAgentControlLoop`.
@@ -211,3 +196,11 @@ reusable:
211
196
 
212
197
  Core should provide shapes, stores, runners, scoring, traces, and converters.
213
198
  Downstream integrations provide domain state, policy, tools, and storage.
199
+
200
+ ## Examples
201
+
202
+ - `examples/same-sandbox-harness`: one workdir for install/build/test plus
203
+ evidence inspection.
204
+ - `examples/multi-shot-optimization`: full-trajectory optimization with a
205
+ holdout gate.
206
+ - `examples/benchmarks`: benchmark adapter contracts and reference wrappers.
@@ -0,0 +1,77 @@
1
+ # Integration Launch Gates
2
+
3
+ Use these gates when a product lets generated apps or agents use user-owned
4
+ connections through an integration hub.
5
+
6
+ The eval should wrap the real product path:
7
+
8
+ ```txt
9
+ user prompt
10
+ -> product emits IntegrationManifest
11
+ -> platform resolves connections and grants
12
+ -> sandbox receives capability bundle
13
+ -> generated app invokes integration action
14
+ -> platform enforces policy, approval, idempotency, audit
15
+ ```
16
+
17
+ ## Deterministic Gates
18
+
19
+ - The generated app declares an integration manifest before sandbox launch.
20
+ - Manifest validation passes.
21
+ - Required connections and scopes are present before execution.
22
+ - Sandbox environment contains a capability bundle, not raw provider tokens.
23
+ - Reads invoke through the platform bridge.
24
+ - Writes return `approval_required` unless product policy explicitly allows
25
+ them.
26
+ - Approved writes are bound to the same action, input hash, connection, and
27
+ subject.
28
+ - Revoked grants or expired capabilities stop invocation.
29
+ - Resumed or long-running sandboxes receive a refreshed bundle before expiry.
30
+ - Audit includes grant creation, capability issue, invoke success/failure,
31
+ approval resolution, and revoke events.
32
+
33
+ ## Failure Classes
34
+
35
+ `agent-eval` classifies integration failures separately from prompt/tool
36
+ failures:
37
+
38
+ - `bad_integration_manifest`
39
+ - `missing_integration_connection`
40
+ - `missing_integration_scope`
41
+ - `integration_approval_required`
42
+ - `integration_auth_expired`
43
+ - `integration_provider_failure`
44
+ - `unsafe_integration_write_denied`
45
+
46
+ Use the helper payload builders and eval builders so products emit the same
47
+ trace evidence:
48
+
49
+ ```ts
50
+ const gate = {
51
+ connectorId: 'google-calendar',
52
+ actionId: 'events.create',
53
+ valid: true,
54
+ missingConnections: [],
55
+ missingScopes: ['calendar.events.write'],
56
+ }
57
+
58
+ const evals = integrationGateEvals(gate)
59
+
60
+ await emitter.emit({
61
+ kind: 'custom',
62
+ payload: integrationManifestResolvedPayload(gate),
63
+ })
64
+
65
+ await emitter.emit({
66
+ kind: 'custom',
67
+ payload: integrationInvokeFailedPayload({
68
+ connectorId: 'google-calendar',
69
+ actionId: 'events.create',
70
+ code: 'scope_denied',
71
+ message: 'calendar.events.write was not granted',
72
+ }),
73
+ })
74
+ ```
75
+
76
+ The classifier then reports the real missing surface instead of burying the
77
+ failure under `tool_recovery_failure` or `unknown`.
@@ -0,0 +1,221 @@
1
+ # Product Eval Adoption
2
+
3
+ This guide is for teams adding `@tangle-network/agent-eval` to a real agent
4
+ product. The package supplies evaluation contracts and runtime primitives. Your
5
+ product supplies the actual workflow adapter, state, credentials, tools, UI, and
6
+ storage.
7
+
8
+ ## Goal
9
+
10
+ Use the same loop for production, replay, and optimization:
11
+
12
+ ```txt
13
+ real user task
14
+ -> product adapter observes state
15
+ -> validators and judges grade state
16
+ -> control loop decides next action
17
+ -> product agent acts in the real environment
18
+ -> trace + feedback trajectory are stored
19
+ -> datasets and optimizers replay the same adapter
20
+ ```
21
+
22
+ If production and eval use different loops, benchmark gains will not transfer.
23
+
24
+ ## What The Product Owns
25
+
26
+ The product owns:
27
+
28
+ - task state and domain models
29
+ - credentials, tenant policy, approval, and side-effect rules
30
+ - browser, sandbox, CLI, connector, or voice drivers
31
+ - database and trace persistence
32
+ - user/reviewer feedback collection
33
+ - deployment and live canary routing
34
+ - model gateway configuration
35
+
36
+ `agent-eval` owns:
37
+
38
+ - trace, run, dataset, feedback, and score contracts
39
+ - control-loop mechanics
40
+ - verifier and judge orchestration
41
+ - failure taxonomy
42
+ - paired statistics and holdout gates
43
+ - optimizer inputs and promotion reports
44
+
45
+ ## Minimal Production Adapter
46
+
47
+ Start with a small adapter that mirrors one real workflow.
48
+
49
+ ```ts
50
+ interface ProductEvalAdapter<TState, TAction> {
51
+ observe(taskId: string): Promise<TState>
52
+ validate(state: TState): Promise<ControlEvalResult[]>
53
+ decide(input: {
54
+ state: TState
55
+ evals: ControlEvalResult[]
56
+ history: unknown[]
57
+ }): Promise<TAction | 'stop'>
58
+ act(taskId: string, action: TAction): Promise<void>
59
+ }
60
+ ```
61
+
62
+ Keep the adapter product-owned until at least two products need the same shape.
63
+
64
+ ## Validator Order
65
+
66
+ Use deterministic checks before judges.
67
+
68
+ 1. **State validity**: schema, required files, required DB rows, required
69
+ connections.
70
+ 2. **Runtime gates**: install, build, typecheck, tests, serve, deploy smoke.
71
+ 3. **Policy gates**: approvals, side effects, budget, credentials, data
72
+ freshness.
73
+ 4. **Behavior gates**: browser flows, API calls, generated app preview, voice
74
+ transcript checks.
75
+ 5. **Semantic judges**: intent fit, quality, completeness, safety,
76
+ professional correctness.
77
+
78
+ Semantic judges should never turn a failed build into a pass.
79
+
80
+ ## Traces And Feedback
81
+
82
+ Every serious run should record:
83
+
84
+ - task id and scenario id
85
+ - git commit
86
+ - model and provider
87
+ - prompt/config hashes
88
+ - tool calls and retrieval spans
89
+ - build/test/deploy output
90
+ - cost, latency, and token use
91
+ - user/reviewer feedback
92
+ - final outcome and failure class
93
+
94
+ Convert runs into `FeedbackTrajectory` records so normal product usage becomes
95
+ replayable eval data.
96
+
97
+ ```txt
98
+ production run -> feedback trajectory -> dataset scenario -> optimizer row
99
+ ```
100
+
101
+ For promotion-grade runs, also project the completed control result into a
102
+ strict `RunRecord`:
103
+
104
+ ```ts
105
+ const record = controlRunToRunRecord(controlResult, {
106
+ experimentId,
107
+ candidateId,
108
+ seed,
109
+ model: 'gpt-4o-2024-11-20',
110
+ promptHash,
111
+ configHash,
112
+ commitSha,
113
+ splitTag: 'holdout',
114
+ tokenUsage,
115
+ })
116
+ ```
117
+
118
+ ## Datasets And Holdouts
119
+
120
+ Use four splits:
121
+
122
+ - `train`: optimizer search.
123
+ - `dev`: tuning and threshold selection.
124
+ - `test`: normal reporting.
125
+ - `holdout`: promotion-only gate.
126
+
127
+ The low-level `RunRecord` schema uses `search | dev | holdout`; map `train`
128
+ and normal non-holdout test/report rows to `search` when producing promotion
129
+ tables.
130
+
131
+ Do not inspect or tune against holdout failures during optimization. If a
132
+ holdout failure reveals a real product bug, fix the bug and rotate the holdout
133
+ set with a signed note.
134
+
135
+ ## Optimization
136
+
137
+ Use `runMultiShotOptimization()` when the system is a multi-step agent, not a
138
+ single prompt.
139
+
140
+ Good optimization targets:
141
+
142
+ - system prompt
143
+ - tool descriptions
144
+ - retrieval policy
145
+ - data acquisition policy
146
+ - user-question policy
147
+ - evaluator threshold
148
+ - agent topology
149
+ - scaffold/template choice
150
+
151
+ Bad optimization targets:
152
+
153
+ - hidden holdout examples
154
+ - production credentials
155
+ - brittle string checks that do not match user value
156
+ - fake workflows that do not call the product adapter
157
+
158
+ Use actionable side information so the optimizer knows whether a failure belongs
159
+ to prompt, tools, retrieval, data acquisition, sandbox, evaluator, or product
160
+ runtime.
161
+
162
+ ## Release Gate
163
+
164
+ A launch or promotion should require:
165
+
166
+ - enough runs for the target risk level
167
+ - paired improvement over the current baseline
168
+ - no critical regression on test
169
+ - holdout pass or explicit rejection
170
+ - cost and latency within budget
171
+ - no unresolved canary or contamination failures
172
+ - trace evidence for representative successes and failures
173
+ - TraceAnalyst findings for failure-heavy or regression-heavy corpora
174
+ - human-readable report with failure clusters and next actions
175
+
176
+ `evaluateReleaseConfidence()` and the paired statistics helpers provide the
177
+ decision data. The product decides the business threshold.
178
+
179
+ ## Product Patterns
180
+
181
+ ### Coding Or Builder Agent
182
+
183
+ Use sandbox/build/test/serve/browser validators. Add intent and semantic
184
+ concept judges only after the generated app runs.
185
+
186
+ ### Browser Agent
187
+
188
+ Record browser steps, screenshots, network errors, console errors, and final
189
+ state. Use deterministic DOM/API assertions before visual or semantic judges.
190
+
191
+ ### Domain Agent
192
+
193
+ Use domain fixtures, jurisdiction/date metadata, retrieval spans, and
194
+ professional judges. Fail missing/stale evidence separately from bad reasoning.
195
+
196
+ ### Workflow Or Integration Agent
197
+
198
+ Use `@tangle-network/agent-integrations` manifests as readiness inputs. Gate
199
+ missing connections, missing scopes, approval-required writes, and stale tokens
200
+ before blaming the agent prompt.
201
+
202
+ For generated apps and sandbox agents, also run the
203
+ [Integration Launch Gates](./integration-launch-gates.md). The eval should prove
204
+ that app code invokes through the integration bridge, not provider SDKs with raw
205
+ OAuth tokens.
206
+
207
+ ### Voice Agent
208
+
209
+ Record transcript, timing, interruptions, tool calls, and task outcome. Judge
210
+ conversation quality separately from tool success and policy compliance.
211
+
212
+ ## Anti-Patterns
213
+
214
+ - Evaluating only final prose for an agent that actually builds, browses, or
215
+ calls tools.
216
+ - Letting an LLM judge override failed tests.
217
+ - Optimizing on examples that users will never hit.
218
+ - Recording traces as logs but never converting them to datasets.
219
+ - Calling every failure a prompt failure when context, data, auth, or runtime
220
+ readiness was missing.
221
+ - Shipping reports without run ids, commits, model ids, or evidence links.
@@ -0,0 +1,75 @@
1
+ # Trace Analysis
2
+
3
+ Trace analysis is the bridge between raw product telemetry and useful eval work.
4
+
5
+ ```txt
6
+ live product run
7
+ -> TraceEmitter / TraceStore
8
+ -> TraceAnalyst investigates trace corpora
9
+ -> findings become ActionableSideInfo (ASI), failures, replay cases, and release actions
10
+ ```
11
+
12
+ ## When To Use TraceAnalyst
13
+
14
+ Use `TraceAnalyst` when you have more than a few traces and need to answer:
15
+
16
+ - which failure modes are recurring?
17
+ - which spans explain a regression?
18
+ - did retrieval, integrations, sandbox, or policy block the run?
19
+ - are failed runs missing evidence that the optimizer needs?
20
+ - which product surfaces deserve the next fix?
21
+
22
+ Use summary tables and release confidence for promotion decisions. Use
23
+ TraceAnalyst to explain the evidence behind those decisions.
24
+
25
+ ## Minimal Flow
26
+
27
+ ```ts
28
+ import {
29
+ OtlpFileTraceStore,
30
+ analyzeTraces,
31
+ } from '@tangle-network/agent-eval'
32
+
33
+ const result = await analyzeTraces({
34
+ question: 'Why did app-runtime holdout runs fail this week?',
35
+ }, {
36
+ source: new OtlpFileTraceStore({ path: 'traces/otlp.jsonl' }),
37
+ ai,
38
+ model: 'gpt-4o-2024-11-20',
39
+ })
40
+
41
+ console.log(result.findings)
42
+ ```
43
+
44
+ Products can pass any `TraceAnalysisStore`; they do not need to use the file
45
+ store in production.
46
+
47
+ ## Required Trace Shape
48
+
49
+ Every serious product run should include:
50
+
51
+ - `runId`, `projectId`, `scenarioId`, `variantId`, and `layer`
52
+ - commit, prompt hash, config hash, model fingerprint, and dataset version
53
+ - LLM spans with model, inputs, outputs, token counts, and cost
54
+ - tool/integration spans with arguments, result summaries, and error codes
55
+ - retrieval spans with query, source ids, hit scores, and freshness metadata
56
+ - sandbox/build/test/deploy spans with exit codes and log artifacts
57
+ - custom events for knowledge readiness and integration gates
58
+ - final run outcome with pass/score/failure class
59
+
60
+ Do not put secrets, raw OAuth tokens, or unredacted PII in traces.
61
+
62
+ ## Product Loop
63
+
64
+ The product loop should not treat traces as a separate debug dump. The intended
65
+ path is:
66
+
67
+ 1. Wrap the real workflow in `runAgentControlLoop` or the product runtime.
68
+ 2. Emit canonical spans/events while the user task runs.
69
+ 3. Convert the completed run to `FeedbackTrajectory` for replay.
70
+ 4. Convert promotion-grade runs to `RunRecord` with `controlRunToRunRecord`.
71
+ 5. Run TraceAnalyst over failure-heavy trace sets.
72
+ 6. Feed findings into `ActionableSideInfo`, failure clusters, and release
73
+ reports.
74
+
75
+ That turns normal product usage into eval data instead of isolated logs.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tangle-network/agent-eval",
3
- "version": "0.20.10",
3
+ "version": "0.20.12",
4
4
  "description": "Trace-first evaluation infrastructure for agent systems: traces, harnesses, verifier pipelines, judges, datasets, gates, optimization, and reporting.",
5
5
  "homepage": "https://github.com/tangle-network/agent-eval#readme",
6
6
  "repository": {
@@ -19,6 +19,26 @@
19
19
  "import": "./dist/index.js",
20
20
  "default": "./dist/index.js"
21
21
  },
22
+ "./control": {
23
+ "types": "./dist/control.d.ts",
24
+ "import": "./dist/control.js",
25
+ "default": "./dist/control.js"
26
+ },
27
+ "./optimization": {
28
+ "types": "./dist/optimization.d.ts",
29
+ "import": "./dist/optimization.js",
30
+ "default": "./dist/optimization.js"
31
+ },
32
+ "./reporting": {
33
+ "types": "./dist/reporting.d.ts",
34
+ "import": "./dist/reporting.js",
35
+ "default": "./dist/reporting.js"
36
+ },
37
+ "./traces": {
38
+ "types": "./dist/traces.d.ts",
39
+ "import": "./dist/traces.js",
40
+ "default": "./dist/traces.js"
41
+ },
22
42
  "./telemetry": {
23
43
  "types": "./dist/telemetry/index.d.ts",
24
44
  "import": "./dist/telemetry/index.js",