@tangle-network/agent-eval 0.76.0 → 0.79.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/adapters/http.d.ts +2 -2
- package/dist/adapters/langchain.d.ts +2 -2
- package/dist/adapters/otel.d.ts +4 -4
- package/dist/{agent-profile-DYRboYWu.d.ts → agent-profile-aSEaJ9Pl.d.ts} +1 -1
- package/dist/analyst/index.d.ts +42 -8
- package/dist/analyst/index.js +32 -2
- package/dist/analyst/index.js.map +1 -1
- package/dist/authenticity/index.d.ts +161 -0
- package/dist/authenticity/index.js +215 -0
- package/dist/authenticity/index.js.map +1 -0
- package/dist/benchmarks/index.d.ts +2 -2
- package/dist/campaign/index.d.ts +11 -11
- package/dist/campaign/index.js +4 -4
- package/dist/{chunk-7W4SM7FD.js → chunk-5LVWPNS5.js} +91 -91
- package/dist/chunk-5LVWPNS5.js.map +1 -0
- package/dist/{chunk-WYIHD6EB.js → chunk-CF67I6QY.js} +1 -1
- package/dist/chunk-CF67I6QY.js.map +1 -0
- package/dist/{chunk-XPILG2CA.js → chunk-GXHLRXDI.js} +2 -2
- package/dist/{chunk-F3SRAAZO.js → chunk-KWRRMR3J.js} +15 -1
- package/dist/chunk-KWRRMR3J.js.map +1 -0
- package/dist/{chunk-JYE3WOTE.js → chunk-RPLZ4OIB.js} +10 -1
- package/dist/chunk-RPLZ4OIB.js.map +1 -0
- package/dist/{chunk-6EKXFFGQ.js → chunk-RTWFUK6A.js} +2 -2
- package/dist/{chunk-XGNCBAVZ.js → chunk-XQL22JDG.js} +2 -2
- package/dist/{chunk-GJJNJVIR.js → chunk-XXNIODOM.js} +2 -2
- package/dist/contract/index.d.ts +12 -12
- package/dist/contract/index.js +2 -2
- package/dist/{control-BgA6BYTm.d.ts → control-CehLtoET.d.ts} +1 -1
- package/dist/control.d.ts +2 -2
- package/dist/control.js +2 -2
- package/dist/hosted/index.d.ts +4 -4
- package/dist/{index-DsnOpCO6.d.ts → index-B1RKber3.d.ts} +1 -1
- package/dist/index.d.ts +126 -25
- package/dist/index.js +32 -7
- package/dist/index.js.map +1 -1
- package/dist/{insight-report-Df3lxYXM.d.ts → insight-report-dlpEzQDi.d.ts} +1 -1
- package/dist/{kind-factory-DW9XWPvM.d.ts → kind-factory-DqV2t1Xk.d.ts} +1 -1
- package/dist/meta-eval/index.d.ts +2 -2
- package/dist/openapi.json +1 -1
- package/dist/{provenance-B-TFszPW.d.ts → provenance-CEAJI9rm.d.ts} +3 -3
- package/dist/{registry-DuVYiTvw.d.ts → registry-BmEuU94S.d.ts} +2 -2
- package/dist/{release-report-CN8hJlhk.d.ts → release-report-CXXZlR8g.d.ts} +2 -2
- package/dist/reporting.d.ts +4 -4
- package/dist/{researcher-C_KJyIGg.d.ts → researcher-rInLj9De.d.ts} +2 -2
- package/dist/rl.d.ts +6 -6
- package/dist/rl.js +2 -2
- package/dist/{rubric-predictive-validity-D_4BSXGV.d.ts → rubric-predictive-validity-CWyWWLBg.d.ts} +1 -1
- package/dist/{run-improvement-loop-BqYH2vCR.d.ts → run-improvement-loop-Bgu4C59E.d.ts} +2 -4
- package/dist/{run-record-BgTFzO2r.d.ts → run-record-sItO5ftF.d.ts} +11 -0
- package/dist/{semantic-concept-judge-CV9Wlx4t.d.ts → semantic-concept-judge-Du4ZVyef.d.ts} +3 -3
- package/dist/{summary-report-ByiOUrHj.d.ts → summary-report-BTaXq1TS.d.ts} +1 -1
- package/dist/traces.d.ts +1 -1
- package/dist/traces.js +2 -2
- package/dist/{types-CRD68aH7.d.ts → types-DRvV0zRo.d.ts} +10 -1
- package/dist/{types-Bba0vl1V.d.ts → types-QHG0KnkF.d.ts} +11 -3
- package/dist/workflow/index.d.ts +4 -4
- package/dist/workflow/index.js +1 -1
- package/docs/auto-research-loop-end-to-end.md +1 -1
- package/docs/feature-guide.md +4 -4
- package/docs/multi-shot-optimization.md +61 -115
- package/docs/product-eval-adoption.md +1 -1
- package/docs/three-package-architecture.md +1 -1
- package/docs/trace-analysis.md +19 -0
- package/package.json +6 -1
- package/dist/chunk-7W4SM7FD.js.map +0 -1
- package/dist/chunk-F3SRAAZO.js.map +0 -1
- package/dist/chunk-JYE3WOTE.js.map +0 -1
- package/dist/chunk-WYIHD6EB.js.map +0 -1
- /package/dist/{chunk-XPILG2CA.js.map → chunk-GXHLRXDI.js.map} +0 -0
- /package/dist/{chunk-6EKXFFGQ.js.map → chunk-RTWFUK6A.js.map} +0 -0
- /package/dist/{chunk-XGNCBAVZ.js.map → chunk-XQL22JDG.js.map} +0 -0
- /package/dist/{chunk-GJJNJVIR.js.map → chunk-XXNIODOM.js.map} +0 -0
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { G as GainDistributionBin, P as ParetoFigureSpec } from './summary-report-ByiOUrHj.js';
|
|
1
|
+
import { G as GainDistributionBin, P as ParetoFigureSpec } from './summary-report-BTaXq1TS.js';
|
|
2
2
|
import { a as ContinuousAgreement } from './judge-calibration-DilmB3Ml.js';
|
|
3
3
|
|
|
4
4
|
/**
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { AxAIService, AxFunction } from '@ax-llm/ax';
|
|
2
2
|
import { T as TraceAnalysisStore } from './store-GmBE2pZZ.js';
|
|
3
3
|
import { z } from 'zod';
|
|
4
|
-
import { g as AnalystCost, a as AnalystContext, A as Analyst } from './types-CRD68aH7.js';
|
|
4
|
+
import { g as AnalystCost, a as AnalystContext, A as Analyst } from './types-DRvV0zRo.js';
|
|
5
5
|
|
|
6
6
|
/**
|
|
7
7
|
* Typed Ax output for analyst findings.
|
|
@@ -2,8 +2,8 @@ import { T as TraceStore } from '../store-CKUAgsJz.js';
|
|
|
2
2
|
import { R as Run } from '../schema-m0gsnbt3.js';
|
|
3
3
|
import { a as OutcomeFilter, O as OutcomeStore } from '../outcome-store-D6KWmYvj.js';
|
|
4
4
|
export { D as DeploymentOutcome, F as FileSystemOutcomeStore, b as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore } from '../outcome-store-D6KWmYvj.js';
|
|
5
|
-
export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from '../rubric-predictive-validity-D_4BSXGV.js';
|
|
6
|
-
import '../run-record-BgTFzO2r.js';
|
|
5
|
+
export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from '../rubric-predictive-validity-CWyWWLBg.js';
|
|
6
|
+
import '../run-record-sItO5ftF.js';
|
|
7
7
|
import '../errors-Dwqw-T_m.js';
|
|
8
8
|
|
|
9
9
|
/**
|
package/dist/openapi.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"openapi": "3.1.0",
|
|
3
3
|
"info": {
|
|
4
4
|
"title": "@tangle-network/agent-eval — wire protocol",
|
|
5
|
-
"version": "0.
|
|
5
|
+
"version": "0.79.0",
|
|
6
6
|
"description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
|
|
7
7
|
"contact": {
|
|
8
8
|
"name": "Tangle Network",
|
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
import { o as Mutator, I as ImprovementDriver, S as Scenario, G as Gate, k as GateResult, i as GateContext, f as CampaignResult, M as MutableSurface, j as GateDecision } from './types-Bba0vl1V.js';
|
|
1
|
+
import { o as Mutator, I as ImprovementDriver, S as Scenario, G as Gate, k as GateResult, i as GateContext, f as CampaignResult, M as MutableSurface, j as GateDecision } from './types-QHG0KnkF.js';
|
|
2
2
|
import { R as RedTeamCase } from './red-team-DW9Ca_tj.js';
|
|
3
|
-
import { R as RunRecord } from './run-record-BgTFzO2r.js';
|
|
3
|
+
import { R as RunRecord } from './run-record-sItO5ftF.js';
|
|
4
4
|
import { D as Direction } from './pareto-E-pembql.js';
|
|
5
5
|
import { a as PairedBootstrapResult } from './statistics-B7yCbi9i.js';
|
|
6
|
-
import { a as RunCampaignOptions, C as CampaignStorage } from './run-improvement-loop-BqYH2vCR.js';
|
|
6
|
+
import { a as RunCampaignOptions, C as CampaignStorage } from './run-improvement-loop-Bgu4C59E.js';
|
|
7
7
|
import { HostedClient, TraceSpanEvent } from './hosted/index.js';
|
|
8
8
|
|
|
9
9
|
/**
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { A as Analyst, a as AnalystContext, b as AnalystRunSummary, c as AnalystFinding, d as AnalystRunResult, C as ChatClient, e as AnalystRunInputs, f as AnalystRunEvent } from './types-CRD68aH7.js';
|
|
1
|
+
import { A as Analyst, a as AnalystContext, b as AnalystRunSummary, c as AnalystFinding, d as AnalystRunResult, C as ChatClient, e as AnalystRunInputs, f as AnalystRunEvent } from './types-DRvV0zRo.js';
|
|
2
2
|
|
|
3
3
|
/**
|
|
4
4
|
* AnalystRegistry — orchestrate N analysts against one run.
|
|
@@ -125,4 +125,4 @@ declare class AnalystRegistry {
|
|
|
125
125
|
private routeInput;
|
|
126
126
|
}
|
|
127
127
|
|
|
128
|
-
export {
|
|
128
|
+
export { AnalystRegistry as A, type BudgetPolicy as B, type RegistryRunOpts as R, type AnalystHooks as a, type AnalystRegistryOptions as b };
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { D as DatasetSplit, c as DatasetManifest, a as DatasetScenario } from './dataset-B2kL-fSM.js';
|
|
2
|
-
import { m as GateDecision } from './summary-report-ByiOUrHj.js';
|
|
3
|
-
import { R as RunRecord, b as RunSplitTag } from './run-record-BgTFzO2r.js';
|
|
2
|
+
import { m as GateDecision } from './summary-report-BTaXq1TS.js';
|
|
3
|
+
import { R as RunRecord, b as RunSplitTag } from './run-record-sItO5ftF.js';
|
|
4
4
|
|
|
5
5
|
/**
|
|
6
6
|
* Release confidence gate.
|
package/dist/reporting.d.ts
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-D_4BSXGV.js';
|
|
2
|
-
export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, R as ReleaseConfidenceAxis, b as ReleaseConfidenceAxisName, c as ReleaseConfidenceInput, d as ReleaseConfidenceIssue, e as ReleaseConfidenceMetrics, f as ReleaseConfidenceScorecard, g as ReleaseConfidenceStatus, h as ReleaseConfidenceThresholds, i as ReleaseTraceEvidence, j as RenderReleaseReportOptions, V as Verdict, k as assertReleaseConfidence, l as bootstrapCi, m as evaluateReleaseConfidence, n as judgeReplayGate, r as renderReleaseReport } from './release-report-CN8hJlhk.js';
|
|
1
|
+
export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-CWyWWLBg.js';
|
|
2
|
+
export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, R as ReleaseConfidenceAxis, b as ReleaseConfidenceAxisName, c as ReleaseConfidenceInput, d as ReleaseConfidenceIssue, e as ReleaseConfidenceMetrics, f as ReleaseConfidenceScorecard, g as ReleaseConfidenceStatus, h as ReleaseConfidenceThresholds, i as ReleaseTraceEvidence, j as RenderReleaseReportOptions, V as Verdict, k as assertReleaseConfidence, l as bootstrapCi, m as evaluateReleaseConfidence, n as judgeReplayGate, r as renderReleaseReport } from './release-report-CXXZlR8g.js';
|
|
3
3
|
export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-5iSVfzl2.js';
|
|
4
4
|
export { P as PairedBootstrapOptions, a as PairedBootstrapResult, b as benjaminiHochberg, p as pairedBootstrap, w as wilcoxonSignedRank } from './statistics-B7yCbi9i.js';
|
|
5
|
-
export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, P as ParetoFigureSpec, c as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, d as ResearchReport, e as ResearchReportCandidate, f as ResearchReportDecision, g as ResearchReportMethodology, h as ResearchReportOptions, i as ResearchReportRecommendation, S as SummaryTable, j as SummaryTableOptions, k as SummaryTableRow, l as gainHistogram, p as paretoChart, r as researchReport, s as summaryTable } from './summary-report-ByiOUrHj.js';
|
|
6
|
-
import './run-record-BgTFzO2r.js';
|
|
5
|
+
export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, P as ParetoFigureSpec, c as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, d as ResearchReport, e as ResearchReportCandidate, f as ResearchReportDecision, g as ResearchReportMethodology, h as ResearchReportOptions, i as ResearchReportRecommendation, S as SummaryTable, j as SummaryTableOptions, k as SummaryTableRow, l as gainHistogram, p as paretoChart, r as researchReport, s as summaryTable } from './summary-report-BTaXq1TS.js';
|
|
6
|
+
import './run-record-sItO5ftF.js';
|
|
7
7
|
import './errors-Dwqw-T_m.js';
|
|
8
8
|
import './schema-m0gsnbt3.js';
|
|
9
9
|
import './outcome-store-D6KWmYvj.js';
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
import { b as RunSplitTag, a as RunTokenUsage, c as RunJudgeMetadata, J as JudgeScoresRecord, A as AgentProfileCell, d as AgentProfileCellInput, R as RunRecord } from './run-record-BgTFzO2r.js';
|
|
1
|
+
import { b as RunSplitTag, a as RunTokenUsage, c as RunJudgeMetadata, J as JudgeScoresRecord, A as AgentProfileCell, d as AgentProfileCellInput, R as RunRecord } from './run-record-sItO5ftF.js';
|
|
2
2
|
import { L as LlmClientOptions, a as LlmRouteRequirements } from './llm-client-DbjLfz-K.js';
|
|
3
|
-
import { h as ResearchReportOptions, d as ResearchReport, m as GateDecision } from './summary-report-ByiOUrHj.js';
|
|
3
|
+
import { h as ResearchReportOptions, d as ResearchReport, m as GateDecision } from './summary-report-BTaXq1TS.js';
|
|
4
4
|
import { T as TraceEmitter, R as RunCompleteHook } from './emitter-DEZwY14K.js';
|
|
5
5
|
import { R as RunIntegrityExpectations, a as RunIntegrityReport } from './integrity-CJzrpUua.js';
|
|
6
6
|
import { R as RawProviderSink } from './raw-provider-sink-C46HDghv.js';
|
package/dist/rl.d.ts
CHANGED
|
@@ -1,18 +1,18 @@
|
|
|
1
|
-
import { R as RunRecord, b as RunSplitTag } from './run-record-BgTFzO2r.js';
|
|
2
|
-
import { f as CampaignResult } from './types-Bba0vl1V.js';
|
|
1
|
+
import { R as RunRecord, b as RunSplitTag } from './run-record-sItO5ftF.js';
|
|
2
|
+
import { f as CampaignResult } from './types-QHG0KnkF.js';
|
|
3
3
|
import { a as VerificationReport } from './multi-layer-verifier-DlWCXuxL.js';
|
|
4
4
|
import { S as Span } from './schema-m0gsnbt3.js';
|
|
5
5
|
import { T as TraceStore } from './store-CKUAgsJz.js';
|
|
6
6
|
import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
|
|
7
7
|
export { D as DeploymentOutcome, F as FileSystemOutcomeStore, b as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore } from './outcome-store-D6KWmYvj.js';
|
|
8
|
-
import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-D_4BSXGV.js';
|
|
9
|
-
import { R as Researcher, F as FailureMode, S as SteeringChange, E as ExperimentPlan, a as ExperimentResult, b as EvalCampaignResult, c as EvalCampaignOptions } from './researcher-C_KJyIGg.js';
|
|
10
|
-
export { r as runEvalCampaign } from './researcher-C_KJyIGg.js';
|
|
8
|
+
import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-CWyWWLBg.js';
|
|
9
|
+
import { R as Researcher, F as FailureMode, S as SteeringChange, E as ExperimentPlan, a as ExperimentResult, b as EvalCampaignResult, c as EvalCampaignOptions } from './researcher-rInLj9De.js';
|
|
10
|
+
export { r as runEvalCampaign } from './researcher-rInLj9De.js';
|
|
11
11
|
import { I as InterimReleaseConfidence } from './sequential-5iSVfzl2.js';
|
|
12
12
|
import './errors-Dwqw-T_m.js';
|
|
13
13
|
import './llm-client-DbjLfz-K.js';
|
|
14
14
|
import './raw-provider-sink-C46HDghv.js';
|
|
15
|
-
import './summary-report-ByiOUrHj.js';
|
|
15
|
+
import './summary-report-BTaXq1TS.js';
|
|
16
16
|
import './failure-cluster-CL7IVgkJ.js';
|
|
17
17
|
import './emitter-DEZwY14K.js';
|
|
18
18
|
import './integrity-CJzrpUua.js';
|
package/dist/rl.js
CHANGED
|
@@ -10,7 +10,7 @@ import {
|
|
|
10
10
|
} from "./chunk-3RF76KTD.js";
|
|
11
11
|
import {
|
|
12
12
|
runEvalCampaign
|
|
13
|
-
} from "./chunk-GJJNJVIR.js";
|
|
13
|
+
} from "./chunk-XXNIODOM.js";
|
|
14
14
|
import "./chunk-IHDHUN2X.js";
|
|
15
15
|
import {
|
|
16
16
|
rubricPredictiveValidity
|
|
@@ -25,7 +25,7 @@ import {
|
|
|
25
25
|
} from "./chunk-ITBRCT73.js";
|
|
26
26
|
import "./chunk-SBCB6VZY.js";
|
|
27
27
|
import "./chunk-PC4UYEBM.js";
|
|
28
|
-
import "./chunk-F3SRAAZO.js";
|
|
28
|
+
import "./chunk-KWRRMR3J.js";
|
|
29
29
|
import "./chunk-TVVP3ZZQ.js";
|
|
30
30
|
import "./chunk-VSMTAMNK.js";
|
|
31
31
|
import {
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { L as LlmClientOptions } from './llm-client-DbjLfz-K.js';
|
|
2
|
-
import { I as ImprovementDriver, S as Scenario, f as CampaignResult, k as GateResult, D as DispatchFn, a as JudgeConfig, L as LabeledScenarioStore, g as CampaignTraceWriter, m as GenerationRecord, M as MutableSurface, P as ParetoParent, G as Gate } from './types-Bba0vl1V.js';
|
|
2
|
+
import { I as ImprovementDriver, S as Scenario, f as CampaignResult, k as GateResult, D as DispatchFn, a as JudgeConfig, L as LabeledScenarioStore, g as CampaignTraceWriter, m as GenerationRecord, M as MutableSurface, P as ParetoParent, G as Gate } from './types-QHG0KnkF.js';
|
|
3
3
|
|
|
4
4
|
/**
|
|
5
5
|
* @experimental
|
|
@@ -28,9 +28,7 @@ import { I as ImprovementDriver, S as Scenario, f as CampaignResult, k as GateRe
|
|
|
28
28
|
*
|
|
29
29
|
* The driver is surface-agnostic — any string surface in any consumer opts
|
|
30
30
|
* in by selecting it. Reuses the generic reflection primitive
|
|
31
|
-
* (`buildReflectionPrompt` / `parseReflectionResponse`) and the router
|
|
32
|
-
* client; no dependency on the legacy `runMultiShotOptimization` /
|
|
33
|
-
* `prompt-evolution` orchestration.
|
|
31
|
+
* (`buildReflectionPrompt` / `parseReflectionResponse`) and the router client.
|
|
34
32
|
*
|
|
35
33
|
* Earns its keep where there is real per-instance signal (which the
|
|
36
34
|
* dimensional + per-scenario evidence + the `LabeledScenarioStore` flywheel
|
|
@@ -200,6 +200,17 @@ interface RunOutcome {
|
|
|
200
200
|
* these records as input. Optional — single-judge or scalar-only
|
|
201
201
|
* runs leave it unset. */
|
|
202
202
|
judgeScores?: JudgeScoresRecord;
|
|
203
|
+
/** Authenticity / realness verdict — did the run build the REAL thing on the
|
|
204
|
+
* intended infra, or fake it (see `./authenticity`)? Optional: only domains
|
|
205
|
+
* with an authenticity config populate it. Carried in the corpus so the
|
|
206
|
+
* flywheel / off-policy learning can optimize for real completion, not gamed
|
|
207
|
+
* pass-rate. `score` is 0-1; `gated` is the anti-Goodhart flag — a gated run
|
|
208
|
+
* must not count as a real success regardless of `score`. */
|
|
209
|
+
realness?: {
|
|
210
|
+
score: number;
|
|
211
|
+
gated: boolean;
|
|
212
|
+
reason?: string;
|
|
213
|
+
};
|
|
203
214
|
}
|
|
204
215
|
/**
|
|
205
216
|
* Mandatory paper-grade fields for a single evaluation run. Optional
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
import { AxAIService } from '@ax-llm/ax';
|
|
2
|
-
import { c as TraceAnalystKindSpec } from './kind-factory-DW9XWPvM.js';
|
|
3
|
-
import { b as AnalystRegistryOptions,
|
|
2
|
+
import { c as TraceAnalystKindSpec } from './kind-factory-DqV2t1Xk.js';
|
|
3
|
+
import { b as AnalystRegistryOptions, A as AnalystRegistry } from './registry-BmEuU94S.js';
|
|
4
4
|
import { z } from 'zod';
|
|
5
|
-
import { c as AnalystFinding, A as Analyst, a as AnalystContext } from './types-CRD68aH7.js';
|
|
5
|
+
import { c as AnalystFinding, A as Analyst, a as AnalystContext } from './types-DRvV0zRo.js';
|
|
6
6
|
import { a as TraceAnalystSpan } from './store-GmBE2pZZ.js';
|
|
7
7
|
import { L as LlmClientOptions } from './llm-client-DbjLfz-K.js';
|
|
8
8
|
import { S as Severity } from './multi-layer-verifier-DlWCXuxL.js';
|
package/dist/traces.d.ts
CHANGED
|
@@ -14,7 +14,7 @@ import { A as AnalyzeTracesOptions, b as AnalyzeTracesResult } from './analyst-t
|
|
|
14
14
|
export { a as AnalyzeTracesInput, c as AnalyzeTracesTurnSnapshot, d as analyzeTraces } from './analyst-t7zZS3TV.js';
|
|
15
15
|
import { h as TraceAnalystSpanKind, i as TraceAnalystSpanStatus, T as TraceAnalysisStore, g as TraceAnalystFilters, b as DatasetOverview, Q as QueryTracesPage, l as ViewTraceResult, V as ViewSpansResult, c as SearchTraceResult, S as SearchSpanResult } from './store-GmBE2pZZ.js';
|
|
16
16
|
export { D as DEFAULT_TRACE_ANALYST_BUDGETS, d as SpanMatchRecord, e as TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, f as TraceAnalystByteBudgets, a as TraceAnalystSpan, j as TraceAnalystTraceSummary, k as ViewTraceOversized } from './store-GmBE2pZZ.js';
|
|
17
|
-
import { b as RunSplitTag, a as RunTokenUsage, R as RunRecord } from './run-record-BgTFzO2r.js';
|
|
17
|
+
import { b as RunSplitTag, a as RunTokenUsage, R as RunRecord } from './run-record-sItO5ftF.js';
|
|
18
18
|
import { AxFunction } from '@ax-llm/ax';
|
|
19
19
|
|
|
20
20
|
/**
|
package/dist/traces.js
CHANGED
|
@@ -25,7 +25,7 @@ import {
|
|
|
25
25
|
scoreTraceInsightReadiness,
|
|
26
26
|
tokenizeDomainWords,
|
|
27
27
|
traceAnalystOnRunComplete
|
|
28
|
-
} from "./chunk-XGNCBAVZ.js";
|
|
28
|
+
} from "./chunk-XQL22JDG.js";
|
|
29
29
|
import {
|
|
30
30
|
DEFAULT_REDACTION_RULES,
|
|
31
31
|
REDACTION_VERSION,
|
|
@@ -86,7 +86,7 @@ import {
|
|
|
86
86
|
defaultProviderRedactor,
|
|
87
87
|
providerFromBaseUrl
|
|
88
88
|
} from "./chunk-PC4UYEBM.js";
|
|
89
|
-
import "./chunk-F3SRAAZO.js";
|
|
89
|
+
import "./chunk-KWRRMR3J.js";
|
|
90
90
|
import {
|
|
91
91
|
TraceEmitter,
|
|
92
92
|
llmSpanFromProvider
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { R as RunRecord } from './run-record-BgTFzO2r.js';
|
|
1
|
+
import { R as RunRecord } from './run-record-sItO5ftF.js';
|
|
2
2
|
import { T as TraceAnalysisStore } from './store-GmBE2pZZ.js';
|
|
3
3
|
import { a as JudgeInput } from './types-Croy5h7V.js';
|
|
4
4
|
import { b as LlmCallRequest, c as LlmCallResult } from './llm-client-DbjLfz-K.js';
|
|
@@ -146,6 +146,15 @@ interface AnalystFinding {
|
|
|
146
146
|
* diff cleanly across runs.
|
|
147
147
|
*/
|
|
148
148
|
subject?: string;
|
|
149
|
+
/** FIREWALL provenance (docs/learning-flywheel.md): true iff this finding was
|
|
150
|
+
* lifted from a JUDGE verdict (an acceptance score), not OBSERVED from the
|
|
151
|
+
* agent's behavior. A judge-derived finding must NEVER be admitted as a
|
|
152
|
+
* steering input — that is the held-out judge leaking into the loop. Set at
|
|
153
|
+
* the lift site (createJudgeAdapter); checked by `assertNoJudgeVerdict`.
|
|
154
|
+
* Provenance, not evidence presence, is the correct discriminator: an
|
|
155
|
+
* evidence-less trace-analyst observation legitimately steers, while a judge
|
|
156
|
+
* verdict that happens to cite an artifact must not. */
|
|
157
|
+
derived_from_judge?: boolean;
|
|
149
158
|
/** Analyst-private extras; renderers ignore unless they know the analyst. */
|
|
150
159
|
metadata?: Record<string, unknown>;
|
|
151
160
|
}
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { a as RunTokenUsage } from './run-record-BgTFzO2r.js';
|
|
1
|
+
import { a as RunTokenUsage } from './run-record-sItO5ftF.js';
|
|
2
2
|
|
|
3
3
|
/**
|
|
4
4
|
* @experimental
|
|
@@ -163,8 +163,8 @@ interface ParetoParent {
|
|
|
163
163
|
}
|
|
164
164
|
/** @experimental Stateless surface mutation — given findings + current
|
|
165
165
|
* surface, return N candidate surfaces. Pure transform, no generation
|
|
166
|
-
* awareness. Reflective-mutation
|
|
167
|
-
*
|
|
166
|
+
* awareness. Reflective-mutation and `AxGEPA` mutators conform. Wrapped by
|
|
167
|
+
* `evolutionaryDriver` to become an `ImprovementDriver`. */
|
|
168
168
|
interface Mutator<TFindings = unknown> {
|
|
169
169
|
kind: string;
|
|
170
170
|
mutate(args: {
|
|
@@ -206,6 +206,14 @@ interface ProposeContext<TFindings = unknown> {
|
|
|
206
206
|
* scenarios) into a merged candidate. Drivers doing pure single-parent
|
|
207
207
|
* reflection may ignore it. See {@link ParetoParent}. */
|
|
208
208
|
paretoParents?: ParetoParent[];
|
|
209
|
+
/** FIREWALL (non-negotiable): the held-out judge is write-only — its verdicts
|
|
210
|
+
* score the chosen output and gate promotion, and are NEVER an input to
|
|
211
|
+
* proposal/steering (else the optimizer games the acceptance axis = an
|
|
212
|
+
* oracle). This `never`-typed field makes that a compile-time tripwire: a
|
|
213
|
+
* driver that tries to thread judge verdicts into the proposal will not type.
|
|
214
|
+
* Steering may consume TRACE-OBSERVABLE signals (what the agent did) via
|
|
215
|
+
* `findings`/`report`; it may NOT consume the judge's held-out verdict. */
|
|
216
|
+
judgeScores?: never;
|
|
209
217
|
}
|
|
210
218
|
/** @experimental A surface-improvement strategy — the DRIVER of the
|
|
211
219
|
* improvement loop. Given the current best surface, the history of what's
|
package/dist/workflow/index.d.ts
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { W as WorkflowTopology } from '../harness-optimizer-EnEnQPsr.js';
|
|
2
|
-
import { b as RunSplitTag, a as RunTokenUsage, R as RunRecord } from '../run-record-BgTFzO2r.js';
|
|
3
|
-
import { c as AnalystFinding, h as AnalystSeverity, E as EvidenceRef } from '../types-CRD68aH7.js';
|
|
4
|
-
import { F as FailureClusterInsight } from '../insight-report-Df3lxYXM.js';
|
|
2
|
+
import { b as RunSplitTag, a as RunTokenUsage, R as RunRecord } from '../run-record-sItO5ftF.js';
|
|
3
|
+
import { c as AnalystFinding, h as AnalystSeverity, E as EvidenceRef } from '../types-DRvV0zRo.js';
|
|
4
|
+
import { F as FailureClusterInsight } from '../insight-report-dlpEzQDi.js';
|
|
5
5
|
import { a as VerificationReport, L as LayerResult } from '../multi-layer-verifier-DlWCXuxL.js';
|
|
6
6
|
import { F as FailureClusterReport } from '../failure-cluster-CL7IVgkJ.js';
|
|
7
7
|
import { R as RedactionRule, a as RedactionReport } from '../redact-B40YG2M_.js';
|
|
@@ -18,7 +18,7 @@ import '../types-Croy5h7V.js';
|
|
|
18
18
|
import '@tangle-network/tcloud';
|
|
19
19
|
import '../llm-client-DbjLfz-K.js';
|
|
20
20
|
import '../raw-provider-sink-C46HDghv.js';
|
|
21
|
-
import '../summary-report-ByiOUrHj.js';
|
|
21
|
+
import '../summary-report-BTaXq1TS.js';
|
|
22
22
|
import '../judge-calibration-DilmB3Ml.js';
|
|
23
23
|
import '../control-runtime-DuFBYg7A.js';
|
|
24
24
|
import '../emitter-DEZwY14K.js';
|
package/dist/workflow/index.js
CHANGED
|
@@ -152,7 +152,7 @@ async function runAutoResearchLoop(opts: {
|
|
|
152
152
|
Two cases:
|
|
153
153
|
|
|
154
154
|
1. **Trajectory-shaped optimization with steering.** Use
|
|
155
|
-
`
|
|
155
|
+
`runImprovementLoop` directly — it already runs the inner
|
|
156
156
|
search-vs-holdout loop. Wrap with `analyzeOptimizationResult` after
|
|
157
157
|
for the RL bridge.
|
|
158
158
|
|
package/docs/feature-guide.md
CHANGED
|
@@ -33,8 +33,8 @@ trying, and whether a change made them better or worse.
|
|
|
33
33
|
| “Human feedback should become reusable eval data.” | `FeedbackTrajectory` | Captures approvals, rejections, edits, choices, metrics, and policy blocks. |
|
|
34
34
|
| “Can this action run, or does it need approval?” | `evaluateActionPolicy` | Generic preflight for side effects, budgets, and required evidence. |
|
|
35
35
|
| “I need train/dev/test/holdout examples.” | `Dataset` plus feedback trajectory conversion | Stable splits and contamination control. |
|
|
36
|
-
| “Which prompt or signature wins?” | `
|
|
37
|
-
| “Improve a multi-turn agent over real task traces.” | `
|
|
36
|
+
| “Which prompt or signature wins?” | `runImprovementLoop`, steering optimizers | Runs variants on scenarios and compares scores. |
|
|
37
|
+
| “Improve a multi-turn agent over real task traces.” | `runImprovementLoop` | GEPA-style trajectory optimization with ASI and held-out promotion. |
|
|
38
38
|
| “Improve prompts, then code if prompts plateau.” | `runPromptEvolution`, composite mutator, code mutator | Bounded evolution with telemetry and lineage. |
|
|
39
39
|
| “Find why a regression happened.” | bisector, traces, run records | Narrows changes and preserves evidence. |
|
|
40
40
|
| “Expose evals to another language.” | Wire protocol and Python client | HTTP/RPC boundary for non-TypeScript apps. |
|
|
@@ -105,7 +105,7 @@ generated code -> build/test/runtime gates -> score -> ship or revise
|
|
|
105
105
|
|
|
106
106
|
Use when you want Ax/GEPA-style improvement.
|
|
107
107
|
|
|
108
|
-
1. For variable-length agent tasks, use `
|
|
108
|
+
1. For variable-length agent tasks, use `runImprovementLoop`.
|
|
109
109
|
2. Build search/dev/test/holdout splits from the real product loop.
|
|
110
110
|
3. Score full trajectories, not just final text.
|
|
111
111
|
4. Emit actionable side information for failures the mutator can fix.
|
|
@@ -156,7 +156,7 @@ Store as `FeedbackTrajectory`, then derive:
|
|
|
156
156
|
| Feedback data | `FeedbackTrajectory`, stores, converters | Human/environment labels | Domain adapters live in downstream repos. |
|
|
157
157
|
| Action policy | `evaluateActionPolicy` | Approval/budget preflight | Blocks or labels actions before `act()`. |
|
|
158
158
|
| Datasets | `Dataset`, holdout tools, canaries | Train/dev/test/holdout corpora | Keeps optimization honest. |
|
|
159
|
-
| Optimization | `
|
|
159
|
+
| Optimization | `runImprovementLoop`, steering optimizers | Prompt/signature comparison | Use held-out gates before promotion. |
|
|
160
160
|
| Evolution | prompt/code mutators, sandbox pool, telemetry | Autoresearch and mutation loops | Use budgets and lineage; do not run unbounded. |
|
|
161
161
|
| Telemetry | `TraceStore`, OTLP, file sinks | Audit and replay | Treat traces as evidence, not just logs. |
|
|
162
162
|
| Reporting | summaries, pareto, cost tracker | Decision support | Useful for PRs, launch gates, research notes. |
|
|
@@ -1,129 +1,75 @@
|
|
|
1
1
|
# Multi-Shot Optimization
|
|
2
2
|
|
|
3
|
-
`runMultiShotOptimization`
|
|
4
|
-
|
|
3
|
+
> **Renamed.** `runMultiShotOptimization` was retired. The live API is
|
|
4
|
+
> `runImprovementLoop` (driver-agnostic, gated promotion) driven by `gepaDriver`,
|
|
5
|
+
> with `compareDrivers` for head-to-head driver lift. This doc was rewritten to the
|
|
6
|
+
> live API; see also [feature-guide.md](./feature-guide.md) and [concepts.md](./concepts.md).
|
|
5
7
|
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
8
|
+
`runImprovementLoop` is the public entry for GEPA-style optimization over a whole
|
|
9
|
+
task trajectory — the thing you improve is not a single model call but an agent
|
|
10
|
+
system prompt, tool descriptions, a routing policy, or any scaffolding that affects
|
|
11
|
+
the entire run. It is the OUTER loop: it improves the SURFACE the inner workers run.
|
|
9
12
|
|
|
10
|
-
The
|
|
13
|
+
## The shape
|
|
11
14
|
|
|
12
|
-
|
|
13
|
-
-
|
|
14
|
-
- `scorer`: scores the trajectory and emits actionable side information
|
|
15
|
-
- `mutateAdapter`: proposes new variants from top and bottom trials
|
|
15
|
+
You own a few seams; the loop owns the release-critical glue (paired seeds, the
|
|
16
|
+
held-out re-score, the promotion gate, provenance):
|
|
16
17
|
|
|
17
|
-
|
|
18
|
+
- **`baselineSurface`** — the current surface (a prompt string, or a `CodeSurface`).
|
|
19
|
+
- **`dispatchWithSurface(surface, scenario, ctx)`** — run one task to completion
|
|
20
|
+
under a candidate surface; return the artifact the judges score.
|
|
21
|
+
- **`judges`** — score the artifact (`{ composite, dimensions }`).
|
|
22
|
+
- **`driver`** — proposes candidate surfaces each generation: `gepaDriver`
|
|
23
|
+
(reflective + Pareto frontier) or `evolutionaryDriver` (mutator).
|
|
24
|
+
- **`gate`** — `defaultProductionGate` (held-out significance + red-team +
|
|
25
|
+
reward-hacking + canary). Ships ONLY on a CI-lower-bound held-out lift.
|
|
18
26
|
|
|
19
|
-
|
|
20
|
-
- search-split prompt evolution
|
|
21
|
-
- cost/score Pareto objectives
|
|
22
|
-
- failed-run conversion into failed trials
|
|
23
|
-
- ASI projection into reflection traces and numeric metrics
|
|
24
|
-
- optional paired holdout gating through `HeldOutGate`
|
|
25
|
-
- validated `RunRecord` rows for promotion evidence
|
|
26
|
-
|
|
27
|
-
## Result Contract
|
|
28
|
-
|
|
29
|
-
The return shape separates discovery from promotion:
|
|
30
|
-
|
|
31
|
-
- `searchBestVariant`: best variant on the optimizer-visible search scenarios
|
|
32
|
-
- `searchBestAggregate`: aggregate for that search winner
|
|
33
|
-
- `promotedVariant`: variant callers should ship
|
|
34
|
-
- `promotedAggregate`: aggregate for the promoted variant
|
|
35
|
-
- `gate`: holdout decision and evidence, or `null` when no gate ran
|
|
36
|
-
|
|
37
|
-
If a holdout gate is configured and rejects the search winner,
|
|
38
|
-
`promotedVariant` is the baseline. Do not ship `searchBestVariant` directly
|
|
39
|
-
unless you intentionally run without a holdout gate.
|
|
40
|
-
|
|
41
|
-
## Actionable Side Information
|
|
42
|
-
|
|
43
|
-
The scorer should return `asi` rows for concrete failure modes:
|
|
44
|
-
|
|
45
|
-
```ts
|
|
46
|
-
{
|
|
47
|
-
expectationId: 'used-primary-sources',
|
|
48
|
-
message: 'The final answer cited secondary summaries instead of primary sources.',
|
|
49
|
-
severity: 'error',
|
|
50
|
-
responsibleSurface: 'retrieval-policy',
|
|
51
|
-
suggestion: 'Prefer primary-source domains during source-gathering turns.',
|
|
52
|
-
}
|
|
53
|
-
```
|
|
54
|
-
|
|
55
|
-
Standard knowledge-related responsible surfaces are:
|
|
56
|
-
|
|
57
|
-
- `knowledge-requirements`
|
|
58
|
-
- `data-acquisition`
|
|
59
|
-
- `retrieval-policy`
|
|
60
|
-
- `user-question-policy`
|
|
61
|
-
|
|
62
|
-
These rows become:
|
|
63
|
-
|
|
64
|
-
- reflection expectations via `trialTraceFromMultiShotTrial`
|
|
65
|
-
- aggregate metrics like `asi.error` and `surface.retrieval-policy`
|
|
66
|
-
- trace evidence available to downstream reports
|
|
67
|
-
|
|
68
|
-
This is the main reason to use this primitive instead of reducing each run to a
|
|
69
|
-
single scalar reward.
|
|
70
|
-
|
|
71
|
-
## Holdout Discipline
|
|
72
|
-
|
|
73
|
-
For release gates, configure `gate`. The first seed variant is the baseline and
|
|
74
|
-
`gate.gate.baselineKey` must match its id.
|
|
75
|
-
|
|
76
|
-
Holdout scenarios must be disjoint from `searchScenarioIds`. The adapter runs
|
|
77
|
-
baseline and candidate with the same `(scenarioId, rep)` seed, validates every
|
|
78
|
-
row with `validateRunRecord`, then asks `HeldOutGate` whether to promote.
|
|
79
|
-
|
|
80
|
-
When `gate.searchScenarioIds` is omitted, the adapter reuses
|
|
81
|
-
`searchScenarioIds` for the overfit-gap check.
|
|
82
|
-
|
|
83
|
-
## Minimal Shape
|
|
27
|
+
## Minimal example
|
|
84
28
|
|
|
85
29
|
```ts
|
|
86
30
|
import {
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
} from '@tangle-network/agent-eval'
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
seedVariants: [baseline],
|
|
105
|
-
searchScenarioIds: searchScenarios.map((s) => s.id),
|
|
106
|
-
reps: 2,
|
|
107
|
-
generations: 4,
|
|
31
|
+
runImprovementLoop,
|
|
32
|
+
gepaDriver,
|
|
33
|
+
defaultProductionGate,
|
|
34
|
+
} from '@tangle-network/agent-eval/contract'
|
|
35
|
+
|
|
36
|
+
const result = await runImprovementLoop({
|
|
37
|
+
baselineSurface: currentSystemPrompt,
|
|
38
|
+
scenarios: trainScenarios, // optimizer-visible
|
|
39
|
+
holdoutScenarios, // DISJOINT — only the gate sees these
|
|
40
|
+
dispatchWithSurface: async (surface, scenario) =>
|
|
41
|
+
runYourAgentToCompletion({ scenario, prompt: String(surface) }),
|
|
42
|
+
judges: [myJudge],
|
|
43
|
+
driver: gepaDriver({
|
|
44
|
+
llm: { apiKey, baseUrl },
|
|
45
|
+
model: 'gpt-5',
|
|
46
|
+
target: 'enforce a strict output schema',
|
|
47
|
+
}),
|
|
108
48
|
populationSize: 4,
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
},
|
|
114
|
-
},
|
|
115
|
-
scorer: {
|
|
116
|
-
async score({ run }) {
|
|
117
|
-
return scoreFullTrajectory(run.trace)
|
|
118
|
-
},
|
|
119
|
-
},
|
|
120
|
-
mutateAdapter: {
|
|
121
|
-
async mutate({ parent, bottomTrials, childCount, generation }) {
|
|
122
|
-
const traces = bottomTrials.map((t) => trialTraceFromMultiShotTrial(t))
|
|
123
|
-
return proposePromptMutations({ parent, traces, childCount, generation })
|
|
124
|
-
},
|
|
125
|
-
},
|
|
49
|
+
maxGenerations: 4,
|
|
50
|
+
gate: defaultProductionGate({ holdoutScenarios, deltaThreshold: 0 }),
|
|
51
|
+
autoOnPromote: 'none', // or 'pr' (+ ghOwner/ghRepo) to open a PR on ship
|
|
52
|
+
runDir,
|
|
126
53
|
})
|
|
127
54
|
|
|
128
|
-
|
|
55
|
+
if (result.gateResult.decision === 'ship') {
|
|
56
|
+
deploy(result.winnerSurface) // the driver's proposal, gated on a real held-out lift
|
|
57
|
+
}
|
|
129
58
|
```
|
|
59
|
+
|
|
60
|
+
## Discipline (what makes it trustworthy)
|
|
61
|
+
|
|
62
|
+
- **Holdout is disjoint + gated.** `holdoutScenarios` must not overlap the training
|
|
63
|
+
pool. The gate re-scores baseline vs winner on the holdout and ships only when the
|
|
64
|
+
paired-bootstrap CI lower bound clears `deltaThreshold`; a few-instance swing at
|
|
65
|
+
small `n` is held (`few_runs`), not promoted.
|
|
66
|
+
- **No-op never ships.** If no candidate beats the baseline, the winner IS the
|
|
67
|
+
baseline (empty diff) and the loop forces `hold` — it does not score
|
|
68
|
+
baseline-vs-itself and misread model noise as lift.
|
|
69
|
+
- **Provenance falls out.** `result.promotedDiff` + `emitLoopProvenance` give the
|
|
70
|
+
auditable candidate→gate→promote chain (rationale, content hashes, a held-out lift
|
|
71
|
+
recomputable from the emitted record).
|
|
72
|
+
|
|
73
|
+
Reach for `compareDrivers` when the question is "which DRIVER wins" rather than
|
|
74
|
+
"improve this surface", and see `tests/campaign/presets.test.ts` for the executable
|
|
75
|
+
contract (no-op guard, fail-loud holdout, gate promotion).
|
|
@@ -152,7 +152,7 @@ set with a signed note.
|
|
|
152
152
|
|
|
153
153
|
## Optimization
|
|
154
154
|
|
|
155
|
-
Use `
|
|
155
|
+
Use `runImprovementLoop()` when the system is a multi-step agent, not a
|
|
156
156
|
single prompt.
|
|
157
157
|
|
|
158
158
|
Good optimization targets:
|
|
@@ -136,7 +136,7 @@ report, RL bridge).
|
|
|
136
136
|
|
|
137
137
|
| From → To | Type | What it carries |
|
|
138
138
|
|---|---|---|
|
|
139
|
-
| agent-knowledge → agent-eval | `RunRecord` | (consumed via `
|
|
139
|
+
| agent-knowledge → agent-eval | `RunRecord` | (consumed via `runImprovementLoop` for knowledge-base optimization) |
|
|
140
140
|
| agent-knowledge → agent-eval | `KnowledgeReadinessReport`, `KnowledgeBundle`, `KnowledgeRequirement` | (re-exported from agent-eval; agent-knowledge populates) |
|
|
141
141
|
| agent-knowledge → agent-eval | `ControlRuntimeConfig<KnowledgeBaseCandidate>` | (knowledge research adapter) |
|
|
142
142
|
| agent-runtime → agent-eval | `runAgentControlLoop`, `scoreKnowledgeReadiness`, `blockingKnowledgeEval` | (consumed; agent-runtime calls these in its task lifecycle) |
|
package/docs/trace-analysis.md
CHANGED
|
@@ -44,6 +44,25 @@ console.log(result.findings)
|
|
|
44
44
|
Products can pass any `TraceAnalysisStore`; they do not need to use the file
|
|
45
45
|
store in production.
|
|
46
46
|
|
|
47
|
+
## Deterministic failure coverage (no LLM)
|
|
48
|
+
|
|
49
|
+
Before (or alongside) running the LLM analyst, call `OtlpFileTraceStore.getOverview()`: it returns a
|
|
50
|
+
`DatasetOverview` whose `error_clusters` are computed deterministically — error
|
|
51
|
+
spans are grouped by a normalized failure signature (UUIDs / hex ids / numbers /
|
|
52
|
+
absolute paths / durations collapsed), each cluster carrying its prevalence,
|
|
53
|
+
exemplar `trace_id`/`span_id`, and a verbatim sample. This is a zero-LLM,
|
|
54
|
+
reproducible failure checklist that the analyst then explains and closes out:
|
|
55
|
+
|
|
56
|
+
```ts
|
|
57
|
+
const overview = await store.getOverview()
|
|
58
|
+
for (const c of overview.error_clusters) {
|
|
59
|
+
console.log(`${c.trace_count}× ${c.signature} — e.g. trace ${c.exemplar_trace_ids[0]}`)
|
|
60
|
+
}
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
See `failureClusters` in [insight-report.md](./insight-report.md) and the
|
|
64
|
+
`ErrorCluster` type doc-comments for the field-level contract.
|
|
65
|
+
|
|
47
66
|
## Required Trace Shape
|
|
48
67
|
|
|
49
68
|
Every serious product run should include:
|