@tangle-network/agent-eval 0.20.12 → 0.21.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +76 -0
- package/README.md +39 -1
- package/dist/{chunk-75MCTH7P.js → chunk-3GN6U53I.js} +198 -3
- package/dist/chunk-3GN6U53I.js.map +1 -0
- package/dist/chunk-3IX6QTB7.js +1349 -0
- package/dist/chunk-3IX6QTB7.js.map +1 -0
- package/dist/{chunk-PKCVBYTQ.js → chunk-5IIQKMD5.js} +38 -2
- package/dist/chunk-5IIQKMD5.js.map +1 -0
- package/dist/{chunk-MCMV7DUL.js → chunk-ARZ6BEV6.js} +2 -2
- package/dist/{chunk-HKYRWNHV.js → chunk-HRZELXCR.js} +2 -2
- package/dist/{chunk-ODFINDLQ.js → chunk-KRR4VMH7.js} +11 -1
- package/dist/chunk-KRR4VMH7.js.map +1 -0
- package/dist/chunk-SNUHRBDL.js +154 -0
- package/dist/chunk-SNUHRBDL.js.map +1 -0
- package/dist/{chunk-KWUAAIHR.js → chunk-WOK2RTWG.js} +157 -1
- package/dist/chunk-WOK2RTWG.js.map +1 -0
- package/dist/{chunk-HNJLMAJ2.js → chunk-WOPGKVN4.js} +2 -2
- package/dist/cli.js +3 -2
- package/dist/cli.js.map +1 -1
- package/dist/{control-C8NKbF3w.d.ts → control-cxwMOAsy.d.ts} +3 -2
- package/dist/control.d.ts +4 -3
- package/dist/control.js +2 -2
- package/dist/emitter-B2XqDKFU.d.ts +121 -0
- package/dist/{feedback-trajectory-BGQ_ANCN.d.ts → feedback-trajectory-CB0A32o3.d.ts} +2 -1
- package/dist/index.d.ts +71 -83
- package/dist/index.js +48 -60
- package/dist/index.js.map +1 -1
- package/dist/openapi.json +1 -1
- package/dist/optimization.d.ts +3 -2
- package/dist/optimization.js +2 -2
- package/dist/reporting-Da2ihlcM.d.ts +672 -0
- package/dist/reporting.d.ts +5 -426
- package/dist/reporting.js +6 -2
- package/dist/{emitter-BYO2nSDA.d.ts → store-u47QaJ9G.d.ts} +1 -91
- package/dist/traces.d.ts +259 -3
- package/dist/traces.js +24 -4
- package/dist/wire/index.js +3 -2
- package/docs/research-report-methodology.md +155 -0
- package/package.json +10 -12
- package/dist/chunk-75MCTH7P.js.map +0 -1
- package/dist/chunk-IKFVX537.js +0 -717
- package/dist/chunk-IKFVX537.js.map +0 -1
- package/dist/chunk-KWUAAIHR.js.map +0 -1
- package/dist/chunk-ODFINDLQ.js.map +0 -1
- package/dist/chunk-PKCVBYTQ.js.map +0 -1
- /package/dist/{chunk-MCMV7DUL.js.map → chunk-ARZ6BEV6.js.map} +0 -0
- /package/dist/{chunk-HKYRWNHV.js.map → chunk-HRZELXCR.js.map} +0 -0
- /package/dist/{chunk-HNJLMAJ2.js.map → chunk-WOPGKVN4.js.map} +0 -0
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
import { T as TraceStore, R as RunOutcome, a as Run, S as Span, b as SpanKind, L as LlmSpan, c as ToolSpan, d as RetrievalSpan, J as JudgeSpan, e as SandboxSpan, E as EventKind, f as TraceEvent, B as BudgetLedgerEntry, A as Artifact, M as Message } from './store-u47QaJ9G.js';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* TraceEmitter — hierarchical span builder that auto-parents using an
|
|
5
|
+
* internal stack. One emitter per Run; emitters do NOT share state.
|
|
6
|
+
*
|
|
7
|
+
* Convenience methods (`llm`, `tool`, `retrieval`, `judge`, `sandbox`)
|
|
8
|
+
* return a `SpanHandle` with `.end()` / `.fail()` so callers don't
|
|
9
|
+
* have to thread spanIds manually. For async workflows that can't use
|
|
10
|
+
* the stack (e.g. fan-out parallel calls), pass `parentSpanId`
|
|
11
|
+
* explicitly.
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
interface SpanHandle<S extends Span = Span> {
|
|
15
|
+
span: S;
|
|
16
|
+
end(patch?: Partial<S>): Promise<void>;
|
|
17
|
+
fail(error: string | Error, patch?: Partial<S>): Promise<void>;
|
|
18
|
+
}
|
|
19
|
+
interface RunCompleteHookContext {
|
|
20
|
+
runId: string;
|
|
21
|
+
emitter: TraceEmitter;
|
|
22
|
+
store: TraceStore;
|
|
23
|
+
/** Outcome the caller passed to `endRun` (undefined for `abortRun`). */
|
|
24
|
+
outcome?: RunOutcome;
|
|
25
|
+
/** Final run status. */
|
|
26
|
+
status: 'completed' | 'failed' | 'aborted';
|
|
27
|
+
}
|
|
28
|
+
type RunCompleteHook = (ctx: RunCompleteHookContext) => Promise<void> | void;
|
|
29
|
+
interface TraceEmitterOptions {
|
|
30
|
+
runId?: string;
|
|
31
|
+
/** Inject a clock for deterministic tests. */
|
|
32
|
+
now?: () => number;
|
|
33
|
+
/** Inject an id generator for deterministic tests. */
|
|
34
|
+
id?: () => string;
|
|
35
|
+
/**
|
|
36
|
+
* Hooks fired after `endRun` / `abortRun` writes the final run state.
|
|
37
|
+
* Designed for trace-analyst auto-execution, integrity assertions, and
|
|
38
|
+
* outbound notifications. Hooks run sequentially in the order supplied.
|
|
39
|
+
*
|
|
40
|
+
* By default a hook that throws is swallowed and logged as a `note` event
|
|
41
|
+
* on the run — auto-orchestration must not crash the underlying flow.
|
|
42
|
+
* Set `hookErrors: 'throw'` to propagate.
|
|
43
|
+
*/
|
|
44
|
+
onRunComplete?: RunCompleteHook[];
|
|
45
|
+
/** `'swallow'` (default) | `'throw'`. */
|
|
46
|
+
hookErrors?: 'swallow' | 'throw';
|
|
47
|
+
}
|
|
48
|
+
declare class TraceEmitter {
|
|
49
|
+
private store;
|
|
50
|
+
private stack;
|
|
51
|
+
private _runId;
|
|
52
|
+
private now;
|
|
53
|
+
private id;
|
|
54
|
+
private hooks;
|
|
55
|
+
private hookErrors;
|
|
56
|
+
constructor(store: TraceStore, options?: TraceEmitterOptions);
|
|
57
|
+
get runId(): string;
|
|
58
|
+
get traceStore(): TraceStore;
|
|
59
|
+
/** Append a hook after construction (e.g. attach the trace analyst). */
|
|
60
|
+
addRunCompleteHook(hook: RunCompleteHook): void;
|
|
61
|
+
/**
|
|
62
|
+
* Begin a Run.
|
|
63
|
+
*
|
|
64
|
+
* `scenarioId` is required on the persisted Run shape — every Run downstream
|
|
65
|
+
* gets a non-empty scenarioId so filters and aggregations stay simple — but
|
|
66
|
+
* the INPUT here accepts it as optional. When omitted, startRun substitutes
|
|
67
|
+
* a sensible default (`run.layer ?? run.tags?.['kind'] ?? 'runtime'`) so
|
|
68
|
+
* runtime / operator / meta-eval runs that have no curated-scenario corpus
|
|
69
|
+
* to anchor to don't have to invent placeholder strings at the call site.
|
|
70
|
+
*/
|
|
71
|
+
startRun(run: Omit<Run, 'runId' | 'scenarioId' | 'startedAt' | 'status'> & {
|
|
72
|
+
scenarioId?: string;
|
|
73
|
+
}): Promise<Run>;
|
|
74
|
+
endRun(outcome?: RunOutcome): Promise<void>;
|
|
75
|
+
abortRun(reason: string): Promise<void>;
|
|
76
|
+
private runHooks;
|
|
77
|
+
span<S extends Span = Span>(init: {
|
|
78
|
+
kind: SpanKind;
|
|
79
|
+
name: string;
|
|
80
|
+
parentSpanId?: string;
|
|
81
|
+
attributes?: Record<string, unknown>;
|
|
82
|
+
} & Partial<Omit<S, 'spanId' | 'runId' | 'startedAt' | 'kind' | 'name'>>): Promise<SpanHandle<S>>;
|
|
83
|
+
private handle;
|
|
84
|
+
private pop;
|
|
85
|
+
llm(init: Omit<LlmSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<LlmSpan>>;
|
|
86
|
+
tool(init: Omit<ToolSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<ToolSpan>>;
|
|
87
|
+
retrieval(init: Omit<RetrievalSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<RetrievalSpan>>;
|
|
88
|
+
recordJudge(verdict: Omit<JudgeSpan, 'spanId' | 'runId' | 'kind' | 'startedAt' | 'endedAt'>): Promise<JudgeSpan>;
|
|
89
|
+
sandbox(init: Omit<SandboxSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<SandboxSpan>>;
|
|
90
|
+
emit(event: {
|
|
91
|
+
kind: EventKind;
|
|
92
|
+
spanId?: string;
|
|
93
|
+
payload?: Record<string, unknown>;
|
|
94
|
+
}): Promise<TraceEvent>;
|
|
95
|
+
recordBudget(entry: Omit<BudgetLedgerEntry, 'runId' | 'timestamp'> & {
|
|
96
|
+
timestamp?: number;
|
|
97
|
+
}): Promise<BudgetLedgerEntry>;
|
|
98
|
+
recordArtifact(artifact: Omit<Artifact, 'artifactId' | 'runId'>): Promise<Artifact>;
|
|
99
|
+
/**
|
|
100
|
+
* Runs `fn` inside a span; auto-ends on success, auto-fails on throw.
|
|
101
|
+
* Returns the fn's return value. Use this for the 95% case.
|
|
102
|
+
*/
|
|
103
|
+
within<T>(init: Parameters<TraceEmitter['span']>[0], fn: (handle: SpanHandle) => Promise<T>): Promise<T>;
|
|
104
|
+
}
|
|
105
|
+
/** Helper to build an LLM span handle args object from a provider-shaped response. */
|
|
106
|
+
declare function llmSpanFromProvider(args: {
|
|
107
|
+
name?: string;
|
|
108
|
+
model: string;
|
|
109
|
+
messages: Message[];
|
|
110
|
+
output: string;
|
|
111
|
+
usage?: {
|
|
112
|
+
inputTokens?: number;
|
|
113
|
+
outputTokens?: number;
|
|
114
|
+
cachedTokens?: number;
|
|
115
|
+
reasoningTokens?: number;
|
|
116
|
+
};
|
|
117
|
+
costUsd?: number;
|
|
118
|
+
finishReason?: string;
|
|
119
|
+
}): Omit<LlmSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>;
|
|
120
|
+
|
|
121
|
+
export { type RunCompleteHook as R, type SpanHandle as S, TraceEmitter as T, type RunCompleteHookContext as a, type TraceEmitterOptions as b, llmSpanFromProvider as l };
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import { D as DatasetSplit, a as DatasetScenario } from './dataset-B9qvlm_o.js';
|
|
2
|
-
import { T as TraceEmitter
|
|
2
|
+
import { T as TraceEmitter } from './emitter-B2XqDKFU.js';
|
|
3
|
+
import { F as FailureClass, T as TraceStore } from './store-u47QaJ9G.js';
|
|
3
4
|
|
|
4
5
|
/**
|
|
5
6
|
* Policy-based agent control runtime.
|
package/dist/index.d.ts
CHANGED
|
@@ -1,15 +1,17 @@
|
|
|
1
1
|
import { TCloud } from '@tangle-network/tcloud';
|
|
2
|
-
import { ReleaseConfidenceThresholds, ReleaseConfidenceScorecard } from './reporting.js';
|
|
3
|
-
export { BootstrapOptions, BootstrapResult, GainDistributionBin, GainDistributionFigureSpec, GainDistributionOptions, JudgeReplayGateArgs, PairedBootstrapOptions, PairedBootstrapResult, ParetoFigureSpec, ParetoPoint, ReleaseConfidenceAxis, ReleaseConfidenceAxisName, ReleaseConfidenceInput, ReleaseConfidenceIssue, ReleaseConfidenceMetrics, ReleaseConfidenceStatus, ReleaseTraceEvidence, RenderReleaseReportOptions, SummaryTable, SummaryTableOptions, SummaryTableRow, Verdict, assertReleaseConfidence, bhAdjust, bootstrapCi, evaluateReleaseConfidence, gainHistogram, judgeReplayGate, pairedBootstrap, pairedWilcoxon, paretoChart, releaseTraceEvidenceFromMultiShotTrials, renderReleaseReport, summaryTable } from './reporting.js';
|
|
4
|
-
import { F as FeedbackLabel, a as FeedbackTrajectoryStore, b as FeedbackTrajectory, C as ControlSeverity, c as ControlEvalResult } from './feedback-trajectory-
|
|
5
|
-
export { d as ControlActionFailureMode, e as ControlActionOutcome, f as ControlBudget, g as ControlContext, h as ControlDecision, i as ControlRunResult, j as ControlRuntimeConfig, k as ControlRuntimeError, l as ControlStep, m as ControlStopPolicies, n as FeedbackArtifactType, o as FeedbackAttempt, p as FeedbackLabelKind, q as FeedbackLabelSource, r as FeedbackOptimizerRow, s as FeedbackOutcome, t as FeedbackReplayAdapter, u as FeedbackReplayResult, v as FeedbackSeverity, w as FeedbackSplitPolicy, x as FeedbackTask, y as FeedbackTrajectoryFilter, z as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, A as ProposedSideEffect, S as StopDecision, B as allCriticalPassed, D as assignFeedbackSplit, E as controlRunToFeedbackTrajectory, G as createFeedbackTrajectory, H as feedbackTrajectoriesToDatasetScenarios, J as feedbackTrajectoriesToOptimizerRows, K as feedbackTrajectoryToDatasetScenario, L as feedbackTrajectoryToOptimizerRow, M as objectiveEval, N as parseFeedbackTrajectoriesJsonl, O as renderPreferenceMemoryMarkdown, Q as replayFeedbackTrajectories, R as replayFeedbackTrajectory, T as runAgentControlLoop, U as serializeFeedbackTrajectoriesJsonl, V as stopOnNoProgress, W as stopOnRepeatedAction, X as subjectiveEval, Y as summarizePreferenceMemory, Z as withAssignedFeedbackSplit } from './feedback-trajectory-
|
|
6
|
-
export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, L as LlmJsonCall, b as LlmReviewerConfig, P as ProposeFn, c as ProposeInput, d as ProposeOutput, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, k as ProposeReviewShot, R as Review, l as ReviewFn, m as ReviewInput, n as ReviewMemoryEntry, o as ReviewMemoryStore, p as RunEvidenceMetadata, V as Verification, q as VerifyFn, r as controlFailureClassFromVerification, s as controlRunToRunRecord, t as createLlmReviewer, u as evaluateActionPolicy, v as inMemoryReviewStore, w as jsonlReviewStore, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-
|
|
7
|
-
import { T as TraceEmitter
|
|
8
|
-
export {
|
|
2
|
+
import { R as ReleaseConfidenceThresholds, a as ReleaseConfidenceScorecard } from './reporting-Da2ihlcM.js';
|
|
3
|
+
export { B as BootstrapOptions, b as BootstrapResult, D as DEFAULT_FAILURE_RULES, F as FailureClassification, c as FailureCluster, d as FailureClusterReport, e as FailureContext, f as FailureRule, G as GainDistributionBin, g as GainDistributionFigureSpec, h as GainDistributionOptions, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, i as PairedBootstrapResult, j as ParetoFigureSpec, k as ParetoPoint, l as RESEARCH_REPORT_HARD_PAIR_FLOOR, m as ReleaseConfidenceAxis, n as ReleaseConfidenceAxisName, o as ReleaseConfidenceInput, p as ReleaseConfidenceIssue, q as ReleaseConfidenceMetrics, r as ReleaseConfidenceStatus, s as ReleaseTraceEvidence, t as RenderReleaseReportOptions, u as ResearchReport, v as ResearchReportCandidate, w as ResearchReportDecision, x as ResearchReportMethodology, y as ResearchReportOptions, z as ResearchReportRecommendation, S as SummaryTable, A as SummaryTableOptions, C as SummaryTableRow, V as Verdict, E as assertReleaseConfidence, H as bhAdjust, I as bootstrapCi, K as classifyFailure, L as evaluateReleaseConfidence, M as failureClusterView, N as gainHistogram, O as judgeReplayGate, Q as pairedBootstrap, T as pairedWilcoxon, U as paretoChart, W as releaseTraceEvidenceFromMultiShotTrials, X as renderReleaseReport, Y as researchReport, Z as summaryTable } from './reporting-Da2ihlcM.js';
|
|
4
|
+
import { F as FeedbackLabel, a as FeedbackTrajectoryStore, b as FeedbackTrajectory, C as ControlSeverity, c as ControlEvalResult } from './feedback-trajectory-CB0A32o3.js';
|
|
5
|
+
export { d as ControlActionFailureMode, e as ControlActionOutcome, f as ControlBudget, g as ControlContext, h as ControlDecision, i as ControlRunResult, j as ControlRuntimeConfig, k as ControlRuntimeError, l as ControlStep, m as ControlStopPolicies, n as FeedbackArtifactType, o as FeedbackAttempt, p as FeedbackLabelKind, q as FeedbackLabelSource, r as FeedbackOptimizerRow, s as FeedbackOutcome, t as FeedbackReplayAdapter, u as FeedbackReplayResult, v as FeedbackSeverity, w as FeedbackSplitPolicy, x as FeedbackTask, y as FeedbackTrajectoryFilter, z as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, A as ProposedSideEffect, S as StopDecision, B as allCriticalPassed, D as assignFeedbackSplit, E as controlRunToFeedbackTrajectory, G as createFeedbackTrajectory, H as feedbackTrajectoriesToDatasetScenarios, J as feedbackTrajectoriesToOptimizerRows, K as feedbackTrajectoryToDatasetScenario, L as feedbackTrajectoryToOptimizerRow, M as objectiveEval, N as parseFeedbackTrajectoriesJsonl, O as renderPreferenceMemoryMarkdown, Q as replayFeedbackTrajectories, R as replayFeedbackTrajectory, T as runAgentControlLoop, U as serializeFeedbackTrajectoriesJsonl, V as stopOnNoProgress, W as stopOnRepeatedAction, X as subjectiveEval, Y as summarizePreferenceMemory, Z as withAssignedFeedbackSplit } from './feedback-trajectory-CB0A32o3.js';
|
|
6
|
+
export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, L as LlmJsonCall, b as LlmReviewerConfig, P as ProposeFn, c as ProposeInput, d as ProposeOutput, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, k as ProposeReviewShot, R as Review, l as ReviewFn, m as ReviewInput, n as ReviewMemoryEntry, o as ReviewMemoryStore, p as RunEvidenceMetadata, V as Verification, q as VerifyFn, r as controlFailureClassFromVerification, s as controlRunToRunRecord, t as createLlmReviewer, u as evaluateActionPolicy, v as inMemoryReviewStore, w as jsonlReviewStore, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-cxwMOAsy.js';
|
|
7
|
+
import { T as TraceEmitter } from './emitter-B2XqDKFU.js';
|
|
8
|
+
export { R as RunCompleteHook, a as RunCompleteHookContext, S as SpanHandle, b as TraceEmitterOptions, l as llmSpanFromProvider } from './emitter-B2XqDKFU.js';
|
|
9
9
|
import { A as ActionableSideInfo, O as Objective, P as ParetoResult, T as TrialCache, a as TrialResult, E as EvolvableVariant, M as MutateAdapter, V as VariantAggregate } from './multi-shot-optimization-Bvtz294B.js';
|
|
10
10
|
export { b as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, c as Direction, G as GateDecision, d as GateEvidence, e as GenerationReport, H as HeldOutGate, f as HeldOutGateConfig, g as HeldOutGateRejectionCode, I as InMemoryTrialCache, h as MultiShotGateConfig, i as MultiShotGateResult, j as MultiShotMutateAdapter, k as MultiShotOptimizationConfig, l as MultiShotOptimizationResult, m as MultiShotRun, n as MultiShotRunInput, o as MultiShotRunner, p as MultiShotScore, q as MultiShotScorer, r as MultiShotSplit, s as MultiShotTrace, t as MultiShotTrialResult, u as MultiShotVariant, v as PromptEvolutionConfig, w as PromptEvolutionEvent, x as PromptEvolutionResult, R as ReflectionContext, y as ReflectionProposal, S as ScenarioAggregate, z as ScoreAdapter, B as TrialTrace, C as buildReflectionPrompt, F as crowdingDistance, J as defaultMultiShotObjectives, K as dominates, L as paretoFrontier, N as paretoFrontierWithCrowding, Q as parseReflectionResponse, U as runMultiShotOptimization, W as runPromptEvolution, X as scalarScore, Y as trialTraceFromMultiShotTrial } from './multi-shot-optimization-Bvtz294B.js';
|
|
11
|
-
import {
|
|
12
|
-
export {
|
|
11
|
+
import { a as Run$1, S as Span, f as TraceEvent, A as Artifact$1, B as BudgetLedgerEntry, T as TraceStore, F as FailureClass, g as BudgetSpec, c as ToolSpan, h as RunFilter, L as LlmSpan, J as JudgeSpan } from './store-u47QaJ9G.js';
|
|
12
|
+
export { i as EventFilter, E as EventKind, j as FAILURE_CLASSES, k as FileSystemTraceStore, l as FileSystemTraceStoreOptions, G as GenericSpan, I as InMemoryTraceStore, M as Message, d as RetrievalSpan, m as RunLayer, n as RunStatus, e as SandboxSpan, o as SpanBase, p as SpanFilter, b as SpanKind, q as SpanStatus, r as TRACE_SCHEMA_VERSION, s as isJudgeSpan, t as isLlmSpan, u as isRetrievalSpan, v as isSandboxSpan, w as isToolSpan } from './store-u47QaJ9G.js';
|
|
13
|
+
import { llmSpans, RawProviderSink, ProviderRedactor } from './traces.js';
|
|
14
|
+
export { AnalyzeTracesInput, AnalyzeTracesOptions, AnalyzeTracesResult, AnalyzeTracesTurnSnapshot, DEFAULT_REDACTION_RULES, DEFAULT_TRACE_ANALYST_BUDGETS, DatasetOverview, FileSystemRawProviderSink, FileSystemRawProviderSinkOptions, InMemoryRawProviderSink, InMemoryRawProviderSinkOptions, NoopRawProviderSink, OTEL_AGENT_EVAL_SCOPE, OtlpExport, OtlpFileTraceStore, OtlpFileTraceStoreOptions, OtlpResourceSpans, OtlpSpan, QueryTracesPage, REDACTION_VERSION, RawProviderDirection, RawProviderEvent, RawProviderSinkFilter, RedactionReport, RedactionRule, RunIntegrityError, RunIntegrityExpectations, RunIntegrityIssue, RunIntegrityIssueCode, RunIntegrityReport, SearchSpanResult, SearchTraceResult, SpanMatchRecord, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, TraceAnalysisStore, TraceAnalystByteBudgets, TraceAnalystFilters, TraceAnalystHookOptions, TraceAnalystSpan, TraceAnalystSpanKind, TraceAnalystSpanStatus, TraceAnalystTraceSummary, TraceFileMissingError, TraceInsightContext, TraceInsightFinding, TraceInsightPanelRole, TraceInsightPromptInput, TraceInsightQualityGate, TraceInsightQuestion, TraceInsightReadiness, TraceInsightSuite, TraceInsightTask, TraceNotFoundError, ViewSpansResult, ViewTraceOversized, ViewTraceResult, aggregateLlm, analyzeTraces, argHash, assertRunCaptured, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, defaultProviderRedactor, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, groupBy, inferDomainKeywords, judgeSpans, planTraceInsightQuestions, providerFromBaseUrl, redactString, redactValue, runFailureClass, runsForScenario, scoreTraceInsightReadiness, throwIfRunIncomplete, tokenizeDomainWords, toolSpans, traceAnalystFunctionGroup, traceAnalystOnRunComplete } from './traces.js';
|
|
13
15
|
import { a as DatasetScenario, b as Dataset, c as DatasetManifest } from './dataset-B9qvlm_o.js';
|
|
14
16
|
export { d as DatasetDifficulty, e as DatasetProvenance, D as DatasetSplit, H as HoldoutLockedError, S as SliceOptions, h as hashScenarios } from './dataset-B9qvlm_o.js';
|
|
15
17
|
import { a as RunRecord } from './run-record-CX_jcAyr.js';
|
|
@@ -1806,42 +1808,6 @@ declare class BudgetGuard {
|
|
|
1806
1808
|
get state(): Record<keyof BudgetSpec, number>;
|
|
1807
1809
|
}
|
|
1808
1810
|
|
|
1809
|
-
/**
|
|
1810
|
-
* Failure taxonomy — canonical classes + a default classifier.
|
|
1811
|
-
*
|
|
1812
|
-
* Every failed run should end up in a named class. The classifier here
|
|
1813
|
-
* is rule-based (fast, deterministic); an LLM fallback can be added by
|
|
1814
|
-
* the consumer for novel cases and trained into the rule base over time.
|
|
1815
|
-
*
|
|
1816
|
-
* Consumers call `classifyFailure(run, spans, events)` and persist the
|
|
1817
|
-
* returned class as `Run.outcome.failureClass`.
|
|
1818
|
-
*/
|
|
1819
|
-
|
|
1820
|
-
interface FailureContext {
|
|
1821
|
-
run: Run$1;
|
|
1822
|
-
spans: Span[];
|
|
1823
|
-
events: TraceEvent[];
|
|
1824
|
-
}
|
|
1825
|
-
interface FailureClassification {
|
|
1826
|
-
failureClass: FailureClass;
|
|
1827
|
-
reason: string;
|
|
1828
|
-
triggerSpanId?: string;
|
|
1829
|
-
triggerEventId?: string;
|
|
1830
|
-
}
|
|
1831
|
-
/** Ordered rules — first match wins. */
|
|
1832
|
-
interface FailureRule {
|
|
1833
|
-
id: string;
|
|
1834
|
-
match: (ctx: FailureContext) => {
|
|
1835
|
-
failureClass: FailureClass;
|
|
1836
|
-
reason: string;
|
|
1837
|
-
triggerSpanId?: string;
|
|
1838
|
-
triggerEventId?: string;
|
|
1839
|
-
} | null;
|
|
1840
|
-
}
|
|
1841
|
-
declare const DEFAULT_RULES: FailureRule[];
|
|
1842
|
-
/** Classify the failure mode of a run using an ordered rule list. */
|
|
1843
|
-
declare function classifyFailure(ctx: FailureContext, rules?: FailureRule[]): FailureClassification;
|
|
1844
|
-
|
|
1845
1811
|
/**
|
|
1846
1812
|
* Trajectory — ordered, structured view over a run's spans.
|
|
1847
1813
|
*
|
|
@@ -1996,43 +1962,6 @@ declare function budgetBreachView(store: TraceStore, options?: {
|
|
|
1996
1962
|
variantId?: string;
|
|
1997
1963
|
}): Promise<BudgetBreachReport>;
|
|
1998
1964
|
|
|
1999
|
-
/**
|
|
2000
|
-
* FailureClusterView — groups failed runs by (failureClass, triggerTool,
|
|
2001
|
-
* argHash-prefix) so weekly reviews can prioritize the top-N clusters.
|
|
2002
|
-
*
|
|
2003
|
-
* Each cluster includes: N runs, scenarios affected, representative
|
|
2004
|
-
* error message, a proposed mitigation hint (rule → action table).
|
|
2005
|
-
*/
|
|
2006
|
-
|
|
2007
|
-
interface FailureCluster {
|
|
2008
|
-
failureClass: FailureClass;
|
|
2009
|
-
/** Tool name when the trigger was a tool span, else undefined. */
|
|
2010
|
-
toolName?: string;
|
|
2011
|
-
/** First 16 chars of argHash — clusters similar args. */
|
|
2012
|
-
argPrefix?: string;
|
|
2013
|
-
/**
|
|
2014
|
-
* Source dimension when the trigger was a judge span (e.g. `'format'`,
|
|
2015
|
-
* `'safety'`, `'correctness'`). Lets cross-template aggregators
|
|
2016
|
-
* group failures by the dimension that fired without overloading
|
|
2017
|
-
* `argPrefix`. Optional — legacy clusters without this field
|
|
2018
|
-
* deserialize cleanly.
|
|
2019
|
-
*/
|
|
2020
|
-
dimension?: string;
|
|
2021
|
-
runCount: number;
|
|
2022
|
-
scenarioIds: string[];
|
|
2023
|
-
exampleError?: string;
|
|
2024
|
-
exampleRunId: string;
|
|
2025
|
-
}
|
|
2026
|
-
interface FailureClusterReport {
|
|
2027
|
-
clusters: FailureCluster[];
|
|
2028
|
-
totalFailures: number;
|
|
2029
|
-
totalRuns: number;
|
|
2030
|
-
}
|
|
2031
|
-
declare function failureClusterView(store: TraceStore, options?: {
|
|
2032
|
-
rules?: FailureRule[];
|
|
2033
|
-
minClusterSize?: number;
|
|
2034
|
-
}): Promise<FailureClusterReport>;
|
|
2035
|
-
|
|
2036
1965
|
/**
|
|
2037
1966
|
* JudgeAgreementView — pairwise agreement between judges across the
|
|
2038
1967
|
* corpus, grouped by dimension.
|
|
@@ -4378,6 +4307,7 @@ declare function euAiActReport(ctx: GovernanceContext, signals: UseCaseSignals):
|
|
|
4378
4307
|
* output (semantic concept judge, reviewer directives, critic scores). Primitives
|
|
4379
4308
|
* that need free-form text use `callLlm` and parse output themselves.
|
|
4380
4309
|
*/
|
|
4310
|
+
|
|
4381
4311
|
interface LlmMessage {
|
|
4382
4312
|
role: 'system' | 'user' | 'assistant';
|
|
4383
4313
|
/**
|
|
@@ -4456,6 +4386,26 @@ interface LlmClientOptions {
|
|
|
4456
4386
|
maxRetries?: number;
|
|
4457
4387
|
/** Fetch implementation — defaults to global `fetch`. Override for custom transport (e.g. tests). */
|
|
4458
4388
|
fetch?: typeof fetch;
|
|
4389
|
+
/**
|
|
4390
|
+
* Optional raw HTTP capture sink. When provided, every request, response,
|
|
4391
|
+
* and error (across all retry attempts) is recorded to the sink, with auth
|
|
4392
|
+
* headers and credential-shaped body fields redacted by default. This is
|
|
4393
|
+
* the layer-1 forensics primitive: structured `LlmSpan`s record intent,
|
|
4394
|
+
* raw events record what actually crossed the wire.
|
|
4395
|
+
*/
|
|
4396
|
+
rawSink?: RawProviderSink;
|
|
4397
|
+
/**
|
|
4398
|
+
* Logical provider id attached to raw events. When omitted, derived from
|
|
4399
|
+
* `baseUrl` via `providerFromBaseUrl`.
|
|
4400
|
+
*/
|
|
4401
|
+
provider?: string;
|
|
4402
|
+
/** Trace context attached to raw events; populated by emitter-aware callers. */
|
|
4403
|
+
traceContext?: {
|
|
4404
|
+
runId?: string;
|
|
4405
|
+
spanId?: string;
|
|
4406
|
+
};
|
|
4407
|
+
/** Override the redaction strategy for this call. Defaults to `defaultProviderRedactor`. */
|
|
4408
|
+
redactor?: ProviderRedactor;
|
|
4459
4409
|
}
|
|
4460
4410
|
/**
|
|
4461
4411
|
* Strip a ```json / ``` code fence if the model emitted one.
|
|
@@ -4479,6 +4429,44 @@ declare function callLlmJson<T = unknown>(req: LlmCallRequest, opts?: LlmClientO
|
|
|
4479
4429
|
value: T;
|
|
4480
4430
|
result: LlmCallResult;
|
|
4481
4431
|
}>;
|
|
4432
|
+
declare class LlmRouteAssertionError extends Error {
|
|
4433
|
+
readonly code: 'no_explicit_base_url' | 'base_url_blocked' | 'base_url_not_allowed' | 'no_auth' | 'wrong_provider';
|
|
4434
|
+
readonly baseUrl: string;
|
|
4435
|
+
constructor(message: string, code: 'no_explicit_base_url' | 'base_url_blocked' | 'base_url_not_allowed' | 'no_auth' | 'wrong_provider', baseUrl: string);
|
|
4436
|
+
}
|
|
4437
|
+
interface LlmRouteRequirements {
|
|
4438
|
+
/**
|
|
4439
|
+
* Throw if `opts.baseUrl` is undefined, i.e. the call would fall back to
|
|
4440
|
+
* `DEFAULT_BASE_URL`. Set this for evaluation runs where silently using
|
|
4441
|
+
* the public/free-tier router is a defect — the launch reviewer needs to
|
|
4442
|
+
* know exactly which provider answered.
|
|
4443
|
+
*/
|
|
4444
|
+
requireExplicitBaseUrl?: boolean;
|
|
4445
|
+
/**
|
|
4446
|
+
* Allowlist of acceptable base URLs. Strings match by prefix
|
|
4447
|
+
* (case-insensitive); RegExps test against the full base URL.
|
|
4448
|
+
*/
|
|
4449
|
+
allowedBaseUrls?: Array<string | RegExp>;
|
|
4450
|
+
/** Blocklist that takes precedence over `allowedBaseUrls`. */
|
|
4451
|
+
blockedBaseUrls?: Array<string | RegExp>;
|
|
4452
|
+
/** Throw if no auth header / api key is configured. */
|
|
4453
|
+
requireAuth?: boolean;
|
|
4454
|
+
/**
|
|
4455
|
+
* Logical provider id the configured `baseUrl` is expected to match (via
|
|
4456
|
+
* `providerFromBaseUrl`). Mainly useful when paired with `requireExplicitBaseUrl`.
|
|
4457
|
+
*/
|
|
4458
|
+
expectedProvider?: string;
|
|
4459
|
+
}
|
|
4460
|
+
/**
|
|
4461
|
+
* Fail-loud assertion that the configured LLM client points at the route
|
|
4462
|
+
* the caller intends. Designed for the matrix-runner preflight: invoke
|
|
4463
|
+
* once before any LLM call to catch misconfiguration before a sweep burns
|
|
4464
|
+
* dollars on the wrong provider.
|
|
4465
|
+
*
|
|
4466
|
+
* Throws `LlmRouteAssertionError`. Pure — no I/O — so it's safe to call
|
|
4467
|
+
* from constructors and CI gates.
|
|
4468
|
+
*/
|
|
4469
|
+
declare function assertLlmRoute(opts: LlmClientOptions, req?: LlmRouteRequirements): void;
|
|
4482
4470
|
/**
|
|
4483
4471
|
* Probe whether a model is reachable. Returns latency + null error on
|
|
4484
4472
|
* success; `ok=false` + error message on any failure (HTTP, timeout,
|
|
@@ -6324,4 +6312,4 @@ interface OrthogonalityResult {
|
|
|
6324
6312
|
}
|
|
6325
6313
|
declare function passOrthogonality<T>(input: OrthogonalityInput<T>): OrthogonalityResult;
|
|
6326
6314
|
|
|
6327
|
-
export { ActionableSideInfo, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, Artifact$1 as Artifact, type ArtifactCheck, type Artifact as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, BudgetLedgerEntry, BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ControlEvalResult, ControlSeverity, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATORS, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, type DataAcquisitionPlan, Dataset, DatasetManifest, DatasetScenario, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DeploymentOutcome, type DirEntry, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EvolutionRound, EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, type FactorContribution, type FactorialCell, FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureRule, FeedbackLabel, type FeedbackPattern, FeedbackTrajectory, FeedbackTrajectoryStore, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, type Finding, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GoldenItem, type GoldenSeverity, type GoldenSpec, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryOutcomeStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntegrationGateSurface, type IntegrationInvokeFailureInput, type IntegrationManifestGateInput, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, JudgeSpan, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type KnowledgeAcquisitionMode, type KnowledgeBundle, type KnowledgeFallbackPolicy, type KnowledgeFreshness, type KnowledgeImportance, type KnowledgeReadinessReport, type KnowledgeRecommendedAction, type KnowledgeRequirement, type KnowledgeRequirementCategory, type KnowledgeResponsibleSurface, type KnowledgeSensitivity, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type Layer, type LayerCorrelation, type LayerResult, type LayerStatus, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmCallError, type LlmCallRequest, type LlmCallResult, LlmClient, type LlmClientOptions, type LlmMessage, LlmSpan, type LlmUsage, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, type MetricSamples, type MetricVerdict, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, type OutcomeFilter, type OutcomePair, type OutcomeStore, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PositionalBiasResult, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptHandle, PromptRegistry, TrialResult as PromptTrialResult, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type RegressionOptions, type RegressionSpec, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, Run$1 as Run, type RunAppScenarioOptions, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, RunFilter, RunRecord, type RunScore, type RunScoreWeights, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxResult, type ScanOptions, type Scenario, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoreKnowledgeReadinessOptions, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type Severity, type ShipOptions, type SignedManifest, type SignedManifestAlgo, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, Span, type SteeringBundle, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type StepContext, type StepRubric, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SynthesisReason, type SynthesisTarget, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, TraceEvent, TraceStore, type Trajectory, type TrajectoryStep, type TrialAttempt, TrialCache, TrialTelemetry, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type UserQuestion, type ValidationContext, type ValidationIssue, type ValidationResult, VariantAggregate, type VerbosityBiasResult, type VerificationReport, type VerifyContext, type VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, acquisitionPlansForKnowledgeGaps, adversarialJudge, aggregateRunScore, analyzeAntiSlop, analyzeSeries, attributeCounterfactuals, benjaminiHochberg, bisect, blockingKnowledgeEval, bonferroni, budgetBreachView, buildReviewerPrompt, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, callLlm, callLlmJson, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, correlateLayers, correlationStudy, createAntiSlopJudge, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, distillPlaybook, estimateCost, estimateTokens, euAiActReport, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportTrainingData, extractAssetUrls, extractErrorCount, failureClusterView, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, precision as goldenPrecision, gradeSemanticStatus, hashContent, hashJson, htmlContainsElement, inMemoryReferenceReplayStore, integrationAsi, integrationGateEvals, integrationInvokeFailedPayload, integrationManifestResolvedPayload, integrationManifestValidatedPayload, interRaterReliability, iqr, isPrmVerdict, jestTestParser, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, judgeAgreementView, keyPreserved, knowledgeReadinessTracePayload, linterJudge, llmSpans, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, outputLengthRubric, pairedTTest, paraphraseRobustness, paraphraseRobustnessScenarios, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, probeLlm, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, resumeBuilderSession, rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, runTestGradedScenario, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreKnowledgeReadiness, scoreProject, scoreRedTeamOutput, scoreReferenceReplay, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, signManifest, soc2Report, statusAdvanced, stripFencedJson, stuckLoopView, summarize, summarizeHarnessResults, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSuccessRubric, toolWasteView, typoMutator, urlContains, userQuestionsForKnowledgeGaps, verbosityBias, verifyManifest, visualDiff, viteDeployRunner, vitestTestParser, weightedMean, weightedRecall, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank, wranglerDeployRunner };
|
|
6315
|
+
export { ActionableSideInfo, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, Artifact$1 as Artifact, type ArtifactCheck, type Artifact as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, BudgetLedgerEntry, BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ControlEvalResult, ControlSeverity, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATORS, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, type DataAcquisitionPlan, Dataset, DatasetManifest, DatasetScenario, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DeploymentOutcome, type DirEntry, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EvolutionRound, EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, type FactorContribution, type FactorialCell, FailureClass, FeedbackLabel, type FeedbackPattern, FeedbackTrajectory, FeedbackTrajectoryStore, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, type Finding, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GoldenItem, type GoldenSeverity, type GoldenSpec, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryOutcomeStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntegrationGateSurface, type IntegrationInvokeFailureInput, type IntegrationManifestGateInput, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, JudgeSpan, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type KnowledgeAcquisitionMode, type KnowledgeBundle, type KnowledgeFallbackPolicy, type KnowledgeFreshness, type KnowledgeImportance, type KnowledgeReadinessReport, type KnowledgeRecommendedAction, type KnowledgeRequirement, type KnowledgeRequirementCategory, type KnowledgeResponsibleSurface, type KnowledgeSensitivity, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type Layer, type LayerCorrelation, type LayerResult, type LayerStatus, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmCallError, type LlmCallRequest, type LlmCallResult, LlmClient, type LlmClientOptions, type LlmMessage, LlmRouteAssertionError, type LlmRouteRequirements, LlmSpan, type LlmUsage, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, type MetricSamples, type MetricVerdict, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, type OutcomeFilter, type OutcomePair, type OutcomeStore, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PositionalBiasResult, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptHandle, PromptRegistry, TrialResult as PromptTrialResult, ProviderRedactor, RawProviderSink, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type RegressionOptions, type RegressionSpec, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, Run$1 as Run, type RunAppScenarioOptions, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, RunFilter, RunRecord, type RunScore, type RunScoreWeights, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxResult, type ScanOptions, type Scenario, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoreKnowledgeReadinessOptions, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type Severity, type ShipOptions, type SignedManifest, type SignedManifestAlgo, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, Span, type SteeringBundle, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type StepContext, type StepRubric, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SynthesisReason, type SynthesisTarget, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, TraceEvent, TraceStore, type Trajectory, type TrajectoryStep, type TrialAttempt, TrialCache, TrialTelemetry, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type UserQuestion, type ValidationContext, type ValidationIssue, type ValidationResult, VariantAggregate, type VerbosityBiasResult, type VerificationReport, type VerifyContext, type VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, acquisitionPlansForKnowledgeGaps, adversarialJudge, aggregateRunScore, analyzeAntiSlop, analyzeSeries, assertLlmRoute, attributeCounterfactuals, benjaminiHochberg, bisect, blockingKnowledgeEval, bonferroni, budgetBreachView, buildReviewerPrompt, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, callLlm, callLlmJson, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, correlateLayers, correlationStudy, createAntiSlopJudge, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, distillPlaybook, estimateCost, estimateTokens, euAiActReport, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportTrainingData, extractAssetUrls, extractErrorCount, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, precision as goldenPrecision, gradeSemanticStatus, hashContent, hashJson, htmlContainsElement, inMemoryReferenceReplayStore, integrationAsi, integrationGateEvals, integrationInvokeFailedPayload, integrationManifestResolvedPayload, integrationManifestValidatedPayload, interRaterReliability, iqr, isPrmVerdict, jestTestParser, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, judgeAgreementView, keyPreserved, knowledgeReadinessTracePayload, linterJudge, llmSpans, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, outputLengthRubric, pairedTTest, paraphraseRobustness, paraphraseRobustnessScenarios, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, probeLlm, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, resumeBuilderSession, rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, runTestGradedScenario, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreKnowledgeReadiness, scoreProject, scoreRedTeamOutput, scoreReferenceReplay, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, signManifest, soc2Report, statusAdvanced, stripFencedJson, stuckLoopView, summarize, summarizeHarnessResults, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSuccessRubric, toolWasteView, typoMutator, urlContains, userQuestionsForKnowledgeGaps, verbosityBias, verifyManifest, visualDiff, viteDeployRunner, vitestTestParser, weightedMean, weightedRecall, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank, wranglerDeployRunner };
|